# HG changeset patch # User Mads Kiilerich # Date 2019-12-22 23:56:45 # Node ID 6c381371d106044325f0303ccfba8f4bc64775d0 # Parent 620c13a373c5f016ddafdcf831a3894b487ae387 py3: fix non-ASCII URLs - decode unicode correctly before passing them to controllers as unicode strings This is needed for supporting localized repo path names in the path of URLs. Some references: https://www.python.org/dev/peps/pep-0333/#unicode-issues https://bugs.python.org/issue16679 http://lucumr.pocoo.org/2010/5/25/wsgi-on-python-3/ https://bugs.launchpad.net/pecan/+bug/1451842 https://github.com/tipabu/eventlet/commit/a5a7751b013fe99b6d30acbca79e819770e9ae5d diff --git a/kallithea/config/routing.py b/kallithea/config/routing.py --- a/kallithea/config/routing.py +++ b/kallithea/config/routing.py @@ -19,14 +19,34 @@ may take precedent over the more generic refer to the routes manual at http://routes.groovie.org/docs/ """ -from routes import Mapper +import routes from tg import request +from kallithea.lib.utils2 import safe_str + # prefix for non repository related links needs to be prefixed with `/` ADMIN_PREFIX = '/_admin' +class Mapper(routes.Mapper): + """ + Subclassed Mapper with routematch patched to decode "unicode" str url to + *real* unicode str before applying matches and invoking controller methods. + """ + + def routematch(self, url=None, environ=None): + """ + routematch that also decode url from "fake bytes" to real unicode + string before matching and invoking controllers. + """ + # Process url like get_path_info does ... but PATH_INFO has already + # been retrieved from environ and is passed, so - let's just use that + # instead. 
+ url = safe_str(url.encode('latin1')) + return super().routematch(url=url, environ=environ) + + def make_map(config): """Create, configure and return the routes Mapper""" rmap = Mapper(directory=config['paths']['controllers'], diff --git a/kallithea/lib/base.py b/kallithea/lib/base.py --- a/kallithea/lib/base.py +++ b/kallithea/lib/base.py @@ -97,12 +97,17 @@ def _get_ip_addr(environ): def get_path_info(environ): - """Return unicode PATH_INFO from environ ... using tg.original_request if available. + """Return PATH_INFO from environ ... using tg.original_request if available. + + In Python 3 WSGI, PATH_INFO is a unicode str, but kind of contains encoded + bytes. The code points are guaranteed to only use the lower 8 bits, and + encoding the string with the 1:1 encoding latin1 will give the + corresponding byte string ... which then can be decoded to proper unicode. """ org_req = environ.get('tg.original_request') if org_req is not None: environ = org_req.environ - return safe_str(environ['PATH_INFO']) + return safe_str(environ['PATH_INFO'].encode('latin1')) def log_in_user(user, remember, is_external_auth, ip_addr): diff --git a/kallithea/lib/middleware/permanent_repo_url.py b/kallithea/lib/middleware/permanent_repo_url.py --- a/kallithea/lib/middleware/permanent_repo_url.py +++ b/kallithea/lib/middleware/permanent_repo_url.py @@ -33,9 +33,9 @@ class PermanentRepoUrl(object): def __call__(self, environ, start_response): # Extract path_info as get_path_info does, but do it explicitly because # we also have to do the reverse operation when patching it back in - path_info = safe_str(environ['PATH_INFO']) + path_info = safe_str(environ['PATH_INFO'].encode('latin1')) if path_info.startswith('/'): # it must path_info = '/' + fix_repo_id_name(path_info[1:]) - environ['PATH_INFO'] = safe_bytes(path_info) + environ['PATH_INFO'] = safe_bytes(path_info).decode('latin1') return self.application(environ, start_response)