# HG changeset patch # User Gregory Szorc # Date 2018-03-11 20:11:13 # Node ID d7fd203e36cc3b4cb20c4c2236428a5e8a0c706b # Parent b2a3308d6a2186a3d7770e1bee6adcb5ada2e44d hgweb: refactor repository name URL parsing The hgwebdir WSGI application detects when a requested URL is for a known repository and it effectively forwards the request to the hgweb WSGI application. The hgweb WSGI application needs to route the request based on the base URL for the repository. The way this normally works is SCRIPT_NAME is used to resolve the base URL and PATH_INFO contains the path after the script. But with hgwebdir, SCRIPT_NAME refers to hgwebdir, not the base URL for the repository. So, there was a hacky REPO_NAME environment variable being set to convey the part of the URL that represented the repository so hgweb could ignore this path component for routing purposes. The use of the environment variable for passing internal state is pretty hacky. Plus, it wasn't clear from the perspective of the URL parsing code what was going on. This commit improves matters by making the repository name an explicit argument to the request parser. The logic around handling of this value has been shored up. We add various checks that the argument is used properly - that the repository name does represent the prefix of the PATH_INFO. Differential Revision: https://phab.mercurial-scm.org/D2819 diff --git a/mercurial/hgweb/hgwebdir_mod.py b/mercurial/hgweb/hgwebdir_mod.py --- a/mercurial/hgweb/hgwebdir_mod.py +++ b/mercurial/hgweb/hgwebdir_mod.py @@ -452,13 +452,10 @@ class hgwebdir(object): for virtualrepo in _virtualdirs(): real = repos.get(virtualrepo) if real: - wsgireq.env['REPO_NAME'] = virtualrepo - # We have to re-parse because of updated environment - # variable. - # TODO this is kind of hacky and we should have a better - # way of doing this than with REPO_NAME side-effects. + # Re-parse the WSGI environment to take into account our + # repository path component. wsgireq.req = requestmod.parserequestfromenv( - wsgireq.env, wsgireq.req.bodyfh) + wsgireq.env, wsgireq.req.bodyfh, reponame=virtualrepo) try: # ensure caller gets private copy of ui repo = hg.repository(self.ui.copy(), real) diff --git a/mercurial/hgweb/request.py b/mercurial/hgweb/request.py --- a/mercurial/hgweb/request.py +++ b/mercurial/hgweb/request.py @@ -155,11 +155,16 @@ class parsedrequest(object): # Request body input stream. bodyfh = attr.ib() -def parserequestfromenv(env, bodyfh): +def parserequestfromenv(env, bodyfh, reponame=None): """Parse URL components from environment variables. WSGI defines request attributes via environment variables. This function parses the environment variables into a data structure. + + If ``reponame`` is defined, the leading path components matching that + string are effectively shifted from ``PATH_INFO`` to ``SCRIPT_NAME``. + This simulates the world view of a WSGI application that processes + requests from the base URL of a repo. """ # PEP-0333 defines the WSGI spec and is a useful reference for this code. @@ -215,28 +220,34 @@ def parserequestfromenv(env, bodyfh): fullurl += '?' + env['QUERY_STRING'] advertisedfullurl += '?' + env['QUERY_STRING'] - # When dispatching requests, we look at the URL components (PATH_INFO - # and QUERY_STRING) after the application root (SCRIPT_NAME). But hgwebdir - # has the concept of "virtual" repositories. This is defined via REPO_NAME. - # If REPO_NAME is defined, we append it to SCRIPT_NAME to form a new app - # root. We also exclude its path components from PATH_INFO when resolving - # the dispatch path. + # If ``reponame`` is defined, that must be a prefix on PATH_INFO + # that represents the repository being dispatched to. When computing + # the dispatch info, we ignore these leading path components. apppath = env.get('SCRIPT_NAME', '') - if env.get('REPO_NAME'): - if not apppath.endswith('/'): - apppath += '/' + if reponame: + repoprefix = '/' + reponame.strip('/') - apppath += env.get('REPO_NAME') + if not env.get('PATH_INFO'): + raise error.ProgrammingError('reponame requires PATH_INFO') + + if not env['PATH_INFO'].startswith(repoprefix): + raise error.ProgrammingError('PATH_INFO does not begin with repo ' + 'name: %s (%s)' % (env['PATH_INFO'], + reponame)) - if 'PATH_INFO' in env: - dispatchparts = env['PATH_INFO'].strip('/').split('/') + dispatchpath = env['PATH_INFO'][len(repoprefix):] - # Strip out repo parts. - repoparts = env.get('REPO_NAME', '').split('/') - if dispatchparts[:len(repoparts)] == repoparts: - dispatchparts = dispatchparts[len(repoparts):] + if dispatchpath and not dispatchpath.startswith('/'): + raise error.ProgrammingError('reponame prefix of PATH_INFO does ' + 'not end at path delimiter: %s (%s)' % + (env['PATH_INFO'], reponame)) + + apppath = apppath.rstrip('/') + repoprefix + dispatchparts = dispatchpath.strip('/').split('/') + elif env.get('PATH_INFO', '').strip('/'): + dispatchparts = env['PATH_INFO'].strip('/').split('/') else: dispatchparts = [] @@ -283,7 +294,7 @@ def parserequestfromenv(env, bodyfh): apppath=apppath, dispatchparts=dispatchparts, dispatchpath=dispatchpath, havepathinfo='PATH_INFO' in env, - reponame=env.get('REPO_NAME'), + reponame=reponame, querystring=querystring, qsparams=qsparams, headers=headers, diff --git a/tests/test-wsgirequest.py b/tests/test-wsgirequest.py --- a/tests/test-wsgirequest.py +++ b/tests/test-wsgirequest.py @@ -5,6 +5,9 @@ import unittest from mercurial.hgweb import ( request as requestmod, ) +from mercurial import ( + error, +) DEFAULT_ENV = { r'REQUEST_METHOD': r'GET', @@ -20,11 +23,11 @@ DEFAULT_ENV = { r'wsgi.run_once': False, } -def parse(env, bodyfh=None, extra=None): +def parse(env, bodyfh=None, reponame=None, extra=None): env = dict(env) env.update(extra or {}) - return requestmod.parserequestfromenv(env, bodyfh) + return requestmod.parserequestfromenv(env, bodyfh, reponame=reponame) class ParseRequestTests(unittest.TestCase): def testdefault(self): @@ -203,24 +206,26 @@ class ParseRequestTests(unittest.TestCas self.assertTrue(r.havepathinfo) def testreponame(self): - """REPO_NAME path components get stripped from URL.""" - r = parse(DEFAULT_ENV, extra={ - r'REPO_NAME': r'repo', - r'PATH_INFO': r'/path1/path2' - }) + """repository path components get stripped from URL.""" + + with self.assertRaisesRegexp(error.ProgrammingError, + b'reponame requires PATH_INFO'): + parse(DEFAULT_ENV, reponame=b'repo') - self.assertEqual(r.url, b'http://testserver/path1/path2') - self.assertEqual(r.baseurl, b'http://testserver') - self.assertEqual(r.advertisedurl, b'http://testserver/path1/path2') - self.assertEqual(r.advertisedbaseurl, b'http://testserver') - self.assertEqual(r.apppath, b'/repo') - self.assertEqual(r.dispatchparts, [b'path1', b'path2']) - self.assertEqual(r.dispatchpath, b'path1/path2') - self.assertTrue(r.havepathinfo) - self.assertEqual(r.reponame, b'repo') + with self.assertRaisesRegexp(error.ProgrammingError, + b'PATH_INFO does not begin with repo ' + b'name'): + parse(DEFAULT_ENV, reponame=b'repo', extra={ + r'PATH_INFO': r'/pathinfo', + }) - r = parse(DEFAULT_ENV, extra={ - r'REPO_NAME': r'repo', + with self.assertRaisesRegexp(error.ProgrammingError, + b'reponame prefix of PATH_INFO'): + parse(DEFAULT_ENV, reponame=b'repo', extra={ + r'PATH_INFO': r'/repoextra/path', + }) + + r = parse(DEFAULT_ENV, reponame=b'repo', extra={ r'PATH_INFO': r'/repo/path1/path2', }) @@ -234,8 +239,7 @@ class ParseRequestTests(unittest.TestCas self.assertTrue(r.havepathinfo) self.assertEqual(r.reponame, b'repo') - r = parse(DEFAULT_ENV, extra={ - r'REPO_NAME': r'prefix/repo', + r = parse(DEFAULT_ENV, reponame=b'prefix/repo', extra={ r'PATH_INFO': r'/prefix/repo/path1/path2', })