##// END OF EJS Templates
hgweb: use a capped reader for WSGI input stream...
Gregory Szorc -
r36870:290fc4c3 default
parent child Browse files
Show More
@@ -1,312 +1,320
1 # hgweb/request.py - An http request from either CGI or the standalone server.
1 # hgweb/request.py - An http request from either CGI or the standalone server.
2 #
2 #
3 # Copyright 21 May 2005 - (c) 2005 Jake Edge <jake@edge2.net>
3 # Copyright 21 May 2005 - (c) 2005 Jake Edge <jake@edge2.net>
4 # Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
4 # Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5 #
5 #
6 # This software may be used and distributed according to the terms of the
6 # This software may be used and distributed according to the terms of the
7 # GNU General Public License version 2 or any later version.
7 # GNU General Public License version 2 or any later version.
8
8
9 from __future__ import absolute_import
9 from __future__ import absolute_import
10
10
11 import cgi
11 import cgi
12 import errno
12 import errno
13 import socket
13 import socket
14 import wsgiref.headers as wsgiheaders
14 import wsgiref.headers as wsgiheaders
15 #import wsgiref.validate
15 #import wsgiref.validate
16
16
17 from .common import (
17 from .common import (
18 ErrorResponse,
18 ErrorResponse,
19 HTTP_NOT_MODIFIED,
19 HTTP_NOT_MODIFIED,
20 statusmessage,
20 statusmessage,
21 )
21 )
22
22
23 from ..thirdparty import (
23 from ..thirdparty import (
24 attr,
24 attr,
25 )
25 )
26 from .. import (
26 from .. import (
27 pycompat,
27 pycompat,
28 util,
28 util,
29 )
29 )
30
30
# Short query-string keys and the (name, value) pairs each one expands to.
# A value of ``None`` is a placeholder: normalize() substitutes the value
# that was supplied for the shortcut key itself.
shortcuts = {
    'cl': [('cmd', ['changelog']), ('rev', None)],
    'sl': [('cmd', ['shortlog']), ('rev', None)],
    'cs': [('cmd', ['changeset']), ('node', None)],
    'f': [('cmd', ['file']), ('filenode', None)],
    'fl': [('cmd', ['filelog']), ('filenode', None)],
    'fd': [('cmd', ['filediff']), ('node', None)],
    'fa': [('cmd', ['annotate']), ('filenode', None)],
    'mf': [('cmd', ['manifest']), ('manifest', None)],
    'ca': [('cmd', ['archive']), ('node', None)],
    'tags': [('cmd', ['tags'])],
    'tip': [('cmd', ['changeset']), ('node', ['tip'])],
    'static': [('cmd', ['static']), ('file', None)]
}
45
45
def normalize(form):
    """Expand shortcut keys in ``form`` and return a converted copy.

    ``form`` maps parameter names to lists of values. It is modified in
    place: every key found in ``shortcuts`` is replaced by the names that
    shortcut expands to. The returned dict is a new mapping whose keys and
    (whitespace-stripped) values have been passed through
    ``pycompat.bytesurl``.
    """
    # first expand the shortcuts
    for k in shortcuts:
        if k in form:
            for name, value in shortcuts[k]:
                # None means "use whatever was passed for the shortcut".
                if value is None:
                    value = form[k]
                form[name] = value
            del form[k]
    # And strip the values
    bytesform = {}
    for k, v in form.iteritems():
        bytesform[pycompat.bytesurl(k)] = [
            pycompat.bytesurl(i.strip()) for i in v]
    return bytesform
61
61
@attr.s(frozen=True)
class parsedrequest(object):
    """Represents a parsed WSGI request / static HTTP request parameters."""

    # Request method.
    method = attr.ib()
    # Full URL for this request.
    url = attr.ib()
    # URL without any path components. Just <proto>://<host><port>.
    baseurl = attr.ib()
    # Advertised URL. Like ``url`` and ``baseurl`` but uses SERVER_NAME instead
    # of HTTP: Host header for hostname. This is likely what clients used.
    advertisedurl = attr.ib()
    advertisedbaseurl = attr.ib()
    # WSGI application path.
    apppath = attr.ib()
    # List of path parts to be used for dispatch.
    dispatchparts = attr.ib()
    # URL path component (no query string) used for dispatch.
    dispatchpath = attr.ib()
    # Whether there is a path component to this request. This can be true
    # when ``dispatchpath`` is empty due to REPO_NAME muckery.
    havepathinfo = attr.ib()
    # Raw query string (part after "?" in URL).
    querystring = attr.ib()
    # List of 2-tuples of query string arguments.
    querystringlist = attr.ib()
    # Dict of query string arguments. Values are lists with at least 1 item.
    querystringdict = attr.ib()
    # wsgiref.headers.Headers instance. Operates like a dict with case
    # insensitive keys.
    headers = attr.ib()
94
94
def parserequestfromenv(env):
    """Parse URL components from environment variables.

    WSGI defines request attributes via environment variables. This function
    parses the environment variables into a data structure.

    ``env`` is the WSGI environment mapping. Returns a ``parsedrequest``
    instance. Raises ``KeyError`` if a key that is read unconditionally
    (``wsgi.url_scheme``, ``SERVER_NAME``, ``SERVER_PORT``,
    ``REQUEST_METHOD``) is missing.
    """
    # PEP-0333 defines the WSGI spec and is a useful reference for this code.

    # We first validate that the incoming object conforms with the WSGI spec.
    # We only want to be dealing with spec-conforming WSGI implementations.
    # TODO enable this once we fix internal violations.
    #wsgiref.validate.check_environ(env)

    # PEP-0333 states that environment keys and values are native strings
    # (bytes on Python 2 and str on Python 3). The code points for the Unicode
    # strings on Python 3 must be between \00000-\000FF. We deal with bytes
    # in Mercurial, so mass convert string keys and values to bytes.
    if pycompat.ispy3:
        env = {k.encode('latin-1'): v for k, v in env.iteritems()}
        env = {k: v.encode('latin-1') if isinstance(v, str) else v
               for k, v in env.iteritems()}

    # https://www.python.org/dev/peps/pep-0333/#environ-variables defines
    # the environment variables.
    # https://www.python.org/dev/peps/pep-0333/#url-reconstruction defines
    # how URLs are reconstructed.
    fullurl = env['wsgi.url_scheme'] + '://'
    advertisedfullurl = fullurl

    def addport(s):
        # Append ":<port>" unless the port is the default for the scheme.
        if env['wsgi.url_scheme'] == 'https':
            defaultport = '443'
        else:
            defaultport = '80'

        if env['SERVER_PORT'] != defaultport:
            s += ':' + env['SERVER_PORT']

        return s

    if env.get('HTTP_HOST'):
        # The Host header already carries any non-default port.
        fullurl += env['HTTP_HOST']
    else:
        fullurl += env['SERVER_NAME']
        fullurl = addport(fullurl)

    advertisedfullurl += env['SERVER_NAME']
    advertisedfullurl = addport(advertisedfullurl)

    baseurl = fullurl
    advertisedbaseurl = advertisedfullurl

    quotedscript = util.urlreq.quote(env.get('SCRIPT_NAME', ''))
    quotedpathinfo = util.urlreq.quote(env.get('PATH_INFO', ''))
    fullurl += quotedscript + quotedpathinfo
    advertisedfullurl += quotedscript + quotedpathinfo

    if env.get('QUERY_STRING'):
        fullurl += '?' + env['QUERY_STRING']
        advertisedfullurl += '?' + env['QUERY_STRING']

    # When dispatching requests, we look at the URL components (PATH_INFO
    # and QUERY_STRING) after the application root (SCRIPT_NAME). But hgwebdir
    # has the concept of "virtual" repositories. This is defined via REPO_NAME.
    # If REPO_NAME is defined, we append it to SCRIPT_NAME to form a new app
    # root. We also exclude its path components from PATH_INFO when resolving
    # the dispatch path.

    # SCRIPT_NAME may be omitted when it would be empty, so do not index it
    # directly (the URL reconstruction above already tolerates its absence).
    apppath = env.get('SCRIPT_NAME', '')

    if env.get('REPO_NAME'):
        if not apppath.endswith('/'):
            apppath += '/'

        apppath += env.get('REPO_NAME')

    if 'PATH_INFO' in env:
        dispatchparts = env['PATH_INFO'].strip('/').split('/')

        # Strip out repo parts.
        # Note: with no REPO_NAME this is [''], which also matches (and so
        # removes) the single empty part produced by an empty PATH_INFO.
        repoparts = env.get('REPO_NAME', '').split('/')
        if dispatchparts[:len(repoparts)] == repoparts:
            dispatchparts = dispatchparts[len(repoparts):]
    else:
        dispatchparts = []

    dispatchpath = '/'.join(dispatchparts)

    querystring = env.get('QUERY_STRING', '')

    # We store as a list so we have ordering information. We also store as
    # a dict to facilitate fast lookup.
    querystringlist = util.urlreq.parseqsl(querystring, keep_blank_values=True)

    querystringdict = {}
    for k, v in querystringlist:
        querystringdict.setdefault(k, []).append(v)

    # HTTP_* keys contain HTTP request headers. The Headers structure should
    # perform case normalization for us. We just rewrite underscore to dash
    # so keys match what likely went over the wire.
    headers = []
    for k, v in env.iteritems():
        if k.startswith('HTTP_'):
            headers.append((k[len('HTTP_'):].replace('_', '-'), v))

    headers = wsgiheaders.Headers(headers)

    # This is kind of a lie because the HTTP header wasn't explicitly
    # sent. But for all intents and purposes it should be OK to lie about
    # this, since a consumer will use either value to determine how many
    # bytes are available to read.
    if 'CONTENT_LENGTH' in env and 'HTTP_CONTENT_LENGTH' not in env:
        headers['Content-Length'] = env['CONTENT_LENGTH']

    return parsedrequest(method=env['REQUEST_METHOD'],
                         url=fullurl, baseurl=baseurl,
                         advertisedurl=advertisedfullurl,
                         advertisedbaseurl=advertisedbaseurl,
                         apppath=apppath,
                         dispatchparts=dispatchparts, dispatchpath=dispatchpath,
                         havepathinfo='PATH_INFO' in env,
                         querystring=querystring,
                         querystringlist=querystringlist,
                         querystringdict=querystringdict,
                         headers=headers)
223
223
class wsgirequest(object):
    """Higher-level API for a WSGI request.

    WSGI applications are invoked with 2 arguments. They are used to
    instantiate instances of this class, which provides higher-level APIs
    for obtaining request parameters, writing HTTP output, etc.
    """
    def __init__(self, wsgienv, start_response):
        """Wrap the WSGI environment ``wsgienv`` and ``start_response``.

        Raises ``RuntimeError`` if ``wsgi.version`` is not a 1.x version.
        """
        version = wsgienv[r'wsgi.version']
        if (version < (1, 0)) or (version >= (2, 0)):
            raise RuntimeError("Unknown and unsupported WSGI version %d.%d"
                               % version)
        self.inp = wsgienv[r'wsgi.input']

        # Wrap the input stream in a capped reader using the declared
        # request body length. The explicit header wins over the CGI
        # variable. CONTENT_LENGTH may legitimately be present but empty,
        # in which case there is nothing to convert and the stream is left
        # unwrapped (drain() applies the same "empty means unset" rule).
        length = (wsgienv.get(r'HTTP_CONTENT_LENGTH')
                  or wsgienv.get(r'CONTENT_LENGTH'))
        if length:
            self.inp = util.cappedreader(self.inp, int(length))

        self.err = wsgienv[r'wsgi.errors']
        self.threaded = wsgienv[r'wsgi.multithread']
        self.multiprocess = wsgienv[r'wsgi.multiprocess']
        self.run_once = wsgienv[r'wsgi.run_once']
        self.env = wsgienv
        self.form = normalize(cgi.parse(self.inp,
                                        self.env,
                                        keep_blank_values=1))
        self._start_response = start_response
        # Set by respond() once the response has been started.
        self.server_write = None
        # Response headers accumulated until respond() sends them.
        self.headers = []

    def drain(self):
        '''need to read all data from request, httplib is half-duplex'''
        length = int(self.env.get('CONTENT_LENGTH') or 0)
        for s in util.filechunkiter(self.inp, limit=length):
            pass

    def respond(self, status, type, filename=None, body=None):
        """Start the response (first call only) and optionally write ``body``.

        ``status`` may be an ``ErrorResponse``, an integer status code or a
        status line. ``type`` is the Content-Type value. ``filename``, when
        given, is advertised through a Content-Disposition header. When
        ``body`` is not None, a Content-Length header is added and the body
        is written immediately.

        Raises ``TypeError`` if a queued header value is not a ``str``.
        """
        if not isinstance(type, str):
            type = pycompat.sysstr(type)
        if self._start_response is not None:
            self.headers.append((r'Content-Type', type))
            if filename:
                # Keep only the last path component and escape characters
                # that are special inside a quoted string.
                filename = (filename.rpartition('/')[-1]
                            .replace('\\', '\\\\').replace('"', '\\"'))
                self.headers.append(('Content-Disposition',
                                     'inline; filename="%s"' % filename))
            if body is not None:
                self.headers.append((r'Content-Length', str(len(body))))

            for k, v in self.headers:
                if not isinstance(v, str):
                    raise TypeError('header value must be string: %r' % (v,))

            if isinstance(status, ErrorResponse):
                self.headers.extend(status.headers)
                if status.code == HTTP_NOT_MODIFIED:
                    # RFC 2616 Section 10.3.5: 304 Not Modified has cases where
                    # it MUST NOT include any headers other than these and no
                    # body
                    self.headers = [(k, v) for (k, v) in self.headers if
                                    k in ('Date', 'ETag', 'Expires',
                                          'Cache-Control', 'Vary')]
                status = statusmessage(status.code, pycompat.bytestr(status))
            elif status == 200:
                status = '200 Script output follows'
            elif isinstance(status, int):
                status = statusmessage(status)

            self.server_write = self._start_response(
                pycompat.sysstr(status), self.headers)
            # Mark the response as started so headers are sent only once.
            self._start_response = None
            self.headers = []
        if body is not None:
            self.write(body)
            self.server_write = None

    def write(self, thing):
        """Send ``thing`` to the client; empty values are skipped.

        A connection reset by the peer is silently ignored; any other
        ``socket.error`` propagates.
        """
        if thing:
            try:
                self.server_write(thing)
            except socket.error as inst:
                # Exceptions cannot be indexed on Python 3; read the error
                # code from the ``errno`` attribute instead of ``inst[0]``.
                if inst.errno != errno.ECONNRESET:
                    raise

    def flush(self):
        """No-op; present so the object can be used like a file."""
        return None
305
313
def wsgiapplication(app_maker):
    '''For compatibility with old CGI scripts. A plain hgweb() or hgwebdir()
    can and should now be used as a WSGI application.

    ``app_maker`` is called once, immediately, with no arguments. The
    returned callable forwards ``(env, respond)`` to the application it
    produced and returns that application's result.'''
    application = app_maker()

    def run_wsgi(env, respond):
        return application(env, respond)

    return run_wsgi
General Comments 0
You need to be logged in to leave comments. Login now