##// END OF EJS Templates
hgweb: use our forked wsgiheaders module instead of stdlib one...
Augie Fackler -
r37624:da84e26d default
parent child Browse files
Show More
@@ -1,563 +1,564
1 1 # hgweb/request.py - An http request from either CGI or the standalone server.
2 2 #
3 3 # Copyright 21 May 2005 - (c) 2005 Jake Edge <jake@edge2.net>
4 4 # Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 import wsgiref.headers as wsgiheaders
12 11 #import wsgiref.validate
13 12
14 13 from ..thirdparty import (
15 14 attr,
16 15 )
17 16 from .. import (
18 17 error,
19 18 pycompat,
20 19 util,
21 20 )
22 21
class multidict(object):
    """A dict like object that can store multiple values for a key.

    Used to store parsed request parameters.

    This is inspired by WebOb's class of the same name.
    """
    def __init__(self):
        # Maps each key to the list of values recorded for it, in the
        # order they were added.
        self._items = {}

    def __getitem__(self, key):
        """Returns the last set value for a key.

        Raises KeyError if the key is not defined.
        """
        return self._items[key][-1]

    def __setitem__(self, key, value):
        """Replace all values for a key with a single new value."""
        self._items[key] = [value]

    def __delitem__(self, key):
        """Delete all values for a key."""
        del self._items[key]

    def __contains__(self, key):
        """Report whether at least one value is stored for a key."""
        return key in self._items

    def __len__(self):
        """Return the number of distinct keys (not the number of values)."""
        return len(self._items)

    def get(self, key, default=None):
        """Return the last set value for a key, or ``default`` if unset."""
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def add(self, key, value):
        """Add a new value for a key. Does not replace existing values."""
        self._items.setdefault(key, []).append(value)

    def getall(self, key):
        """Obtains all values for a key.

        Returns an empty list if the key is not defined.
        """
        return self._items.get(key, [])

    def getone(self, key):
        """Obtain a single value for a key.

        Raises KeyError if key not defined or it has multiple values set.
        """
        vals = self._items[key]

        if len(vals) > 1:
            raise KeyError('multiple values for %r' % key)

        return vals[0]

    def asdictoflists(self):
        """Return a plain dict mapping each key to a copy of its value list."""
        # ``items()`` exists on both Python 2 and Python 3 dicts, whereas
        # ``iteritems()`` is Python 2 only.
        return {k: list(v) for k, v in self._items.items()}
79 78
@attr.s(frozen=True)
class parsedrequest(object):
    """Represents a parsed WSGI request.

    Contains both parsed parameters as well as a handle on the input stream.

    The class is declared frozen via ``attr.s(frozen=True)``. The order of
    the ``attr.ib()`` declarations below is significant and must not be
    changed casually.
    """

    # Request method (taken from ``REQUEST_METHOD`` by
    # ``parserequestfromenv()``).
    method = attr.ib()
    # Full URL for this request.
    url = attr.ib()
    # URL without any path components. Just <proto>://<host><port>.
    baseurl = attr.ib()
    # Advertised URL. Like ``url`` and ``baseurl`` but uses SERVER_NAME instead
    # of HTTP: Host header for hostname. This is likely what clients used.
    advertisedurl = attr.ib()
    advertisedbaseurl = attr.ib()
    # URL scheme (part before ``://``). e.g. ``http`` or ``https``.
    urlscheme = attr.ib()
    # Value of REMOTE_USER, if set, or None.
    remoteuser = attr.ib()
    # Value of REMOTE_HOST, if set, or None.
    remotehost = attr.ib()
    # Relative WSGI application path. If defined, will begin with a
    # ``/``.
    apppath = attr.ib()
    # List of path parts to be used for dispatch.
    dispatchparts = attr.ib()
    # URL path component (no query string) used for dispatch. Can be
    # ``None`` to signal no path component given to the request, an
    # empty string to signal a request to the application's root URL,
    # or a string not beginning with ``/`` containing the requested
    # path under the application.
    dispatchpath = attr.ib()
    # The name of the repository being accessed.
    reponame = attr.ib()
    # Raw query string (part after "?" in URL).
    querystring = attr.ib()
    # multidict of query string parameters.
    qsparams = attr.ib()
    # ``Headers`` instance from the ``wsgiheaders`` module. Operates like a
    # dict with case insensitive keys.
    headers = attr.ib()
    # Request body input stream.
    bodyfh = attr.ib()
    # WSGI environment dict. NOTE(review): ``parserequestfromenv()``
    # re-encodes keys and str values to bytes on Python 3 before storing
    # it here, so it is only truly unmodified on Python 2.
    rawenv = attr.ib()
127 126
def parserequestfromenv(env, reponame=None, altbaseurl=None):
    """Parse URL components from environment variables.

    WSGI defines request attributes via environment variables. This function
    parses the environment variables into a data structure.

    If ``reponame`` is defined, the leading path components matching that
    string are effectively shifted from ``PATH_INFO`` to ``SCRIPT_NAME``.
    This simulates the world view of a WSGI application that processes
    requests from the base URL of a repo.

    If ``altbaseurl`` (typically comes from ``web.baseurl`` config option)
    is defined, it is used - instead of the WSGI environment variables - for
    constructing URL components up to and including the WSGI application path.
    For example, if the current WSGI application is at ``/repo`` and a request
    is made to ``/rev/@`` with this argument set to
    ``http://myserver:9000/prefix``, the URL and path components will resolve as
    if the request were to ``http://myserver:9000/prefix/rev/@``. In other
    words, ``wsgi.url_scheme``, ``SERVER_NAME``, ``SERVER_PORT``, and
    ``SCRIPT_NAME`` are all effectively replaced by components from this URL.

    Returns a ``parsedrequest`` instance.

    Raises ``error.ProgrammingError`` if ``reponame`` is given but
    ``PATH_INFO`` is missing, does not start with the repo name, or the repo
    name prefix does not end at a path delimiter.
    """
    # PEP 3333 defines the WSGI spec and is a useful reference for this code.

    # We first validate that the incoming object conforms with the WSGI spec.
    # We only want to be dealing with spec-conforming WSGI implementations.
    # TODO enable this once we fix internal violations.
    #wsgiref.validate.check_environ(env)

    # PEP-0333 states that environment keys and values are native strings
    # (bytes on Python 2 and str on Python 3). The code points for the Unicode
    # strings on Python 3 must be between \00000-\000FF. We deal with bytes
    # in Mercurial, so mass convert string keys and values to bytes.
    #
    # ``items()`` is used rather than ``iteritems()`` because the latter does
    # not exist on Python 3 dicts, which is the only place this branch runs.
    if pycompat.ispy3:
        env = {k.encode('latin-1'): v for k, v in env.items()}
        env = {k: v.encode('latin-1') if isinstance(v, str) else v
               for k, v in env.items()}

    if altbaseurl:
        altbaseurl = util.url(altbaseurl)

    # https://www.python.org/dev/peps/pep-0333/#environ-variables defines
    # the environment variables.
    # https://www.python.org/dev/peps/pep-0333/#url-reconstruction defines
    # how URLs are reconstructed.
    fullurl = env['wsgi.url_scheme'] + '://'

    if altbaseurl and altbaseurl.scheme:
        advertisedfullurl = altbaseurl.scheme + '://'
    else:
        advertisedfullurl = fullurl

    def addport(s, port):
        # Append ``:<port>`` unless it is the default port for the scheme
        # that ``s`` starts with (443 for https, 80 for anything else).
        if s.startswith('https://'):
            if port != '443':
                s += ':' + port
        else:
            if port != '80':
                s += ':' + port

        return s

    if env.get('HTTP_HOST'):
        # The Host header value is used verbatim; no port is appended.
        fullurl += env['HTTP_HOST']
    else:
        fullurl += env['SERVER_NAME']
        fullurl = addport(fullurl, env['SERVER_PORT'])

    if altbaseurl and altbaseurl.host:
        advertisedfullurl += altbaseurl.host

        if altbaseurl.port:
            port = altbaseurl.port
        elif altbaseurl.scheme == 'http' and not altbaseurl.port:
            port = '80'
        elif altbaseurl.scheme == 'https' and not altbaseurl.port:
            port = '443'
        else:
            port = env['SERVER_PORT']

        advertisedfullurl = addport(advertisedfullurl, port)
    else:
        advertisedfullurl += env['SERVER_NAME']
        advertisedfullurl = addport(advertisedfullurl, env['SERVER_PORT'])

    # Snapshot the <proto>://<host><port> portion before paths are appended.
    baseurl = fullurl
    advertisedbaseurl = advertisedfullurl

    fullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
    fullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if altbaseurl:
        path = altbaseurl.path or ''
        if path and not path.startswith('/'):
            path = '/' + path
        advertisedfullurl += util.urlreq.quote(path)
    else:
        advertisedfullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))

    advertisedfullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if env.get('QUERY_STRING'):
        fullurl += '?' + env['QUERY_STRING']
        advertisedfullurl += '?' + env['QUERY_STRING']

    # If ``reponame`` is defined, that must be a prefix on PATH_INFO
    # that represents the repository being dispatched to. When computing
    # the dispatch info, we ignore these leading path components.

    if altbaseurl:
        apppath = altbaseurl.path or ''
        if apppath and not apppath.startswith('/'):
            apppath = '/' + apppath
    else:
        apppath = env.get('SCRIPT_NAME', '')

    if reponame:
        repoprefix = '/' + reponame.strip('/')

        if not env.get('PATH_INFO'):
            raise error.ProgrammingError('reponame requires PATH_INFO')

        if not env['PATH_INFO'].startswith(repoprefix):
            raise error.ProgrammingError('PATH_INFO does not begin with repo '
                                         'name: %s (%s)' % (env['PATH_INFO'],
                                                            reponame))

        dispatchpath = env['PATH_INFO'][len(repoprefix):]

        if dispatchpath and not dispatchpath.startswith('/'):
            raise error.ProgrammingError('reponame prefix of PATH_INFO does '
                                         'not end at path delimiter: %s (%s)' %
                                         (env['PATH_INFO'], reponame))

        apppath = apppath.rstrip('/') + repoprefix
        dispatchparts = dispatchpath.strip('/').split('/')
        dispatchpath = '/'.join(dispatchparts)

    elif 'PATH_INFO' in env:
        if env['PATH_INFO'].strip('/'):
            dispatchparts = env['PATH_INFO'].strip('/').split('/')
            dispatchpath = '/'.join(dispatchparts)
        else:
            dispatchparts = []
            dispatchpath = ''
    else:
        dispatchparts = []
        dispatchpath = None

    querystring = env.get('QUERY_STRING', '')

    # Query string parameters go into a multidict so that repeated keys
    # keep every value, in the order they were parsed.
    qsparams = multidict()
    for k, v in util.urlreq.parseqsl(querystring, keep_blank_values=True):
        qsparams.add(k, v)

    # HTTP_* keys contain HTTP request headers. The Headers structure should
    # perform case normalization for us. We just rewrite underscore to dash
    # so keys match what likely went over the wire.
    headers = [(k[len('HTTP_'):].replace('_', '-'), v)
               for k, v in env.items() if k.startswith('HTTP_')]

    from . import wsgiheaders # avoid cycle
    headers = wsgiheaders.Headers(headers)

    # This is kind of a lie because the HTTP header wasn't explicitly
    # sent. But for all intents and purposes it should be OK to lie about
    # this, since a consumer will use either value to determine how many
    # bytes are available to read.
    if 'CONTENT_LENGTH' in env and 'HTTP_CONTENT_LENGTH' not in env:
        headers['Content-Length'] = env['CONTENT_LENGTH']

    if 'CONTENT_TYPE' in env and 'HTTP_CONTENT_TYPE' not in env:
        headers['Content-Type'] = env['CONTENT_TYPE']

    # When the body length is known, wrap the input stream so that reads
    # are capped at that many bytes.
    bodyfh = env['wsgi.input']
    if 'Content-Length' in headers:
        bodyfh = util.cappedreader(bodyfh, int(headers['Content-Length']))

    return parsedrequest(method=env['REQUEST_METHOD'],
                         url=fullurl, baseurl=baseurl,
                         advertisedurl=advertisedfullurl,
                         advertisedbaseurl=advertisedbaseurl,
                         urlscheme=env['wsgi.url_scheme'],
                         remoteuser=env.get('REMOTE_USER'),
                         remotehost=env.get('REMOTE_HOST'),
                         apppath=apppath,
                         dispatchparts=dispatchparts, dispatchpath=dispatchpath,
                         reponame=reponame,
                         querystring=querystring,
                         qsparams=qsparams,
                         headers=headers,
                         bodyfh=bodyfh,
                         rawenv=env)
323 323
class offsettrackingwriter(object):
    """Append-only, file-like wrapper around a write callable.

    Every ``write()`` is forwarded to the bound callable, and a running
    total of written data is kept so that ``tell()`` can be answered.

    This exists to wrap the ``write()`` function returned by a WSGI
    ``start_response()`` function: that value is a bare callable rather
    than a file object, so it lacks the other file object methods.
    """
    def __init__(self, writefn):
        self._offset = 0
        self._write = writefn

    def write(self, s):
        """Forward ``s`` to the wrapped callable and advance the offset."""
        written = self._write(s)
        # Not every callable reports a byte count; when it returns None,
        # assume the whole of ``s`` was written.
        self._offset += len(s) if written is None else written

    def flush(self):
        """No-op; present only for file object compatibility."""
        pass

    def tell(self):
        """Return the total amount of data written so far."""
        return self._offset
354 354
class wsgiresponse(object):
    """Represents a response to a WSGI request.

    A response consists of a status line, headers, and a body.

    Consumers must populate the ``status`` and ``headers`` fields and
    make a call to a ``setbody*()`` method before the response can be
    issued.

    When it is time to start sending the response over the wire,
    ``sendresponse()`` is called. It handles emitting the header portion
    of the response message. It then yields chunks of body data to be
    written to the peer. Typically, the WSGI application itself calls
    and returns the value from ``sendresponse()``.
    """

    def __init__(self, req, startresponse):
        """Create an empty response tied to a specific request.

        ``req`` is a ``parsedrequest``. ``startresponse`` is the
        ``start_response`` function passed to the WSGI application.
        """
        self._req = req
        self._startresponse = startresponse

        self.status = None
        from . import wsgiheaders # avoid cycle
        self.headers = wsgiheaders.Headers([])

        # Exactly one of the following three is populated by a
        # ``setbody*()`` call; ``_verifybody()`` enforces that.
        self._bodybytes = None
        self._bodygen = None
        self._bodywillwrite = False
        self._started = False
        # Populated by ``sendresponse()`` when the body will be written
        # via the ``write()`` callable returned by ``start_response``.
        self._bodywritefn = None

    def _verifybody(self):
        """Raise ProgrammingError if a body has already been defined."""
        if (self._bodybytes is not None or self._bodygen is not None
            or self._bodywillwrite):
            raise error.ProgrammingError('cannot define body multiple times')

    def setbodybytes(self, b):
        """Define the response body as static bytes.

        The empty string signals that there is no response body.

        Also sets the ``Content-Length`` header to ``len(b)``.
        """
        self._verifybody()
        self._bodybytes = b
        self.headers['Content-Length'] = '%d' % len(b)

    def setbodygen(self, gen):
        """Define the response body as a generator of bytes."""
        self._verifybody()
        self._bodygen = gen

    def setbodywillwrite(self):
        """Signal an intent to use write() to emit the response body.

        **This is the least preferred way to send a body.**

        It is preferred for WSGI applications to emit a generator of chunks
        constituting the response body. However, some consumers can't emit
        data this way. So, WSGI provides a way to obtain a ``write(data)``
        function that can be used to synchronously perform an unbuffered
        write.

        Calling this function signals an intent to produce the body in this
        manner.
        """
        self._verifybody()
        self._bodywillwrite = True

    def sendresponse(self):
        """Send the generated response to the client.

        Before this is called, ``status`` must be set and one of
        ``setbodybytes()``, ``setbodygen()`` or ``setbodywillwrite()`` must
        be called.

        This is a generator: nothing happens until it is advanced. It
        yields the chunks of body data, if any.

        Calling this method multiple times is not allowed.

        Raises ``error.ProgrammingError`` if called more than once, if the
        status or body is undefined, or if a 304 response carries headers
        or a body type that is not permitted.
        """
        if self._started:
            raise error.ProgrammingError('sendresponse() called multiple times')

        self._started = True

        if not self.status:
            raise error.ProgrammingError('status line not defined')

        if (self._bodybytes is None and self._bodygen is None
            and not self._bodywillwrite):
            raise error.ProgrammingError('response body not defined')

        # RFC 7232 Section 4.1 states that a 304 MUST generate one of
        # {Cache-Control, Content-Location, Date, ETag, Expires, Vary}
        # and SHOULD NOT generate other headers unless they could be used
        # to guide cache updates. Furthermore, RFC 7230 Section 3.3.2
        # states that no response body can be issued. Content-Length can
        # be sent. But if it is present, it should be the size of the response
        # that wasn't transferred.
        if self.status.startswith('304 '):
            # setbodybytes('') will set C-L to 0. This doesn't conform with the
            # spec. So remove it.
            if self.headers.get('Content-Length') == '0':
                del self.headers['Content-Length']

            # Strictly speaking, this is too strict. But until it causes
            # problems, let's be strict.
            badheaders = {k for k in self.headers.keys()
                          if k.lower() not in ('date', 'etag', 'expires',
                                               'cache-control',
                                               'content-location',
                                               'vary')}
            if badheaders:
                raise error.ProgrammingError(
                    'illegal header on 304 response: %s' %
                    ', '.join(sorted(badheaders)))

            if self._bodygen is not None or self._bodywillwrite:
                raise error.ProgrammingError("must use setbodybytes('') with "
                                             "304 responses")

        # Various HTTP clients (notably httplib) won't read the HTTP response
        # until the HTTP request has been sent in full. If servers (us) send a
        # response before the HTTP request has been fully sent, the connection
        # may deadlock because neither end is reading.
        #
        # We work around this by "draining" the request data before
        # sending any response in some conditions.
        drain = False
        close = False

        # If the client sent Expect: 100-continue, we assume it is smart enough
        # to deal with the server sending a response before reading the request.
        # (httplib doesn't do this.)
        if self._req.headers.get('Expect', '').lower() == '100-continue':
            pass
        # Only tend to request methods that have bodies. Strictly speaking,
        # we should sniff for a body. But this is fine for our existing
        # WSGI applications.
        elif self._req.method not in ('POST', 'PUT'):
            pass
        else:
            # If we don't know how much data to read, there's no guarantee
            # that we can drain the request responsibly. The WSGI
            # specification only says that servers *should* ensure the
            # input stream doesn't overrun the actual request. So there's
            # no guarantee that reading until EOF won't corrupt the stream
            # state.
            if not isinstance(self._req.bodyfh, util.cappedreader):
                close = True
            else:
                # We /could/ only drain certain HTTP response codes. But 200 and
                # non-200 wire protocol responses both require draining. Since
                # we have a capped reader in place for all situations where we
                # drain, it is safe to read from that stream. We'll either do
                # a drain or no-op if we're already at EOF.
                drain = True

        if close:
            self.headers['Connection'] = 'Close'

        if drain:
            assert isinstance(self._req.bodyfh, util.cappedreader)
            while True:
                chunk = self._req.bodyfh.read(32768)
                if not chunk:
                    break

        strheaders = [(pycompat.strurl(k), pycompat.strurl(v)) for
                      k, v in self.headers.items()]
        write = self._startresponse(pycompat.sysstr(self.status),
                                    strheaders)

        # Dispatch on which body kind was *defined* (identity checks), not
        # on truthiness: an empty ``setbodybytes('')`` body is legitimate
        # and must not fall through to the error branch below.
        if self._bodybytes is not None:
            # Empty bytes means "no body", so nothing is yielded.
            if self._bodybytes:
                yield self._bodybytes
        elif self._bodygen is not None:
            for chunk in self._bodygen:
                yield chunk
        elif self._bodywillwrite:
            self._bodywritefn = write
        else:
            # Should be unreachable given the "response body not defined"
            # check above; raise rather than silently send nothing.
            raise error.ProgrammingError('do not know how to send body')

    def getbodyfile(self):
        """Obtain a file object like object representing the response body.

        For this to work, you must call ``setbodywillwrite()`` and then
        ``sendresponse()`` first. ``sendresponse()`` is a generator and the
        function won't run to completion unless the generator is advanced. The
        generator yields no items. The easiest way to consume it is with
        ``list(res.sendresponse())``, which should resolve to an empty list -
        ``[]``.

        Raises ``error.ProgrammingError`` if either prerequisite call has
        not been made.
        """
        if not self._bodywillwrite:
            raise error.ProgrammingError('must call setbodywillwrite() first')

        if not self._started:
            raise error.ProgrammingError('must call sendresponse() first; did '
                                         'you remember to consume it since it '
                                         'is a generator?')

        assert self._bodywritefn
        return offsettrackingwriter(self._bodywritefn)
556 557
def wsgiapplication(app_maker):
    """Wrap the application built by ``app_maker`` as a WSGI callable.

    For compatibility with old CGI scripts. A plain hgweb() or hgwebdir()
    can and should now be used as a WSGI application.

    ``app_maker`` is called once, immediately; the returned function
    forwards each ``(env, respond)`` call to the object it produced.
    """
    wsgiapp = app_maker()

    def run_wsgi(env, respond):
        return wsgiapp(env, respond)

    return run_wsgi
General Comments 0
You need to be logged in to leave comments. Login now