##// END OF EJS Templates
hgweb: put response headers back into str for Python 3...
Augie Fackler -
r37607:e320d940 default
parent child Browse files
Show More
@@ -1,561 +1,563
1 1 # hgweb/request.py - An http request from either CGI or the standalone server.
2 2 #
3 3 # Copyright 21 May 2005 - (c) 2005 Jake Edge <jake@edge2.net>
4 4 # Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 import wsgiref.headers as wsgiheaders
12 12 #import wsgiref.validate
13 13
14 14 from ..thirdparty import (
15 15 attr,
16 16 )
17 17 from .. import (
18 18 error,
19 19 pycompat,
20 20 util,
21 21 )
22 22
class multidict(object):
    """A dict like object that can store multiple values for a key.

    Used to store parsed request parameters.

    This is inspired by WebOb's class of the same name.
    """
    def __init__(self):
        # Maps each key to the list of values recorded for it, in the
        # order they were added.
        self._items = {}

    def __getitem__(self, key):
        """Returns the last set value for a key.

        Raises KeyError if the key is not defined.
        """
        return self._items[key][-1]

    def __setitem__(self, key, value):
        """Replace all values for a key with a single new value."""
        self._items[key] = [value]

    def __delitem__(self, key):
        """Delete all values for a key."""
        del self._items[key]

    def __contains__(self, key):
        return key in self._items

    def __len__(self):
        """Return the number of distinct keys (not the number of values)."""
        return len(self._items)

    def get(self, key, default=None):
        """Return the last set value for a key, or ``default`` if unset."""
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def add(self, key, value):
        """Add a new value for a key. Does not replace existing values."""
        self._items.setdefault(key, []).append(value)

    def getall(self, key):
        """Obtains all values for a key.

        Returns an empty list if the key is not defined. For a defined key
        the internally stored list is returned, not a copy.
        """
        return self._items.get(key, [])

    def getone(self, key):
        """Obtain a single value for a key.

        Raises KeyError if key not defined or it has multiple values set.
        """
        vals = self._items[key]

        if len(vals) > 1:
            raise KeyError('multiple values for %r' % key)

        return vals[0]

    def asdictoflists(self):
        """Return a plain dict mapping each key to a copy of its value list."""
        # dict.items() exists on both Python 2 and 3; dict.iteritems() was
        # removed in Python 3.
        return {k: list(v) for k, v in self._items.items()}
79 79
@attr.s(frozen=True)
class parsedrequest(object):
    """Immutable record describing a parsed WSGI request.

    Holds the parsed request parameters together with a handle on the
    request's input stream.
    """

    # HTTP request method.
    method = attr.ib()

    # Complete URL of this request.
    url = attr.ib()

    # URL stripped of all path components: just <proto>://<host><port>.
    baseurl = attr.ib()

    # Advertised counterparts of ``url`` and ``baseurl``. These use
    # SERVER_NAME rather than the HTTP Host header for the hostname and are
    # likely what clients used.
    advertisedurl = attr.ib()
    advertisedbaseurl = attr.ib()

    # URL scheme, i.e. the part before ``://`` (``http`` or ``https``).
    urlscheme = attr.ib()

    # REMOTE_USER value if set, otherwise None.
    remoteuser = attr.ib()

    # REMOTE_HOST value if set, otherwise None.
    remotehost = attr.ib()

    # Relative WSGI application path. When defined it begins with ``/``.
    apppath = attr.ib()

    # Path parts (as a list) used for dispatch.
    dispatchparts = attr.ib()

    # URL path component, without query string, used for dispatch. It is
    # ``None`` when the request carried no path component, the empty string
    # for a request to the application's root URL, and otherwise a string
    # not beginning with ``/`` naming the requested path under the
    # application.
    dispatchpath = attr.ib()

    # Name of the repository being accessed.
    reponame = attr.ib()

    # Raw query string (everything after "?" in the URL).
    querystring = attr.ib()

    # Query string parameters as a multidict.
    qsparams = attr.ib()

    # wsgiref.headers.Headers instance; behaves like a dict whose keys are
    # case insensitive.
    headers = attr.ib()

    # Input stream for the request body.
    bodyfh = attr.ib()

    # The WSGI environment dict, unmodified.
    rawenv = attr.ib()
127 127
def parserequestfromenv(env, reponame=None, altbaseurl=None):
    """Parse URL components from environment variables.

    WSGI defines request attributes via environment variables. This function
    parses the environment variables into a data structure.

    If ``reponame`` is defined, the leading path components matching that
    string are effectively shifted from ``PATH_INFO`` to ``SCRIPT_NAME``.
    This simulates the world view of a WSGI application that processes
    requests from the base URL of a repo.

    If ``altbaseurl`` (typically comes from ``web.baseurl`` config option)
    is defined, it is used - instead of the WSGI environment variables - for
    constructing URL components up to and including the WSGI application path.
    For example, if the current WSGI application is at ``/repo`` and a request
    is made to ``/rev/@`` with this argument set to
    ``http://myserver:9000/prefix``, the URL and path components will resolve as
    if the request were to ``http://myserver:9000/prefix/rev/@``. In other
    words, ``wsgi.url_scheme``, ``SERVER_NAME``, ``SERVER_PORT``, and
    ``SCRIPT_NAME`` are all effectively replaced by components from this URL.

    Returns a ``parsedrequest`` instance.

    Raises ``error.ProgrammingError`` if ``reponame`` is given but
    ``PATH_INFO`` is empty, does not begin with the repo name, or the repo
    name does not end at a path delimiter within ``PATH_INFO``.
    """
    # PEP 3333 defines the WSGI spec and is a useful reference for this code.

    # We first validate that the incoming object conforms with the WSGI spec.
    # We only want to be dealing with spec-conforming WSGI implementations.
    # TODO enable this once we fix internal violations.
    #wsgiref.validate.check_environ(env)

    # PEP-0333 states that environment keys and values are native strings
    # (bytes on Python 2 and str on Python 3). The code points for the Unicode
    # strings on Python 3 must be between \00000-\000FF. We deal with bytes
    # in Mercurial, so mass convert string keys and values to bytes.
    if pycompat.ispy3:
        # dict.iteritems() does not exist on Python 3, so use items(), which
        # is available on both major versions.
        env = {k.encode('latin-1'): v for k, v in env.items()}
        env = {k: v.encode('latin-1') if isinstance(v, str) else v
               for k, v in env.items()}

    if altbaseurl:
        altbaseurl = util.url(altbaseurl)

    # https://www.python.org/dev/peps/pep-0333/#environ-variables defines
    # the environment variables.
    # https://www.python.org/dev/peps/pep-0333/#url-reconstruction defines
    # how URLs are reconstructed.
    fullurl = env['wsgi.url_scheme'] + '://'

    if altbaseurl and altbaseurl.scheme:
        advertisedfullurl = altbaseurl.scheme + '://'
    else:
        advertisedfullurl = fullurl

    def addport(s, port):
        # Append ":<port>" unless it is the default port for the scheme
        # that ``s`` starts with.
        if s.startswith('https://'):
            if port != '443':
                s += ':' + port
        else:
            if port != '80':
                s += ':' + port

        return s

    if env.get('HTTP_HOST'):
        fullurl += env['HTTP_HOST']
    else:
        fullurl += env['SERVER_NAME']
        fullurl = addport(fullurl, env['SERVER_PORT'])

    if altbaseurl and altbaseurl.host:
        advertisedfullurl += altbaseurl.host

        if altbaseurl.port:
            port = altbaseurl.port
        elif altbaseurl.scheme == 'http' and not altbaseurl.port:
            port = '80'
        elif altbaseurl.scheme == 'https' and not altbaseurl.port:
            port = '443'
        else:
            port = env['SERVER_PORT']

        advertisedfullurl = addport(advertisedfullurl, port)
    else:
        advertisedfullurl += env['SERVER_NAME']
        advertisedfullurl = addport(advertisedfullurl, env['SERVER_PORT'])

    # Snapshot the URLs before any path components are appended.
    baseurl = fullurl
    advertisedbaseurl = advertisedfullurl

    fullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
    fullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if altbaseurl:
        path = altbaseurl.path or ''
        if path and not path.startswith('/'):
            path = '/' + path
        advertisedfullurl += util.urlreq.quote(path)
    else:
        advertisedfullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))

    advertisedfullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if env.get('QUERY_STRING'):
        fullurl += '?' + env['QUERY_STRING']
        advertisedfullurl += '?' + env['QUERY_STRING']

    # If ``reponame`` is defined, that must be a prefix on PATH_INFO
    # that represents the repository being dispatched to. When computing
    # the dispatch info, we ignore these leading path components.

    if altbaseurl:
        apppath = altbaseurl.path or ''
        if apppath and not apppath.startswith('/'):
            apppath = '/' + apppath
    else:
        apppath = env.get('SCRIPT_NAME', '')

    if reponame:
        repoprefix = '/' + reponame.strip('/')

        if not env.get('PATH_INFO'):
            raise error.ProgrammingError('reponame requires PATH_INFO')

        if not env['PATH_INFO'].startswith(repoprefix):
            raise error.ProgrammingError('PATH_INFO does not begin with repo '
                                         'name: %s (%s)' % (env['PATH_INFO'],
                                                            reponame))

        dispatchpath = env['PATH_INFO'][len(repoprefix):]

        if dispatchpath and not dispatchpath.startswith('/'):
            raise error.ProgrammingError('reponame prefix of PATH_INFO does '
                                         'not end at path delimiter: %s (%s)' %
                                         (env['PATH_INFO'], reponame))

        apppath = apppath.rstrip('/') + repoprefix
        dispatchparts = dispatchpath.strip('/').split('/')
        dispatchpath = '/'.join(dispatchparts)

    elif 'PATH_INFO' in env:
        if env['PATH_INFO'].strip('/'):
            dispatchparts = env['PATH_INFO'].strip('/').split('/')
            dispatchpath = '/'.join(dispatchparts)
        else:
            dispatchparts = []
            dispatchpath = ''
    else:
        dispatchparts = []
        dispatchpath = None

    querystring = env.get('QUERY_STRING', '')

    # We store as a list so we have ordering information. We also store as
    # a dict to facilitate fast lookup.
    qsparams = multidict()
    for k, v in util.urlreq.parseqsl(querystring, keep_blank_values=True):
        qsparams.add(k, v)

    # HTTP_* keys contain HTTP request headers. The Headers structure should
    # perform case normalization for us. We just rewrite underscore to dash
    # so keys match what likely went over the wire.
    headers = []
    for k, v in env.items():
        if k.startswith('HTTP_'):
            headers.append((k[len('HTTP_'):].replace('_', '-'), v))

    headers = wsgiheaders.Headers(headers)

    # This is kind of a lie because the HTTP header wasn't explicitly
    # sent. But for all intents and purposes it should be OK to lie about
    # this, since a consumer will use either value to determine how many
    # bytes are available to read.
    if 'CONTENT_LENGTH' in env and 'HTTP_CONTENT_LENGTH' not in env:
        headers['Content-Length'] = env['CONTENT_LENGTH']

    if 'CONTENT_TYPE' in env and 'HTTP_CONTENT_TYPE' not in env:
        headers['Content-Type'] = env['CONTENT_TYPE']

    # When the body length is known, cap reads so consumers cannot read past
    # the end of this request's body.
    bodyfh = env['wsgi.input']
    if 'Content-Length' in headers:
        bodyfh = util.cappedreader(bodyfh, int(headers['Content-Length']))

    return parsedrequest(method=env['REQUEST_METHOD'],
                         url=fullurl, baseurl=baseurl,
                         advertisedurl=advertisedfullurl,
                         advertisedbaseurl=advertisedbaseurl,
                         urlscheme=env['wsgi.url_scheme'],
                         remoteuser=env.get('REMOTE_USER'),
                         remotehost=env.get('REMOTE_HOST'),
                         apppath=apppath,
                         dispatchparts=dispatchparts, dispatchpath=dispatchpath,
                         reponame=reponame,
                         querystring=querystring,
                         qsparams=qsparams,
                         headers=headers,
                         bodyfh=bodyfh,
                         rawenv=env)
323 323
class offsettrackingwriter(object):
    """Append-only, file-like wrapper around a write callable.

    Every ``write()`` is forwarded to the callable the instance was
    constructed with. The instance keeps a running total of the data
    written so that ``tell()`` can be answered.

    This exists to wrap the ``write()`` function returned by a WSGI
    ``start_response()`` function: that ``write()`` is a bare callable
    rather than a file object, so it lacks the other file object methods.
    """
    def __init__(self, writefn):
        self._write = writefn
        self._offset = 0

    def write(self, s):
        written = self._write(s)
        # Not every write callable reports how much it wrote; when it
        # returns None, count the full length of the data handed to it.
        self._offset += len(s) if written is None else written

    def flush(self):
        # Nothing is buffered here, so there is nothing to flush.
        pass

    def tell(self):
        return self._offset
354 354
class wsgiresponse(object):
    """Represents a response to a WSGI request.

    A response consists of a status line, headers, and a body.

    Consumers must populate the ``status`` and ``headers`` fields and
    make a call to a ``setbody*()`` method before the response can be
    issued.

    When it is time to start sending the response over the wire,
    ``sendresponse()`` is called. It handles emitting the header portion
    of the response message. It then yields chunks of body data to be
    written to the peer. Typically, the WSGI application itself calls
    and returns the value from ``sendresponse()``.
    """

    def __init__(self, req, startresponse):
        """Create an empty response tied to a specific request.

        ``req`` is a ``parsedrequest``. ``startresponse`` is the
        ``start_response`` function passed to the WSGI application.
        """
        self._req = req
        self._startresponse = startresponse

        self.status = None
        self.headers = wsgiheaders.Headers([])

        # At most one of the following three is ever set; see _verifybody().
        self._bodybytes = None
        self._bodygen = None
        self._bodywillwrite = False
        self._started = False
        # Populated by sendresponse() when the body is sent via write().
        self._bodywritefn = None

    def _verifybody(self):
        """Raise ProgrammingError if a body has already been defined."""
        if (self._bodybytes is not None or self._bodygen is not None
            or self._bodywillwrite):
            raise error.ProgrammingError('cannot define body multiple times')

    def setbodybytes(self, b):
        """Define the response body as static bytes.

        The empty string signals that there is no response body.

        Also sets the ``Content-Length`` header to the length of ``b``.
        """
        self._verifybody()
        self._bodybytes = b
        self.headers['Content-Length'] = '%d' % len(b)

    def setbodygen(self, gen):
        """Define the response body as a generator of bytes."""
        self._verifybody()
        self._bodygen = gen

    def setbodywillwrite(self):
        """Signal an intent to use write() to emit the response body.

        **This is the least preferred way to send a body.**

        It is preferred for WSGI applications to emit a generator of chunks
        constituting the response body. However, some consumers can't emit
        data this way. So, WSGI provides a way to obtain a ``write(data)``
        function that can be used to synchronously perform an unbuffered
        write.

        Calling this function signals an intent to produce the body in this
        manner.
        """
        self._verifybody()
        self._bodywillwrite = True

    def sendresponse(self):
        """Send the generated response to the client.

        Before this is called, ``status`` must be set and one of
        ``setbodybytes()``, ``setbodygen()`` or ``setbodywillwrite()`` must
        be called.

        This is a generator: nothing happens until it is advanced. It yields
        the chunks of the response body (none when the body is empty or when
        the body will be emitted via ``write()``).

        Calling this method multiple times is not allowed.

        Raises ``error.ProgrammingError`` on misuse: called more than once,
        no status line, no body defined, or a 304 response that carries
        disallowed headers or a non-bytes body.
        """
        if self._started:
            raise error.ProgrammingError('sendresponse() called multiple times')

        self._started = True

        if not self.status:
            raise error.ProgrammingError('status line not defined')

        if (self._bodybytes is None and self._bodygen is None
            and not self._bodywillwrite):
            raise error.ProgrammingError('response body not defined')

        # RFC 7232 Section 4.1 states that a 304 MUST generate one of
        # {Cache-Control, Content-Location, Date, ETag, Expires, Vary}
        # and SHOULD NOT generate other headers unless they could be used
        # to guide cache updates. Furthermore, RFC 7230 Section 3.3.2
        # states that no response body can be issued. Content-Length can
        # be sent. But if it is present, it should be the size of the response
        # that wasn't transferred.
        if self.status.startswith('304 '):
            # setbodybytes('') will set C-L to 0. This doesn't conform with the
            # spec. So remove it.
            if self.headers.get('Content-Length') == '0':
                del self.headers['Content-Length']

            # Strictly speaking, this is too strict. But until it causes
            # problems, let's be strict.
            badheaders = {k for k in self.headers.keys()
                          if k.lower() not in ('date', 'etag', 'expires',
                                               'cache-control',
                                               'content-location',
                                               'vary')}
            if badheaders:
                raise error.ProgrammingError(
                    'illegal header on 304 response: %s' %
                    ', '.join(sorted(badheaders)))

            if self._bodygen is not None or self._bodywillwrite:
                raise error.ProgrammingError("must use setbodybytes('') with "
                                             "304 responses")

        # Various HTTP clients (notably httplib) won't read the HTTP response
        # until the HTTP request has been sent in full. If servers (us) send a
        # response before the HTTP request has been fully sent, the connection
        # may deadlock because neither end is reading.
        #
        # We work around this by "draining" the request data before
        # sending any response in some conditions.
        drain = False
        close = False

        # If the client sent Expect: 100-continue, we assume it is smart enough
        # to deal with the server sending a response before reading the request.
        # (httplib doesn't do this.)
        if self._req.headers.get('Expect', '').lower() == '100-continue':
            pass
        # Only tend to request methods that have bodies. Strictly speaking,
        # we should sniff for a body. But this is fine for our existing
        # WSGI applications.
        elif self._req.method not in ('POST', 'PUT'):
            pass
        else:
            # If we don't know how much data to read, there's no guarantee
            # that we can drain the request responsibly. The WSGI
            # specification only says that servers *should* ensure the
            # input stream doesn't overrun the actual request. So there's
            # no guarantee that reading until EOF won't corrupt the stream
            # state.
            if not isinstance(self._req.bodyfh, util.cappedreader):
                close = True
            else:
                # We /could/ only drain certain HTTP response codes. But 200 and
                # non-200 wire protocol responses both require draining. Since
                # we have a capped reader in place for all situations where we
                # drain, it is safe to read from that stream. We'll either do
                # a drain or no-op if we're already at EOF.
                drain = True

        if close:
            self.headers['Connection'] = 'Close'

        if drain:
            assert isinstance(self._req.bodyfh, util.cappedreader)
            while True:
                chunk = self._req.bodyfh.read(32768)
                if not chunk:
                    break

        # Hand start_response() header names and values converted with
        # pycompat.strurl(), and the status converted with pycompat.sysstr(),
        # so that they are str on Python 3.
        strheaders = [(pycompat.strurl(k), pycompat.strurl(v)) for
                      k, v in self.headers.items()]
        write = self._startresponse(pycompat.sysstr(self.status),
                                    strheaders)

        if self._bodybytes is not None:
            # An empty static body means "no response body": yield nothing.
            if self._bodybytes:
                yield self._bodybytes
        elif self._bodygen is not None:
            for chunk in self._bodygen:
                yield chunk
        elif self._bodywillwrite:
            self._bodywritefn = write
        else:
            # Cannot happen given the "response body not defined" check
            # above, but fail loudly instead of silently sending nothing.
            raise error.ProgrammingError('do not know how to send body')

    def getbodyfile(self):
        """Obtain a file object like object representing the response body.

        For this to work, you must call ``setbodywillwrite()`` and then
        ``sendresponse()`` first. ``sendresponse()`` is a generator and the
        function won't run to completion unless the generator is advanced. The
        generator yields no items. The easiest way to consume it is with
        ``list(res.sendresponse())``, which should resolve to an empty list -
        ``[]``.

        Returns an ``offsettrackingwriter`` wrapping the ``write()`` function
        obtained from ``start_response()``.
        """
        if not self._bodywillwrite:
            raise error.ProgrammingError('must call setbodywillwrite() first')

        if not self._started:
            raise error.ProgrammingError('must call sendresponse() first; did '
                                         'you remember to consume it since it '
                                         'is a generator?')

        assert self._bodywritefn
        return offsettrackingwriter(self._bodywritefn)
554 556
def wsgiapplication(app_maker):
    """Wrap an application factory for compatibility with old CGI scripts.

    ``app_maker`` is called once, immediately, to build the application.
    The returned callable forwards ``(env, respond)`` to that application
    and returns its result. A plain hgweb() or hgwebdir() can and should
    now be used as a WSGI application instead.
    """
    wsgiapp = app_maker()

    def run_wsgi(env, respond):
        return wsgiapp(env, respond)

    return run_wsgi
General Comments 0
You need to be logged in to leave comments. Login now