##// END OF EJS Templates
hgweb: cast bytearray to bytes...
Gregory Szorc -
r40612:6107d454 stable
parent child Browse files
Show More
@@ -1,576 +1,582
1 1 # hgweb/request.py - An http request from either CGI or the standalone server.
2 2 #
3 3 # Copyright 21 May 2005 - (c) 2005 Jake Edge <jake@edge2.net>
4 4 # Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 #import wsgiref.validate
12 12
13 13 from ..thirdparty import (
14 14 attr,
15 15 )
16 16 from .. import (
17 17 error,
18 18 pycompat,
19 19 util,
20 20 )
21 21
class multidict(object):
    """A dict like object that can store multiple values for a key.

    Used to store parsed request parameters.

    This is inspired by WebOb's class of the same name.

    Each key maps to a list of values kept in the order they were added.
    Plain item access (``d[key]``) reads the most recently added value.
    """
    def __init__(self):
        # key -> list of values recorded for that key, oldest first.
        self._items = {}

    def __getitem__(self, key):
        """Returns the last set value for a key.

        Raises KeyError if the key is not defined.
        """
        return self._items[key][-1]

    def __setitem__(self, key, value):
        """Replace all values for a key with a single new value."""
        self._items[key] = [value]

    def __delitem__(self, key):
        """Delete all values for a key."""
        del self._items[key]

    def __contains__(self, key):
        return key in self._items

    def __len__(self):
        """Returns the number of distinct keys (not the number of values)."""
        return len(self._items)

    def get(self, key, default=None):
        """Returns the last set value for a key, or ``default`` if missing."""
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def add(self, key, value):
        """Add a new value for a key. Does not replace existing values."""
        self._items.setdefault(key, []).append(value)

    def getall(self, key):
        """Obtains all values for a key.

        Returns an empty list if the key is not defined.
        """
        return self._items.get(key, [])

    def getone(self, key):
        """Obtain a single value for a key.

        Raises KeyError if key not defined or it has multiple values set.
        """
        vals = self._items[key]

        if len(vals) > 1:
            raise KeyError('multiple values for %r' % key)

        return vals[0]

    def asdictoflists(self):
        """Returns a plain dict mapping each key to a copy of its value list.

        The lists are copies, so mutating the result does not affect this
        instance.
        """
        # ``items()`` exists on both Python 2 and Python 3 dicts, whereas
        # ``iteritems()`` is Python 2 only.
        return {k: list(v) for k, v in self._items.items()}
78 78
@attr.s(frozen=True)
class parsedrequest(object):
    """Represents a parsed WSGI request.

    Contains both parsed parameters as well as a handle on the input stream.

    Instances are built by ``parserequestfromenv()``, which passes every
    attribute below by keyword.
    """

    # Request method (value of ``REQUEST_METHOD``).
    method = attr.ib()
    # Full URL for this request, including path and query string.
    url = attr.ib()
    # URL without any path components. Just <proto>://<host><port>.
    baseurl = attr.ib()
    # Advertised URL. Like ``url`` and ``baseurl`` but uses SERVER_NAME instead
    # of HTTP: Host header for hostname. This is likely what clients used.
    advertisedurl = attr.ib()
    # Advertised counterpart of ``baseurl`` (no path components).
    advertisedbaseurl = attr.ib()
    # URL scheme (part before ``://``). e.g. ``http`` or ``https``.
    urlscheme = attr.ib()
    # Value of REMOTE_USER, if set, or None.
    remoteuser = attr.ib()
    # Value of REMOTE_HOST, if set, or None.
    remotehost = attr.ib()
    # Relative WSGI application path. If defined, will begin with a
    # ``/``.
    apppath = attr.ib()
    # List of path parts to be used for dispatch.
    dispatchparts = attr.ib()
    # URL path component (no query string) used for dispatch. Can be
    # ``None`` to signal no path component given to the request, an
    # empty string to signal a request to the application's root URL,
    # or a string not beginning with ``/`` containing the requested
    # path under the application.
    dispatchpath = attr.ib()
    # The name of the repository being accessed.
    reponame = attr.ib()
    # Raw query string (part after "?" in URL).
    querystring = attr.ib()
    # multidict of query string parameters.
    qsparams = attr.ib()
    # wsgiref.headers.Headers instance. Operates like a dict with case
    # insensitive keys.
    headers = attr.ib()
    # Request body input stream.
    bodyfh = attr.ib()
    # WSGI environment dict as used by ``parserequestfromenv()`` (on
    # Python 3 that function converts str keys/values to bytes first).
    rawenv = attr.ib()
126 126
def parserequestfromenv(env, reponame=None, altbaseurl=None, bodyfh=None):
    """Parse URL components from environment variables.

    WSGI defines request attributes via environment variables. This function
    parses the environment variables into a data structure.

    If ``reponame`` is defined, the leading path components matching that
    string are effectively shifted from ``PATH_INFO`` to ``SCRIPT_NAME``.
    This simulates the world view of a WSGI application that processes
    requests from the base URL of a repo. If ``reponame`` is not given, the
    ``REPO_NAME`` environment key is used when present.

    If ``altbaseurl`` (typically comes from ``web.baseurl`` config option)
    is defined, it is used - instead of the WSGI environment variables - for
    constructing URL components up to and including the WSGI application path.
    For example, if the current WSGI application is at ``/repo`` and a request
    is made to ``/rev/@`` with this argument set to
    ``http://myserver:9000/prefix``, the URL and path components will resolve as
    if the request were to ``http://myserver:9000/prefix/rev/@``. In other
    words, ``wsgi.url_scheme``, ``SERVER_NAME``, ``SERVER_PORT``, and
    ``SCRIPT_NAME`` are all effectively replaced by components from this URL.

    ``bodyfh`` can be used to specify a file object to read the request body
    from. If not defined, ``wsgi.input`` from the environment dict is used.

    Returns a ``parsedrequest`` instance.

    Raises ``error.ProgrammingError`` if a repo name is in effect and
    ``PATH_INFO`` is missing or empty, does not start with the repo name, or
    the repo name prefix does not end at a path delimiter.
    """
    # PEP 3333 defines the WSGI spec and is a useful reference for this code.

    # We first validate that the incoming object conforms with the WSGI spec.
    # We only want to be dealing with spec-conforming WSGI implementations.
    # TODO enable this once we fix internal violations.
    #wsgiref.validate.check_environ(env)

    # PEP-0333 states that environment keys and values are native strings
    # (bytes on Python 2 and str on Python 3). The code points for the Unicode
    # strings on Python 3 must be between \00000-\000FF. We deal with bytes
    # in Mercurial, so mass convert string keys and values to bytes.
    if pycompat.ispy3:
        # ``items()`` rather than ``iteritems()``: the latter does not exist
        # on Python 3 dicts.
        env = {k.encode('latin-1'): v for k, v in env.items()}
        env = {k: v.encode('latin-1') if isinstance(v, str) else v
               for k, v in env.items()}

    # Some hosting solutions are emulating hgwebdir, and dispatching directly
    # to an hgweb instance using this environment variable. This was always
    # checked prior to d7fd203e36cc; keep doing so to avoid breaking them.
    if not reponame:
        reponame = env.get('REPO_NAME')

    if altbaseurl:
        altbaseurl = util.url(altbaseurl)

    # https://www.python.org/dev/peps/pep-0333/#environ-variables defines
    # the environment variables.
    # https://www.python.org/dev/peps/pep-0333/#url-reconstruction defines
    # how URLs are reconstructed.
    fullurl = env['wsgi.url_scheme'] + '://'

    if altbaseurl and altbaseurl.scheme:
        advertisedfullurl = altbaseurl.scheme + '://'
    else:
        advertisedfullurl = fullurl

    def addport(s, port):
        # Append ``:<port>`` unless it is the default port for the scheme
        # that ``s`` starts with.
        defaultport = '443' if s.startswith('https://') else '80'
        if port != defaultport:
            s += ':' + port

        return s

    if env.get('HTTP_HOST'):
        fullurl += env['HTTP_HOST']
    else:
        fullurl += env['SERVER_NAME']
        fullurl = addport(fullurl, env['SERVER_PORT'])

    if altbaseurl and altbaseurl.host:
        advertisedfullurl += altbaseurl.host

        # Prefer an explicit port, then the scheme's default port, and only
        # fall back to the server's port when the scheme is unknown.
        if altbaseurl.port:
            port = altbaseurl.port
        elif altbaseurl.scheme == 'http':
            port = '80'
        elif altbaseurl.scheme == 'https':
            port = '443'
        else:
            port = env['SERVER_PORT']

        advertisedfullurl = addport(advertisedfullurl, port)
    else:
        advertisedfullurl += env['SERVER_NAME']
        advertisedfullurl = addport(advertisedfullurl, env['SERVER_PORT'])

    # Snapshot the URLs before any path components are appended.
    baseurl = fullurl
    advertisedbaseurl = advertisedfullurl

    fullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
    fullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if altbaseurl:
        path = altbaseurl.path or ''
        if path and not path.startswith('/'):
            path = '/' + path
        advertisedfullurl += util.urlreq.quote(path)
    else:
        advertisedfullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))

    advertisedfullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if env.get('QUERY_STRING'):
        fullurl += '?' + env['QUERY_STRING']
        advertisedfullurl += '?' + env['QUERY_STRING']

    # If ``reponame`` is defined, that must be a prefix on PATH_INFO
    # that represents the repository being dispatched to. When computing
    # the dispatch info, we ignore these leading path components.

    if altbaseurl:
        apppath = altbaseurl.path or ''
        if apppath and not apppath.startswith('/'):
            apppath = '/' + apppath
    else:
        apppath = env.get('SCRIPT_NAME', '')

    if reponame:
        repoprefix = '/' + reponame.strip('/')

        if not env.get('PATH_INFO'):
            raise error.ProgrammingError('reponame requires PATH_INFO')

        if not env['PATH_INFO'].startswith(repoprefix):
            raise error.ProgrammingError('PATH_INFO does not begin with repo '
                                         'name: %s (%s)' % (env['PATH_INFO'],
                                                            reponame))

        dispatchpath = env['PATH_INFO'][len(repoprefix):]

        if dispatchpath and not dispatchpath.startswith('/'):
            raise error.ProgrammingError('reponame prefix of PATH_INFO does '
                                         'not end at path delimiter: %s (%s)' %
                                         (env['PATH_INFO'], reponame))

        apppath = apppath.rstrip('/') + repoprefix
        dispatchparts = dispatchpath.strip('/').split('/')
        dispatchpath = '/'.join(dispatchparts)

    elif 'PATH_INFO' in env:
        if env['PATH_INFO'].strip('/'):
            dispatchparts = env['PATH_INFO'].strip('/').split('/')
            dispatchpath = '/'.join(dispatchparts)
        else:
            dispatchparts = []
            dispatchpath = ''
    else:
        dispatchparts = []
        dispatchpath = None

    querystring = env.get('QUERY_STRING', '')

    # We store as a list so we have ordering information. We also store as
    # a dict to facilitate fast lookup.
    qsparams = multidict()
    for k, v in util.urlreq.parseqsl(querystring, keep_blank_values=True):
        qsparams.add(k, v)

    # HTTP_* keys contain HTTP request headers. The Headers structure should
    # perform case normalization for us. We just rewrite underscore to dash
    # so keys match what likely went over the wire.
    headers = []
    for k, v in env.items():
        if k.startswith('HTTP_'):
            headers.append((k[len('HTTP_'):].replace('_', '-'), v))

    from . import wsgiheaders # avoid cycle
    headers = wsgiheaders.Headers(headers)

    # This is kind of a lie because the HTTP header wasn't explicitly
    # sent. But for all intents and purposes it should be OK to lie about
    # this, since a consumer will use either value to determine how many
    # bytes are available to read.
    if 'CONTENT_LENGTH' in env and 'HTTP_CONTENT_LENGTH' not in env:
        headers['Content-Length'] = env['CONTENT_LENGTH']

    if 'CONTENT_TYPE' in env and 'HTTP_CONTENT_TYPE' not in env:
        headers['Content-Type'] = env['CONTENT_TYPE']

    if bodyfh is None:
        bodyfh = env['wsgi.input']
        # Cap reads at the declared length; an empty Content-Length value is
        # treated as 0.
        if 'Content-Length' in headers:
            bodyfh = util.cappedreader(bodyfh,
                                       int(headers['Content-Length'] or '0'))

    return parsedrequest(method=env['REQUEST_METHOD'],
                         url=fullurl, baseurl=baseurl,
                         advertisedurl=advertisedfullurl,
                         advertisedbaseurl=advertisedbaseurl,
                         urlscheme=env['wsgi.url_scheme'],
                         remoteuser=env.get('REMOTE_USER'),
                         remotehost=env.get('REMOTE_HOST'),
                         apppath=apppath,
                         dispatchparts=dispatchparts, dispatchpath=dispatchpath,
                         reponame=reponame,
                         querystring=querystring,
                         qsparams=qsparams,
                         headers=headers,
                         bodyfh=bodyfh,
                         rawenv=env)
334 334
class offsettrackingwriter(object):
    """Append-only, file-like wrapper that counts how much was written.

    An instance is bound to a callable, which is invoked with the data on
    every ``write()``. The running total of written data is kept so that
    ``tell()`` can be answered.

    This exists to wrap the ``write()`` function returned by a WSGI
    ``start_response()`` function: that ``write()`` is a bare callable, not
    a file object, so it lacks the other file object methods.
    """
    def __init__(self, writefn):
        self._offset = 0
        self._write = writefn

    def write(self, s):
        written = self._write(s)
        # Not every callable reports a byte count; when it returns None,
        # count the length of the data handed to it.
        self._offset += len(s) if written is None else written

    def flush(self):
        # Nothing is buffered here, so there is nothing to flush.
        pass

    def tell(self):
        return self._offset
365 365
class wsgiresponse(object):
    """Represents a response to a WSGI request.

    A response consists of a status line, headers, and a body.

    Consumers must populate the ``status`` and ``headers`` fields and
    make a call to a ``setbody*()`` method before the response can be
    issued.

    When it is time to start sending the response over the wire,
    ``sendresponse()`` is called. It handles emitting the header portion
    of the response message. It then yields chunks of body data to be
    written to the peer. Typically, the WSGI application itself calls
    and returns the value from ``sendresponse()``.
    """

    def __init__(self, req, startresponse):
        """Create an empty response tied to a specific request.

        ``req`` is a ``parsedrequest``. ``startresponse`` is the
        ``start_response`` function passed to the WSGI application.
        """
        self._req = req
        self._startresponse = startresponse

        self.status = None
        from . import wsgiheaders # avoid cycle
        self.headers = wsgiheaders.Headers([])

        # Exactly one of these three is populated by a ``setbody*()`` call.
        self._bodybytes = None
        self._bodygen = None
        self._bodywillwrite = False
        self._started = False
        # ``write()`` callable from ``start_response``; only recorded when
        # the body is sent via ``setbodywillwrite()``.
        self._bodywritefn = None

    def _verifybody(self):
        """Raise ProgrammingError if a body has already been defined."""
        if (self._bodybytes is not None or self._bodygen is not None
            or self._bodywillwrite):
            raise error.ProgrammingError('cannot define body multiple times')

    def setbodybytes(self, b):
        """Define the response body as static bytes.

        The empty string signals that there is no response body.

        Also sets the ``Content-Length`` header to the length of ``b``.
        """
        self._verifybody()
        self._bodybytes = b
        self.headers['Content-Length'] = '%d' % len(b)

    def setbodygen(self, gen):
        """Define the response body as a generator of bytes."""
        self._verifybody()
        self._bodygen = gen

    def setbodywillwrite(self):
        """Signal an intent to use write() to emit the response body.

        **This is the least preferred way to send a body.**

        It is preferred for WSGI applications to emit a generator of chunks
        constituting the response body. However, some consumers can't emit
        data this way. So, WSGI provides a way to obtain a ``write(data)``
        function that can be used to synchronously perform an unbuffered
        write.

        Calling this function signals an intent to produce the body in this
        manner.
        """
        self._verifybody()
        self._bodywillwrite = True

    def sendresponse(self):
        """Send the generated response to the client.

        Before this is called, ``status`` must be set and one of
        ``setbodybytes()``, ``setbodygen()`` or ``setbodywillwrite()`` must
        be called.

        This is a generator: nothing happens until it is advanced. It
        yields the chunks of the response body, if any.

        Calling this method multiple times is not allowed.

        Raises ``error.ProgrammingError`` if called more than once, if the
        status or body is not defined, or if a 304 response carries headers
        or a body kind that is not permitted.
        """
        if self._started:
            raise error.ProgrammingError('sendresponse() called multiple times')

        self._started = True

        if not self.status:
            raise error.ProgrammingError('status line not defined')

        if (self._bodybytes is None and self._bodygen is None
            and not self._bodywillwrite):
            raise error.ProgrammingError('response body not defined')

        # RFC 7232 Section 4.1 states that a 304 MUST generate one of
        # {Cache-Control, Content-Location, Date, ETag, Expires, Vary}
        # and SHOULD NOT generate other headers unless they could be used
        # to guide cache updates. Furthermore, RFC 7230 Section 3.3.2
        # states that no response body can be issued. Content-Length can
        # be sent. But if it is present, it should be the size of the response
        # that wasn't transferred.
        if self.status.startswith('304 '):
            # setbodybytes('') will set C-L to 0. This doesn't conform with the
            # spec. So remove it.
            if self.headers.get('Content-Length') == '0':
                del self.headers['Content-Length']

            # Strictly speaking, this is too strict. But until it causes
            # problems, let's be strict.
            badheaders = {k for k in self.headers.keys()
                          if k.lower() not in ('date', 'etag', 'expires',
                                               'cache-control',
                                               'content-location',
                                               'content-security-policy',
                                               'vary')}
            if badheaders:
                raise error.ProgrammingError(
                    'illegal header on 304 response: %s' %
                    ', '.join(sorted(badheaders)))

            if self._bodygen is not None or self._bodywillwrite:
                raise error.ProgrammingError("must use setbodybytes('') with "
                                             "304 responses")

        # Various HTTP clients (notably httplib) won't read the HTTP response
        # until the HTTP request has been sent in full. If servers (us) send a
        # response before the HTTP request has been fully sent, the connection
        # may deadlock because neither end is reading.
        #
        # We work around this by "draining" the request data before
        # sending any response in some conditions.
        drain = False
        close = False

        # If the client sent Expect: 100-continue, we assume it is smart enough
        # to deal with the server sending a response before reading the request.
        # (httplib doesn't do this.)
        if self._req.headers.get('Expect', '').lower() == '100-continue':
            pass
        # Only tend to request methods that have bodies. Strictly speaking,
        # we should sniff for a body. But this is fine for our existing
        # WSGI applications.
        elif self._req.method not in ('POST', 'PUT'):
            pass
        else:
            # If we don't know how much data to read, there's no guarantee
            # that we can drain the request responsibly. The WSGI
            # specification only says that servers *should* ensure the
            # input stream doesn't overrun the actual request. So there's
            # no guarantee that reading until EOF won't corrupt the stream
            # state.
            if not isinstance(self._req.bodyfh, util.cappedreader):
                close = True
            else:
                # We /could/ only drain certain HTTP response codes. But 200 and
                # non-200 wire protocol responses both require draining. Since
                # we have a capped reader in place for all situations where we
                # drain, it is safe to read from that stream. We'll either do
                # a drain or no-op if we're already at EOF.
                drain = True

        if close:
            self.headers['Connection'] = 'Close'

        if drain:
            assert isinstance(self._req.bodyfh, util.cappedreader)
            while True:
                chunk = self._req.bodyfh.read(32768)
                if not chunk:
                    break

        strheaders = [(pycompat.strurl(k), pycompat.strurl(v)) for
                      k, v in self.headers.items()]
        write = self._startresponse(pycompat.sysstr(self.status),
                                    strheaders)

        # Dispatch on which body kind was *defined* (``is not None``) rather
        # than on truthiness, so that an empty ``setbodybytes('')`` body is a
        # recognized case that emits nothing, and the final ``else`` can
        # raise for a genuinely unknown state.
        if self._bodybytes is not None:
            if self._bodybytes:
                yield self._bodybytes
        elif self._bodygen is not None:
            for chunk in self._bodygen:
                # PEP-3333 says that output must be bytes. And some WSGI
                # implementations enforce this. We cast bytes-like types here
                # for convenience.
                if isinstance(chunk, bytearray):
                    chunk = bytes(chunk)

                yield chunk
        elif self._bodywillwrite:
            self._bodywritefn = write
        else:
            raise error.ProgrammingError('do not know how to send body')

    def getbodyfile(self):
        """Obtain a file object like object representing the response body.

        For this to work, you must call ``setbodywillwrite()`` and then
        ``sendresponse()`` first. ``sendresponse()`` is a generator and the
        function won't run to completion unless the generator is advanced. The
        generator yields no items. The easiest way to consume it is with
        ``list(res.sendresponse())``, which should resolve to an empty list -
        ``[]``.

        Raises ``error.ProgrammingError`` if either prerequisite call has not
        been made.
        """
        if not self._bodywillwrite:
            raise error.ProgrammingError('must call setbodywillwrite() first')

        if not self._started:
            raise error.ProgrammingError('must call sendresponse() first; did '
                                         'you remember to consume it since it '
                                         'is a generator?')

        assert self._bodywritefn
        return offsettrackingwriter(self._bodywritefn)
569 575
def wsgiapplication(app_maker):
    '''Wrap the application produced by ``app_maker`` as a WSGI callable.

    Kept for compatibility with old CGI scripts. A plain hgweb() or
    hgwebdir() can and should now be used as a WSGI application.

    ``app_maker`` is called once, immediately; the returned callable
    forwards every ``(env, respond)`` call to the application it made.'''
    wsgiapp = app_maker()

    def run_wsgi(env, respond):
        return wsgiapp(env, respond)

    return run_wsgi
General Comments 0
You need to be logged in to leave comments. Login now