##// END OF EJS Templates
hgweb: guard against empty Content-Length header...
Gregory Szorc -
r37843:e82b137a @21 stable
parent child Browse files
Show More
@@ -1,574 +1,575 b''
1 1 # hgweb/request.py - An http request from either CGI or the standalone server.
2 2 #
3 3 # Copyright 21 May 2005 - (c) 2005 Jake Edge <jake@edge2.net>
4 4 # Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 #import wsgiref.validate
12 12
13 13 from ..thirdparty import (
14 14 attr,
15 15 )
16 16 from .. import (
17 17 error,
18 18 pycompat,
19 19 util,
20 20 )
21 21
class multidict(object):
    """A dict like object that can store multiple values for a key.

    Used to store parsed request parameters.

    This is inspired by WebOb's class of the same name.

    Internally every key maps to a list of values, kept in insertion order.
    """
    def __init__(self):
        # key -> list of values, oldest first.
        self._items = {}

    def __getitem__(self, key):
        """Returns the last set value for a key.

        Raises KeyError if the key is not defined.
        """
        return self._items[key][-1]

    def __setitem__(self, key, value):
        """Replace all values for a key with a single new value."""
        self._items[key] = [value]

    def __delitem__(self, key):
        """Delete all values for a key.

        Raises KeyError if the key is not defined.
        """
        del self._items[key]

    def __contains__(self, key):
        """Whether at least one value is stored for the key."""
        return key in self._items

    def __len__(self):
        """Number of distinct keys (not the number of stored values)."""
        return len(self._items)

    def get(self, key, default=None):
        """Return the last set value for a key, or ``default`` if missing."""
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def add(self, key, value):
        """Add a new value for a key. Does not replace existing values."""
        self._items.setdefault(key, []).append(value)

    def getall(self, key):
        """Obtains all values for a key.

        Returns an empty list if the key is not defined. For a defined key
        the internal list itself is returned, not a copy.
        """
        return self._items.get(key, [])

    def getone(self, key):
        """Obtain a single value for a key.

        Raises KeyError if key not defined or it has multiple values set.
        """
        vals = self._items[key]

        if len(vals) > 1:
            raise KeyError('multiple values for %r' % key)

        return vals[0]

    def asdictoflists(self):
        """Return a plain dict mapping each key to a copy of its value list."""
        # items() rather than iteritems(): the latter only exists on
        # Python 2 dicts, while items() works on both major versions.
        return {k: list(v) for k, v in self._items.items()}
78 78
@attr.s(frozen=True)
class parsedrequest(object):
    """Represents a parsed WSGI request.

    Contains both parsed parameters as well as a handle on the input stream.

    The class is declared with ``attr.s(frozen=True)`` and every field is a
    plain ``attr.ib()`` with no default, so all of them must be supplied
    when an instance is constructed.
    """

    # Request method.
    method = attr.ib()
    # Full URL for this request.
    url = attr.ib()
    # URL without any path components. Just <proto>://<host><port>.
    baseurl = attr.ib()
    # Advertised URL. Like ``url`` and ``baseurl`` but uses SERVER_NAME instead
    # of HTTP: Host header for hostname. This is likely what clients used.
    advertisedurl = attr.ib()
    # Advertised counterpart of ``baseurl`` (see ``advertisedurl``).
    advertisedbaseurl = attr.ib()
    # URL scheme (part before ``://``). e.g. ``http`` or ``https``.
    urlscheme = attr.ib()
    # Value of REMOTE_USER, if set, or None.
    remoteuser = attr.ib()
    # Value of REMOTE_HOST, if set, or None.
    remotehost = attr.ib()
    # Relative WSGI application path. If defined, will begin with a
    # ``/``.
    apppath = attr.ib()
    # List of path parts to be used for dispatch.
    dispatchparts = attr.ib()
    # URL path component (no query string) used for dispatch. Can be
    # ``None`` to signal no path component given to the request, an
    # empty string to signal a request to the application's root URL,
    # or a string not beginning with ``/`` containing the requested
    # path under the application.
    dispatchpath = attr.ib()
    # The name of the repository being accessed.
    reponame = attr.ib()
    # Raw query string (part after "?" in URL).
    querystring = attr.ib()
    # multidict of query string parameters.
    qsparams = attr.ib()
    # wsgiref.headers.Headers instance. Operates like a dict with case
    # insensitive keys.
    headers = attr.ib()
    # Request body input stream.
    bodyfh = attr.ib()
    # WSGI environment dict, unmodified.
    rawenv = attr.ib()
126 126
def parserequestfromenv(env, reponame=None, altbaseurl=None, bodyfh=None):
    """Parse URL components from environment variables.

    WSGI defines request attributes via environment variables. This function
    parses the environment variables into a data structure.

    If ``reponame`` is defined, the leading path components matching that
    string are effectively shifted from ``PATH_INFO`` to ``SCRIPT_NAME``.
    This simulates the world view of a WSGI application that processes
    requests from the base URL of a repo.

    If ``altbaseurl`` (typically comes from ``web.baseurl`` config option)
    is defined, it is used - instead of the WSGI environment variables - for
    constructing URL components up to and including the WSGI application path.
    For example, if the current WSGI application is at ``/repo`` and a request
    is made to ``/rev/@`` with this argument set to
    ``http://myserver:9000/prefix``, the URL and path components will resolve as
    if the request were to ``http://myserver:9000/prefix/rev/@``. In other
    words, ``wsgi.url_scheme``, ``SERVER_NAME``, ``SERVER_PORT``, and
    ``SCRIPT_NAME`` are all effectively replaced by components from this URL.

    ``bodyfh`` can be used to specify a file object to read the request body
    from. If not defined, ``wsgi.input`` from the environment dict is used.
    In that case, when a ``Content-Length`` header is known, the stream is
    wrapped in a ``util.cappedreader`` limited to that many bytes. An empty
    ``Content-Length`` value is treated as ``0``.

    Returns a ``parsedrequest`` instance.

    Raises ``error.ProgrammingError`` if ``reponame`` is given (or found in
    ``REPO_NAME``) but ``PATH_INFO`` is missing, does not start with the repo
    name, or the repo name does not end at a path delimiter.
    """
    # PEP 3333 defines the WSGI spec and is a useful reference for this code.

    # We first validate that the incoming object conforms with the WSGI spec.
    # We only want to be dealing with spec-conforming WSGI implementations.
    # TODO enable this once we fix internal violations.
    #wsgiref.validate.check_environ(env)

    # PEP-0333 states that environment keys and values are native strings
    # (bytes on Python 2 and str on Python 3). The code points for the Unicode
    # strings on Python 3 must be between \00000-\000FF. We deal with bytes
    # in Mercurial, so mass convert string keys and values to bytes.
    if pycompat.ispy3:
        env = {k.encode('latin-1'): v for k, v in env.items()}
        env = {k: v.encode('latin-1') if isinstance(v, str) else v
               for k, v in env.items()}

    # Some hosting solutions are emulating hgwebdir, and dispatching directly
    # to an hgweb instance using this environment variable. This was always
    # checked prior to d7fd203e36cc; keep doing so to avoid breaking them.
    if not reponame:
        reponame = env.get('REPO_NAME')

    if altbaseurl:
        altbaseurl = util.url(altbaseurl)

    # https://www.python.org/dev/peps/pep-0333/#environ-variables defines
    # the environment variables.
    # https://www.python.org/dev/peps/pep-0333/#url-reconstruction defines
    # how URLs are reconstructed.
    fullurl = env['wsgi.url_scheme'] + '://'

    if altbaseurl and altbaseurl.scheme:
        advertisedfullurl = altbaseurl.scheme + '://'
    else:
        advertisedfullurl = fullurl

    def addport(s, port):
        # Append ``:port`` unless it is the default port for the scheme
        # that ``s`` starts with.
        if s.startswith('https://'):
            if port != '443':
                s += ':' + port
        else:
            if port != '80':
                s += ':' + port

        return s

    if env.get('HTTP_HOST'):
        fullurl += env['HTTP_HOST']
    else:
        fullurl += env['SERVER_NAME']
        fullurl = addport(fullurl, env['SERVER_PORT'])

    if altbaseurl and altbaseurl.host:
        advertisedfullurl += altbaseurl.host

        if altbaseurl.port:
            port = altbaseurl.port
        elif altbaseurl.scheme == 'http' and not altbaseurl.port:
            port = '80'
        elif altbaseurl.scheme == 'https' and not altbaseurl.port:
            port = '443'
        else:
            port = env['SERVER_PORT']

        advertisedfullurl = addport(advertisedfullurl, port)
    else:
        advertisedfullurl += env['SERVER_NAME']
        advertisedfullurl = addport(advertisedfullurl, env['SERVER_PORT'])

    # Snapshot the URLs before any path component is appended.
    baseurl = fullurl
    advertisedbaseurl = advertisedfullurl

    fullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
    fullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if altbaseurl:
        path = altbaseurl.path or ''
        if path and not path.startswith('/'):
            path = '/' + path
        advertisedfullurl += util.urlreq.quote(path)
    else:
        advertisedfullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))

    advertisedfullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if env.get('QUERY_STRING'):
        fullurl += '?' + env['QUERY_STRING']
        advertisedfullurl += '?' + env['QUERY_STRING']

    # If ``reponame`` is defined, that must be a prefix on PATH_INFO
    # that represents the repository being dispatched to. When computing
    # the dispatch info, we ignore these leading path components.

    if altbaseurl:
        apppath = altbaseurl.path or ''
        if apppath and not apppath.startswith('/'):
            apppath = '/' + apppath
    else:
        apppath = env.get('SCRIPT_NAME', '')

    if reponame:
        repoprefix = '/' + reponame.strip('/')

        if not env.get('PATH_INFO'):
            raise error.ProgrammingError('reponame requires PATH_INFO')

        if not env['PATH_INFO'].startswith(repoprefix):
            raise error.ProgrammingError('PATH_INFO does not begin with repo '
                                         'name: %s (%s)' % (env['PATH_INFO'],
                                                            reponame))

        dispatchpath = env['PATH_INFO'][len(repoprefix):]

        if dispatchpath and not dispatchpath.startswith('/'):
            raise error.ProgrammingError('reponame prefix of PATH_INFO does '
                                         'not end at path delimiter: %s (%s)' %
                                         (env['PATH_INFO'], reponame))

        apppath = apppath.rstrip('/') + repoprefix
        dispatchparts = dispatchpath.strip('/').split('/')
        dispatchpath = '/'.join(dispatchparts)

    elif 'PATH_INFO' in env:
        if env['PATH_INFO'].strip('/'):
            dispatchparts = env['PATH_INFO'].strip('/').split('/')
            dispatchpath = '/'.join(dispatchparts)
        else:
            dispatchparts = []
            dispatchpath = ''
    else:
        dispatchparts = []
        dispatchpath = None

    querystring = env.get('QUERY_STRING', '')

    # We store as a list so we have ordering information. We also store as
    # a dict to facilitate fast lookup.
    qsparams = multidict()
    for k, v in util.urlreq.parseqsl(querystring, keep_blank_values=True):
        qsparams.add(k, v)

    # HTTP_* keys contain HTTP request headers. The Headers structure should
    # perform case normalization for us. We just rewrite underscore to dash
    # so keys match what likely went over the wire.
    headers = []
    for k, v in env.items():
        if k.startswith('HTTP_'):
            headers.append((k[len('HTTP_'):].replace('_', '-'), v))

    from . import wsgiheaders # avoid cycle
    headers = wsgiheaders.Headers(headers)

    # This is kind of a lie because the HTTP header wasn't explicitly
    # sent. But for all intents and purposes it should be OK to lie about
    # this, since a consumer will use either value to determine how many
    # bytes are available to read.
    if 'CONTENT_LENGTH' in env and 'HTTP_CONTENT_LENGTH' not in env:
        headers['Content-Length'] = env['CONTENT_LENGTH']

    if 'CONTENT_TYPE' in env and 'HTTP_CONTENT_TYPE' not in env:
        headers['Content-Type'] = env['CONTENT_TYPE']

    if bodyfh is None:
        bodyfh = env['wsgi.input']
        if 'Content-Length' in headers:
            # The header can be present but empty; int('') would raise
            # ValueError, so treat an empty value as a zero-length body.
            bodyfh = util.cappedreader(bodyfh,
                                       int(headers['Content-Length'] or '0'))

    return parsedrequest(method=env['REQUEST_METHOD'],
                         url=fullurl, baseurl=baseurl,
                         advertisedurl=advertisedfullurl,
                         advertisedbaseurl=advertisedbaseurl,
                         urlscheme=env['wsgi.url_scheme'],
                         remoteuser=env.get('REMOTE_USER'),
                         remotehost=env.get('REMOTE_HOST'),
                         apppath=apppath,
                         dispatchparts=dispatchparts, dispatchpath=dispatchpath,
                         reponame=reponame,
                         querystring=querystring,
                         qsparams=qsparams,
                         headers=headers,
                         bodyfh=bodyfh,
                         rawenv=env)
333 334
class offsettrackingwriter(object):
    """Append-only, file-like wrapper around a bare ``write`` callable.

    Each ``write()`` is forwarded to the wrapped callable, and a running
    total of the written size is kept so that ``tell()`` can be answered.

    This exists to wrap the ``write()`` function handed back by a WSGI
    ``start_response()`` call: that function is only a callable, not a file
    object, so it lacks the other file object methods.
    """
    def __init__(self, writefn):
        self._offset = 0
        self._write = writefn

    def write(self, s):
        """Forward ``s`` to the wrapped callable and advance the offset."""
        written = self._write(s)
        # Not every callable reports how much it wrote; when it returns
        # None, count the full length of the data instead.
        self._offset += len(s) if written is None else written

    def flush(self):
        """No-op; present only to satisfy the file object interface."""

    def tell(self):
        """Return the total amount written through this object so far."""
        return self._offset
364 365
class wsgiresponse(object):
    """Represents a response to a WSGI request.

    A response consists of a status line, headers, and a body.

    Consumers must populate the ``status`` and ``headers`` fields and
    make a call to a ``setbody*()`` method before the response can be
    issued.

    When it is time to start sending the response over the wire,
    ``sendresponse()`` is called. It handles emitting the header portion
    of the response message. It then yields chunks of body data to be
    written to the peer. Typically, the WSGI application itself calls
    and returns the value from ``sendresponse()``.
    """

    def __init__(self, req, startresponse):
        """Create an empty response tied to a specific request.

        ``req`` is a ``parsedrequest``. ``startresponse`` is the
        ``start_response`` function passed to the WSGI application.
        """
        self._req = req
        self._startresponse = startresponse

        self.status = None
        from . import wsgiheaders # avoid cycle
        self.headers = wsgiheaders.Headers([])

        # Exactly one of the next three is set by a ``setbody*()`` call.
        self._bodybytes = None
        self._bodygen = None
        self._bodywillwrite = False
        self._started = False
        # Populated by ``sendresponse()`` when ``setbodywillwrite()`` was used.
        self._bodywritefn = None

    def _verifybody(self):
        """Raise ``error.ProgrammingError`` if a body was already defined."""
        if (self._bodybytes is not None or self._bodygen is not None
            or self._bodywillwrite):
            raise error.ProgrammingError('cannot define body multiple times')

    def setbodybytes(self, b):
        """Define the response body as static bytes.

        The empty string signals that there is no response body.

        Also sets the ``Content-Length`` header to ``len(b)``.
        """
        self._verifybody()
        self._bodybytes = b
        self.headers['Content-Length'] = '%d' % len(b)

    def setbodygen(self, gen):
        """Define the response body as a generator of bytes."""
        self._verifybody()
        self._bodygen = gen

    def setbodywillwrite(self):
        """Signal an intent to use write() to emit the response body.

        **This is the least preferred way to send a body.**

        It is preferred for WSGI applications to emit a generator of chunks
        constituting the response body. However, some consumers can't emit
        data this way. So, WSGI provides a way to obtain a ``write(data)``
        function that can be used to synchronously perform an unbuffered
        write.

        Calling this function signals an intent to produce the body in this
        manner.
        """
        self._verifybody()
        self._bodywillwrite = True

    def sendresponse(self):
        """Send the generated response to the client.

        Before this is called, ``status`` must be set and one of
        ``setbodybytes()``, ``setbodygen()`` or ``setbodywillwrite()`` must
        be called.

        This is a generator: nothing happens until it is iterated. It yields
        the body chunks, if any.

        Calling this method multiple times is not allowed.

        Raises ``error.ProgrammingError`` if called twice, if the status or
        body is undefined, or if a 304 response carries a disallowed header
        or a body that was not defined with ``setbodybytes('')``.
        """
        if self._started:
            raise error.ProgrammingError('sendresponse() called multiple times')

        self._started = True

        if not self.status:
            raise error.ProgrammingError('status line not defined')

        if (self._bodybytes is None and self._bodygen is None
            and not self._bodywillwrite):
            raise error.ProgrammingError('response body not defined')

        # RFC 7232 Section 4.1 states that a 304 MUST generate one of
        # {Cache-Control, Content-Location, Date, ETag, Expires, Vary}
        # and SHOULD NOT generate other headers unless they could be used
        # to guide cache updates. Furthermore, RFC 7230 Section 3.3.2
        # states that no response body can be issued. Content-Length can
        # be sent. But if it is present, it should be the size of the response
        # that wasn't transferred.
        if self.status.startswith('304 '):
            # setbodybytes('') will set C-L to 0. This doesn't conform with the
            # spec. So remove it.
            if self.headers.get('Content-Length') == '0':
                del self.headers['Content-Length']

            # Strictly speaking, this is too strict. But until it causes
            # problems, let's be strict.
            badheaders = {k for k in self.headers.keys()
                          if k.lower() not in ('date', 'etag', 'expires',
                                               'cache-control',
                                               'content-location',
                                               'vary')}
            if badheaders:
                raise error.ProgrammingError(
                    'illegal header on 304 response: %s' %
                    ', '.join(sorted(badheaders)))

            if self._bodygen is not None or self._bodywillwrite:
                raise error.ProgrammingError("must use setbodybytes('') with "
                                             "304 responses")

        # Various HTTP clients (notably httplib) won't read the HTTP response
        # until the HTTP request has been sent in full. If servers (us) send a
        # response before the HTTP request has been fully sent, the connection
        # may deadlock because neither end is reading.
        #
        # We work around this by "draining" the request data before
        # sending any response in some conditions.
        drain = False
        close = False

        # If the client sent Expect: 100-continue, we assume it is smart enough
        # to deal with the server sending a response before reading the request.
        # (httplib doesn't do this.)
        if self._req.headers.get('Expect', '').lower() == '100-continue':
            pass
        # Only tend to request methods that have bodies. Strictly speaking,
        # we should sniff for a body. But this is fine for our existing
        # WSGI applications.
        elif self._req.method not in ('POST', 'PUT'):
            pass
        else:
            # If we don't know how much data to read, there's no guarantee
            # that we can drain the request responsibly. The WSGI
            # specification only says that servers *should* ensure the
            # input stream doesn't overrun the actual request. So there's
            # no guarantee that reading until EOF won't corrupt the stream
            # state.
            if not isinstance(self._req.bodyfh, util.cappedreader):
                close = True
            else:
                # We /could/ only drain certain HTTP response codes. But 200 and
                # non-200 wire protocol responses both require draining. Since
                # we have a capped reader in place for all situations where we
                # drain, it is safe to read from that stream. We'll either do
                # a drain or no-op if we're already at EOF.
                drain = True

        if close:
            self.headers['Connection'] = 'Close'

        if drain:
            assert isinstance(self._req.bodyfh, util.cappedreader)
            while True:
                chunk = self._req.bodyfh.read(32768)
                if not chunk:
                    break

        strheaders = [(pycompat.strurl(k), pycompat.strurl(v)) for
                      k, v in self.headers.items()]
        write = self._startresponse(pycompat.sysstr(self.status),
                                    strheaders)

        # Dispatch on which body kind was *defined* (``is not None``), not on
        # truthiness: ``setbodybytes('')`` is a legitimate "no body" response
        # and must not land in the error branch below.
        if self._bodybytes is not None:
            # An empty body yields nothing at all, so that consuming the
            # generator produces no items.
            if self._bodybytes:
                yield self._bodybytes
        elif self._bodygen is not None:
            for chunk in self._bodygen:
                yield chunk
        elif self._bodywillwrite:
            self._bodywritefn = write
        else:
            # Unreachable given the "response body not defined" check above;
            # kept as a guard against future body kinds.
            raise error.ProgrammingError('do not know how to send body')

    def getbodyfile(self):
        """Obtain a file object like object representing the response body.

        For this to work, you must call ``setbodywillwrite()`` and then
        ``sendresponse()`` first. ``sendresponse()`` is a generator and the
        function won't run to completion unless the generator is advanced. The
        generator yields no items. The easiest way to consume it is with
        ``list(res.sendresponse())``, which should resolve to an empty list -
        ``[]``.

        Returns an ``offsettrackingwriter`` wrapping the WSGI ``write``
        function. Raises ``error.ProgrammingError`` if either prerequisite
        call has not been made.
        """
        if not self._bodywillwrite:
            raise error.ProgrammingError('must call setbodywillwrite() first')

        if not self._started:
            raise error.ProgrammingError('must call sendresponse() first; did '
                                         'you remember to consume it since it '
                                         'is a generator?')

        assert self._bodywritefn
        return offsettrackingwriter(self._bodywritefn)
567 568
def wsgiapplication(app_maker):
    """Compatibility shim for old CGI scripts.

    A plain hgweb() or hgwebdir() can and should now be used as a WSGI
    application directly.

    ``app_maker`` is called once, immediately, with no arguments. The
    returned callable forwards ``(env, respond)`` to the application it
    produced and returns that application's result.
    """
    wsgiapp = app_maker()

    def run_wsgi(env, respond):
        result = wsgiapp(env, respond)
        return result

    return run_wsgi
General Comments 0
You need to be logged in to leave comments. Login now