##// END OF EJS Templates
hgweb: also set Content-Type header...
Gregory Szorc -
r37067:55e90139 default
parent child Browse files
Show More
@@ -1,558 +1,561
1 1 # hgweb/request.py - An http request from either CGI or the standalone server.
2 2 #
3 3 # Copyright 21 May 2005 - (c) 2005 Jake Edge <jake@edge2.net>
4 4 # Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 import wsgiref.headers as wsgiheaders
12 12 #import wsgiref.validate
13 13
14 14 from ..thirdparty import (
15 15 attr,
16 16 )
17 17 from .. import (
18 18 error,
19 19 pycompat,
20 20 util,
21 21 )
22 22
class multidict(object):
    """A dict like object that can store multiple values for a key.

    Used to store parsed request parameters.

    This is inspired by WebOb's class of the same name.
    """
    def __init__(self):
        # Maps each key to the list of values recorded for it, in the
        # order they were added.
        self._items = {}

    def __getitem__(self, key):
        """Returns the last set value for a key.

        Raises KeyError if the key is not defined.
        """
        return self._items[key][-1]

    def __setitem__(self, key, value):
        """Replace all values for a key with a single new value."""
        self._items[key] = [value]

    def __delitem__(self, key):
        """Delete all values for a key.

        Raises KeyError if the key is not defined.
        """
        del self._items[key]

    def __contains__(self, key):
        """Returns whether at least one value is stored for a key."""
        return key in self._items

    def __len__(self):
        """Returns the number of distinct keys (not the number of values)."""
        return len(self._items)

    def get(self, key, default=None):
        """Returns the last set value for a key or ``default`` if missing."""
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def add(self, key, value):
        """Add a new value for a key. Does not replace existing values."""
        self._items.setdefault(key, []).append(value)

    def getall(self, key):
        """Obtains all values for a key.

        Returns an empty list if the key is not defined. For a defined key
        the internal list itself is returned (not a copy), so callers
        should not mutate it.
        """
        return self._items.get(key, [])

    def getone(self, key):
        """Obtain a single value for a key.

        Raises KeyError if key not defined or it has multiple values set.
        """
        vals = self._items[key]

        if len(vals) > 1:
            raise KeyError('multiple values for %r' % key)

        return vals[0]

    def asdictoflists(self):
        """Returns a plain dict mapping each key to a copy of its values."""
        # ``items()`` is available on both Python 2 and Python 3, whereas
        # ``iteritems()`` only exists on Python 2 dicts.
        return {k: list(v) for k, v in self._items.items()}
79 79
@attr.s(frozen=True)
class parsedrequest(object):
    """Immutable record describing one parsed WSGI request.

    Bundles the URL pieces, dispatch information, parsed query string and
    headers derived from a WSGI environment, together with a file object
    for reading the request body.
    """

    # HTTP request method (``REQUEST_METHOD``).
    method = attr.ib()
    # Complete reconstructed URL of the request.
    url = attr.ib()
    # ``url`` stripped of all path components: <proto>://<host><port>.
    baseurl = attr.ib()
    # Advertised counterparts of ``url`` and ``baseurl``. These use
    # SERVER_NAME for the hostname rather than the HTTP Host header, which
    # is likely what clients used.
    advertisedurl = attr.ib()
    advertisedbaseurl = attr.ib()
    # Scheme portion of the URL (text before ``://``), such as ``http``
    # or ``https``.
    urlscheme = attr.ib()
    # REMOTE_USER from the environment, or None when absent.
    remoteuser = attr.ib()
    # REMOTE_HOST from the environment, or None when absent.
    remotehost = attr.ib()
    # Path of the WSGI application relative to the server root. Starts
    # with ``/`` whenever it is defined.
    apppath = attr.ib()
    # Path components (as a list) used to dispatch the request.
    dispatchparts = attr.ib()
    # Path portion of the URL (query string excluded) used for dispatch.
    # ``None`` means the request carried no path component, the empty
    # string means the application's root URL was requested, and any other
    # value is the requested path below the application, without a
    # leading ``/``.
    dispatchpath = attr.ib()
    # Name of the repository the request targets.
    reponame = attr.ib()
    # Unparsed query string (everything after "?" in the URL).
    querystring = attr.ib()
    # Query string parameters as a multidict.
    qsparams = attr.ib()
    # Request headers as a wsgiref.headers.Headers instance, which behaves
    # like a dict whose keys are case insensitive.
    headers = attr.ib()
    # File object from which the request body is read.
    bodyfh = attr.ib()
    # The WSGI environment dict the request was parsed from.
    rawenv = attr.ib()
127 127
def parserequestfromenv(env, reponame=None, altbaseurl=None):
    """Parse URL components from environment variables.

    WSGI defines request attributes via environment variables. This function
    parses the environment variables into a data structure.

    If ``reponame`` is defined, the leading path components matching that
    string are effectively shifted from ``PATH_INFO`` to ``SCRIPT_NAME``.
    This simulates the world view of a WSGI application that processes
    requests from the base URL of a repo.

    If ``altbaseurl`` (typically comes from ``web.baseurl`` config option)
    is defined, it is used - instead of the WSGI environment variables - for
    constructing URL components up to and including the WSGI application path.
    For example, if the current WSGI application is at ``/repo`` and a request
    is made to ``/rev/@`` with this argument set to
    ``http://myserver:9000/prefix``, the URL and path components will resolve as
    if the request were to ``http://myserver:9000/prefix/rev/@``. In other
    words, ``wsgi.url_scheme``, ``SERVER_NAME``, ``SERVER_PORT``, and
    ``SCRIPT_NAME`` are all effectively replaced by components from this URL.

    Returns a ``parsedrequest`` instance.

    Raises ``error.ProgrammingError`` if ``reponame`` is given but
    ``PATH_INFO`` is missing/empty, does not start with the repo name, or
    the repo name does not end at a path delimiter within ``PATH_INFO``.
    """
    # PEP 3333 defines the WSGI spec and is a useful reference for this code.

    # We first validate that the incoming object conforms with the WSGI spec.
    # We only want to be dealing with spec-conforming WSGI implementations.
    # TODO enable this once we fix internal violations.
    #wsgiref.validate.check_environ(env)

    # PEP-0333 states that environment keys and values are native strings
    # (bytes on Python 2 and str on Python 3). The code points for the Unicode
    # strings on Python 3 must be between \00000-\000FF. We deal with bytes
    # in Mercurial, so mass convert string keys and values to bytes.
    if pycompat.ispy3:
        env = {k.encode('latin-1'): v for k, v in env.items()}
        env = {k: v.encode('latin-1') if isinstance(v, str) else v
               for k, v in env.items()}

    if altbaseurl:
        altbaseurl = util.url(altbaseurl)

    # https://www.python.org/dev/peps/pep-0333/#environ-variables defines
    # the environment variables.
    # https://www.python.org/dev/peps/pep-0333/#url-reconstruction defines
    # how URLs are reconstructed.
    fullurl = env['wsgi.url_scheme'] + '://'

    if altbaseurl and altbaseurl.scheme:
        advertisedfullurl = altbaseurl.scheme + '://'
    else:
        advertisedfullurl = fullurl

    def addport(s, port):
        # Only append the port when it is not the default for the scheme.
        if s.startswith('https://'):
            if port != '443':
                s += ':' + port
        else:
            if port != '80':
                s += ':' + port

        return s

    if env.get('HTTP_HOST'):
        fullurl += env['HTTP_HOST']
    else:
        fullurl += env['SERVER_NAME']
        fullurl = addport(fullurl, env['SERVER_PORT'])

    if altbaseurl and altbaseurl.host:
        advertisedfullurl += altbaseurl.host

        # An explicit port wins. Otherwise fall back to the scheme's
        # default port and finally to the server's own port.
        if altbaseurl.port:
            port = altbaseurl.port
        elif altbaseurl.scheme == 'http':
            port = '80'
        elif altbaseurl.scheme == 'https':
            port = '443'
        else:
            port = env['SERVER_PORT']

        advertisedfullurl = addport(advertisedfullurl, port)
    else:
        advertisedfullurl += env['SERVER_NAME']
        advertisedfullurl = addport(advertisedfullurl, env['SERVER_PORT'])

    baseurl = fullurl
    advertisedbaseurl = advertisedfullurl

    fullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
    fullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if altbaseurl:
        path = altbaseurl.path or ''
        if path and not path.startswith('/'):
            path = '/' + path
        advertisedfullurl += util.urlreq.quote(path)
    else:
        advertisedfullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))

    advertisedfullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if env.get('QUERY_STRING'):
        fullurl += '?' + env['QUERY_STRING']
        advertisedfullurl += '?' + env['QUERY_STRING']

    # If ``reponame`` is defined, that must be a prefix on PATH_INFO
    # that represents the repository being dispatched to. When computing
    # the dispatch info, we ignore these leading path components.

    if altbaseurl:
        apppath = altbaseurl.path or ''
        if apppath and not apppath.startswith('/'):
            apppath = '/' + apppath
    else:
        apppath = env.get('SCRIPT_NAME', '')

    if reponame:
        repoprefix = '/' + reponame.strip('/')

        if not env.get('PATH_INFO'):
            raise error.ProgrammingError('reponame requires PATH_INFO')

        if not env['PATH_INFO'].startswith(repoprefix):
            raise error.ProgrammingError('PATH_INFO does not begin with repo '
                                         'name: %s (%s)' % (env['PATH_INFO'],
                                                            reponame))

        dispatchpath = env['PATH_INFO'][len(repoprefix):]

        if dispatchpath and not dispatchpath.startswith('/'):
            raise error.ProgrammingError('reponame prefix of PATH_INFO does '
                                         'not end at path delimiter: %s (%s)' %
                                         (env['PATH_INFO'], reponame))

        apppath = apppath.rstrip('/') + repoprefix
        dispatchparts = dispatchpath.strip('/').split('/')
        dispatchpath = '/'.join(dispatchparts)

    elif 'PATH_INFO' in env:
        if env['PATH_INFO'].strip('/'):
            dispatchparts = env['PATH_INFO'].strip('/').split('/')
            dispatchpath = '/'.join(dispatchparts)
        else:
            dispatchparts = []
            dispatchpath = ''
    else:
        dispatchparts = []
        dispatchpath = None

    querystring = env.get('QUERY_STRING', '')

    # We store as a list so we have ordering information. We also store as
    # a dict to facilitate fast lookup.
    qsparams = multidict()
    for k, v in util.urlreq.parseqsl(querystring, keep_blank_values=True):
        qsparams.add(k, v)

    # HTTP_* keys contain HTTP request headers. The Headers structure should
    # perform case normalization for us. We just rewrite underscore to dash
    # so keys match what likely went over the wire.
    headers = []
    for k, v in env.items():
        if k.startswith('HTTP_'):
            headers.append((k[len('HTTP_'):].replace('_', '-'), v))

    headers = wsgiheaders.Headers(headers)

    # This is kind of a lie because the HTTP header wasn't explicitly
    # sent. But for all intents and purposes it should be OK to lie about
    # this, since a consumer will use either value to determine how many
    # bytes are available to read.
    if 'CONTENT_LENGTH' in env and 'HTTP_CONTENT_LENGTH' not in env:
        headers['Content-Length'] = env['CONTENT_LENGTH']

    if 'CONTENT_TYPE' in env and 'HTTP_CONTENT_TYPE' not in env:
        headers['Content-Type'] = env['CONTENT_TYPE']

    bodyfh = env['wsgi.input']
    if 'Content-Length' in headers:
        # PEP 3333 allows CONTENT_LENGTH to be present but empty. Treat an
        # empty value as a zero-length body instead of letting int('')
        # raise ValueError.
        bodyfh = util.cappedreader(bodyfh,
                                   int(headers['Content-Length'] or '0'))

    return parsedrequest(method=env['REQUEST_METHOD'],
                         url=fullurl, baseurl=baseurl,
                         advertisedurl=advertisedfullurl,
                         advertisedbaseurl=advertisedbaseurl,
                         urlscheme=env['wsgi.url_scheme'],
                         remoteuser=env.get('REMOTE_USER'),
                         remotehost=env.get('REMOTE_HOST'),
                         apppath=apppath,
                         dispatchparts=dispatchparts, dispatchpath=dispatchpath,
                         reponame=reponame,
                         querystring=querystring,
                         qsparams=qsparams,
                         headers=headers,
                         bodyfh=bodyfh,
                         rawenv=env)
320 323
class offsettrackingwriter(object):
    """Append-only file-like wrapper that counts how much has been written.

    Each instance wraps a callable. Every ``write()`` forwards its data to
    that callable and adds to a running total, which ``tell()`` reports.

    This exists to wrap the ``write()`` function handed back by a WSGI
    ``start_response()`` call. That ``write()`` is a bare callable rather
    than a file object, so it lacks the other file object methods.
    """
    def __init__(self, writefn):
        self._offset = 0
        self._write = writefn

    def write(self, s):
        written = self._write(s)
        # Not every callable reports a byte count; when it returns None,
        # assume the whole of ``s`` was written.
        self._offset += len(s) if written is None else written

    def flush(self):
        # Nothing is buffered here, so there is nothing to flush.
        pass

    def tell(self):
        return self._offset
351 354
class wsgiresponse(object):
    """Represents a response to a WSGI request.

    A response consists of a status line, headers, and a body.

    Consumers must populate the ``status`` and ``headers`` fields and
    make a call to a ``setbody*()`` method before the response can be
    issued.

    When it is time to start sending the response over the wire,
    ``sendresponse()`` is called. It handles emitting the header portion
    of the response message. It then yields chunks of body data to be
    written to the peer. Typically, the WSGI application itself calls
    and returns the value from ``sendresponse()``.
    """

    def __init__(self, req, startresponse):
        """Create an empty response tied to a specific request.

        ``req`` is a ``parsedrequest``. ``startresponse`` is the
        ``start_response`` function passed to the WSGI application.
        """
        self._req = req
        self._startresponse = startresponse

        self.status = None
        self.headers = wsgiheaders.Headers([])

        # Exactly one of these three is populated by a ``setbody*()`` call.
        self._bodybytes = None
        self._bodygen = None
        self._bodywillwrite = False
        self._started = False
        # ``write()`` callable returned by ``start_response``. Only kept
        # when the body is produced via ``setbodywillwrite()``.
        self._bodywritefn = None

    def _verifybody(self):
        """Raise ``error.ProgrammingError`` if a body was already defined."""
        if (self._bodybytes is not None or self._bodygen is not None
            or self._bodywillwrite):
            raise error.ProgrammingError('cannot define body multiple times')

    def setbodybytes(self, b):
        """Define the response body as static bytes.

        The empty string signals that there is no response body.

        Also sets the ``Content-Length`` header to the length of ``b``.
        """
        self._verifybody()
        self._bodybytes = b
        self.headers['Content-Length'] = '%d' % len(b)

    def setbodygen(self, gen):
        """Define the response body as a generator of bytes."""
        self._verifybody()
        self._bodygen = gen

    def setbodywillwrite(self):
        """Signal an intent to use write() to emit the response body.

        **This is the least preferred way to send a body.**

        It is preferred for WSGI applications to emit a generator of chunks
        constituting the response body. However, some consumers can't emit
        data this way. So, WSGI provides a way to obtain a ``write(data)``
        function that can be used to synchronously perform an unbuffered
        write.

        Calling this function signals an intent to produce the body in this
        manner.
        """
        self._verifybody()
        self._bodywillwrite = True

    def sendresponse(self):
        """Send the generated response to the client.

        Before this is called, ``status`` must be set and one of
        ``setbodybytes()``, ``setbodygen()`` or ``setbodywillwrite()`` must
        be called.

        This is a generator: nothing happens until it is advanced. It
        yields the chunks of the response body (none at all when the body
        is empty or is produced via ``setbodywillwrite()``).

        Calling this method multiple times is not allowed.

        Raises ``error.ProgrammingError`` if called more than once, if the
        status or body is undefined, or if a 304 response carries a header
        or body kind that is not permitted.
        """
        if self._started:
            raise error.ProgrammingError('sendresponse() called multiple times')

        self._started = True

        if not self.status:
            raise error.ProgrammingError('status line not defined')

        if (self._bodybytes is None and self._bodygen is None
            and not self._bodywillwrite):
            raise error.ProgrammingError('response body not defined')

        # RFC 7232 Section 4.1 states that a 304 MUST generate one of
        # {Cache-Control, Content-Location, Date, ETag, Expires, Vary}
        # and SHOULD NOT generate other headers unless they could be used
        # to guide cache updates. Furthermore, RFC 7230 Section 3.3.2
        # states that no response body can be issued. Content-Length can
        # be sent. But if it is present, it should be the size of the response
        # that wasn't transferred.
        if self.status.startswith('304 '):
            # setbodybytes('') will set C-L to 0. This doesn't conform with the
            # spec. So remove it.
            if self.headers.get('Content-Length') == '0':
                del self.headers['Content-Length']

            # Strictly speaking, this is too strict. But until it causes
            # problems, let's be strict.
            badheaders = {k for k in self.headers.keys()
                          if k.lower() not in ('date', 'etag', 'expires',
                                               'cache-control',
                                               'content-location',
                                               'vary')}
            if badheaders:
                raise error.ProgrammingError(
                    'illegal header on 304 response: %s' %
                    ', '.join(sorted(badheaders)))

            if self._bodygen is not None or self._bodywillwrite:
                raise error.ProgrammingError("must use setbodybytes('') with "
                                             "304 responses")

        # Various HTTP clients (notably httplib) won't read the HTTP response
        # until the HTTP request has been sent in full. If servers (us) send a
        # response before the HTTP request has been fully sent, the connection
        # may deadlock because neither end is reading.
        #
        # We work around this by "draining" the request data before
        # sending any response in some conditions.
        drain = False
        close = False

        # If the client sent Expect: 100-continue, we assume it is smart enough
        # to deal with the server sending a response before reading the request.
        # (httplib doesn't do this.)
        if self._req.headers.get('Expect', '').lower() == '100-continue':
            pass
        # Only tend to request methods that have bodies. Strictly speaking,
        # we should sniff for a body. But this is fine for our existing
        # WSGI applications.
        elif self._req.method not in ('POST', 'PUT'):
            pass
        else:
            # If we don't know how much data to read, there's no guarantee
            # that we can drain the request responsibly. The WSGI
            # specification only says that servers *should* ensure the
            # input stream doesn't overrun the actual request. So there's
            # no guarantee that reading until EOF won't corrupt the stream
            # state.
            if not isinstance(self._req.bodyfh, util.cappedreader):
                close = True
            else:
                # We /could/ only drain certain HTTP response codes. But 200 and
                # non-200 wire protocol responses both require draining. Since
                # we have a capped reader in place for all situations where we
                # drain, it is safe to read from that stream. We'll either do
                # a drain or no-op if we're already at EOF.
                drain = True

        if close:
            self.headers['Connection'] = 'Close'

        if drain:
            assert isinstance(self._req.bodyfh, util.cappedreader)
            while True:
                chunk = self._req.bodyfh.read(32768)
                if not chunk:
                    break

        write = self._startresponse(pycompat.sysstr(self.status),
                                    self.headers.items())

        # Dispatch on which body kind was *defined* rather than on
        # truthiness, so that an empty bytes body (meaning "no body") is
        # handled by its own branch instead of falling through to the
        # error case below.
        if self._bodybytes is not None:
            if self._bodybytes:
                yield self._bodybytes
        elif self._bodygen is not None:
            for chunk in self._bodygen:
                yield chunk
        elif self._bodywillwrite:
            self._bodywritefn = write
        else:
            # Should be unreachable thanks to the "response body not
            # defined" check above; kept as a guard against inconsistent
            # internal state.
            raise error.ProgrammingError('do not know how to send body')

    def getbodyfile(self):
        """Obtain a file object like object representing the response body.

        For this to work, you must call ``setbodywillwrite()`` and then
        ``sendresponse()`` first. ``sendresponse()`` is a generator and the
        function won't run to completion unless the generator is advanced. The
        generator yields no items. The easiest way to consume it is with
        ``list(res.sendresponse())``, which should resolve to an empty list -
        ``[]``.

        Raises ``error.ProgrammingError`` if either prerequisite call has
        not been made.
        """
        if not self._bodywillwrite:
            raise error.ProgrammingError('must call setbodywillwrite() first')

        if not self._started:
            raise error.ProgrammingError('must call sendresponse() first; did '
                                         'you remember to consume it since it '
                                         'is a generator?')

        assert self._bodywritefn
        return offsettrackingwriter(self._bodywritefn)
551 554
def wsgiapplication(app_maker):
    """Build a WSGI callable from an application factory.

    Kept for compatibility with old CGI scripts. A plain hgweb() or
    hgwebdir() can and should now be used as a WSGI application.

    ``app_maker`` is called once, right away and without arguments. The
    returned callable forwards ``(env, respond)`` to the application it
    produced and returns that application's result.
    """
    wsgiapp = app_maker()

    def run_wsgi(env, respond):
        return wsgiapp(env, respond)

    return run_wsgi
General Comments 0
You need to be logged in to leave comments. Login now