##// END OF EJS Templates
hgweb: fallback to checking wsgireq.env for REPO_NAME for 3rd party hosting...
Matt Harbison -
r37634:5e81cf96 default
parent child Browse files
Show More
@@ -1,564 +1,570 b''
1 1 # hgweb/request.py - An http request from either CGI or the standalone server.
2 2 #
3 3 # Copyright 21 May 2005 - (c) 2005 Jake Edge <jake@edge2.net>
4 4 # Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 #import wsgiref.validate
12 12
13 13 from ..thirdparty import (
14 14 attr,
15 15 )
16 16 from .. import (
17 17 error,
18 18 pycompat,
19 19 util,
20 20 )
21 21
class multidict(object):
    """A dict like object that can store multiple values for a key.

    Used to store parsed request parameters.

    This is inspired by WebOb's class of the same name.
    """
    def __init__(self):
        # Maps each key to the list of values recorded for it, in the order
        # they were added.
        self._items = {}

    def __getitem__(self, key):
        """Returns the last set value for a key.

        Raises KeyError if the key is not defined.
        """
        return self._items[key][-1]

    def __setitem__(self, key, value):
        """Replace all values for a key with a single new value."""
        self._items[key] = [value]

    def __delitem__(self, key):
        """Delete all values for a key.

        Raises KeyError if the key is not defined.
        """
        del self._items[key]

    def __contains__(self, key):
        """Report whether at least one value is stored for a key."""
        return key in self._items

    def __len__(self):
        """Return the number of distinct keys (not the number of values)."""
        return len(self._items)

    def get(self, key, default=None):
        """Return the last set value for a key, or ``default`` if missing."""
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def add(self, key, value):
        """Add a new value for a key. Does not replace existing values."""
        self._items.setdefault(key, []).append(value)

    def getall(self, key):
        """Obtains all values for a key.

        Returns an empty list if the key is not defined. For a defined key
        the returned list is the internal one, not a copy.
        """
        return self._items.get(key, [])

    def getone(self, key):
        """Obtain a single value for a key.

        Raises KeyError if key not defined or it has multiple values set.
        """
        vals = self._items[key]

        if len(vals) > 1:
            raise KeyError('multiple values for %r' % key)

        return vals[0]

    def asdictoflists(self):
        """Return a plain dict mapping each key to a copy of its values."""
        # ``items()`` is available on both Python 2 and Python 3, whereas
        # ``iteritems()`` only exists on Python 2 dicts.
        return {k: list(v) for k, v in self._items.items()}
78 78
@attr.s(frozen=True)
class parsedrequest(object):
    """Represents a parsed WSGI request.

    Holds the parsed request parameters together with a handle on the
    request's input stream. Instances are frozen.
    """

    # HTTP request method.
    method = attr.ib()

    # Complete URL of this request.
    url = attr.ib()

    # ``url`` stripped of every path component: <proto>://<host><port> only.
    baseurl = attr.ib()

    # Advertised counterparts of ``url`` and ``baseurl``. They take the
    # hostname from SERVER_NAME rather than from the HTTP Host header, which
    # is likely what clients used.
    advertisedurl = attr.ib()
    advertisedbaseurl = attr.ib()

    # Scheme of the URL (the part before ``://``), e.g. ``http`` or ``https``.
    urlscheme = attr.ib()

    # REMOTE_USER value if set, otherwise None.
    remoteuser = attr.ib()

    # REMOTE_HOST value if set, otherwise None.
    remotehost = attr.ib()

    # Relative path of the WSGI application. Begins with ``/`` when defined.
    apppath = attr.ib()

    # Path parts, as a list, used for dispatch.
    dispatchparts = attr.ib()

    # Path component of the URL (query string excluded) used for dispatch.
    # ``None`` means the request carried no path component, the empty string
    # means the application's root URL was requested, and any other value is
    # the requested path under the application, without a leading ``/``.
    dispatchpath = attr.ib()

    # Name of the repository being accessed.
    reponame = attr.ib()

    # Unparsed query string (what follows "?" in the URL).
    querystring = attr.ib()

    # Query string parameters as a multidict.
    qsparams = attr.ib()

    # Request headers as a wsgiref.headers.Headers instance; behaves like a
    # dict whose keys are case insensitive.
    headers = attr.ib()

    # Input stream for the request body.
    bodyfh = attr.ib()

    # The WSGI environment dict, unmodified.
    rawenv = attr.ib()
126 126
def parserequestfromenv(env, reponame=None, altbaseurl=None):
    """Parse URL components from environment variables.

    WSGI defines request attributes via environment variables. This function
    parses the environment variables into a data structure.

    If ``reponame`` is defined, the leading path components matching that
    string are effectively shifted from ``PATH_INFO`` to ``SCRIPT_NAME``.
    This simulates the world view of a WSGI application that processes
    requests from the base URL of a repo. If ``reponame`` is not given, the
    ``REPO_NAME`` environment variable is used in its place when set.

    If ``altbaseurl`` (typically comes from ``web.baseurl`` config option)
    is defined, it is used - instead of the WSGI environment variables - for
    constructing URL components up to and including the WSGI application path.
    For example, if the current WSGI application is at ``/repo`` and a request
    is made to ``/rev/@`` with this argument set to
    ``http://myserver:9000/prefix``, the URL and path components will resolve as
    if the request were to ``http://myserver:9000/prefix/rev/@``. In other
    words, ``wsgi.url_scheme``, ``SERVER_NAME``, ``SERVER_PORT``, and
    ``SCRIPT_NAME`` are all effectively replaced by components from this URL.

    Returns a ``parsedrequest`` instance.

    Raises ``error.ProgrammingError`` when a repository name is in effect and
    ``PATH_INFO`` is missing or empty, does not start with the repository
    name, or continues past the name without a path delimiter.
    """
    # PEP 3333 defines the WSGI spec and is a useful reference for this code.

    # We first validate that the incoming object conforms with the WSGI spec.
    # We only want to be dealing with spec-conforming WSGI implementations.
    # TODO enable this once we fix internal violations.
    #wsgiref.validate.check_environ(env)

    # PEP-0333 states that environment keys and values are native strings
    # (bytes on Python 2 and str on Python 3). The code points for the Unicode
    # strings on Python 3 must be between \00000-\000FF. We deal with bytes
    # in Mercurial, so mass convert string keys and values to bytes.
    if pycompat.ispy3:
        # ``items()`` is used because ``iteritems()`` only exists on
        # Python 2 dicts.
        env = {k.encode('latin-1'): v for k, v in env.items()}
        env = {k: v.encode('latin-1') if isinstance(v, str) else v
               for k, v in env.items()}

    # Some hosting solutions are emulating hgwebdir, and dispatching directly
    # to an hgweb instance using this environment variable. This was always
    # checked prior to d7fd203e36cc; keep doing so to avoid breaking them.
    if not reponame:
        reponame = env.get('REPO_NAME')

    if altbaseurl:
        altbaseurl = util.url(altbaseurl)

    # https://www.python.org/dev/peps/pep-0333/#environ-variables defines
    # the environment variables.
    # https://www.python.org/dev/peps/pep-0333/#url-reconstruction defines
    # how URLs are reconstructed.
    fullurl = env['wsgi.url_scheme'] + '://'

    if altbaseurl and altbaseurl.scheme:
        advertisedfullurl = altbaseurl.scheme + '://'
    else:
        advertisedfullurl = fullurl

    def addport(s, port):
        # Append ``:port`` unless it is the default port for the scheme that
        # ``s`` starts with (443 for https, 80 for everything else).
        if s.startswith('https://'):
            if port != '443':
                s += ':' + port
        else:
            if port != '80':
                s += ':' + port

        return s

    if env.get('HTTP_HOST'):
        fullurl += env['HTTP_HOST']
    else:
        fullurl += env['SERVER_NAME']
        fullurl = addport(fullurl, env['SERVER_PORT'])

    if altbaseurl and altbaseurl.host:
        advertisedfullurl += altbaseurl.host

        # An explicit port wins; otherwise fall back to the scheme's default
        # port, and finally to the port the server reports.
        if altbaseurl.port:
            port = altbaseurl.port
        elif altbaseurl.scheme == 'http':
            port = '80'
        elif altbaseurl.scheme == 'https':
            port = '443'
        else:
            port = env['SERVER_PORT']

        advertisedfullurl = addport(advertisedfullurl, port)
    else:
        advertisedfullurl += env['SERVER_NAME']
        advertisedfullurl = addport(advertisedfullurl, env['SERVER_PORT'])

    # Snapshot the URLs before any path component is appended.
    baseurl = fullurl
    advertisedbaseurl = advertisedfullurl

    fullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
    fullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if altbaseurl:
        path = altbaseurl.path or ''
        if path and not path.startswith('/'):
            path = '/' + path
        advertisedfullurl += util.urlreq.quote(path)
    else:
        advertisedfullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))

    advertisedfullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if env.get('QUERY_STRING'):
        fullurl += '?' + env['QUERY_STRING']
        advertisedfullurl += '?' + env['QUERY_STRING']

    # If ``reponame`` is defined, that must be a prefix on PATH_INFO
    # that represents the repository being dispatched to. When computing
    # the dispatch info, we ignore these leading path components.

    if altbaseurl:
        apppath = altbaseurl.path or ''
        if apppath and not apppath.startswith('/'):
            apppath = '/' + apppath
    else:
        apppath = env.get('SCRIPT_NAME', '')

    if reponame:
        repoprefix = '/' + reponame.strip('/')

        if not env.get('PATH_INFO'):
            raise error.ProgrammingError('reponame requires PATH_INFO')

        if not env['PATH_INFO'].startswith(repoprefix):
            raise error.ProgrammingError('PATH_INFO does not begin with repo '
                                         'name: %s (%s)' % (env['PATH_INFO'],
                                                            reponame))

        dispatchpath = env['PATH_INFO'][len(repoprefix):]

        if dispatchpath and not dispatchpath.startswith('/'):
            raise error.ProgrammingError('reponame prefix of PATH_INFO does '
                                         'not end at path delimiter: %s (%s)' %
                                         (env['PATH_INFO'], reponame))

        apppath = apppath.rstrip('/') + repoprefix
        # NOTE(review): when PATH_INFO is exactly the repo prefix this
        # produces [''] rather than the [] used by the branch below for an
        # empty path - confirm that callers expect this.
        dispatchparts = dispatchpath.strip('/').split('/')
        dispatchpath = '/'.join(dispatchparts)

    elif 'PATH_INFO' in env:
        if env['PATH_INFO'].strip('/'):
            dispatchparts = env['PATH_INFO'].strip('/').split('/')
            dispatchpath = '/'.join(dispatchparts)
        else:
            dispatchparts = []
            dispatchpath = ''
    else:
        dispatchparts = []
        dispatchpath = None

    querystring = env.get('QUERY_STRING', '')

    # We store as a list so we have ordering information. We also store as
    # a dict to facilitate fast lookup.
    qsparams = multidict()
    for k, v in util.urlreq.parseqsl(querystring, keep_blank_values=True):
        qsparams.add(k, v)

    # HTTP_* keys contain HTTP request headers. The Headers structure should
    # perform case normalization for us. We just rewrite underscore to dash
    # so keys match what likely went over the wire.
    headers = []
    for k, v in env.items():
        if k.startswith('HTTP_'):
            headers.append((k[len('HTTP_'):].replace('_', '-'), v))

    from . import wsgiheaders # avoid cycle
    headers = wsgiheaders.Headers(headers)

    # This is kind of a lie because the HTTP header wasn't explicitly
    # sent. But for all intents and purposes it should be OK to lie about
    # this, since a consumer will use either value to determine how many
    # bytes are available to read.
    if 'CONTENT_LENGTH' in env and 'HTTP_CONTENT_LENGTH' not in env:
        headers['Content-Length'] = env['CONTENT_LENGTH']

    if 'CONTENT_TYPE' in env and 'HTTP_CONTENT_TYPE' not in env:
        headers['Content-Type'] = env['CONTENT_TYPE']

    # When the body length is known, cap reads at that length.
    bodyfh = env['wsgi.input']
    if 'Content-Length' in headers:
        bodyfh = util.cappedreader(bodyfh, int(headers['Content-Length']))

    return parsedrequest(method=env['REQUEST_METHOD'],
                         url=fullurl, baseurl=baseurl,
                         advertisedurl=advertisedfullurl,
                         advertisedbaseurl=advertisedbaseurl,
                         urlscheme=env['wsgi.url_scheme'],
                         remoteuser=env.get('REMOTE_USER'),
                         remotehost=env.get('REMOTE_HOST'),
                         apppath=apppath,
                         dispatchparts=dispatchparts, dispatchpath=dispatchpath,
                         reponame=reponame,
                         querystring=querystring,
                         qsparams=qsparams,
                         headers=headers,
                         bodyfh=bodyfh,
                         rawenv=env)
323 329
class offsettrackingwriter(object):
    """An append-only file-like object that counts what it has written.

    Each instance wraps a callable. Every ``write()`` forwards its data to
    that callable and adds to a running total, which ``tell()`` reports.

    This exists to wrap the ``write()`` function that a WSGI
    ``start_response()`` returns: that ``write()`` is a bare callable rather
    than a file object, so it provides none of the other file object methods.
    """
    def __init__(self, writefn):
        self._offset = 0
        self._write = writefn

    def write(self, s):
        written = self._write(s)
        # Not every writer reports a byte count; fall back to the length of
        # the data handed over when it returns None.
        self._offset += len(s) if written is None else written

    def flush(self):
        # Nothing is buffered here, so there is nothing to flush.
        pass

    def tell(self):
        return self._offset
354 360
class wsgiresponse(object):
    """Represents a response to a WSGI request.

    A response consists of a status line, headers, and a body.

    Consumers must populate the ``status`` and ``headers`` fields and
    make a call to a ``setbody*()`` method before the response can be
    issued.

    When it is time to start sending the response over the wire,
    ``sendresponse()`` is called. It handles emitting the header portion
    of the response message. It then yields chunks of body data to be
    written to the peer. Typically, the WSGI application itself calls
    and returns the value from ``sendresponse()``.
    """

    def __init__(self, req, startresponse):
        """Create an empty response tied to a specific request.

        ``req`` is a ``parsedrequest``. ``startresponse`` is the
        ``start_response`` function passed to the WSGI application.
        """
        self._req = req
        self._startresponse = startresponse

        self.status = None
        from . import wsgiheaders # avoid cycle
        self.headers = wsgiheaders.Headers([])

        # Exactly one of these three is populated by a ``setbody*()`` call.
        self._bodybytes = None
        self._bodygen = None
        self._bodywillwrite = False
        self._started = False
        # Set by ``sendresponse()`` when the body is emitted via write().
        self._bodywritefn = None

    def _verifybody(self):
        """Raise ProgrammingError if a body has already been defined."""
        if (self._bodybytes is not None or self._bodygen is not None
            or self._bodywillwrite):
            raise error.ProgrammingError('cannot define body multiple times')

    def setbodybytes(self, b):
        """Define the response body as static bytes.

        The empty string signals that there is no response body.

        Also sets the ``Content-Length`` header to the length of ``b``.
        """
        self._verifybody()
        self._bodybytes = b
        self.headers['Content-Length'] = '%d' % len(b)

    def setbodygen(self, gen):
        """Define the response body as a generator of bytes."""
        self._verifybody()
        self._bodygen = gen

    def setbodywillwrite(self):
        """Signal an intent to use write() to emit the response body.

        **This is the least preferred way to send a body.**

        It is preferred for WSGI applications to emit a generator of chunks
        constituting the response body. However, some consumers can't emit
        data this way. So, WSGI provides a way to obtain a ``write(data)``
        function that can be used to synchronously perform an unbuffered
        write.

        Calling this function signals an intent to produce the body in this
        manner.
        """
        self._verifybody()
        self._bodywillwrite = True

    def sendresponse(self):
        """Send the generated response to the client.

        Before this is called, ``status`` must be set and one of
        ``setbodybytes()``, ``setbodygen()`` or ``setbodywillwrite()`` must
        be called.

        This is a generator: nothing happens until it is advanced. It yields
        the chunks of the response body, if any.

        Calling this method multiple times is not allowed.

        Raises ``error.ProgrammingError`` if called more than once, if the
        status or body is undefined, or if a 304 response carries disallowed
        headers or a non-bytes body.
        """
        if self._started:
            raise error.ProgrammingError('sendresponse() called multiple times')

        self._started = True

        if not self.status:
            raise error.ProgrammingError('status line not defined')

        if (self._bodybytes is None and self._bodygen is None
            and not self._bodywillwrite):
            raise error.ProgrammingError('response body not defined')

        # RFC 7232 Section 4.1 states that a 304 MUST generate one of
        # {Cache-Control, Content-Location, Date, ETag, Expires, Vary}
        # and SHOULD NOT generate other headers unless they could be used
        # to guide cache updates. Furthermore, RFC 7230 Section 3.3.2
        # states that no response body can be issued. Content-Length can
        # be sent. But if it is present, it should be the size of the response
        # that wasn't transferred.
        if self.status.startswith('304 '):
            # setbodybytes('') will set C-L to 0. This doesn't conform with the
            # spec. So remove it.
            if self.headers.get('Content-Length') == '0':
                del self.headers['Content-Length']

            # Strictly speaking, this is too strict. But until it causes
            # problems, let's be strict.
            badheaders = {k for k in self.headers.keys()
                          if k.lower() not in ('date', 'etag', 'expires',
                                               'cache-control',
                                               'content-location',
                                               'vary')}
            if badheaders:
                raise error.ProgrammingError(
                    'illegal header on 304 response: %s' %
                    ', '.join(sorted(badheaders)))

            if self._bodygen is not None or self._bodywillwrite:
                raise error.ProgrammingError("must use setbodybytes('') with "
                                             "304 responses")

        # Various HTTP clients (notably httplib) won't read the HTTP response
        # until the HTTP request has been sent in full. If servers (us) send a
        # response before the HTTP request has been fully sent, the connection
        # may deadlock because neither end is reading.
        #
        # We work around this by "draining" the request data before
        # sending any response in some conditions.
        drain = False
        close = False

        # If the client sent Expect: 100-continue, we assume it is smart enough
        # to deal with the server sending a response before reading the request.
        # (httplib doesn't do this.)
        if self._req.headers.get('Expect', '').lower() == '100-continue':
            pass
        # Only tend to request methods that have bodies. Strictly speaking,
        # we should sniff for a body. But this is fine for our existing
        # WSGI applications.
        elif self._req.method not in ('POST', 'PUT'):
            pass
        else:
            # If we don't know how much data to read, there's no guarantee
            # that we can drain the request responsibly. The WSGI
            # specification only says that servers *should* ensure the
            # input stream doesn't overrun the actual request. So there's
            # no guarantee that reading until EOF won't corrupt the stream
            # state.
            if not isinstance(self._req.bodyfh, util.cappedreader):
                close = True
            else:
                # We /could/ only drain certain HTTP response codes. But 200 and
                # non-200 wire protocol responses both require draining. Since
                # we have a capped reader in place for all situations where we
                # drain, it is safe to read from that stream. We'll either do
                # a drain or no-op if we're already at EOF.
                drain = True

        if close:
            self.headers['Connection'] = 'Close'

        if drain:
            assert isinstance(self._req.bodyfh, util.cappedreader)
            while True:
                chunk = self._req.bodyfh.read(32768)
                if not chunk:
                    break

        strheaders = [(pycompat.strurl(k), pycompat.strurl(v)) for
                      k, v in self.headers.items()]
        write = self._startresponse(pycompat.sysstr(self.status),
                                    strheaders)

        # Dispatch on which body kind was defined rather than on truthiness,
        # so that an empty ``setbodybytes('')`` body is recognised as a valid
        # (empty) body instead of falling through to the error branch.
        if self._bodybytes is not None:
            if self._bodybytes:
                yield self._bodybytes
        elif self._bodygen is not None:
            for chunk in self._bodygen:
                yield chunk
        elif self._bodywillwrite:
            self._bodywritefn = write
        else:
            raise error.ProgrammingError('do not know how to send body')

    def getbodyfile(self):
        """Obtain a file object like object representing the response body.

        For this to work, you must call ``setbodywillwrite()`` and then
        ``sendresponse()`` first. ``sendresponse()`` is a generator and the
        function won't run to completion unless the generator is advanced. The
        generator yields no items. The easiest way to consume it is with
        ``list(res.sendresponse())``, which should resolve to an empty list -
        ``[]``.

        Returns an ``offsettrackingwriter`` wrapping the ``write()`` function
        obtained from ``start_response()``.

        Raises ``error.ProgrammingError`` if ``setbodywillwrite()`` was not
        called or ``sendresponse()`` has not started running.
        """
        if not self._bodywillwrite:
            raise error.ProgrammingError('must call setbodywillwrite() first')

        if not self._started:
            raise error.ProgrammingError('must call sendresponse() first; did '
                                         'you remember to consume it since it '
                                         'is a generator?')

        assert self._bodywritefn
        return offsettrackingwriter(self._bodywritefn)
557 563
def wsgiapplication(app_maker):
    """Build a WSGI callable around the application ``app_maker()`` creates.

    Exists for compatibility with old CGI scripts. A plain hgweb() or
    hgwebdir() can and should now be used as a WSGI application.
    """
    # The application is created once, up front; every request reuses it.
    wsgiapp = app_maker()

    def run_wsgi(env, respond):
        return wsgiapp(env, respond)

    return run_wsgi
General Comments 0
You need to be logged in to leave comments. Login now