byterange.py
451 lines
| 16.2 KiB
| text/x-python
|
PythonLexer
/ mercurial / byterange.py
mpm@selenic.com
|
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
#      Free Software Foundation, Inc.,
#      59 Temple Place, Suite 330,
#      Boston, MA  02111-1307  USA

# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko

# $Id: byterange.py,v 1.9 2005/02/14 21:55:07 mstenner Exp $

import os | ||||
import stat | ||||
import urllib | ||||
import urllib2 | ||||
import rfc822 | ||||
mpm@selenic.com
|
r575 | try: | ||
mpm@selenic.com
|
r0 | from cStringIO import StringIO | ||
mpm@selenic.com
|
r575 | except ImportError, msg: | ||
mpm@selenic.com
|
r0 | from StringIO import StringIO | ||
class RangeError(IOError):
    """Signal that the requested byte range cannot be satisfied."""
mpm@selenic.com
|
r575 | |||
mpm@selenic.com
|
class HTTPRangeHandler(urllib2.BaseHandler):
    """Handler that enables HTTP Range headers.

    This was extremely simple. The Range header is an HTTP feature to
    begin with, so all this class does is tell urllib2 that the
    "206 Partial Content" response from the HTTP server is what we
    expected.

    Example:
        import urllib2
        import byterange

        range_handler = byterange.HTTPRangeHandler()
        opener = urllib2.build_opener(range_handler)

        # install it
        urllib2.install_opener(opener)

        # create Request and set Range header
        req = urllib2.Request('http://www.python.org/')
        req.add_header('Range', 'bytes=30-50')
        f = urllib2.urlopen(req)
    """

    def http_error_206(self, req, fp, code, msg, hdrs):
        """Treat "206 Partial Content" as a successful response.

        Wraps fp and hdrs in a urllib.addinfourl for the request's full
        URL and records the status code and message on it.
        """
        # 206 Partial Content Response
        r = urllib.addinfourl(fp, hdrs, req.get_full_url())
        r.code = code
        r.msg = msg
        return r

    def http_error_416(self, req, fp, code, msg, hdrs):
        """Translate "416 Range Not Satisfiable" into a RangeError."""
        # HTTP's Range Not Satisfiable error
        raise RangeError('Requested Range Not Satisfiable')
class RangeableFileObject:
    """File object wrapper to enable raw range handling.

    This was implemented primarily for handling range specifications
    for file:// urls. This object effectively makes a file object look
    like it consists only of a range of bytes in the stream.

    The range is half-open: firstbyte is the first offset exposed and
    lastbyte is the offset just past the final one.  A lastbyte of ''
    means "until the end of the wrapped stream".

    Examples:
        # expose 10 bytes, starting at byte position 20, from
        # /etc/passwd.
        >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30))
        # seek seeks within the range (to position 23 in this case)
        >>> fo.seek(3)
        # tell tells where you're at _within the range_ (position 3 in
        # this case)
        >>> fo.tell()
        # read EOFs if an attempt is made to read past the last
        # byte in the range. the following will return only 7 bytes.
        >>> fo.read(30)
    """

    def __init__(self, fo, rangetup):
        """Create a RangeableFileObject.

        fo       -- a file like object. only the read() method need be
                    supported but supporting an optimized seek() is
                    preferable.
        rangetup -- a (firstbyte,lastbyte) tuple specifying the range
                    to work over.
        The file object provided is assumed to be at byte offset 0.
        """
        self.fo = fo
        normalized = range_tuple_normalize(rangetup)
        if normalized is None:
            # range_tuple_normalize() collapses "the whole file" to None;
            # spell it out again so it can be unpacked.
            normalized = (0, '')
        (self.firstbyte, self.lastbyte) = normalized
        self.realpos = 0
        self._do_seek(self.firstbyte)

    def __getattr__(self, name):
        """This effectively allows us to wrap at the instance level.

        Any attribute not found in _this_ object will be searched for
        in self.fo.  This includes methods.
        """
        if name == 'fo':
            # self.fo is not set yet; looking it up below would re-enter
            # __getattr__ and recurse without end.
            raise AttributeError(name)
        if hasattr(self.fo, name):
            return getattr(self.fo, name)
        raise AttributeError(name)

    def tell(self):
        """Return the position within the range.

        This is different from fo.tell in that position 0 is the
        first byte position of the range tuple. For example, if
        this object was created with a range tuple of (500,899),
        tell() will return 0 when at byte position 500 of the file.
        """
        return (self.realpos - self.firstbyte)

    def seek(self, offset, whence=0):
        """Seek within the byte range.

        Positioning is identical to that described under tell().
        Targets outside the range are clamped to its nearest edge.
        Raises IOError for whence == 2 (seeking from the end).
        """
        assert whence in (0, 1, 2)
        if whence == 0:   # absolute seek
            realoffset = self.firstbyte + offset
        elif whence == 1:   # relative seek
            realoffset = self.realpos + offset
        elif whence == 2:   # absolute from end of file
            # XXX: are we raising the right Error here?
            raise IOError('seek from end of file not supported.')

        # do not allow seek past lastbyte in range ('' means unbounded;
        # compare against the sentinel so a lastbyte of 0 still counts)
        if self.lastbyte != '' and realoffset >= self.lastbyte:
            realoffset = self.lastbyte
        # nor before the first byte of the range
        if realoffset < self.firstbyte:
            realoffset = self.firstbyte

        self._do_seek(realoffset - self.realpos)

    def read(self, size=-1):
        """Read within the range.

        This method will limit the size read based on the range.
        """
        size = self._calc_read_size(size)
        rslt = self.fo.read(size)
        self.realpos += len(rslt)
        return rslt

    def readline(self, size=-1):
        """Read lines within the range.

        This method will limit the size read based on the range.
        """
        size = self._calc_read_size(size)
        rslt = self.fo.readline(size)
        self.realpos += len(rslt)
        return rslt

    def _calc_read_size(self, size):
        """Handles calculating the amount of data to read based on
        the range.

        Returns size unchanged for an unbounded range; otherwise caps it
        at the number of bytes left before lastbyte (a negative size
        means "everything that is left").
        """
        if self.lastbyte != '':
            remaining = self.lastbyte - self.realpos
            if size < 0 or size >= remaining:
                size = remaining
        return size

    def _do_seek(self, offset):
        """Seek based on whether wrapped object supports seek().

        offset is relative to the current position (self.realpos).
        A negative offset is only possible when the wrapped object has
        a seek() method; otherwise IOError is raised.
        """
        if hasattr(self.fo, 'seek'):
            self.fo.seek(self.realpos + offset)
        elif offset < 0:
            raise IOError('backward seek not supported by wrapped file object.')
        else:
            self._poor_mans_seek(offset)
        self.realpos += offset

    def _poor_mans_seek(self, offset):
        """Seek by calling the wrapped file objects read() method.

        This is used for file like objects that do not have native
        seek support. The wrapped objects read() method is called
        to manually seek to the desired position.
        offset -- read this number of bytes from the wrapped
                  file object.
        raise RangeError if we encounter EOF before reaching the
        specified offset.
        """
        pos = 0
        bufsize = 1024
        while pos < offset:
            if (pos + bufsize) > offset:
                bufsize = offset - pos
            buf = self.fo.read(bufsize)
            if len(buf) != bufsize:
                raise RangeError('Requested Range Not Satisfiable')
            pos += bufsize
class FileRangeHandler(urllib2.FileHandler):
    """FileHandler subclass that adds Range support.

    This class handles Range headers exactly like an HTTP
    server would.
    """

    def open_local_file(self, req):
        """Open the local file named by req, honouring a Range header.

        Returns a urllib.addinfourl whose headers carry Content-Type,
        Content-Length (the length of the selected range, or the file
        size when no range applies) and Last-modified.

        Raises urllib2.URLError when the URL names a non-local host and
        RangeError when the requested range lies outside the file.
        """
        import mimetypes
        import mimetools
        host = req.get_host()
        selector = req.get_selector()
        localfile = urllib.url2pathname(selector)
        stats = os.stat(localfile)
        size = stats[stat.ST_SIZE]
        modified = rfc822.formatdate(stats[stat.ST_MTIME])
        mtype = mimetypes.guess_type(selector)[0]
        if host:
            host, port = urllib.splitport(host)
            if port or socket.gethostbyname(host) not in self.get_names():
                raise urllib2.URLError('file not on local host')
        brange = range_header_to_tuple(req.headers.get('Range', None))
        assert brange != ()
        if brange:
            (fb, lb) = brange
            if lb == '':
                lb = size
            if fb < 0 or fb > size or lb > size:
                raise RangeError('Requested Range Not Satisfiable')
            size = (lb - fb)
        # Open only after the range has been validated so that a
        # rejected request does not leave a file handle behind.
        fo = open(localfile, 'rb')
        if brange:
            fo = RangeableFileObject(fo, (fb, lb))
        headers = mimetools.Message(StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        return urllib.addinfourl(fo, headers, 'file:' + selector)
mpm@selenic.com
|
# FTP Range Support
# Unfortunately, a large amount of base FTP code had to be copied
# from urllib and urllib2 in order to insert the FTP REST command.
# Code modifications for range support have been commented as
# follows:
# -- range support modifications start/end here
from urllib import splitport, splituser, splitpasswd, splitattr, \ | ||||
unquote, addclosehook, addinfourl | ||||
import ftplib | ||||
import socket | ||||
import sys | ||||
import ftplib | ||||
import mimetypes | ||||
import mimetools | ||||
class FTPRangeHandler(urllib2.FTPHandler): | ||||
def ftp_open(self, req): | ||||
host = req.get_host() | ||||
if not host: | ||||
raise IOError, ('ftp error', 'no host given') | ||||
host, port = splitport(host) | ||||
if port is None: | ||||
port = ftplib.FTP_PORT | ||||
# username/password handling | ||||
user, host = splituser(host) | ||||
if user: | ||||
user, passwd = splitpasswd(user) | ||||
else: | ||||
passwd = None | ||||
host = unquote(host) | ||||
user = unquote(user or '') | ||||
passwd = unquote(passwd or '') | ||||
mpm@selenic.com
|
r575 | |||
mpm@selenic.com
|
r0 | try: | ||
host = socket.gethostbyname(host) | ||||
except socket.error, msg: | ||||
raise URLError(msg) | ||||
path, attrs = splitattr(req.get_selector()) | ||||
dirs = path.split('/') | ||||
dirs = map(unquote, dirs) | ||||
dirs, file = dirs[:-1], dirs[-1] | ||||
if dirs and not dirs[0]: | ||||
dirs = dirs[1:] | ||||
try: | ||||
fw = self.connect_ftp(user, passwd, host, port, dirs) | ||||
type = file and 'I' or 'D' | ||||
for attr in attrs: | ||||
attr, value = splitattr(attr) | ||||
if attr.lower() == 'type' and \ | ||||
value in ('a', 'A', 'i', 'I', 'd', 'D'): | ||||
type = value.upper() | ||||
mpm@selenic.com
|
r575 | |||
mpm@selenic.com
|
r0 | # -- range support modifications start here | ||
rest = None | ||||
mpm@selenic.com
|
r575 | range_tup = range_header_to_tuple(req.headers.get('Range',None)) | ||
mpm@selenic.com
|
r0 | assert range_tup != () | ||
if range_tup: | ||||
(fb,lb) = range_tup | ||||
if fb > 0: rest = fb | ||||
# -- range support modifications end here | ||||
mpm@selenic.com
|
r575 | |||
mpm@selenic.com
|
r0 | fp, retrlen = fw.retrfile(file, type, rest) | ||
mpm@selenic.com
|
r575 | |||
mpm@selenic.com
|
r0 | # -- range support modifications start here | ||
if range_tup: | ||||
(fb,lb) = range_tup | ||||
mpm@selenic.com
|
r575 | if lb == '': | ||
mpm@selenic.com
|
r0 | if retrlen is None or retrlen == 0: | ||
raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.') | ||||
lb = retrlen | ||||
retrlen = lb - fb | ||||
if retrlen < 0: | ||||
# beginning of range is larger than file | ||||
raise RangeError('Requested Range Not Satisfiable') | ||||
else: | ||||
retrlen = lb - fb | ||||
fp = RangeableFileObject(fp, (0,retrlen)) | ||||
# -- range support modifications end here | ||||
mpm@selenic.com
|
r575 | |||
mpm@selenic.com
|
r0 | headers = "" | ||
mtype = mimetypes.guess_type(req.get_full_url())[0] | ||||
if mtype: | ||||
headers += "Content-Type: %s\n" % mtype | ||||
if retrlen is not None and retrlen >= 0: | ||||
headers += "Content-Length: %d\n" % retrlen | ||||
sf = StringIO(headers) | ||||
headers = mimetools.Message(sf) | ||||
return addinfourl(fp, headers, req.get_full_url()) | ||||
except ftplib.all_errors, msg: | ||||
raise IOError, ('ftp error', msg), sys.exc_info()[2] | ||||
def connect_ftp(self, user, passwd, host, port, dirs): | ||||
fw = ftpwrapper(user, passwd, host, port, dirs) | ||||
return fw | ||||
class ftpwrapper(urllib.ftpwrapper): | ||||
# range support note: | ||||
# this ftpwrapper code is copied directly from | ||||
# urllib. The only enhancement is to add the rest | ||||
# argument and pass it on to ftp.ntransfercmd | ||||
def retrfile(self, file, type, rest=None): | ||||
self.endtransfer() | ||||
if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 | ||||
else: cmd = 'TYPE ' + type; isdir = 0 | ||||
try: | ||||
self.ftp.voidcmd(cmd) | ||||
except ftplib.all_errors: | ||||
self.init() | ||||
self.ftp.voidcmd(cmd) | ||||
conn = None | ||||
if file and not isdir: | ||||
# Use nlst to see if the file exists at all | ||||
try: | ||||
self.ftp.nlst(file) | ||||
except ftplib.error_perm, reason: | ||||
raise IOError, ('ftp error', reason), sys.exc_info()[2] | ||||
# Restore the transfer mode! | ||||
self.ftp.voidcmd(cmd) | ||||
# Try to retrieve as a file | ||||
try: | ||||
cmd = 'RETR ' + file | ||||
conn = self.ftp.ntransfercmd(cmd, rest) | ||||
except ftplib.error_perm, reason: | ||||
if str(reason)[:3] == '501': | ||||
# workaround for REST not supported error | ||||
fp, retrlen = self.retrfile(file, type) | ||||
fp = RangeableFileObject(fp, (rest,'')) | ||||
return (fp, retrlen) | ||||
elif str(reason)[:3] != '550': | ||||
raise IOError, ('ftp error', reason), sys.exc_info()[2] | ||||
if not conn: | ||||
# Set transfer mode to ASCII! | ||||
self.ftp.voidcmd('TYPE A') | ||||
# Try a directory listing | ||||
if file: cmd = 'LIST ' + file | ||||
else: cmd = 'LIST' | ||||
conn = self.ftp.ntransfercmd(cmd) | ||||
self.busy = 1 | ||||
# Pass back both a suitably decorated object and a retrieval length | ||||
return (addclosehook(conn[0].makefile('rb'), | ||||
self.endtransfer), conn[1]) | ||||
#################################################################### | ||||
# Range Tuple Functions | ||||
# XXX: These range tuple functions might go better in a class. | ||||
_rangere = None


def range_header_to_tuple(range_header):
    """Get a (firstbyte,lastbyte) tuple from a Range header value.

    Range headers have the form "bytes=<firstbyte>-<lastbyte>". This
    function pulls the firstbyte and lastbyte values and returns
    a (firstbyte,lastbyte) tuple. If lastbyte is not specified in
    the header value, it is returned as an empty string in the
    tuple.

    The header's last byte is inclusive; the returned lastbyte is one
    past it, so "bytes=0-0" yields (0, 1).

    Return None if range_header is None, or if the header selects the
    entire file ("bytes=0-").
    Return () if range_header does not conform to the range spec
    pattern.
    """
    global _rangere
    if range_header is None:
        return None
    if _rangere is None:
        # compiled on first use and cached at module level
        import re
        _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)')
    match = _rangere.match(range_header)
    if not match:
        return ()
    tup = range_tuple_normalize(match.group(1, 2))
    # Compare against the '' sentinel rather than relying on truthiness:
    # a last byte of 0 is a real value and must be shifted as well.
    if tup and tup[1] != '':
        tup = (tup[0], tup[1] + 1)
    return tup
def range_tuple_to_header(range_tup):
    """Build a Range header value from a range tuple.

    The result has the form "bytes=<firstbyte>-<lastbyte>".  None is
    returned when no header is needed, i.e. when range_tup is None or
    normalizes to the whole file.
    """
    if range_tup is None:
        return None
    normalized = range_tuple_normalize(range_tup)
    if not normalized:
        return None
    first, last = normalized
    if last:
        # the tuple's end is one past the header's inclusive last byte
        last = last - 1
    return 'bytes=%s-%s' % (first, last)
mpm@selenic.com
|
r575 | |||
mpm@selenic.com
|
def range_tuple_normalize(range_tup):
    """Normalize a (first_byte,last_byte) range tuple.

    Return a tuple whose first element is guaranteed to be an int
    and whose second element will be '' (meaning: the last byte) or
    an int. Finally, return None if the normalized tuple == (0,'')
    as that is equivalent to retrieving the entire file.

    A first element of None or '' is read as 0; a missing, None or ''
    second element is read as ''.  None is passed through unchanged.
    Raise RangeError if the last byte is smaller than the first.
    """
    if range_tup is None:
        return None
    # handle first byte
    fb = range_tup[0]
    if fb in (None, ''):
        fb = 0
    else:
        fb = int(fb)
    # handle last byte
    try:
        lb = range_tup[1]
    except IndexError:
        lb = ''
    else:
        if lb is None:
            lb = ''
        elif lb != '':
            lb = int(lb)
    # check if range is over the entire file
    if (fb, lb) == (0, ''):
        return None
    # check that the range is valid; the open-ended '' sentinel is
    # skipped explicitly instead of relying on str/int ordering
    if lb != '' and lb < fb:
        raise RangeError('Invalid byte range: %s-%s' % (fb, lb))
    return (fb, lb)