##// END OF EJS Templates
revlog: Extract low-level random-access file read caching logic...
Simon Sapin -
r48218:e0a314bc default
parent child Browse files
Show More
@@ -0,0 +1,138 b''
1 # Copyright Mercurial Contributors
2 #
3 # This software may be used and distributed according to the terms of the
4 # GNU General Public License version 2 or any later version.
5
6 import contextlib
7
8 from ..i18n import _
9 from .. import (
10 error,
11 util,
12 )
13
14
# Upper bound on how large the cached window may grow when contiguous
# reads are appended to it (see _add_cached_chunk); beyond this the
# cache is replaced instead of extended.
_MAX_CACHED_CHUNK_SIZE = 1048576  # 1 MiB

# Error message raised when fewer bytes than requested could be read
# from the underlying file.
PARTIAL_READ_MSG = _(
    b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
)
20
21
22 def _is_power_of_two(n):
23 return (n & (n - 1) == 0) and n != 0
24
25
class randomaccessfile(object):
    """Random-access reads of arbitrary chunks of a file, with caching.

    A single contiguous window of file data is kept in memory; reads
    that fall inside the window are served without touching the file.
    """

    def __init__(
        self,
        opener,
        filename,
        default_cached_chunk_size,
        initial_cache=None,
    ):
        # The bit-masking in _read_and_update_cache only works when the
        # window size is a power of two.
        assert _is_power_of_two(default_cached_chunk_size)

        self.opener = opener
        self.filename = filename
        self.default_cached_chunk_size = default_cached_chunk_size
        # Set from revlog.py while a write transaction is open
        self.writing_handle = None
        if initial_cache:
            position, chunk = initial_cache
        else:
            position, chunk = 0, b''
        # Offset of the cached window from the start of the file
        self._cached_chunk_position = position
        self._cached_chunk = chunk

    def clear_cache(self):
        """Forget any cached file data."""
        self._cached_chunk_position = 0
        self._cached_chunk = b''

    def _open(self, mode=b'r'):
        """Open the underlying file and return the new file object."""
        return self.opener(self.filename, mode=mode)

    @contextlib.contextmanager
    def _open_read(self, existing_file_obj=None):
        """Context manager yielding a file object suitable for reading.

        Prefers, in order: a handle supplied by the caller, the handle
        currently being used for writes, and finally a freshly opened
        one (closed on exit).
        """
        if existing_file_obj is not None:
            # Caller supplied an explicit handle; use it directly.
            yield existing_file_obj
        elif self.writing_handle:
            # Reusing the write handle will seek it, which is somewhat
            # dangerous; however revlog._writeentry performs a SEEK_END
            # before every write, so we should be safe.
            yield self.writing_handle
        else:
            with self._open() as new_handle:
                yield new_handle

    def read_chunk(self, offset, length, existing_file_obj=None):
        """Return ``length`` bytes of the file starting at ``offset``.

        Accepts an absolute offset, a length to read, and an optional
        already-open file handle to read from.

        If an existing file handle is passed, it will be seeked and the
        original seek position will NOT be restored.

        Returns a str or buffer of raw byte data.

        Raises if the requested number of bytes could not be read.
        """
        wanted_end = offset + length
        cached_start = self._cached_chunk_position
        cached_end = cached_start + len(self._cached_chunk)
        # Serve from the cache when the requested range lies within it.
        if offset >= cached_start and wanted_end <= cached_end:
            if offset == cached_start and wanted_end == cached_end:
                # Exact match: hand back the cached bytes, no copy.
                return self._cached_chunk
            return util.buffer(
                self._cached_chunk, offset - cached_start, length
            )

        return self._read_and_update_cache(offset, length, existing_file_obj)

    def _read_and_update_cache(self, offset, length, existing_file_obj=None):
        """Read from the file, refreshing the cache around the data.

        The read is widened both forward and backward to a fixed-size,
        aligned window, which speeds up operations that scan the revlog
        backwards.
        """
        mask = ~(self.default_cached_chunk_size - 1)
        aligned_offset = offset & mask
        aligned_end = (offset + length + self.default_cached_chunk_size) & mask
        aligned_length = aligned_end - aligned_offset
        with self._open_read(existing_file_obj) as file_obj:
            file_obj.seek(aligned_offset)
            data = file_obj.read(aligned_length)

        self._add_cached_chunk(aligned_offset, data)

        skipped = offset - aligned_offset
        available = len(data) - skipped
        if available < length:
            message = PARTIAL_READ_MSG % (
                self.filename,
                length,
                offset,
                available,
            )
            raise error.RevlogError(message)

        if offset == aligned_offset and length == aligned_length:
            # The widened read happened to match the request exactly.
            return data
        return util.buffer(data, skipped, length)

    def _add_cached_chunk(self, offset, data):
        """Add to or replace the cached data chunk.

        Accepts an absolute offset and the data that is at that location.
        """
        is_contiguous = (
            self._cached_chunk_position + len(self._cached_chunk) == offset
        )
        fits = len(self._cached_chunk) + len(data) < _MAX_CACHED_CHUNK_SIZE
        if is_contiguous and fits:
            # Grow the existing cache window.
            self._cached_chunk += data
        else:
            self._cached_chunk = data
            self._cached_chunk_position = offset
@@ -454,6 +454,7 b' class changelog(revlog.revlog):'
454 self.opener = _delayopener(
454 self.opener = _delayopener(
455 self._realopener, self._indexfile, self._delaybuf
455 self._realopener, self._indexfile, self._delaybuf
456 )
456 )
457 self._segmentfile.opener = self.opener
457 self._delayed = True
458 self._delayed = True
458 tr.addpending(b'cl-%i' % id(self), self._writepending)
459 tr.addpending(b'cl-%i' % id(self), self._writepending)
459 tr.addfinalize(b'cl-%i' % id(self), self._finalize)
460 tr.addfinalize(b'cl-%i' % id(self), self._finalize)
@@ -462,6 +463,7 b' class changelog(revlog.revlog):'
462 """finalize index updates"""
463 """finalize index updates"""
463 self._delayed = False
464 self._delayed = False
464 self.opener = self._realopener
465 self.opener = self._realopener
466 self._segmentfile.opener = self.opener
465 # move redirected index data back into place
467 # move redirected index data back into place
466 if self._docket is not None:
468 if self._docket is not None:
467 self._write_docket(tr)
469 self._write_docket(tr)
@@ -501,6 +503,7 b' class changelog(revlog.revlog):'
501 self._delaybuf = None
503 self._delaybuf = None
502 self._divert = True
504 self._divert = True
503 self.opener = _divertopener(self._realopener, self._indexfile)
505 self.opener = _divertopener(self._realopener, self._indexfile)
506 self._segmentfile.opener = self.opener
504
507
505 if self._divert:
508 if self._divert:
506 return True
509 return True
@@ -86,6 +86,7 b' from .revlogutils import ('
86 docket as docketutil,
86 docket as docketutil,
87 flagutil,
87 flagutil,
88 nodemap as nodemaputil,
88 nodemap as nodemaputil,
89 randomaccessfile,
89 revlogv0,
90 revlogv0,
90 sidedata as sidedatautil,
91 sidedata as sidedatautil,
91 )
92 )
@@ -125,7 +126,6 b" rustrevlog = policy.importrust('revlog')"
125
126
126 # max size of revlog with inline data
127 # max size of revlog with inline data
127 _maxinline = 131072
128 _maxinline = 131072
128 _chunksize = 1048576
129
129
130 # Flag processors for REVIDX_ELLIPSIS.
130 # Flag processors for REVIDX_ELLIPSIS.
131 def ellipsisreadprocessor(rl, text):
131 def ellipsisreadprocessor(rl, text):
@@ -232,10 +232,6 b' def parse_index_v1_mixed(data, inline):'
232 # signed integer)
232 # signed integer)
233 _maxentrysize = 0x7FFFFFFF
233 _maxentrysize = 0x7FFFFFFF
234
234
235 PARTIAL_READ_MSG = _(
236 b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
237 )
238
239 FILE_TOO_SHORT_MSG = _(
235 FILE_TOO_SHORT_MSG = _(
240 b'cannot read from revlog %s;'
236 b'cannot read from revlog %s;'
241 b' expected %d bytes from offset %d, data size is %d'
237 b' expected %d bytes from offset %d, data size is %d'
@@ -605,7 +601,7 b' class revlog(object):'
605 self._parse_index = parse_index_v1_mixed
601 self._parse_index = parse_index_v1_mixed
606 try:
602 try:
607 d = self._parse_index(index_data, self._inline)
603 d = self._parse_index(index_data, self._inline)
608 index, _chunkcache = d
604 index, chunkcache = d
609 use_nodemap = (
605 use_nodemap = (
610 not self._inline
606 not self._inline
611 and self._nodemap_file is not None
607 and self._nodemap_file is not None
@@ -626,9 +622,13 b' class revlog(object):'
626 raise error.RevlogError(
622 raise error.RevlogError(
627 _(b"index %s is corrupted") % self.display_id
623 _(b"index %s is corrupted") % self.display_id
628 )
624 )
629 self.index, self._chunkcache = d
625 self.index = index
630 if not self._chunkcache:
626 self._segmentfile = randomaccessfile.randomaccessfile(
631 self._chunkclear()
627 self.opener,
628 (self._indexfile if self._inline else self._datafile),
629 self._chunkcachesize,
630 chunkcache,
631 )
632 # revnum -> (chain-length, sum-delta-length)
632 # revnum -> (chain-length, sum-delta-length)
633 self._chaininfocache = util.lrucachedict(500)
633 self._chaininfocache = util.lrucachedict(500)
634 # revlog header -> revlog compressor
634 # revlog header -> revlog compressor
@@ -709,32 +709,6 b' class revlog(object):'
709 return self.opener(self._datafile, mode=mode)
709 return self.opener(self._datafile, mode=mode)
710
710
711 @contextlib.contextmanager
711 @contextlib.contextmanager
712 def _datareadfp(self, existingfp=None):
713 """file object suitable to read data"""
714 # Use explicit file handle, if given.
715 if existingfp is not None:
716 yield existingfp
717
718 # Use a file handle being actively used for writes, if available.
719 # There is some danger to doing this because reads will seek the
720 # file. However, _writeentry() performs a SEEK_END before all writes,
721 # so we should be safe.
722 elif self._writinghandles:
723 if self._inline:
724 yield self._writinghandles[0]
725 else:
726 yield self._writinghandles[1]
727
728 # Otherwise open a new file handle.
729 else:
730 if self._inline:
731 func = self._indexfp
732 else:
733 func = self._datafp
734 with func() as fp:
735 yield fp
736
737 @contextlib.contextmanager
738 def _sidedatareadfp(self):
712 def _sidedatareadfp(self):
739 """file object suitable to read sidedata"""
713 """file object suitable to read sidedata"""
740 if self._writinghandles:
714 if self._writinghandles:
@@ -807,7 +781,7 b' class revlog(object):'
807 def clearcaches(self):
781 def clearcaches(self):
808 self._revisioncache = None
782 self._revisioncache = None
809 self._chainbasecache.clear()
783 self._chainbasecache.clear()
810 self._chunkcache = (0, b'')
784 self._segmentfile.clear_cache()
811 self._pcache = {}
785 self._pcache = {}
812 self._nodemap_docket = None
786 self._nodemap_docket = None
813 self.index.clearcaches()
787 self.index.clearcaches()
@@ -1629,85 +1603,6 b' class revlog(object):'
1629 p1, p2 = self.parents(node)
1603 p1, p2 = self.parents(node)
1630 return storageutil.hashrevisionsha1(text, p1, p2) != node
1604 return storageutil.hashrevisionsha1(text, p1, p2) != node
1631
1605
1632 def _cachesegment(self, offset, data):
1633 """Add a segment to the revlog cache.
1634
1635 Accepts an absolute offset and the data that is at that location.
1636 """
1637 o, d = self._chunkcache
1638 # try to add to existing cache
1639 if o + len(d) == offset and len(d) + len(data) < _chunksize:
1640 self._chunkcache = o, d + data
1641 else:
1642 self._chunkcache = offset, data
1643
1644 def _readsegment(self, offset, length, df=None):
1645 """Load a segment of raw data from the revlog.
1646
1647 Accepts an absolute offset, length to read, and an optional existing
1648 file handle to read from.
1649
1650 If an existing file handle is passed, it will be seeked and the
1651 original seek position will NOT be restored.
1652
1653 Returns a str or buffer of raw byte data.
1654
1655 Raises if the requested number of bytes could not be read.
1656 """
1657 # Cache data both forward and backward around the requested
1658 # data, in a fixed size window. This helps speed up operations
1659 # involving reading the revlog backwards.
1660 cachesize = self._chunkcachesize
1661 realoffset = offset & ~(cachesize - 1)
1662 reallength = (
1663 (offset + length + cachesize) & ~(cachesize - 1)
1664 ) - realoffset
1665 with self._datareadfp(df) as df:
1666 df.seek(realoffset)
1667 d = df.read(reallength)
1668
1669 self._cachesegment(realoffset, d)
1670 if offset != realoffset or reallength != length:
1671 startoffset = offset - realoffset
1672 if len(d) - startoffset < length:
1673 filename = self._indexfile if self._inline else self._datafile
1674 got = len(d) - startoffset
1675 m = PARTIAL_READ_MSG % (filename, length, offset, got)
1676 raise error.RevlogError(m)
1677 return util.buffer(d, startoffset, length)
1678
1679 if len(d) < length:
1680 filename = self._indexfile if self._inline else self._datafile
1681 got = len(d) - startoffset
1682 m = PARTIAL_READ_MSG % (filename, length, offset, got)
1683 raise error.RevlogError(m)
1684
1685 return d
1686
1687 def _getsegment(self, offset, length, df=None):
1688 """Obtain a segment of raw data from the revlog.
1689
1690 Accepts an absolute offset, length of bytes to obtain, and an
1691 optional file handle to the already-opened revlog. If the file
1692 handle is used, it's original seek position will not be preserved.
1693
1694 Requests for data may be returned from a cache.
1695
1696 Returns a str or a buffer instance of raw byte data.
1697 """
1698 o, d = self._chunkcache
1699 l = len(d)
1700
1701 # is it in the cache?
1702 cachestart = offset - o
1703 cacheend = cachestart + length
1704 if cachestart >= 0 and cacheend <= l:
1705 if cachestart == 0 and cacheend == l:
1706 return d # avoid a copy
1707 return util.buffer(d, cachestart, cacheend - cachestart)
1708
1709 return self._readsegment(offset, length, df=df)
1710
1711 def _getsegmentforrevs(self, startrev, endrev, df=None):
1606 def _getsegmentforrevs(self, startrev, endrev, df=None):
1712 """Obtain a segment of raw data corresponding to a range of revisions.
1607 """Obtain a segment of raw data corresponding to a range of revisions.
1713
1608
@@ -1740,7 +1635,7 b' class revlog(object):'
1740 end += (endrev + 1) * self.index.entry_size
1635 end += (endrev + 1) * self.index.entry_size
1741 length = end - start
1636 length = end - start
1742
1637
1743 return start, self._getsegment(start, length, df=df)
1638 return start, self._segmentfile.read_chunk(start, length, df)
1744
1639
1745 def _chunk(self, rev, df=None):
1640 def _chunk(self, rev, df=None):
1746 """Obtain a single decompressed chunk for a revision.
1641 """Obtain a single decompressed chunk for a revision.
@@ -1832,10 +1727,6 b' class revlog(object):'
1832
1727
1833 return l
1728 return l
1834
1729
1835 def _chunkclear(self):
1836 """Clear the raw chunk cache."""
1837 self._chunkcache = (0, b'')
1838
1839 def deltaparent(self, rev):
1730 def deltaparent(self, rev):
1840 """return deltaparent of the given revision"""
1731 """return deltaparent of the given revision"""
1841 base = self.index[rev][3]
1732 base = self.index[rev][3]
@@ -2043,7 +1934,12 b' class revlog(object):'
2043 length = sidedata_size
1934 length = sidedata_size
2044 offset = sidedata_offset
1935 offset = sidedata_offset
2045 got = len(comp_segment)
1936 got = len(comp_segment)
2046 m = PARTIAL_READ_MSG % (filename, length, offset, got)
1937 m = randomaccessfile.PARTIAL_READ_MSG % (
1938 filename,
1939 length,
1940 offset,
1941 got,
1942 )
2047 raise error.RevlogError(m)
1943 raise error.RevlogError(m)
2048
1944
2049 comp = self.index[rev][11]
1945 comp = self.index[rev][11]
@@ -2136,6 +2032,7 b' class revlog(object):'
2136 # We can't use the cached file handle after close(). So prevent
2032 # We can't use the cached file handle after close(). So prevent
2137 # its usage.
2033 # its usage.
2138 self._writinghandles = None
2034 self._writinghandles = None
2035 self._segmentfile.writing_handle = None
2139
2036
2140 new_dfh = self._datafp(b'w+')
2037 new_dfh = self._datafp(b'w+')
2141 new_dfh.truncate(0) # drop any potentially existing data
2038 new_dfh.truncate(0) # drop any potentially existing data
@@ -2171,12 +2068,17 b' class revlog(object):'
2171
2068
2172 tr.replace(self._indexfile, trindex * self.index.entry_size)
2069 tr.replace(self._indexfile, trindex * self.index.entry_size)
2173 nodemaputil.setup_persistent_nodemap(tr, self)
2070 nodemaputil.setup_persistent_nodemap(tr, self)
2174 self._chunkclear()
2071 self._segmentfile = randomaccessfile.randomaccessfile(
2072 self.opener,
2073 self._datafile,
2074 self._chunkcachesize,
2075 )
2175
2076
2176 if existing_handles:
2077 if existing_handles:
2177 # switched from inline to conventional reopen the index
2078 # switched from inline to conventional reopen the index
2178 ifh = self.__index_write_fp()
2079 ifh = self.__index_write_fp()
2179 self._writinghandles = (ifh, new_dfh, None)
2080 self._writinghandles = (ifh, new_dfh, None)
2081 self._segmentfile.writing_handle = new_dfh
2180 new_dfh = None
2082 new_dfh = None
2181 finally:
2083 finally:
2182 if new_dfh is not None:
2084 if new_dfh is not None:
@@ -2235,11 +2137,13 b' class revlog(object):'
2235 transaction.add(self._indexfile, isize)
2137 transaction.add(self._indexfile, isize)
2236 # exposing all file handle for writing.
2138 # exposing all file handle for writing.
2237 self._writinghandles = (ifh, dfh, sdfh)
2139 self._writinghandles = (ifh, dfh, sdfh)
2140 self._segmentfile.writing_handle = ifh if self._inline else dfh
2238 yield
2141 yield
2239 if self._docket is not None:
2142 if self._docket is not None:
2240 self._write_docket(transaction)
2143 self._write_docket(transaction)
2241 finally:
2144 finally:
2242 self._writinghandles = None
2145 self._writinghandles = None
2146 self._segmentfile.writing_handle = None
2243 if dfh is not None:
2147 if dfh is not None:
2244 dfh.close()
2148 dfh.close()
2245 if sdfh is not None:
2149 if sdfh is not None:
@@ -2873,7 +2777,7 b' class revlog(object):'
2873 # then reset internal state in memory to forget those revisions
2777 # then reset internal state in memory to forget those revisions
2874 self._revisioncache = None
2778 self._revisioncache = None
2875 self._chaininfocache = util.lrucachedict(500)
2779 self._chaininfocache = util.lrucachedict(500)
2876 self._chunkclear()
2780 self._segmentfile.clear_cache()
2877
2781
2878 del self.index[rev:-1]
2782 del self.index[rev:-1]
2879
2783
General Comments 0
You need to be logged in to leave comments. Login now