##// END OF EJS Templates
revlog: Extract low-level random-access file read caching logic...
Simon Sapin -
r48218:e0a314bc default
parent child Browse files
Show More
@@ -0,0 +1,138 b''
1 # Copyright Mercurial Contributors
2 #
3 # This software may be used and distributed according to the terms of the
4 # GNU General Public License version 2 or any later version.
5
6 import contextlib
7
8 from ..i18n import _
9 from .. import (
10 error,
11 util,
12 )
13
14
# Upper bound (exclusive) on the size the cached chunk may reach when newly
# read data is appended to it rather than replacing it.
_MAX_CACHED_CHUNK_SIZE = 1 << 20  # 1 MiB

# Placeholders, in order: file name, requested length, requested offset,
# number of bytes actually obtained.
PARTIAL_READ_MSG = _(
    b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
)
20
21
22 def _is_power_of_two(n):
23 return (n & (n - 1) == 0) and n != 0
24
25
class randomaccessfile(object):
    """Read arbitrary chunks of data from a file, with some caching

    One contiguous run of bytes is remembered between reads, together with
    its absolute position in the file.
    """

    def __init__(
        self,
        opener,
        filename,
        default_cached_chunk_size,
        initial_cache=None,
    ):
        """Set up access to ``filename`` through ``opener``

        ``opener`` is called as ``opener(filename, mode=...)`` whenever a
        fresh file object is needed.

        ``default_cached_chunk_size`` is the granularity of the window read
        from the file on a cache miss; it must be a power of two.

        ``initial_cache``, when truthy, is a ``(position, data)`` pair used
        to pre-populate the cache.
        """
        # The window computation in _read_and_update_cache() masks offsets
        # with ``size - 1``, which is only meaningful for powers of two.
        assert _is_power_of_two(default_cached_chunk_size)

        self.opener = opener
        self.filename = filename
        self.default_cached_chunk_size = default_cached_chunk_size
        self.writing_handle = None  # This is set from revlog.py

        if initial_cache:
            position, chunk = initial_cache
        else:
            # position is an offset from the start of the file
            position, chunk = 0, b''
        self._cached_chunk_position = position
        self._cached_chunk = chunk

    def clear_cache(self):
        """Forget the cached data"""
        self._cached_chunk_position, self._cached_chunk = 0, b''

    def _open(self, mode=b'r'):
        """Return a file object"""
        opener = self.opener
        return opener(self.filename, mode=mode)

    @contextlib.contextmanager
    def _open_read(self, existing_file_obj=None):
        """File object suitable for reading data

        In order of preference this yields: the explicitly given file
        object, the handle currently used for writing, or a newly opened
        file object. Only the newly opened one is managed by this context.
        """
        if existing_file_obj is not None:
            # Use explicit file handle, if given.
            borrowed = existing_file_obj
        elif self.writing_handle:
            # Use a file handle being actively used for writes, if
            # available. There is some danger to doing this because reads
            # will seek the file. However, revlog._writeentry performs a
            # SEEK_END before all writes, so we should be safe.
            borrowed = self.writing_handle
        else:
            borrowed = None

        if borrowed is None:
            # Nothing to reuse: open a new file handle.
            with self._open() as fresh:
                yield fresh
        else:
            yield borrowed

    def read_chunk(self, offset, length, existing_file_obj=None):
        """Read a chunk of bytes from the file.

        Takes the absolute ``offset`` of the data, the ``length`` to read
        and, optionally, an already opened file object to read from.

        A file object passed this way will be seeked, and its original seek
        position will NOT be restored.

        The request is served from the cache when the cache fully covers
        it. The result is either the raw data itself or a ``util.buffer``
        over it.

        Raises if the requested number of bytes could not be read.
        """
        cached = self._cached_chunk
        cached_size = len(cached)
        # Position of the request relative to the start of the cached data.
        skip = offset - self._cached_chunk_position

        if skip >= 0 and skip + length <= cached_size:
            if skip == 0 and length == cached_size:
                return cached  # avoid a copy
            return util.buffer(cached, skip, length)

        return self._read_and_update_cache(offset, length, existing_file_obj)

    def _read_and_update_cache(self, offset, length, existing_file_obj=None):
        """Read from the file, remember what was read, return the request

        Raises ``error.RevlogError`` when fewer than ``length`` bytes are
        available from ``offset``.
        """
        # Cache data both forward and backward around the requested
        # data, in a fixed size window. This helps speed up operations
        # involving reading the revlog backwards.
        window = self.default_cached_chunk_size
        align = ~(window - 1)
        real_offset = offset & align
        real_length = ((offset + length + window) & align) - real_offset

        with self._open_read(existing_file_obj) as file_obj:
            file_obj.seek(real_offset)
            data = file_obj.read(real_length)

        self._add_cached_chunk(real_offset, data)

        relative_offset = offset - real_offset
        got = len(data) - relative_offset
        if got < length:
            message = PARTIAL_READ_MSG % (self.filename, length, offset, got)
            raise error.RevlogError(message)

        if offset == real_offset and real_length == length:
            # What was read is exactly what was asked for.
            return data
        return util.buffer(data, relative_offset, length)

    def _add_cached_chunk(self, offset, data):
        """Add to or replace the cached data chunk.

        ``offset`` is the absolute position of ``data`` in the file. The
        data is appended to the cache when it starts right where the cache
        ends and the result stays below the maximum cached size; otherwise
        it replaces the cache.
        """
        cached_size = len(self._cached_chunk)
        contiguous = self._cached_chunk_position + cached_size == offset
        if contiguous and cached_size + len(data) < _MAX_CACHED_CHUNK_SIZE:
            # add to existing cache
            self._cached_chunk += data
            return

        self._cached_chunk = data
        self._cached_chunk_position = offset
@@ -454,6 +454,7 b' class changelog(revlog.revlog):'
454 454 self.opener = _delayopener(
455 455 self._realopener, self._indexfile, self._delaybuf
456 456 )
457 self._segmentfile.opener = self.opener
457 458 self._delayed = True
458 459 tr.addpending(b'cl-%i' % id(self), self._writepending)
459 460 tr.addfinalize(b'cl-%i' % id(self), self._finalize)
@@ -462,6 +463,7 b' class changelog(revlog.revlog):'
462 463 """finalize index updates"""
463 464 self._delayed = False
464 465 self.opener = self._realopener
466 self._segmentfile.opener = self.opener
465 467 # move redirected index data back into place
466 468 if self._docket is not None:
467 469 self._write_docket(tr)
@@ -501,6 +503,7 b' class changelog(revlog.revlog):'
501 503 self._delaybuf = None
502 504 self._divert = True
503 505 self.opener = _divertopener(self._realopener, self._indexfile)
506 self._segmentfile.opener = self.opener
504 507
505 508 if self._divert:
506 509 return True
@@ -86,6 +86,7 b' from .revlogutils import ('
86 86 docket as docketutil,
87 87 flagutil,
88 88 nodemap as nodemaputil,
89 randomaccessfile,
89 90 revlogv0,
90 91 sidedata as sidedatautil,
91 92 )
@@ -125,7 +126,6 b" rustrevlog = policy.importrust('revlog')"
125 126
126 127 # max size of revlog with inline data
127 128 _maxinline = 131072
128 _chunksize = 1048576
129 129
130 130 # Flag processors for REVIDX_ELLIPSIS.
131 131 def ellipsisreadprocessor(rl, text):
@@ -232,10 +232,6 b' def parse_index_v1_mixed(data, inline):'
232 232 # signed integer)
233 233 _maxentrysize = 0x7FFFFFFF
234 234
235 PARTIAL_READ_MSG = _(
236 b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
237 )
238
239 235 FILE_TOO_SHORT_MSG = _(
240 236 b'cannot read from revlog %s;'
241 237 b' expected %d bytes from offset %d, data size is %d'
@@ -605,7 +601,7 b' class revlog(object):'
605 601 self._parse_index = parse_index_v1_mixed
606 602 try:
607 603 d = self._parse_index(index_data, self._inline)
608 index, _chunkcache = d
604 index, chunkcache = d
609 605 use_nodemap = (
610 606 not self._inline
611 607 and self._nodemap_file is not None
@@ -626,9 +622,13 b' class revlog(object):'
626 622 raise error.RevlogError(
627 623 _(b"index %s is corrupted") % self.display_id
628 624 )
629 self.index, self._chunkcache = d
630 if not self._chunkcache:
631 self._chunkclear()
625 self.index = index
626 self._segmentfile = randomaccessfile.randomaccessfile(
627 self.opener,
628 (self._indexfile if self._inline else self._datafile),
629 self._chunkcachesize,
630 chunkcache,
631 )
632 632 # revnum -> (chain-length, sum-delta-length)
633 633 self._chaininfocache = util.lrucachedict(500)
634 634 # revlog header -> revlog compressor
@@ -709,32 +709,6 b' class revlog(object):'
709 709 return self.opener(self._datafile, mode=mode)
710 710
711 711 @contextlib.contextmanager
712 def _datareadfp(self, existingfp=None):
713 """file object suitable to read data"""
714 # Use explicit file handle, if given.
715 if existingfp is not None:
716 yield existingfp
717
718 # Use a file handle being actively used for writes, if available.
719 # There is some danger to doing this because reads will seek the
720 # file. However, _writeentry() performs a SEEK_END before all writes,
721 # so we should be safe.
722 elif self._writinghandles:
723 if self._inline:
724 yield self._writinghandles[0]
725 else:
726 yield self._writinghandles[1]
727
728 # Otherwise open a new file handle.
729 else:
730 if self._inline:
731 func = self._indexfp
732 else:
733 func = self._datafp
734 with func() as fp:
735 yield fp
736
737 @contextlib.contextmanager
738 712 def _sidedatareadfp(self):
739 713 """file object suitable to read sidedata"""
740 714 if self._writinghandles:
@@ -807,7 +781,7 b' class revlog(object):'
807 781 def clearcaches(self):
808 782 self._revisioncache = None
809 783 self._chainbasecache.clear()
810 self._chunkcache = (0, b'')
784 self._segmentfile.clear_cache()
811 785 self._pcache = {}
812 786 self._nodemap_docket = None
813 787 self.index.clearcaches()
@@ -1629,85 +1603,6 b' class revlog(object):'
1629 1603 p1, p2 = self.parents(node)
1630 1604 return storageutil.hashrevisionsha1(text, p1, p2) != node
1631 1605
1632 def _cachesegment(self, offset, data):
1633 """Add a segment to the revlog cache.
1634
1635 Accepts an absolute offset and the data that is at that location.
1636 """
1637 o, d = self._chunkcache
1638 # try to add to existing cache
1639 if o + len(d) == offset and len(d) + len(data) < _chunksize:
1640 self._chunkcache = o, d + data
1641 else:
1642 self._chunkcache = offset, data
1643
1644 def _readsegment(self, offset, length, df=None):
1645 """Load a segment of raw data from the revlog.
1646
1647 Accepts an absolute offset, length to read, and an optional existing
1648 file handle to read from.
1649
1650 If an existing file handle is passed, it will be seeked and the
1651 original seek position will NOT be restored.
1652
1653 Returns a str or buffer of raw byte data.
1654
1655 Raises if the requested number of bytes could not be read.
1656 """
1657 # Cache data both forward and backward around the requested
1658 # data, in a fixed size window. This helps speed up operations
1659 # involving reading the revlog backwards.
1660 cachesize = self._chunkcachesize
1661 realoffset = offset & ~(cachesize - 1)
1662 reallength = (
1663 (offset + length + cachesize) & ~(cachesize - 1)
1664 ) - realoffset
1665 with self._datareadfp(df) as df:
1666 df.seek(realoffset)
1667 d = df.read(reallength)
1668
1669 self._cachesegment(realoffset, d)
1670 if offset != realoffset or reallength != length:
1671 startoffset = offset - realoffset
1672 if len(d) - startoffset < length:
1673 filename = self._indexfile if self._inline else self._datafile
1674 got = len(d) - startoffset
1675 m = PARTIAL_READ_MSG % (filename, length, offset, got)
1676 raise error.RevlogError(m)
1677 return util.buffer(d, startoffset, length)
1678
1679 if len(d) < length:
1680 filename = self._indexfile if self._inline else self._datafile
1681 got = len(d) - startoffset
1682 m = PARTIAL_READ_MSG % (filename, length, offset, got)
1683 raise error.RevlogError(m)
1684
1685 return d
1686
1687 def _getsegment(self, offset, length, df=None):
1688 """Obtain a segment of raw data from the revlog.
1689
1690 Accepts an absolute offset, length of bytes to obtain, and an
1691 optional file handle to the already-opened revlog. If the file
1692 handle is used, it's original seek position will not be preserved.
1693
1694 Requests for data may be returned from a cache.
1695
1696 Returns a str or a buffer instance of raw byte data.
1697 """
1698 o, d = self._chunkcache
1699 l = len(d)
1700
1701 # is it in the cache?
1702 cachestart = offset - o
1703 cacheend = cachestart + length
1704 if cachestart >= 0 and cacheend <= l:
1705 if cachestart == 0 and cacheend == l:
1706 return d # avoid a copy
1707 return util.buffer(d, cachestart, cacheend - cachestart)
1708
1709 return self._readsegment(offset, length, df=df)
1710
1711 1606 def _getsegmentforrevs(self, startrev, endrev, df=None):
1712 1607 """Obtain a segment of raw data corresponding to a range of revisions.
1713 1608
@@ -1740,7 +1635,7 b' class revlog(object):'
1740 1635 end += (endrev + 1) * self.index.entry_size
1741 1636 length = end - start
1742 1637
1743 return start, self._getsegment(start, length, df=df)
1638 return start, self._segmentfile.read_chunk(start, length, df)
1744 1639
1745 1640 def _chunk(self, rev, df=None):
1746 1641 """Obtain a single decompressed chunk for a revision.
@@ -1832,10 +1727,6 b' class revlog(object):'
1832 1727
1833 1728 return l
1834 1729
1835 def _chunkclear(self):
1836 """Clear the raw chunk cache."""
1837 self._chunkcache = (0, b'')
1838
1839 1730 def deltaparent(self, rev):
1840 1731 """return deltaparent of the given revision"""
1841 1732 base = self.index[rev][3]
@@ -2043,7 +1934,12 b' class revlog(object):'
2043 1934 length = sidedata_size
2044 1935 offset = sidedata_offset
2045 1936 got = len(comp_segment)
2046 m = PARTIAL_READ_MSG % (filename, length, offset, got)
1937 m = randomaccessfile.PARTIAL_READ_MSG % (
1938 filename,
1939 length,
1940 offset,
1941 got,
1942 )
2047 1943 raise error.RevlogError(m)
2048 1944
2049 1945 comp = self.index[rev][11]
@@ -2136,6 +2032,7 b' class revlog(object):'
2136 2032 # We can't use the cached file handle after close(). So prevent
2137 2033 # its usage.
2138 2034 self._writinghandles = None
2035 self._segmentfile.writing_handle = None
2139 2036
2140 2037 new_dfh = self._datafp(b'w+')
2141 2038 new_dfh.truncate(0) # drop any potentially existing data
@@ -2171,12 +2068,17 b' class revlog(object):'
2171 2068
2172 2069 tr.replace(self._indexfile, trindex * self.index.entry_size)
2173 2070 nodemaputil.setup_persistent_nodemap(tr, self)
2174 self._chunkclear()
2071 self._segmentfile = randomaccessfile.randomaccessfile(
2072 self.opener,
2073 self._datafile,
2074 self._chunkcachesize,
2075 )
2175 2076
2176 2077 if existing_handles:
2177 2078 # switched from inline to conventional reopen the index
2178 2079 ifh = self.__index_write_fp()
2179 2080 self._writinghandles = (ifh, new_dfh, None)
2081 self._segmentfile.writing_handle = new_dfh
2180 2082 new_dfh = None
2181 2083 finally:
2182 2084 if new_dfh is not None:
@@ -2235,11 +2137,13 b' class revlog(object):'
2235 2137 transaction.add(self._indexfile, isize)
2236 2138 # exposing all file handle for writing.
2237 2139 self._writinghandles = (ifh, dfh, sdfh)
2140 self._segmentfile.writing_handle = ifh if self._inline else dfh
2238 2141 yield
2239 2142 if self._docket is not None:
2240 2143 self._write_docket(transaction)
2241 2144 finally:
2242 2145 self._writinghandles = None
2146 self._segmentfile.writing_handle = None
2243 2147 if dfh is not None:
2244 2148 dfh.close()
2245 2149 if sdfh is not None:
@@ -2873,7 +2777,7 b' class revlog(object):'
2873 2777 # then reset internal state in memory to forget those revisions
2874 2778 self._revisioncache = None
2875 2779 self._chaininfocache = util.lrucachedict(500)
2876 self._chunkclear()
2780 self._segmentfile.clear_cache()
2877 2781
2878 2782 del self.index[rev:-1]
2879 2783
General Comments 0
You need to be logged in to leave comments. Login now