# HG changeset patch # User Pierre-Yves David # Date 2023-10-25 00:13:18 # Node ID a82704902db88245d19043f7944bace0fffdc4bc # Parent de6a8cc24de398e17e9d9088b697619520cfc14a revlog: move the compression/decompression logic on the inner object This is a necessary step before being able to move more of the logic around restoring a revision's content there. For now, we do a simple patch for the perf extension logic; when the implementation of the inner object changes, we will likely need some evolution of the API. However, this is true of many things in the perf extension, so we will revisit this later. diff --git a/contrib/perf.py b/contrib/perf.py --- a/contrib/perf.py +++ b/contrib/perf.py @@ -3833,14 +3833,16 @@ def perfrevlogchunks(ui, repo, file_=Non def docompress(compressor): rl.clearcaches() + compressor_holder = getattr(rl, '_inner', rl) + try: # Swap in the requested compression engine. - oldcompressor = rl._compressor - rl._compressor = compressor + oldcompressor = compressor_holder._compressor + compressor_holder._compressor = compressor for chunk in chunks[0]: rl.compress(chunk) finally: - rl._compressor = oldcompressor + compressor_holder._compressor = oldcompressor benches = [ (lambda: doread(), b'read'), diff --git a/mercurial/revlog.py b/mercurial/revlog.py --- a/mercurial/revlog.py +++ b/mercurial/revlog.py @@ -353,7 +353,9 @@ class _InnerRevlog: sidedata_file, inline, data_config, + feature_config, chunk_cache, + default_compression_header, ): self.opener = opener self.index = index @@ -363,6 +365,9 @@ class _InnerRevlog: self.sidedata_file = sidedata_file self.inline = inline self.data_config = data_config + self.feature_config = feature_config + + self._default_compression_header = default_compression_header # index @@ -381,6 +386,9 @@ class _InnerRevlog: self.data_config.chunk_cache_size, ) + # revlog header -> revlog compressor + self._decompressors = {} + @property def index_file(self): return self.__index_file @@ -405,6 +413,103 @@ class _InnerRevlog: 
"""the end of the data chunk for this revision""" return self.start(rev) + self.length(rev) + @util.propertycache + def _compressor(self): + engine = util.compengines[self.feature_config.compression_engine] + return engine.revlogcompressor( + self.feature_config.compression_engine_options + ) + + @util.propertycache + def _decompressor(self): + """the default decompressor""" + if self._default_compression_header is None: + return None + t = self._default_compression_header + c = self._get_decompressor(t) + return c.decompress + + def _get_decompressor(self, t): + try: + compressor = self._decompressors[t] + except KeyError: + try: + engine = util.compengines.forrevlogheader(t) + compressor = engine.revlogcompressor( + self.feature_config.compression_engine_options + ) + self._decompressors[t] = compressor + except KeyError: + raise error.RevlogError( + _(b'unknown compression type %s') % binascii.hexlify(t) + ) + return compressor + + def compress(self, data): + """Generate a possibly-compressed representation of data.""" + if not data: + return b'', data + + compressed = self._compressor.compress(data) + + if compressed: + # The revlog compressor added the header in the returned data. + return b'', compressed + + if data[0:1] == b'\0': + return b'', data + return b'u', data + + def decompress(self, data): + """Decompress a revlog chunk. + + The chunk is expected to begin with a header identifying the + format type so it can be routed to an appropriate decompressor. + """ + if not data: + return data + + # Revlogs are read much more frequently than they are written and many + # chunks only take microseconds to decompress, so performance is + # important here. + # + # We can make a few assumptions about revlogs: + # + # 1) the majority of chunks will be compressed (as opposed to inline + # raw data). + # 2) decompressing *any* data will likely by at least 10x slower than + # returning raw inline data. 
+ # 3) we want to prioritize common and officially supported compression + # engines + # + # It follows that we want to optimize for "decompress compressed data + # when encoded with common and officially supported compression engines" + # case over "raw data" and "data encoded by less common or non-official + # compression engines." That is why we have the inline lookup first + # followed by the compengines lookup. + # + # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib + # compressed chunks. And this matters for changelog and manifest reads. + t = data[0:1] + + if t == b'x': + try: + return _zlibdecompress(data) + except zlib.error as e: + raise error.RevlogError( + _(b'revlog decompress error: %s') + % stringutil.forcebytestr(e) + ) + # '\0' is more common than 'u' so it goes first. + elif t == b'\0': + return data + elif t == b'u': + return util.buffer(data, 1) + + compressor = self._get_decompressor(t) + + return compressor.decompress(data) + @contextlib.contextmanager def reading(self): """Context manager that keeps data and sidedata files open for reading""" @@ -1284,12 +1389,15 @@ class revlog: self.index = index # revnum -> (chain-length, sum-delta-length) self._chaininfocache = util.lrucachedict(500) - # revlog header -> revlog compressor - self._decompressors = {} return chunkcache def _load_inner(self, chunk_cache): + if self._docket is None: + default_compression_header = None + else: + default_compression_header = self._docket.default_compression_header + self._inner = _InnerRevlog( opener=self.opener, index=self.index, @@ -1298,7 +1406,9 @@ class revlog: sidedata_file=self._sidedatafile, inline=self._inline, data_config=self.data_config, + feature_config=self.feature_config, chunk_cache=chunk_cache, + default_compression_header=default_compression_header, ) def get_revlog(self): @@ -1319,38 +1429,6 @@ class revlog: else: return self.radix - def _get_decompressor(self, t): - try: - compressor = self._decompressors[t] - except 
KeyError: - try: - engine = util.compengines.forrevlogheader(t) - compressor = engine.revlogcompressor( - self.feature_config.compression_engine_options - ) - self._decompressors[t] = compressor - except KeyError: - raise error.RevlogError( - _(b'unknown compression type %s') % binascii.hexlify(t) - ) - return compressor - - @util.propertycache - def _compressor(self): - engine = util.compengines[self.feature_config.compression_engine] - return engine.revlogcompressor( - self.feature_config.compression_engine_options - ) - - @util.propertycache - def _decompressor(self): - """the default decompressor""" - if self._docket is None: - return None - t = self._docket.default_compression_header - c = self._get_decompressor(t) - return c.decompress - def _datafp(self, mode=b'r'): """file object for the revlog's data file""" return self.opener(self._datafile, mode=mode) @@ -2272,9 +2350,9 @@ class revlog: if compression_mode == COMP_MODE_PLAIN: return data elif compression_mode == COMP_MODE_DEFAULT: - return self._decompressor(data) + return self._inner._decompressor(data) elif compression_mode == COMP_MODE_INLINE: - return self.decompress(data) + return self._inner.decompress(data) else: msg = b'unknown compression mode %d' msg %= compression_mode @@ -2328,9 +2406,9 @@ class revlog: # 2G on Windows return [self._chunk(rev) for rev in revschunk] - decomp = self.decompress + decomp = self._inner.decompress # self._decompressor might be None, but will not be used in that case - def_decomp = self._decompressor + def_decomp = self._inner._decompressor for rev in revschunk: chunkstart = start(rev) if inline: @@ -2544,9 +2622,9 @@ class revlog: if comp == COMP_MODE_PLAIN: segment = comp_segment elif comp == COMP_MODE_DEFAULT: - segment = self._decompressor(comp_segment) + segment = self._inner._decompressor(comp_segment) elif comp == COMP_MODE_INLINE: - segment = self.decompress(comp_segment) + segment = self._inner.decompress(comp_segment) else: msg = b'unknown compression mode 
%d' msg %= comp @@ -2842,69 +2920,10 @@ class revlog: ) def compress(self, data): - """Generate a possibly-compressed representation of data.""" - if not data: - return b'', data - - compressed = self._compressor.compress(data) - - if compressed: - # The revlog compressor added the header in the returned data. - return b'', compressed - - if data[0:1] == b'\0': - return b'', data - return b'u', data + return self._inner.compress(data) def decompress(self, data): - """Decompress a revlog chunk. - - The chunk is expected to begin with a header identifying the - format type so it can be routed to an appropriate decompressor. - """ - if not data: - return data - - # Revlogs are read much more frequently than they are written and many - # chunks only take microseconds to decompress, so performance is - # important here. - # - # We can make a few assumptions about revlogs: - # - # 1) the majority of chunks will be compressed (as opposed to inline - # raw data). - # 2) decompressing *any* data will likely by at least 10x slower than - # returning raw inline data. - # 3) we want to prioritize common and officially supported compression - # engines - # - # It follows that we want to optimize for "decompress compressed data - # when encoded with common and officially supported compression engines" - # case over "raw data" and "data encoded by less common or non-official - # compression engines." That is why we have the inline lookup first - # followed by the compengines lookup. - # - # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib - # compressed chunks. And this matters for changelog and manifest reads. - t = data[0:1] - - if t == b'x': - try: - return _zlibdecompress(data) - except zlib.error as e: - raise error.RevlogError( - _(b'revlog decompress error: %s') - % stringutil.forcebytestr(e) - ) - # '\0' is more common than 'u' so it goes first. 
- elif t == b'\0': - return data - elif t == b'u': - return util.buffer(data, 1) - - compressor = self._get_decompressor(t) - - return compressor.decompress(data) + return self._inner.decompress(data) def _addrevision( self, @@ -3029,7 +3048,7 @@ class revlog: sidedata_compression_mode = COMP_MODE_PLAIN serialized_sidedata = sidedatautil.serialize_sidedata(sidedata) sidedata_offset = self._docket.sidedata_end - h, comp_sidedata = self.compress(serialized_sidedata) + h, comp_sidedata = self._inner.compress(serialized_sidedata) if ( h != b'u' and comp_sidedata[0:1] != b'\0' @@ -3876,7 +3895,7 @@ class revlog: sidedata_compression_mode = COMP_MODE_INLINE if serialized_sidedata and self.feature_config.has_side_data: sidedata_compression_mode = COMP_MODE_PLAIN - h, comp_sidedata = self.compress(serialized_sidedata) + h, comp_sidedata = self._inner.compress(serialized_sidedata) if ( h != b'u' and comp_sidedata[0] != b'\0' diff --git a/mercurial/revlogutils/deltas.py b/mercurial/revlogutils/deltas.py --- a/mercurial/revlogutils/deltas.py +++ b/mercurial/revlogutils/deltas.py @@ -1205,7 +1205,7 @@ class deltacomputer: msg = b"DBG-DELTAS-SEARCH: DISCARDED (prev size)\n" self._write_debug(msg) return None - header, data = revlog.compress(delta) + header, data = revlog._inner.compress(delta) deltalen = len(header) + len(data) offset = revlog.end(len(revlog) - 1) dist = deltalen + offset - revlog.start(chainbase) @@ -1226,7 +1226,7 @@ class deltacomputer: def _fullsnapshotinfo(self, revinfo, curr): rawtext = self.buildtext(revinfo) - data = self.revlog.compress(rawtext) + data = self.revlog._inner.compress(rawtext) compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0]) deltabase = chainbase = curr snapshotdepth = 0