upstream/mercurial-mirror Files · mercurial/revlogutils/randomaccessfile.py

exchange: improve computation of relevant markers for large repos...

exchange: improve computation of relevant markers for large repos Compute the candidate nodes with relevant markers directly from keys of the predecessors/successors/children dictionaries of obsstore. This is faster than iterating over all nodes directly. This test could be further improved for repositories with relatively few markers compared to the repository size, but this is no longer a hot spot. With the current loop structure, the obshashrange use works as well as before as it passes lists with a single node. Adjust the interface by allowing revision lists as well as node lists. This helps cases that compute ancestors as it reduces the materialisation cost. Use this in _pushdiscoveryobsmarker and _getbundleobsmarkerpart. Improve the latter further by directly using ancestors(). Performance benchmarks show notable and welcome improvement to no-op push and pull (that would also apply to other push/pull). This applies to push and pull done without evolve. ### push/pull Benchmark parameter # bin-env-vars.hg.flavor = default # benchmark.variants.explicit-rev = none # benchmark.variants.protocol = ssh # benchmark.variants.revs = none ## benchmark.name = hg.command.pull # data-env-vars.name = mercurial-devel-2024-03-22-zstd-sparse-revlog before: 5.968537 seconds after: 5.668507 seconds (-5.03%, -0.30) # data-env-vars.name = tryton-devel-2024-03-22-zstd-sparse-revlog before: 1.446232 seconds after: 0.835553 seconds (-42.23%, -0.61) # data-env-vars.name = netbsd-src-draft-2024-09-19-zstd-sparse-revlog before: 5.777412 seconds after: 2.523454 seconds (-56.32%, -3.25) ## benchmark.name = hg.command.push # data-env-vars.name = mercurial-devel-2024-03-22-zstd-sparse-revlog before: 6.155501 seconds after: 5.885072 seconds (-4.39%, -0.27) # data-env-vars.name = tryton-devel-2024-03-22-zstd-sparse-revlog before: 1.491054 seconds after: 0.934882 seconds (-37.30%, -0.56) # data-env-vars.name = netbsd-src-draft-2024-09-19-zstd-sparse-revlog before: 5.902494 seconds after: 
2.957644 seconds (-49.89%, -2.94) There is no notable difference in these results when using the "rust" flavor instead of the "default". The performance impact on the same operations when using evolve was also tested and no impact was noted.

Matt Harbison - - Load All Authors

File last commit:

r52757:1c5810ce default


                r52789:8583d138

default

Download file

             randomaccessfile.py
        
                    232 lines
            
             | 6.9 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / mercurial / revlogutils / randomaccessfile.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      # Copyright Mercurial Contributors

      #

      # This software may be used and distributed according to the terms of the

      # GNU General Public License version 2 or any later version.

      from __future__ import annotations

      import contextlib

      from ..i18n import _

      from .. import (

          error,

          util,

      )

      # Appending to the cached chunk is only allowed while the combined size
      # stays below this limit; past it the cache is replaced instead.
      _MAX_CACHED_CHUNK_SIZE = 1 << 20  # 1 MiB

      # Template arguments, in order: file name, number of bytes expected,
      # requested offset, number of bytes actually obtained.
      PARTIAL_READ_MSG = _(
          b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
      )

      def _is_power_of_two(n):
          """Return True if ``n`` is a power of two (zero is rejected)"""
          # Clearing the lowest set bit of a power of two leaves no bit set.
          if n & (n - 1):
              return False
          return n != 0

      class appender:
          """the changelog index must be updated last on disk, so we use this class
          to delay writes to it

          The object exposes a virtual file made of the real file's content
          followed by the chunks stored in ``buf``. ``write()`` only appends to
          ``buf``; this class never writes to the real file.
          """

          def __init__(self, vfs, name, mode, buf):
              """Open ``name`` via ``vfs(name, mode)`` and layer ``buf`` after it.

              ``buf`` is a list of bytes chunks. It is kept by reference and is
              mutated in place by ``read()`` and ``write()``.
              """
              self.data = buf
              fp = vfs(name, mode)
              self.fp = fp
              # current position in the virtual file
              self.offset = fp.tell()
              # size of the real file; virtual offsets at or past it live in buf
              self.size = vfs.fstat(fp).st_size
              # end of the virtual file, advanced by every write()
              self._end = self.size

          def end(self):
              """Return the offset of the end of the virtual file"""
              return self._end

          def tell(self):
              """Return the current offset in the virtual file"""
              return self.offset

          def flush(self):
              """Do nothing; written data is only kept in memory"""
              pass

          @property
          def closed(self):
              """True if the underlying file object is closed"""
              return self.fp.closed

          def close(self):
              """Close the underlying file object"""
              self.fp.close()

          def seek(self, offset, whence=0):
              '''virtual file offset spans real file and data'''
              if whence == 0:
                  self.offset = offset
              elif whence == 1:
                  self.offset += offset
              elif whence == 2:
                  self.offset = self.end() + offset
              # the real file only needs to follow while we point inside of it
              if self.offset < self.size:
                  self.fp.seek(self.offset)

          def read(self, count=-1):
              '''only trick here is reads that span real file and data

              A negative ``count`` (the default) reads everything up to the end
              of the buffered data.
              '''
              ret = b""
              if self.offset < self.size:
                  s = self.fp.read(count)
                  ret = s
                  self.offset += len(s)
                  if count > 0:
                      count -= len(s)
              if count != 0:
                  doff = self.offset - self.size
                  # collapse the pending chunks into one so it can be sliced
                  self.data[:] = [b"".join(self.data)]
                  if count < 0:
                      # "read to the end": slicing up to ``doff + count`` here
                      # would drop the last byte, or return nothing at all
                      s = self.data[0][doff:]
                  else:
                      s = self.data[0][doff : doff + count]
                  self.offset += len(s)
                  ret += s
              return ret

          def write(self, s):
              """Append ``s`` to the in-memory data and advance the offsets"""
              self.data.append(bytes(s))
              self.offset += len(s)
              self._end += len(s)

          def __enter__(self):
              self.fp.__enter__()
              return self

          def __exit__(self, *args):
              return self.fp.__exit__(*args)

      class randomaccessfile:
          """Read arbitrary chunks of data from a file, with some caching"""

          def __init__(
              self,
              opener,
              filename,
              default_cached_chunk_size,
              initial_cache=None,
          ):
              # _read_and_update_cache() aligns its reads by masking with
              # (size - 1), which requires a power of two.
              assert _is_power_of_two(default_cached_chunk_size)

              # (offset from the start of the file, cached bytes)
              position, chunk = 0, b''
              if initial_cache:
                  position, chunk = initial_cache

              self.opener = opener
              self.filename = filename
              self.default_cached_chunk_size = default_cached_chunk_size
              self.writing_handle = None  # This is set from revlog.py
              self.reading_handle = None
              self._cached_chunk = chunk
              self._cached_chunk_position = position

          def clear_cache(self):
              """Forget the cached chunk"""
              self._cached_chunk_position, self._cached_chunk = 0, b''

          @property
          def is_open(self):
              """True if any file handle is being held

              Used for assert and debug in the python code"""
              nothing_held = (
                  self.reading_handle is None and self.writing_handle is None
              )
              return not nothing_held

          def _open(self, mode=b'r'):
              """Return a file object for ``self.filename`` from the opener"""
              opener = self.opener
              return opener(self.filename, mode=mode)

          @contextlib.contextmanager
          def _read_handle(self):
              """Yield a file object suitable for reading data"""
              # A handle actively used for writes is preferred when available.
              # Reads will seek that file, which carries some danger, but
              # revlog._writeentry performs a SEEK_END before all writes, so
              # we should be safe.
              held = self.writing_handle or self.reading_handle
              if held:
                  yield held
                  return
              # No handle is held: open one for the duration of the context.
              with self._open() as fp:
                  yield fp

          @contextlib.contextmanager
          def reading(self):
              """Context manager that keeps the file open for reading"""
              already_held = (
                  self.reading_handle is not None
                  or self.writing_handle is not None
              )
              if already_held or self.filename is None:
                  yield
                  return
              with self._open() as fp:
                  self.reading_handle = fp
                  try:
                      yield
                  finally:
                      self.reading_handle = None

          def read_chunk(self, offset, length):
              """Return ``length`` bytes found at absolute ``offset`` in the file.

              The cached chunk is used when it fully covers the request,
              otherwise the file is read and the cache updated.

              Returns a str or buffer of raw byte data.

              Raises if the requested number of bytes could not be read.
              """
              cached = self._cached_chunk
              first = self._cached_chunk_position
              last = first + len(cached)
              stop = offset + length
              if offset < first or last < stop:
                  # not (entirely) covered by the cache
                  return self._read_and_update_cache(offset, length)
              if offset == first and stop == last:
                  return cached  # avoid a copy
              return util.buffer(cached, offset - first, length)

          def _read_and_update_cache(self, offset, length):
              """Read from the file, refresh the cache and return the bytes"""
              # The read window is aligned on the default chunk size so data
              # is cached both before and after the requested range. This
              # helps speed up operations reading the revlog backwards.
              window = self.default_cached_chunk_size
              align = ~(window - 1)
              real_offset = offset & align
              real_length = ((offset + length + window) & align) - real_offset
              with self._read_handle() as file_obj:
                  file_obj.seek(real_offset)
                  data = file_obj.read(real_length)

              self._add_cached_chunk(real_offset, data)

              relative_offset = offset - real_offset
              got = len(data) - relative_offset
              if got < length:
                  details = (self.filename, length, offset, got)
                  raise error.RevlogError(PARTIAL_READ_MSG % details)

              if offset == real_offset and real_length == length:
                  return data
              return util.buffer(data, relative_offset, length)

          def _add_cached_chunk(self, offset, data):
              """Extend the cached chunk with ``data``, or replace it.

              ``offset`` is the absolute position of ``data`` in the file.
              """
              held = len(self._cached_chunk)
              follows_cache = self._cached_chunk_position + held == offset
              if follows_cache and held + len(data) < _MAX_CACHED_CHUNK_SIZE:
                  # add to existing cache
                  self._cached_chunk += data
                  return
              self._cached_chunk = data
              self._cached_chunk_position = offset

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				# Copyright Mercurial Contributors
				#
				# This software may be used and distributed according to the terms of the
				# GNU General Public License version 2 or any later version.

				from __future__ import annotations

				import contextlib

				from ..i18n import _
				from .. import (
				error,
				util,
				)


				# Appending to the cached chunk is only allowed while the combined size
				# stays below this limit; past it the cache is replaced instead.
				_MAX_CACHED_CHUNK_SIZE = 1 << 20  # 1 MiB

				# Template arguments, in order: file name, number of bytes expected,
				# requested offset, number of bytes actually obtained.
				PARTIAL_READ_MSG = _(
				    b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
				)


				def _is_power_of_two(n):
				    """Return True if ``n`` is a power of two (zero is rejected)"""
				    # Clearing the lowest set bit of a power of two leaves no bit set.
				    if n & (n - 1):
				        return False
				    return n != 0


				class appender:
				    """the changelog index must be updated last on disk, so we use this class
				    to delay writes to it

				    The object exposes a virtual file made of the real file's content
				    followed by the chunks stored in ``buf``. ``write()`` only appends to
				    ``buf``; this class never writes to the real file.
				    """

				    def __init__(self, vfs, name, mode, buf):
				        """Open ``name`` via ``vfs(name, mode)`` and layer ``buf`` after it.

				        ``buf`` is a list of bytes chunks. It is kept by reference and is
				        mutated in place by ``read()`` and ``write()``.
				        """
				        self.data = buf
				        fp = vfs(name, mode)
				        self.fp = fp
				        # current position in the virtual file
				        self.offset = fp.tell()
				        # size of the real file; virtual offsets at or past it live in buf
				        self.size = vfs.fstat(fp).st_size
				        # end of the virtual file, advanced by every write()
				        self._end = self.size

				    def end(self):
				        """Return the offset of the end of the virtual file"""
				        return self._end

				    def tell(self):
				        """Return the current offset in the virtual file"""
				        return self.offset

				    def flush(self):
				        """Do nothing; written data is only kept in memory"""
				        pass

				    @property
				    def closed(self):
				        """True if the underlying file object is closed"""
				        return self.fp.closed

				    def close(self):
				        """Close the underlying file object"""
				        self.fp.close()

				    def seek(self, offset, whence=0):
				        '''virtual file offset spans real file and data'''
				        if whence == 0:
				            self.offset = offset
				        elif whence == 1:
				            self.offset += offset
				        elif whence == 2:
				            self.offset = self.end() + offset
				        # the real file only needs to follow while we point inside of it
				        if self.offset < self.size:
				            self.fp.seek(self.offset)

				    def read(self, count=-1):
				        '''only trick here is reads that span real file and data

				        A negative ``count`` (the default) reads everything up to the end
				        of the buffered data.
				        '''
				        ret = b""
				        if self.offset < self.size:
				            s = self.fp.read(count)
				            ret = s
				            self.offset += len(s)
				            if count > 0:
				                count -= len(s)
				        if count != 0:
				            doff = self.offset - self.size
				            # collapse the pending chunks into one so it can be sliced
				            self.data[:] = [b"".join(self.data)]
				            if count < 0:
				                # "read to the end": slicing up to ``doff + count`` here
				                # would drop the last byte, or return nothing at all
				                s = self.data[0][doff:]
				            else:
				                s = self.data[0][doff : doff + count]
				            self.offset += len(s)
				            ret += s
				        return ret

				    def write(self, s):
				        """Append ``s`` to the in-memory data and advance the offsets"""
				        self.data.append(bytes(s))
				        self.offset += len(s)
				        self._end += len(s)

				    def __enter__(self):
				        self.fp.__enter__()
				        return self

				    def __exit__(self, *args):
				        return self.fp.__exit__(*args)


				class randomaccessfile:
				    """Read arbitrary chunks of data from a file, with some caching"""

				    def __init__(
				        self,
				        opener,
				        filename,
				        default_cached_chunk_size,
				        initial_cache=None,
				    ):
				        # _read_and_update_cache() aligns its reads by masking with
				        # (size - 1), which requires a power of two.
				        assert _is_power_of_two(default_cached_chunk_size)

				        # (offset from the start of the file, cached bytes)
				        position, chunk = 0, b''
				        if initial_cache:
				            position, chunk = initial_cache

				        self.opener = opener
				        self.filename = filename
				        self.default_cached_chunk_size = default_cached_chunk_size
				        self.writing_handle = None  # This is set from revlog.py
				        self.reading_handle = None
				        self._cached_chunk = chunk
				        self._cached_chunk_position = position

				    def clear_cache(self):
				        """Forget the cached chunk"""
				        self._cached_chunk_position, self._cached_chunk = 0, b''

				    @property
				    def is_open(self):
				        """True if any file handle is being held

				        Used for assert and debug in the python code"""
				        nothing_held = (
				            self.reading_handle is None and self.writing_handle is None
				        )
				        return not nothing_held

				    def _open(self, mode=b'r'):
				        """Return a file object for ``self.filename`` from the opener"""
				        opener = self.opener
				        return opener(self.filename, mode=mode)

				    @contextlib.contextmanager
				    def _read_handle(self):
				        """Yield a file object suitable for reading data"""
				        # A handle actively used for writes is preferred when available.
				        # Reads will seek that file, which carries some danger, but
				        # revlog._writeentry performs a SEEK_END before all writes, so
				        # we should be safe.
				        held = self.writing_handle or self.reading_handle
				        if held:
				            yield held
				            return
				        # No handle is held: open one for the duration of the context.
				        with self._open() as fp:
				            yield fp

				    @contextlib.contextmanager
				    def reading(self):
				        """Context manager that keeps the file open for reading"""
				        already_held = (
				            self.reading_handle is not None
				            or self.writing_handle is not None
				        )
				        if already_held or self.filename is None:
				            yield
				            return
				        with self._open() as fp:
				            self.reading_handle = fp
				            try:
				                yield
				            finally:
				                self.reading_handle = None

				    def read_chunk(self, offset, length):
				        """Return ``length`` bytes found at absolute ``offset`` in the file.

				        The cached chunk is used when it fully covers the request,
				        otherwise the file is read and the cache updated.

				        Returns a str or buffer of raw byte data.

				        Raises if the requested number of bytes could not be read.
				        """
				        cached = self._cached_chunk
				        first = self._cached_chunk_position
				        last = first + len(cached)
				        stop = offset + length
				        if offset < first or last < stop:
				            # not (entirely) covered by the cache
				            return self._read_and_update_cache(offset, length)
				        if offset == first and stop == last:
				            return cached  # avoid a copy
				        return util.buffer(cached, offset - first, length)

				    def _read_and_update_cache(self, offset, length):
				        """Read from the file, refresh the cache and return the bytes"""
				        # The read window is aligned on the default chunk size so data
				        # is cached both before and after the requested range. This
				        # helps speed up operations reading the revlog backwards.
				        window = self.default_cached_chunk_size
				        align = ~(window - 1)
				        real_offset = offset & align
				        real_length = ((offset + length + window) & align) - real_offset
				        with self._read_handle() as file_obj:
				            file_obj.seek(real_offset)
				            data = file_obj.read(real_length)

				        self._add_cached_chunk(real_offset, data)

				        relative_offset = offset - real_offset
				        got = len(data) - relative_offset
				        if got < length:
				            details = (self.filename, length, offset, got)
				            raise error.RevlogError(PARTIAL_READ_MSG % details)

				        if offset == real_offset and real_length == length:
				            return data
				        return util.buffer(data, relative_offset, length)

				    def _add_cached_chunk(self, offset, data):
				        """Extend the cached chunk with ``data``, or replace it.

				        ``offset`` is the absolute position of ``data`` in the file.
				        """
				        held = len(self._cached_chunk)
				        follows_cache = self._cached_chunk_position + held == offset
				        if follows_cache and held + len(data) < _MAX_CACHED_CHUNK_SIZE:
				            # add to existing cache
				            self._cached_chunk += data
				            return
				        self._cached_chunk = data
				        self._cached_chunk_position = offset