rhodecode-vcsserver Files · vcsserver/lib/archive_cache/backends/base.py

feat(archive-cache): synced with CE codebase

super-admin - - Load All Authors

File last commit:

r1251:f8e66197 default


                r1251:f8e66197

default

Download file

             base.py
        
                    348 lines
            
             | 11.2 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / vcsserver / lib / archive_cache / backends / base.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        super-admin
    
feat(archive-cache): synced with CE codebase

              r1251
            
      # Copyright (C) 2015-2024 RhodeCode GmbH

      #

      # This program is free software: you can redistribute it and/or modify

      # it under the terms of the GNU Affero General Public License, version 3

      # (only), as published by the Free Software Foundation.

      #

      # This program is distributed in the hope that it will be useful,

      # but WITHOUT ANY WARRANTY; without even the implied warranty of

      # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

      # GNU General Public License for more details.

      #

      # You should have received a copy of the GNU Affero General Public License

      # along with this program.  If not, see <http://www.gnu.org/licenses/>.

      #

      # This program is dual-licensed. If you wish to learn more about the

      # RhodeCode Enterprise Edition, including its added features, Support services,

      # and proprietary license terms, please see https://rhodecode.com/licenses/

      import os

      import functools

      import logging

      import typing

      import time

      import zlib

      from ...ext_json import json

      from ..utils import StatsDB, NOT_GIVEN, ShardFileReader, EVICTION_POLICY, format_size

      from ..lock import GenerationLock

      # Module-level logger, named after this module via ``__name__``.
      log = logging.getLogger(__name__)

      class BaseShard:
          """Base building block for one shard of the archive cache.

          Every cached archive is kept as two objects on ``self.fs``: the archive
          payload and a JSON key file carrying its metadata. A key counts as cached
          when its key file exists (see ``__contains__``).

          Subclasses are expected to set ``fs`` and ``storage_type`` and to implement
          ``_write_file``, ``_get_keyfile`` and ``random_filename``.
          """

          # name of the attribute that holds this shard's storage location,
          # looked up by the ``storage_medium`` property
          storage_type: str = ''
          # filesystem-like object; open(), exists() and rm() are called on it below
          fs = None

          @classmethod
          def hash(cls, key):
              """Compute portable hash for `key`.

              :param key: key to hash, a string that is UTF-8 encoded before hashing
              :return: adler32 checksum of the key masked to an unsigned 32-bit value
              """
              mask = 0xFFFFFFFF
              return zlib.adler32(key.encode('utf-8')) & mask  # noqa

          def _write_file(self, full_path, read_iterator, mode):
              """Write the chunks of `read_iterator` to `full_path`; must be overridden.

              ``_store`` unpacks the result as ``(size, sha256)``.
              """
              raise NotImplementedError

          def _get_keyfile(self, key):
              """Map `key` to its key file; must be overridden.

              Callers unpack the result as ``(key_file, key_file_path)``.
              """
              raise NotImplementedError

          def random_filename(self):
              """Pick a storage name for a new archive; must be overridden.

              ``_store`` unpacks the result as ``(filename, full_path)``.
              """
              raise NotImplementedError

          def _store(self, key, value_reader, metadata, mode):
              """Write an archive and then its key file.

              :param key: name the archive is stored under, e.g. my-archive.zip
              :param value_reader: object with ``read(size)``; it is consumed in
                  4 MiB chunks until ``read`` returns ``b''``
              :param metadata: optional dict merged over the default metadata;
                  its keys win over the defaults, except ``size`` and ``sha256``
                  which are always taken from the write result
              :param mode: passed through unchanged to ``_write_file``
              :return: tuple ``(key, filename, size, metadata_dict)``
              """
              (filename,  # hash-name
               full_path  # full-path/hash-name
               ) = self.random_filename()

              key_file, key_file_path = self._get_keyfile(key)

              # STORE METADATA
              _metadata = {
                  "version": "v1",

                  "key_file": key_file,  # this is the .key.json file storing meta
                  "key_file_path": key_file_path,  # full path to key_file
                  "archive_key": key,  # original name we stored archive under, e.g my-archive.zip
                  "archive_filename": filename,  # the actual filename we stored that file under
                  "archive_full_path": full_path,

                  "store_time": time.time(),
                  "access_count": 0,
                  "access_time": 0,

                  "size": 0
              }
              if metadata:
                  _metadata.update(metadata)

              # iter(callable, sentinel): keep reading 4 MiB chunks until EOF (b'')
              read_iterator = iter(functools.partial(value_reader.read, 2**22), b'')
              size, sha256 = self._write_file(full_path, read_iterator, mode)
              _metadata['size'] = size
              _metadata['sha256'] = sha256

              # after archive is finished, we create a key to save the presence of the binary file
              # NOTE(review): the file is opened in binary mode, so the project's
              # json.dumps presumably returns bytes - confirm against ext_json
              with self.fs.open(key_file_path, 'wb') as f:
                  f.write(json.dumps(_metadata))

              return key, filename, size, _metadata

          def _fetch(self, key, retry, retry_attempts, retry_backoff):
              """Open the archive stored under `key` and record the access.

              :param key: name the archive was stored under
              :param retry: whether to wait for a missing key; NOT_GIVEN means False
              :param retry_attempts: number of re-checks; NOT_GIVEN means 0
              :param retry_backoff: seconds to sleep between re-checks
              :return: tuple ``(ShardFileReader, metadata_dict)``; the metadata
                  already carries the updated ``access_count`` / ``access_time``
              :raises KeyError: when the key file does not exist after all retries
              """
              if retry is NOT_GIVEN:
                  retry = False
              if retry_attempts is NOT_GIVEN:
                  retry_attempts = 0

              if retry and retry_attempts > 0:
                  for _attempt in range(retry_attempts):
                      if key in self:
                          break
                      # we didn't find the key, wait retry_backoff N seconds, and re-check
                      time.sleep(retry_backoff)

              if key not in self:
                  # no exception is being handled at this point, so use log.error;
                  # log.exception would attach an empty "NoneType: None" traceback
                  log.error('requested key=%s not found in %s retry=%s, attempts=%s',
                            key, self, retry, retry_attempts)
                  raise KeyError(key)

              key_file, key_file_path = self._get_keyfile(key)
              with self.fs.open(key_file_path, 'rb') as f:
                  metadata = json.loads(f.read())

              archive_path = metadata['archive_full_path']

              try:
                  return ShardFileReader(self.fs.open(archive_path, 'rb')), metadata
              finally:
                  # runs after the reader was built (or the open failed), so the
                  # access snapshot below is written to the key file either way
                  # update usage stats, count and accessed
                  metadata["access_count"] = metadata.get("access_count", 0) + 1
                  metadata["access_time"] = time.time()
                  log.debug('Updated %s with access snapshot, access_count=%s access_time=%s',
                            key_file, metadata['access_count'], metadata['access_time'])
                  with self.fs.open(key_file_path, 'wb') as f:
                      f.write(json.dumps(metadata))

          def _remove(self, key):
              """Delete the archive stored under `key` together with its key file.

              :param key: name the archive was stored under
              :return: always 1
              :raises KeyError: when the key file does not exist
              """
              if key not in self:
                  # no exception is being handled at this point, so use log.error;
                  # log.exception would attach an empty "NoneType: None" traceback
                  log.error('requested key=%s not found in %s', key, self)
                  raise KeyError(key)

              key_file, key_file_path = self._get_keyfile(key)
              with self.fs.open(key_file_path, 'rb') as f:
                  metadata = json.loads(f.read())

              archive_path = metadata['archive_full_path']
              # the archive goes first, the key file (the presence marker) last
              self.fs.rm(archive_path)
              self.fs.rm(key_file_path)
              return 1

          @property
          def storage_medium(self):
              """Value of the attribute whose name is given by ``storage_type``."""
              return getattr(self, self.storage_type)

          @property
          def key_suffix(self):
              """Suffix used to recognise key files."""
              return 'key.json'

          def __contains__(self, key):
              """Return `True` if `key` matching item is found in cache.

              :param key: key matching item
              :return: True if the key file for `key` exists on ``self.fs``
              """
              key_file, key_file_path = self._get_keyfile(key)
              return self.fs.exists(key_file_path)

      class BaseCache:
          """Base class of a sharded archive cache.

          Each operation picks a shard with ``_get_shard(key)`` and delegates to
          it. Subclasses must implement ``_get_shard`` and ``_get_size``, populate
          ``_shards``, and provide ``_eviction_policy`` and ``_cache_size_limit``,
          which ``evict`` reads but this class does not define.
          """

          _locking_url: str = ''
          _storage_path: str = ''
          # class-level dict: shared by every instance that does not rebind it
          _config = {}
          # defaults used by ``fetch`` when the caller does not pass them
          retry = False
          retry_attempts = 0
          retry_backoff = 1
          _shards = tuple()

          def __contains__(self, key):
              """Return `True` if `key` matching item is found in cache.

              :param key: key matching item
              :return: True if key matching item
              """
              return self.has_key(key)

          def __repr__(self):
              return f'<{self.__class__.__name__}(storage={self._storage_path})>'

          @classmethod
          def gb_to_bytes(cls, gb):
              """Convert gibibytes (1024 ** 3 bytes each) to bytes."""
              return gb * (1024 ** 3)

          @property
          def storage_path(self):
              """Read-only view of ``_storage_path``."""
              return self._storage_path

          @classmethod
          def get_stats_db(cls):
              """Return a new ``StatsDB`` instance, used by ``evict``."""
              return StatsDB()

          def get_conf(self, key, pop=False):
              """Read a value from ``_config``.

              :param key: configuration key to read
              :param pop: when True the key is also deleted from ``_config``
              :return: the configured value
              :raises ValueError: when `key` is not present in ``_config``
              """
              if key not in self._config:
                  raise ValueError(f"No configuration key '{key}', please make sure it exists in archive_cache config")
              val = self._config[key]
              if pop:
                  del self._config[key]
              return val

          def _get_shard(self, key):
              """Return the shard responsible for `key`; must be overridden."""
              raise NotImplementedError

          def _get_size(self, shard, archive_path):
              """Return the size of `archive_path` on `shard`; must be overridden.

              ``evict`` uses it when a key file carries no usable ``size``.
              """
              raise NotImplementedError

          def store(self, key, value_reader, metadata=None):
              """Store an archive by delegating to ``shard.store`` of the key's shard."""
              shard = self._get_shard(key)
              return shard.store(key, value_reader, metadata)

          def fetch(self, key, retry=NOT_GIVEN, retry_attempts=NOT_GIVEN) -> tuple[typing.BinaryIO, dict]:
              """
              Return file handle corresponding to `key` from specific shard cache.

              :param key: name the archive was stored under
              :param retry: overrides ``self.retry`` when given
              :param retry_attempts: overrides ``self.retry_attempts`` when given
              :return: whatever ``shard.fetch`` returns for the key's shard
              """
              if retry is NOT_GIVEN:
                  retry = self.retry
              if retry_attempts is NOT_GIVEN:
                  retry_attempts = self.retry_attempts
              retry_backoff = self.retry_backoff

              shard = self._get_shard(key)
              return shard.fetch(key, retry=retry, retry_attempts=retry_attempts, retry_backoff=retry_backoff)

          def remove(self, key):
              """Remove an archive by delegating to ``shard.remove`` of the key's shard."""
              shard = self._get_shard(key)
              return shard.remove(key)

          def has_key(self, archive_key):
              """Return `True` if `key` matching item is found in cache.

              :param archive_key: key for item, this is a unique archive name we want to store data under. e.g my-archive-svn.zip
              :return: True if key is found
              """
              shard = self._get_shard(archive_key)
              return archive_key in shard

          def iter_keys(self):
              """Yield ``(shard, key_file_name)`` for every key file in every shard.

              The second item is the bare name reported by ``fs.walk``; it is not
              joined with the directory it was found in. Shards whose storage
              location does not exist are skipped.
              """
              for shard in self._shards:
                  if shard.fs.exists(shard.storage_medium):
                      for _path, _dirs, _files in shard.fs.walk(shard.storage_medium):
                          for key_file_path in _files:
                              if key_file_path.endswith(shard.key_suffix):
                                  yield shard, key_file_path

          def get_lock(self, lock_key):
              """Return a ``GenerationLock`` for `lock_key` bound to ``_locking_url``."""
              return GenerationLock(lock_key, self._locking_url)

          def evict(self, policy=None, size_limit=None) -> int:
              """
              Remove old items until the cache fits into the size limit.

              explanation of this algo:
              iterate over each shard, then for each shard iterate over the .key files
              read the key files metadata stored. This gives us a full list of keys, cached_archived, their size and
              access data, time creation, and access counts.

              Store that into a memory DB so we can run different sorting strategies easily.
              Summing the size is a sum sql query.

              Then we run a sorting strategy based on eviction policy.
              We iterate over sorted keys, and remove them one by one until the
              remaining total size is within the limit.

              :param policy: key into ``EVICTION_POLICY``; a falsy value falls back
                  to ``self._eviction_policy``
              :param size_limit: target size in bytes; a falsy value (including 0)
                  falls back to ``self._cache_size_limit``
              :return: number of removed archives
              """

              policy = policy or self._eviction_policy
              size_limit = size_limit or self._cache_size_limit

              select_policy = EVICTION_POLICY[policy]['evict']

              log.debug('Running eviction policy \'%s\', and checking for size limit: %s',
                        policy, format_size(size_limit))

              if select_policy is None:
                  # this policy defines no eviction query, nothing to do
                  return 0

              db = self.get_stats_db()

              data = []
              for cnt, (shard, key_file) in enumerate(self.iter_keys(), start=1):
                  key_file_path = os.path.join(shard.storage_medium, key_file)
                  with shard.fs.open(key_file_path, 'rb') as f:
                      metadata = json.loads(f.read())

                  archive_key = metadata['archive_key']
                  archive_path = metadata['archive_full_path']

                  size = metadata.get('size')
                  if not size:
                      # in case we don't have size re-calc it...
                      size = self._get_size(shard, archive_path)

                  data.append([
                      cnt,
                      key_file,
                      key_file_path,
                      archive_key,
                      archive_path,
                      metadata.get('store_time', 0),
                      metadata.get('access_time', 0),
                      metadata.get('access_count', 0),
                      size,
                  ])

              # Insert bulk data using executemany
              db.bulk_insert(data)

              total_size = db.get_total_size()
              log.debug('Analyzed %s keys, occupying: %s, running eviction to match %s',
                        len(data), format_size(total_size), format_size(size_limit))

              removed_items = 0
              removed_size = 0
              for key_file, archive_key, size in db.get_sorted_keys(select_policy):
                  # check the limit BEFORE touching the next item: stopping only
                  # after a simulated removal would skip the last removal that is
                  # needed and leave the cache above the limit
                  if total_size <= size_limit:
                      # we obtained what we wanted...
                      break

                  self.remove(archive_key)
                  total_size -= size
                  removed_items += 1
                  removed_size += size

              log.debug('Removed %s cache archives, and reduced size by: %s',
                        removed_items, format_size(removed_size))
              return removed_items

          def get_statistics(self):
              """Count the cached archives and sum their recorded sizes.

              :return: tuple ``(total_files, total_size, meta)``; ``meta`` is
                  always an empty dict here
              :raises KeyError: when a key file has no ``size`` entry
              """
              total_files = 0
              total_size = 0
              meta = {}

              for shard, key_file in self.iter_keys():
                  json_key = f"{shard.storage_medium}/{key_file}"
                  with shard.fs.open(json_key, 'rb') as f:
                      total_files += 1
                      metadata = json.loads(f.read())
                      total_size += metadata['size']

              return total_files, total_size, meta

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

super-admin feat(archive-cache): synced with CE codebase	r1251	# Copyright (C) 2015-2024 RhodeCode GmbH
		#
		# This program is free software: you can redistribute it and/or modify
		# it under the terms of the GNU Affero General Public License, version 3
		# (only), as published by the Free Software Foundation.
		#
		# This program is distributed in the hope that it will be useful,
		# but WITHOUT ANY WARRANTY; without even the implied warranty of
		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		# GNU General Public License for more details.
		#
		# You should have received a copy of the GNU Affero General Public License
		# along with this program. If not, see <http://www.gnu.org/licenses/>.
		#
		# This program is dual-licensed. If you wish to learn more about the
		# RhodeCode Enterprise Edition, including its added features, Support services,
		# and proprietary license terms, please see https://rhodecode.com/licenses/

		import os
		import functools
		import logging
		import typing
		import time
		import zlib

		from ...ext_json import json
		from ..utils import StatsDB, NOT_GIVEN, ShardFileReader, EVICTION_POLICY, format_size
		from ..lock import GenerationLock

		# Module-level logger, named after this module via ``__name__``.
		log = logging.getLogger(__name__)


		class BaseShard:
		    """Base building block for one shard of the archive cache.

		    Every cached archive is kept as two objects on ``self.fs``: the archive
		    payload and a JSON key file carrying its metadata. A key counts as cached
		    when its key file exists (see ``__contains__``).

		    Subclasses are expected to set ``fs`` and ``storage_type`` and to implement
		    ``_write_file``, ``_get_keyfile`` and ``random_filename``.
		    """

		    # name of the attribute that holds this shard's storage location,
		    # looked up by the ``storage_medium`` property
		    storage_type: str = ''
		    # filesystem-like object; open(), exists() and rm() are called on it below
		    fs = None

		    @classmethod
		    def hash(cls, key):
		        """Compute portable hash for `key`.

		        :param key: key to hash, a string that is UTF-8 encoded before hashing
		        :return: adler32 checksum of the key masked to an unsigned 32-bit value
		        """
		        mask = 0xFFFFFFFF
		        return zlib.adler32(key.encode('utf-8')) & mask  # noqa

		    def _write_file(self, full_path, read_iterator, mode):
		        """Write the chunks of `read_iterator` to `full_path`; must be overridden.

		        ``_store`` unpacks the result as ``(size, sha256)``.
		        """
		        raise NotImplementedError

		    def _get_keyfile(self, key):
		        """Map `key` to its key file; must be overridden.

		        Callers unpack the result as ``(key_file, key_file_path)``.
		        """
		        raise NotImplementedError

		    def random_filename(self):
		        """Pick a storage name for a new archive; must be overridden.

		        ``_store`` unpacks the result as ``(filename, full_path)``.
		        """
		        raise NotImplementedError

		    def _store(self, key, value_reader, metadata, mode):
		        """Write an archive and then its key file.

		        :param key: name the archive is stored under, e.g. my-archive.zip
		        :param value_reader: object with ``read(size)``; it is consumed in
		            4 MiB chunks until ``read`` returns ``b''``
		        :param metadata: optional dict merged over the default metadata;
		            its keys win over the defaults, except ``size`` and ``sha256``
		            which are always taken from the write result
		        :param mode: passed through unchanged to ``_write_file``
		        :return: tuple ``(key, filename, size, metadata_dict)``
		        """
		        (filename,  # hash-name
		         full_path  # full-path/hash-name
		         ) = self.random_filename()

		        key_file, key_file_path = self._get_keyfile(key)

		        # STORE METADATA
		        _metadata = {
		            "version": "v1",

		            "key_file": key_file,  # this is the .key.json file storing meta
		            "key_file_path": key_file_path,  # full path to key_file
		            "archive_key": key,  # original name we stored archive under, e.g my-archive.zip
		            "archive_filename": filename,  # the actual filename we stored that file under
		            "archive_full_path": full_path,

		            "store_time": time.time(),
		            "access_count": 0,
		            "access_time": 0,

		            "size": 0
		        }
		        if metadata:
		            _metadata.update(metadata)

		        # iter(callable, sentinel): keep reading 4 MiB chunks until EOF (b'')
		        read_iterator = iter(functools.partial(value_reader.read, 2**22), b'')
		        size, sha256 = self._write_file(full_path, read_iterator, mode)
		        _metadata['size'] = size
		        _metadata['sha256'] = sha256

		        # after archive is finished, we create a key to save the presence of the binary file
		        # NOTE(review): the file is opened in binary mode, so the project's
		        # json.dumps presumably returns bytes - confirm against ext_json
		        with self.fs.open(key_file_path, 'wb') as f:
		            f.write(json.dumps(_metadata))

		        return key, filename, size, _metadata

		    def _fetch(self, key, retry, retry_attempts, retry_backoff):
		        """Open the archive stored under `key` and record the access.

		        :param key: name the archive was stored under
		        :param retry: whether to wait for a missing key; NOT_GIVEN means False
		        :param retry_attempts: number of re-checks; NOT_GIVEN means 0
		        :param retry_backoff: seconds to sleep between re-checks
		        :return: tuple ``(ShardFileReader, metadata_dict)``; the metadata
		            already carries the updated ``access_count`` / ``access_time``
		        :raises KeyError: when the key file does not exist after all retries
		        """
		        if retry is NOT_GIVEN:
		            retry = False
		        if retry_attempts is NOT_GIVEN:
		            retry_attempts = 0

		        if retry and retry_attempts > 0:
		            for _attempt in range(retry_attempts):
		                if key in self:
		                    break
		                # we didn't find the key, wait retry_backoff N seconds, and re-check
		                time.sleep(retry_backoff)

		        if key not in self:
		            # no exception is being handled at this point, so use log.error;
		            # log.exception would attach an empty "NoneType: None" traceback
		            log.error('requested key=%s not found in %s retry=%s, attempts=%s',
		                      key, self, retry, retry_attempts)
		            raise KeyError(key)

		        key_file, key_file_path = self._get_keyfile(key)
		        with self.fs.open(key_file_path, 'rb') as f:
		            metadata = json.loads(f.read())

		        archive_path = metadata['archive_full_path']

		        try:
		            return ShardFileReader(self.fs.open(archive_path, 'rb')), metadata
		        finally:
		            # runs after the reader was built (or the open failed), so the
		            # access snapshot below is written to the key file either way
		            # update usage stats, count and accessed
		            metadata["access_count"] = metadata.get("access_count", 0) + 1
		            metadata["access_time"] = time.time()
		            log.debug('Updated %s with access snapshot, access_count=%s access_time=%s',
		                      key_file, metadata['access_count'], metadata['access_time'])
		            with self.fs.open(key_file_path, 'wb') as f:
		                f.write(json.dumps(metadata))

		    def _remove(self, key):
		        """Delete the archive stored under `key` together with its key file.

		        :param key: name the archive was stored under
		        :return: always 1
		        :raises KeyError: when the key file does not exist
		        """
		        if key not in self:
		            # no exception is being handled at this point, so use log.error;
		            # log.exception would attach an empty "NoneType: None" traceback
		            log.error('requested key=%s not found in %s', key, self)
		            raise KeyError(key)

		        key_file, key_file_path = self._get_keyfile(key)
		        with self.fs.open(key_file_path, 'rb') as f:
		            metadata = json.loads(f.read())

		        archive_path = metadata['archive_full_path']
		        # the archive goes first, the key file (the presence marker) last
		        self.fs.rm(archive_path)
		        self.fs.rm(key_file_path)
		        return 1

		    @property
		    def storage_medium(self):
		        """Value of the attribute whose name is given by ``storage_type``."""
		        return getattr(self, self.storage_type)

		    @property
		    def key_suffix(self):
		        """Suffix used to recognise key files."""
		        return 'key.json'

		    def __contains__(self, key):
		        """Return `True` if `key` matching item is found in cache.

		        :param key: key matching item
		        :return: True if the key file for `key` exists on ``self.fs``
		        """
		        key_file, key_file_path = self._get_keyfile(key)
		        return self.fs.exists(key_file_path)


		class BaseCache:
		    """Base class of a sharded archive cache.

		    Each operation picks a shard with ``_get_shard(key)`` and delegates to
		    it. Subclasses must implement ``_get_shard`` and ``_get_size``, populate
		    ``_shards``, and provide ``_eviction_policy`` and ``_cache_size_limit``,
		    which ``evict`` reads but this class does not define.
		    """

		    _locking_url: str = ''
		    _storage_path: str = ''
		    # class-level dict: shared by every instance that does not rebind it
		    _config = {}
		    # defaults used by ``fetch`` when the caller does not pass them
		    retry = False
		    retry_attempts = 0
		    retry_backoff = 1
		    _shards = tuple()

		    def __contains__(self, key):
		        """Return `True` if `key` matching item is found in cache.

		        :param key: key matching item
		        :return: True if key matching item
		        """
		        return self.has_key(key)

		    def __repr__(self):
		        return f'<{self.__class__.__name__}(storage={self._storage_path})>'

		    @classmethod
		    def gb_to_bytes(cls, gb):
		        """Convert gibibytes (1024 ** 3 bytes each) to bytes."""
		        return gb * (1024 ** 3)

		    @property
		    def storage_path(self):
		        """Read-only view of ``_storage_path``."""
		        return self._storage_path

		    @classmethod
		    def get_stats_db(cls):
		        """Return a new ``StatsDB`` instance, used by ``evict``."""
		        return StatsDB()

		    def get_conf(self, key, pop=False):
		        """Read a value from ``_config``.

		        :param key: configuration key to read
		        :param pop: when True the key is also deleted from ``_config``
		        :return: the configured value
		        :raises ValueError: when `key` is not present in ``_config``
		        """
		        if key not in self._config:
		            raise ValueError(f"No configuration key '{key}', please make sure it exists in archive_cache config")
		        val = self._config[key]
		        if pop:
		            del self._config[key]
		        return val

		    def _get_shard(self, key):
		        """Return the shard responsible for `key`; must be overridden."""
		        raise NotImplementedError

		    def _get_size(self, shard, archive_path):
		        """Return the size of `archive_path` on `shard`; must be overridden.

		        ``evict`` uses it when a key file carries no usable ``size``.
		        """
		        raise NotImplementedError

		    def store(self, key, value_reader, metadata=None):
		        """Store an archive by delegating to ``shard.store`` of the key's shard."""
		        shard = self._get_shard(key)
		        return shard.store(key, value_reader, metadata)

		    def fetch(self, key, retry=NOT_GIVEN, retry_attempts=NOT_GIVEN) -> tuple[typing.BinaryIO, dict]:
		        """
		        Return file handle corresponding to `key` from specific shard cache.

		        :param key: name the archive was stored under
		        :param retry: overrides ``self.retry`` when given
		        :param retry_attempts: overrides ``self.retry_attempts`` when given
		        :return: whatever ``shard.fetch`` returns for the key's shard
		        """
		        if retry is NOT_GIVEN:
		            retry = self.retry
		        if retry_attempts is NOT_GIVEN:
		            retry_attempts = self.retry_attempts
		        retry_backoff = self.retry_backoff

		        shard = self._get_shard(key)
		        return shard.fetch(key, retry=retry, retry_attempts=retry_attempts, retry_backoff=retry_backoff)

		    def remove(self, key):
		        """Remove an archive by delegating to ``shard.remove`` of the key's shard."""
		        shard = self._get_shard(key)
		        return shard.remove(key)

		    def has_key(self, archive_key):
		        """Return `True` if `key` matching item is found in cache.

		        :param archive_key: key for item, this is a unique archive name we want to store data under. e.g my-archive-svn.zip
		        :return: True if key is found
		        """
		        shard = self._get_shard(archive_key)
		        return archive_key in shard

		    def iter_keys(self):
		        """Yield ``(shard, key_file_name)`` for every key file in every shard.

		        The second item is the bare name reported by ``fs.walk``; it is not
		        joined with the directory it was found in. Shards whose storage
		        location does not exist are skipped.
		        """
		        for shard in self._shards:
		            if shard.fs.exists(shard.storage_medium):
		                for _path, _dirs, _files in shard.fs.walk(shard.storage_medium):
		                    for key_file_path in _files:
		                        if key_file_path.endswith(shard.key_suffix):
		                            yield shard, key_file_path

		    def get_lock(self, lock_key):
		        """Return a ``GenerationLock`` for `lock_key` bound to ``_locking_url``."""
		        return GenerationLock(lock_key, self._locking_url)

		    def evict(self, policy=None, size_limit=None) -> int:
		        """
		        Remove old items until the cache fits into the size limit.

		        explanation of this algo:
		        iterate over each shard, then for each shard iterate over the .key files
		        read the key files metadata stored. This gives us a full list of keys, cached_archived, their size and
		        access data, time creation, and access counts.

		        Store that into a memory DB so we can run different sorting strategies easily.
		        Summing the size is a sum sql query.

		        Then we run a sorting strategy based on eviction policy.
		        We iterate over sorted keys, and remove them one by one until the
		        remaining total size is within the limit.

		        :param policy: key into ``EVICTION_POLICY``; a falsy value falls back
		            to ``self._eviction_policy``
		        :param size_limit: target size in bytes; a falsy value (including 0)
		            falls back to ``self._cache_size_limit``
		        :return: number of removed archives
		        """

		        policy = policy or self._eviction_policy
		        size_limit = size_limit or self._cache_size_limit

		        select_policy = EVICTION_POLICY[policy]['evict']

		        log.debug('Running eviction policy \'%s\', and checking for size limit: %s',
		                  policy, format_size(size_limit))

		        if select_policy is None:
		            # this policy defines no eviction query, nothing to do
		            return 0

		        db = self.get_stats_db()

		        data = []
		        for cnt, (shard, key_file) in enumerate(self.iter_keys(), start=1):
		            key_file_path = os.path.join(shard.storage_medium, key_file)
		            with shard.fs.open(key_file_path, 'rb') as f:
		                metadata = json.loads(f.read())

		            archive_key = metadata['archive_key']
		            archive_path = metadata['archive_full_path']

		            size = metadata.get('size')
		            if not size:
		                # in case we don't have size re-calc it...
		                size = self._get_size(shard, archive_path)

		            data.append([
		                cnt,
		                key_file,
		                key_file_path,
		                archive_key,
		                archive_path,
		                metadata.get('store_time', 0),
		                metadata.get('access_time', 0),
		                metadata.get('access_count', 0),
		                size,
		            ])

		        # Insert bulk data using executemany
		        db.bulk_insert(data)

		        total_size = db.get_total_size()
		        log.debug('Analyzed %s keys, occupying: %s, running eviction to match %s',
		                  len(data), format_size(total_size), format_size(size_limit))

		        removed_items = 0
		        removed_size = 0
		        for key_file, archive_key, size in db.get_sorted_keys(select_policy):
		            # check the limit BEFORE touching the next item: stopping only
		            # after a simulated removal would skip the last removal that is
		            # needed and leave the cache above the limit
		            if total_size <= size_limit:
		                # we obtained what we wanted...
		                break

		            self.remove(archive_key)
		            total_size -= size
		            removed_items += 1
		            removed_size += size

		        log.debug('Removed %s cache archives, and reduced size by: %s',
		                  removed_items, format_size(removed_size))
		        return removed_items

		    def get_statistics(self):
		        """Count the cached archives and sum their recorded sizes.

		        :return: tuple ``(total_files, total_size, meta)``; ``meta`` is
		            always an empty dict here
		        :raises KeyError: when a key file has no ``size`` entry
		        """
		        total_files = 0
		        total_size = 0
		        meta = {}

		        for shard, key_file in self.iter_keys():
		            json_key = f"{shard.storage_medium}/{key_file}"
		            with shard.fs.open(json_key, 'rb') as f:
		                total_files += 1
		                metadata = json.loads(f.read())
		                total_size += metadata['size']

		        return total_files, total_size, meta