@@ -1,258 +1,411 b'' | |||
|
1 | 1 | # RhodeCode VCSServer provides access to different vcs backends via network. |
|
2 | 2 | # Copyright (C) 2014-2024 RhodeCode GmbH |
|
3 | 3 | # |
|
4 | 4 | # This program is free software; you can redistribute it and/or modify |
|
5 | 5 | # it under the terms of the GNU General Public License as published by |
|
6 | 6 | # the Free Software Foundation; either version 3 of the License, or |
|
7 | 7 | # (at your option) any later version. |
|
8 | 8 | # |
|
9 | 9 | # This program is distributed in the hope that it will be useful, |
|
10 | 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
11 | 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
12 | 12 | # GNU General Public License for more details. |
|
13 | 13 | # |
|
14 | 14 | # You should have received a copy of the GNU General Public License |
|
15 | 15 | # along with this program; if not, write to the Free Software Foundation, |
|
16 | 16 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
17 | 17 | |
|
18 | 18 | import codecs |
|
19 | 19 | import contextlib |
|
20 | 20 | import functools |
|
21 | 21 | import os |
|
22 | 22 | import logging |
|
23 | 23 | import time |
|
24 | 24 | import typing |
|
25 | 25 | import zlib |
|
26 | import sqlite3 | |
|
26 | 27 | |
|
27 | 28 | from vcsserver.lib.rc_json import json |
|
28 | 29 | from .lock import GenerationLock |
|
29 | 30 | |
|
30 | 31 | log = logging.getLogger(__name__) |
|
31 | 32 | |
|
32 | 33 | cache_meta = None |
|
33 | 34 | |
|
34 | 35 | UNKNOWN = -241 |
|
35 | 36 | NO_VAL = -917 |
|
36 | 37 | |
|
37 | 38 | MODE_BINARY = 'BINARY' |
|
38 | 39 | |
|
39 | 40 | |
|
41 | EVICTION_POLICY = { | |
|
42 | 'none': { | |
|
43 | 'evict': None, | |
|
44 | }, | |
|
45 | 'least-recently-stored': { | |
|
46 | 'evict': 'SELECT {fields} FROM archive_cache ORDER BY store_time', | |
|
47 | }, | |
|
48 | 'least-recently-used': { | |
|
49 | 'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_time', | |
|
50 | }, | |
|
51 | 'least-frequently-used': { | |
|
52 | 'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_count', | |
|
53 | }, | |
|
54 | } | |
|
55 | ||
|
56 | ||
|
57 | class DB: | |
|
58 | ||
|
59 | def __init__(self): | |
|
60 | self.connection = sqlite3.connect(':memory:') | |
|
61 | self._init_db() | |
|
62 | ||
|
63 | def _init_db(self): | |
|
64 | qry = ''' | |
|
65 | CREATE TABLE IF NOT EXISTS archive_cache ( | |
|
66 | rowid INTEGER PRIMARY KEY, | |
|
67 | key_file TEXT, | |
|
68 | key_file_path TEXT, | |
|
69 | filename TEXT, | |
|
70 | full_path TEXT, | |
|
71 | store_time REAL, | |
|
72 | access_time REAL, | |
|
73 | access_count INTEGER DEFAULT 0, | |
|
74 | size INTEGER DEFAULT 0 | |
|
75 | ) | |
|
76 | ''' | |
|
77 | ||
|
78 | self.sql(qry) | |
|
79 | self.connection.commit() | |
|
80 | ||
|
81 | @property | |
|
82 | def sql(self): | |
|
83 | return self.connection.execute | |
|
84 | ||
|
85 | def bulk_insert(self, rows): | |
|
86 | qry = ''' | |
|
87 | INSERT INTO archive_cache ( | |
|
88 | rowid, | |
|
89 | key_file, | |
|
90 | key_file_path, | |
|
91 | filename, | |
|
92 | full_path, | |
|
93 | store_time, | |
|
94 | access_time, | |
|
95 | access_count, | |
|
96 | size | |
|
97 | ) | |
|
98 | VALUES ( | |
|
99 | ?, ?, ?, ?, ?, ?, ?, ?, ? | |
|
100 | ) | |
|
101 | ''' | |
|
102 | cursor = self.connection.cursor() | |
|
103 | cursor.executemany(qry, rows) | |
|
104 | self.connection.commit() | |
|
105 | ||
|
106 | ||
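As a hedged sketch of how the pieces above fit together (this mirrors what `evict()` does further down, it is not an extra API): the policy template is filled with the columns to select, and the resulting query runs against the in-memory table:

    db = DB()

    # pick a policy and substitute the columns we want back
    qry = EVICTION_POLICY['least-recently-used']['evict'].format(
        fields='key_file_path, full_path, size')
    # -> SELECT key_file_path, full_path, size FROM archive_cache ORDER BY access_time

    # db.sql is connection.execute, so this returns a cursor we can fetch from
    rows = db.sql(qry).fetchall()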
|
40 | 107 | class FileSystemCache: |
|
41 | 108 | |
|
42 | 109 | def __init__(self, index, directory, **settings): |
|
43 | 110 | self._index = index |
|
44 | 111 | self._directory = directory |
|
45 | 112 | |
|
46 | 113 | def _write_file(self, full_path, iterator, mode, encoding=None): |
|
47 | 114 | full_dir, _ = os.path.split(full_path) |
|
48 | 115 | |
|
49 | 116 | for count in range(1, 11): |
|
50 | 117 | with contextlib.suppress(OSError): |
|
51 | 118 | os.makedirs(full_dir) |
|
52 | 119 | |
|
53 | 120 | try: |
|
54 | 121 | # Another cache may have deleted the directory before |
|
55 | 122 | # the file could be opened. |
|
56 | 123 | writer = open(full_path, mode, encoding=encoding) |
|
57 | 124 | except OSError: |
|
58 | 125 | if count == 10: |
|
59 | 126 | # Give up after 10 tries to open the file. |
|
60 | 127 | raise |
|
61 | 128 | continue |
|
62 | 129 | |
|
63 | 130 | with writer: |
|
64 | 131 | size = 0 |
|
65 | 132 | for chunk in iterator: |
|
66 | 133 | size += len(chunk) |
|
67 | 134 | writer.write(chunk) |
|
68 | 135 | return size |
|
69 | 136 | |
|
70 | 137 | def _get_keyfile(self, key): |
|
71 | 138 | return os.path.join(self._directory, f'{key}.key') |
|
72 | 139 | |
|
73 | 140 | def store(self, key, value_reader, metadata): |
|
74 | 141 | filename, full_path = self.random_filename() |
|
75 | 142 | key_file = self._get_keyfile(key) |
|
76 | 143 | |
|
77 | 144 | # STORE METADATA |
|
78 | 145 | _metadata = { |
|
79 | 146 | "version": "v1", |
|
80 | "timestamp": time.time(), | |
|
81 | 147 | "filename": filename, |
|
82 | 148 | "full_path": full_path, |
|
83 | 149 | "key_file": key_file, |
|
150 | "store_time": time.time(), | |
|
151 | "access_count": 1, | |
|
152 | "access_time": 0, | |
|
153 | "size": 0 | |
|
84 | 154 | } |
|
85 | 155 | if metadata: |
|
86 | 156 | _metadata.update(metadata) |
|
87 | 157 | |
|
88 | 158 | reader = functools.partial(value_reader.read, 2**22) |
|
89 | 159 | |
|
90 | 160 | iterator = iter(reader, b'') |
|
91 | 161 | size = self._write_file(full_path, iterator, 'xb') |
|
162 | _metadata['size'] = size | |
|
92 | 163 | |
|
93 | 164 | # after the archive is written, we create a key file to mark the presence of the binary file |
|
94 | 165 | with open(key_file, 'wb') as f: |
|
95 | 166 | f.write(json.dumps(_metadata)) |
|
96 | 167 | |
|
97 | 168 | return key, size, MODE_BINARY, filename, _metadata |
|
98 | 169 | |
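A minimal usage sketch for `store` (the shard directory and key below are illustrative assumptions, not values from the source): `value_reader` only needs a binary `.read(size)` method, so any byte stream works:

    import io

    shard = FileSystemCache(index=0, directory='/tmp/archive_cache/shard_000')

    reader = io.BytesIO(b'tarball-bytes')
    key, size, mode, filename, meta = shard.store(
        'some-repo.tar.gz', reader, metadata={'archive_name': 'some-repo.tar.gz'})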
|
99 | 170 | def fetch(self, key) -> tuple[typing.BinaryIO, dict]: |
|
100 | 171 | if key not in self: |
|
101 | 172 | raise KeyError(key) |
|
102 | 173 | |
|
103 | 174 | key_file = self._get_keyfile(key) |
|
104 | 175 | with open(key_file, 'rb') as f: |
|
105 | 176 | metadata = json.loads(f.read()) |
|
106 | 177 | |
|
107 | 178 | filename = metadata['filename'] |
|
108 | 179 | |
|
180 | try: | |
|
109 | 181 | return open(os.path.join(self._directory, filename), 'rb'), metadata |
|
182 | finally: | |
|
183 | # update usage stats, count and accessed | |
|
184 | metadata["access_count"] = metadata.get("access_count", 0) + 1 | |
|
185 | metadata["access_time"] = time.time() | |
|
186 | ||
|
187 | with open(key_file, 'wb') as f: | |
|
188 | f.write(json.dumps(metadata)) | |
|
110 | 189 | |
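Note the ordering here: the `finally` block runs after `open()` succeeds but before the handle reaches the caller, so the access stats are bumped on every successful fetch. A hedged consumer-side sketch (`shard` as in the store example above):

    try:
        reader, metadata = shard.fetch('some-repo.tar.gz')
    except KeyError:
        reader = None  # cache miss, the archive has to be built first

    if reader is not None:
        with reader:
            # read in the same 4 MB blocks that store() writes in
            for chunk in iter(functools.partial(reader.read, 2 ** 22), b''):
                ...  # stream each chunk to the client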
|
111 | 190 | def random_filename(self): |
|
112 | 191 | """Return filename and full-path tuple for file storage. |
|
113 | 192 | |
|
114 | 193 | Filename will be a randomly generated 28 character hexadecimal string |
|
115 | 194 | with ".archive_cache" suffixed. Two levels of sub-directories will be used to |
|
116 | 195 | reduce the size of directories. On older filesystems, lookups in |
|
117 | 196 | directories with many files may be slow. |
|
118 | 197 | """ |
|
119 | 198 | |
|
120 | 199 | hex_name = codecs.encode(os.urandom(16), 'hex').decode('utf-8') |
|
121 | 200 | sub_dir = os.path.join(hex_name[:2], hex_name[2:4]) |
|
122 | 201 | name = hex_name[4:] + '.archive_cache' |
|
123 | 202 | filename = os.path.join(sub_dir, name) |
|
124 | 203 | full_path = os.path.join(self._directory, filename) |
|
125 | 204 | return filename, full_path |
|
126 | 205 | |
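A worked example of the layout this produces (the hex digits are random in practice):

    # hex_name = 'd41d8cd98f00b204e9800998ecf8427e'  # 32 hex chars from 16 random bytes
    # sub_dir  = 'd4/1d'                             # two 2-char directory levels
    # name     = '8cd98f00b204e9800998ecf8427e.archive_cache'  # remaining 28 chars + suffix
    # filename = 'd4/1d/8cd98f00b204e9800998ecf8427e.archive_cache'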
|
127 | 206 | def hash(self, key): |
|
128 | 207 | """Compute portable hash for `key`. |
|
129 | 208 | |
|
130 | 209 | :param key: key to hash |
|
131 | 210 | :return: hash value |
|
132 | 211 | |
|
133 | 212 | """ |
|
134 | 213 | mask = 0xFFFFFFFF |
|
135 | 214 | return zlib.adler32(key.encode('utf-8')) & mask # noqa |
|
136 | 215 | |
|
137 | 216 | def __contains__(self, key): |
|
138 | 217 | """Return `True` if `key` matching item is found in cache. |
|
139 | 218 | |
|
140 | 219 | :param key: key matching item |
|
141 | 220 | :return: True if key matching item |
|
142 | 221 | |
|
143 | 222 | """ |
|
144 | 223 | key_file = self._get_keyfile(key) |
|
145 | 224 | return os.path.exists(key_file) |
|
146 | 225 | |
|
147 | 226 | |
|
148 | 227 | class FanoutCache: |
|
149 | 228 | """Cache that shards keys and values.""" |
|
150 | 229 | |
|
151 | 230 | def __init__( |
|
152 | 231 | self, directory=None, **settings |
|
153 | 232 | ): |
|
154 | 233 | """Initialize cache instance. |
|
155 | 234 | |
|
156 | 235 | :param str directory: cache directory |
|
157 | 236 | :param settings: settings dict |
|
158 | 237 | |
|
159 | 238 | """ |
|
160 | 239 | if directory is None: |
|
161 | 240 | raise ValueError('directory cannot be None') |
|
162 | 241 | |
|
163 | 242 | directory = str(directory) |
|
164 | 243 | directory = os.path.expanduser(directory) |
|
165 | 244 | directory = os.path.expandvars(directory) |
|
166 | 245 | self._directory = directory |
|
167 | 246 | |
|
168 | 247 | self._count = settings.pop('cache_shards') |
|
169 | 248 | self._locking_url = settings.pop('locking_url') |
|
170 | 249 | |
|
250 | self._eviction_policy = settings['cache_eviction_policy'] | |
|
251 | self._cache_size_limit = settings['cache_size_limit'] | |
|
252 | ||
|
171 | 253 | self._shards = tuple( |
|
172 | 254 | FileSystemCache( |
|
173 | 255 | index=num, |
|
174 | 256 | directory=os.path.join(directory, 'shard_%03d' % num), |
|
175 | 257 | **settings, |
|
176 | 258 | ) |
|
177 | 259 | for num in range(self._count) |
|
178 | 260 | ) |
|
179 | 261 | self._hash = self._shards[0].hash |
|
180 | 262 | |
|
181 | 263 | def get_lock(self, lock_key): |
|
182 | 264 | return GenerationLock(lock_key, self._locking_url) |
|
183 | 265 | |
|
184 | 266 | def _get_shard(self, key) -> FileSystemCache: |
|
185 | 267 | index = self._hash(key) % self._count |
|
186 | 268 | shard = self._shards[index] |
|
187 | 269 | return shard |
|
188 | 270 | |
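Routing is deterministic: the adler32-based hash from `FileSystemCache.hash`, taken modulo the shard count, always sends a given key to the same shard. A standalone sketch of that computation:

    import zlib

    def shard_index(key: str, shard_count: int) -> int:
        # portable 32-bit hash, same as FileSystemCache.hash
        return (zlib.adler32(key.encode('utf-8')) & 0xFFFFFFFF) % shard_count

    # the same key always lands on the same shard
    assert shard_index('some-repo.tar.gz', 8) == shard_index('some-repo.tar.gz', 8)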
|
189 | 271 | def store(self, key, value_reader, metadata=None): |
|
190 | 272 | shard = self._get_shard(key) |
|
191 | 273 | return shard.store(key, value_reader, metadata) |
|
192 | 274 | |
|
193 | 275 | def fetch(self, key): |
|
194 | 276 | """Return file handle corresponding to `key` from cache. |
|
195 | 277 | """ |
|
196 | 278 | shard = self._get_shard(key) |
|
197 | 279 | return shard.fetch(key) |
|
198 | 280 | |
|
199 | 281 | def has_key(self, key): |
|
200 | 282 | """Return `True` if `key` matching item is found in cache. |
|
201 | 283 | |
|
202 | 284 | :param key: key for item |
|
203 | 285 | :return: True if key is found |
|
204 | 286 | |
|
205 | 287 | """ |
|
206 | 288 | shard = self._get_shard(key) |
|
207 | 289 | return key in shard |
|
208 | 290 | |
|
209 | 291 | def __contains__(self, item): |
|
210 | 292 | return self.has_key(item) |
|
211 | 293 | |
|
294 | def evict(self, policy=None, size_limit=None): | |
|
295 | """ | |
|
296 | Remove old items based on the configured eviction policy and size limit. | |

297 | ||

298 | Explanation of the algorithm: | |

299 | ||

300 | Iterate over each shard, and for each shard iterate over its .key files, | |

301 | reading the metadata stored in them. This gives us a full list of keys, their | |

302 | cached archives, sizes, store times, access times, and access counts. | |

303 | ||

304 | Store all of that in an in-memory DB so we can run different sorting | |

305 | strategies easily. Summing the sizes is a single SUM SQL query. | |

306 | ||

307 | Then run a sorting strategy based on the eviction policy, iterate over the | |

308 | sorted keys, and remove entries until the overall size limit is reached. | |
|
309 | """ | |
|
310 | ||
|
311 | policy = policy or self._eviction_policy | |
|
312 | size_limit = size_limit or self._cache_size_limit | |
|
313 | ||
|
314 | select_policy = EVICTION_POLICY[policy]['evict'] | |
|
315 | ||
|
316 | if select_policy is None: | |
|
317 | return 0 | |
|
318 | ||
|
319 | db = DB() | |
|
320 | ||
|
321 | data = [] | |
|
322 | cnt = 1 | |
|
323 | for shard in self._shards: | |
|
324 | for key_file in os.listdir(shard._directory): | |
|
325 | if key_file.endswith('.key'): | |
|
326 | key_file_path = os.path.join(shard._directory, key_file) | |
|
327 | with open(key_file_path, 'rb') as f: | |
|
328 | metadata = json.loads(f.read()) | |
|
329 | # if the size was not stored, re-calculate it from the file on disk | |

330 | size = metadata.get('size') | |

331 | if not size: | |

332 | size = os.stat(metadata.get('full_path')).st_size | |

333 | ||
|
334 | data.append([ | |
|
335 | cnt, | |
|
336 | key_file, | |
|
337 | key_file_path, | |
|
338 | metadata.get('filename'), | |
|
339 | metadata.get('full_path'), | |
|
340 | metadata.get('store_time', 0), | |
|
341 | metadata.get('access_time', 0), | |
|
342 | metadata.get('access_count', 0), | |
|
343 | size, | |
|
344 | ]) | |
|
345 | cnt += 1 | |
|
346 | ||
|
347 | # Insert bulk data using executemany | |
|
348 | db.bulk_insert(data) | |
|
349 | ||
|
350 | ((total_size,),) = db.sql('SELECT COALESCE(SUM(size), 0) FROM archive_cache').fetchall() | |
|
351 | ||
|
352 | select_policy_qry = select_policy.format(fields='key_file_path, full_path, size') | |
|
353 | sorted_keys = db.sql(select_policy_qry).fetchall() | |
|
354 | ||
|
355 | for key_file_path, cached_file, size in sorted_keys: | |

356 | # simulate the removal impact BEFORE actually removing anything | |

357 | total_size -= size | |

358 | if total_size <= size_limit: | |

359 | # we reached the target size, stop evicting | |

360 | break | |

361 | ||

362 | os.remove(cached_file) | |

363 | os.remove(key_file_path) | |

364 | return | |
|
365 | ||
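A hedged usage sketch, assuming `d_cache` is a configured `FanoutCache`; with no arguments the constructor-supplied policy and limit are used:

    # evict least-recently-used entries until the cache fits in 10 GB
    d_cache.evict(policy='least-recently-used', size_limit=10 * 1024**3)

    # fall back to the policy/limit the cache was constructed with
    d_cache.evict()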
|
212 | 366 | |
|
213 | 367 | def get_archival_config(config): |
|
214 | 368 | |
|
215 | 369 | final_config = { |
|
216 | 370 | |
|
217 | 371 | } |
|
218 | 372 | |
|
219 | 373 | for k, v in config.items(): |
|
220 | 374 | if k.startswith('archive_cache'): |
|
221 | 375 | final_config[k] = v |
|
222 | 376 | |
|
223 | 377 | return final_config |
|
224 | 378 | |
|
225 | 379 | |
|
226 | 380 | def get_archival_cache_store(config): |
|
227 | 381 | |
|
228 | 382 | global cache_meta |
|
229 | 383 | if cache_meta is not None: |
|
230 | 384 | return cache_meta |
|
231 | 385 | |
|
232 | 386 | config = get_archival_config(config) |
|
233 | 387 | backend = config['archive_cache.backend.type'] |
|
234 | 388 | if backend != 'filesystem': |
|
235 | 389 | raise ValueError('archive_cache.backend.type only supports "filesystem"') |
|
236 | 390 | |
|
237 | 391 | archive_cache_locking_url = config['archive_cache.locking.url'] |
|
238 | 392 | archive_cache_dir = config['archive_cache.filesystem.store_dir'] |
|
239 | 393 | archive_cache_size_gb = config['archive_cache.filesystem.cache_size_gb'] |
|
240 | 394 | archive_cache_shards = config['archive_cache.filesystem.cache_shards'] |
|
241 | 395 | archive_cache_eviction_policy = config['archive_cache.filesystem.eviction_policy'] |
|
242 | 396 | |
|
243 | 397 | log.debug('Initializing archival cache instance under %s', archive_cache_dir) |
|
244 | 398 | |
|
245 | 399 | # check if it's ok to write, and re-create the archive cache |
|
246 | 400 | if not os.path.isdir(archive_cache_dir): |
|
247 | 401 | os.makedirs(archive_cache_dir, exist_ok=True) |
|
248 | 402 | |
|
249 | 403 | d_cache = FanoutCache( |
|
250 | 404 | archive_cache_dir, |
|
251 | 405 | locking_url=archive_cache_locking_url, |
|
252 | 406 | cache_shards=archive_cache_shards, |
|
253 | 407 | cache_size_limit=archive_cache_size_gb * 1024 * 1024 * 1024, |
|
254 | 408 | cache_eviction_policy=archive_cache_eviction_policy |
|
255 | 409 | ) |
|
256 | 410 | cache_meta = d_cache |
|
257 | 411 | return cache_meta |
|
258 |
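For illustration, a hedged example of the config keys this function expects (the values shown are assumptions, not defaults from the source):

    config = {
        'archive_cache.backend.type': 'filesystem',
        'archive_cache.locking.url': 'redis://redis:6379/0',
        'archive_cache.filesystem.store_dir': '/var/opt/rhodecode_data/archive_cache',
        'archive_cache.filesystem.cache_size_gb': 10,
        'archive_cache.filesystem.cache_shards': 8,
        'archive_cache.filesystem.eviction_policy': 'least-recently-stored',
    }
    d_cache = get_archival_cache_store(config)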
@@ -1,29 +1,71 b'' | |||
|
1 | 1 | # RhodeCode VCSServer provides access to different vcs backends via network. |
|
2 | 2 | # Copyright (C) 2014-2024 RhodeCode GmbH |
|
3 | 3 | # |
|
4 | 4 | # This program is free software; you can redistribute it and/or modify |
|
5 | 5 | # it under the terms of the GNU General Public License as published by |
|
6 | 6 | # the Free Software Foundation; either version 3 of the License, or |
|
7 | 7 | # (at your option) any later version. |
|
8 | 8 | # |
|
9 | 9 | # This program is distributed in the hope that it will be useful, |
|
10 | 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
11 | 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
12 | 12 | # GNU General Public License for more details. |
|
13 | 13 | # |
|
14 | 14 | # You should have received a copy of the GNU General Public License |
|
15 | 15 | # along with this program; if not, write to the Free Software Foundation, |
|
16 | 16 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
17 | 17 | |
|
18 | import os | |
|
19 | ||
|
18 | 20 | |
|
19 | 21 | class ArchiveCacheLock(Exception): |
|
20 | 22 | pass |
|
21 | 23 | |
|
22 | 24 | |
|
23 | 25 | def archive_iterator(_reader, block_size: int = 4096 * 512): |
|
24 | 26 | # 4096 * 64 = 64KB |
|
25 | 27 | while 1: |
|
26 | 28 | data = _reader.read(block_size) |
|
27 | 29 | if not data: |
|
28 | 30 | break |
|
29 | 31 | yield data |
|
32 | ||
|
33 | ||
|
34 | def get_directory_statistics(start_path): | |
|
35 | """ | |
|
36 | total_files, total_size, directory_stats = get_directory_statistics(start_path) | |
|
37 | ||
|
38 | print(f"Directory statistics for: {start_path}\n") | |
|
39 | print(f"Total files: {total_files}") | |
|
40 | print(f"Total size: {format_size(total_size)}\n") | |
|
41 | ||
|
42 | :param start_path: | |
|
43 | :return: | |
|
44 | """ | |
|
45 | ||
|
46 | total_files = 0 | |
|
47 | total_size = 0 | |
|
48 | directory_stats = {} | |
|
49 | ||
|
50 | for dir_path, dir_names, file_names in os.walk(start_path): | |
|
51 | dir_size = 0 | |
|
52 | file_count = len(file_names) | |
|
53 | ||
|
54 | for file in file_names: | |
|
55 | filepath = os.path.join(dir_path, file) | |
|
56 | file_size = os.path.getsize(filepath) | |
|
57 | dir_size += file_size | |
|
58 | ||
|
59 | directory_stats[dir_path] = {'file_count': file_count, 'size': dir_size} | |
|
60 | total_files += file_count | |
|
61 | total_size += dir_size | |
|
62 | ||
|
63 | return total_files, total_size, directory_stats | |
|
64 | ||
|
65 | ||
|
66 | def format_size(size): | |
|
67 | # Convert size in bytes to a human-readable format (e.g., KB, MB, GB) | |
|
68 | for unit in ['B', 'KB', 'MB', 'GB', 'TB']: | |
|
69 | if size < 1024: | |
|
70 | return f"{size:.2f} {unit}" | |
|
71 | size /= 1024 |

72 | return f"{size:.2f} PB" | |
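And a short hedged example tying the two helpers together (the path is an assumption):

    total_files, total_size, _stats = get_directory_statistics(
        '/var/opt/rhodecode_data/archive_cache')
    print(f'{total_files} files, {format_size(total_size)}')  # e.g. "1287 files, 9.42 GB"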