rhodecode-vcsserver Commit - r1242:8380b87c

archive-cache: synced with CE lib

super-admin -

r1242:8380b87c default

parent child

vcsserver/lib/rc_cache/archive_cache/fanout_cache.py

0 +156 -3

             # RhodeCode VCSServer provides access to different vcs backends via network.
             # Copyright (C) 2014-2024 RhodeCode GmbH
             #
             # This program is free software; you can redistribute it and/or modify
             # it under the terms of the GNU General Public License as published by
             # the Free Software Foundation; either version 3 of the License, or
             # (at your option) any later version.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program; if not, write to the Free Software Foundation,
             # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
             import codecs
             import contextlib
             import functools
             import os
             import logging
             import time
             import typing
             import zlib
+            import sqlite3
             from vcsserver.lib.rc_json import json
             from .lock import GenerationLock
             # Module-level logger named after this module.
             log = logging.getLogger(__name__)
             # Process-wide cache instance; stays None until get_archival_cache_store()
             # builds the FanoutCache, after which the same object is returned.
             cache_meta = None
             # NOTE(review): these two look like sentinel values, but nothing in the
             # visible code references them -- confirm usage before relying on them.
             UNKNOWN = -241
             NO_VAL = -917
             # Storage-mode marker returned as part of FileSystemCache.store()'s result.
             MODE_BINARY = 'BINARY'
+            EVICTION_POLICY = {
+                'none': {
+                    'evict': None,
+                },
+                'least-recently-stored': {
+                    'evict': 'SELECT {fields} FROM archive_cache ORDER BY store_time',
+                },
+                'least-recently-used': {
+                    'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_time',
+                },
+                'least-frequently-used': {
+                    'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_count',
+                },
+            }
+            class DB:
+                def __init__(self):
+                    self.connection = sqlite3.connect(':memory:')
+                    self._init_db()
+                def _init_db(self):
+                    qry = '''
+                        CREATE TABLE IF NOT EXISTS archive_cache (
+                         rowid INTEGER PRIMARY KEY,
+                         key_file TEXT,
+                         key_file_path TEXT,
+                         filename TEXT,
+                         full_path TEXT,
+                         store_time REAL,
+                         access_time REAL,
+                         access_count INTEGER DEFAULT 0,
+                         size INTEGER DEFAULT 0
+                         )
+                    '''
+                    self.sql(qry)
+                    self.connection.commit()
+                @property
+                def sql(self):
+                    return self.connection.execute
+                def bulk_insert(self, rows):
+                    qry = '''
+                        INSERT INTO archive_cache (
+                         rowid,
+                         key_file,
+                         key_file_path,
+                         filename,
+                         full_path,
+                         store_time,
+                         access_time,
+                         access_count,
+                         size
+                        )
+                        VALUES (
+                        ?, ?, ?, ?, ?, ?, ?, ?, ?
+                        )
+                    '''
+                    cursor = self.connection.cursor()
+                    cursor.executemany(qry, rows)
+                    self.connection.commit()
             class FileSystemCache:
                 """One shard of the archive cache, backed by a single directory.

                 Every cached archive is written to a randomly named
                 ``*.archive_cache`` file; a ``<key>.key`` JSON file next to the shard
                 root records where that archive lives plus its usage statistics
                 (store time, access time, access count and size).
                 """

                 def __init__(self, index, directory, **settings):
                     self._index = index
                     self._directory = directory

                 def _write_file(self, full_path, iterator, mode, encoding=None):
                     """Write all chunks of `iterator` to `full_path`.

                     The target directory is (re)created on each attempt and opening the
                     file is retried up to 10 times before the OSError is re-raised.

                     :return: number of bytes/characters written
                     """
                     full_dir, _ = os.path.split(full_path)

                     for count in range(1, 11):
                         with contextlib.suppress(OSError):
                             os.makedirs(full_dir)

                         try:
                             # Another cache may have deleted the directory before
                             # the file could be opened.
                             writer = open(full_path, mode, encoding=encoding)
                         except OSError:
                             if count == 10:
                                 # Give up after 10 tries to open the file.
                                 raise
                             continue

                         with writer:
                             size = 0
                             for chunk in iterator:
                                 size += len(chunk)
                                 writer.write(chunk)
                             return size

                 def _get_keyfile(self, key):
                     """Return the path of the metadata file for `key`."""
                     return os.path.join(self._directory, f'{key}.key')

                 def _write_keyfile(self, key_file, metadata):
                     """Serialize `metadata` as JSON and write it to `key_file`."""
                     payload = json.dumps(metadata)
                     if isinstance(payload, str):
                         # the file is opened in binary mode; accept JSON backends that
                         # return text as well as ones that return bytes
                         payload = payload.encode('utf-8')
                     with open(key_file, 'wb') as f:
                         f.write(payload)

                 def store(self, key, value_reader, metadata):
                     """Store the content of `value_reader` under `key`.

                     :param key: cache key
                     :param value_reader: object with a ``read(size)`` method returning bytes
                     :param metadata: optional dict of extra metadata, may be None
                     :return: tuple of (key, size, MODE_BINARY, filename, stored metadata)
                     """
                     filename, full_path = self.random_filename()
                     key_file = self._get_keyfile(key)

                     # STORE METADATA
                     _metadata = {
                         "version": "v1",
                         "filename": filename,
                         "full_path": full_path,
                         "key_file": key_file,
                         "store_time": time.time(),
                         "access_count": 1,
                         "access_time": 0,
                         "size": 0
                     }
                     if metadata:
                         _metadata.update(metadata)

                     reader = functools.partial(value_reader.read, 2**22)
                     iterator = iter(reader, b'')
                     size = self._write_file(full_path, iterator, 'xb')
                     # record the real size in the metadata that gets persisted; the
                     # caller supplied `metadata` may be None and is left untouched
                     _metadata['size'] = size

                     # after archive is finished, we create a key to save the presence of the binary file
                     self._write_keyfile(key_file, _metadata)

                     return key, size, MODE_BINARY, filename, _metadata

                 def fetch(self, key) -> tuple[typing.BinaryIO, dict]:
                     """Open the archive stored under `key` and bump its usage stats.

                     :param key: cache key
                     :return: tuple of (open binary file handle, metadata dict)
                     :raises KeyError: when no key file exists for `key`
                     """
                     if key not in self:
                         raise KeyError(key)

                     key_file = self._get_keyfile(key)
                     with open(key_file, 'rb') as f:
                         metadata = json.loads(f.read())

                     filename = metadata['filename']

                     # open first: a missing archive must not be counted as an access
                     archive = open(os.path.join(self._directory, filename), 'rb')
                     try:
                         # update usage stats, count and accessed
                         metadata["access_count"] = metadata.get("access_count", 0) + 1
                         metadata["access_time"] = time.time()
                         self._write_keyfile(key_file, metadata)
                     except BaseException:
                         # do not leak the handle when the stats cannot be saved
                         archive.close()
                         raise
                     return archive, metadata

                 def random_filename(self):
                     """Return filename and full-path tuple for file storage.

                     Filename will be a randomly generated 28 character hexadecimal string
                     with ".archive_cache" suffixed. Two levels of sub-directories will be used to
                     reduce the size of directories. On older filesystems, lookups in
                     directories with many files may be slow.
                     """
                     hex_name = codecs.encode(os.urandom(16), 'hex').decode('utf-8')
                     sub_dir = os.path.join(hex_name[:2], hex_name[2:4])
                     name = hex_name[4:] + '.archive_cache'
                     filename = os.path.join(sub_dir, name)
                     full_path = os.path.join(self._directory, filename)
                     return filename, full_path

                 def hash(self, key):
                     """Compute portable hash for `key`.

                     :param key: key to hash
                     :return: hash value
                     """
                     mask = 0xFFFFFFFF
                     return zlib.adler32(key.encode('utf-8')) & mask  # noqa

                 def __contains__(self, key):
                     """Return `True` if `key` matching item is found in cache.

                     :param key: key matching item
                     :return: True if key matching item
                     """
                     key_file = self._get_keyfile(key)
                     return os.path.exists(key_file)
             class FanoutCache:
                 """Cache that shards keys and values."""

                 def __init__(
                     self, directory=None, **settings
                 ):
                     """Initialize cache instance.

                     :param str directory: cache directory
                     :param settings: settings dict; must contain ``cache_shards``,
                         ``locking_url``, ``cache_eviction_policy`` and ``cache_size_limit``
                     :raises ValueError: when `directory` is None
                     """
                     if directory is None:
                         raise ValueError('directory cannot be None')

                     directory = str(directory)
                     directory = os.path.expanduser(directory)
                     directory = os.path.expandvars(directory)
                     self._directory = directory

                     self._count = settings.pop('cache_shards')
                     self._locking_url = settings.pop('locking_url')

                     self._eviction_policy = settings['cache_eviction_policy']
                     self._cache_size_limit = settings['cache_size_limit']

                     self._shards = tuple(
                         FileSystemCache(
                             index=num,
                             directory=os.path.join(directory, 'shard_%03d' % num),
                             **settings,
                         )
                         for num in range(self._count)
                     )
                     self._hash = self._shards[0].hash

                 def get_lock(self, lock_key):
                     """Return a `GenerationLock` for `lock_key` using the configured locking url."""
                     return GenerationLock(lock_key, self._locking_url)

                 def _get_shard(self, key) -> FileSystemCache:
                     """Return the shard responsible for `key` (hash modulo shard count)."""
                     index = self._hash(key) % self._count
                     shard = self._shards[index]
                     return shard

                 def store(self, key, value_reader, metadata=None):
                     """Store the content of `value_reader` under `key` in its shard."""
                     shard = self._get_shard(key)
                     return shard.store(key, value_reader, metadata)

                 def fetch(self, key):
                     """Return file handle corresponding to `key` from cache.
                     """
                     shard = self._get_shard(key)
                     return shard.fetch(key)

                 def has_key(self, key):
                     """Return `True` if `key` matching item is found in cache.

                     :param key: key for item
                     :return: True if key is found
                     """
                     shard = self._get_shard(key)
                     return key in shard

                 def __contains__(self, item):
                     return self.has_key(item)

                 def _collect_entries(self):
                     """Read every ``.key`` file of every shard into rows for `DB.bulk_insert`."""
                     rows = []
                     for shard in self._shards:
                         try:
                             names = os.listdir(shard._directory)
                         except FileNotFoundError:
                             # a shard directory only appears once something was stored in it
                             continue

                         for key_file in names:
                             if not key_file.endswith('.key'):
                                 continue

                             key_file_path = os.path.join(shard._directory, key_file)
                             try:
                                 with open(key_file_path, 'rb') as f:
                                     metadata = json.loads(f.read())
                             except FileNotFoundError:
                                 # key file disappeared between listing and reading
                                 continue

                             full_path = metadata.get('full_path')
                             size = metadata.get('size')
                             if not size:
                                 # in case we don't have size re-calc it...
                                 try:
                                     size = os.stat(full_path).st_size
                                 except (OSError, TypeError):
                                     # archive is missing or its path was never recorded
                                     size = 0

                             rows.append([
                                 len(rows) + 1,
                                 key_file,
                                 key_file_path,
                                 metadata.get('filename'),
                                 full_path,
                                 metadata.get('store_time', 0),
                                 metadata.get('access_time', 0),
                                 metadata.get('access_count', 0),
                                 size,
                             ])
                     return rows

                 def evict(self, policy=None, size_limit=None):
                     """
                     Remove old items based on the conditions

                     explanation of this algo:
                     iterate over each shard, then for each shard iterate over the .key files
                     read the key files metadata stored. This gives us a full list of keys, cached_archived, their size and
                     access data, time creation, and access counts.

                     Store that into a memory DB so we can run different sorting strategies easily.
                     Summing the size is a sum sql query.

                     Then we run a sorting strategy based on eviction policy.
                     We iterate over sorted keys, and remove each until the total size is
                     at or below the limit.

                     :param policy: eviction policy name, defaults to the configured one
                     :param size_limit: size limit in bytes, defaults to the configured one
                     :return: number of removed cache entries
                     """
                     policy = policy or self._eviction_policy
                     size_limit = size_limit or self._cache_size_limit

                     select_policy = EVICTION_POLICY[policy]['evict']
                     if select_policy is None:
                         return 0

                     data = self._collect_entries()

                     db = DB()
                     try:
                         # Insert bulk data using executemany
                         db.bulk_insert(data)
                         ((total_size,),) = db.sql('SELECT COALESCE(SUM(size), 0) FROM archive_cache').fetchall()
                         select_policy_qry = select_policy.format(fields='key_file_path, full_path, size')
                         sorted_keys = db.sql(select_policy_qry).fetchall()
                     finally:
                         db.connection.close()

                     removed = 0
                     for key_file_path, cached_file, size in sorted_keys:
                         if total_size <= size_limit:
                             # we obtained what we wanted...
                             break

                         for path in (cached_file, key_file_path):
                             if path:
                                 # another process may have removed the file already
                                 with contextlib.suppress(FileNotFoundError):
                                     os.remove(path)
                         total_size -= size
                         removed += 1

                     return removed
             def get_archival_config(config):
                 """Return a new dict with only the `config` items whose key starts with ``archive_cache``."""
                 return {
                     name: value
                     for name, value in config.items()
                     if name.startswith('archive_cache')
                 }
             def get_archival_cache_store(config):
                 """Return the process-wide archive cache, creating it on first use.

                 The first call builds a `FanoutCache` from the ``archive_cache.*`` entries
                 of `config` and remembers it in the module global ``cache_meta``; later
                 calls return that same instance and ignore `config`.

                 :raises ValueError: when the configured backend type is not ``filesystem``
                 """
                 global cache_meta

                 if cache_meta is None:
                     settings = get_archival_config(config)

                     if settings['archive_cache.backend.type'] != 'filesystem':
                         raise ValueError('archive_cache.backend.type only supports "filesystem"')

                     locking_url = settings['archive_cache.locking.url']
                     store_dir = settings['archive_cache.filesystem.store_dir']
                     size_gb = settings['archive_cache.filesystem.cache_size_gb']
                     shards = settings['archive_cache.filesystem.cache_shards']
                     eviction_policy = settings['archive_cache.filesystem.eviction_policy']

                     log.debug('Initializing archival cache instance under %s', store_dir)

                     # check if it's ok to write, and re-create the archive cache
                     if not os.path.isdir(store_dir):
                         os.makedirs(store_dir, exist_ok=True)

                     # the global is only assigned once construction has succeeded
                     cache_meta = FanoutCache(
                         store_dir,
                         locking_url=locking_url,
                         cache_shards=shards,
                         cache_size_limit=size_gb * 1024 * 1024 * 1024,
                         cache_eviction_policy=eviction_policy
                     )

                 return cache_meta

vcsserver/lib/rc_cache/archive_cache/utils.py

0 +42 0

             # RhodeCode VCSServer provides access to different vcs backends via network.
             # Copyright (C) 2014-2024 RhodeCode GmbH
             #
             # This program is free software; you can redistribute it and/or modify
             # it under the terms of the GNU General Public License as published by
             # the Free Software Foundation; either version 3 of the License, or
             # (at your option) any later version.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program; if not, write to the Free Software Foundation,
             # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
+            import os
             class ArchiveCacheLock(Exception):
                 """Exception type named for archive-cache lock failures.

                 No raise site is visible in this module; see callers for when it is used.
                 """
             def archive_iterator(_reader, block_size: int = 4096 * 512):
                 """Yield successive chunks read from `_reader` until a read returns nothing.

                 :param _reader: object exposing ``read(size)``
                 :param block_size: amount requested per read; the default 4096 * 512 is 2 MiB
                 """
                 while chunk := _reader.read(block_size):
                     yield chunk
+            def get_directory_statistics(start_path):
+                """
+                total_files, total_size, directory_stats = get_directory_statistics(start_path)
+                print(f"Directory statistics for: {start_path}\n")
+                print(f"Total files: {total_files}")
+                print(f"Total size: {format_size(total_size)}\n")
+                :param start_path:
+                :return:
+                """
+                total_files = 0
+                total_size = 0
+                directory_stats = {}
+                for dir_path, dir_names, file_names in os.walk(start_path):
+                    dir_size = 0
+                    file_count = len(file_names)
+                    for file in file_names:
+                        filepath = os.path.join(dir_path, file)
+                        file_size = os.path.getsize(filepath)
+                        dir_size += file_size
+                    directory_stats[dir_path] = {'file_count': file_count, 'size': dir_size}
+                    total_files += file_count
+                    total_size += dir_size
+                return total_files, total_size, directory_stats
+            def format_size(size):
+                # Convert size in bytes to a human-readable format (e.g., KB, MB, GB)
+                for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
+                    if size < 1024:
+                        return f"{size:.2f} {unit}"
+                    size /= 1024

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages