##// END OF EJS Templates
feat(archive-cache): synced with CE codebase
feat(archive-cache): synced with CE codebase

File last commit:

r1251:f8e66197 default
r1251:f8e66197 default
Show More
base.py
348 lines | 11.2 KiB | text/x-python | PythonLexer
feat(archive-cache): synced with CE codebase
# Copyright (C) 2015-2024 RhodeCode GmbH
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License, version 3
# (only), as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# This program is dual-licensed. If you wish to learn more about the
# RhodeCode Enterprise Edition, including its added features, Support services,
# and proprietary license terms, please see https://rhodecode.com/licenses/
import os
import functools
import logging
import typing
import time
import zlib
from ...ext_json import json
from ..utils import StatsDB, NOT_GIVEN, ShardFileReader, EVICTION_POLICY, format_size
from ..lock import GenerationLock
log = logging.getLogger(__name__)
class BaseShard:
    """Base class for a single shard of the archive cache.

    Every stored archive is written as a binary file under a random filename,
    accompanied by a key file (suffix ``key.json``) holding its metadata.
    All IO goes through ``self.fs`` (``open``, ``exists`` and ``rm`` are used
    here); concrete backends provide that object and implement
    `_write_file`, `_get_keyfile` and `random_filename`.
    """

    # name of the instance attribute that holds the storage location,
    # resolved by the `storage_medium` property
    storage_type: str = ''
    # filesystem object used for all reads/writes; expected to be set by subclasses
    fs = None

    @classmethod
    def hash(cls, key):
        """Compute portable hash for `key`.

        :param key: key to hash
        :return: hash value
        """
        mask = 0xFFFFFFFF
        return zlib.adler32(key.encode('utf-8')) & mask  # noqa

    def _write_file(self, full_path, read_iterator, mode):
        """Write chunks from `read_iterator` to `full_path`.

        Must be implemented by subclasses; `_store` expects a
        ``(size, sha256)`` tuple back.
        """
        raise NotImplementedError

    def _get_keyfile(self, key):
        """Return ``(key_file, key_file_path)`` for `key`; implemented by subclasses."""
        raise NotImplementedError

    def random_filename(self):
        """Return ``(filename, full_path)`` for a new archive; implemented by subclasses."""
        raise NotImplementedError

    def _store(self, key, value_reader, metadata, mode):
        """Store the content of `value_reader` under `key` and write its key file.

        :param key: archive key, e.g. my-archive.zip
        :param value_reader: object with a ``read(size)`` method returning bytes
        :param metadata: optional dict merged over the default metadata
        :param mode: passed through to `_write_file`
        :return: tuple of ``(key, filename, size, metadata)``
        """
        (filename,  # hash-name
         full_path  # full-path/hash-name
         ) = self.random_filename()

        key_file, key_file_path = self._get_keyfile(key)

        # STORE METADATA
        _metadata = {
            "version": "v1",

            "key_file": key_file,  # this is the .key.json file storing meta
            "key_file_path": key_file_path,  # full path to key_file
            "archive_key": key,  # original name we stored archive under, e.g my-archive.zip
            "archive_filename": filename,  # the actual filename we stored that file under
            "archive_full_path": full_path,

            "store_time": time.time(),
            "access_count": 0,
            "access_time": 0,

            "size": 0
        }
        if metadata:
            # caller supplied values win over the defaults above
            _metadata.update(metadata)

        # read the source in 4 MiB chunks until an empty bytes object is returned
        read_iterator = iter(functools.partial(value_reader.read, 2**22), b'')
        size, sha256 = self._write_file(full_path, read_iterator, mode)
        _metadata['size'] = size
        _metadata['sha256'] = sha256

        # after archive is finished, we create a key to save the presence of the binary file
        # NOTE(review): the key file is opened in binary mode, so the project's
        # json.dumps presumably returns bytes - confirm
        with self.fs.open(key_file_path, 'wb') as f:
            f.write(json.dumps(_metadata))

        return key, filename, size, _metadata

    def _fetch(self, key, retry, retry_attempts, retry_backoff):
        """Open the archive stored under `key` and record the access.

        :param key: archive key to look up
        :param retry: whether to wait for the key to show up; NOT_GIVEN means False
        :param retry_attempts: number of re-checks; NOT_GIVEN means 0
        :param retry_backoff: seconds to sleep between re-checks
        :return: tuple of ``(ShardFileReader, metadata)``
        :raises KeyError: when the key is not present after all attempts
        """
        if retry is NOT_GIVEN:
            retry = False
        if retry_attempts is NOT_GIVEN:
            retry_attempts = 0

        if retry and retry_attempts > 0:
            for _attempt in range(retry_attempts):
                if key in self:
                    break
                # we didn't find the key, wait retry_backoff N seconds, and re-check
                time.sleep(retry_backoff)

        if key not in self:
            # log.error, not log.exception: there is no active exception here, so
            # log.exception would only append a meaningless empty traceback
            log.error('requested key=%s not found in %s retry=%s, attempts=%s',
                      key, self, retry, retry_attempts)
            raise KeyError(key)

        key_file, key_file_path = self._get_keyfile(key)
        with self.fs.open(key_file_path, 'rb') as f:
            metadata = json.loads(f.read())

        archive_path = metadata['archive_full_path']

        try:
            return ShardFileReader(self.fs.open(archive_path, 'rb')), metadata
        finally:
            # update usage stats, count and accessed
            # (runs in `finally`, so it also happens when opening the archive raised;
            # the returned metadata dict is this same, updated object)
            metadata["access_count"] = metadata.get("access_count", 0) + 1
            metadata["access_time"] = time.time()
            log.debug('Updated %s with access snapshot, access_count=%s access_time=%s',
                      key_file, metadata['access_count'], metadata['access_time'])
            with self.fs.open(key_file_path, 'wb') as f:
                f.write(json.dumps(metadata))

    def _remove(self, key):
        """Remove the archive and the key file stored under `key`.

        :param key: archive key to remove
        :return: 1 once both files were removed
        :raises KeyError: when the key is not present in this shard
        """
        if key not in self:
            # no active exception at this point, so log.error rather than log.exception
            log.error('requested key=%s not found in %s', key, self)
            raise KeyError(key)

        key_file, key_file_path = self._get_keyfile(key)
        with self.fs.open(key_file_path, 'rb') as f:
            metadata = json.loads(f.read())

        archive_path = metadata['archive_full_path']
        # drop the binary first, then the key file that marks its presence
        self.fs.rm(archive_path)
        self.fs.rm(key_file_path)
        return 1

    @property
    def storage_medium(self):
        """Value of the instance attribute named by `storage_type`."""
        return getattr(self, self.storage_type)

    @property
    def key_suffix(self):
        """Filename suffix identifying key (metadata) files."""
        return 'key.json'

    def __contains__(self, key):
        """Return `True` if `key` matching item is found in cache.

        :param key: key matching item
        :return: True if key matching item
        """
        key_file, key_file_path = self._get_keyfile(key)
        return self.fs.exists(key_file_path)
class BaseCache:
    """Base class for an archive cache spread over several shards.

    Store/fetch/remove/lookup are delegated to the shard picked by
    `_get_shard`. Subclasses implement `_get_shard` and `_get_size`, and are
    expected to provide ``_shards`` as well as the ``_eviction_policy`` and
    ``_cache_size_limit`` attributes that `evict` falls back to.
    """

    _locking_url: str = ''
    _storage_path: str = ''
    # NOTE(review): class-level dict; `get_conf(pop=True)` mutates it, so subclasses
    # presumably assign their own instance-level config - confirm
    _config = {}
    retry = False
    retry_attempts = 0
    retry_backoff = 1
    _shards = tuple()

    def __contains__(self, key):
        """Return `True` if `key` matching item is found in cache.

        :param key: key matching item
        :return: True if key matching item
        """
        return self.has_key(key)

    def __repr__(self):
        return f'<{self.__class__.__name__}(storage={self._storage_path})>'

    @classmethod
    def gb_to_bytes(cls, gb):
        """Convert a size given in GiB (1024 ** 3 bytes) to bytes."""
        return gb * (1024 ** 3)

    @property
    def storage_path(self):
        """Configured storage path of this cache."""
        return self._storage_path

    @classmethod
    def get_stats_db(cls):
        """Return a fresh `StatsDB` used to collect and sort key statistics."""
        return StatsDB()

    def get_conf(self, key, pop=False):
        """Return the configuration value stored under `key`.

        :param key: configuration key to read
        :param pop: when True, the key is deleted from the config after reading
        :raises ValueError: when the key is not present in the config
        """
        if key not in self._config:
            raise ValueError(f"No configuration key '{key}', please make sure it exists in archive_cache config")
        val = self._config[key]
        if pop:
            del self._config[key]
        return val

    def _get_shard(self, key):
        """Return the shard responsible for `key`; implemented by subclasses."""
        raise NotImplementedError

    def _get_size(self, shard, archive_path):
        """Return the size of `archive_path` on `shard`; implemented by subclasses."""
        raise NotImplementedError

    def store(self, key, value_reader, metadata=None):
        """Store the content of `value_reader` under `key` in the matching shard."""
        shard = self._get_shard(key)
        return shard.store(key, value_reader, metadata)

    def fetch(self, key, retry=NOT_GIVEN, retry_attempts=NOT_GIVEN) -> tuple[typing.BinaryIO, dict]:
        """
        Return file handle corresponding to `key` from specific shard cache.

        `retry` and `retry_attempts` default to the values configured on the
        cache instance; `retry_backoff` is always taken from the instance.
        """
        if retry is NOT_GIVEN:
            retry = self.retry
        if retry_attempts is NOT_GIVEN:
            retry_attempts = self.retry_attempts
        retry_backoff = self.retry_backoff

        shard = self._get_shard(key)
        return shard.fetch(key, retry=retry, retry_attempts=retry_attempts, retry_backoff=retry_backoff)

    def remove(self, key):
        """Remove `key` from the matching shard and return the shard's result."""
        shard = self._get_shard(key)
        return shard.remove(key)

    def has_key(self, archive_key):
        """Return `True` if `key` matching item is found in cache.

        :param archive_key: key for item, this is a unique archive name we want to store data under. e.g my-archive-svn.zip
        :return: True if key is found
        """
        shard = self._get_shard(archive_key)
        return archive_key in shard

    def iter_keys(self):
        """Yield ``(shard, key_file_name)`` for every key file found in every shard.

        Only the bare file name is yielded; callers join it with
        ``shard.storage_medium`` to build the full path.
        """
        for shard in self._shards:
            if shard.fs.exists(shard.storage_medium):
                for _path, _dirs, _files in shard.fs.walk(shard.storage_medium):
                    for key_file_path in _files:
                        if key_file_path.endswith(shard.key_suffix):
                            yield shard, key_file_path

    def get_lock(self, lock_key):
        """Return a `GenerationLock` for `lock_key` using the configured locking url."""
        return GenerationLock(lock_key, self._locking_url)

    def evict(self, policy=None, size_limit=None) -> int:
        """
        Remove old items based on the conditions

        explanation of this algo:
        iterate over each shard, then for each shard iterate over the .key files
        read the key files metadata stored. This gives us a full list of keys, cached_archived, their size and
        access data, time creation, and access counts.

        Store that into a memory DB so we can run different sorting strategies easily.
        Summing the size is a sum sql query.

        Then we run a sorting strategy based on eviction policy.
        We iterate over sorted keys, and remove each until the total size is within the limit.

        :param policy: eviction policy name; falls back to ``self._eviction_policy``
        :param size_limit: size limit in bytes; falls back to ``self._cache_size_limit``
        :return: number of removed archives
        """
        policy = policy or self._eviction_policy
        size_limit = size_limit or self._cache_size_limit

        select_policy = EVICTION_POLICY[policy]['evict']

        log.debug('Running eviction policy \'%s\', and checking for size limit: %s',
                  policy, format_size(size_limit))

        if select_policy is None:
            # this policy has no eviction query defined, nothing to do
            return 0

        db = self.get_stats_db()

        data = []
        for cnt, (shard, key_file) in enumerate(self.iter_keys(), start=1):
            key_file_path = os.path.join(shard.storage_medium, key_file)
            with shard.fs.open(key_file_path, 'rb') as f:
                metadata = json.loads(f.read())

            archive_key = metadata['archive_key']
            archive_path = metadata['archive_full_path']

            size = metadata.get('size')
            if not size:
                # in case we don't have size re-calc it...
                size = self._get_size(shard, archive_path)

            data.append([
                cnt,
                key_file,
                key_file_path,
                archive_key,
                archive_path,
                metadata.get('store_time', 0),
                metadata.get('access_time', 0),
                metadata.get('access_count', 0),
                size,
            ])

        # Insert bulk data using executemany
        db.bulk_insert(data)

        total_size = db.get_total_size()
        log.debug('Analyzed %s keys, occupying: %s, running eviction to match %s',
                  len(data), format_size(total_size), format_size(size_limit))

        removed_items = 0
        removed_size = 0
        for _key_file, archive_key, size in db.get_sorted_keys(select_policy):
            # check the CURRENT total before removing: stopping on the simulated
            # post-removal size would leave the cache above the limit by one entry
            if total_size <= size_limit:
                # we obtained what we wanted...
                break

            self.remove(archive_key)
            total_size -= size
            removed_items += 1
            removed_size += size

        log.debug('Removed %s cache archives, and reduced size by: %s',
                  removed_items, format_size(removed_size))
        return removed_items

    def get_statistics(self):
        """Return ``(total_files, total_size, meta)`` computed from all key files.

        ``meta`` is currently always an empty dict.
        """
        total_files = 0
        total_size = 0
        meta = {}

        for shard, key_file in self.iter_keys():
            json_key = f"{shard.storage_medium}/{key_file}"
            with shard.fs.open(json_key, 'rb') as f:
                metadata = json.loads(f.read())

            total_files += 1
            size = metadata.get('size')
            if not size:
                # same fallback as in evict(): re-calc the size when it was not recorded
                size = self._get_size(shard, metadata['archive_full_path'])
            total_size += size

        return total_files, total_size, meta