# RhodeCode VCSServer provides access to different vcs backends via network.
# Copyright (C) 2014-2024 RhodeCode GmbH
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

import codecs
import contextlib
import functools
import os
import logging
import time
import typing
import zlib
import sqlite3

from vcsserver.lib.rc_json import json
from .lock import GenerationLock

log = logging.getLogger(__name__)

cache_meta = None

UNKNOWN = -241
NO_VAL = -917

MODE_BINARY = 'BINARY'


EVICTION_POLICY = {
    'none': {
        'evict': None,
    },
    'least-recently-stored': {
        'evict': 'SELECT {fields} FROM archive_cache ORDER BY store_time',
    },
    'least-recently-used': {
        'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_time',
    },
    'least-frequently-used': {
        'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_count',
    },
}
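
# Note: FanoutCache.evict() below fills the {fields} placeholder before running
# the query; for example, the 'least-recently-stored' policy expands to:
#   'SELECT key_file_path, full_path, size FROM archive_cache ORDER BY store_time'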


class DB:

    def __init__(self):
        self.connection = sqlite3.connect(':memory:')
        self._init_db()

    def _init_db(self):
        qry = '''
        CREATE TABLE IF NOT EXISTS archive_cache (
            rowid INTEGER PRIMARY KEY,
            key_file TEXT,
            key_file_path TEXT,
            filename TEXT,
            full_path TEXT,
            store_time REAL,
            access_time REAL,
            access_count INTEGER DEFAULT 0,
            size INTEGER DEFAULT 0
        )
        '''

        self.sql(qry)
        self.connection.commit()

    @property
    def sql(self):
        return self.connection.execute

    def bulk_insert(self, rows):
        qry = '''
        INSERT INTO archive_cache (
            rowid,
            key_file,
            key_file_path,
            filename,
            full_path,
            store_time,
            access_time,
            access_count,
            size
        )
        VALUES (
            ?, ?, ?, ?, ?, ?, ?, ?, ?
        )
        '''
        cursor = self.connection.cursor()
        cursor.executemany(qry, rows)
        self.connection.commit()


class FileSystemCache:

    def __init__(self, index, directory, **settings):
        self._index = index
        self._directory = directory

    def _write_file(self, full_path, iterator, mode, encoding=None):
        full_dir, _ = os.path.split(full_path)

        for count in range(1, 11):
            with contextlib.suppress(OSError):
                os.makedirs(full_dir)

            try:
                # Another cache may have deleted the directory before
                # the file could be opened.
                writer = open(full_path, mode, encoding=encoding)
            except OSError:
                if count == 10:
                    # Give up after 10 tries to open the file.
                    raise
                continue

            with writer:
                size = 0
                for chunk in iterator:
                    size += len(chunk)
                    writer.write(chunk)
                return size

    def _get_keyfile(self, key):
        return os.path.join(self._directory, f'{key}.key')

    def store(self, key, value_reader, metadata):
        filename, full_path = self.random_filename()
        key_file = self._get_keyfile(key)

        # STORE METADATA
        _metadata = {
            "version": "v1",
            "filename": filename,
            "full_path": full_path,
            "key_file": key_file,
            "store_time": time.time(),
            "access_count": 1,
            "access_time": 0,
            "size": 0
        }
        if metadata:
            _metadata.update(metadata)

        reader = functools.partial(value_reader.read, 2**22)

        iterator = iter(reader, b'')
        size = self._write_file(full_path, iterator, 'xb')
        # record the real size in the persisted metadata (assigning to `metadata`
        # would fail when it is None and would never reach the key file)
        _metadata['size'] = size

        # after the archive is finished, we create a key file to mark the presence of the binary file
        with open(key_file, 'wb') as f:
            f.write(json.dumps(_metadata))

        return key, size, MODE_BINARY, filename, _metadata
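
    # Illustrative shape of the JSON persisted in a `.key` file by store()
    # (all concrete values below are made up):
    #   {"version": "v1",
    #    "filename": "d3/b0/7384d113edec49eaa6238ad5ff00.archive_cache",
    #    "full_path": "/base/shard_000/d3/b0/7384d113edec49eaa6238ad5ff00.archive_cache",
    #    "key_file": "/base/shard_000/example-archive.tar.gz.key",
    #    "store_time": 1712345678.9, "access_count": 1, "access_time": 0,
    #    "size": 1048576}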

    def fetch(self, key) -> tuple[typing.BinaryIO, dict]:
        if key not in self:
            raise KeyError(key)

        key_file = self._get_keyfile(key)
        with open(key_file, 'rb') as f:
            metadata = json.loads(f.read())

        filename = metadata['filename']

        try:
            return open(os.path.join(self._directory, filename), 'rb'), metadata
        finally:
            # update usage stats, count and accessed
            metadata["access_count"] = metadata.get("access_count", 0) + 1
            metadata["access_time"] = time.time()

            with open(key_file, 'wb') as f:
                f.write(json.dumps(metadata))

    def random_filename(self):
        """Return filename and full-path tuple for file storage.

        The filename is a randomly generated 28 character hexadecimal string
        suffixed with ".archive_cache". Two levels of sub-directories are used
        to keep individual directories small; on older filesystems, lookups in
        directories with many files may be slow.
        """

        hex_name = codecs.encode(os.urandom(16), 'hex').decode('utf-8')
        sub_dir = os.path.join(hex_name[:2], hex_name[2:4])
        name = hex_name[4:] + '.archive_cache'
        filename = os.path.join(sub_dir, name)
        full_path = os.path.join(self._directory, filename)
        return filename, full_path
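
    # For example (the hex digits are random; this layout is only illustrative):
    #   hex_name  = 'd3b07384d113edec49eaa6238ad5ff00'
    #   filename  = 'd3/b0/7384d113edec49eaa6238ad5ff00.archive_cache'
    #   full_path = '<self._directory>/d3/b0/7384d113edec49eaa6238ad5ff00.archive_cache'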

    def hash(self, key):
        """Compute portable hash for `key`.

        :param key: key to hash
        :return: hash value

        """
        mask = 0xFFFFFFFF
        return zlib.adler32(key.encode('utf-8')) & mask  # noqa

    def __contains__(self, key):
        """Return `True` if an item matching `key` is found in the cache.

        :param key: key of the item
        :return: True if a matching item exists

        """
        key_file = self._get_keyfile(key)
        return os.path.exists(key_file)


class FanoutCache:
    """Cache that shards keys and values."""

    def __init__(
            self, directory=None, **settings
    ):
        """Initialize cache instance.

        :param str directory: cache directory
        :param settings: settings dict

        """
        if directory is None:
            raise ValueError('directory cannot be None')

        directory = str(directory)
        directory = os.path.expanduser(directory)
        directory = os.path.expandvars(directory)
        self._directory = directory

        self._count = settings.pop('cache_shards')
        self._locking_url = settings.pop('locking_url')

        self._eviction_policy = settings['cache_eviction_policy']
        self._cache_size_limit = settings['cache_size_limit']

        self._shards = tuple(
            FileSystemCache(
                index=num,
                directory=os.path.join(directory, 'shard_%03d' % num),
                **settings,
            )
            for num in range(self._count)
        )
        self._hash = self._shards[0].hash
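
    # A sketch of the settings expected here, mirroring what
    # get_archival_cache_store() passes (the concrete values are assumptions):
    #
    #   FanoutCache(
    #       '/var/opt/rhodecode_data/archive_cache',
    #       locking_url='redis://localhost:6379/0',
    #       cache_shards=8,
    #       cache_size_limit=10 * 1024 ** 3,
    #       cache_eviction_policy='least-recently-stored',
    #   )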

    def get_lock(self, lock_key):
        return GenerationLock(lock_key, self._locking_url)

    def _get_shard(self, key) -> FileSystemCache:
        index = self._hash(key) % self._count
        shard = self._shards[index]
        return shard
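
    # Routing example: with cache_shards=8, a given key always maps to the same
    # shard via `self._hash(key) % 8`, so store() and fetch() agree on placement.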

    def store(self, key, value_reader, metadata=None):
        shard = self._get_shard(key)
        return shard.store(key, value_reader, metadata)

    def fetch(self, key):
        """Return file handle corresponding to `key` from cache."""
        shard = self._get_shard(key)
        return shard.fetch(key)

    def has_key(self, key):
        """Return `True` if an item matching `key` is found in the cache.

        :param key: key for item
        :return: True if key is found

        """
        shard = self._get_shard(key)
        return key in shard

    def __contains__(self, item):
        return self.has_key(item)

    def evict(self, policy=None, size_limit=None):
        """
        Remove old items based on the given eviction policy and size limit.

        How this works:
        Iterate over each shard, and for each shard iterate over its `.key` files,
        reading the metadata stored in them. This gives us a full list of keys and
        cached archives, together with their size, creation time and access counts.

        That list is loaded into an in-memory SQLite DB, so different sorting
        strategies can be applied easily; summing the sizes is a single SUM query.

        The sorting strategy is selected by the eviction policy. We then iterate
        over the sorted keys and remove entries until the total size drops below
        the overall limit.
        """

        policy = policy or self._eviction_policy
        size_limit = size_limit or self._cache_size_limit

        select_policy = EVICTION_POLICY[policy]['evict']

        if select_policy is None:
            return 0

        db = DB()

        data = []
        cnt = 1
        for shard in self._shards:
            for key_file in os.listdir(shard._directory):
                if key_file.endswith('.key'):
                    key_file_path = os.path.join(shard._directory, key_file)
                    with open(key_file_path, 'rb') as f:
                        metadata = json.loads(f.read())

                    size = metadata.get('size')
                    # in case we don't have a stored size, re-calc it from the file...
                    if not size:
                        fn = metadata.get('full_path')
                        size = os.stat(fn).st_size

                    data.append([
                        cnt,
                        key_file,
                        key_file_path,
                        metadata.get('filename'),
                        metadata.get('full_path'),
                        metadata.get('store_time', 0),
                        metadata.get('access_time', 0),
                        metadata.get('access_count', 0),
                        size,
                    ])
                    cnt += 1

        # Insert bulk data using executemany
        db.bulk_insert(data)

        ((total_size,),) = db.sql('SELECT COALESCE(SUM(size), 0) FROM archive_cache').fetchall()

        select_policy_qry = select_policy.format(fields='key_file_path, full_path, size')
        sorted_keys = db.sql(select_policy_qry).fetchall()

        for key, cached_file, size in sorted_keys:
            # simulate removal impact BEFORE removal
            total_size -= size
            if total_size <= size_limit:
                # we obtained what we wanted...
                break

            os.remove(cached_file)
            os.remove(key)
        return
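
    # An illustrative call (this module does not invoke evict() by itself);
    # trimming the cache down to roughly 10 GB with the least-recently-stored policy:
    #
    #   d_cache.evict(policy='least-recently-stored', size_limit=10 * 1024 ** 3)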


def get_archival_config(config):

    final_config = {}

    for k, v in config.items():
        if k.startswith('archive_cache'):
            final_config[k] = v

    return final_config


def get_archival_cache_store(config):

    global cache_meta
    if cache_meta is not None:
        return cache_meta

    config = get_archival_config(config)
    backend = config['archive_cache.backend.type']
    if backend != 'filesystem':
        raise ValueError('archive_cache.backend.type only supports "filesystem"')

    archive_cache_locking_url = config['archive_cache.locking.url']
    archive_cache_dir = config['archive_cache.filesystem.store_dir']
    archive_cache_size_gb = config['archive_cache.filesystem.cache_size_gb']
    archive_cache_shards = config['archive_cache.filesystem.cache_shards']
    archive_cache_eviction_policy = config['archive_cache.filesystem.eviction_policy']

    log.debug('Initializing archival cache instance under %s', archive_cache_dir)

    # check if it's ok to write, and re-create the archive cache
    if not os.path.isdir(archive_cache_dir):
        os.makedirs(archive_cache_dir, exist_ok=True)

    d_cache = FanoutCache(
        archive_cache_dir,
        locking_url=archive_cache_locking_url,
        cache_shards=archive_cache_shards,
        cache_size_limit=archive_cache_size_gb * 1024 * 1024 * 1024,
        cache_eviction_policy=archive_cache_eviction_policy
    )
    cache_meta = d_cache
    return cache_meta
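

if __name__ == '__main__':
    # A minimal, illustrative smoke test; not part of the vcsserver runtime.
    # The config keys mirror the ones read in get_archival_cache_store(); the
    # concrete values are assumptions for local experimentation only.
    import io
    import tempfile

    logging.basicConfig(level=logging.DEBUG)

    example_config = {
        'archive_cache.backend.type': 'filesystem',
        'archive_cache.locking.url': 'redis://localhost:6379/0',
        'archive_cache.filesystem.store_dir': tempfile.mkdtemp(prefix='archive_cache_'),
        'archive_cache.filesystem.cache_size_gb': 1,
        'archive_cache.filesystem.cache_shards': 2,
        'archive_cache.filesystem.eviction_policy': 'least-recently-stored',
    }

    d_cache = get_archival_cache_store(example_config)

    key = 'example-archive.tar.gz'
    d_cache.store(key, io.BytesIO(b'example archive payload'), {'commit_id': 'deadbeef'})

    reader, metadata = d_cache.fetch(key)
    with reader:
        log.debug('fetched %d bytes, metadata: %s', len(reader.read()), metadata)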