rhodecode-vcsserver Commit - r1242:8380b87c

archive-cache: synced with CE lib

super-admin -

r1242:8380b87c default

parent child

vcsserver/lib/rc_cache/archive_cache/fanout_cache.py

0 +155 -2

@@ -23,6 +23,7 b' import logging'
23	import time	23	import time
24	import typing	24	import typing
25	import zlib	25	import zlib
		26	import sqlite3
26		27
27	from vcsserver.lib.rc_json import json	28	from vcsserver.lib.rc_json import json
28	from .lock import GenerationLock	29	from .lock import GenerationLock
@@ -37,6 +38,72 b' NO_VAL = -917'
37	MODE_BINARY = 'BINARY'	38	MODE_BINARY = 'BINARY'
38		39
39		40
		41	EVICTION_POLICY = {
		42	'none': {
		43	'evict': None,
		44	},
		45	'least-recently-stored': {
		46	'evict': 'SELECT {fields} FROM archive_cache ORDER BY store_time',
		47	},
		48	'least-recently-used': {
		49	'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_time',
		50	},
		51	'least-frequently-used': {
		52	'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_count',
		53	},
		54	}
		55
		56
		57	class DB:
		58
		59	def __init__(self):
		60	self.connection = sqlite3.connect(':memory:')
		61	self._init_db()
		62
		63	def _init_db(self):
		64	qry = '''
		65	CREATE TABLE IF NOT EXISTS archive_cache (
		66	rowid INTEGER PRIMARY KEY,
		67	key_file TEXT,
		68	key_file_path TEXT,
		69	filename TEXT,
		70	full_path TEXT,
		71	store_time REAL,
		72	access_time REAL,
		73	access_count INTEGER DEFAULT 0,
		74	size INTEGER DEFAULT 0
		75	)
		76	'''
		77
		78	self.sql(qry)
		79	self.connection.commit()
		80
		81	@property
		82	def sql(self):
		83	return self.connection.execute
		84
		85	def bulk_insert(self, rows):
		86	qry = '''
		87	INSERT INTO archive_cache (
		88	rowid,
		89	key_file,
		90	key_file_path,
		91	filename,
		92	full_path,
		93	store_time,
		94	access_time,
		95	access_count,
		96	size
		97	)
		98	VALUES (
		99	?, ?, ?, ?, ?, ?, ?, ?, ?
		100	)
		101	'''
		102	cursor = self.connection.cursor()
		103	cursor.executemany(qry, rows)
		104	self.connection.commit()
		105
		106
40	class FileSystemCache:	107	class FileSystemCache:
41		108
42	def __init__(self, index, directory, **settings):	109	def __init__(self, index, directory, **settings):
@@ -77,10 +144,13 b' class FileSystemCache:'
77	# STORE METADATA	144	# STORE METADATA
78	_metadata = {	145	_metadata = {
79	"version": "v1",	146	"version": "v1",
80	"timestamp": time.time(),
81	"filename": filename,	147	"filename": filename,
82	"full_path": full_path,	148	"full_path": full_path,
83	"key_file": key_file,	149	"key_file": key_file,
		150	"store_time": time.time(),
		151	"access_count": 1,
		152	"access_time": 0,
		153	"size": 0
84	}	154	}
85	if metadata:	155	if metadata:
86	_metadata.update(metadata)	156	_metadata.update(metadata)
@@ -89,6 +159,7 b' class FileSystemCache:'
89		159
90	iterator = iter(reader, b'')	160	iterator = iter(reader, b'')
91	size = self._write_file(full_path, iterator, 'xb')	161	size = self._write_file(full_path, iterator, 'xb')
		162	metadata['size'] = size
92		163
93	# after archive is finished, we create a key to save the presence of the binary file	164	# after archive is finished, we create a key to save the presence of the binary file
94	with open(key_file, 'wb') as f:	165	with open(key_file, 'wb') as f:
@@ -106,7 +177,15 b' class FileSystemCache:'
106		177
107	filename = metadata['filename']	178	filename = metadata['filename']
108		179
		180	try:
109	return open(os.path.join(self._directory, filename), 'rb'), metadata	181	return open(os.path.join(self._directory, filename), 'rb'), metadata
		182	finally:
		183	# update usage stats, count and accessed
		184	metadata["access_count"] = metadata.get("access_count", 0) + 1
		185	metadata["access_time"] = time.time()
		186
		187	with open(key_file, 'wb') as f:
		188	f.write(json.dumps(metadata))
110		189
111	def random_filename(self):	190	def random_filename(self):
112	"""Return filename and full-path tuple for file storage.	191	"""Return filename and full-path tuple for file storage.
@@ -168,6 +247,9 b' class FanoutCache:'
168	self._count = settings.pop('cache_shards')	247	self._count = settings.pop('cache_shards')
169	self._locking_url = settings.pop('locking_url')	248	self._locking_url = settings.pop('locking_url')
170		249
		250	self._eviction_policy = settings['cache_eviction_policy']
		251	self._cache_size_limit = settings['cache_size_limit']
		252
171	self._shards = tuple(	253	self._shards = tuple(
172	FileSystemCache(	254	FileSystemCache(
173	index=num,	255	index=num,
@@ -209,6 +291,78 b' class FanoutCache:'
209	def __contains__(self, item):	291	def __contains__(self, item):
210	return self.has_key(item)	292	return self.has_key(item)
211		293
		294	def evict(self, policy=None, size_limit=None):
		295	"""
		296	Remove old items based on the conditions
		297
		298
		299	explanation of this algo:
		300	iterate over each shard, then for each shard iterate over the .key files
		301	read the key files metadata stored. This gives us a full list of keys, cached_archived, their size and
		302	access data, time creation, and access counts.
		303
		304	Store that into a memory DB so we can run different sorting strategies easily.
		305	Summing the size is a sum sql query.
		306
		307	Then we run a sorting strategy based on eviction policy.
		308	We iterate over sorted keys, and remove each checking if we hit the overall limit.
		309	"""
		310
		311	policy = policy or self._eviction_policy
		312	size_limit = size_limit or self._cache_size_limit
		313
		314	select_policy = EVICTION_POLICY[policy]['evict']
		315
		316	if select_policy is None:
		317	return 0
		318
		319	db = DB()
		320
		321	data = []
		322	cnt = 1
		323	for shard in self._shards:
		324	for key_file in os.listdir(shard._directory):
		325	if key_file.endswith('.key'):
		326	key_file_path = os.path.join(shard._directory, key_file)
		327	with open(key_file_path, 'rb') as f:
		328	metadata = json.loads(f.read())
		329	# in case we don't have size re-calc it...
		330	if not metadata.get('size'):
		331	fn = metadata.get('full_path')
		332	size = os.stat(fn).st_size
		333
		334	data.append([
		335	cnt,
		336	key_file,
		337	key_file_path,
		338	metadata.get('filename'),
		339	metadata.get('full_path'),
		340	metadata.get('store_time', 0),
		341	metadata.get('access_time', 0),
		342	metadata.get('access_count', 0),
		343	metadata.get('size', size),
		344	])
		345	cnt += 1
		346
		347	# Insert bulk data using executemany
		348	db.bulk_insert(data)
		349
		350	((total_size,),) = db.sql('SELECT COALESCE(SUM(size), 0) FROM archive_cache').fetchall()
		351
		352	select_policy_qry = select_policy.format(fields='key_file_path, full_path, size')
		353	sorted_keys = db.sql(select_policy_qry).fetchall()
		354
		355	for key, cached_file, size in sorted_keys:
		356	# simulate removal impact BEFORE removal
		357	total_size -= size
		358	if total_size <= size_limit:
		359	# we obtained what we wanted...
		360	break
		361
		362	os.remove(cached_file)
		363	os.remove(key)
		364	return
		365
212		366
213	def get_archival_config(config):	367	def get_archival_config(config):
214		368
@@ -255,4 +409,3 b' def get_archival_cache_store(config):'
255	)	409	)
256	cache_meta = d_cache	410	cache_meta = d_cache
257	return cache_meta	411	return cache_meta
258

vcsserver/lib/rc_cache/archive_cache/utils.py

0 +42 0

@@ -15,6 +15,8 b''
15	# along with this program; if not, write to the Free Software Foundation,	15	# along with this program; if not, write to the Free Software Foundation,
16	# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA	16	# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17		17
		18	import os
		19
18		20
19	class ArchiveCacheLock(Exception):	21	class ArchiveCacheLock(Exception):
20	pass	22	pass
@@ -27,3 +29,43 b' def archive_iterator(_reader, block_size'
27	if not data:	29	if not data:
28	break	30	break
29	yield data	31	yield data
		32
		33
		34	def get_directory_statistics(start_path):
		35	"""
		36	total_files, total_size, directory_stats = get_directory_statistics(start_path)
		37
		38	print(f"Directory statistics for: {start_path}\n")
		39	print(f"Total files: {total_files}")
		40	print(f"Total size: {format_size(total_size)}\n")
		41
		42	:param start_path:
		43	:return:
		44	"""
		45
		46	total_files = 0
		47	total_size = 0
		48	directory_stats = {}
		49
		50	for dir_path, dir_names, file_names in os.walk(start_path):
		51	dir_size = 0
		52	file_count = len(file_names)
		53
		54	for file in file_names:
		55	filepath = os.path.join(dir_path, file)
		56	file_size = os.path.getsize(filepath)
		57	dir_size += file_size
		58
		59	directory_stats[dir_path] = {'file_count': file_count, 'size': dir_size}
		60	total_files += file_count
		61	total_size += dir_size
		62
		63	return total_files, total_size, directory_stats
		64
		65
		66	def format_size(size):
		67	# Convert size in bytes to a human-readable format (e.g., KB, MB, GB)
		68	for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
		69	if size < 1024:
		70	return f"{size:.2f} {unit}"
		71	size /= 1024

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages