rhodecode-enterprise-ce Commit - r5425:9c658c9d

1

2

#

2

#

3

# This program is free software: you can redistribute it and/or modify

3

# This program is free software: you can redistribute it and/or modify

4

# it under the terms of the GNU Affero General Public License, version 3

4

# it under the terms of the GNU Affero General Public License, version 3

5

# (only), as published by the Free Software Foundation.

5

# (only), as published by the Free Software Foundation.

6

#

6

#

7

# This program is distributed in the hope that it will be useful,

7

# This program is distributed in the hope that it will be useful,

8

# but WITHOUT ANY WARRANTY; without even the implied warranty of

8

# but WITHOUT ANY WARRANTY; without even the implied warranty of

9

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

9

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

10

# GNU General Public License for more details.

10

# GNU General Public License for more details.

11

#

11

#

12

# You should have received a copy of the GNU Affero General Public License

12

# You should have received a copy of the GNU Affero General Public License

13

# along with this program. If not, see <http://www.gnu.org/licenses/>.

13

# along with this program. If not, see <http://www.gnu.org/licenses/>.

14

#

14

#

15

# This program is dual-licensed. If you wish to learn more about the

15

# This program is dual-licensed. If you wish to learn more about the

16

# RhodeCode Enterprise Edition, including its added features, Support services,

16

# RhodeCode Enterprise Edition, including its added features, Support services,

17

# and proprietary license terms, please see https://rhodecode.com/licenses/

17

# and proprietary license terms, please see https://rhodecode.com/licenses/

18

19

import codecs

19

import codecs

20

import contextlib

20

import contextlib

21

import functools

21

import functools

22

import os

22

import os

23

import logging

23

import logging

24

import time

24

import time

25

import typing

25

import typing

26

import zlib

26

import zlib

27

import sqlite3

27

import sqlite3

28

29

from ...ext_json import json

29

from ...ext_json import json

30

from .lock import GenerationLock

30

from .lock import GenerationLock

31

from .utils import format_size

31

from .utils import format_size

32

33

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Process-wide cache instance; get_archival_cache_store() returns it when it
# is already set.
cache_meta = None

# Integer marker constants (not referenced in the visible part of this module).
UNKNOWN = -241
NO_VAL = -917

# Storage mode reported by FileSystemCache.store() for archives it writes.
MODE_BINARY = 'BINARY'


# Maps an eviction policy name to the SELECT statement that yields cache
# entries in the order they should be evicted. ``{fields}`` is filled in by the
# caller via str.format(); a ``None`` query means eviction is disabled.
EVICTION_POLICY = {
    'none': {'evict': None},
    'least-recently-stored': {'evict': 'SELECT {fields} FROM archive_cache ORDER BY store_time'},
    'least-recently-used': {'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_time'},
    'least-frequently-used': {'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_count'},
}

57

58

59

class DB:
    """In-memory SQLite database holding one row per cached archive.

    The single ``archive_cache`` table is created when the object is built.
    """

    # Schema of the only table; created once per instance.
    _CREATE_TABLE = '''
        CREATE TABLE IF NOT EXISTS archive_cache (
            rowid INTEGER PRIMARY KEY,
            key_file TEXT,
            key_file_path TEXT,
            filename TEXT,
            full_path TEXT,
            store_time REAL,
            access_time REAL,
            access_count INTEGER DEFAULT 0,
            size INTEGER DEFAULT 0
        )
    '''

    # Parametrised insert; one placeholder per column, in schema order.
    _INSERT_ROW = '''
        INSERT INTO archive_cache (
            rowid,
            key_file,
            key_file_path,
            filename,
            full_path,
            store_time,
            access_time,
            access_count,
            size
        )
        VALUES (
            ?, ?, ?, ?, ?, ?, ?, ?, ?
        )
    '''

    def __init__(self):
        """Open a private in-memory database and create the table."""
        self.connection = sqlite3.connect(':memory:')
        self._init_db()

    def _init_db(self):
        """Create the ``archive_cache`` table if it does not exist yet."""
        self.sql(self._CREATE_TABLE)
        self.connection.commit()

    @property
    def sql(self):
        """Return the connection's ``execute`` callable for ad-hoc queries."""
        return self.connection.execute

    def bulk_insert(self, rows):
        """Insert many rows at once and commit.

        :param rows: iterable of 9-item sequences ordered as
            (rowid, key_file, key_file_path, filename, full_path,
            store_time, access_time, access_count, size)
        """
        self.connection.executemany(self._INSERT_ROW, rows)
        self.connection.commit()

107

108

109

class FileSystemCache:
    """One cache shard that keeps archives as plain files in a directory.

    Each stored entry consists of two files inside the shard directory:
    a randomly named ``*.archive_cache`` file with the payload, and a
    ``<key>.key`` file with the JSON metadata describing that payload.
    """

    def __init__(self, index, directory, **settings):
        """
        :param index: number of this shard, as assigned by the creator
        :param directory: directory in which this shard keeps its files
        :param settings: extra settings; accepted but not used here
        """
        self._index = index
        self._directory = directory

    def _write_file(self, full_path, iterator, mode, encoding=None):
        """Write all chunks of `iterator` into `full_path`.

        The parent directory is created when missing. Opening the file is
        attempted up to 10 times before the ``OSError`` is propagated.

        :param full_path: destination file path
        :param iterator: iterable of chunks accepted by ``write()``
        :param mode: mode passed to ``open()``
        :param encoding: encoding passed to ``open()``
        :return: sum of ``len(chunk)`` over all written chunks
        :raises OSError: when the file cannot be opened after 10 tries
        """
        full_dir, _ = os.path.split(full_path)

        for count in range(1, 11):
            # best-effort: the directory usually exists already
            with contextlib.suppress(OSError):
                os.makedirs(full_dir)

            try:
                # Another cache may have deleted the directory before
                # the file could be opened.
                writer = open(full_path, mode, encoding=encoding)
            except OSError:
                if count == 10:
                    # Give up after 10 tries to open the file.
                    raise
                continue

            with writer:
                size = 0
                for chunk in iterator:
                    size += len(chunk)
                    writer.write(chunk)
                return size

    def _get_keyfile(self, key):
        """Return the path of the metadata file belonging to `key`."""
        return os.path.join(self._directory, f'{key}.key')

    def store(self, key, value_reader, metadata):
        """Store the content of `value_reader` under `key`.

        :param key: cache key
        :param value_reader: object with a ``read(size)`` method returning bytes
        :param metadata: optional dict merged over the generated metadata;
            may be ``None``
        :return: tuple ``(key, size, MODE_BINARY, filename, metadata)`` where
            ``metadata`` is the dict that was written to the key file
        """
        filename, full_path = self.random_filename()
        key_file = self._get_keyfile(key)

        # STORE METADATA
        _metadata = {
            "version": "v1",
            "filename": filename,
            "full_path": full_path,
            "key_file": key_file,
            "store_time": time.time(),
            "access_count": 1,
            "access_time": 0,
            "size": 0
        }
        if metadata:
            _metadata.update(metadata)

        # read the source in 4 MiB chunks until an empty bytes object signals EOF
        reader = functools.partial(value_reader.read, 2**22)

        iterator = iter(reader, b'')
        # 'x' mode: fail instead of overwriting an existing archive file
        size = self._write_file(full_path, iterator, 'xb')
        # Record the real size in the metadata that gets persisted. Writing it
        # into the caller's `metadata` left the key file with size 0 and
        # crashed when `metadata` was None.
        _metadata['size'] = size

        # after archive is finished, we create a key to save the presence of the binary file
        # NOTE(review): the file is opened in binary mode, so json.dumps is
        # presumably expected to return bytes - confirm against ext_json
        with open(key_file, 'wb') as f:
            f.write(json.dumps(_metadata))

        return key, size, MODE_BINARY, filename, _metadata

    def fetch(self, key) -> tuple[typing.BinaryIO, dict]:
        """Open the archive stored under `key`.

        :param key: cache key
        :return: tuple of (archive file opened for binary reading, metadata dict)
        :raises KeyError: when no key file exists for `key`
        """
        if key not in self:
            raise KeyError(key)

        key_file = self._get_keyfile(key)
        with open(key_file, 'rb') as f:
            metadata = json.loads(f.read())

        filename = metadata['filename']

        try:
            return open(os.path.join(self._directory, filename), 'rb'), metadata
        finally:
            # update usage stats, count and accessed; this runs before the
            # function returns, so the returned dict carries the new values
            metadata["access_count"] = metadata.get("access_count", 0) + 1
            metadata["access_time"] = time.time()

            with open(key_file, 'wb') as f:
                f.write(json.dumps(metadata))

    def random_filename(self):
        """Return filename and full-path tuple for file storage.

        Filename will be a randomly generated 28 character hexadecimal string
        with ".archive_cache" suffixed. Two levels of sub-directories will be used to
        reduce the size of directories. On older filesystems, lookups in
        directories with many files may be slow.
        """

        hex_name = codecs.encode(os.urandom(16), 'hex').decode('utf-8')
        sub_dir = os.path.join(hex_name[:2], hex_name[2:4])
        name = hex_name[4:] + '.archive_cache'
        filename = os.path.join(sub_dir, name)
        full_path = os.path.join(self._directory, filename)
        return filename, full_path

    def hash(self, key):
        """Compute portable hash for `key`.

        :param key: key to hash
        :return: hash value

        """
        mask = 0xFFFFFFFF
        return zlib.adler32(key.encode('utf-8')) & mask  # noqa

    def __contains__(self, key):
        """Return `True` if `key` matching item is found in cache.

        :param key: key matching item
        :return: True if key matching item

        """
        key_file = self._get_keyfile(key)
        return os.path.exists(key_file)

227

228

229

class FanoutCache:
    """Cache that shards keys and values."""

    def __init__(
            self, directory=None, **settings
    ):
        """Initialize cache instance.

        :param str directory: cache directory
        :param settings: settings dict; must contain ``cache_shards``,
            ``locking_url``, ``cache_eviction_policy`` and ``cache_size_limit``
        :raises ValueError: when `directory` is None

        """
        if directory is None:
            raise ValueError('directory cannot be None')

        directory = str(directory)
        directory = os.path.expanduser(directory)
        directory = os.path.expandvars(directory)
        self._directory = directory

        # these two are consumed here and NOT forwarded to the shards
        self._count = settings.pop('cache_shards')
        self._locking_url = settings.pop('locking_url')

        self._eviction_policy = settings['cache_eviction_policy']
        self._cache_size_limit = settings['cache_size_limit']

        self._shards = tuple(
            FileSystemCache(
                index=num,
                directory=os.path.join(directory, 'shard_%03d' % num),
                **settings,
            )
            for num in range(self._count)
        )
        self._hash = self._shards[0].hash

    def get_lock(self, lock_key):
        """Return a ``GenerationLock`` for `lock_key` using the configured locking url."""
        return GenerationLock(lock_key, self._locking_url)

    def _get_shard(self, key) -> 'FileSystemCache':
        """Return the shard responsible for `key` (hash of key modulo shard count)."""
        index = self._hash(key) % self._count
        shard = self._shards[index]
        return shard

    def store(self, key, value_reader, metadata=None):
        """Store the content of `value_reader` under `key` in the matching shard.

        :return: whatever the shard's ``store`` returns
        """
        shard = self._get_shard(key)
        return shard.store(key, value_reader, metadata)

    def fetch(self, key):
        """Return file handle corresponding to `key` from cache.
        """
        shard = self._get_shard(key)
        return shard.fetch(key)

    def has_key(self, key):
        """Return `True` if `key` matching item is found in cache.

        :param key: key for item
        :return: True if key is found

        """
        shard = self._get_shard(key)
        return key in shard

    def __contains__(self, item):
        return self.has_key(item)

    def _collect_entries(self):
        """Read every ``.key`` file of every shard into rows for ``DB.bulk_insert``."""
        rows = []
        for shard in self._shards:
            try:
                names = os.listdir(shard._directory)
            except FileNotFoundError:
                # a shard directory only appears once something was stored
                # in it, so a missing one simply holds no entries
                continue

            for key_file in names:
                if not key_file.endswith('.key'):
                    continue

                key_file_path = os.path.join(shard._directory, key_file)
                with open(key_file_path, 'rb') as f:
                    metadata = json.loads(f.read())

                size = metadata.get('size')
                filename = metadata.get('filename')
                full_path = metadata.get('full_path')

                if not size:
                    # in case we don't have size re-calc it...
                    size = os.stat(full_path).st_size

                rows.append([
                    len(rows) + 1,  # rowid, 1-based running number
                    key_file,
                    key_file_path,
                    filename,
                    full_path,
                    metadata.get('store_time', 0),
                    metadata.get('access_time', 0),
                    metadata.get('access_count', 0),
                    size,
                ])
        return rows

    def evict(self, policy=None, size_limit=None):
        """
        Remove old items based on the conditions


        explanation of this algo:
        iterate over each shard, then for each shard iterate over the .key files
        read the key files metadata stored. This gives us a full list of keys, cached_archived, their size and
        access data, time creation, and access counts.

        Store that into a memory DB so we can run different sorting strategies easily.
        Summing the size is a sum sql query.

        Then we run a sorting strategy based on eviction policy.
        We iterate over sorted keys, and remove entries until the total size is within the limit.

        :param policy: name of an entry in EVICTION_POLICY; defaults to the configured policy
        :param size_limit: maximum total size to keep; defaults to the configured limit
        :return: number of removed cache entries
        """

        policy = policy or self._eviction_policy
        size_limit = size_limit or self._cache_size_limit

        select_policy = EVICTION_POLICY[policy]['evict']

        log.debug('Running eviction policy \'%s\', and checking for size limit: %s',
                  policy, format_size(size_limit))

        if select_policy is None:
            return 0

        db = DB()
        try:
            data = self._collect_entries()

            # Insert bulk data using executemany
            db.bulk_insert(data)

            ((total_size,),) = db.sql('SELECT COALESCE(SUM(size), 0) FROM archive_cache').fetchall()
            log.debug('Analyzed %s keys, occupied: %s', len(data), format_size(total_size))
            select_policy_qry = select_policy.format(fields='key_file_path, full_path, size')
            sorted_keys = db.sql(select_policy_qry).fetchall()
        finally:
            # the in-memory database is only needed for sorting and summing
            db.connection.close()

        removed_items = 0
        removed_size = 0
        for key, cached_file, size in sorted_keys:
            # Check BEFORE removing: stop as soon as we fit into the limit.
            # Subtracting first and then breaking skipped the removal that
            # would have brought the cache under the limit.
            if total_size <= size_limit:
                # we obtained what we wanted...
                break

            os.remove(cached_file)
            os.remove(key)
            total_size -= size
            removed_items += 1
            removed_size += size

        log.debug('Removed %s cache archives, and reduced size: %s', removed_items, format_size(removed_size))
        return removed_items

377

381

378

382

379

def get_archival_config(config):
    """Return a new dict with only the `config` items whose key starts with ``archive_cache``.

    :param config: mapping of setting names to values
    :return: dict holding the archive cache related settings
    """
    return {
        name: value
        for name, value in config.items()
        if name.startswith('archive_cache')
    }

390

394

391

395

392

def get_archival_cache_store(config):

396

def get_archival_cache_store(config):

393

397

394

global cache_meta

398

global cache_meta

395

if cache_meta is not None:

399

if cache_meta is not None:

396

return cache_meta

400

return cache_meta

397

401

398

config = get_archival_config(config)

402

config = get_archival_config(config)

399

backend = config['archive_cache.backend.type']

403

backend = config['archive_cache.backend.type']

400

if backend != 'filesystem':

404

if backend != 'filesystem':

401

raise ValueError('archive_cache.backend.type only supports "filesystem"')

405

raise ValueError('archive_cache.backend.type only supports "filesystem"')

402

406

403

archive_cache_locking_url = config['archive_cache.locking.url']

407

archive_cache_locking_url = config['archive_cache.locking.url']

404

archive_cache_dir = config['archive_cache.filesystem.store_dir']

408

archive_cache_dir = config['archive_cache.filesystem.store_dir']

405

archive_cache_size_gb = config['archive_cache.filesystem.cache_size_gb']

409

archive_cache_size_gb = config['archive_cache.filesystem.cache_size_gb']

406

archive_cache_shards = config['archive_cache.filesystem.cache_shards']

410

archive_cache_shards = config['archive_cache.filesystem.cache_shards']

407

archive_cache_eviction_policy = config['archive_cache.filesystem.eviction_policy']

411

archive_cache_eviction_policy = config['archive_cache.filesystem.eviction_policy']

408

412

409

log.debug('Initializing archival cache instance under %s', archive_cache_dir)

413

log.debug('Initializing archival cache instance under %s', archive_cache_dir)

410

414

411

# check if it's ok to write, and re-create the archive cache

415

# check if it's ok to write, and re-create the archive cache

412

if not os.path.isdir(archive_cache_dir):

416

if not os.path.isdir(archive_cache_dir):

413

os.makedirs(archive_cache_dir, exist_ok=True)

417

os.makedirs(archive_cache_dir, exist_ok=True)

414

418

415

d_cache = FanoutCache(

419

d_cache = FanoutCache(

416

archive_cache_dir,

420

archive_cache_dir,

417

locking_url=archive_cache_locking_url,

421

locking_url=archive_cache_locking_url,

418

cache_shards=archive_cache_shards,

422

cache_shards=archive_cache_shards,

419

cache_size_limit=archive_cache_size_gb * 1024 * 1024 * 1024,

423

cache_size_limit=archive_cache_size_gb * 1024 * 1024 * 1024,

420

cache_eviction_policy=archive_cache_eviction_policy

424

cache_eviction_policy=archive_cache_eviction_policy

421

)

425

)

422

cache_meta = d_cache

426

cache_meta = d_cache

423

return cache_meta

427

return cache_meta

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # Copyright (C) 2015-2024 RhodeCode GmbH
             #
             # This program is free software: you can redistribute it and/or modify
             # it under the terms of the GNU Affero General Public License, version 3
             # (only), as published by the Free Software Foundation.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU Affero General Public License
             # along with this program.  If not, see <http://www.gnu.org/licenses/>.
             #
             # This program is dual-licensed. If you wish to learn more about the
             # RhodeCode Enterprise Edition, including its added features, Support services,
             # and proprietary license terms, please see https://rhodecode.com/licenses/
             import codecs
             import contextlib
             import functools
             import os
             import logging
             import time
             import typing
             import zlib
             import sqlite3
             from ...ext_json import json
             from .lock import GenerationLock
             from .utils import format_size
             # Module-level logger.
             log = logging.getLogger(__name__)
             # Cached store instance: set by get_archival_cache_store() on its first
             # successful call and returned as-is by every later call.
             cache_meta = None
             # NOTE(review): UNKNOWN and NO_VAL are not referenced by the code in this
             # section; presumably sentinel values -- confirm their users before changing.
             UNKNOWN = -241
             NO_VAL = -917
             # Mode marker returned by FileSystemCache.store() alongside the stored key.
             MODE_BINARY = 'BINARY'
             # Maps an eviction policy name to the SQL template FanoutCache.evict() runs
             # against the in-memory `archive_cache` table. `{fields}` is filled in by
             # evict(); rows are removed in the order the query returns them.
             EVICTION_POLICY = {
                 'none': {
                     # None disables eviction: evict() returns 0 without touching any file
                     'evict': None,
                 },
                 'least-recently-stored': {
                     # smallest store_time first
                     'evict': 'SELECT {fields} FROM archive_cache ORDER BY store_time',
                 },
                 'least-recently-used': {
                     # smallest access_time first
                     'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_time',
                 },
                 'least-frequently-used': {
                     # smallest access_count first
                     'evict': 'SELECT {fields} FROM archive_cache ORDER BY access_count',
                 },
             }
             class DB:
                 """In-memory SQLite database holding one `archive_cache` row per cached archive."""

                 def __init__(self):
                     """Open a private ``:memory:`` connection and create the table."""
                     self.connection = sqlite3.connect(':memory:')
                     self._init_db()

                 def _init_db(self):
                     """Create the `archive_cache` table if it does not exist yet."""
                     create_table = '''
                         CREATE TABLE IF NOT EXISTS archive_cache (
                          rowid INTEGER PRIMARY KEY,
                          key_file TEXT,
                          key_file_path TEXT,
                          filename TEXT,
                          full_path TEXT,
                          store_time REAL,
                          access_time REAL,
                          access_count INTEGER DEFAULT 0,
                          size INTEGER DEFAULT 0
                          )
                     '''
                     connection = self.connection
                     connection.execute(create_table)
                     connection.commit()

                 @property
                 def sql(self):
                     """Return the connection's `execute` callable for running ad-hoc statements."""
                     return self.connection.execute

                 def bulk_insert(self, rows):
                     """Insert `rows` (9-item sequences in table column order) and commit.

                     :param rows: iterable of
                         (rowid, key_file, key_file_path, filename, full_path,
                          store_time, access_time, access_count, size)
                     """
                     insert_row = '''
                         INSERT INTO archive_cache (
                          rowid,
                          key_file,
                          key_file_path,
                          filename,
                          full_path,
                          store_time,
                          access_time,
                          access_count,
                          size
                         )
                         VALUES (
                         ?, ?, ?, ?, ?, ?, ?, ?, ?
                         )
                     '''
                     connection = self.connection
                     # Connection.executemany creates the cursor for us
                     connection.executemany(insert_row, rows)
                     connection.commit()
             class FileSystemCache:
                 """One cache shard: archives plus their `<key>.key` metadata files under a directory."""

                 def __init__(self, index, directory, **settings):
                     """Remember the shard `index` and its `directory`.

                     The directory is not created here; it appears on the first store.
                     Extra `settings` are accepted and ignored.
                     """
                     self._index = index
                     self._directory = directory

                 def _write_file(self, full_path, iterator, mode, encoding=None):
                     """Write all chunks of `iterator` to `full_path`.

                     :param full_path: destination file, parent directories are created
                     :param iterator: iterable of chunks accepted by the opened file
                     :param mode: mode passed to `open`
                     :param encoding: encoding passed to `open`
                     :return: summed length of the written chunks
                     :raises OSError: when the file cannot be opened after 10 attempts
                     """
                     full_dir, _ = os.path.split(full_path)

                     for count in range(1, 11):
                         with contextlib.suppress(OSError):
                             os.makedirs(full_dir)

                         try:
                             # Another cache may have deleted the directory before
                             # the file could be opened.
                             writer = open(full_path, mode, encoding=encoding)
                         except OSError:
                             if count == 10:
                                 # Give up after 10 tries to open the file.
                                 raise
                             continue

                         with writer:
                             size = 0
                             for chunk in iterator:
                                 size += len(chunk)
                                 writer.write(chunk)
                             return size

                 def _get_keyfile(self, key):
                     """Return the path of the metadata file that marks `key` as present."""
                     return os.path.join(self._directory, f'{key}.key')

                 def store(self, key, value_reader, metadata):
                     """Store the content of `value_reader` under `key`.

                     :param key: cache key
                     :param value_reader: object with a `read(size)` method returning bytes
                     :param metadata: optional mapping merged into the stored metadata;
                         may be None
                     :return: tuple of (key, size, MODE_BINARY, filename, stored metadata)
                     """
                     filename, full_path = self.random_filename()
                     key_file = self._get_keyfile(key)

                     # STORE METADATA
                     _metadata = {
                         "version": "v1",
                         "filename": filename,
                         "full_path": full_path,
                         "key_file": key_file,
                         "store_time": time.time(),
                         "access_count": 1,
                         "access_time": 0,
                         "size": 0
                     }
                     if metadata:
                         _metadata.update(metadata)

                     # read in 4 MiB chunks until the reader is exhausted
                     reader = functools.partial(value_reader.read, 2**22)
                     iterator = iter(reader, b'')
                     size = self._write_file(full_path, iterator, 'xb')

                     # the measured size has to land in the dict that is persisted below,
                     # otherwise the key file always records "size": 0
                     _metadata['size'] = size
                     if metadata is not None:
                         # kept for callers relying on the size being written back
                         metadata['size'] = size

                     # after archive is finished, we create a key to save the presence of the binary file
                     with open(key_file, 'wb') as f:
                         f.write(json.dumps(_metadata))

                     return key, size, MODE_BINARY, filename, _metadata

                 def fetch(self, key) -> tuple[typing.BinaryIO, dict]:
                     """Open the archive stored under `key` and record the access.

                     :param key: cache key
                     :return: tuple of (binary file handle, metadata with updated usage stats)
                     :raises KeyError: when no key file exists for `key`
                     """
                     if key not in self:
                         raise KeyError(key)

                     key_file = self._get_keyfile(key)
                     with open(key_file, 'rb') as f:
                         metadata = json.loads(f.read())

                     filename = metadata['filename']

                     # open first, so a missing archive is not counted as an access
                     archive = open(os.path.join(self._directory, filename), 'rb')
                     try:
                         # update usage stats, count and accessed
                         metadata["access_count"] = metadata.get("access_count", 0) + 1
                         metadata["access_time"] = time.time()

                         with open(key_file, 'wb') as f:
                             f.write(json.dumps(metadata))
                     except BaseException:
                         # do not leak the handle when the stats cannot be persisted
                         archive.close()
                         raise

                     return archive, metadata

                 def random_filename(self):
                     """Return filename and full-path tuple for file storage.

                     Filename will be a randomly generated 28 character hexadecimal string
                     with ".archive_cache" suffixed. Two levels of sub-directories will be used to
                     reduce the size of directories. On older filesystems, lookups in
                     directories with many files may be slow.
                     """
                     hex_name = codecs.encode(os.urandom(16), 'hex').decode('utf-8')
                     sub_dir = os.path.join(hex_name[:2], hex_name[2:4])
                     name = hex_name[4:] + '.archive_cache'
                     filename = os.path.join(sub_dir, name)
                     full_path = os.path.join(self._directory, filename)
                     return filename, full_path

                 def hash(self, key):
                     """Compute portable hash for `key`.

                     :param key: key to hash
                     :return: hash value
                     """
                     mask = 0xFFFFFFFF
                     return zlib.adler32(key.encode('utf-8')) & mask  # noqa

                 def __contains__(self, key):
                     """Return `True` if `key` matching item is found in cache.

                     :param key: key matching item
                     :return: True if key matching item
                     """
                     key_file = self._get_keyfile(key)
                     return os.path.exists(key_file)
             class FanoutCache:
                 """Cache that shards keys and values."""

                 def __init__(
                     self, directory=None, **settings
                 ):
                     """Initialize cache instance.

                     :param str directory: cache directory
                     :param settings: settings dict; must provide `cache_shards`,
                         `locking_url`, `cache_eviction_policy` and `cache_size_limit`
                     :raises ValueError: when `directory` is None
                     """
                     if directory is None:
                         raise ValueError('directory cannot be None')

                     directory = str(directory)
                     directory = os.path.expanduser(directory)
                     directory = os.path.expandvars(directory)
                     self._directory = directory

                     self._count = settings.pop('cache_shards')
                     self._locking_url = settings.pop('locking_url')

                     self._eviction_policy = settings['cache_eviction_policy']
                     self._cache_size_limit = settings['cache_size_limit']

                     self._shards = tuple(
                         FileSystemCache(
                             index=num,
                             directory=os.path.join(directory, 'shard_%03d' % num),
                             **settings,
                         )
                         for num in range(self._count)
                     )
                     self._hash = self._shards[0].hash

                 def get_lock(self, lock_key):
                     """Return a `GenerationLock` for `lock_key` bound to the configured locking url."""
                     return GenerationLock(lock_key, self._locking_url)

                 def _get_shard(self, key) -> 'FileSystemCache':
                     """Return the shard responsible for `key` (hash of the key modulo shard count)."""
                     index = self._hash(key) % self._count
                     shard = self._shards[index]
                     return shard

                 def store(self, key, value_reader, metadata=None):
                     """Store the content of `value_reader` under `key` in the matching shard.

                     :param key: cache key
                     :param value_reader: object with a `read(size)` method
                     :param metadata: optional mapping of extra metadata
                     :return: whatever the shard's `store` returns
                     """
                     shard = self._get_shard(key)
                     if metadata is None:
                         # the shard writes the stored size into this mapping,
                         # so it must never receive None
                         metadata = {}
                     return shard.store(key, value_reader, metadata)

                 def fetch(self, key):
                     """Return file handle corresponding to `key` from cache.
                     """
                     shard = self._get_shard(key)
                     return shard.fetch(key)

                 def has_key(self, key):
                     """Return `True` if `key` matching item is found in cache.

                     :param key: key for item
                     :return: True if key is found
                     """
                     shard = self._get_shard(key)
                     return key in shard

                 def __contains__(self, item):
                     return self.has_key(item)

                 def evict(self, policy=None, size_limit=None):
                     """
                     Remove old items based on the conditions

                     explanation of this algo:
                     iterate over each shard, then for each shard iterate over the .key files
                     read the key files metadata stored. This gives us a full list of keys, cached_archived, their size and
                     access data, time creation, and access counts.

                     Store that into a memory DB so we can run different sorting strategies easily.
                     Summing the size is a sum sql query.

                     Then we run a sorting strategy based on eviction policy.
                     We iterate over sorted keys, and remove each until the total size
                     is within the limit.

                     :param policy: name of an EVICTION_POLICY entry, defaults to the configured one
                     :param size_limit: size limit in bytes, defaults to the configured one
                     :return: number of removed cache items
                     """
                     policy = policy or self._eviction_policy
                     size_limit = size_limit or self._cache_size_limit

                     select_policy = EVICTION_POLICY[policy]['evict']

                     log.debug('Running eviction policy \'%s\', and checking for size limit: %s',
                               policy, format_size(size_limit))

                     if select_policy is None:
                         return 0

                     data = []
                     cnt = 1
                     for shard in self._shards:
                         try:
                             key_files = os.listdir(shard._directory)
                         except FileNotFoundError:
                             # shard directories only appear once something was stored in them
                             continue

                         for key_file in key_files:
                             if not key_file.endswith('.key'):
                                 continue

                             key_file_path = os.path.join(shard._directory, key_file)
                             with open(key_file_path, 'rb') as f:
                                 metadata = json.loads(f.read())

                             size = metadata.get('size')
                             filename = metadata.get('filename')
                             full_path = metadata.get('full_path')

                             if not size:
                                 # in case we don't have size re-calc it...
                                 size = os.stat(full_path).st_size

                             data.append([
                                 cnt,
                                 key_file,
                                 key_file_path,
                                 filename,
                                 full_path,
                                 metadata.get('store_time', 0),
                                 metadata.get('access_time', 0),
                                 metadata.get('access_count', 0),
                                 size,
                             ])
                             cnt += 1

                     db = DB()
                     # the in-memory index is only needed for sorting; release it afterwards
                     with contextlib.closing(db.connection):
                         # Insert bulk data using executemany
                         db.bulk_insert(data)

                         ((total_size,),) = db.sql('SELECT COALESCE(SUM(size), 0) FROM archive_cache').fetchall()
                         log.debug('Analyzed %s keys, occupied: %s', len(data), format_size(total_size))
                         select_policy_qry = select_policy.format(fields='key_file_path, full_path, size')
                         sorted_keys = db.sql(select_policy_qry).fetchall()

                     removed_items = 0
                     removed_size = 0
                     for key, cached_file, size in sorted_keys:
                         # check the CURRENT size; subtracting first would stop one item
                         # early and leave the cache above the limit
                         if total_size <= size_limit:
                             # we obtained what we wanted...
                             break

                         # a file that is already gone must not abort the whole run
                         with contextlib.suppress(FileNotFoundError):
                             os.remove(cached_file)
                         with contextlib.suppress(FileNotFoundError):
                             os.remove(key)

                         total_size -= size
                         removed_items += 1
                         removed_size += size

                     log.debug('Removed %s cache archives, and reduced size: %s', removed_items, format_size(removed_size))
                     return removed_items
             def get_archival_config(config):
                 """Return a new dict with only the entries of `config` whose key starts with `archive_cache`.

                 :param config: mapping with string keys
                 :return: dict holding the matching key/value pairs
                 """
                 return {
                     name: value
                     for name, value in config.items()
                     if name.startswith('archive_cache')
                 }
             def get_archival_cache_store(config):
                 """Return the shared archive cache store, building it from `config` on first use.

                 The instance is kept in the module-level `cache_meta`; later calls return
                 it without looking at `config` again.

                 :param config: mapping holding the `archive_cache.*` settings
                 :return: the `FanoutCache` instance
                 :raises ValueError: when `archive_cache.backend.type` is not "filesystem"
                 """
                 global cache_meta

                 if cache_meta is None:
                     settings = get_archival_config(config)

                     if settings['archive_cache.backend.type'] != 'filesystem':
                         raise ValueError('archive_cache.backend.type only supports "filesystem"')

                     locking_url = settings['archive_cache.locking.url']
                     store_dir = settings['archive_cache.filesystem.store_dir']
                     size_gb = settings['archive_cache.filesystem.cache_size_gb']
                     shards = settings['archive_cache.filesystem.cache_shards']
                     eviction_policy = settings['archive_cache.filesystem.eviction_policy']

                     log.debug('Initializing archival cache instance under %s', store_dir)

                     # check if it's ok to write, and re-create the archive cache
                     if not os.path.isdir(store_dir):
                         os.makedirs(store_dir, exist_ok=True)

                     # the global is only assigned once the cache was built successfully
                     cache_meta = FanoutCache(
                         store_dir,
                         locking_url=locking_url,
                         cache_shards=shards,
                         cache_size_limit=size_gb * 1024 * 1024 * 1024,
                         cache_eviction_policy=eviction_policy
                     )

                 return cache_meta