##// END OF EJS Templates
store: refactor hashed encoding into its own function
Bryan O'Sullivan -
r17610:d0afa149 default
parent child Browse files
Show More
@@ -1,477 +1,482 b''
1 1 # store.py - repository store handling for Mercurial
2 2 #
3 3 # Copyright 2008 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from i18n import _
9 9 import osutil, scmutil, util, parsers
10 10 import os, stat, errno
11 11
12 12 _sha = util.sha1
13 13
14 14 # This avoids a collision between a file named foo and a dir named
15 15 # foo.i or foo.d
16 16 def _encodedir(path):
17 17 '''
18 18 >>> _encodedir('data/foo.i')
19 19 'data/foo.i'
20 20 >>> _encodedir('data/foo.i/bla.i')
21 21 'data/foo.i.hg/bla.i'
22 22 >>> _encodedir('data/foo.i.hg/bla.i')
23 23 'data/foo.i.hg.hg/bla.i'
24 24 >>> _encodedir('data/foo.i\\ndata/foo.i/bla.i\\ndata/foo.i.hg/bla.i\\n')
25 25 'data/foo.i\\ndata/foo.i.hg/bla.i\\ndata/foo.i.hg.hg/bla.i\\n'
26 26 '''
27 27 return (path
28 28 .replace(".hg/", ".hg.hg/")
29 29 .replace(".i/", ".i.hg/")
30 30 .replace(".d/", ".d.hg/"))
31 31
32 32 encodedir = getattr(parsers, 'encodedir', _encodedir)
33 33
def decodedir(path):
    '''
    Reverse the directory escaping done by _encodedir.

    Strips the '.hg' that was inserted after '.d', '.i' and '.hg'
    directory suffixes.  Paths that contain no '.hg/' are returned
    unchanged.

    >>> decodedir('data/foo.i')
    'data/foo.i'
    >>> decodedir('data/foo.i.hg/bla.i')
    'data/foo.i/bla.i'
    >>> decodedir('data/foo.i.hg.hg/bla.i')
    'data/foo.i.hg/bla.i'
    '''
    # every escaped form contains '.hg/', so anything else is untouched
    if ".hg/" in path:
        path = path.replace(".d.hg/", ".d/")
        path = path.replace(".i.hg/", ".i/")
        path = path.replace(".hg.hg/", ".hg/")
    return path
49 49
50 50 def _buildencodefun():
51 51 '''
52 52 >>> enc, dec = _buildencodefun()
53 53
54 54 >>> enc('nothing/special.txt')
55 55 'nothing/special.txt'
56 56 >>> dec('nothing/special.txt')
57 57 'nothing/special.txt'
58 58
59 59 >>> enc('HELLO')
60 60 '_h_e_l_l_o'
61 61 >>> dec('_h_e_l_l_o')
62 62 'HELLO'
63 63
64 64 >>> enc('hello:world?')
65 65 'hello~3aworld~3f'
66 66 >>> dec('hello~3aworld~3f')
67 67 'hello:world?'
68 68
69 69 >>> enc('the\x07quick\xADshot')
70 70 'the~07quick~adshot'
71 71 >>> dec('the~07quick~adshot')
72 72 'the\\x07quick\\xadshot'
73 73 '''
74 74 e = '_'
75 75 winreserved = [ord(x) for x in '\\:*?"<>|']
76 76 cmap = dict([(chr(x), chr(x)) for x in xrange(127)])
77 77 for x in (range(32) + range(126, 256) + winreserved):
78 78 cmap[chr(x)] = "~%02x" % x
79 79 for x in range(ord("A"), ord("Z")+1) + [ord(e)]:
80 80 cmap[chr(x)] = e + chr(x).lower()
81 81 dmap = {}
82 82 for k, v in cmap.iteritems():
83 83 dmap[v] = k
84 84 def decode(s):
85 85 i = 0
86 86 while i < len(s):
87 87 for l in xrange(1, 4):
88 88 try:
89 89 yield dmap[s[i:i + l]]
90 90 i += l
91 91 break
92 92 except KeyError:
93 93 pass
94 94 else:
95 95 raise KeyError
96 96 return (lambda s: ''.join([cmap[c] for c in s]),
97 97 lambda s: ''.join(list(decode(s))))
98 98
99 99 _encodefname, _decodefname = _buildencodefun()
100 100
def encodefilename(s):
    '''
    Apply the directory escaping, then the reversible character encoding.

    >>> encodefilename('foo.i/bar.d/bla.hg/hi:world?/HELLO')
    'foo.i.hg/bar.d.hg/bla.hg.hg/hi~3aworld~3f/_h_e_l_l_o'
    '''
    direscaped = encodedir(s)
    return _encodefname(direscaped)
107 107
def decodefilename(s):
    '''
    Inverse of encodefilename: undo the character encoding, then the
    directory escaping.

    >>> decodefilename('foo.i.hg/bar.d.hg/bla.hg.hg/hi~3aworld~3f/_h_e_l_l_o')
    'foo.i/bar.d/bla.hg/hi:world?/HELLO'
    '''
    chardecoded = _decodefname(s)
    return decodedir(chardecoded)
114 114
115 115 def _buildlowerencodefun():
116 116 '''
117 117 >>> f = _buildlowerencodefun()
118 118 >>> f('nothing/special.txt')
119 119 'nothing/special.txt'
120 120 >>> f('HELLO')
121 121 'hello'
122 122 >>> f('hello:world?')
123 123 'hello~3aworld~3f'
124 124 >>> f('the\x07quick\xADshot')
125 125 'the~07quick~adshot'
126 126 '''
127 127 winreserved = [ord(x) for x in '\\:*?"<>|']
128 128 cmap = dict([(chr(x), chr(x)) for x in xrange(127)])
129 129 for x in (range(32) + range(126, 256) + winreserved):
130 130 cmap[chr(x)] = "~%02x" % x
131 131 for x in range(ord("A"), ord("Z")+1):
132 132 cmap[chr(x)] = chr(x).lower()
133 133 return lambda s: "".join([cmap[c] for c in s])
134 134
135 135 lowerencode = _buildlowerencodefun()
136 136
137 137 # Windows reserved names: con, prn, aux, nul, com1..com9, lpt1..lpt9
138 138 _winres3 = ('aux', 'con', 'prn', 'nul') # length 3
139 139 _winres4 = ('com', 'lpt') # length 4 (with trailing 1..9)
140 140 def _auxencode(path, dotencode):
141 141 '''
142 142 Encodes filenames containing names reserved by Windows or which end in
143 143 period or space. Does not touch other single reserved characters c.
144 144 Specifically, c in '\\:*?"<>|' or ord(c) <= 31 are *not* encoded here.
145 145 Additionally encodes space or period at the beginning, if dotencode is
146 146 True. Parameter path is assumed to be all lowercase.
147 147 A segment only needs encoding if a reserved name appears as a
148 148 basename (e.g. "aux", "aux.foo"). A directory or file named "foo.aux"
149 149 doesn't need encoding.
150 150
151 151 >>> s = '.foo/aux.txt/txt.aux/con/prn/nul/foo.'
152 152 >>> _auxencode(s.split('/'), True)
153 153 ['~2efoo', 'au~78.txt', 'txt.aux', 'co~6e', 'pr~6e', 'nu~6c', 'foo~2e']
154 154 >>> s = '.com1com2/lpt9.lpt4.lpt1/conprn/com0/lpt0/foo.'
155 155 >>> _auxencode(s.split('/'), False)
156 156 ['.com1com2', 'lp~749.lpt4.lpt1', 'conprn', 'com0', 'lpt0', 'foo~2e']
157 157 >>> _auxencode(['foo. '], True)
158 158 ['foo.~20']
159 159 >>> _auxencode([' .foo'], True)
160 160 ['~20.foo']
161 161 '''
162 162 for i, n in enumerate(path):
163 163 if not n:
164 164 continue
165 165 if dotencode and n[0] in '. ':
166 166 n = "~%02x" % ord(n[0]) + n[1:]
167 167 path[i] = n
168 168 else:
169 169 l = n.find('.')
170 170 if l == -1:
171 171 l = len(n)
172 172 if ((l == 3 and n[:3] in _winres3) or
173 173 (l == 4 and n[3] <= '9' and n[3] >= '1'
174 174 and n[:3] in _winres4)):
175 175 # encode third letter ('aux' -> 'au~78')
176 176 ec = "~%02x" % ord(n[2])
177 177 n = n[0:2] + ec + n[3:]
178 178 path[i] = n
179 179 if n[-1] in '. ':
180 180 # encode last period or space ('foo...' -> 'foo..~2e')
181 181 path[i] = n[:-1] + "~%02x" % ord(n[-1])
182 182 return path
183 183
_maxstorepathlen = 120
_dirprefixlen = 8
_maxshortdirslen = 8 * (_dirprefixlen + 1) - 4

def _hashencode(path, dotencode):
    '''
    Build the non-reversible hashed store name for path.

    The result is 'dh/' + shortened directories + filler + sha digest of
    path + extension of the basename.  The leading path component is
    dropped; each remaining directory of the lowerencoded, aux-encoded
    path contributes at most _dirprefixlen characters, and directories
    are taken only while they fit in _maxshortdirslen.  The filler is
    the start of the basename, as long as fits in _maxstorepathlen.
    '''
    digest = _sha(path).hexdigest()
    # drop the leading path component and work on the lowercased rest
    lowered = lowerencode(path).split('/')[1:]
    parts = _auxencode(lowered, dotencode)
    basename = parts[-1]
    ext = os.path.splitext(basename)[1]

    shortdirs = []
    used = 0
    for comp in parts[:-1]:
        short = comp[:_dirprefixlen]
        if short[-1] in '. ':
            # Windows can't access dirs ending in period or space
            short = short[:-1] + '_'
        # every directory but the first costs one extra separator
        if used == 0:
            total = len(short)
        else:
            total = used + 1 + len(short)
        if total > _maxshortdirslen:
            break
        shortdirs.append(short)
        used = total

    dirs = '/'.join(shortdirs)
    if dirs:
        dirs += '/'
    prefix = 'dh/' + dirs
    # fill whatever room is left with the start of the basename
    spaceleft = _maxstorepathlen - len(prefix + digest + ext)
    if spaceleft > 0:
        return prefix + basename[:spaceleft] + digest + ext
    return prefix + digest + ext
218
def _hybridencode(path, dotencode):
    '''encodes path with a length limit

    Encodes all paths that begin with 'data/', according to the following.

    Default encoding (reversible):

    Encodes all uppercase letters 'X' as '_x'. All reserved or illegal
    characters are encoded as '~xx', where xx is the two digit hex code
    of the character (see encodefilename).
    Relevant path components consisting of Windows reserved filenames are
    masked by encoding the third character ('aux' -> 'au~78', see
    _auxencode).

    Hashed encoding (not reversible):

    If the default-encoded path is longer than _maxstorepathlen, a
    non-reversible hybrid hashing of the path is done instead (see
    _hashencode).
    This encoding uses up to _dirprefixlen characters of all directory
    levels of the lowerencoded path, but not more levels than can fit into
    _maxshortdirslen.
    Then follows the filler followed by the sha digest of the full path.
    The filler is the beginning of the basename of the lowerencoded path
    (the basename is everything after the last path separator). The filler
    is as long as possible, filling in characters from the basename until
    the encoded path has _maxstorepathlen characters (or all chars of the
    basename have been taken).
    The extension (e.g. '.i' or '.d') is preserved.

    The string 'data/' at the beginning is replaced with 'dh/', if the hashed
    encoding was used.
    '''
    path = encodedir(path)
    segments = _encodefname(path).split('/')
    default = '/'.join(_auxencode(segments, dotencode))
    if len(default) <= _maxstorepathlen:
        return default
    # too long for the reversible form: fall back to the hashed one
    return _hashencode(path, dotencode)
251 256
252 257 def _calcmode(path):
253 258 try:
254 259 # files in .hg/ will be created using this mode
255 260 mode = os.stat(path).st_mode
256 261 # avoid some useless chmods
257 262 if (0777 & ~util.umask) == (0777 & mode):
258 263 mode = None
259 264 except OSError:
260 265 mode = None
261 266 return mode
262 267
_data = ('data 00manifest.d 00manifest.i 00changelog.d 00changelog.i'
         ' phaseroots obsstore')

class basicstore(object):
    '''base class for local repository stores'''
    def __init__(self, path, openertype):
        self.path = path
        self.createmode = _calcmode(path)
        rawopener = openertype(path)
        rawopener.createmode = self.createmode
        # every name passed to the opener goes through encodedir first
        self.opener = scmutil.filteropener(rawopener, encodedir)

    def join(self, f):
        '''return the path of f inside this store'''
        return '/'.join((self.path, encodedir(f)))

    def _walk(self, relpath, recurse):
        '''return a sorted list of (unencoded, encoded, size) tuples

        Only regular files ending in '.d' or '.i' below relpath are
        listed; subdirectories are visited only if recurse is true.
        '''
        top = self.path
        if relpath:
            top += '/' + relpath
        if not os.path.isdir(top):
            return []
        # strip the store path and its trailing slash from every entry
        striplen = len(self.path) + 1
        found = []
        pending = [top]
        while pending:
            cur = pending.pop()
            for name, kind, st in osutil.listdir(cur, stat=True):
                full = cur + '/' + name
                if kind == stat.S_IFDIR:
                    if recurse:
                        pending.append(full)
                elif kind == stat.S_IFREG and name[-2:] in ('.d', '.i'):
                    n = util.pconvert(full[striplen:])
                    found.append((decodedir(n), n, st.st_size))
        found.sort()
        return found

    def datafiles(self):
        '''return the (unencoded, encoded, size) entries below data/'''
        return self._walk('data', True)

    def walk(self):
        '''yields (unencoded, encoded, size)'''
        # yield data files first
        for entry in self.datafiles():
            yield entry
        # yield manifest before changelog
        for entry in reversed(self._walk('', False)):
            yield entry

    def copylist(self):
        '''return the names to copy for this store layout'''
        names = ['requires']
        names.extend(_data.split())
        return names

    def write(self):
        '''nothing to flush for this store type'''
        pass
316 321
class encodedstore(basicstore):
    '''store below <path>/store whose filenames go through encodefilename'''
    def __init__(self, path, openertype):
        self.path = path + '/store'
        self.createmode = _calcmode(self.path)
        rawopener = openertype(self.path)
        rawopener.createmode = self.createmode
        self.opener = scmutil.filteropener(rawopener, encodefilename)

    def datafiles(self):
        '''yields (unencoded, encoded, size) for the files below data/

        unencoded is None for names that cannot be decoded.
        '''
        for name, encoded, size in self._walk('data', True):
            try:
                name = decodefilename(name)
            except KeyError:
                # not a valid encoded name
                name = None
            yield name, encoded, size

    def join(self, f):
        '''return the path of f inside this store'''
        return '/'.join((self.path, encodefilename(f)))

    def copylist(self):
        '''return the names to copy for this store layout'''
        names = ['requires', '00changelog.i']
        for f in _data.split():
            names.append('store/' + f)
        return names
339 344
class fncache(object):
    '''set of store filenames, kept in the 'fncache' file of the store

    The entries are loaded lazily on first use (add, membership test or
    iteration) and written back by write() only if something was added.
    '''
    # the filename used to be partially encoded
    # hence the encodedir/decodedir dance
    def __init__(self, opener):
        self.opener = opener
        # None until the file has been read by _load()
        self.entries = None
        self._dirty = False

    def _load(self):
        '''fill the entries from the fncache file

        A missing file yields an empty set.  Raises util.Abort if the
        file contains an empty line.
        '''
        self._dirty = False
        try:
            fp = self.opener('fncache', mode='rb')
        except IOError:
            # skip nonexistent file
            self.entries = set()
            return
        # close the file on every exit path, including the Abort below
        try:
            self.entries = set(decodedir(fp.read()).splitlines())
            if '' in self.entries:
                # re-read line by line to report where the bad entry is
                fp.seek(0)
                for n, line in enumerate(fp):
                    if not line.rstrip('\n'):
                        t = _('invalid entry in fncache, line %s') % (n + 1)
                        raise util.Abort(t)
        finally:
            fp.close()

    def _write(self, files, atomictemp):
        '''write files, one per line, to the fncache file'''
        fp = self.opener('fncache', mode='wb', atomictemp=atomictemp)
        if files:
            fp.write(encodedir('\n'.join(files) + '\n'))
        # NOTE(review): close() is deliberately not in a finally; with
        # atomictemp the opener presumably commits the file only on
        # close -- confirm before changing
        fp.close()
        self._dirty = False

    def rewrite(self, files):
        '''replace the file and the in-memory entries with files'''
        self._write(files, False)
        self.entries = set(files)

    def write(self):
        '''write the entries back if any were added since loading'''
        if self._dirty:
            self._write(self.entries, True)

    def add(self, fn):
        '''add fn to the entries, marking the cache dirty if it is new'''
        if self.entries is None:
            self._load()
        if fn not in self.entries:
            self._dirty = True
            self.entries.add(fn)

    def __contains__(self, fn):
        if self.entries is None:
            self._load()
        return fn in self.entries

    def __iter__(self):
        if self.entries is None:
            self._load()
        return iter(self.entries)
397 402
class _fncacheopener(scmutil.abstractopener):
    '''opener wrapper that records written data/ files in the fncache'''
    def __init__(self, op, fnc, encode):
        self.opener = op
        self.fncache = fnc
        self.encode = encode

    # mustaudit is forwarded to the wrapped opener
    def _getmustaudit(self):
        return self.opener.mustaudit

    def _setmustaudit(self, onoff):
        self.opener.mustaudit = onoff

    mustaudit = property(_getmustaudit, _setmustaudit)

    def __call__(self, path, mode='r', *args, **kw):
        '''open path through the wrapped opener, using its encoded name'''
        readonly = mode in ('r', 'rb')
        if not readonly and path.startswith('data/'):
            # remember every data file that may get created
            self.fncache.add(path)
        encoded = self.encode(path)
        return self.opener(encoded, mode, *args, **kw)
416 421
def _plainhybridencode(f):
    '''hybrid-encode f without encoding a leading period or space'''
    return _hybridencode(f, dotencode=False)
419 424
def _dothybridencode(f):
    '''hybrid-encode f, also encoding a leading period or space'''
    return _hybridencode(f, dotencode=True)
422 427
class fncachestore(basicstore):
    '''store below <path>/store using hybrid-encoded names and an fncache'''
    def __init__(self, path, openertype, dotencode):
        if dotencode:
            self.encode = _dothybridencode
        else:
            self.encode = _plainhybridencode
        self.path = path + '/store'
        self.pathsep = self.path + '/'
        self.createmode = _calcmode(self.path)
        rawopener = openertype(self.path)
        rawopener.createmode = self.createmode
        self.fncache = fncache(rawopener)
        self.opener = _fncacheopener(rawopener, self.fncache, self.encode)

    def join(self, f):
        '''return the path of f inside this store'''
        return self.pathsep + self.encode(f)

    def getsize(self, path):
        '''return the size of the already-encoded store file path'''
        return os.stat(self.pathsep + path).st_size

    def datafiles(self):
        '''yields (unencoded, encoded, size) for every fncache entry

        Entries whose file no longer exists are skipped; if any were
        found, the fncache is rewritten without them at the end.
        '''
        existing = []
        stale = False
        for f in sorted(self.fncache):
            ef = self.encode(f)
            try:
                size = self.getsize(ef)
            except OSError as err:
                if err.errno != errno.ENOENT:
                    raise
                # nonexistent entry
                stale = True
                continue
            yield f, ef, size
            existing.append(f)
        if stale:
            # rewrite fncache to remove nonexistent entries
            # (may be caused by rollback / strip)
            self.fncache.rewrite(existing)

    def copylist(self):
        '''return the names to copy for this store layout'''
        storefiles = ('data dh fncache phaseroots obsstore'
                      ' 00manifest.d 00manifest.i 00changelog.d 00changelog.i')
        names = ['requires', '00changelog.i']
        for f in storefiles.split():
            names.append('store/' + f)
        return names

    def write(self):
        '''flush pending fncache additions'''
        self.fncache.write()
471 476
def store(requirements, path, openertype):
    '''
    Create the store object that matches the repository requirements.

    Without 'store' a basicstore is returned; with 'store' but without
    'fncache' an encodedstore; otherwise an fncachestore, with dotencode
    enabled if 'dotencode' is required as well.
    '''
    if 'store' not in requirements:
        return basicstore(path, openertype)
    if 'fncache' not in requirements:
        return encodedstore(path, openertype)
    return fncachestore(path, openertype, 'dotencode' in requirements)
General Comments 0
You need to be logged in to leave comments. Login now