upstream/mercurial-mirror Files · tests/test-pathencode.py

show: use consistent (and possibly shorter) node lengths...

show: use consistent (and possibly shorter) node lengths `hg show` makes heavy use of shortest() to limit the length of the node hash. For the "stack" and "work" views, you are often looking at multiple lines of similar output for "lines" of work. It is visually appealing for things to vertically align. A naive use of {shortest(node, N)} could result in variable-length nodes and cause the first character of the description to shift by a column or two. We implement a function to determine the longest shortest prefix for a set of revisions. The new function is used to determine the printed node length for all `hg show` views. .. feature:: show: use consistent node length in views Our previous shortest node length of 5 was arbitrarily chosen. shortest() already does the work of ensuring that a partial node isn't ambiguous with an integer revision, which is our primary risk of a collision for very short nodes. It should be safe to go with the shortest node possible. Existing code is also optimized to handle nodes as short as 4. So, we decrease the minimum hash length from 5 to 4. We also add a test demonstrating that prefix collisions increase the node length. .. feature:: show: decrease minimum displayed hash length from 5 to 4 Differential Revision: https://phab.mercurial-scm.org/D558

Pulkit Goyal - - Load All Authors

File last commit:

r28928:59481bfd default


                r34192:e6b5e732

default

Download file

             test-pathencode.py
        
                    204 lines
            
             | 6.3 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / tests / test-pathencode.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      # This is a randomized test that generates different pathnames every

      # time it is invoked, and tests the encoding of those pathnames.

      #

      # It uses a simple probabilistic model to generate valid pathnames

      # that have proven likely to expose bugs and divergent behavior in

      # different encoding implementations.

      from __future__ import absolute_import, print_function

      import binascii

      import collections

      import itertools

      import math

      import os

      import random

      import sys

      import time

      from mercurial import (

          store,

      )

      # Start from every single-byte character.  NUL and the path separator
      # can never be part of a path component, so they are removed below.
      validchars = set(map(chr, range(0, 256)))

      # NOTE(review): the end of range() is exclusive, so this covers 'A'..'Y'
      # only; the name is not referenced anywhere in the visible part of this
      # file.
      alphanum = range(ord('A'), ord('Z'))

      for c in '\0/':
          validchars.remove(c)

      # Names that are reserved on Windows: four fixed device names plus
      # com1..com9 and lpt1..lpt9.  range() is used rather than xrange() so
      # that the module also loads on Python 3; the resulting list is the same.
      winreserved = ('aux con prn nul'.split() +
                     ['com%d' % i for i in range(1, 10)] +
                     ['lpt%d' % i for i in range(1, 10)])

      def casecombinations(names):
          '''Build all case-diddled combinations of names.

          For every name, each character is either kept as written or replaced
          by its upper-case form, in every possible combination.  The distinct
          results for all names are returned as one sorted list.
          '''
          combos = set()
          for name in names:
              # One (as written, upper-cased) pair per character; the cartesian
              # product of those pairs enumerates every case variation without
              # relying on xrange(), which does not exist on Python 3.
              choices = [(ch, ch.upper()) for ch in name]
              for variant in itertools.product(*choices):
                  combos.add(''.join(variant))
          return sorted(combos)

      def buildprobtable(fp, cmd='hg manifest tip'):
          '''Construct and print a table of probabilities for path name
          components.  The numbers are percentages.

          cmd is run through os.popen() and every line of its output is
          treated as a path name: a trailing '.i' or '.d' and a leading
          'data/' are stripped and the remaining characters are counted
          (carriage returns, slashes and newlines are ignored).  The result is
          written to fp as Python source for a ``probtable`` tuple, most
          frequent character first, five entries per line.  Output stops at
          the first character whose share is below 0.0005 percent.
          '''
          # Read everything up front and close the pipe deterministically
          # instead of leaving it open until garbage collection.
          with os.popen(cmd) as pipe:
              lines = pipe.read().splitlines()

          counts = collections.defaultdict(int)
          for line in lines:
              if line[-2:] in ('.i', '.d'):
                  line = line[:-2]
              if line.startswith('data/'):
                  line = line[5:]
              for c in line:
                  counts[c] += 1
          for c in '\r/\n':
              counts.pop(c, None)

          # values()/items() give the same results as itervalues()/iteritems()
          # here and, unlike those, also exist on Python 3.
          t = sum(counts.values()) / 100.0
          fp.write('probtable = (')
          ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
          for i, (k, v) in enumerate(ranked):
              if (i % 5) == 0:
                  fp.write('\n    ')
              vt = v / t
              if vt < 0.0005:
                  break
              fp.write('(%r, %.03f), ' % (k, vt))
          fp.write('\n    )\n')

      # A table of character frequencies (as percentages), gleaned by

      # looking at filelog names from a real-world, very large repo.

      # Each entry is a (character, weight) pair.  pickfrom() only uses the
      # second element as a relative weight, and the entries are listed from
      # the most to the least frequent character.
      probtable = (

          ('t', 9.828), ('e', 9.042), ('s', 8.011), ('a', 6.801), ('i', 6.618),

          ('g', 5.053), ('r', 5.030), ('o', 4.887), ('p', 4.363), ('n', 4.258),

          ('l', 3.830), ('h', 3.693), ('_', 3.659), ('.', 3.377), ('m', 3.194),

          ('u', 2.364), ('d', 2.296), ('c', 2.163), ('b', 1.739), ('f', 1.625),

          ('6', 0.666), ('j', 0.610), ('y', 0.554), ('x', 0.487), ('w', 0.477),

          ('k', 0.476), ('v', 0.473), ('3', 0.336), ('1', 0.335), ('2', 0.326),

          ('4', 0.310), ('5', 0.305), ('9', 0.302), ('8', 0.300), ('7', 0.299),

          ('q', 0.298), ('0', 0.250), ('z', 0.223), ('-', 0.118), ('C', 0.095),

          ('T', 0.087), ('F', 0.085), ('B', 0.077), ('S', 0.076), ('P', 0.076),

          ('L', 0.059), ('A', 0.058), ('N', 0.051), ('D', 0.049), ('M', 0.046),

          ('E', 0.039), ('I', 0.035), ('R', 0.035), ('G', 0.028), ('U', 0.026),

          ('W', 0.025), ('O', 0.017), ('V', 0.015), ('H', 0.013), ('Q', 0.011),

          ('J', 0.007), ('K', 0.005), ('+', 0.004), ('X', 0.003), ('Y', 0.001),

          )

      # Characters that already appear in probtable are dropped from
      # validchars, which is left holding only the characters that have no
      # entry in the table.
      for c, _ in probtable:

          validchars.remove(c)

      # Converted to a list because it is later handed to rng.choice(), which
      # needs an indexable sequence.
      validchars = list(validchars)

      def pickfrom(rng, table):
          '''Return one item of table, picked at random according to its weight.

          table is a sequence of (item, weight) pairs.  A point is drawn
          between zero and the total weight, and the first item whose
          cumulative weight reaches that point is returned.  Nothing is
          returned (None) when table is empty.
          '''
          target = rng.random() * sum(entry[1] for entry in table)
          running = 0
          for item, weight in table:
              running += weight
              if running >= target:
                  return item

      # Every upper/lower-case spelling of the reserved names.
      reservedcombos = casecombinations(winreserved)

      # Special suffixes, in every upper/lower-case spelling.
      internalsuffixcombos = casecombinations(['.hg', '.i', '.d'])

      # The tables below pair a generator (called with the rng, returning a
      # string) with a weight.  pickfrom() treats the weights as relative, so
      # they do not have to add up to 100.

      def _frequentchar(rng):
          # A character drawn according to the measured frequencies.
          return pickfrom(rng, probtable)

      def _otherchar(rng):
          # Any of the characters that have no entry in probtable.
          return rng.choice(validchars)

      def _reservedname(rng):
          return rng.choice(reservedcombos)

      def _nosuffix(rng):
          return ''

      def _internalsuffix(rng):
          return rng.choice(internalsuffixcombos)

      # The first component of a name following a slash.
      firsttable = (
          (_frequentchar, 90),
          (_otherchar, 5),
          (_reservedname, 5),
          )

      # Components of a name following the first: the same choices, minus the
      # reserved names.
      resttable = firsttable[:2]

      # The last component of a path, before a slash or at the end of a name.
      lasttable = resttable + (
          (_nosuffix, 95),
          (_internalsuffix, 5),
          )

      def makepart(rng, k):
          '''Construct a part of a pathname, without slashes.

          The part opens with a pick from firsttable, is extended with picks
          from resttable until it is at least as long as a target length drawn
          from 1..k, and is closed with a pick from lasttable.
          '''
          head = pickfrom(rng, firsttable)(rng)
          pieces = [head]
          length = len(head)
          # The target is drawn only after the first piece: the order of the
          # rng calls decides which paths a given seed produces.
          target = rng.randint(1, k)
          while length < target:
              piece = pickfrom(rng, resttable)(rng)
              pieces.append(piece)
              length += len(piece)
          pieces.append(pickfrom(rng, lasttable)(rng))
          return ''.join(pieces)

      def makepath(rng, j, k):
          '''Construct a complete pathname.

          The result is 'data/' followed by j parts, each built by
          makepart(rng, k) and joined with '/', and it ends in '.d' or '.i'.
          '''
          # range() rather than xrange(): the latter does not exist on
          # Python 3.  All parts are generated before the suffix is chosen.
          parts = [makepart(rng, k) for _ in range(j)]
          return 'data/' + '/'.join(parts) + rng.choice(['.d', '.i'])

      def genpath(rng, count):
          '''Generate random pathnames with gradually increasing lengths.

          Yields count paths.  The size bound for path number i grows with the
          square root of i / count, from 1 to at most 65; both the number of
          parts and the per-part length argument are drawn from 1..bound.
          '''
          mink, maxk = 1, 4096
          # A plain counting loop stays lazy on both Python 2 and Python 3;
          # xrange() does not exist on the latter.
          i = 0
          while i < count:
              k = mink + int(round(math.sqrt((maxk - mink) * float(i) / count)))
              x = rng.randint(1, k)
              y = rng.randint(1, k)
              yield makepath(rng, x, y)
              i += 1

      def runtests(rng, seed, count):
          '''Encode count generated paths both ways and report differences.

          Each path from genpath(rng, count) is passed to store._pathencode
          and to store._hybridencode(p, True).  When the two results differ,
          the path and both encodings are printed to stderr; the seed is
          printed once, before the first mismatch.  Returns the number of
          mismatches.
          '''
          nerrs = 0
          for p in genpath(rng, count):
              h = store._pathencode(p)    # uses C implementation, if available
              r = store._hybridencode(p, True) # reference implementation in Python
              if h == r:
                  continue
              if nerrs == 0:
                  # Only a Python 2 long carries a trailing 'L' in hex().
                  # Strip exactly that suffix instead of blindly dropping the
                  # last character, which would eat a digit of any other
                  # integer.
                  print('seed:', hex(seed).rstrip('L'), file=sys.stderr)
              # NOTE(review): "string_escape" is a Python 2-only codec.
              print("\np: '%s'" % p.encode("string_escape"), file=sys.stderr)
              print("h: '%s'" % h.encode("string_escape"), file=sys.stderr)
              print("r: '%s'" % r.encode("string_escape"), file=sys.stderr)
              nerrs += 1
          return nerrs

      def main():
          '''Parse the command line and run the randomized comparison.

          Options:
            -c N, --count N   number of paths to generate (default 100)
            -s N, --seed N    seed for the random number generator
            --build           write a freshly built probtable to stdout and
                              exit with status 0

          Without a seed option, a seed is taken from os.urandom(), or from
          the clock when that is unavailable.  Exits with status 1 when
          runtests() reports at least one mismatch.
          '''
          import getopt

          try:
              toint = long    # Python 2: seeds stay longs
          except NameError:
              toint = int     # Python 3: long is gone, int is unbounded

          # Empirically observed to take about a second to run
          count = 100
          seed = None
          opts, args = getopt.getopt(sys.argv[1:], 'c:s:',
                                     ['build', 'count=', 'seed='])
          for o, a in opts:
              if o in ('-c', '--count'):
                  count = int(a)
              elif o in ('-s', '--seed'):
                  seed = toint(a, base=0) # accepts base 10 or 16 strings
              elif o == '--build':
                  buildprobtable(sys.stdout,
                                 'find .hg/store/data -type f && '
                                 'cat .hg/store/fncache 2>/dev/null')
                  sys.exit(0)

          if seed is None:
              try:
                  seed = toint(binascii.hexlify(os.urandom(16)), 16)
              except (AttributeError, NotImplementedError):
                  # os.urandom is either missing or has no randomness source
                  # to draw from; fall back to the clock.
                  seed = toint(time.time() * 1000)

          rng = random.Random(seed)
          if runtests(rng, seed, count):
              sys.exit(1)

      # Run the test only when this file is executed as a script, not when it
      # is imported.
      if __name__ == '__main__':

          main()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				# This is a randomized test that generates different pathnames every
				# time it is invoked, and tests the encoding of those pathnames.
				#
				# It uses a simple probabilistic model to generate valid pathnames
				# that have proven likely to expose bugs and divergent behavior in
				# different encoding implementations.

				from __future__ import absolute_import, print_function

				import binascii
				import collections
				import itertools
				import math
				import os
				import random
				import sys
				import time
				from mercurial import (
				store,
				)

				# Start from every single-byte character.  NUL and the path separator
				# can never be part of a path component, so they are removed below.
				validchars = set(map(chr, range(0, 256)))

				# NOTE(review): the end of range() is exclusive, so this covers 'A'..'Y'
				# only; the name is not referenced anywhere in the visible part of this
				# file.
				alphanum = range(ord('A'), ord('Z'))

				for c in '\0/':
				    validchars.remove(c)

				# Names that are reserved on Windows: four fixed device names plus
				# com1..com9 and lpt1..lpt9.  range() is used rather than xrange() so
				# that the module also loads on Python 3; the resulting list is the same.
				winreserved = ('aux con prn nul'.split() +
				               ['com%d' % i for i in range(1, 10)] +
				               ['lpt%d' % i for i in range(1, 10)])

				def casecombinations(names):
				    '''Build all case-diddled combinations of names.

				    For every name, each character is either kept as written or replaced
				    by its upper-case form, in every possible combination.  The distinct
				    results for all names are returned as one sorted list.
				    '''
				    combos = set()
				    for name in names:
				        # One (as written, upper-cased) pair per character; the cartesian
				        # product of those pairs enumerates every case variation without
				        # relying on xrange(), which does not exist on Python 3.
				        choices = [(ch, ch.upper()) for ch in name]
				        for variant in itertools.product(*choices):
				            combos.add(''.join(variant))
				    return sorted(combos)

				def buildprobtable(fp, cmd='hg manifest tip'):
				    '''Construct and print a table of probabilities for path name
				    components. The numbers are percentages.

				    cmd is run through os.popen() and every line of its output is
				    treated as a path name: a trailing '.i' or '.d' and a leading
				    'data/' are stripped and the remaining characters are counted
				    (carriage returns, slashes and newlines are ignored).  The result is
				    written to fp as Python source for a ``probtable`` tuple, most
				    frequent character first, five entries per line.  Output stops at
				    the first character whose share is below 0.0005 percent.
				    '''
				    # Read everything up front and close the pipe deterministically
				    # instead of leaving it open until garbage collection.
				    with os.popen(cmd) as pipe:
				        lines = pipe.read().splitlines()

				    counts = collections.defaultdict(int)
				    for line in lines:
				        if line[-2:] in ('.i', '.d'):
				            line = line[:-2]
				        if line.startswith('data/'):
				            line = line[5:]
				        for c in line:
				            counts[c] += 1
				    for c in '\r/\n':
				        counts.pop(c, None)

				    # values()/items() give the same results as itervalues()/iteritems()
				    # here and, unlike those, also exist on Python 3.
				    t = sum(counts.values()) / 100.0
				    fp.write('probtable = (')
				    ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
				    for i, (k, v) in enumerate(ranked):
				        if (i % 5) == 0:
				            fp.write('\n ')
				        vt = v / t
				        if vt < 0.0005:
				            break
				        fp.write('(%r, %.03f), ' % (k, vt))
				    fp.write('\n )\n')

				# A table of character frequencies (as percentages), gleaned by
				# looking at filelog names from a real-world, very large repo.

				# Each entry is a (character, weight) pair.  pickfrom() only uses the
				# second element as a relative weight, and the entries are listed from
				# the most to the least frequent character.
				probtable = (
				('t', 9.828), ('e', 9.042), ('s', 8.011), ('a', 6.801), ('i', 6.618),
				('g', 5.053), ('r', 5.030), ('o', 4.887), ('p', 4.363), ('n', 4.258),
				('l', 3.830), ('h', 3.693), ('_', 3.659), ('.', 3.377), ('m', 3.194),
				('u', 2.364), ('d', 2.296), ('c', 2.163), ('b', 1.739), ('f', 1.625),
				('6', 0.666), ('j', 0.610), ('y', 0.554), ('x', 0.487), ('w', 0.477),
				('k', 0.476), ('v', 0.473), ('3', 0.336), ('1', 0.335), ('2', 0.326),
				('4', 0.310), ('5', 0.305), ('9', 0.302), ('8', 0.300), ('7', 0.299),
				('q', 0.298), ('0', 0.250), ('z', 0.223), ('-', 0.118), ('C', 0.095),
				('T', 0.087), ('F', 0.085), ('B', 0.077), ('S', 0.076), ('P', 0.076),
				('L', 0.059), ('A', 0.058), ('N', 0.051), ('D', 0.049), ('M', 0.046),
				('E', 0.039), ('I', 0.035), ('R', 0.035), ('G', 0.028), ('U', 0.026),
				('W', 0.025), ('O', 0.017), ('V', 0.015), ('H', 0.013), ('Q', 0.011),
				('J', 0.007), ('K', 0.005), ('+', 0.004), ('X', 0.003), ('Y', 0.001),
				)

				# Characters that already appear in probtable are dropped from
				# validchars, which is left holding only the characters that have no
				# entry in the table.
				for c, _ in probtable:
				validchars.remove(c)
				# Converted to a list because it is later handed to rng.choice(), which
				# needs an indexable sequence.
				validchars = list(validchars)

				def pickfrom(rng, table):
				    '''Return one item of table, picked at random according to its weight.

				    table is a sequence of (item, weight) pairs.  A point is drawn
				    between zero and the total weight, and the first item whose
				    cumulative weight reaches that point is returned.  Nothing is
				    returned (None) when table is empty.
				    '''
				    target = rng.random() * sum(entry[1] for entry in table)
				    running = 0
				    for item, weight in table:
				        running += weight
				        if running >= target:
				            return item

				# Every upper/lower-case spelling of the reserved names.
				reservedcombos = casecombinations(winreserved)

				# Special suffixes, in every upper/lower-case spelling.
				internalsuffixcombos = casecombinations(['.hg', '.i', '.d'])

				# The tables below pair a generator (called with the rng, returning a
				# string) with a weight.  pickfrom() treats the weights as relative, so
				# they do not have to add up to 100.

				def _frequentchar(rng):
				    # A character drawn according to the measured frequencies.
				    return pickfrom(rng, probtable)

				def _otherchar(rng):
				    # Any of the characters that have no entry in probtable.
				    return rng.choice(validchars)

				def _reservedname(rng):
				    return rng.choice(reservedcombos)

				def _nosuffix(rng):
				    return ''

				def _internalsuffix(rng):
				    return rng.choice(internalsuffixcombos)

				# The first component of a name following a slash.
				firsttable = (
				    (_frequentchar, 90),
				    (_otherchar, 5),
				    (_reservedname, 5),
				    )

				# Components of a name following the first: the same choices, minus the
				# reserved names.
				resttable = firsttable[:2]

				# The last component of a path, before a slash or at the end of a name.
				lasttable = resttable + (
				    (_nosuffix, 95),
				    (_internalsuffix, 5),
				    )

				def makepart(rng, k):
				    '''Construct a part of a pathname, without slashes.

				    The part opens with a pick from firsttable, is extended with picks
				    from resttable until it is at least as long as a target length drawn
				    from 1..k, and is closed with a pick from lasttable.
				    '''
				    head = pickfrom(rng, firsttable)(rng)
				    pieces = [head]
				    length = len(head)
				    # The target is drawn only after the first piece: the order of the
				    # rng calls decides which paths a given seed produces.
				    target = rng.randint(1, k)
				    while length < target:
				        piece = pickfrom(rng, resttable)(rng)
				        pieces.append(piece)
				        length += len(piece)
				    pieces.append(pickfrom(rng, lasttable)(rng))
				    return ''.join(pieces)

				def makepath(rng, j, k):
				    '''Construct a complete pathname.

				    The result is 'data/' followed by j parts, each built by
				    makepart(rng, k) and joined with '/', and it ends in '.d' or '.i'.
				    '''
				    # range() rather than xrange(): the latter does not exist on
				    # Python 3.  All parts are generated before the suffix is chosen.
				    parts = [makepart(rng, k) for _ in range(j)]
				    return 'data/' + '/'.join(parts) + rng.choice(['.d', '.i'])

				def genpath(rng, count):
				    '''Generate random pathnames with gradually increasing lengths.

				    Yields count paths.  The size bound for path number i grows with the
				    square root of i / count, from 1 to at most 65; both the number of
				    parts and the per-part length argument are drawn from 1..bound.
				    '''
				    mink, maxk = 1, 4096
				    # A plain counting loop stays lazy on both Python 2 and Python 3;
				    # xrange() does not exist on the latter.
				    i = 0
				    while i < count:
				        k = mink + int(round(math.sqrt((maxk - mink) * float(i) / count)))
				        x = rng.randint(1, k)
				        y = rng.randint(1, k)
				        yield makepath(rng, x, y)
				        i += 1

				def runtests(rng, seed, count):
				    '''Encode count generated paths both ways and report differences.

				    Each path from genpath(rng, count) is passed to store._pathencode
				    and to store._hybridencode(p, True).  When the two results differ,
				    the path and both encodings are printed to stderr; the seed is
				    printed once, before the first mismatch.  Returns the number of
				    mismatches.
				    '''
				    nerrs = 0
				    for p in genpath(rng, count):
				        h = store._pathencode(p) # uses C implementation, if available
				        r = store._hybridencode(p, True) # reference implementation in Python
				        if h == r:
				            continue
				        if nerrs == 0:
				            # Only a Python 2 long carries a trailing 'L' in hex().
				            # Strip exactly that suffix instead of blindly dropping the
				            # last character, which would eat a digit of any other
				            # integer.
				            print('seed:', hex(seed).rstrip('L'), file=sys.stderr)
				        # NOTE(review): "string_escape" is a Python 2-only codec.
				        print("\np: '%s'" % p.encode("string_escape"), file=sys.stderr)
				        print("h: '%s'" % h.encode("string_escape"), file=sys.stderr)
				        print("r: '%s'" % r.encode("string_escape"), file=sys.stderr)
				        nerrs += 1
				    return nerrs

				def main():
				    '''Parse the command line and run the randomized comparison.

				    Options:
				      -c N, --count N   number of paths to generate (default 100)
				      -s N, --seed N    seed for the random number generator
				      --build           write a freshly built probtable to stdout and
				                        exit with status 0

				    Without a seed option, a seed is taken from os.urandom(), or from
				    the clock when that is unavailable.  Exits with status 1 when
				    runtests() reports at least one mismatch.
				    '''
				    import getopt

				    try:
				        toint = long    # Python 2: seeds stay longs
				    except NameError:
				        toint = int     # Python 3: long is gone, int is unbounded

				    # Empirically observed to take about a second to run
				    count = 100
				    seed = None
				    opts, args = getopt.getopt(sys.argv[1:], 'c:s:',
				                               ['build', 'count=', 'seed='])
				    for o, a in opts:
				        if o in ('-c', '--count'):
				            count = int(a)
				        elif o in ('-s', '--seed'):
				            seed = toint(a, base=0) # accepts base 10 or 16 strings
				        elif o == '--build':
				            buildprobtable(sys.stdout,
				                           'find .hg/store/data -type f && '
				                           'cat .hg/store/fncache 2>/dev/null')
				            sys.exit(0)

				    if seed is None:
				        try:
				            seed = toint(binascii.hexlify(os.urandom(16)), 16)
				        except (AttributeError, NotImplementedError):
				            # os.urandom is either missing or has no randomness source
				            # to draw from; fall back to the clock.
				            seed = toint(time.time() * 1000)

				    rng = random.Random(seed)
				    if runtests(rng, seed, count):
				        sys.exit(1)

				# Run the test only when this file is executed as a script, not when it
				# is imported.
				if __name__ == '__main__':
				main()