# HG changeset patch
# User Siddharth Agarwal
# Date 2015-06-16 07:46:01
# Node ID 868b7ee8b570afe636959f3c70bf01e0f147b258
# Parent 72b2711f12eafcfdb132f7f9bcc125c3b1efb19a
dirstate: use a presized dict for the dirstate

This uses a simple heuristic to avoid expensive resizes. On a real-world repo
with around 400,000 files, perfdirstate:

before: ! wall 0.155562 comb 0.160000 user 0.150000 sys 0.010000 (best of 64)
after: ! wall 0.132638 comb 0.130000 user 0.120000 sys 0.010000 (best of 75)

On another real-world repo with around 250,000 files:

before: ! wall 0.098459 comb 0.100000 user 0.090000 sys 0.010000 (best of 100)
after: ! wall 0.089084 comb 0.090000 user 0.080000 sys 0.010000 (best of 100)

diff --git a/mercurial/dirstate.py b/mercurial/dirstate.py
--- a/mercurial/dirstate.py
+++ b/mercurial/dirstate.py
@@ -338,6 +338,19 @@ class dirstate(object):
         if not st:
             return
 
+        if util.safehasattr(parsers, 'dict_new_presized'):
+            # Make an estimate of the number of files in the dirstate based on
+            # its size. From a linear regression on a set of real-world repos,
+            # all over 10,000 files, the size of a dirstate entry is 85
+            # bytes. The cost of resizing is significantly higher than the cost
+            # of filling in a larger presized dict, so subtract 20% from the
+            # size.
+            #
+            # This heuristic is imperfect in many ways, so in a future dirstate
+            # format update it makes sense to just record the number of entries
+            # on write.
+            self._map = parsers.dict_new_presized(len(st) / 71)
+
         # Python's garbage collector triggers a GC each time a certain number
         # of container objects (the number being defined by
         # gc.get_threshold()) are allocated. parse_dirstate creates a tuple