# HG changeset patch # User Yuya Nishihara # Date 2020-11-28 02:15:54 # Node ID 210f9b8d7bbd84d0dd5dcaab6d25a00ed98ef0de # Parent fdd54a87621389d7c12ee3cc69dd57c97a37ac19 diff: do not concatenate immutable bytes while building a/b bodies (issue6445) Use bytearray instead. I don't know what's changed since Python 2, but bytes concatenation is 100x slower on Python 3. % python2.7 -m timeit -s "s = b''" "for i in range(10000): s += b'line'" 1000 loops, best of 3: 321 usec per loop % python3.9 -m timeit -s "s = b''" "for i in range(10000): s += b'line'" 5 loops, best of 5: 39.2 msec per loop Benchmark using tailwind.css (measuring the fast path, a is empty): % HGRCPATH=/dev/null python2.7 ./hg log -R /tmp/issue6445 -p --time \ --color=always --config diff.word-diff=true >/dev/null (prev) time: real 1.580 secs (user 1.560+0.000 sys 0.020+0.000) (this) time: real 1.610 secs (user 1.570+0.000 sys 0.030+0.000) % HGRCPATH=/dev/null python3.9 ./hg log -R /tmp/issue6445 -p --time \ --color=always --config diff.word-diff=true >/dev/null (prev) time: real 114.500 secs (user 114.460+0.000 sys 0.030+0.000) (this) time: real 2.180 secs (user 2.140+0.000 sys 0.040+0.000) Benchmark using random tabular text data (not the fast path): % dd if=/dev/urandom bs=1k count=1000 | hexdump -v -e '16/1 "%3u," "\n"' > ttf % hg ci -ma % dd if=/dev/urandom bs=1k count=1000 | hexdump -v -e '16/1 "%3u," "\n"' > ttf % hg ci -mb % HGRCPATH=/dev/null python2.7 ./hg log -R /tmp/issue6445 -p --time \ --color=always --config diff.word-diff=true >/dev/null (prev) time: real 3.240 secs (user 3.040+0.000 sys 0.200+0.000) (this) time: real 3.230 secs (user 3.070+0.000 sys 0.160+0.000) % HGRCPATH=/dev/null python3.9 ./hg log -R /tmp/issue6445 -p --time \ --color=always --config diff.word-diff=true >/dev/null (prev) time: real 44.130 secs (user 43.850+0.000 sys 0.270+0.000) (this) time: real 4.170 secs (user 3.850+0.000 sys 0.310+0.000) diff --git a/mercurial/patch.py b/mercurial/patch.py --- a/mercurial/patch.py 
+++ b/mercurial/patch.py @@ -2731,8 +2731,8 @@ def diffsinglehunk(hunklines): def diffsinglehunkinline(hunklines): """yield tokens for a list of lines in a single hunk, with inline colors""" # prepare deleted, and inserted content - a = b'' - b = b'' + a = bytearray() + b = bytearray() for line in hunklines: if line[0:1] == b'-': a += line[1:] @@ -2746,8 +2746,8 @@ def diffsinglehunkinline(hunklines): yield t return # re-split the content into words - al = wordsplitter.findall(a) - bl = wordsplitter.findall(b) + al = wordsplitter.findall(bytes(a)) + bl = wordsplitter.findall(bytes(b)) # re-arrange the words to lines since the diff algorithm is line-based aln = [s if s == b'\n' else s + b'\n' for s in al] bln = [s if s == b'\n' else s + b'\n' for s in bl]