# HG changeset patch # User Arseniy Alekseyev # Date 2023-03-06 11:27:57 # Node ID 95acba2c29f6e90a7c19da0585a7b52efb416082 # Parent bcf54837241d2efa0d4e0b727d7fe9b6433e3448 encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings Apparently the code uses "+=" with a bytes object, which is linear-time, so the whole encoding is quadratic-time. This patch makes us use a bytearray object, instead, which has a(n amortized-)constant-time append operation. The encoding is still not particularly fast, but at least a 10MB file takes tens of seconds, not many hours to encode. diff --git a/mercurial/encoding.py b/mercurial/encoding.py --- a/mercurial/encoding.py +++ b/mercurial/encoding.py @@ -657,7 +657,7 @@ def toutf8b(s): pass s = pycompat.bytestr(s) - r = b"" + r = bytearray() pos = 0 l = len(s) while pos < l: @@ -673,7 +673,7 @@ def toutf8b(s): c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) pos += 1 r += c - return r + return bytes(r) def fromutf8b(s): @@ -712,7 +712,7 @@ def fromutf8b(s): # helper again to walk the string without "decoding" it. s = pycompat.bytestr(s) - r = b"" + r = bytearray() pos = 0 l = len(s) while pos < l: @@ -722,4 +722,4 @@ def fromutf8b(s): if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) r += c - return r + return bytes(r)