diff --git a/hgext/convert/darcs.py b/hgext/convert/darcs.py --- a/hgext/convert/darcs.py +++ b/hgext/convert/darcs.py @@ -7,22 +7,22 @@ from common import NoRepo, checktool, commandline, commit, converter_source from mercurial.i18n import _ -from mercurial import util +from mercurial import encoding, util import os, shutil, tempfile, re # The naming drift of ElementTree is fun! try: - from xml.etree.cElementTree import ElementTree + from xml.etree.cElementTree import ElementTree, XMLParser except ImportError: try: - from xml.etree.ElementTree import ElementTree + from xml.etree.ElementTree import ElementTree, XMLParser except ImportError: try: - from elementtree.cElementTree import ElementTree + from elementtree.cElementTree import ElementTree, XMLParser except ImportError: try: - from elementtree.ElementTree import ElementTree + from elementtree.ElementTree import ElementTree, XMLParser except ImportError: ElementTree = None @@ -88,12 +88,24 @@ class darcs_source(converter_source, com self.ui.debug('cleaning up %s\n' % self.tmppath) shutil.rmtree(self.tmppath, ignore_errors=True) + def recode(self, s, encoding=None): + if isinstance(s, unicode): + # XMLParser returns unicode objects for anything it can't + # encode into ASCII. We convert them back to str to get + # recode's normal conversion behavior. + s = s.encode('latin-1') + return super(darcs_source, self).recode(s, encoding) + def xml(self, cmd, **kwargs): # NOTE: darcs is currently encoding agnostic and will print # patch metadata byte-for-byte, even in the XML changelog. etree = ElementTree() + # While we are decoding the XML as latin-1 to be as liberal as + # possible, etree will still raise an exception if any + # non-printable characters are in the XML changelog. + parser = XMLParser(encoding='latin-1') fp = self._run(cmd, **kwargs) - etree.parse(fp) + etree.parse(fp, parser=parser) self.checkexit(fp.close()) return etree.getroot() diff --git a/tests/test-convert-darcs.t b/tests/test-convert-darcs.t --- a/tests/test-convert-darcs.t +++ b/tests/test-convert-darcs.t @@ -49,8 +49,6 @@ update source $ darcs record -a -l -m p1.2 Finished recording patch 'p1.2' -merge branch - $ darcs pull -a ../darcs-clone Backing up ./a(-darcs-backup0) We have conflicts in the following files: @@ -85,6 +83,15 @@ darcs is encoding agnostic, so it takes $ darcs record -a -l -m 'p4: desc ñ' -A 'author ñ' Finished recording patch 'p4: desc ñ' + +Test latin-1 commit message + + $ echo h > h + $ printf "p5: desc " > ../p5 + $ python -c 'print "".join([chr(i) for i in range(128, 256)])' >> ../p5 + $ darcs record -a -l --logfile ../p5 + Finished recording patch 'p5: desc ' + $ glog() > { > HGENCODING=utf-8 hg glog --template '{rev} "{desc|firstline}" ({author}) files: {files}\n' "$@" @@ -95,12 +102,13 @@ darcs is encoding agnostic, so it takes scanning source... sorting... converting... - 5 p0 - 4 p1.2 - 3 p1.1 - 2 p2 - 1 p3 - 0 p4: desc ? + 6 p0 + 5 p1.2 + 4 p1.1 + 3 p2 + 2 p3 + 1 p4: desc ? + 0 p5: desc ???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? The converter does not currently handle patch conflicts very well. When they occur, it reverts *all* changes and moves forward, @@ -109,8 +117,11 @@ Unfortunately, non-conflicting changes, "c" file in p1.1 patch are reverted too. Just to say that manifest not listing "c" here is a bug. - $ glog -R darcs-repo-hg - o 5 "p4: desc ñ" (author ñ) files: g + $ HGENCODING=latin-1 glog -R darcs-repo-hg -r 6 | "$TESTDIR"/printrepr.py + o 6 "p5: desc \xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87\xc2\x88\xc2\x89\xc2\x8a\xc2\x8b\xc2\x8c\xc2\x8d\xc2\x8e\xc2\x8f\xc2\x90\xc2\x91\xc2\x92\xc2\x93\xc2\x94\xc2\x95\xc2\x96\xc2\x97\xc2\x98\xc2\x99\xc2\x9a\xc2\x9b\xc2\x9c\xc2\x9d\xc2\x9e\xc2\x9f\xc2\xa0\xc2\xa1\xc2\xa2\xc2\xa3\xc2\xa4\xc2\xa5\xc2\xa6\xc2\xa7\xc2\xa8\xc2\xa9\xc2\xaa\xc2\xab\xc2\xac\xc2\xad\xc2\xae\xc2\xaf\xc2\xb0\xc2\xb1\xc2\xb2\xc2\xb3\xc2\xb4\xc2\xb5\xc2\xb6\xc2\xb7\xc2\xb8\xc2\xb9\xc2\xba\xc2\xbb\xc2\xbc\xc2\xbd\xc2\xbe\xc2\xbf\xc3\x80\xc3\x81\xc3\x82\xc3\x83\xc3\x84\xc3\x85\xc3\x86\xc3\x87\xc3\x88\xc3\x89\xc3\x8a\xc3\x8b\xc3\x8c\xc3\x8d\xc3\x8e\xc3\x8f\xc3\x90\xc3\x91\xc3\x92\xc3\x93\xc3\x94\xc3\x95\xc3\x96\xc3\x97\xc3\x98\xc3\x99\xc3\x9a\xc3\x9b\xc3\x9c\xc3\x9d\xc3\x9e\xc3\x9f\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\xc3\xa5\xc3\xa6\xc3\xa7\xc3\xa8\xc3\xa9\xc3\xaa\xc3\xab\xc3\xac\xc3\xad\xc3\xae\xc3\xaf\xc3\xb0\xc3\xb1\xc3\xb2\xc3\xb3\xc3\xb4\xc3\xb5\xc3\xb6\xc3\xb7\xc3\xb8\xc3\xb9\xc3\xba\xc3\xbb\xc3\xbc\xc3\xbd\xc3\xbe\xc3\xbf" (test@example.org) files: h + | + $ HGENCODING=utf-8 glog -R darcs-repo-hg -r 0:5 | "$TESTDIR"/printrepr.py + o 5 "p4: desc \xc3\xb1" (author \xc3\xb1) files: g | o 4 "p3" (test@example.org) files: dir/d dir/d2 dir2/d f ff | @@ -122,6 +133,7 @@ Just to say that manifest not listing "c | o 0 "p0" (test@example.org) files: a + $ hg up -q -R darcs-repo-hg $ hg -R darcs-repo-hg manifest --debug 7225b30cdf38257d5cc7780772c051b6f33e6d6b 644 a @@ -129,3 +141,4 @@ Just to say that manifest not listing "c 37406831adc447ec2385014019599dfec953c806 644 dir2/d b783a337463792a5c7d548ad85a7d3253c16ba8c 644 ff 0973eb1b2ecc4de7fafe7447ce1b7462108b4848 644 g + fe6f8b4f507fe3eb524c527192a84920a4288dac 644 h