tests: stabilize doctest output...
Mads Kiilerich
r18378:404feac7 (branch: default)
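The doctests touched here compared function results against a plain dict repr, and in CPython 2 the key order of that repr depends on hashing details that can vary between builds. The patch makes the expected output deterministic by printing through pprint (which always sorts dict keys) in changelog.decodeextra's doctest, and by asserting len(d) instead of a dict repr in encoding.tolocal's doctest. A minimal sketch of the idea; the dict literal below is illustrative and not taken from the patch:

>>> from pprint import pprint as pp
>>> d = {'foo': 'bar', 'baz': '2', 'branch': 'default'}
>>> pp(d)           # pprint sorts keys, so this expected line is stable
{'baz': '2', 'branch': 'default', 'foo': 'bar'}
>>> len(d)          # counting entries sidesteps repr ordering entirely
3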
@@ -1,346 +1,348
# changelog.py - changelog class for mercurial
#
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from node import bin, hex, nullid
from i18n import _
import util, error, revlog, encoding

_defaultextra = {'branch': 'default'}

def _string_escape(text):
    """
    >>> d = {'nl': chr(10), 'bs': chr(92), 'cr': chr(13), 'nul': chr(0)}
    >>> s = "ab%(nl)scd%(bs)s%(bs)sn%(nul)sab%(cr)scd%(bs)s%(nl)s" % d
    >>> s
    'ab\\ncd\\\\\\\\n\\x00ab\\rcd\\\\\\n'
    >>> res = _string_escape(s)
    >>> s == res.decode('string_escape')
    True
    """
    # subset of the string_escape codec
    text = text.replace('\\', '\\\\').replace('\n', '\\n').replace('\r', '\\r')
    return text.replace('\0', '\\0')

def decodeextra(text):
    """
-    >>> decodeextra(encodeextra({'foo': 'bar', 'baz': chr(0) + '2'}))
-    {'foo': 'bar', 'baz': '\\x002', 'branch': 'default'}
-    >>> decodeextra(encodeextra({'foo': 'bar', 'baz': chr(92) + chr(0) + '2'}))
-    {'foo': 'bar', 'baz': '\\\\\\x002', 'branch': 'default'}
+    >>> from pprint import pprint as pp
+    >>> pp(decodeextra(encodeextra({'foo': 'bar', 'baz': chr(0) + '2'})))
+    {'baz': '\\x002', 'branch': 'default', 'foo': 'bar'}
+    >>> pp(decodeextra(encodeextra({'foo': 'bar',
+    ...                             'baz': chr(92) + chr(0) + '2'})))
+    {'baz': '\\\\\\x002', 'branch': 'default', 'foo': 'bar'}
    """
    extra = _defaultextra.copy()
    for l in text.split('\0'):
        if l:
            if '\\0' in l:
                # fix up \0 without getting into trouble with \\0
                l = l.replace('\\\\', '\\\\\n')
                l = l.replace('\\0', '\0')
                l = l.replace('\n', '')
            k, v = l.decode('string_escape').split(':', 1)
            extra[k] = v
    return extra

def encodeextra(d):
    # keys must be sorted to produce a deterministic changelog entry
    items = [_string_escape('%s:%s' % (k, d[k])) for k in sorted(d)]
    return "\0".join(items)

def stripdesc(desc):
    """strip trailing whitespace and leading and trailing empty lines"""
    return '\n'.join([l.rstrip() for l in desc.splitlines()]).strip('\n')

class appender(object):
    '''the changelog index must be updated last on disk, so we use this class
    to delay writes to it'''
    def __init__(self, fp, buf):
        self.data = buf
        self.fp = fp
        self.offset = fp.tell()
        self.size = util.fstat(fp).st_size

    def end(self):
        return self.size + len("".join(self.data))
    def tell(self):
        return self.offset
    def flush(self):
        pass
    def close(self):
        self.fp.close()

    def seek(self, offset, whence=0):
        '''virtual file offset spans real file and data'''
        if whence == 0:
            self.offset = offset
        elif whence == 1:
            self.offset += offset
        elif whence == 2:
            self.offset = self.end() + offset
        if self.offset < self.size:
            self.fp.seek(self.offset)

    def read(self, count=-1):
        '''only trick here is reads that span real file and data'''
        ret = ""
        if self.offset < self.size:
            s = self.fp.read(count)
            ret = s
            self.offset += len(s)
            if count > 0:
                count -= len(s)
        if count != 0:
            doff = self.offset - self.size
            self.data.insert(0, "".join(self.data))
            del self.data[1:]
            s = self.data[0][doff:doff + count]
            self.offset += len(s)
            ret += s
        return ret

    def write(self, s):
        self.data.append(str(s))
        self.offset += len(s)

def delayopener(opener, target, divert, buf):
    def o(name, mode='r'):
        if name != target:
            return opener(name, mode)
        if divert:
            return opener(name + ".a", mode.replace('a', 'w'))
        # otherwise, divert to memory
        return appender(opener(name, mode), buf)
    return o

class changelog(revlog.revlog):
    def __init__(self, opener):
        revlog.revlog.__init__(self, opener, "00changelog.i")
        if self._initempty:
            # changelogs don't benefit from generaldelta
            self.version &= ~revlog.REVLOGGENERALDELTA
            self._generaldelta = False
        self._realopener = opener
        self._delayed = False
        self._divert = False
        self.filteredrevs = frozenset()

    def tip(self):
        """filtered version of revlog.tip"""
        for i in xrange(len(self) -1, -2, -1):
            if i not in self.filteredrevs:
                return self.node(i)

    def __iter__(self):
        """filtered version of revlog.__iter__"""
        if len(self.filteredrevs) == 0:
            return revlog.revlog.__iter__(self)

        def filterediter():
            for i in xrange(len(self)):
                if i not in self.filteredrevs:
                    yield i

        return filterediter()

    def revs(self, start=0, stop=None):
        """filtered version of revlog.revs"""
        for i in super(changelog, self).revs(start, stop):
            if i not in self.filteredrevs:
                yield i

    @util.propertycache
    def nodemap(self):
        # XXX need filtering too
        self.rev(self.node(0))
        return self._nodecache

    def hasnode(self, node):
        """filtered version of revlog.hasnode"""
        try:
            i = self.rev(node)
            return i not in self.filteredrevs
        except KeyError:
            return False

    def headrevs(self):
        if self.filteredrevs:
            # XXX we should fix and use the C version
            return self._headrevs()
        return super(changelog, self).headrevs()

    def strip(self, *args, **kwargs):
        # XXX make something better than assert
        # We can't expect proper strip behavior if we are filtered.
        assert not self.filteredrevs
        super(changelog, self).strip(*args, **kwargs)

    def rev(self, node):
        """filtered version of revlog.rev"""
        r = super(changelog, self).rev(node)
        if r in self.filteredrevs:
            raise error.LookupError(node, self.indexfile, _('no node'))
        return r

    def node(self, rev):
        """filtered version of revlog.node"""
        if rev in self.filteredrevs:
            raise IndexError(rev)
        return super(changelog, self).node(rev)

    def linkrev(self, rev):
        """filtered version of revlog.linkrev"""
        if rev in self.filteredrevs:
            raise IndexError(rev)
        return super(changelog, self).linkrev(rev)

    def parentrevs(self, rev):
        """filtered version of revlog.parentrevs"""
        if rev in self.filteredrevs:
            raise IndexError(rev)
        return super(changelog, self).parentrevs(rev)

    def flags(self, rev):
        """filtered version of revlog.flags"""
        if rev in self.filteredrevs:
            raise IndexError(rev)
        return super(changelog, self).flags(rev)

    def delayupdate(self):
        "delay visibility of index updates to other readers"
        self._delayed = True
        self._divert = (len(self) == 0)
        self._delaybuf = []
        self.opener = delayopener(self._realopener, self.indexfile,
                                  self._divert, self._delaybuf)

    def finalize(self, tr):
        "finalize index updates"
        self._delayed = False
        self.opener = self._realopener
        # move redirected index data back into place
        if self._divert:
            nfile = self.opener(self.indexfile + ".a")
            n = nfile.name
            nfile.close()
            util.rename(n, n[:-2])
        elif self._delaybuf:
            fp = self.opener(self.indexfile, 'a')
            fp.write("".join(self._delaybuf))
            fp.close()
            self._delaybuf = []
        # split when we're done
        self.checkinlinesize(tr)

    def readpending(self, file):
        r = revlog.revlog(self.opener, file)
        self.index = r.index
        self.nodemap = r.nodemap
        self._nodecache = r._nodecache
        self._chunkcache = r._chunkcache

    def writepending(self):
        "create a file containing the unfinalized state for pretxnchangegroup"
        if self._delaybuf:
            # make a temporary copy of the index
            fp1 = self._realopener(self.indexfile)
            fp2 = self._realopener(self.indexfile + ".a", "w")
            fp2.write(fp1.read())
            # add pending data
            fp2.write("".join(self._delaybuf))
            fp2.close()
            # switch modes so finalize can simply rename
            self._delaybuf = []
            self._divert = True

        if self._divert:
            return True

        return False

    def checkinlinesize(self, tr, fp=None):
        if not self._delayed:
            revlog.revlog.checkinlinesize(self, tr, fp)

    def read(self, node):
        """
        format used:
        nodeid\n        : manifest node in ascii
        user\n          : user, no \n or \r allowed
        time tz extra\n : date (time is int or float, timezone is int)
                        : extra is metadata, encoded and separated by '\0'
                        : older versions ignore it
        files\n\n       : files modified by the cset, no \n or \r allowed
        (.*)            : comment (free text, ideally utf-8)

        changelog v0 doesn't use extra
        """
        text = self.revision(node)
        if not text:
            return (nullid, "", (0, 0), [], "", _defaultextra)
        last = text.index("\n\n")
        desc = encoding.tolocal(text[last + 2:])
        l = text[:last].split('\n')
        manifest = bin(l[0])
        user = encoding.tolocal(l[1])

        tdata = l[2].split(' ', 2)
        if len(tdata) != 3:
            time = float(tdata[0])
            try:
                # various tools did silly things with the time zone field.
                timezone = int(tdata[1])
            except ValueError:
                timezone = 0
            extra = _defaultextra
        else:
            time, timezone = float(tdata[0]), int(tdata[1])
            extra = decodeextra(tdata[2])

        files = l[3:]
        return (manifest, user, (time, timezone), files, desc, extra)

    def add(self, manifest, files, desc, transaction, p1, p2,
                  user, date=None, extra=None):
        # Convert to UTF-8 encoded bytestrings as the very first
        # thing: calling any method on a localstr object will turn it
        # into a str object and the cached UTF-8 string is thus lost.
        user, desc = encoding.fromlocal(user), encoding.fromlocal(desc)

        user = user.strip()
        # An empty username or a username with a "\n" will make the
        # revision text contain two "\n\n" sequences -> corrupt
        # repository since read cannot unpack the revision.
        if not user:
            raise error.RevlogError(_("empty username"))
        if "\n" in user:
            raise error.RevlogError(_("username %s contains a newline")
                                    % repr(user))

        desc = stripdesc(desc)

        if date:
            parseddate = "%d %d" % util.parsedate(date)
        else:
            parseddate = "%d %d" % util.makedate()
        if extra:
            branch = extra.get("branch")
            if branch in ("default", ""):
                del extra["branch"]
            elif branch in (".", "null", "tip"):
                raise error.RevlogError(_('the name \'%s\' is reserved')
                                        % branch)
        if extra:
            extra = encodeextra(extra)
            parseddate = "%s %s" % (parseddate, extra)
        l = [hex(manifest), user, parseddate] + sorted(files) + ["", desc]
        text = "\n".join(l)
        return self.addrevision(text, transaction, len(self), p1, p2)

    def branch(self, rev):
        """return the branch of a revision

        This function exists because creating a changectx object
        just to access this is costly."""
        return encoding.tolocal(self.read(rev)[5].get("branch"))
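As a side note on what the new decodeextra doctest exercises: encodeextra() serializes the extra dict as backslash-escaped key:value pairs joined by NUL bytes, and that blob is appended to the "time tz" line described in read() above. A rough interactive sketch, assuming a Python 2 environment with mercurial importable; the branch/close values are made up for illustration:

>>> from mercurial.changelog import encodeextra, decodeextra
>>> from pprint import pprint as pp
>>> encodeextra({'branch': 'stable', 'close': '1'})
'branch:stable\x00close:1'
>>> pp(decodeextra('branch:stable\x00close:1'))
{'branch': 'stable', 'close': '1'}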
@@ -1,287 +1,287
# encoding.py - character transcoding support for Mercurial
#
# Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

import error
import unicodedata, locale, os

def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    result = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, oldloc)

    return result

_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'

class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''
    def __new__(cls, u, l):
        s = str.__new__(cls, l)
        s._utf8 = u
        return s
    def __hash__(self):
        return hash(self._utf8) # avoid collisions in local string space

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
-    >>> d # no collision
-    {'foo: ?': 1, 'foo: ?': 2}
+    >>> len(d) # no collision
+    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(encoding, "replace")
            if u == r.decode(encoding):
                # r is a safe, non-lossy encoding of s
                return r
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(fallbackencoding)
                r = u.encode(encoding, "replace")
                if u == r.decode(encoding):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                return u.encode(encoding, "replace") # can't round-trip
    except LookupError, k:
        raise error.Abort(k, hint="please check your locale settings")

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        return s.decode(encoding, encodingmode).encode("utf-8")
    except UnicodeDecodeError, inst:
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError, k:
        raise error.Abort(k, hint="please check your locale settings")

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        and "WFA" or "WF")

def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    return ucolwidth(s.decode(encoding, 'replace'))

def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        return sum([eaw(c) in wide and 2 or 1 for c in d])
    return len(d)

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    for x in xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t

def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        s.decode('ascii') # throw exception for non-ASCII character
        return s.lower()
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(encoding)
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError, k:
        raise error.Abort(k, hint="please check your locale settings")

def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        s.decode('ascii') # throw exception for non-ASCII character
        return s.upper()
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(encoding)
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError, k:
        raise error.Abort(k, hint="please check your locale settings")

def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        return s._utf8

    try:
        if s.decode('utf-8'):
            return s
    except UnicodeDecodeError:
        # surrogate-encode any characters that don't round-trip
        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
        r = ""
        pos = 0
        for c in s:
            if s2[pos:pos + 1] == c:
                r += c
                pos += 1
            else:
                r += unichr(0xdc00 + ord(c)).encode('utf-8')
        return r

def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    u = s.decode("utf-8")
    r = ""
    for c in u:
        if ord(c) & 0xff00 == 0xdc00:
            r += chr(ord(c) & 0xff)
        else:
            r += c.encode("utf-8")
    return r
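The tolocal() doctest change follows the same reasoning: under a lossy local encoding, two distinct UTF-8 inputs can both degrade to the byte string 'foo: ?', so the old expected dict repr was both order-dependent and visually ambiguous, whereas len(d) pins down the behaviour being tested (localstr hashes the cached UTF-8, so the two keys do not collide). A sketch of that behaviour, assuming mercurial is importable; forcing the module-level encoding to ascii is only for illustration:

>>> from mercurial import encoding
>>> encoding.encoding = 'ascii'             # force a lossy local encoding
>>> l1 = encoding.tolocal('foo: \xc3\xa4')  # UTF-8 bytes of u'foo: ä'
>>> l2 = encoding.tolocal('foo: \xc3\xa1')  # UTF-8 bytes of u'foo: á'
>>> l1, l2                                  # both render as the same local bytes
('foo: ?', 'foo: ?')
>>> len({l1: 1, l2: 2})                     # distinct hashes via the cached UTF-8
2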