##// END OF EJS Templates
convert: svn: some improvements in memory usage
Brendan Cully -
r4837:4cdbaa88 default
parent child Browse files
Show More
@@ -1,573 +1,583
1 1 # Subversion 1.4/1.5 Python API backend
2 2 #
3 3 # Copyright(C) 2007 Daniel Holth et al
4 4
5 5 import pprint
6 6 import locale
7 7
8 8 from mercurial import util
9 9
10 10 # Subversion stuff. Works best with very recent Python SVN bindings
11 11 # e.g. SVN 1.5 or backports. Thanks to the bzr folks for enhancing
12 12 # these bindings.
13 13
14 14 from cStringIO import StringIO
15 15
16 16 from common import NoRepo, commit, converter_source
17 17
18 18 try:
19 19 from svn.core import SubversionException, Pool
20 20 import svn.core
21 21 import svn.ra
22 22 import svn.delta
23 23 import svn
24 24 import transport
25 25 except ImportError:
26 26 pass
27 27
class CompatibilityException(Exception):
    """Signal that the SVN Python bindings returned an unusable result.

    Raised by the directory-listing code when ``get_dir`` hands back a dict;
    the caller reacts by switching to a fallback listing strategy.
    """
29 29
30 30 # SVN conversion code stolen from bzr-svn and tailor
31 31 class convert_svn(converter_source):
def __init__(self, ui, url, rev=None):
    """Open the Subversion repository at url and locate its head revision.

    ui  -- ui object; its warn() method reports a missing binding.
    url -- repository URL.  A trailing "@<integer>" selects the revision
           to convert up to and is stripped from the URL.
    rev -- optional revision to convert up to; must be acceptable to int().

    Raises NoRepo when the Subversion bindings could not be imported or
    the repository cannot be opened, and util.Abort when rev is not an
    integer.
    """
    super(convert_svn, self).__init__(ui, url, rev=rev)

    # The guarded import at the top of the file swallows ImportError, so
    # probe for one of the imported names to learn whether it succeeded.
    try:
        SubversionException
    except NameError:
        msg = 'subversion python bindings could not be loaded\n'
        ui.warn(msg)
        raise NoRepo(msg)

    self.lastrevs = {}

    latest = None
    if rev:
        try:
            latest = int(rev)
        except ValueError:
            raise util.Abort('svn: revision %s is not an integer' % rev)

    # Support file://path@rev syntax. Useful e.g. to convert
    # deleted branches.
    # The suffix is only stripped once it is known to be an integer, so a
    # URL carrying user info ("svn://user@host/repo") is left untouched
    # instead of being cut at the "@".
    at = url.rfind('@')
    if at >= 0:
        try:
            latest = int(url[at + 1:])
            url = url[:at]
        except ValueError:
            pass
    self.url = url
    self.encoding = 'UTF-8' # Subversion is always nominal UTF-8
    try:
        self.transport = transport.SvnRaTransport(url = url)
        self.ra = self.transport.ra
        self.ctx = svn.client.create_context()
        self.base = svn.ra.get_repos_root(self.ra)
        # Path of the converted tree below the repository root.
        self.module = self.url[len(self.base):]
        self.modulemap = {} # revision, module
        self.commits = {}
        self.files = {}
        self.uuid = svn.ra.get_uuid(self.ra).decode(self.encoding)
    except SubversionException:
        raise NoRepo("couldn't open SVN repo %s" % url)

    # The blacklist file is optional: get_blacklist() assigns an empty
    # self.blacklist before it tries to open the file, so a missing file
    # simply leaves nothing blacklisted.
    try:
        self.get_blacklist()
    except IOError:
        pass

    self.last_changed = self.latest(self.module, latest)

    self.head = self.revid(self.last_changed)
81 81
def revid(self, revnum, module=None):
    """Build the unicode revision id "svn:<uuid><module>@<revnum>".

    revnum -- Subversion revision number.
    module -- path of the module inside the repository; a false value
              selects self.module.
    """
    if not module:
        module = self.module
    # The formatted value is already unicode.  Calling .decode() on it would
    # first encode it with the ASCII codec and therefore raise for any
    # non-ASCII module name, so the string is returned as is.
    return u"svn:%s%s@%s" % (self.uuid, module, revnum)
86 86
def revnum(self, rev):
    """Return the integer that follows the last "@" in the revision id rev."""
    tail = rev.rsplit('@', 1)[-1]
    return int(tail)
89 89
def revsplit(self, rev):
    """Split a revision id into (uuid, module, revnum).

    rev is encoded with self.encoding and cut at its first "@".  The
    module is returned with a leading "/", or as '' when the id names no
    path below the uuid.
    """
    encoded = rev.encode(self.encoding)
    location, number = encoded.split('@', 1)
    number = int(number)
    pieces = location.split('/', 1)
    # Drop the first four characters (the "svn:" prefix written by revid).
    uuid = pieces[0][4:]
    if len(pieces) > 1:
        mod = '/' + pieces[1]
    else:
        mod = ''
    return uuid, mod, number
99 99
def latest(self, path, stop=0):
    """Return the created_rev of path as seen at revision stop.

    A false stop means the latest revision number reported by the RA
    session.  The session is reparented to the repository root for the
    lookup and back to self.module afterwards.

    Raises util.Abort when the path cannot be stat'ed at that revision.
    """
    if not stop:
        stop = svn.ra.get_latest_revnum(self.ra)
    found = None
    try:
        self.reparent('')
        found = svn.ra.stat(self.ra, path.strip('/'), stop)
        self.reparent(self.module)
    except SubversionException:
        # Any failure, including the reparent back, counts as "not found".
        found = None
    if found:
        return found.created_rev
    raise util.Abort('%s not found up to revision %d' % (path, stop))
115 115
def get_blacklist(self):
    """Avoid certain revision numbers.
    It is not uncommon for two nearby revisions to cancel each other
    out, e.g. 'I copied trunk into a subdirectory of itself instead
    of making a branch'. The converted repository is significantly
    smaller if we ignore such revisions.

    Revision numbers are read, one per line, from "blacklist.txt" in the
    current directory into self.blacklist; lines starting with "#" and
    lines that are not integers are ignored.  self.blacklist is reset to
    an empty set before the file is opened, so it exists even when the
    open fails; the resulting IOError is left to the caller."""
    self.blacklist = set()
    blacklist = self.blacklist
    fp = open("blacklist.txt", "r")
    # Close the handle explicitly rather than leaving it to the
    # garbage collector.
    try:
        for line in fp:
            if line.startswith("#"):
                continue
            try:
                blacklist.add(int(line.strip()))
            except ValueError:
                pass # not an integer or a comment
    finally:
        fp.close()
131 131
def is_blacklisted(self, svn_rev):
    """Tell whether svn_rev is a member of self.blacklist."""
    skipped = self.blacklist
    return svn_rev in skipped
134 134
def reparent(self, module):
    """Point the RA session at self.base + module, logging the target URL."""
    target = (self.base + module).encode(self.encoding)
    self.ui.debug("reparent to %s\n" % target)
    svn.ra.reparent(self.ra, target)
139 139
140 140 def _fetch_revisions(self, from_revnum = 0, to_revnum = 347):
141 141 def get_entry_from_path(path, module=self.module):
142 142 # Given the repository url of this wc, say
143 143 # "http://server/plone/CMFPlone/branches/Plone-2_0-branch"
144 144 # extract the "entry" portion (a relative path) from what
145 145 # svn log --xml says, ie
146 146 # "/CMFPlone/branches/Plone-2_0-branch/tests/PloneTestCase.py"
147 147 # that is to say "tests/PloneTestCase.py"
148 148
149 149 if path.startswith(module):
150 150 relative = path[len(module):]
151 151 if relative.startswith('/'):
152 152 return relative[1:]
153 153 else:
154 154 return relative
155 155
156 156 # The path is outside our tracked tree...
157 157 self.ui.debug('Ignoring %r since it is not under %r\n' % (path, module))
158 158 return None
159 159
160 160 received = []
161 161 # svn.ra.get_log requires no other calls to the ra until it completes,
162 162 # so we just collect the log entries and parse them afterwards
163 163 def receivelog(*arg, **args):
164 164 received.append(arg)
165 165
166 166 self.child_cset = None
167 167 def parselogentry(*arg, **args):
168 168 orig_paths, revnum, author, date, message, pool = arg
169 169
170 170 if self.is_blacklisted(revnum):
171 171 self.ui.note('skipping blacklisted revision %d\n' % revnum)
172 172 return
173 173
174 174 self.ui.debug("parsing revision %d\n" % revnum)
175 175
176 176 if orig_paths is None:
177 177 self.ui.debug('revision %d has no entries\n' % revnum)
178 178 return
179 179
180 180 if revnum in self.modulemap:
181 181 new_module = self.modulemap[revnum]
182 182 if new_module != self.module:
183 183 self.module = new_module
184 184 self.reparent(self.module)
185 185
186 186 copyfrom = {} # Map of entrypath, revision for finding source of deleted revisions.
187 187 copies = {}
188 188 entries = []
189 189 rev = self.revid(revnum)
190 190 parents = []
191
192 # branch log might return entries for a parent we already have
193 if rev in self.commits:
194 return
195
191 196 try:
192 197 branch = self.module.split("/")[-1]
193 198 if branch == 'trunk':
194 199 branch = ''
195 200 except IndexError:
196 201 branch = None
197
202
198 203 for path in sorted(orig_paths):
199 204 # self.ui.write("path %s\n" % path)
200 205 if path == self.module: # Follow branching back in history
201 206 ent = orig_paths[path]
202 207 if ent:
203 208 if ent.copyfrom_path:
204 209 # ent.copyfrom_rev may not be the actual last revision
205 210 prev = self.latest(ent.copyfrom_path, ent.copyfrom_rev)
206 211 self.modulemap[prev] = ent.copyfrom_path
207 212 parents = [self.revid(prev, ent.copyfrom_path)]
208 213 self.ui.note('found parent of branch %s at %d: %s\n' % \
209 214 (self.module, prev, ent.copyfrom_path))
210 215 else:
211 216 self.ui.debug("No copyfrom path, don't know what to do.\n")
212 217 # Maybe it was added and there is no more history.
213 218 entrypath = get_entry_from_path(path, module=self.module)
214 219 # self.ui.write("entrypath %s\n" % entrypath)
215 220 if entrypath is None:
216 221 # Outside our area of interest
217 222 self.ui.debug("boring@%s: %s\n" % (revnum, path))
218 223 continue
219 224 entry = entrypath.decode(self.encoding)
220 225 ent = orig_paths[path]
221 226
222 227 kind = svn.ra.check_path(self.ra, entrypath, revnum)
223 228 if kind == svn.core.svn_node_file:
224 229 if ent.copyfrom_path:
225 230 copyfrom_path = get_entry_from_path(ent.copyfrom_path)
226 231 if copyfrom_path:
227 232 self.ui.debug("Copied to %s from %s@%s\n" % (entry, copyfrom_path, ent.copyfrom_rev))
228 233 # It's probably important for hg that the source
229 234 # exists in the revision's parent, not just the
230 235 # ent.copyfrom_rev
231 236 fromkind = svn.ra.check_path(self.ra, copyfrom_path, ent.copyfrom_rev)
232 237 if fromkind != 0:
233 238 copies[self.recode(entry)] = self.recode(copyfrom_path)
234 239 entries.append(self.recode(entry))
235 240 elif kind == 0: # gone, but had better be a deleted *file*
236 241 self.ui.debug("gone from %s\n" % ent.copyfrom_rev)
237 242
238 243 # if a branch is created but entries are removed in the same
239 244 # changeset, get the right fromrev
240 245 if parents:
241 246 uuid, old_module, fromrev = self.revsplit(parents[0])
242 247 else:
243 248 fromrev = revnum - 1
244 249 # might always need to be revnum - 1 in these 3 lines?
245 250 old_module = self.modulemap.get(fromrev, self.module)
246 251
247 252 basepath = old_module + "/" + get_entry_from_path(path, module=self.module)
248 253 entrypath = old_module + "/" + get_entry_from_path(path, module=self.module)
249 254
250 255 def lookup_parts(p):
251 256 rc = None
252 257 parts = p.split("/")
253 258 for i in range(len(parts)):
254 259 part = "/".join(parts[:i])
255 260 info = part, copyfrom.get(part, None)
256 261 if info[1] is not None:
257 262 self.ui.debug("Found parent directory %s\n" % info[1])
258 263 rc = info
259 264 return rc
260 265
261 266 self.ui.debug("base, entry %s %s\n" % (basepath, entrypath))
262 267
263 268 frompath, froment = lookup_parts(entrypath) or (None, revnum - 1)
264 269
265 270 # need to remove fragment from lookup_parts and replace with copyfrom_path
266 271 if frompath is not None:
267 272 self.ui.debug("munge-o-matic\n")
268 273 self.ui.debug(entrypath + '\n')
269 274 self.ui.debug(entrypath[len(frompath):] + '\n')
270 275 entrypath = froment.copyfrom_path + entrypath[len(frompath):]
271 276 fromrev = froment.copyfrom_rev
272 277 self.ui.debug("Info: %s %s %s %s\n" % (frompath, froment, ent, entrypath))
273 278
274 279 fromkind = svn.ra.check_path(self.ra, entrypath, fromrev)
275 280 if fromkind == svn.core.svn_node_file: # a deleted file
276 281 entries.append(self.recode(entry))
277 282 elif fromkind == svn.core.svn_node_dir:
278 283 # print "Deleted/moved non-file:", revnum, path, ent
279 284 # children = self._find_children(path, revnum - 1)
280 285 # print "find children %s@%d from %d action %s" % (path, revnum, ent.copyfrom_rev, ent.action)
281 286 # Sometimes this is tricky. For example: in
282 287 # The Subversion Repository revision 6940 a dir
283 288 # was copied and one of its files was deleted
284 289 # from the new location in the same commit. This
285 290 # code can't deal with that yet.
286 291 if ent.action == 'C':
287 292 children = self._find_children(path, fromrev)
288 293 else:
289 294 oroot = entrypath.strip('/')
290 295 nroot = path.strip('/')
291 296 children = self._find_children(oroot, fromrev)
292 297 children = [s.replace(oroot,nroot) for s in children]
293 298 # Mark all [files, not directories] as deleted.
294 299 for child in children:
295 300 # Can we move a child directory and its
296 301 # parent in the same commit? (probably can). Could
297 302 # cause problems if instead of revnum -1,
298 303 # we have to look in (copyfrom_path, revnum - 1)
299 304 entrypath = get_entry_from_path("/" + child, module=old_module)
300 305 if entrypath:
301 306 entry = self.recode(entrypath.decode(self.encoding))
302 307 if entry in copies:
303 308 # deleted file within a copy
304 309 del copies[entry]
305 310 else:
306 311 entries.append(entry)
307 312 else:
308 313 self.ui.debug('unknown path in revision %d: %s\n' % \
309 314 (revnum, path))
310 315 elif kind == svn.core.svn_node_dir:
311 316 # Should probably synthesize normal file entries
312 317 # and handle as above to clean up copy/rename handling.
313 318
314 319 # If the directory just had a prop change,
315 320 # then we shouldn't need to look for its children.
316 321 # Also this could create duplicate entries. Not sure
317 322 # whether this will matter. Maybe should make entries a set.
318 323 # print "Changed directory", revnum, path, ent.action, ent.copyfrom_path, ent.copyfrom_rev
319 324 # This will fail if a directory was copied
320 325 # from another branch and then some of its files
321 326 # were deleted in the same transaction.
322 327 children = self._find_children(path, revnum)
323 328 children.sort()
324 329 for child in children:
325 330 # Can we move a child directory and its
326 331 # parent in the same commit? (probably can). Could
327 332 # cause problems if instead of revnum -1,
328 333 # we have to look in (copyfrom_path, revnum - 1)
329 334 entrypath = get_entry_from_path("/" + child, module=self.module)
330 335 # print child, self.module, entrypath
331 336 if entrypath:
332 337 # Need to filter out directories here...
333 338 kind = svn.ra.check_path(self.ra, entrypath, revnum)
334 339 if kind != svn.core.svn_node_dir:
335 340 entries.append(self.recode(entrypath))
336 341
337 342 # Copies here (must copy all from source)
338 343 # Probably not a real problem for us if
339 344 # source does not exist
340 345
341 346 # Can do this with the copy command "hg copy"
342 347 # if ent.copyfrom_path:
343 348 # copyfrom_entry = get_entry_from_path(ent.copyfrom_path.decode(self.encoding),
344 349 # module=self.module)
345 350 # copyto_entry = entrypath
346 351 #
347 352 # print "copy directory", copyfrom_entry, 'to', copyto_entry
348 353 #
349 354 # copies.append((copyfrom_entry, copyto_entry))
350 355
351 356 if ent.copyfrom_path:
352 357 copyfrom_path = ent.copyfrom_path.decode(self.encoding)
353 358 copyfrom_entry = get_entry_from_path(copyfrom_path, module=self.module)
354 359 if copyfrom_entry:
355 360 copyfrom[path] = ent
356 361 self.ui.debug("mark %s came from %s\n" % (path, copyfrom[path]))
357 362
358 363 # Good, /probably/ a regular copy. Really should check
359 364 # to see whether the parent revision actually contains
360 365 # the directory in question.
361 366 children = self._find_children(self.recode(copyfrom_path), ent.copyfrom_rev)
362 367 children.sort()
363 368 for child in children:
364 369 entrypath = get_entry_from_path("/" + child, module=self.module)
365 370 if entrypath:
366 371 entry = entrypath.decode(self.encoding)
367 372 # print "COPY COPY From", copyfrom_entry, entry
368 373 copyto_path = path + entry[len(copyfrom_entry):]
369 374 copyto_entry = get_entry_from_path(copyto_path, module=self.module)
370 375 # print "COPY", entry, "COPY To", copyto_entry
371 376 copies[self.recode(copyto_entry)] = self.recode(entry)
372 377 # copy from quux splort/quuxfile
373 378
374 379 self.modulemap[revnum] = self.module # track backwards in time
375 380 # a list of (filename, id) where id lets us retrieve the file.
376 381 # eg in git, id is the object hash. for svn it'll be the
377 382 self.files[rev] = zip(entries, [rev] * len(entries))
378 383 if not entries:
379 384 return
380 385
381 386 # Example SVN datetime. Includes microseconds.
382 387 # ISO-8601 conformant
383 388 # '2007-01-04T17:35:00.902377Z'
384 389 date = util.parsedate(date[:18] + " UTC", ["%Y-%m-%dT%H:%M:%S"])
385 390
386 391 log = message and self.recode(message)
387 392 author = author and self.recode(author) or ''
388 393
389 394 cset = commit(author=author,
390 395 date=util.datestr(date),
391 396 desc=log,
392 397 parents=parents,
393 398 copies=copies,
394 399 branch=branch)
395 400
396 401 self.commits[rev] = cset
397 402 if self.child_cset and not self.child_cset.parents:
398 403 self.child_cset.parents = [rev]
399 404 self.child_cset = cset
400 405
401 406 self.ui.note('fetching revision log for "%s" from %d to %d\n' % \
402 407 (self.module, from_revnum, to_revnum))
403 408
404 409 try:
405 410 discover_changed_paths = True
406 411 strict_node_history = False
407 412 svn.ra.get_log(self.ra, [self.module], from_revnum, to_revnum, 0,
408 413 discover_changed_paths, strict_node_history,
409 414 receivelog)
410 415 for entry in received:
411 416 parselogentry(*entry)
412 417 except SubversionException, (_, num):
413 418 if num == svn.core.SVN_ERR_FS_NO_SUCH_REVISION:
414 419 raise NoSuchRevision(branch=self,
415 420 revision="Revision number %d" % to_revnum)
416 421 raise
417 422
def setrevmap(self, revmap):
    """Record, per module, the highest revision number found in revmap.

    Each key of revmap is split with self.revsplit(); the resulting
    module -> revnum mapping replaces self.lastrevs.
    """
    newest = {}
    for key in revmap.keys():
        module, revnum = self.revsplit(key)[1:]
        if module not in newest or revnum > newest[module]:
            newest[module] = revnum
    self.lastrevs = newest
426 431
def getheads(self):
    """Return the list of head revision ids, storing it in self.heads.

    When the listing of self.url contains both 'branches' and 'trunk',
    self.module is moved to its trunk subdirectory, self.head is
    recomputed, and one extra head is added per entry found under
    branches.  Otherwise the existing self.head is the only head.
    """
    # detect standard /branches, /tags, /trunk layout
    optrev = svn.core.svn_opt_revision_t()
    optrev.kind = svn.core.svn_opt_revision_number
    optrev.value.number = self.last_changed
    rpath = self.url.strip('/')
    listing = svn.client.ls(rpath, optrev, False, self.ctx)

    if 'branches' not in listing or 'trunk' not in listing:
        self.heads = [self.head]
        return self.heads

    self.module += '/trunk'
    trunkrev = self.latest(self.module, self.last_changed)
    self.head = self.revid(trunkrev)
    self.heads = [self.head]
    found = svn.client.ls(rpath + '/branches', optrev, False, self.ctx)
    for name in found.keys():
        module = '/branches/' + name
        brevnum = self.latest(module, self.last_changed)
        brev = self.revid(brevnum, module)
        self.ui.note('found branch %s at %d\n' % (name, brevnum))
        self.heads.append(brev)
    return self.heads
449 454
450 455 def _getfile(self, file, rev):
451 456 io = StringIO()
452 457 # TODO: ra.get_file transmits the whole file instead of diffs.
453 458 mode = ''
454 459 try:
455 460 revnum = self.revnum(rev)
456 461 if self.module != self.modulemap[revnum]:
457 462 self.module = self.modulemap[revnum]
458 463 self.reparent(self.module)
459 464 info = svn.ra.get_file(self.ra, file, revnum, io)
460 465 if isinstance(info, list):
461 466 info = info[-1]
462 467 mode = ("svn:executable" in info) and 'x' or ''
463 468 mode = ("svn:special" in info) and 'l' or mode
464 469 except SubversionException, e:
465 470 notfound = (svn.core.SVN_ERR_FS_NOT_FOUND,
466 471 svn.core.SVN_ERR_RA_DAV_PATH_NOT_FOUND)
467 472 if e.apr_err in notfound: # File not found
468 473 raise IOError()
469 474 raise
470 475 data = io.getvalue()
471 476 if mode == 'l':
472 477 link_prefix = "link "
473 478 if data.startswith(link_prefix):
474 479 data = data[len(link_prefix):]
475 480 return data, mode
476 481
def getfile(self, file, rev):
    """Return the data of file at rev and remember its mode in self.modecache."""
    contents, flags = self._getfile(file, rev)
    self.modecache[(file, rev)] = flags
    return contents
481 486
def getmode(self, file, rev):
    """Return the mode cached for (file, rev); KeyError if it was never fetched."""
    key = (file, rev)
    return self.modecache[key]
484 489
def getchanges(self, rev):
    """Return the sorted file list recorded for rev and forget it.

    Also resets self.modecache to an empty dict.
    """
    self.modecache = {}
    self.files[rev].sort()
    # caller caches the result, so free it here to release memory
    return self.files.pop(rev)
491 498
def getcommit(self, rev):
    """Return the commit recorded for rev and drop it from self.commits.

    When rev has not been fetched yet, the session is switched to the
    module named in rev and the log is fetched from rev's revision number
    to the last revision recorded for that module in self.lastrevs
    (0 if none).
    """
    if rev not in self.commits:
        uuid, module, revnum = self.revsplit(rev)
        self.module = module
        self.reparent(module)
        self._fetch_revisions(from_revnum=revnum,
                              to_revnum=self.lastrevs.get(module, 0))
    # caller caches the result, so free it here to release memory
    return self.commits.pop(rev)
500 510
def gettags(self):
    """Return a dict mapping tag names to revision ids.

    The log of '/tags' is read from revision 0 up to the revision number
    of self.head.  Every changed path below '/tags/' becomes a tag named
    after the part of the path following '/tags/', pointing at the
    revision id built from the entry's copyfrom_rev and copyfrom_path.

    Returns an empty dict when reading the log raises SubversionException.
    """
    tags = {}
    def parselogentry(*arg, **args):
        orig_paths, revnum, author, date, message, pool = arg
        if orig_paths is None:
            # A log entry may carry no changed-path information; there is
            # nothing to scan for tags in that case.
            return
        for path in orig_paths:
            if not path.startswith('/tags/'):
                continue
            ent = orig_paths[path]
            source = ent.copyfrom_path
            rev = ent.copyfrom_rev
            tag = path.split('/', 2)[2]
            tags[tag] = self.revid(rev, module=source)

    start = self.revnum(self.head)
    try:
        svn.ra.get_log(self.ra, ['/tags'], 0, start, 0, True, False,
                       parselogentry)
        return tags
    except SubversionException:
        self.ui.note('no tags found at revision %d\n' % start)
        return {}
522 532
def _find_children(self, path, revnum):
    """List the non-directory entries below path at revision revnum.

    Returns a list of paths, each prefixed with path (stripped of
    surrounding slashes).  The RA-based listing is tried first; when the
    bindings return a dict from get_dir/get_dir2, the instance is flagged
    (self._find_children_fallback) and this and all later calls use
    svn.client.ls instead.
    """
    path = path.strip("/")

    def _find_children_fallback(path, revnum):
        # SWIG python bindings for getdir are broken up to at least 1.4.3
        pool = Pool()
        optrev = svn.core.svn_opt_revision_t()
        optrev.kind = svn.core.svn_opt_revision_number
        optrev.value.number = revnum
        rpath = '/'.join([self.base, path]).strip('/')
        # The fifth-from-last positional argument (True) is passed where the
        # top-level listing in getheads passes False.
        return ['%s/%s' % (path, x) for x in svn.client.ls(rpath, optrev, True, self.ctx, pool).keys()]

    # The attribute only exists once an earlier call hit the broken binding.
    if hasattr(self, '_find_children_fallback'):
        return _find_children_fallback(path, revnum)

    # Paths handed to get_dir below are relative to this directory.
    self.reparent("/" + path)
    pool = Pool()

    children = []
    def find_children_inner(children, path, revnum = revnum):
        # Recursively append the non-directory entries below path to children.
        if hasattr(svn.ra, 'get_dir2'): # Since SVN 1.4
            fields = 0xffffffff # Binding does not provide SVN_DIRENT_ALL
            getdir = svn.ra.get_dir2(self.ra, path, revnum, fields, pool)
        else:
            getdir = svn.ra.get_dir(self.ra, path, revnum, pool)
        if type(getdir) == dict:
            # python binding for getdir is broken up to at least 1.4.3
            raise CompatibilityException()
        dirents = getdir[0]
        if type(dirents) == int:
            # got here once due to infinite recursion bug
            # pprint.pprint(getdir)
            return
        c = dirents.keys()
        c.sort()
        for child in c:
            dirent = dirents[child]
            if dirent.kind == svn.core.svn_node_dir:
                find_children_inner(children, (path + "/" + child).strip("/"))
            else:
                children.append((path + "/" + child).strip("/"))

    try:
        find_children_inner(children, "")
    except CompatibilityException:
        # Remember the failure, restore the session, and use the fallback.
        self._find_children_fallback = True
        self.reparent(self.module)
        return _find_children_fallback(path, revnum)

    # Restore the session to the module before returning.
    self.reparent(self.module)
    return [path + "/" + c for c in children]
General Comments 0
You need to be logged in to leave comments. Login now