##// END OF EJS Templates
diff-match: python3/code fixes for backport code
super-admin -
r4954:f53817a4 default
parent child Browse files
Show More
This diff has been collapsed as it changes many lines, (3652 lines changed) Show them Hide them
@@ -1,6 +1,3 b''
1
2
3
4 1 """Diff Match and Patch
5 2
6 3 Copyright 2006 Google Inc.
@@ -25,7 +22,7 b' Computes the difference between two text'
25 22 Applies the patch onto another text, allowing for errors.
26 23 """
27 24
28 __author__ = 'fraser@google.com (Neil Fraser)'
25 __author__ = "fraser@google.com (Neil Fraser)"
29 26
30 27 import math
31 28 import re
@@ -35,1885 +32,2000 b' import urllib.request, urllib.parse, url'
35 32
36 33
37 34 class diff_match_patch:
38 """Class containing the diff, match and patch methods.
39
40 Also contains the behaviour settings.
41 """
42
43 def __init__(self):
44 """Inits a diff_match_patch object with default settings.
45 Redefine these in your program to override the defaults.
46 """
47
48 # Number of seconds to map a diff before giving up (0 for infinity).
49 self.Diff_Timeout = 1.0
50 # Cost of an empty edit operation in terms of edit characters.
51 self.Diff_EditCost = 4
52 # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
53 self.Match_Threshold = 0.5
54 # How far to search for a match (0 = exact location, 1000+ = broad match).
55 # A match this many characters away from the expected location will add
56 # 1.0 to the score (0.0 is a perfect match).
57 self.Match_Distance = 1000
58 # When deleting a large block of text (over ~64 characters), how close do
59 # the contents have to be to match the expected contents. (0.0 = perfection,
60 # 1.0 = very loose). Note that Match_Threshold controls how closely the
61 # end points of a delete need to match.
62 self.Patch_DeleteThreshold = 0.5
63 # Chunk size for context length.
64 self.Patch_Margin = 4
65
66 # The number of bits in an int.
67 # Python has no maximum, thus to disable patch splitting set to 0.
68 # However to avoid long patches in certain pathological cases, use 32.
69 # Multiple short patches (using native ints) are much faster than long ones.
70 self.Match_MaxBits = 32
71
72 # DIFF FUNCTIONS
73
74 # The data structure representing a diff is an array of tuples:
75 # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
76 # which means: delete "Hello", add "Goodbye" and keep " world."
77 DIFF_DELETE = -1
78 DIFF_INSERT = 1
79 DIFF_EQUAL = 0
80
81 def diff_main(self, text1, text2, checklines=True, deadline=None):
82 """Find the differences between two texts. Simplifies the problem by
83 stripping any common prefix or suffix off the texts before diffing.
84
85 Args:
86 text1: Old string to be diffed.
87 text2: New string to be diffed.
88 checklines: Optional speedup flag. If present and false, then don't run
89 a line-level diff first to identify the changed areas.
90 Defaults to true, which does a faster, slightly less optimal diff.
91 deadline: Optional time when the diff should be complete by. Used
92 internally for recursive calls. Users should set DiffTimeout instead.
93
94 Returns:
95 Array of changes.
96 """
97 # Set a deadline by which time the diff must be complete.
98 if deadline is None:
99 # Unlike in most languages, Python counts time in seconds.
100 if self.Diff_Timeout <= 0:
101 deadline = sys.maxint
102 else:
103 deadline = time.time() + self.Diff_Timeout
104
105 # Check for null inputs.
106 if text1 is None or text2 is None:
107 raise ValueError("Null inputs. (diff_main)")
108
109 # Check for equality (speedup).
110 if text1 == text2:
111 if text1:
112 return [(self.DIFF_EQUAL, text1)]
113 return []
114
115 # Trim off common prefix (speedup).
116 commonlength = self.diff_commonPrefix(text1, text2)
117 commonprefix = text1[:commonlength]
118 text1 = text1[commonlength:]
119 text2 = text2[commonlength:]
120
121 # Trim off common suffix (speedup).
122 commonlength = self.diff_commonSuffix(text1, text2)
123 if commonlength == 0:
124 commonsuffix = ''
125 else:
126 commonsuffix = text1[-commonlength:]
127 text1 = text1[:-commonlength]
128 text2 = text2[:-commonlength]
129
130 # Compute the diff on the middle block.
131 diffs = self.diff_compute(text1, text2, checklines, deadline)
132
133 # Restore the prefix and suffix.
134 if commonprefix:
135 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
136 if commonsuffix:
137 diffs.append((self.DIFF_EQUAL, commonsuffix))
138 self.diff_cleanupMerge(diffs)
139 return diffs
140
141 def diff_compute(self, text1, text2, checklines, deadline):
142 """Find the differences between two texts. Assumes that the texts do not
143 have any common prefix or suffix.
144
145 Args:
146 text1: Old string to be diffed.
147 text2: New string to be diffed.
148 checklines: Speedup flag. If false, then don't run a line-level diff
149 first to identify the changed areas.
150 If true, then run a faster, slightly less optimal diff.
151 deadline: Time when the diff should be complete by.
152
153 Returns:
154 Array of changes.
155 """
156 if not text1:
157 # Just add some text (speedup).
158 return [(self.DIFF_INSERT, text2)]
159
160 if not text2:
161 # Just delete some text (speedup).
162 return [(self.DIFF_DELETE, text1)]
163
164 if len(text1) > len(text2):
165 (longtext, shorttext) = (text1, text2)
166 else:
167 (shorttext, longtext) = (text1, text2)
168 i = longtext.find(shorttext)
169 if i != -1:
170 # Shorter text is inside the longer text (speedup).
171 diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
172 (self.DIFF_INSERT, longtext[i + len(shorttext):])]
173 # Swap insertions for deletions if diff is reversed.
174 if len(text1) > len(text2):
175 diffs[0] = (self.DIFF_DELETE, diffs[0][1])
176 diffs[2] = (self.DIFF_DELETE, diffs[2][1])
177 return diffs
178
179 if len(shorttext) == 1:
180 # Single character string.
181 # After the previous speedup, the character can't be an equality.
182 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
183
184 # Check to see if the problem can be split in two.
185 hm = self.diff_halfMatch(text1, text2)
186 if hm:
187 # A half-match was found, sort out the return data.
188 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
189 # Send both pairs off for separate processing.
190 diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)
191 diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)
192 # Merge the results.
193 return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
194
195 if checklines and len(text1) > 100 and len(text2) > 100:
196 return self.diff_lineMode(text1, text2, deadline)
197
198 return self.diff_bisect(text1, text2, deadline)
199
200 def diff_lineMode(self, text1, text2, deadline):
201 """Do a quick line-level diff on both strings, then rediff the parts for
202 greater accuracy.
203 This speedup can produce non-minimal diffs.
204
205 Args:
206 text1: Old string to be diffed.
207 text2: New string to be diffed.
208 deadline: Time when the diff should be complete by.
209
210 Returns:
211 Array of changes.
212 """
213
214 # Scan the text on a line-by-line basis first.
215 (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
216
217 diffs = self.diff_main(text1, text2, False, deadline)
218
219 # Convert the diff back to original text.
220 self.diff_charsToLines(diffs, linearray)
221 # Eliminate freak matches (e.g. blank lines)
222 self.diff_cleanupSemantic(diffs)
223
224 # Rediff any replacement blocks, this time character-by-character.
225 # Add a dummy entry at the end.
226 diffs.append((self.DIFF_EQUAL, ''))
227 pointer = 0
228 count_delete = 0
229 count_insert = 0
230 text_delete = ''
231 text_insert = ''
232 while pointer < len(diffs):
233 if diffs[pointer][0] == self.DIFF_INSERT:
234 count_insert += 1
235 text_insert += diffs[pointer][1]
236 elif diffs[pointer][0] == self.DIFF_DELETE:
237 count_delete += 1
238 text_delete += diffs[pointer][1]
239 elif diffs[pointer][0] == self.DIFF_EQUAL:
240 # Upon reaching an equality, check for prior redundancies.
241 if count_delete >= 1 and count_insert >= 1:
242 # Delete the offending records and add the merged ones.
243 a = self.diff_main(text_delete, text_insert, False, deadline)
244 diffs[pointer - count_delete - count_insert : pointer] = a
245 pointer = pointer - count_delete - count_insert + len(a)
246 count_insert = 0
247 count_delete = 0
248 text_delete = ''
249 text_insert = ''
250
251 pointer += 1
252
253 diffs.pop() # Remove the dummy entry at the end.
254
255 return diffs
256
257 def diff_bisect(self, text1, text2, deadline):
258 """Find the 'middle snake' of a diff, split the problem in two
259 and return the recursively constructed diff.
260 See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
261
262 Args:
263 text1: Old string to be diffed.
264 text2: New string to be diffed.
265 deadline: Time at which to bail if not yet complete.
266
267 Returns:
268 Array of diff tuples.
269 """
270
271 # Cache the text lengths to prevent multiple calls.
272 text1_length = len(text1)
273 text2_length = len(text2)
274 max_d = (text1_length + text2_length + 1) // 2
275 v_offset = max_d
276 v_length = 2 * max_d
277 v1 = [-1] * v_length
278 v1[v_offset + 1] = 0
279 v2 = v1[:]
280 delta = text1_length - text2_length
281 # If the total number of characters is odd, then the front path will
282 # collide with the reverse path.
283 front = (delta % 2 != 0)
284 # Offsets for start and end of k loop.
285 # Prevents mapping of space beyond the grid.
286 k1start = 0
287 k1end = 0
288 k2start = 0
289 k2end = 0
290 for d in range(max_d):
291 # Bail out if deadline is reached.
292 if time.time() > deadline:
293 break
294
295 # Walk the front path one step.
296 for k1 in range(-d + k1start, d + 1 - k1end, 2):
297 k1_offset = v_offset + k1
298 if k1 == -d or (k1 != d and
299 v1[k1_offset - 1] < v1[k1_offset + 1]):
300 x1 = v1[k1_offset + 1]
301 else:
302 x1 = v1[k1_offset - 1] + 1
303 y1 = x1 - k1
304 while (x1 < text1_length and y1 < text2_length and
305 text1[x1] == text2[y1]):
306 x1 += 1
307 y1 += 1
308 v1[k1_offset] = x1
309 if x1 > text1_length:
310 # Ran off the right of the graph.
311 k1end += 2
312 elif y1 > text2_length:
313 # Ran off the bottom of the graph.
314 k1start += 2
315 elif front:
316 k2_offset = v_offset + delta - k1
317 if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:
318 # Mirror x2 onto top-left coordinate system.
319 x2 = text1_length - v2[k2_offset]
320 if x1 >= x2:
321 # Overlap detected.
322 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
323
324 # Walk the reverse path one step.
325 for k2 in range(-d + k2start, d + 1 - k2end, 2):
326 k2_offset = v_offset + k2
327 if k2 == -d or (k2 != d and
328 v2[k2_offset - 1] < v2[k2_offset + 1]):
329 x2 = v2[k2_offset + 1]
330 else:
331 x2 = v2[k2_offset - 1] + 1
332 y2 = x2 - k2
333 while (x2 < text1_length and y2 < text2_length and
334 text1[-x2 - 1] == text2[-y2 - 1]):
335 x2 += 1
336 y2 += 1
337 v2[k2_offset] = x2
338 if x2 > text1_length:
339 # Ran off the left of the graph.
340 k2end += 2
341 elif y2 > text2_length:
342 # Ran off the top of the graph.
343 k2start += 2
344 elif not front:
345 k1_offset = v_offset + delta - k2
346 if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:
347 x1 = v1[k1_offset]
348 y1 = v_offset + x1 - k1_offset
349 # Mirror x2 onto top-left coordinate system.
350 x2 = text1_length - x2
351 if x1 >= x2:
352 # Overlap detected.
353 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
354
355 # Diff took too long and hit the deadline or
356 # number of diffs equals number of characters, no commonality at all.
357 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
358
359 def diff_bisectSplit(self, text1, text2, x, y, deadline):
360 """Given the location of the 'middle snake', split the diff in two parts
361 and recurse.
362
363 Args:
364 text1: Old string to be diffed.
365 text2: New string to be diffed.
366 x: Index of split point in text1.
367 y: Index of split point in text2.
368 deadline: Time at which to bail if not yet complete.
369
370 Returns:
371 Array of diff tuples.
372 """
373 text1a = text1[:x]
374 text2a = text2[:y]
375 text1b = text1[x:]
376 text2b = text2[y:]
377
378 # Compute both diffs serially.
379 diffs = self.diff_main(text1a, text2a, False, deadline)
380 diffsb = self.diff_main(text1b, text2b, False, deadline)
381
382 return diffs + diffsb
383
384 def diff_linesToChars(self, text1, text2):
385 """Split two texts into an array of strings. Reduce the texts to a string
386 of hashes where each Unicode character represents one line.
35 """Class containing the diff, match and patch methods.
387 36
388 Args:
389 text1: First string.
390 text2: Second string.
391
392 Returns:
393 Three element tuple, containing the encoded text1, the encoded text2 and
394 the array of unique strings. The zeroth element of the array of unique
395 strings is intentionally blank.
396 """
397 lineArray = [] # e.g. lineArray[4] == "Hello\n"
398 lineHash = {} # e.g. lineHash["Hello\n"] == 4
399
400 # "\x00" is a valid character, but various debuggers don't like it.
401 # So we'll insert a junk entry to avoid generating a null character.
402 lineArray.append('')
403
404 def diff_linesToCharsMunge(text):
405 """Split a text into an array of strings. Reduce the texts to a string
406 of hashes where each Unicode character represents one line.
407 Modifies linearray and linehash through being a closure.
408
409 Args:
410 text: String to encode.
411
412 Returns:
413 Encoded string.
414 """
415 chars = []
416 # Walk the text, pulling out a substring for each line.
417 # text.split('\n') would would temporarily double our memory footprint.
418 # Modifying text would create many large strings to garbage collect.
419 lineStart = 0
420 lineEnd = -1
421 while lineEnd < len(text) - 1:
422 lineEnd = text.find('\n', lineStart)
423 if lineEnd == -1:
424 lineEnd = len(text) - 1
425 line = text[lineStart:lineEnd + 1]
426 lineStart = lineEnd + 1
427
428 if line in lineHash:
429 chars.append(unichr(lineHash[line]))
430 else:
431 lineArray.append(line)
432 lineHash[line] = len(lineArray) - 1
433 chars.append(unichr(len(lineArray) - 1))
434 return "".join(chars)
435
436 chars1 = diff_linesToCharsMunge(text1)
437 chars2 = diff_linesToCharsMunge(text2)
438 return (chars1, chars2, lineArray)
439
440 def diff_charsToLines(self, diffs, lineArray):
441 """Rehydrate the text in a diff from a string of line hashes to real lines
442 of text.
443
444 Args:
445 diffs: Array of diff tuples.
446 lineArray: Array of unique strings.
447 """
448 for x in range(len(diffs)):
449 text = []
450 for char in diffs[x][1]:
451 text.append(lineArray[ord(char)])
452 diffs[x] = (diffs[x][0], "".join(text))
453
454 def diff_commonPrefix(self, text1, text2):
455 """Determine the common prefix of two strings.
456
457 Args:
458 text1: First string.
459 text2: Second string.
460
461 Returns:
462 The number of characters common to the start of each string.
463 """
464 # Quick check for common null cases.
465 if not text1 or not text2 or text1[0] != text2[0]:
466 return 0
467 # Binary search.
468 # Performance analysis: http://neil.fraser.name/news/2007/10/09/
469 pointermin = 0
470 pointermax = min(len(text1), len(text2))
471 pointermid = pointermax
472 pointerstart = 0
473 while pointermin < pointermid:
474 if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
475 pointermin = pointermid
476 pointerstart = pointermin
477 else:
478 pointermax = pointermid
479 pointermid = (pointermax - pointermin) // 2 + pointermin
480 return pointermid
481
482 def diff_commonSuffix(self, text1, text2):
483 """Determine the common suffix of two strings.
484
485 Args:
486 text1: First string.
487 text2: Second string.
488
489 Returns:
490 The number of characters common to the end of each string.
491 """
492 # Quick check for common null cases.
493 if not text1 or not text2 or text1[-1] != text2[-1]:
494 return 0
495 # Binary search.
496 # Performance analysis: http://neil.fraser.name/news/2007/10/09/
497 pointermin = 0
498 pointermax = min(len(text1), len(text2))
499 pointermid = pointermax
500 pointerend = 0
501 while pointermin < pointermid:
502 if (text1[-pointermid:len(text1) - pointerend] ==
503 text2[-pointermid:len(text2) - pointerend]):
504 pointermin = pointermid
505 pointerend = pointermin
506 else:
507 pointermax = pointermid
508 pointermid = (pointermax - pointermin) // 2 + pointermin
509 return pointermid
510
511 def diff_commonOverlap(self, text1, text2):
512 """Determine if the suffix of one string is the prefix of another.
513
514 Args:
515 text1 First string.
516 text2 Second string.
517
518 Returns:
519 The number of characters common to the end of the first
520 string and the start of the second string.
521 """
522 # Cache the text lengths to prevent multiple calls.
523 text1_length = len(text1)
524 text2_length = len(text2)
525 # Eliminate the null case.
526 if text1_length == 0 or text2_length == 0:
527 return 0
528 # Truncate the longer string.
529 if text1_length > text2_length:
530 text1 = text1[-text2_length:]
531 elif text1_length < text2_length:
532 text2 = text2[:text1_length]
533 text_length = min(text1_length, text2_length)
534 # Quick check for the worst case.
535 if text1 == text2:
536 return text_length
537
538 # Start by looking for a single character match
539 # and increase length until no match is found.
540 # Performance analysis: http://neil.fraser.name/news/2010/11/04/
541 best = 0
542 length = 1
543 while True:
544 pattern = text1[-length:]
545 found = text2.find(pattern)
546 if found == -1:
547 return best
548 length += found
549 if found == 0 or text1[-length:] == text2[:length]:
550 best = length
551 length += 1
552
553 def diff_halfMatch(self, text1, text2):
554 """Do the two texts share a substring which is at least half the length of
555 the longer text?
556 This speedup can produce non-minimal diffs.
557
558 Args:
559 text1: First string.
560 text2: Second string.
561
562 Returns:
563 Five element Array, containing the prefix of text1, the suffix of text1,
564 the prefix of text2, the suffix of text2 and the common middle. Or None
565 if there was no match.
566 """
567 if self.Diff_Timeout <= 0:
568 # Don't risk returning a non-optimal diff if we have unlimited time.
569 return None
570 if len(text1) > len(text2):
571 (longtext, shorttext) = (text1, text2)
572 else:
573 (shorttext, longtext) = (text1, text2)
574 if len(longtext) < 4 or len(shorttext) * 2 < len(longtext):
575 return None # Pointless.
576
577 def diff_halfMatchI(longtext, shorttext, i):
578 """Does a substring of shorttext exist within longtext such that the
579 substring is at least half the length of longtext?
580 Closure, but does not reference any external variables.
581
582 Args:
583 longtext: Longer string.
584 shorttext: Shorter string.
585 i: Start index of quarter length substring within longtext.
586
587 Returns:
588 Five element Array, containing the prefix of longtext, the suffix of
589 longtext, the prefix of shorttext, the suffix of shorttext and the
590 common middle. Or None if there was no match.
591 """
592 seed = longtext[i:i + len(longtext) // 4]
593 best_common = ''
594 j = shorttext.find(seed)
595 while j != -1:
596 prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
597 suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
598 if len(best_common) < suffixLength + prefixLength:
599 best_common = (shorttext[j - suffixLength:j] +
600 shorttext[j:j + prefixLength])
601 best_longtext_a = longtext[:i - suffixLength]
602 best_longtext_b = longtext[i + prefixLength:]
603 best_shorttext_a = shorttext[:j - suffixLength]
604 best_shorttext_b = shorttext[j + prefixLength:]
605 j = shorttext.find(seed, j + 1)
606
607 if len(best_common) * 2 >= len(longtext):
608 return (best_longtext_a, best_longtext_b,
609 best_shorttext_a, best_shorttext_b, best_common)
610 else:
611 return None
612
613 # First check if the second quarter is the seed for a half-match.
614 hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4)
615 # Check again based on the third quarter.
616 hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2)
617 if not hm1 and not hm2:
618 return None
619 elif not hm2:
620 hm = hm1
621 elif not hm1:
622 hm = hm2
623 else:
624 # Both matched. Select the longest.
625 if len(hm1[4]) > len(hm2[4]):
626 hm = hm1
627 else:
628 hm = hm2
629
630 # A half-match was found, sort out the return data.
631 if len(text1) > len(text2):
632 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
633 else:
634 (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
635 return (text1_a, text1_b, text2_a, text2_b, mid_common)
636
637 def diff_cleanupSemantic(self, diffs):
638 """Reduce the number of edits by eliminating semantically trivial
639 equalities.
640
641 Args:
642 diffs: Array of diff tuples.
643 """
644 changes = False
645 equalities = [] # Stack of indices where equalities are found.
646 lastequality = None # Always equal to diffs[equalities[-1]][1]
647 pointer = 0 # Index of current position.
648 # Number of chars that changed prior to the equality.
649 length_insertions1, length_deletions1 = 0, 0
650 # Number of chars that changed after the equality.
651 length_insertions2, length_deletions2 = 0, 0
652 while pointer < len(diffs):
653 if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
654 equalities.append(pointer)
655 length_insertions1, length_insertions2 = length_insertions2, 0
656 length_deletions1, length_deletions2 = length_deletions2, 0
657 lastequality = diffs[pointer][1]
658 else: # An insertion or deletion.
659 if diffs[pointer][0] == self.DIFF_INSERT:
660 length_insertions2 += len(diffs[pointer][1])
661 else:
662 length_deletions2 += len(diffs[pointer][1])
663 # Eliminate an equality that is smaller or equal to the edits on both
664 # sides of it.
665 if (lastequality and (len(lastequality) <=
666 max(length_insertions1, length_deletions1)) and
667 (len(lastequality) <= max(length_insertions2, length_deletions2))):
668 # Duplicate record.
669 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
670 # Change second copy to insert.
671 diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
672 diffs[equalities[-1] + 1][1])
673 # Throw away the equality we just deleted.
674 equalities.pop()
675 # Throw away the previous equality (it needs to be reevaluated).
676 if len(equalities):
677 equalities.pop()
678 if len(equalities):
679 pointer = equalities[-1]
680 else:
681 pointer = -1
682 # Reset the counters.
683 length_insertions1, length_deletions1 = 0, 0
684 length_insertions2, length_deletions2 = 0, 0
685 lastequality = None
686 changes = True
687 pointer += 1
688
689 # Normalize the diff.
690 if changes:
691 self.diff_cleanupMerge(diffs)
692 self.diff_cleanupSemanticLossless(diffs)
693
694 # Find any overlaps between deletions and insertions.
695 # e.g: <del>abcxxx</del><ins>xxxdef</ins>
696 # -> <del>abc</del>xxx<ins>def</ins>
697 # e.g: <del>xxxabc</del><ins>defxxx</ins>
698 # -> <ins>def</ins>xxx<del>abc</del>
699 # Only extract an overlap if it is as big as the edit ahead or behind it.
700 pointer = 1
701 while pointer < len(diffs):
702 if (diffs[pointer - 1][0] == self.DIFF_DELETE and
703 diffs[pointer][0] == self.DIFF_INSERT):
704 deletion = diffs[pointer - 1][1]
705 insertion = diffs[pointer][1]
706 overlap_length1 = self.diff_commonOverlap(deletion, insertion)
707 overlap_length2 = self.diff_commonOverlap(insertion, deletion)
708 if overlap_length1 >= overlap_length2:
709 if (overlap_length1 >= len(deletion) / 2.0 or
710 overlap_length1 >= len(insertion) / 2.0):
711 # Overlap found. Insert an equality and trim the surrounding edits.
712 diffs.insert(pointer, (self.DIFF_EQUAL,
713 insertion[:overlap_length1]))
714 diffs[pointer - 1] = (self.DIFF_DELETE,
715 deletion[:len(deletion) - overlap_length1])
716 diffs[pointer + 1] = (self.DIFF_INSERT,
717 insertion[overlap_length1:])
718 pointer += 1
719 else:
720 if (overlap_length2 >= len(deletion) / 2.0 or
721 overlap_length2 >= len(insertion) / 2.0):
722 # Reverse overlap found.
723 # Insert an equality and swap and trim the surrounding edits.
724 diffs.insert(pointer, (self.DIFF_EQUAL, deletion[:overlap_length2]))
725 diffs[pointer - 1] = (self.DIFF_INSERT,
726 insertion[:len(insertion) - overlap_length2])
727 diffs[pointer + 1] = (self.DIFF_DELETE, deletion[overlap_length2:])
728 pointer += 1
729 pointer += 1
730 pointer += 1
731
732 def diff_cleanupSemanticLossless(self, diffs):
733 """Look for single edits surrounded on both sides by equalities
734 which can be shifted sideways to align the edit to a word boundary.
735 e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
736
737 Args:
738 diffs: Array of diff tuples.
37 Also contains the behaviour settings.
739 38 """
740 39
741 def diff_cleanupSemanticScore(one, two):
742 """Given two strings, compute a score representing whether the
743 internal boundary falls on logical boundaries.
744 Scores range from 6 (best) to 0 (worst).
745 Closure, but does not reference any external variables.
40 def __init__(self):
41 """Inits a diff_match_patch object with default settings.
42 Redefine these in your program to override the defaults.
43 """
44
45 # Number of seconds to map a diff before giving up (0 for infinity).
46 self.Diff_Timeout = 1.0
47 # Cost of an empty edit operation in terms of edit characters.
48 self.Diff_EditCost = 4
49 # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
50 self.Match_Threshold = 0.5
51 # How far to search for a match (0 = exact location, 1000+ = broad match).
52 # A match this many characters away from the expected location will add
53 # 1.0 to the score (0.0 is a perfect match).
54 self.Match_Distance = 1000
55 # When deleting a large block of text (over ~64 characters), how close do
56 # the contents have to be to match the expected contents. (0.0 = perfection,
57 # 1.0 = very loose). Note that Match_Threshold controls how closely the
58 # end points of a delete need to match.
59 self.Patch_DeleteThreshold = 0.5
60 # Chunk size for context length.
61 self.Patch_Margin = 4
62
63 # The number of bits in an int.
64 # Python has no maximum, thus to disable patch splitting set to 0.
65 # However to avoid long patches in certain pathological cases, use 32.
66 # Multiple short patches (using native ints) are much faster than long ones.
67 self.Match_MaxBits = 32
68
69 # DIFF FUNCTIONS
746 70
747 Args:
748 one: First string.
749 two: Second string.
71 # The data structure representing a diff is an array of tuples:
72 # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
73 # which means: delete "Hello", add "Goodbye" and keep " world."
74 DIFF_DELETE = -1
75 DIFF_INSERT = 1
76 DIFF_EQUAL = 0
77
78 def diff_main(self, text1, text2, checklines=True, deadline=None):
79 """Find the differences between two texts. Simplifies the problem by
80 stripping any common prefix or suffix off the texts before diffing.
81
82 Args:
83 text1: Old string to be diffed.
84 text2: New string to be diffed.
85 checklines: Optional speedup flag. If present and false, then don't run
86 a line-level diff first to identify the changed areas.
87 Defaults to true, which does a faster, slightly less optimal diff.
88 deadline: Optional time when the diff should be complete by. Used
89 internally for recursive calls. Users should set DiffTimeout instead.
90
91 Returns:
92 Array of changes.
93 """
94 # Set a deadline by which time the diff must be complete.
95 if deadline is None:
96 # Unlike in most languages, Python counts time in seconds.
97 if self.Diff_Timeout <= 0:
98 deadline = sys.maxsize
99 else:
100 deadline = time.time() + self.Diff_Timeout
101
102 # Check for null inputs.
103 if text1 is None or text2 is None:
104 raise ValueError("Null inputs. (diff_main)")
750 105
751 Returns:
752 The score.
753 """
754 if not one or not two:
755 # Edges are the best.
756 return 6
106 # Check for equality (speedup).
107 if text1 == text2:
108 if text1:
109 return [(self.DIFF_EQUAL, text1)]
110 return []
111
112 # Trim off common prefix (speedup).
113 commonlength = self.diff_commonPrefix(text1, text2)
114 commonprefix = text1[:commonlength]
115 text1 = text1[commonlength:]
116 text2 = text2[commonlength:]
117
118 # Trim off common suffix (speedup).
119 commonlength = self.diff_commonSuffix(text1, text2)
120 if commonlength == 0:
121 commonsuffix = ""
122 else:
123 commonsuffix = text1[-commonlength:]
124 text1 = text1[:-commonlength]
125 text2 = text2[:-commonlength]
126
127 # Compute the diff on the middle block.
128 diffs = self.diff_compute(text1, text2, checklines, deadline)
129
130 # Restore the prefix and suffix.
131 if commonprefix:
132 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
133 if commonsuffix:
134 diffs.append((self.DIFF_EQUAL, commonsuffix))
135 self.diff_cleanupMerge(diffs)
136 return diffs
137
138 def diff_compute(self, text1, text2, checklines, deadline):
139 """Find the differences between two texts. Assumes that the texts do not
140 have any common prefix or suffix.
757 141
758 # Each port of this function behaves slightly differently due to
759 # subtle differences in each language's definition of things like
760 # 'whitespace'. Since this function's purpose is largely cosmetic,
761 # the choice has been made to use each language's native features
762 # rather than force total conformity.
763 char1 = one[-1]
764 char2 = two[0]
765 nonAlphaNumeric1 = not char1.isalnum()
766 nonAlphaNumeric2 = not char2.isalnum()
767 whitespace1 = nonAlphaNumeric1 and char1.isspace()
768 whitespace2 = nonAlphaNumeric2 and char2.isspace()
769 lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n")
770 lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n")
771 blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one)
772 blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two)
142 Args:
143 text1: Old string to be diffed.
144 text2: New string to be diffed.
145 checklines: Speedup flag. If false, then don't run a line-level diff
146 first to identify the changed areas.
147 If true, then run a faster, slightly less optimal diff.
148 deadline: Time when the diff should be complete by.
149
150 Returns:
151 Array of changes.
152 """
153 if not text1:
154 # Just add some text (speedup).
155 return [(self.DIFF_INSERT, text2)]
156
157 if not text2:
158 # Just delete some text (speedup).
159 return [(self.DIFF_DELETE, text1)]
160
161 if len(text1) > len(text2):
162 (longtext, shorttext) = (text1, text2)
163 else:
164 (shorttext, longtext) = (text1, text2)
165 i = longtext.find(shorttext)
166 if i != -1:
167 # Shorter text is inside the longer text (speedup).
168 diffs = [
169 (self.DIFF_INSERT, longtext[:i]),
170 (self.DIFF_EQUAL, shorttext),
171 (self.DIFF_INSERT, longtext[i + len(shorttext) :]),
172 ]
173 # Swap insertions for deletions if diff is reversed.
174 if len(text1) > len(text2):
175 diffs[0] = (self.DIFF_DELETE, diffs[0][1])
176 diffs[2] = (self.DIFF_DELETE, diffs[2][1])
177 return diffs
773 178
774 if blankLine1 or blankLine2:
775 # Five points for blank lines.
776 return 5
777 elif lineBreak1 or lineBreak2:
778 # Four points for line breaks.
779 return 4
780 elif nonAlphaNumeric1 and not whitespace1 and whitespace2:
781 # Three points for end of sentences.
782 return 3
783 elif whitespace1 or whitespace2:
784 # Two points for whitespace.
785 return 2
786 elif nonAlphaNumeric1 or nonAlphaNumeric2:
787 # One point for non-alphanumeric.
788 return 1
789 return 0
179 if len(shorttext) == 1:
180 # Single character string.
181 # After the previous speedup, the character can't be an equality.
182 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
183
184 # Check to see if the problem can be split in two.
185 hm = self.diff_halfMatch(text1, text2)
186 if hm:
187 # A half-match was found, sort out the return data.
188 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
189 # Send both pairs off for separate processing.
190 diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)
191 diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)
192 # Merge the results.
193 return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
194
195 if checklines and len(text1) > 100 and len(text2) > 100:
196 return self.diff_lineMode(text1, text2, deadline)
197
198 return self.diff_bisect(text1, text2, deadline)
199
200 def diff_lineMode(self, text1, text2, deadline):
201 """Do a quick line-level diff on both strings, then rediff the parts for
202 greater accuracy.
203 This speedup can produce non-minimal diffs.
204
205 Args:
206 text1: Old string to be diffed.
207 text2: New string to be diffed.
208 deadline: Time when the diff should be complete by.
209
210 Returns:
211 Array of changes.
212 """
790 213
791 pointer = 1
792 # Intentionally ignore the first and last element (don't need checking).
793 while pointer < len(diffs) - 1:
794 if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
795 diffs[pointer + 1][0] == self.DIFF_EQUAL):
796 # This is a single edit surrounded by equalities.
797 equality1 = diffs[pointer - 1][1]
798 edit = diffs[pointer][1]
799 equality2 = diffs[pointer + 1][1]
214 # Scan the text on a line-by-line basis first.
215 (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
216
217 diffs = self.diff_main(text1, text2, False, deadline)
218
219 # Convert the diff back to original text.
220 self.diff_charsToLines(diffs, linearray)
221 # Eliminate freak matches (e.g. blank lines)
222 self.diff_cleanupSemantic(diffs)
223
224 # Rediff any replacement blocks, this time character-by-character.
225 # Add a dummy entry at the end.
226 diffs.append((self.DIFF_EQUAL, ""))
227 pointer = 0
228 count_delete = 0
229 count_insert = 0
230 text_delete = ""
231 text_insert = ""
232 while pointer < len(diffs):
233 if diffs[pointer][0] == self.DIFF_INSERT:
234 count_insert += 1
235 text_insert += diffs[pointer][1]
236 elif diffs[pointer][0] == self.DIFF_DELETE:
237 count_delete += 1
238 text_delete += diffs[pointer][1]
239 elif diffs[pointer][0] == self.DIFF_EQUAL:
240 # Upon reaching an equality, check for prior redundancies.
241 if count_delete >= 1 and count_insert >= 1:
242 # Delete the offending records and add the merged ones.
243 a = self.diff_main(text_delete, text_insert, False, deadline)
244 diffs[pointer - count_delete - count_insert : pointer] = a
245 pointer = pointer - count_delete - count_insert + len(a)
246 count_insert = 0
247 count_delete = 0
248 text_delete = ""
249 text_insert = ""
800 250
801 # First, shift the edit as far left as possible.
802 commonOffset = self.diff_commonSuffix(equality1, edit)
803 if commonOffset:
804 commonString = edit[-commonOffset:]
805 equality1 = equality1[:-commonOffset]
806 edit = commonString + edit[:-commonOffset]
807 equality2 = commonString + equality2
251 pointer += 1
252
253 diffs.pop() # Remove the dummy entry at the end.
254
255 return diffs
256
257 def diff_bisect(self, text1, text2, deadline):
258 """Find the 'middle snake' of a diff, split the problem in two
259 and return the recursively constructed diff.
260 See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
261
262 Args:
263 text1: Old string to be diffed.
264 text2: New string to be diffed.
265 deadline: Time at which to bail if not yet complete.
266
267 Returns:
268 Array of diff tuples.
269 """
808 270
809 # Second, step character by character right, looking for the best fit.
810 bestEquality1 = equality1
811 bestEdit = edit
812 bestEquality2 = equality2
813 bestScore = (diff_cleanupSemanticScore(equality1, edit) +
814 diff_cleanupSemanticScore(edit, equality2))
815 while edit and equality2 and edit[0] == equality2[0]:
816 equality1 += edit[0]
817 edit = edit[1:] + equality2[0]
818 equality2 = equality2[1:]
819 score = (diff_cleanupSemanticScore(equality1, edit) +
820 diff_cleanupSemanticScore(edit, equality2))
821 # The >= encourages trailing rather than leading whitespace on edits.
822 if score >= bestScore:
823 bestScore = score
824 bestEquality1 = equality1
825 bestEdit = edit
826 bestEquality2 = equality2
271 # Cache the text lengths to prevent multiple calls.
272 text1_length = len(text1)
273 text2_length = len(text2)
274 max_d = (text1_length + text2_length + 1) // 2
275 v_offset = max_d
276 v_length = 2 * max_d
277 v1 = [-1] * v_length
278 v1[v_offset + 1] = 0
279 v2 = v1[:]
280 delta = text1_length - text2_length
281 # If the total number of characters is odd, then the front path will
282 # collide with the reverse path.
283 front = delta % 2 != 0
284 # Offsets for start and end of k loop.
285 # Prevents mapping of space beyond the grid.
286 k1start = 0
287 k1end = 0
288 k2start = 0
289 k2end = 0
290 for d in range(max_d):
291 # Bail out if deadline is reached.
292 if time.time() > deadline:
293 break
294
295 # Walk the front path one step.
296 for k1 in range(-d + k1start, d + 1 - k1end, 2):
297 k1_offset = v_offset + k1
298 if k1 == -d or (k1 != d and v1[k1_offset - 1] < v1[k1_offset + 1]):
299 x1 = v1[k1_offset + 1]
300 else:
301 x1 = v1[k1_offset - 1] + 1
302 y1 = x1 - k1
303 while (
304 x1 < text1_length and y1 < text2_length and text1[x1] == text2[y1]
305 ):
306 x1 += 1
307 y1 += 1
308 v1[k1_offset] = x1
309 if x1 > text1_length:
310 # Ran off the right of the graph.
311 k1end += 2
312 elif y1 > text2_length:
313 # Ran off the bottom of the graph.
314 k1start += 2
315 elif front:
316 k2_offset = v_offset + delta - k1
317 if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:
318 # Mirror x2 onto top-left coordinate system.
319 x2 = text1_length - v2[k2_offset]
320 if x1 >= x2:
321 # Overlap detected.
322 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
827 323
828 if diffs[pointer - 1][1] != bestEquality1:
829 # We have an improvement, save it back to the diff.
830 if bestEquality1:
831 diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
832 else:
833 del diffs[pointer - 1]
834 pointer -= 1
835 diffs[pointer] = (diffs[pointer][0], bestEdit)
836 if bestEquality2:
837 diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
838 else:
839 del diffs[pointer + 1]
840 pointer -= 1
841 pointer += 1
324 # Walk the reverse path one step.
325 for k2 in range(-d + k2start, d + 1 - k2end, 2):
326 k2_offset = v_offset + k2
327 if k2 == -d or (k2 != d and v2[k2_offset - 1] < v2[k2_offset + 1]):
328 x2 = v2[k2_offset + 1]
329 else:
330 x2 = v2[k2_offset - 1] + 1
331 y2 = x2 - k2
332 while (
333 x2 < text1_length
334 and y2 < text2_length
335 and text1[-x2 - 1] == text2[-y2 - 1]
336 ):
337 x2 += 1
338 y2 += 1
339 v2[k2_offset] = x2
340 if x2 > text1_length:
341 # Ran off the left of the graph.
342 k2end += 2
343 elif y2 > text2_length:
344 # Ran off the top of the graph.
345 k2start += 2
346 elif not front:
347 k1_offset = v_offset + delta - k2
348 if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:
349 x1 = v1[k1_offset]
350 y1 = v_offset + x1 - k1_offset
351 # Mirror x2 onto top-left coordinate system.
352 x2 = text1_length - x2
353 if x1 >= x2:
354 # Overlap detected.
355 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
356
357 # Diff took too long and hit the deadline or
358 # number of diffs equals number of characters, no commonality at all.
359 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
360
361 def diff_bisectSplit(self, text1, text2, x, y, deadline):
362 """Given the location of the 'middle snake', split the diff in two parts
363 and recurse.
364
365 Args:
366 text1: Old string to be diffed.
367 text2: New string to be diffed.
368 x: Index of split point in text1.
369 y: Index of split point in text2.
370 deadline: Time at which to bail if not yet complete.
371
372 Returns:
373 Array of diff tuples.
374 """
375 text1a = text1[:x]
376 text2a = text2[:y]
377 text1b = text1[x:]
378 text2b = text2[y:]
379
380 # Compute both diffs serially.
381 diffs = self.diff_main(text1a, text2a, False, deadline)
382 diffsb = self.diff_main(text1b, text2b, False, deadline)
842 383
843 # Define some regex patterns for matching boundaries.
844 BLANKLINEEND = re.compile(r"\n\r?\n$");
845 BLANKLINESTART = re.compile(r"^\r?\n\r?\n");
384 return diffs + diffsb
385
386 def diff_linesToChars(self, text1, text2):
387 """Split two texts into an array of strings. Reduce the texts to a string
388 of hashes where each Unicode character represents one line.
389
390 Args:
391 text1: First string.
392 text2: Second string.
393
394 Returns:
395 Three element tuple, containing the encoded text1, the encoded text2 and
396 the array of unique strings. The zeroth element of the array of unique
397 strings is intentionally blank.
398 """
399 lineArray = [] # e.g. lineArray[4] == "Hello\n"
400 lineHash = {} # e.g. lineHash["Hello\n"] == 4
401
402 # "\x00" is a valid character, but various debuggers don't like it.
403 # So we'll insert a junk entry to avoid generating a null character.
404 lineArray.append("")
405
406 def diff_linesToCharsMunge(text):
407 """Split a text into an array of strings. Reduce the texts to a string
408 of hashes where each Unicode character represents one line.
409 Modifies linearray and linehash through being a closure.
410
411 Args:
412 text: String to encode.
413
414 Returns:
415 Encoded string.
416 """
417 chars = []
418 # Walk the text, pulling out a substring for each line.
419 # text.split('\n') would would temporarily double our memory footprint.
420 # Modifying text would create many large strings to garbage collect.
421 lineStart = 0
422 lineEnd = -1
423 while lineEnd < len(text) - 1:
424 lineEnd = text.find("\n", lineStart)
425 if lineEnd == -1:
426 lineEnd = len(text) - 1
427 line = text[lineStart : lineEnd + 1]
428 lineStart = lineEnd + 1
429
430 if line in lineHash:
431 chars.append(chr(lineHash[line]))
432 else:
433 lineArray.append(line)
434 lineHash[line] = len(lineArray) - 1
435 chars.append(chr(len(lineArray) - 1))
436 return "".join(chars)
437
438 chars1 = diff_linesToCharsMunge(text1)
439 chars2 = diff_linesToCharsMunge(text2)
440 return (chars1, chars2, lineArray)
441
442 def diff_charsToLines(self, diffs, lineArray):
443 """Rehydrate the text in a diff from a string of line hashes to real lines
444 of text.
846 445
847 def diff_cleanupEfficiency(self, diffs):
848 """Reduce the number of edits by eliminating operationally trivial
849 equalities.
446 Args:
447 diffs: Array of diff tuples.
448 lineArray: Array of unique strings.
449 """
450 for x in range(len(diffs)):
451 text = []
452 for char in diffs[x][1]:
453 text.append(lineArray[ord(char)])
454 diffs[x] = (diffs[x][0], "".join(text))
455
456 def diff_commonPrefix(self, text1, text2):
457 """Determine the common prefix of two strings.
458
459 Args:
460 text1: First string.
461 text2: Second string.
462
463 Returns:
464 The number of characters common to the start of each string.
465 """
466 # Quick check for common null cases.
467 if not text1 or not text2 or text1[0] != text2[0]:
468 return 0
469 # Binary search.
470 # Performance analysis: http://neil.fraser.name/news/2007/10/09/
471 pointermin = 0
472 pointermax = min(len(text1), len(text2))
473 pointermid = pointermax
474 pointerstart = 0
475 while pointermin < pointermid:
476 if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
477 pointermin = pointermid
478 pointerstart = pointermin
479 else:
480 pointermax = pointermid
481 pointermid = (pointermax - pointermin) // 2 + pointermin
482 return pointermid
483
484 def diff_commonSuffix(self, text1, text2):
485 """Determine the common suffix of two strings.
486
487 Args:
488 text1: First string.
489 text2: Second string.
850 490
851 Args:
852 diffs: Array of diff tuples.
853 """
854 changes = False
855 equalities = [] # Stack of indices where equalities are found.
856 lastequality = None # Always equal to diffs[equalities[-1]][1]
857 pointer = 0 # Index of current position.
858 pre_ins = False # Is there an insertion operation before the last equality.
859 pre_del = False # Is there a deletion operation before the last equality.
860 post_ins = False # Is there an insertion operation after the last equality.
861 post_del = False # Is there a deletion operation after the last equality.
862 while pointer < len(diffs):
863 if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
864 if (len(diffs[pointer][1]) < self.Diff_EditCost and
865 (post_ins or post_del)):
866 # Candidate found.
867 equalities.append(pointer)
868 pre_ins = post_ins
869 pre_del = post_del
870 lastequality = diffs[pointer][1]
491 Returns:
492 The number of characters common to the end of each string.
493 """
494 # Quick check for common null cases.
495 if not text1 or not text2 or text1[-1] != text2[-1]:
496 return 0
497 # Binary search.
498 # Performance analysis: http://neil.fraser.name/news/2007/10/09/
499 pointermin = 0
500 pointermax = min(len(text1), len(text2))
501 pointermid = pointermax
502 pointerend = 0
503 while pointermin < pointermid:
504 if (
505 text1[-pointermid : len(text1) - pointerend]
506 == text2[-pointermid : len(text2) - pointerend]
507 ):
508 pointermin = pointermid
509 pointerend = pointermin
510 else:
511 pointermax = pointermid
512 pointermid = (pointermax - pointermin) // 2 + pointermin
513 return pointermid
514
515 def diff_commonOverlap(self, text1, text2):
516 """Determine if the suffix of one string is the prefix of another.
517
518 Args:
519 text1 First string.
520 text2 Second string.
521
522 Returns:
523 The number of characters common to the end of the first
524 string and the start of the second string.
525 """
526 # Cache the text lengths to prevent multiple calls.
527 text1_length = len(text1)
528 text2_length = len(text2)
529 # Eliminate the null case.
530 if text1_length == 0 or text2_length == 0:
531 return 0
532 # Truncate the longer string.
533 if text1_length > text2_length:
534 text1 = text1[-text2_length:]
535 elif text1_length < text2_length:
536 text2 = text2[:text1_length]
537 text_length = min(text1_length, text2_length)
538 # Quick check for the worst case.
539 if text1 == text2:
540 return text_length
541
542 # Start by looking for a single character match
543 # and increase length until no match is found.
544 # Performance analysis: http://neil.fraser.name/news/2010/11/04/
545 best = 0
546 length = 1
547 while True:
548 pattern = text1[-length:]
549 found = text2.find(pattern)
550 if found == -1:
551 return best
552 length += found
553 if found == 0 or text1[-length:] == text2[:length]:
554 best = length
555 length += 1
556
557 def diff_halfMatch(self, text1, text2):
558 """Do the two texts share a substring which is at least half the length of
559 the longer text?
560 This speedup can produce non-minimal diffs.
561
562 Args:
563 text1: First string.
564 text2: Second string.
565
566 Returns:
567 Five element Array, containing the prefix of text1, the suffix of text1,
568 the prefix of text2, the suffix of text2 and the common middle. Or None
569 if there was no match.
570 """
571 if self.Diff_Timeout <= 0:
572 # Don't risk returning a non-optimal diff if we have unlimited time.
573 return None
574 if len(text1) > len(text2):
575 (longtext, shorttext) = (text1, text2)
871 576 else:
872 # Not a candidate, and can never become one.
873 equalities = []
874 lastequality = None
577 (shorttext, longtext) = (text1, text2)
578 if len(longtext) < 4 or len(shorttext) * 2 < len(longtext):
579 return None # Pointless.
875 580
876 post_ins = post_del = False
877 else: # An insertion or deletion.
878 if diffs[pointer][0] == self.DIFF_DELETE:
879 post_del = True
880 else:
881 post_ins = True
581 def diff_halfMatchI(longtext, shorttext, i):
582 """Does a substring of shorttext exist within longtext such that the
583 substring is at least half the length of longtext?
584 Closure, but does not reference any external variables.
882 585
883 # Five types to be split:
884 # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
885 # <ins>A</ins>X<ins>C</ins><del>D</del>
886 # <ins>A</ins><del>B</del>X<ins>C</ins>
887 # <ins>A</del>X<ins>C</ins><del>D</del>
888 # <ins>A</ins><del>B</del>X<del>C</del>
586 Args:
587 longtext: Longer string.
588 shorttext: Shorter string.
589 i: Start index of quarter length substring within longtext.
889 590
890 if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
891 ((len(lastequality) < self.Diff_EditCost / 2) and
892 (pre_ins + pre_del + post_ins + post_del) == 3)):
893 # Duplicate record.
894 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
895 # Change second copy to insert.
896 diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
897 diffs[equalities[-1] + 1][1])
898 equalities.pop() # Throw away the equality we just deleted.
899 lastequality = None
900 if pre_ins and pre_del:
901 # No changes made which could affect previous entry, keep going.
902 post_ins = post_del = True
903 equalities = []
904 else:
905 if len(equalities):
906 equalities.pop() # Throw away the previous equality.
907 if len(equalities):
908 pointer = equalities[-1]
909 else:
910 pointer = -1
911 post_ins = post_del = False
912 changes = True
913 pointer += 1
914
915 if changes:
916 self.diff_cleanupMerge(diffs)
917
918 def diff_cleanupMerge(self, diffs):
919 """Reorder and merge like edit sections. Merge equalities.
920 Any edit section can move as long as it doesn't cross an equality.
591 Returns:
592 Five element Array, containing the prefix of longtext, the suffix of
593 longtext, the prefix of shorttext, the suffix of shorttext and the
594 common middle. Or None if there was no match.
595 """
596 seed = longtext[i : i + len(longtext) // 4]
597 best_common = ""
598 j = shorttext.find(seed)
599 while j != -1:
600 prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
601 suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
602 if len(best_common) < suffixLength + prefixLength:
603 best_common = (
604 shorttext[j - suffixLength : j]
605 + shorttext[j : j + prefixLength]
606 )
607 best_longtext_a = longtext[: i - suffixLength]
608 best_longtext_b = longtext[i + prefixLength :]
609 best_shorttext_a = shorttext[: j - suffixLength]
610 best_shorttext_b = shorttext[j + prefixLength :]
611 j = shorttext.find(seed, j + 1)
921 612
922 Args:
923 diffs: Array of diff tuples.
924 """
925 diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end.
926 pointer = 0
927 count_delete = 0
928 count_insert = 0
929 text_delete = ''
930 text_insert = ''
931 while pointer < len(diffs):
932 if diffs[pointer][0] == self.DIFF_INSERT:
933 count_insert += 1
934 text_insert += diffs[pointer][1]
935 pointer += 1
936 elif diffs[pointer][0] == self.DIFF_DELETE:
937 count_delete += 1
938 text_delete += diffs[pointer][1]
939 pointer += 1
940 elif diffs[pointer][0] == self.DIFF_EQUAL:
941 # Upon reaching an equality, check for prior redundancies.
942 if count_delete + count_insert > 1:
943 if count_delete != 0 and count_insert != 0:
944 # Factor out any common prefixies.
945 commonlength = self.diff_commonPrefix(text_insert, text_delete)
946 if commonlength != 0:
947 x = pointer - count_delete - count_insert - 1
948 if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
949 diffs[x] = (diffs[x][0], diffs[x][1] +
950 text_insert[:commonlength])
951 else:
952 diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))
613 if len(best_common) * 2 >= len(longtext):
614 return (
615 best_longtext_a,
616 best_longtext_b,
617 best_shorttext_a,
618 best_shorttext_b,
619 best_common,
620 )
621 else:
622 return None
623
624 # First check if the second quarter is the seed for a half-match.
625 hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4)
626 # Check again based on the third quarter.
627 hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2)
628 if not hm1 and not hm2:
629 return None
630 elif not hm2:
631 hm = hm1
632 elif not hm1:
633 hm = hm2
634 else:
635 # Both matched. Select the longest.
636 if len(hm1[4]) > len(hm2[4]):
637 hm = hm1
638 else:
639 hm = hm2
640
641 # A half-match was found, sort out the return data.
642 if len(text1) > len(text2):
643 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
644 else:
645 (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
646 return (text1_a, text1_b, text2_a, text2_b, mid_common)
647
648 def diff_cleanupSemantic(self, diffs):
649 """Reduce the number of edits by eliminating semantically trivial
650 equalities.
651
652 Args:
653 diffs: Array of diff tuples.
654 """
655 changes = False
656 equalities = [] # Stack of indices where equalities are found.
657 lastequality = None # Always equal to diffs[equalities[-1]][1]
658 pointer = 0 # Index of current position.
659 # Number of chars that changed prior to the equality.
660 length_insertions1, length_deletions1 = 0, 0
661 # Number of chars that changed after the equality.
662 length_insertions2, length_deletions2 = 0, 0
663 while pointer < len(diffs):
664 if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
665 equalities.append(pointer)
666 length_insertions1, length_insertions2 = length_insertions2, 0
667 length_deletions1, length_deletions2 = length_deletions2, 0
668 lastequality = diffs[pointer][1]
669 else: # An insertion or deletion.
670 if diffs[pointer][0] == self.DIFF_INSERT:
671 length_insertions2 += len(diffs[pointer][1])
672 else:
673 length_deletions2 += len(diffs[pointer][1])
674 # Eliminate an equality that is smaller or equal to the edits on both
675 # sides of it.
676 if (
677 lastequality
678 and (
679 len(lastequality) <= max(length_insertions1, length_deletions1)
680 )
681 and (
682 len(lastequality) <= max(length_insertions2, length_deletions2)
683 )
684 ):
685 # Duplicate record.
686 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
687 # Change second copy to insert.
688 diffs[equalities[-1] + 1] = (
689 self.DIFF_INSERT,
690 diffs[equalities[-1] + 1][1],
691 )
692 # Throw away the equality we just deleted.
693 equalities.pop()
694 # Throw away the previous equality (it needs to be reevaluated).
695 if len(equalities):
696 equalities.pop()
697 if len(equalities):
698 pointer = equalities[-1]
699 else:
700 pointer = -1
701 # Reset the counters.
702 length_insertions1, length_deletions1 = 0, 0
703 length_insertions2, length_deletions2 = 0, 0
704 lastequality = None
705 changes = True
706 pointer += 1
707
708 # Normalize the diff.
709 if changes:
710 self.diff_cleanupMerge(diffs)
711 self.diff_cleanupSemanticLossless(diffs)
712
713 # Find any overlaps between deletions and insertions.
714 # e.g: <del>abcxxx</del><ins>xxxdef</ins>
715 # -> <del>abc</del>xxx<ins>def</ins>
716 # e.g: <del>xxxabc</del><ins>defxxx</ins>
717 # -> <ins>def</ins>xxx<del>abc</del>
718 # Only extract an overlap if it is as big as the edit ahead or behind it.
719 pointer = 1
720 while pointer < len(diffs):
721 if (
722 diffs[pointer - 1][0] == self.DIFF_DELETE
723 and diffs[pointer][0] == self.DIFF_INSERT
724 ):
725 deletion = diffs[pointer - 1][1]
726 insertion = diffs[pointer][1]
727 overlap_length1 = self.diff_commonOverlap(deletion, insertion)
728 overlap_length2 = self.diff_commonOverlap(insertion, deletion)
729 if overlap_length1 >= overlap_length2:
730 if (
731 overlap_length1 >= len(deletion) / 2.0
732 or overlap_length1 >= len(insertion) / 2.0
733 ):
734 # Overlap found. Insert an equality and trim the surrounding edits.
735 diffs.insert(
736 pointer, (self.DIFF_EQUAL, insertion[:overlap_length1])
737 )
738 diffs[pointer - 1] = (
739 self.DIFF_DELETE,
740 deletion[: len(deletion) - overlap_length1],
741 )
742 diffs[pointer + 1] = (
743 self.DIFF_INSERT,
744 insertion[overlap_length1:],
745 )
746 pointer += 1
747 else:
748 if (
749 overlap_length2 >= len(deletion) / 2.0
750 or overlap_length2 >= len(insertion) / 2.0
751 ):
752 # Reverse overlap found.
753 # Insert an equality and swap and trim the surrounding edits.
754 diffs.insert(
755 pointer, (self.DIFF_EQUAL, deletion[:overlap_length2])
756 )
757 diffs[pointer - 1] = (
758 self.DIFF_INSERT,
759 insertion[: len(insertion) - overlap_length2],
760 )
761 diffs[pointer + 1] = (
762 self.DIFF_DELETE,
763 deletion[overlap_length2:],
764 )
765 pointer += 1
953 766 pointer += 1
954 text_insert = text_insert[commonlength:]
955 text_delete = text_delete[commonlength:]
956 # Factor out any common suffixies.
957 commonlength = self.diff_commonSuffix(text_insert, text_delete)
958 if commonlength != 0:
959 diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
960 diffs[pointer][1])
961 text_insert = text_insert[:-commonlength]
962 text_delete = text_delete[:-commonlength]
963 # Delete the offending records and add the merged ones.
964 if count_delete == 0:
965 diffs[pointer - count_insert : pointer] = [
966 (self.DIFF_INSERT, text_insert)]
967 elif count_insert == 0:
968 diffs[pointer - count_delete : pointer] = [
969 (self.DIFF_DELETE, text_delete)]
970 else:
971 diffs[pointer - count_delete - count_insert : pointer] = [
972 (self.DIFF_DELETE, text_delete),
973 (self.DIFF_INSERT, text_insert)]
974 pointer = pointer - count_delete - count_insert + 1
975 if count_delete != 0:
976 767 pointer += 1
977 if count_insert != 0:
978 pointer += 1
979 elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
980 # Merge this equality with the previous one.
981 diffs[pointer - 1] = (diffs[pointer - 1][0],
982 diffs[pointer - 1][1] + diffs[pointer][1])
983 del diffs[pointer]
984 else:
985 pointer += 1
768
769 def diff_cleanupSemanticLossless(self, diffs):
770 """Look for single edits surrounded on both sides by equalities
771 which can be shifted sideways to align the edit to a word boundary.
772 e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
773
774 Args:
775 diffs: Array of diff tuples.
776 """
777
778 def diff_cleanupSemanticScore(one, two):
779 """Given two strings, compute a score representing whether the
780 internal boundary falls on logical boundaries.
781 Scores range from 6 (best) to 0 (worst).
782 Closure, but does not reference any external variables.
783
784 Args:
785 one: First string.
786 two: Second string.
787
788 Returns:
789 The score.
790 """
791 if not one or not two:
792 # Edges are the best.
793 return 6
794
795 # Each port of this function behaves slightly differently due to
796 # subtle differences in each language's definition of things like
797 # 'whitespace'. Since this function's purpose is largely cosmetic,
798 # the choice has been made to use each language's native features
799 # rather than force total conformity.
800 char1 = one[-1]
801 char2 = two[0]
802 nonAlphaNumeric1 = not char1.isalnum()
803 nonAlphaNumeric2 = not char2.isalnum()
804 whitespace1 = nonAlphaNumeric1 and char1.isspace()
805 whitespace2 = nonAlphaNumeric2 and char2.isspace()
806 lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n")
807 lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n")
808 blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one)
809 blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two)
810
811 if blankLine1 or blankLine2:
812 # Five points for blank lines.
813 return 5
814 elif lineBreak1 or lineBreak2:
815 # Four points for line breaks.
816 return 4
817 elif nonAlphaNumeric1 and not whitespace1 and whitespace2:
818 # Three points for end of sentences.
819 return 3
820 elif whitespace1 or whitespace2:
821 # Two points for whitespace.
822 return 2
823 elif nonAlphaNumeric1 or nonAlphaNumeric2:
824 # One point for non-alphanumeric.
825 return 1
826 return 0
986 827
987 count_insert = 0
988 count_delete = 0
989 text_delete = ''
990 text_insert = ''
828 pointer = 1
829 # Intentionally ignore the first and last element (don't need checking).
830 while pointer < len(diffs) - 1:
831 if (
832 diffs[pointer - 1][0] == self.DIFF_EQUAL
833 and diffs[pointer + 1][0] == self.DIFF_EQUAL
834 ):
835 # This is a single edit surrounded by equalities.
836 equality1 = diffs[pointer - 1][1]
837 edit = diffs[pointer][1]
838 equality2 = diffs[pointer + 1][1]
991 839
992 if diffs[-1][1] == '':
993 diffs.pop() # Remove the dummy entry at the end.
840 # First, shift the edit as far left as possible.
841 commonOffset = self.diff_commonSuffix(equality1, edit)
842 if commonOffset:
843 commonString = edit[-commonOffset:]
844 equality1 = equality1[:-commonOffset]
845 edit = commonString + edit[:-commonOffset]
846 equality2 = commonString + equality2
994 847
995 # Second pass: look for single edits surrounded on both sides by equalities
996 # which can be shifted sideways to eliminate an equality.
997 # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
998 changes = False
999 pointer = 1
1000 # Intentionally ignore the first and last element (don't need checking).
1001 while pointer < len(diffs) - 1:
1002 if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
1003 diffs[pointer + 1][0] == self.DIFF_EQUAL):
1004 # This is a single edit surrounded by equalities.
1005 if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
1006 # Shift the edit over the previous equality.
1007 diffs[pointer] = (diffs[pointer][0],
1008 diffs[pointer - 1][1] +
1009 diffs[pointer][1][:-len(diffs[pointer - 1][1])])
1010 diffs[pointer + 1] = (diffs[pointer + 1][0],
1011 diffs[pointer - 1][1] + diffs[pointer + 1][1])
1012 del diffs[pointer - 1]
1013 changes = True
1014 elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
1015 # Shift the edit over the next equality.
1016 diffs[pointer - 1] = (diffs[pointer - 1][0],
1017 diffs[pointer - 1][1] + diffs[pointer + 1][1])
1018 diffs[pointer] = (diffs[pointer][0],
1019 diffs[pointer][1][len(diffs[pointer + 1][1]):] +
1020 diffs[pointer + 1][1])
1021 del diffs[pointer + 1]
1022 changes = True
1023 pointer += 1
848 # Second, step character by character right, looking for the best fit.
849 bestEquality1 = equality1
850 bestEdit = edit
851 bestEquality2 = equality2
852 bestScore = diff_cleanupSemanticScore(
853 equality1, edit
854 ) + diff_cleanupSemanticScore(edit, equality2)
855 while edit and equality2 and edit[0] == equality2[0]:
856 equality1 += edit[0]
857 edit = edit[1:] + equality2[0]
858 equality2 = equality2[1:]
859 score = diff_cleanupSemanticScore(
860 equality1, edit
861 ) + diff_cleanupSemanticScore(edit, equality2)
862 # The >= encourages trailing rather than leading whitespace on edits.
863 if score >= bestScore:
864 bestScore = score
865 bestEquality1 = equality1
866 bestEdit = edit
867 bestEquality2 = equality2
868
869 if diffs[pointer - 1][1] != bestEquality1:
870 # We have an improvement, save it back to the diff.
871 if bestEquality1:
872 diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
873 else:
874 del diffs[pointer - 1]
875 pointer -= 1
876 diffs[pointer] = (diffs[pointer][0], bestEdit)
877 if bestEquality2:
878 diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
879 else:
880 del diffs[pointer + 1]
881 pointer -= 1
882 pointer += 1
883
884 # Define some regex patterns for matching boundaries.
885 BLANKLINEEND = re.compile(r"\n\r?\n$")
886 BLANKLINESTART = re.compile(r"^\r?\n\r?\n")
887
888 def diff_cleanupEfficiency(self, diffs):
889 """Reduce the number of edits by eliminating operationally trivial
890 equalities.
1024 891
1025 # If shifts were made, the diff needs reordering and another shift sweep.
1026 if changes:
1027 self.diff_cleanupMerge(diffs)
892 Args:
893 diffs: Array of diff tuples.
894 """
895 changes = False
896 equalities = [] # Stack of indices where equalities are found.
897 lastequality = None # Always equal to diffs[equalities[-1]][1]
898 pointer = 0 # Index of current position.
899 pre_ins = False # Is there an insertion operation before the last equality.
900 pre_del = False # Is there a deletion operation before the last equality.
901 post_ins = False # Is there an insertion operation after the last equality.
902 post_del = False # Is there a deletion operation after the last equality.
903 while pointer < len(diffs):
904 if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
905 if len(diffs[pointer][1]) < self.Diff_EditCost and (
906 post_ins or post_del
907 ):
908 # Candidate found.
909 equalities.append(pointer)
910 pre_ins = post_ins
911 pre_del = post_del
912 lastequality = diffs[pointer][1]
913 else:
914 # Not a candidate, and can never become one.
915 equalities = []
916 lastequality = None
1028 917
1029 def diff_xIndex(self, diffs, loc):
1030 """loc is a location in text1, compute and return the equivalent location
1031 in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
1032
1033 Args:
1034 diffs: Array of diff tuples.
1035 loc: Location within text1.
918 post_ins = post_del = False
919 else: # An insertion or deletion.
920 if diffs[pointer][0] == self.DIFF_DELETE:
921 post_del = True
922 else:
923 post_ins = True
1036 924
1037 Returns:
1038 Location within text2.
1039 """
1040 chars1 = 0
1041 chars2 = 0
1042 last_chars1 = 0
1043 last_chars2 = 0
1044 for x in range(len(diffs)):
1045 (op, text) = diffs[x]
1046 if op != self.DIFF_INSERT: # Equality or deletion.
1047 chars1 += len(text)
1048 if op != self.DIFF_DELETE: # Equality or insertion.
1049 chars2 += len(text)
1050 if chars1 > loc: # Overshot the location.
1051 break
1052 last_chars1 = chars1
1053 last_chars2 = chars2
925 # Five types to be split:
926 # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
927 # <ins>A</ins>X<ins>C</ins><del>D</del>
928 # <ins>A</ins><del>B</del>X<ins>C</ins>
929 # <ins>A</del>X<ins>C</ins><del>D</del>
930 # <ins>A</ins><del>B</del>X<del>C</del>
1054 931
1055 if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
1056 # The location was deleted.
1057 return last_chars2
1058 # Add the remaining len(character).
1059 return last_chars2 + (loc - last_chars1)
932 if lastequality and (
933 (pre_ins and pre_del and post_ins and post_del)
934 or (
935 (len(lastequality) < self.Diff_EditCost / 2)
936 and (pre_ins + pre_del + post_ins + post_del) == 3
937 )
938 ):
939 # Duplicate record.
940 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
941 # Change second copy to insert.
942 diffs[equalities[-1] + 1] = (
943 self.DIFF_INSERT,
944 diffs[equalities[-1] + 1][1],
945 )
946 equalities.pop() # Throw away the equality we just deleted.
947 lastequality = None
948 if pre_ins and pre_del:
949 # No changes made which could affect previous entry, keep going.
950 post_ins = post_del = True
951 equalities = []
952 else:
953 if len(equalities):
954 equalities.pop() # Throw away the previous equality.
955 if len(equalities):
956 pointer = equalities[-1]
957 else:
958 pointer = -1
959 post_ins = post_del = False
960 changes = True
961 pointer += 1
1060 962
1061 def diff_prettyHtml(self, diffs):
1062 """Convert a diff array into a pretty HTML report.
963 if changes:
964 self.diff_cleanupMerge(diffs)
1063 965
1064 Args:
1065 diffs: Array of diff tuples.
966 def diff_cleanupMerge(self, diffs):
967 """Reorder and merge like edit sections. Merge equalities.
968 Any edit section can move as long as it doesn't cross an equality.
1066 969
1067 Returns:
1068 HTML representation.
1069 """
1070 html = []
1071 for (op, data) in diffs:
1072 text = (data.replace("&", "&amp;").replace("<", "&lt;")
1073 .replace(">", "&gt;").replace("\n", "&para;<br>"))
1074 if op == self.DIFF_INSERT:
1075 html.append("<ins style=\"background:#e6ffe6;\">%s</ins>" % text)
1076 elif op == self.DIFF_DELETE:
1077 html.append("<del style=\"background:#ffe6e6;\">%s</del>" % text)
1078 elif op == self.DIFF_EQUAL:
1079 html.append("<span>%s</span>" % text)
1080 return "".join(html)
970 Args:
971 diffs: Array of diff tuples.
972 """
973 diffs.append((self.DIFF_EQUAL, "")) # Add a dummy entry at the end.
974 pointer = 0
975 count_delete = 0
976 count_insert = 0
977 text_delete = ""
978 text_insert = ""
979 while pointer < len(diffs):
980 if diffs[pointer][0] == self.DIFF_INSERT:
981 count_insert += 1
982 text_insert += diffs[pointer][1]
983 pointer += 1
984 elif diffs[pointer][0] == self.DIFF_DELETE:
985 count_delete += 1
986 text_delete += diffs[pointer][1]
987 pointer += 1
988 elif diffs[pointer][0] == self.DIFF_EQUAL:
989 # Upon reaching an equality, check for prior redundancies.
990 if count_delete + count_insert > 1:
991 if count_delete != 0 and count_insert != 0:
992 # Factor out any common prefixies.
993 commonlength = self.diff_commonPrefix(text_insert, text_delete)
994 if commonlength != 0:
995 x = pointer - count_delete - count_insert - 1
996 if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
997 diffs[x] = (
998 diffs[x][0],
999 diffs[x][1] + text_insert[:commonlength],
1000 )
1001 else:
1002 diffs.insert(
1003 0, (self.DIFF_EQUAL, text_insert[:commonlength])
1004 )
1005 pointer += 1
1006 text_insert = text_insert[commonlength:]
1007 text_delete = text_delete[commonlength:]
1008 # Factor out any common suffixies.
1009 commonlength = self.diff_commonSuffix(text_insert, text_delete)
1010 if commonlength != 0:
1011 diffs[pointer] = (
1012 diffs[pointer][0],
1013 text_insert[-commonlength:] + diffs[pointer][1],
1014 )
1015 text_insert = text_insert[:-commonlength]
1016 text_delete = text_delete[:-commonlength]
1017 # Delete the offending records and add the merged ones.
1018 if count_delete == 0:
1019 diffs[pointer - count_insert : pointer] = [
1020 (self.DIFF_INSERT, text_insert)
1021 ]
1022 elif count_insert == 0:
1023 diffs[pointer - count_delete : pointer] = [
1024 (self.DIFF_DELETE, text_delete)
1025 ]
1026 else:
1027 diffs[pointer - count_delete - count_insert : pointer] = [
1028 (self.DIFF_DELETE, text_delete),
1029 (self.DIFF_INSERT, text_insert),
1030 ]
1031 pointer = pointer - count_delete - count_insert + 1
1032 if count_delete != 0:
1033 pointer += 1
1034 if count_insert != 0:
1035 pointer += 1
1036 elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
1037 # Merge this equality with the previous one.
1038 diffs[pointer - 1] = (
1039 diffs[pointer - 1][0],
1040 diffs[pointer - 1][1] + diffs[pointer][1],
1041 )
1042 del diffs[pointer]
1043 else:
1044 pointer += 1
1081 1045
1082 def diff_text1(self, diffs):
1083 """Compute and return the source text (all equalities and deletions).
1084
1085 Args:
1086 diffs: Array of diff tuples.
1046 count_insert = 0
1047 count_delete = 0
1048 text_delete = ""
1049 text_insert = ""
1087 1050
1088 Returns:
1089 Source text.
1090 """
1091 text = []
1092 for (op, data) in diffs:
1093 if op != self.DIFF_INSERT:
1094 text.append(data)
1095 return "".join(text)
1096
1097 def diff_text2(self, diffs):
1098 """Compute and return the destination text (all equalities and insertions).
1051 if diffs[-1][1] == "":
1052 diffs.pop() # Remove the dummy entry at the end.
1099 1053
1100 Args:
1101 diffs: Array of diff tuples.
1054 # Second pass: look for single edits surrounded on both sides by equalities
1055 # which can be shifted sideways to eliminate an equality.
1056 # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
1057 changes = False
1058 pointer = 1
1059 # Intentionally ignore the first and last element (don't need checking).
1060 while pointer < len(diffs) - 1:
1061 if (
1062 diffs[pointer - 1][0] == self.DIFF_EQUAL
1063 and diffs[pointer + 1][0] == self.DIFF_EQUAL
1064 ):
1065 # This is a single edit surrounded by equalities.
1066 if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
1067 # Shift the edit over the previous equality.
1068 diffs[pointer] = (
1069 diffs[pointer][0],
1070 diffs[pointer - 1][1]
1071 + diffs[pointer][1][: -len(diffs[pointer - 1][1])],
1072 )
1073 diffs[pointer + 1] = (
1074 diffs[pointer + 1][0],
1075 diffs[pointer - 1][1] + diffs[pointer + 1][1],
1076 )
1077 del diffs[pointer - 1]
1078 changes = True
1079 elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
1080 # Shift the edit over the next equality.
1081 diffs[pointer - 1] = (
1082 diffs[pointer - 1][0],
1083 diffs[pointer - 1][1] + diffs[pointer + 1][1],
1084 )
1085 diffs[pointer] = (
1086 diffs[pointer][0],
1087 diffs[pointer][1][len(diffs[pointer + 1][1]) :]
1088 + diffs[pointer + 1][1],
1089 )
1090 del diffs[pointer + 1]
1091 changes = True
1092 pointer += 1
1102 1093
1103 Returns:
1104 Destination text.
1105 """
1106 text = []
1107 for (op, data) in diffs:
1108 if op != self.DIFF_DELETE:
1109 text.append(data)
1110 return "".join(text)
1094 # If shifts were made, the diff needs reordering and another shift sweep.
1095 if changes:
1096 self.diff_cleanupMerge(diffs)
1097
1098 def diff_xIndex(self, diffs, loc):
1099 """loc is a location in text1, compute and return the equivalent location
1100 in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
1101
1102 Args:
1103 diffs: Array of diff tuples.
1104 loc: Location within text1.
1111 1105
1112 def diff_levenshtein(self, diffs):
1113 """Compute the Levenshtein distance; the number of inserted, deleted or
1114 substituted characters.
1106 Returns:
1107 Location within text2.
1108 """
1109 chars1 = 0
1110 chars2 = 0
1111 last_chars1 = 0
1112 last_chars2 = 0
1113 for x in range(len(diffs)):
1114 (op, text) = diffs[x]
1115 if op != self.DIFF_INSERT: # Equality or deletion.
1116 chars1 += len(text)
1117 if op != self.DIFF_DELETE: # Equality or insertion.
1118 chars2 += len(text)
1119 if chars1 > loc: # Overshot the location.
1120 break
1121 last_chars1 = chars1
1122 last_chars2 = chars2
1115 1123
1116 Args:
1117 diffs: Array of diff tuples.
1124 if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
1125 # The location was deleted.
1126 return last_chars2
1127 # Add the remaining len(character).
1128 return last_chars2 + (loc - last_chars1)
1129
1130 def diff_prettyHtml(self, diffs):
1131 """Convert a diff array into a pretty HTML report.
1132
1133 Args:
1134 diffs: Array of diff tuples.
1135
1136 Returns:
1137 HTML representation.
1138 """
1139 html = []
1140 for op, data in diffs:
1141 text = (
1142 data.replace("&", "&amp;")
1143 .replace("<", "&lt;")
1144 .replace(">", "&gt;")
1145 .replace("\n", "&para;<br>")
1146 )
1147 if op == self.DIFF_INSERT:
1148 html.append('<ins style="background:#e6ffe6;">%s</ins>' % text)
1149 elif op == self.DIFF_DELETE:
1150 html.append('<del style="background:#ffe6e6;">%s</del>' % text)
1151 elif op == self.DIFF_EQUAL:
1152 html.append("<span>%s</span>" % text)
1153 return "".join(html)
1154
1155 def diff_text1(self, diffs):
1156 """Compute and return the source text (all equalities and deletions).
1118 1157
1119 Returns:
1120 Number of changes.
1121 """
1122 levenshtein = 0
1123 insertions = 0
1124 deletions = 0
1125 for (op, data) in diffs:
1126 if op == self.DIFF_INSERT:
1127 insertions += len(data)
1128 elif op == self.DIFF_DELETE:
1129 deletions += len(data)
1130 elif op == self.DIFF_EQUAL:
1131 # A deletion and an insertion is one substitution.
1132 levenshtein += max(insertions, deletions)
1158 Args:
1159 diffs: Array of diff tuples.
1160
1161 Returns:
1162 Source text.
1163 """
1164 text = []
1165 for op, data in diffs:
1166 if op != self.DIFF_INSERT:
1167 text.append(data)
1168 return "".join(text)
1169
1170 def diff_text2(self, diffs):
1171 """Compute and return the destination text (all equalities and insertions).
1172
1173 Args:
1174 diffs: Array of diff tuples.
1175
1176 Returns:
1177 Destination text.
1178 """
1179 text = []
1180 for op, data in diffs:
1181 if op != self.DIFF_DELETE:
1182 text.append(data)
1183 return "".join(text)
1184
1185 def diff_levenshtein(self, diffs):
1186 """Compute the Levenshtein distance; the number of inserted, deleted or
1187 substituted characters.
1188
1189 Args:
1190 diffs: Array of diff tuples.
1191
1192 Returns:
1193 Number of changes.
1194 """
1195 levenshtein = 0
1133 1196 insertions = 0
1134 1197 deletions = 0
1135 levenshtein += max(insertions, deletions)
1136 return levenshtein
1198 for op, data in diffs:
1199 if op == self.DIFF_INSERT:
1200 insertions += len(data)
1201 elif op == self.DIFF_DELETE:
1202 deletions += len(data)
1203 elif op == self.DIFF_EQUAL:
1204 # A deletion and an insertion is one substitution.
1205 levenshtein += max(insertions, deletions)
1206 insertions = 0
1207 deletions = 0
1208 levenshtein += max(insertions, deletions)
1209 return levenshtein
1137 1210
1138 def diff_toDelta(self, diffs):
1139 """Crush the diff into an encoded string which describes the operations
1140 required to transform text1 into text2.
1141 E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1142 Operations are tab-separated. Inserted text is escaped using %xx notation.
1211 def diff_toDelta(self, diffs):
1212 """Crush the diff into an encoded string which describes the operations
1213 required to transform text1 into text2.
1214 E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1215 Operations are tab-separated. Inserted text is escaped using %xx notation.
1143 1216
1144 Args:
1145 diffs: Array of diff tuples.
1217 Args:
1218 diffs: Array of diff tuples.
1146 1219
1147 Returns:
1148 Delta text.
1149 """
1150 text = []
1151 for (op, data) in diffs:
1152 if op == self.DIFF_INSERT:
1153 # High ascii will raise UnicodeDecodeError. Use Unicode instead.
1154 data = data.encode("utf-8")
1155 text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
1156 elif op == self.DIFF_DELETE:
1157 text.append("-%d" % len(data))
1158 elif op == self.DIFF_EQUAL:
1159 text.append("=%d" % len(data))
1160 return "\t".join(text)
1220 Returns:
1221 Delta text.
1222 """
1223 text = []
1224 for op, data in diffs:
1225 if op == self.DIFF_INSERT:
1226 # High ascii will raise UnicodeDecodeError. Use Unicode instead.
1227 data = data.encode("utf-8")
1228 text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
1229 elif op == self.DIFF_DELETE:
1230 text.append("-%d" % len(data))
1231 elif op == self.DIFF_EQUAL:
1232 text.append("=%d" % len(data))
1233 return "\t".join(text)
1161 1234
1162 def diff_fromDelta(self, text1, delta):
1163 """Given the original text1, and an encoded string which describes the
1164 operations required to transform text1 into text2, compute the full diff.
1235 def diff_fromDelta(self, text1, delta):
1236 """Given the original text1, and an encoded string which describes the
1237 operations required to transform text1 into text2, compute the full diff.
1165 1238
1166 Args:
1167 text1: Source string for the diff.
1168 delta: Delta text.
1169
1170 Returns:
1171 Array of diff tuples.
1239 Args:
1240 text1: Source string for the diff.
1241 delta: Delta text.
1172 1242
1173 Raises:
1174 ValueError: If invalid input.
1175 """
1176 if type(delta) == unicode:
1177 # Deltas should be composed of a subset of ascii chars, Unicode not
1178 # required. If this encode raises UnicodeEncodeError, delta is invalid.
1179 delta = delta.encode("ascii")
1180 diffs = []
1181 pointer = 0 # Cursor in text1
1182 tokens = delta.split("\t")
1183 for token in tokens:
1184 if token == "":
1185 # Blank tokens are ok (from a trailing \t).
1186 continue
1187 # Each token begins with a one character parameter which specifies the
1188 # operation of this token (delete, insert, equality).
1189 param = token[1:]
1190 if token[0] == "+":
1191 param = urllib.parse.unquote(param).decode("utf-8")
1192 diffs.append((self.DIFF_INSERT, param))
1193 elif token[0] == "-" or token[0] == "=":
1194 try:
1195 n = int(param)
1196 except ValueError:
1197 raise ValueError("Invalid number in diff_fromDelta: " + param)
1198 if n < 0:
1199 raise ValueError("Negative number in diff_fromDelta: " + param)
1200 text = text1[pointer : pointer + n]
1201 pointer += n
1202 if token[0] == "=":
1203 diffs.append((self.DIFF_EQUAL, text))
1204 else:
1205 diffs.append((self.DIFF_DELETE, text))
1206 else:
1207 # Anything else is an error.
1208 raise ValueError("Invalid diff operation in diff_fromDelta: " +
1209 token[0])
1210 if pointer != len(text1):
1211 raise ValueError(
1212 "Delta length (%d) does not equal source text length (%d)." %
1213 (pointer, len(text1)))
1214 return diffs
1243 Returns:
1244 Array of diff tuples.
1215 1245
1216 # MATCH FUNCTIONS
1246 Raises:
1247 ValueError: If invalid input.
1248 """
1249 if type(delta) == str:
1250 # Deltas should be composed of a subset of ascii chars, Unicode not
1251 # required. If this encode raises UnicodeEncodeError, delta is invalid.
1252 delta = delta.encode("ascii")
1253 diffs = []
1254 pointer = 0 # Cursor in text1
1255 tokens = delta.split("\t")
1256 for token in tokens:
1257 if token == "":
1258 # Blank tokens are ok (from a trailing \t).
1259 continue
1260 # Each token begins with a one character parameter which specifies the
1261 # operation of this token (delete, insert, equality).
1262 param = token[1:]
1263 if token[0] == "+":
1264 param = urllib.parse.unquote(param)
1265 diffs.append((self.DIFF_INSERT, param))
1266 elif token[0] == "-" or token[0] == "=":
1267 try:
1268 n = int(param)
1269 except ValueError:
1270 raise ValueError("Invalid number in diff_fromDelta: " + param)
1271 if n < 0:
1272 raise ValueError("Negative number in diff_fromDelta: " + param)
1273 text = text1[pointer : pointer + n]
1274 pointer += n
1275 if token[0] == "=":
1276 diffs.append((self.DIFF_EQUAL, text))
1277 else:
1278 diffs.append((self.DIFF_DELETE, text))
1279 else:
1280 # Anything else is an error.
1281 raise ValueError(
1282 "Invalid diff operation in diff_fromDelta: " + token[0]
1283 )
1284 if pointer != len(text1):
1285 raise ValueError(
1286 "Delta length (%d) does not equal source text length (%d)."
1287 % (pointer, len(text1))
1288 )
1289 return diffs
1217 1290
1218 def match_main(self, text, pattern, loc):
1219 """Locate the best instance of 'pattern' in 'text' near 'loc'.
1291 # MATCH FUNCTIONS
1220 1292
1221 Args:
1222 text: The text to search.
1223 pattern: The pattern to search for.
1224 loc: The location to search around.
1293 def match_main(self, text, pattern, loc):
1294 """Locate the best instance of 'pattern' in 'text' near 'loc'.
1225 1295
1226 Returns:
1227 Best match index or -1.
1228 """
1229 # Check for null inputs.
1230 if text is None or pattern is None:
1231 raise ValueError("Null inputs. (match_main)")
1296 Args:
1297 text: The text to search.
1298 pattern: The pattern to search for.
1299 loc: The location to search around.
1300
1301 Returns:
1302 Best match index or -1.
1303 """
1304 # Check for null inputs.
1305 if text is None or pattern is None:
1306 raise ValueError("Null inputs. (match_main)")
1232 1307
1233 loc = max(0, min(loc, len(text)))
1234 if text == pattern:
1235 # Shortcut (potentially not guaranteed by the algorithm)
1236 return 0
1237 elif not text:
1238 # Nothing to match.
1239 return -1
1240 elif text[loc:loc + len(pattern)] == pattern:
1241 # Perfect match at the perfect spot! (Includes case of null pattern)
1242 return loc
1243 else:
1244 # Do a fuzzy compare.
1245 match = self.match_bitap(text, pattern, loc)
1246 return match
1308 loc = max(0, min(loc, len(text)))
1309 if text == pattern:
1310 # Shortcut (potentially not guaranteed by the algorithm)
1311 return 0
1312 elif not text:
1313 # Nothing to match.
1314 return -1
1315 elif text[loc : loc + len(pattern)] == pattern:
1316 # Perfect match at the perfect spot! (Includes case of null pattern)
1317 return loc
1318 else:
1319 # Do a fuzzy compare.
1320 match = self.match_bitap(text, pattern, loc)
1321 return match
1247 1322
1248 def match_bitap(self, text, pattern, loc):
1249 """Locate the best instance of 'pattern' in 'text' near 'loc' using the
1250 Bitap algorithm.
1323 def match_bitap(self, text, pattern, loc):
1324 """Locate the best instance of 'pattern' in 'text' near 'loc' using the
1325 Bitap algorithm.
1251 1326
1252 Args:
1253 text: The text to search.
1254 pattern: The pattern to search for.
1255 loc: The location to search around.
1327 Args:
1328 text: The text to search.
1329 pattern: The pattern to search for.
1330 loc: The location to search around.
1256 1331
1257 Returns:
1258 Best match index or -1.
1259 """
1260 # Python doesn't have a maxint limit, so ignore this check.
1261 #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1262 # raise ValueError("Pattern too long for this application.")
1332 Returns:
1333 Best match index or -1.
1334 """
1335 # Python doesn't have a maxint limit, so ignore this check.
1336 # if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1337 # raise ValueError("Pattern too long for this application.")
1263 1338
1264 # Initialise the alphabet.
1265 s = self.match_alphabet(pattern)
1339 # Initialise the alphabet.
1340 s = self.match_alphabet(pattern)
1266 1341
1267 def match_bitapScore(e, x):
1268 """Compute and return the score for a match with e errors and x location.
1269 Accesses loc and pattern through being a closure.
1342 def match_bitapScore(e, x):
1343 """Compute and return the score for a match with e errors and x location.
1344 Accesses loc and pattern through being a closure.
1270 1345
1271 Args:
1272 e: Number of errors in match.
1273 x: Location of match.
1346 Args:
1347 e: Number of errors in match.
1348 x: Location of match.
1274 1349
1275 Returns:
1276 Overall score for match (0.0 = good, 1.0 = bad).
1277 """
1278 accuracy = float(e) / len(pattern)
1279 proximity = abs(loc - x)
1280 if not self.Match_Distance:
1281 # Dodge divide by zero error.
1282 return proximity and 1.0 or accuracy
1283 return accuracy + (proximity / float(self.Match_Distance))
1350 Returns:
1351 Overall score for match (0.0 = good, 1.0 = bad).
1352 """
1353 accuracy = float(e) / len(pattern)
1354 proximity = abs(loc - x)
1355 if not self.Match_Distance:
1356 # Dodge divide by zero error.
1357 return proximity and 1.0 or accuracy
1358 return accuracy + (proximity / float(self.Match_Distance))
1284 1359
1285 # Highest score beyond which we give up.
1286 score_threshold = self.Match_Threshold
1287 # Is there a nearby exact match? (speedup)
1288 best_loc = text.find(pattern, loc)
1289 if best_loc != -1:
1290 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1291 # What about in the other direction? (speedup)
1292 best_loc = text.rfind(pattern, loc + len(pattern))
1293 if best_loc != -1:
1294 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1360 # Highest score beyond which we give up.
1361 score_threshold = self.Match_Threshold
1362 # Is there a nearby exact match? (speedup)
1363 best_loc = text.find(pattern, loc)
1364 if best_loc != -1:
1365 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1366 # What about in the other direction? (speedup)
1367 best_loc = text.rfind(pattern, loc + len(pattern))
1368 if best_loc != -1:
1369 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1295 1370
1296 # Initialise the bit arrays.
1297 matchmask = 1 << (len(pattern) - 1)
1298 best_loc = -1
1371 # Initialise the bit arrays.
1372 matchmask = 1 << (len(pattern) - 1)
1373 best_loc = -1
1299 1374
1300 bin_max = len(pattern) + len(text)
1301 # Empty initialization added to appease pychecker.
1302 last_rd = None
1303 for d in range(len(pattern)):
1304 # Scan for the best match each iteration allows for one more error.
1305 # Run a binary search to determine how far from 'loc' we can stray at
1306 # this error level.
1307 bin_min = 0
1308 bin_mid = bin_max
1309 while bin_min < bin_mid:
1310 if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1311 bin_min = bin_mid
1312 else:
1313 bin_max = bin_mid
1314 bin_mid = (bin_max - bin_min) // 2 + bin_min
1375 bin_max = len(pattern) + len(text)
1376 # Empty initialization added to appease pychecker.
1377 last_rd = None
1378 for d in range(len(pattern)):
1379 # Scan for the best match each iteration allows for one more error.
1380 # Run a binary search to determine how far from 'loc' we can stray at
1381 # this error level.
1382 bin_min = 0
1383 bin_mid = bin_max
1384 while bin_min < bin_mid:
1385 if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1386 bin_min = bin_mid
1387 else:
1388 bin_max = bin_mid
1389 bin_mid = (bin_max - bin_min) // 2 + bin_min
1315 1390
1316 # Use the result from this iteration as the maximum for the next.
1317 bin_max = bin_mid
1318 start = max(1, loc - bin_mid + 1)
1319 finish = min(loc + bin_mid, len(text)) + len(pattern)
1391 # Use the result from this iteration as the maximum for the next.
1392 bin_max = bin_mid
1393 start = max(1, loc - bin_mid + 1)
1394 finish = min(loc + bin_mid, len(text)) + len(pattern)
1320 1395
1321 rd = [0] * (finish + 2)
1322 rd[finish + 1] = (1 << d) - 1
1323 for j in range(finish, start - 1, -1):
1324 if len(text) <= j - 1:
1325 # Out of range.
1326 charMatch = 0
1327 else:
1328 charMatch = s.get(text[j - 1], 0)
1329 if d == 0: # First pass: exact match.
1330 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1331 else: # Subsequent passes: fuzzy match.
1332 rd[j] = (((rd[j + 1] << 1) | 1) & charMatch) | (
1333 ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
1334 if rd[j] & matchmask:
1335 score = match_bitapScore(d, j - 1)
1336 # This match will almost certainly be better than any existing match.
1337 # But check anyway.
1338 if score <= score_threshold:
1339 # Told you so.
1340 score_threshold = score
1341 best_loc = j - 1
1342 if best_loc > loc:
1343 # When passing loc, don't exceed our current distance from loc.
1344 start = max(1, 2 * loc - best_loc)
1345 else:
1346 # Already passed loc, downhill from here on in.
1347 break
1348 # No hope for a (better) match at greater error levels.
1349 if match_bitapScore(d + 1, loc) > score_threshold:
1350 break
1351 last_rd = rd
1352 return best_loc
1396 rd = [0] * (finish + 2)
1397 rd[finish + 1] = (1 << d) - 1
1398 for j in range(finish, start - 1, -1):
1399 if len(text) <= j - 1:
1400 # Out of range.
1401 charMatch = 0
1402 else:
1403 charMatch = s.get(text[j - 1], 0)
1404 if d == 0: # First pass: exact match.
1405 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1406 else: # Subsequent passes: fuzzy match.
1407 rd[j] = (
1408 (((rd[j + 1] << 1) | 1) & charMatch)
1409 | (((last_rd[j + 1] | last_rd[j]) << 1) | 1)
1410 | last_rd[j + 1]
1411 )
1412 if rd[j] & matchmask:
1413 score = match_bitapScore(d, j - 1)
1414 # This match will almost certainly be better than any existing match.
1415 # But check anyway.
1416 if score <= score_threshold:
1417 # Told you so.
1418 score_threshold = score
1419 best_loc = j - 1
1420 if best_loc > loc:
1421 # When passing loc, don't exceed our current distance from loc.
1422 start = max(1, 2 * loc - best_loc)
1423 else:
1424 # Already passed loc, downhill from here on in.
1425 break
1426 # No hope for a (better) match at greater error levels.
1427 if match_bitapScore(d + 1, loc) > score_threshold:
1428 break
1429 last_rd = rd
1430 return best_loc
1353 1431
1354 def match_alphabet(self, pattern):
1355 """Initialise the alphabet for the Bitap algorithm.
1432 def match_alphabet(self, pattern):
1433 """Initialise the alphabet for the Bitap algorithm.
1356 1434
1357 Args:
1358 pattern: The text to encode.
1435 Args:
1436 pattern: The text to encode.
1359 1437
1360 Returns:
1361 Hash of character locations.
1362 """
1363 s = {}
1364 for char in pattern:
1365 s[char] = 0
1366 for i in range(len(pattern)):
1367 s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1368 return s
1438 Returns:
1439 Hash of character locations.
1440 """
1441 s = {}
1442 for char in pattern:
1443 s[char] = 0
1444 for i in range(len(pattern)):
1445 s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1446 return s
1369 1447
1370 # PATCH FUNCTIONS
1448 # PATCH FUNCTIONS
1371 1449
1372 def patch_addContext(self, patch, text):
1373 """Increase the context until it is unique,
1374 but don't let the pattern expand beyond Match_MaxBits.
1450 def patch_addContext(self, patch, text):
1451 """Increase the context until it is unique,
1452 but don't let the pattern expand beyond Match_MaxBits.
1375 1453
1376 Args:
1377 patch: The patch to grow.
1378 text: Source text.
1379 """
1380 if len(text) == 0:
1381 return
1382 pattern = text[patch.start2 : patch.start2 + patch.length1]
1383 padding = 0
1454 Args:
1455 patch: The patch to grow.
1456 text: Source text.
1457 """
1458 if len(text) == 0:
1459 return
1460 pattern = text[patch.start2 : patch.start2 + patch.length1]
1461 padding = 0
1384 1462
1385 # Look for the first and last matches of pattern in text. If two different
1386 # matches are found, increase the pattern length.
1387 while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits ==
1388 0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin -
1389 self.Patch_Margin)):
1390 padding += self.Patch_Margin
1391 pattern = text[max(0, patch.start2 - padding) :
1392 patch.start2 + patch.length1 + padding]
1393 # Add one chunk for good luck.
1394 padding += self.Patch_Margin
1463 # Look for the first and last matches of pattern in text. If two different
1464 # matches are found, increase the pattern length.
1465 while text.find(pattern) != text.rfind(pattern) and (
1466 self.Match_MaxBits == 0
1467 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - self.Patch_Margin
1468 ):
1469 padding += self.Patch_Margin
1470 pattern = text[
1471 max(0, patch.start2 - padding) : patch.start2 + patch.length1 + padding
1472 ]
1473 # Add one chunk for good luck.
1474 padding += self.Patch_Margin
1395 1475
1396 # Add the prefix.
1397 prefix = text[max(0, patch.start2 - padding) : patch.start2]
1398 if prefix:
1399 patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1400 # Add the suffix.
1401 suffix = text[patch.start2 + patch.length1 :
1402 patch.start2 + patch.length1 + padding]
1403 if suffix:
1404 patch.diffs.append((self.DIFF_EQUAL, suffix))
1476 # Add the prefix.
1477 prefix = text[max(0, patch.start2 - padding) : patch.start2]
1478 if prefix:
1479 patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1480 # Add the suffix.
1481 suffix = text[
1482 patch.start2 + patch.length1 : patch.start2 + patch.length1 + padding
1483 ]
1484 if suffix:
1485 patch.diffs.append((self.DIFF_EQUAL, suffix))
1405 1486
1406 # Roll back the start points.
1407 patch.start1 -= len(prefix)
1408 patch.start2 -= len(prefix)
1409 # Extend lengths.
1410 patch.length1 += len(prefix) + len(suffix)
1411 patch.length2 += len(prefix) + len(suffix)
1487 # Roll back the start points.
1488 patch.start1 -= len(prefix)
1489 patch.start2 -= len(prefix)
1490 # Extend lengths.
1491 patch.length1 += len(prefix) + len(suffix)
1492 patch.length2 += len(prefix) + len(suffix)
1412 1493
1413 def patch_make(self, a, b=None, c=None):
1414 """Compute a list of patches to turn text1 into text2.
1415 Use diffs if provided, otherwise compute it ourselves.
1416 There are four ways to call this function, depending on what data is
1417 available to the caller:
1418 Method 1:
1419 a = text1, b = text2
1420 Method 2:
1421 a = diffs
1422 Method 3 (optimal):
1423 a = text1, b = diffs
1424 Method 4 (deprecated, use method 3):
1425 a = text1, b = text2, c = diffs
1494 def patch_make(self, a, b=None, c=None):
1495 """Compute a list of patches to turn text1 into text2.
1496 Use diffs if provided, otherwise compute it ourselves.
1497 There are four ways to call this function, depending on what data is
1498 available to the caller:
1499 Method 1:
1500 a = text1, b = text2
1501 Method 2:
1502 a = diffs
1503 Method 3 (optimal):
1504 a = text1, b = diffs
1505 Method 4 (deprecated, use method 3):
1506 a = text1, b = text2, c = diffs
1426 1507
1427 Args:
1428 a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1429 text2 (method 2).
1430 b: text2 (methods 1,4) or Array of diff tuples for text1 to
1431 text2 (method 3) or undefined (method 2).
1432 c: Array of diff tuples for text1 to text2 (method 4) or
1433 undefined (methods 1,2,3).
1508 Args:
1509 a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1510 text2 (method 2).
1511 b: text2 (methods 1,4) or Array of diff tuples for text1 to
1512 text2 (method 3) or undefined (method 2).
1513 c: Array of diff tuples for text1 to text2 (method 4) or
1514 undefined (methods 1,2,3).
1434 1515
1435 Returns:
1436 Array of Patch objects.
1437 """
1438 text1 = None
1439 diffs = None
1440 # Note that texts may arrive as 'str' or 'unicode'.
1441 if isinstance(a, str) and isinstance(b, str) and c is None:
1442 # Method 1: text1, text2
1443 # Compute diffs from text1 and text2.
1444 text1 = a
1445 diffs = self.diff_main(text1, b, True)
1446 if len(diffs) > 2:
1447 self.diff_cleanupSemantic(diffs)
1448 self.diff_cleanupEfficiency(diffs)
1449 elif isinstance(a, list) and b is None and c is None:
1450 # Method 2: diffs
1451 # Compute text1 from diffs.
1452 diffs = a
1453 text1 = self.diff_text1(diffs)
1454 elif isinstance(a, str) and isinstance(b, list) and c is None:
1455 # Method 3: text1, diffs
1456 text1 = a
1457 diffs = b
1458 elif (isinstance(a, str) and isinstance(b, str) and
1459 isinstance(c, list)):
1460 # Method 4: text1, text2, diffs
1461 # text2 is not used.
1462 text1 = a
1463 diffs = c
1464 else:
1465 raise ValueError("Unknown call format to patch_make.")
1516 Returns:
1517 Array of Patch objects.
1518 """
1519 text1 = None
1520 diffs = None
1521 # Note that texts may arrive as 'str' or 'unicode'.
1522 if isinstance(a, str) and isinstance(b, str) and c is None:
1523 # Method 1: text1, text2
1524 # Compute diffs from text1 and text2.
1525 text1 = a
1526 diffs = self.diff_main(text1, b, True)
1527 if len(diffs) > 2:
1528 self.diff_cleanupSemantic(diffs)
1529 self.diff_cleanupEfficiency(diffs)
1530 elif isinstance(a, list) and b is None and c is None:
1531 # Method 2: diffs
1532 # Compute text1 from diffs.
1533 diffs = a
1534 text1 = self.diff_text1(diffs)
1535 elif isinstance(a, str) and isinstance(b, list) and c is None:
1536 # Method 3: text1, diffs
1537 text1 = a
1538 diffs = b
1539 elif isinstance(a, str) and isinstance(b, str) and isinstance(c, list):
1540 # Method 4: text1, text2, diffs
1541 # text2 is not used.
1542 text1 = a
1543 diffs = c
1544 else:
1545 raise ValueError("Unknown call format to patch_make.")
1466 1546
1467 if not diffs:
1468 return [] # Get rid of the None case.
1469 patches = []
1470 patch = patch_obj()
1471 char_count1 = 0 # Number of characters into the text1 string.
1472 char_count2 = 0 # Number of characters into the text2 string.
1473 prepatch_text = text1 # Recreate the patches to determine context info.
1474 postpatch_text = text1
1475 for x in range(len(diffs)):
1476 (diff_type, diff_text) = diffs[x]
1477 if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1478 # A new patch starts here.
1479 patch.start1 = char_count1
1480 patch.start2 = char_count2
1481 if diff_type == self.DIFF_INSERT:
1482 # Insertion
1483 patch.diffs.append(diffs[x])
1484 patch.length2 += len(diff_text)
1485 postpatch_text = (postpatch_text[:char_count2] + diff_text +
1486 postpatch_text[char_count2:])
1487 elif diff_type == self.DIFF_DELETE:
1488 # Deletion.
1489 patch.length1 += len(diff_text)
1490 patch.diffs.append(diffs[x])
1491 postpatch_text = (postpatch_text[:char_count2] +
1492 postpatch_text[char_count2 + len(diff_text):])
1493 elif (diff_type == self.DIFF_EQUAL and
1494 len(diff_text) <= 2 * self.Patch_Margin and
1495 len(patch.diffs) != 0 and len(diffs) != x + 1):
1496 # Small equality inside a patch.
1497 patch.diffs.append(diffs[x])
1498 patch.length1 += len(diff_text)
1499 patch.length2 += len(diff_text)
1547 if not diffs:
1548 return [] # Get rid of the None case.
1549 patches = []
1550 patch = patch_obj()
1551 char_count1 = 0 # Number of characters into the text1 string.
1552 char_count2 = 0 # Number of characters into the text2 string.
1553 prepatch_text = text1 # Recreate the patches to determine context info.
1554 postpatch_text = text1
1555 for x in range(len(diffs)):
1556 (diff_type, diff_text) = diffs[x]
1557 if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1558 # A new patch starts here.
1559 patch.start1 = char_count1
1560 patch.start2 = char_count2
1561 if diff_type == self.DIFF_INSERT:
1562 # Insertion
1563 patch.diffs.append(diffs[x])
1564 patch.length2 += len(diff_text)
1565 postpatch_text = (
1566 postpatch_text[:char_count2]
1567 + diff_text
1568 + postpatch_text[char_count2:]
1569 )
1570 elif diff_type == self.DIFF_DELETE:
1571 # Deletion.
1572 patch.length1 += len(diff_text)
1573 patch.diffs.append(diffs[x])
1574 postpatch_text = (
1575 postpatch_text[:char_count2]
1576 + postpatch_text[char_count2 + len(diff_text) :]
1577 )
1578 elif (
1579 diff_type == self.DIFF_EQUAL
1580 and len(diff_text) <= 2 * self.Patch_Margin
1581 and len(patch.diffs) != 0
1582 and len(diffs) != x + 1
1583 ):
1584 # Small equality inside a patch.
1585 patch.diffs.append(diffs[x])
1586 patch.length1 += len(diff_text)
1587 patch.length2 += len(diff_text)
1500 1588
1501 if (diff_type == self.DIFF_EQUAL and
1502 len(diff_text) >= 2 * self.Patch_Margin):
1503 # Time for a new patch.
1504 if len(patch.diffs) != 0:
1505 self.patch_addContext(patch, prepatch_text)
1506 patches.append(patch)
1507 patch = patch_obj()
1508 # Unlike Unidiff, our patch lists have a rolling context.
1509 # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
1510 # Update prepatch text & pos to reflect the application of the
1511 # just completed patch.
1512 prepatch_text = postpatch_text
1513 char_count1 = char_count2
1589 if diff_type == self.DIFF_EQUAL and len(diff_text) >= 2 * self.Patch_Margin:
1590 # Time for a new patch.
1591 if len(patch.diffs) != 0:
1592 self.patch_addContext(patch, prepatch_text)
1593 patches.append(patch)
1594 patch = patch_obj()
1595 # Unlike Unidiff, our patch lists have a rolling context.
1596 # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
1597 # Update prepatch text & pos to reflect the application of the
1598 # just completed patch.
1599 prepatch_text = postpatch_text
1600 char_count1 = char_count2
1514 1601
1515 # Update the current character count.
1516 if diff_type != self.DIFF_INSERT:
1517 char_count1 += len(diff_text)
1518 if diff_type != self.DIFF_DELETE:
1519 char_count2 += len(diff_text)
1602 # Update the current character count.
1603 if diff_type != self.DIFF_INSERT:
1604 char_count1 += len(diff_text)
1605 if diff_type != self.DIFF_DELETE:
1606 char_count2 += len(diff_text)
1520 1607
1521 # Pick up the leftover patch if not empty.
1522 if len(patch.diffs) != 0:
1523 self.patch_addContext(patch, prepatch_text)
1524 patches.append(patch)
1525 return patches
1608 # Pick up the leftover patch if not empty.
1609 if len(patch.diffs) != 0:
1610 self.patch_addContext(patch, prepatch_text)
1611 patches.append(patch)
1612 return patches
1526 1613
1527 def patch_deepCopy(self, patches):
1528 """Given an array of patches, return another array that is identical.
1614 def patch_deepCopy(self, patches):
1615 """Given an array of patches, return another array that is identical.
1529 1616
1530 Args:
1531 patches: Array of Patch objects.
1617 Args:
1618 patches: Array of Patch objects.
1532 1619
1533 Returns:
1534 Array of Patch objects.
1535 """
1536 patchesCopy = []
1537 for patch in patches:
1538 patchCopy = patch_obj()
1539 # No need to deep copy the tuples since they are immutable.
1540 patchCopy.diffs = patch.diffs[:]
1541 patchCopy.start1 = patch.start1
1542 patchCopy.start2 = patch.start2
1543 patchCopy.length1 = patch.length1
1544 patchCopy.length2 = patch.length2
1545 patchesCopy.append(patchCopy)
1546 return patchesCopy
1620 Returns:
1621 Array of Patch objects.
1622 """
1623 patchesCopy = []
1624 for patch in patches:
1625 patchCopy = patch_obj()
1626 # No need to deep copy the tuples since they are immutable.
1627 patchCopy.diffs = patch.diffs[:]
1628 patchCopy.start1 = patch.start1
1629 patchCopy.start2 = patch.start2
1630 patchCopy.length1 = patch.length1
1631 patchCopy.length2 = patch.length2
1632 patchesCopy.append(patchCopy)
1633 return patchesCopy
1547 1634
1548 def patch_apply(self, patches, text):
1549 """Merge a set of patches onto the text. Return a patched text, as well
1550 as a list of true/false values indicating which patches were applied.
1635 def patch_apply(self, patches, text):
1636 """Merge a set of patches onto the text. Return a patched text, as well
1637 as a list of true/false values indicating which patches were applied.
1551 1638
1552 Args:
1553 patches: Array of Patch objects.
1554 text: Old text.
1639 Args:
1640 patches: Array of Patch objects.
1641 text: Old text.
1555 1642
1556 Returns:
1557 Two element Array, containing the new text and an array of boolean values.
1558 """
1559 if not patches:
1560 return (text, [])
1643 Returns:
1644 Two element Array, containing the new text and an array of boolean values.
1645 """
1646 if not patches:
1647 return (text, [])
1561 1648
1562 # Deep copy the patches so that no changes are made to originals.
1563 patches = self.patch_deepCopy(patches)
1649 # Deep copy the patches so that no changes are made to originals.
1650 patches = self.patch_deepCopy(patches)
1564 1651
1565 nullPadding = self.patch_addPadding(patches)
1566 text = nullPadding + text + nullPadding
1567 self.patch_splitMax(patches)
1652 nullPadding = self.patch_addPadding(patches)
1653 text = nullPadding + text + nullPadding
1654 self.patch_splitMax(patches)
1568 1655
1569 # delta keeps track of the offset between the expected and actual location
1570 # of the previous patch. If there are patches expected at positions 10 and
1571 # 20, but the first patch was found at 12, delta is 2 and the second patch
1572 # has an effective expected position of 22.
1573 delta = 0
1574 results = []
1575 for patch in patches:
1576 expected_loc = patch.start2 + delta
1577 text1 = self.diff_text1(patch.diffs)
1578 end_loc = -1
1579 if len(text1) > self.Match_MaxBits:
1580 # patch_splitMax will only provide an oversized pattern in the case of
1581 # a monster delete.
1582 start_loc = self.match_main(text, text1[:self.Match_MaxBits],
1583 expected_loc)
1584 if start_loc != -1:
1585 end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
1586 expected_loc + len(text1) - self.Match_MaxBits)
1587 if end_loc == -1 or start_loc >= end_loc:
1588 # Can't find valid trailing context. Drop this patch.
1589 start_loc = -1
1590 else:
1591 start_loc = self.match_main(text, text1, expected_loc)
1592 if start_loc == -1:
1593 # No match found. :(
1594 results.append(False)
1595 # Subtract the delta for this failed patch from subsequent patches.
1596 delta -= patch.length2 - patch.length1
1597 else:
1598 # Found a match. :)
1599 results.append(True)
1600 delta = start_loc - expected_loc
1601 if end_loc == -1:
1602 text2 = text[start_loc : start_loc + len(text1)]
1603 else:
1604 text2 = text[start_loc : end_loc + self.Match_MaxBits]
1605 if text1 == text2:
1606 # Perfect match, just shove the replacement text in.
1607 text = (text[:start_loc] + self.diff_text2(patch.diffs) +
1608 text[start_loc + len(text1):])
1609 else:
1610 # Imperfect match.
1611 # Run a diff to get a framework of equivalent indices.
1612 diffs = self.diff_main(text1, text2, False)
1613 if (len(text1) > self.Match_MaxBits and
1614 self.diff_levenshtein(diffs) / float(len(text1)) >
1615 self.Patch_DeleteThreshold):
1616 # The end points match, but the content is unacceptably bad.
1617 results[-1] = False
1618 else:
1619 self.diff_cleanupSemanticLossless(diffs)
1620 index1 = 0
1621 for (op, data) in patch.diffs:
1622 if op != self.DIFF_EQUAL:
1623 index2 = self.diff_xIndex(diffs, index1)
1624 if op == self.DIFF_INSERT: # Insertion
1625 text = text[:start_loc + index2] + data + text[start_loc +
1626 index2:]
1627 elif op == self.DIFF_DELETE: # Deletion
1628 text = text[:start_loc + index2] + text[start_loc +
1629 self.diff_xIndex(diffs, index1 + len(data)):]
1630 if op != self.DIFF_DELETE:
1631 index1 += len(data)
1632 # Strip the padding off.
1633 text = text[len(nullPadding):-len(nullPadding)]
1634 return (text, results)
1656 # delta keeps track of the offset between the expected and actual location
1657 # of the previous patch. If there are patches expected at positions 10 and
1658 # 20, but the first patch was found at 12, delta is 2 and the second patch
1659 # has an effective expected position of 22.
1660 delta = 0
1661 results = []
1662 for patch in patches:
1663 expected_loc = patch.start2 + delta
1664 text1 = self.diff_text1(patch.diffs)
1665 end_loc = -1
1666 if len(text1) > self.Match_MaxBits:
1667 # patch_splitMax will only provide an oversized pattern in the case of
1668 # a monster delete.
1669 start_loc = self.match_main(
1670 text, text1[: self.Match_MaxBits], expected_loc
1671 )
1672 if start_loc != -1:
1673 end_loc = self.match_main(
1674 text,
1675 text1[-self.Match_MaxBits :],
1676 expected_loc + len(text1) - self.Match_MaxBits,
1677 )
1678 if end_loc == -1 or start_loc >= end_loc:
1679 # Can't find valid trailing context. Drop this patch.
1680 start_loc = -1
1681 else:
1682 start_loc = self.match_main(text, text1, expected_loc)
1683 if start_loc == -1:
1684 # No match found. :(
1685 results.append(False)
1686 # Subtract the delta for this failed patch from subsequent patches.
1687 delta -= patch.length2 - patch.length1
1688 else:
1689 # Found a match. :)
1690 results.append(True)
1691 delta = start_loc - expected_loc
1692 if end_loc == -1:
1693 text2 = text[start_loc : start_loc + len(text1)]
1694 else:
1695 text2 = text[start_loc : end_loc + self.Match_MaxBits]
1696 if text1 == text2:
1697 # Perfect match, just shove the replacement text in.
1698 text = (
1699 text[:start_loc]
1700 + self.diff_text2(patch.diffs)
1701 + text[start_loc + len(text1) :]
1702 )
1703 else:
1704 # Imperfect match.
1705 # Run a diff to get a framework of equivalent indices.
1706 diffs = self.diff_main(text1, text2, False)
1707 if (
1708 len(text1) > self.Match_MaxBits
1709 and self.diff_levenshtein(diffs) / float(len(text1))
1710 > self.Patch_DeleteThreshold
1711 ):
1712 # The end points match, but the content is unacceptably bad.
1713 results[-1] = False
1714 else:
1715 self.diff_cleanupSemanticLossless(diffs)
1716 index1 = 0
1717 for op, data in patch.diffs:
1718 if op != self.DIFF_EQUAL:
1719 index2 = self.diff_xIndex(diffs, index1)
1720 if op == self.DIFF_INSERT: # Insertion
1721 text = (
1722 text[: start_loc + index2]
1723 + data
1724 + text[start_loc + index2 :]
1725 )
1726 elif op == self.DIFF_DELETE: # Deletion
1727 text = (
1728 text[: start_loc + index2]
1729 + text[
1730 start_loc
1731 + self.diff_xIndex(diffs, index1 + len(data)) :
1732 ]
1733 )
1734 if op != self.DIFF_DELETE:
1735 index1 += len(data)
1736 # Strip the padding off.
1737 text = text[len(nullPadding) : -len(nullPadding)]
1738 return (text, results)
1635 1739
1636 def patch_addPadding(self, patches):
1637 """Add some padding on text start and end so that edges can match
1638 something. Intended to be called only from within patch_apply.
1740 def patch_addPadding(self, patches):
1741 """Add some padding on text start and end so that edges can match
1742 something. Intended to be called only from within patch_apply.
1639 1743
1640 Args:
1641 patches: Array of Patch objects.
1744 Args:
1745 patches: Array of Patch objects.
1642 1746
1643 Returns:
1644 The padding string added to each side.
1645 """
1646 paddingLength = self.Patch_Margin
1647 nullPadding = ""
1648 for x in range(1, paddingLength + 1):
1649 nullPadding += chr(x)
1747 Returns:
1748 The padding string added to each side.
1749 """
1750 paddingLength = self.Patch_Margin
1751 nullPadding = ""
1752 for x in range(1, paddingLength + 1):
1753 nullPadding += chr(x)
1650 1754
1651 # Bump all the patches forward.
1652 for patch in patches:
1653 patch.start1 += paddingLength
1654 patch.start2 += paddingLength
1755 # Bump all the patches forward.
1756 for patch in patches:
1757 patch.start1 += paddingLength
1758 patch.start2 += paddingLength
1655 1759
1656 # Add some padding on start of first diff.
1657 patch = patches[0]
1658 diffs = patch.diffs
1659 if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1660 # Add nullPadding equality.
1661 diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1662 patch.start1 -= paddingLength # Should be 0.
1663 patch.start2 -= paddingLength # Should be 0.
1664 patch.length1 += paddingLength
1665 patch.length2 += paddingLength
1666 elif paddingLength > len(diffs[0][1]):
1667 # Grow first equality.
1668 extraLength = paddingLength - len(diffs[0][1])
1669 newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]
1670 diffs[0] = (diffs[0][0], newText)
1671 patch.start1 -= extraLength
1672 patch.start2 -= extraLength
1673 patch.length1 += extraLength
1674 patch.length2 += extraLength
1760 # Add some padding on start of first diff.
1761 patch = patches[0]
1762 diffs = patch.diffs
1763 if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1764 # Add nullPadding equality.
1765 diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1766 patch.start1 -= paddingLength # Should be 0.
1767 patch.start2 -= paddingLength # Should be 0.
1768 patch.length1 += paddingLength
1769 patch.length2 += paddingLength
1770 elif paddingLength > len(diffs[0][1]):
1771 # Grow first equality.
1772 extraLength = paddingLength - len(diffs[0][1])
1773 newText = nullPadding[len(diffs[0][1]) :] + diffs[0][1]
1774 diffs[0] = (diffs[0][0], newText)
1775 patch.start1 -= extraLength
1776 patch.start2 -= extraLength
1777 patch.length1 += extraLength
1778 patch.length2 += extraLength
1675 1779
1676 # Add some padding on end of last diff.
1677 patch = patches[-1]
1678 diffs = patch.diffs
1679 if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1680 # Add nullPadding equality.
1681 diffs.append((self.DIFF_EQUAL, nullPadding))
1682 patch.length1 += paddingLength
1683 patch.length2 += paddingLength
1684 elif paddingLength > len(diffs[-1][1]):
1685 # Grow last equality.
1686 extraLength = paddingLength - len(diffs[-1][1])
1687 newText = diffs[-1][1] + nullPadding[:extraLength]
1688 diffs[-1] = (diffs[-1][0], newText)
1689 patch.length1 += extraLength
1690 patch.length2 += extraLength
1780 # Add some padding on end of last diff.
1781 patch = patches[-1]
1782 diffs = patch.diffs
1783 if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1784 # Add nullPadding equality.
1785 diffs.append((self.DIFF_EQUAL, nullPadding))
1786 patch.length1 += paddingLength
1787 patch.length2 += paddingLength
1788 elif paddingLength > len(diffs[-1][1]):
1789 # Grow last equality.
1790 extraLength = paddingLength - len(diffs[-1][1])
1791 newText = diffs[-1][1] + nullPadding[:extraLength]
1792 diffs[-1] = (diffs[-1][0], newText)
1793 patch.length1 += extraLength
1794 patch.length2 += extraLength
1691 1795
1692 return nullPadding
1796 return nullPadding
1693 1797
1694 def patch_splitMax(self, patches):
1695 """Look through the patches and break up any which are longer than the
1696 maximum limit of the match algorithm.
1697 Intended to be called only from within patch_apply.
1798 def patch_splitMax(self, patches):
1799 """Look through the patches and break up any which are longer than the
1800 maximum limit of the match algorithm.
1801 Intended to be called only from within patch_apply.
1698 1802
1699 Args:
1700 patches: Array of Patch objects.
1701 """
1702 patch_size = self.Match_MaxBits
1703 if patch_size == 0:
1704 # Python has the option of not splitting strings due to its ability
1705 # to handle integers of arbitrary precision.
1706 return
1707 for x in range(len(patches)):
1708 if patches[x].length1 <= patch_size:
1709 continue
1710 bigpatch = patches[x]
1711 # Remove the big old patch.
1712 del patches[x]
1713 x -= 1
1714 start1 = bigpatch.start1
1715 start2 = bigpatch.start2
1716 precontext = ''
1717 while len(bigpatch.diffs) != 0:
1718 # Create one of several smaller patches.
1719 patch = patch_obj()
1720 empty = True
1721 patch.start1 = start1 - len(precontext)
1722 patch.start2 = start2 - len(precontext)
1723 if precontext:
1724 patch.length1 = patch.length2 = len(precontext)
1725 patch.diffs.append((self.DIFF_EQUAL, precontext))
1803 Args:
1804 patches: Array of Patch objects.
1805 """
1806 patch_size = self.Match_MaxBits
1807 if patch_size == 0:
1808 # Python has the option of not splitting strings due to its ability
1809 # to handle integers of arbitrary precision.
1810 return
1811 for x in range(len(patches)):
1812 if patches[x].length1 <= patch_size:
1813 continue
1814 bigpatch = patches[x]
1815 # Remove the big old patch.
1816 del patches[x]
1817 x -= 1
1818 start1 = bigpatch.start1
1819 start2 = bigpatch.start2
1820 precontext = ""
1821 while len(bigpatch.diffs) != 0:
1822 # Create one of several smaller patches.
1823 patch = patch_obj()
1824 empty = True
1825 patch.start1 = start1 - len(precontext)
1826 patch.start2 = start2 - len(precontext)
1827 if precontext:
1828 patch.length1 = patch.length2 = len(precontext)
1829 patch.diffs.append((self.DIFF_EQUAL, precontext))
1726 1830
1727 while (len(bigpatch.diffs) != 0 and
1728 patch.length1 < patch_size - self.Patch_Margin):
1729 (diff_type, diff_text) = bigpatch.diffs[0]
1730 if diff_type == self.DIFF_INSERT:
1731 # Insertions are harmless.
1732 patch.length2 += len(diff_text)
1733 start2 += len(diff_text)
1734 patch.diffs.append(bigpatch.diffs.pop(0))
1735 empty = False
1736 elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
1737 patch.diffs[0][0] == self.DIFF_EQUAL and
1738 len(diff_text) > 2 * patch_size):
1739 # This is a large deletion. Let it pass in one chunk.
1740 patch.length1 += len(diff_text)
1741 start1 += len(diff_text)
1742 empty = False
1743 patch.diffs.append((diff_type, diff_text))
1744 del bigpatch.diffs[0]
1745 else:
1746 # Deletion or equality. Only take as much as we can stomach.
1747 diff_text = diff_text[:patch_size - patch.length1 -
1748 self.Patch_Margin]
1749 patch.length1 += len(diff_text)
1750 start1 += len(diff_text)
1751 if diff_type == self.DIFF_EQUAL:
1752 patch.length2 += len(diff_text)
1753 start2 += len(diff_text)
1754 else:
1755 empty = False
1831 while (
1832 len(bigpatch.diffs) != 0
1833 and patch.length1 < patch_size - self.Patch_Margin
1834 ):
1835 (diff_type, diff_text) = bigpatch.diffs[0]
1836 if diff_type == self.DIFF_INSERT:
1837 # Insertions are harmless.
1838 patch.length2 += len(diff_text)
1839 start2 += len(diff_text)
1840 patch.diffs.append(bigpatch.diffs.pop(0))
1841 empty = False
1842 elif (
1843 diff_type == self.DIFF_DELETE
1844 and len(patch.diffs) == 1
1845 and patch.diffs[0][0] == self.DIFF_EQUAL
1846 and len(diff_text) > 2 * patch_size
1847 ):
1848 # This is a large deletion. Let it pass in one chunk.
1849 patch.length1 += len(diff_text)
1850 start1 += len(diff_text)
1851 empty = False
1852 patch.diffs.append((diff_type, diff_text))
1853 del bigpatch.diffs[0]
1854 else:
1855 # Deletion or equality. Only take as much as we can stomach.
1856 diff_text = diff_text[
1857 : patch_size - patch.length1 - self.Patch_Margin
1858 ]
1859 patch.length1 += len(diff_text)
1860 start1 += len(diff_text)
1861 if diff_type == self.DIFF_EQUAL:
1862 patch.length2 += len(diff_text)
1863 start2 += len(diff_text)
1864 else:
1865 empty = False
1756 1866
1757 patch.diffs.append((diff_type, diff_text))
1758 if diff_text == bigpatch.diffs[0][1]:
1759 del bigpatch.diffs[0]
1760 else:
1761 bigpatch.diffs[0] = (bigpatch.diffs[0][0],
1762 bigpatch.diffs[0][1][len(diff_text):])
1867 patch.diffs.append((diff_type, diff_text))
1868 if diff_text == bigpatch.diffs[0][1]:
1869 del bigpatch.diffs[0]
1870 else:
1871 bigpatch.diffs[0] = (
1872 bigpatch.diffs[0][0],
1873 bigpatch.diffs[0][1][len(diff_text) :],
1874 )
1763 1875
1764 # Compute the head context for the next patch.
1765 precontext = self.diff_text2(patch.diffs)
1766 precontext = precontext[-self.Patch_Margin:]
1767 # Append the end context for this patch.
1768 postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin]
1769 if postcontext:
1770 patch.length1 += len(postcontext)
1771 patch.length2 += len(postcontext)
1772 if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1773 patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
1774 postcontext)
1775 else:
1776 patch.diffs.append((self.DIFF_EQUAL, postcontext))
1876 # Compute the head context for the next patch.
1877 precontext = self.diff_text2(patch.diffs)
1878 precontext = precontext[-self.Patch_Margin :]
1879 # Append the end context for this patch.
1880 postcontext = self.diff_text1(bigpatch.diffs)[: self.Patch_Margin]
1881 if postcontext:
1882 patch.length1 += len(postcontext)
1883 patch.length2 += len(postcontext)
1884 if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1885 patch.diffs[-1] = (
1886 self.DIFF_EQUAL,
1887 patch.diffs[-1][1] + postcontext,
1888 )
1889 else:
1890 patch.diffs.append((self.DIFF_EQUAL, postcontext))
1777 1891
1778 if not empty:
1779 x += 1
1780 patches.insert(x, patch)
1781
1782 def patch_toText(self, patches):
1783 """Take a list of patches and return a textual representation.
1892 if not empty:
1893 x += 1
1894 patches.insert(x, patch)
1784 1895
1785 Args:
1786 patches: Array of Patch objects.
1896 def patch_toText(self, patches):
1897 """Take a list of patches and return a textual representation.
1898
1899 Args:
1900 patches: Array of Patch objects.
1787 1901
1788 Returns:
1789 Text representation of patches.
1790 """
1791 text = []
1792 for patch in patches:
1793 text.append(str(patch))
1794 return "".join(text)
1902 Returns:
1903 Text representation of patches.
1904 """
1905 text = []
1906 for patch in patches:
1907 text.append(str(patch))
1908 return "".join(text)
1795 1909
1796 def patch_fromText(self, textline):
1797 """Parse a textual representation of patches and return a list of patch
1798 objects.
1910 def patch_fromText(self, textline):
1911 """Parse a textual representation of patches and return a list of patch
1912 objects.
1799 1913
1800 Args:
1801 textline: Text representation of patches.
1914 Args:
1915 textline: Text representation of patches.
1802 1916
1803 Returns:
1804 Array of Patch objects.
1917 Returns:
1918 Array of Patch objects.
1805 1919
1806 Raises:
1807 ValueError: If invalid input.
1808 """
1809 if type(textline) == unicode:
1810 # Patches should be composed of a subset of ascii chars, Unicode not
1811 # required. If this encode raises UnicodeEncodeError, patch is invalid.
1812 textline = textline.encode("ascii")
1813 patches = []
1814 if not textline:
1815 return patches
1816 text = textline.split('\n')
1817 while len(text) != 0:
1818 m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1819 if not m:
1820 raise ValueError("Invalid patch string: " + text[0])
1821 patch = patch_obj()
1822 patches.append(patch)
1823 patch.start1 = int(m.group(1))
1824 if m.group(2) == '':
1825 patch.start1 -= 1
1826 patch.length1 = 1
1827 elif m.group(2) == '0':
1828 patch.length1 = 0
1829 else:
1830 patch.start1 -= 1
1831 patch.length1 = int(m.group(2))
1920 Raises:
1921 ValueError: If invalid input.
1922 """
1923 if type(textline) == unicode:
1924 # Patches should be composed of a subset of ascii chars, Unicode not
1925 # required. If this encode raises UnicodeEncodeError, patch is invalid.
1926 textline = textline.encode("ascii")
1927 patches = []
1928 if not textline:
1929 return patches
1930 text = textline.split("\n")
1931 while len(text) != 0:
1932 m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1933 if not m:
1934 raise ValueError("Invalid patch string: " + text[0])
1935 patch = patch_obj()
1936 patches.append(patch)
1937 patch.start1 = int(m.group(1))
1938 if m.group(2) == "":
1939 patch.start1 -= 1
1940 patch.length1 = 1
1941 elif m.group(2) == "0":
1942 patch.length1 = 0
1943 else:
1944 patch.start1 -= 1
1945 patch.length1 = int(m.group(2))
1832 1946
1833 patch.start2 = int(m.group(3))
1834 if m.group(4) == '':
1835 patch.start2 -= 1
1836 patch.length2 = 1
1837 elif m.group(4) == '0':
1838 patch.length2 = 0
1839 else:
1840 patch.start2 -= 1
1841 patch.length2 = int(m.group(4))
1947 patch.start2 = int(m.group(3))
1948 if m.group(4) == "":
1949 patch.start2 -= 1
1950 patch.length2 = 1
1951 elif m.group(4) == "0":
1952 patch.length2 = 0
1953 else:
1954 patch.start2 -= 1
1955 patch.length2 = int(m.group(4))
1842 1956
1843 del text[0]
1957 del text[0]
1844 1958
1845 while len(text) != 0:
1846 if text[0]:
1847 sign = text[0][0]
1848 else:
1849 sign = ''
1850 line = urllib.parse.unquote(text[0][1:])
1851 line = line.decode("utf-8")
1852 if sign == '+':
1853 # Insertion.
1854 patch.diffs.append((self.DIFF_INSERT, line))
1855 elif sign == '-':
1856 # Deletion.
1857 patch.diffs.append((self.DIFF_DELETE, line))
1858 elif sign == ' ':
1859 # Minor equality.
1860 patch.diffs.append((self.DIFF_EQUAL, line))
1861 elif sign == '@':
1862 # Start of next patch.
1863 break
1864 elif sign == '':
1865 # Blank line? Whatever.
1866 pass
1867 else:
1868 # WTF?
1869 raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
1870 del text[0]
1871 return patches
1959 while len(text) != 0:
1960 if text[0]:
1961 sign = text[0][0]
1962 else:
1963 sign = ""
1964 line = urllib.parse.unquote(text[0][1:])
1965 line = line.decode("utf-8")
1966 if sign == "+":
1967 # Insertion.
1968 patch.diffs.append((self.DIFF_INSERT, line))
1969 elif sign == "-":
1970 # Deletion.
1971 patch.diffs.append((self.DIFF_DELETE, line))
1972 elif sign == " ":
1973 # Minor equality.
1974 patch.diffs.append((self.DIFF_EQUAL, line))
1975 elif sign == "@":
1976 # Start of next patch.
1977 break
1978 elif sign == "":
1979 # Blank line? Whatever.
1980 pass
1981 else:
1982 # WTF?
1983 raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
1984 del text[0]
1985 return patches
1872 1986
1873 1987
1874 1988 class patch_obj:
1875 """Class representing one patch operation.
1876 """
1989 """Class representing one patch operation."""
1877 1990
1878 def __init__(self):
1879 """Initializes with an empty list of diffs.
1880 """
1881 self.diffs = []
1882 self.start1 = None
1883 self.start2 = None
1884 self.length1 = 0
1885 self.length2 = 0
1991 def __init__(self):
1992 """Initializes with an empty list of diffs."""
1993 self.diffs = []
1994 self.start1 = None
1995 self.start2 = None
1996 self.length1 = 0
1997 self.length2 = 0
1886 1998
1887 def __str__(self):
1888 """Emmulate GNU diff's format.
1889 Header: @@ -382,8 +481,9 @@
1890 Indicies are printed as 1-based, not 0-based.
1999 def __str__(self):
2000 """Emmulate GNU diff's format.
2001 Header: @@ -382,8 +481,9 @@
2002 Indicies are printed as 1-based, not 0-based.
1891 2003
1892 Returns:
1893 The GNU diff string.
1894 """
1895 if self.length1 == 0:
1896 coords1 = str(self.start1) + ",0"
1897 elif self.length1 == 1:
1898 coords1 = str(self.start1 + 1)
1899 else:
1900 coords1 = str(self.start1 + 1) + "," + str(self.length1)
1901 if self.length2 == 0:
1902 coords2 = str(self.start2) + ",0"
1903 elif self.length2 == 1:
1904 coords2 = str(self.start2 + 1)
1905 else:
1906 coords2 = str(self.start2 + 1) + "," + str(self.length2)
1907 text = ["@@ -", coords1, " +", coords2, " @@\n"]
1908 # Escape the body of the patch with %xx notation.
1909 for (op, data) in self.diffs:
1910 if op == diff_match_patch.DIFF_INSERT:
1911 text.append("+")
1912 elif op == diff_match_patch.DIFF_DELETE:
1913 text.append("-")
1914 elif op == diff_match_patch.DIFF_EQUAL:
1915 text.append(" ")
1916 # High ascii will raise UnicodeDecodeError. Use Unicode instead.
1917 data = data.encode("utf-8")
1918 text.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
1919 return "".join(text) No newline at end of file
2004 Returns:
2005 The GNU diff string.
2006 """
2007 if self.length1 == 0:
2008 coords1 = str(self.start1) + ",0"
2009 elif self.length1 == 1:
2010 coords1 = str(self.start1 + 1)
2011 else:
2012 coords1 = str(self.start1 + 1) + "," + str(self.length1)
2013 if self.length2 == 0:
2014 coords2 = str(self.start2) + ",0"
2015 elif self.length2 == 1:
2016 coords2 = str(self.start2 + 1)
2017 else:
2018 coords2 = str(self.start2 + 1) + "," + str(self.length2)
2019 text = ["@@ -", coords1, " +", coords2, " @@\n"]
2020 # Escape the body of the patch with %xx notation.
2021 for op, data in self.diffs:
2022 if op == diff_match_patch.DIFF_INSERT:
2023 text.append("+")
2024 elif op == diff_match_patch.DIFF_DELETE:
2025 text.append("-")
2026 elif op == diff_match_patch.DIFF_EQUAL:
2027 text.append(" ")
2028 # High ascii will raise UnicodeDecodeError. Use Unicode instead.
2029 data = data.encode("utf-8")
2030 text.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
2031 return "".join(text)
General Comments 0
You need to be logged in to leave comments. Login now