##// END OF EJS Templates
diff-match: python3/code fixes for backport code
super-admin -
r4954:f53817a4 default
parent child Browse files
Show More
@@ -1,1919 +1,2031 b''
1
2
3
4 1 """Diff Match and Patch
5 2
6 3 Copyright 2006 Google Inc.
7 4 http://code.google.com/p/google-diff-match-patch/
8 5
9 6 Licensed under the Apache License, Version 2.0 (the "License");
10 7 you may not use this file except in compliance with the License.
11 8 You may obtain a copy of the License at
12 9
13 10 http://www.apache.org/licenses/LICENSE-2.0
14 11
15 12 Unless required by applicable law or agreed to in writing, software
16 13 distributed under the License is distributed on an "AS IS" BASIS,
17 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 15 See the License for the specific language governing permissions and
19 16 limitations under the License.
20 17 """
21 18
22 19 """Functions for diff, match and patch.
23 20
24 21 Computes the difference between two texts to create a patch.
25 22 Applies the patch onto another text, allowing for errors.
26 23 """
27 24
28 __author__ = 'fraser@google.com (Neil Fraser)'
25 __author__ = "fraser@google.com (Neil Fraser)"
29 26
30 27 import math
31 28 import re
32 29 import sys
33 30 import time
34 31 import urllib.request, urllib.parse, urllib.error
35 32
36 33
37 34 class diff_match_patch:
38 35 """Class containing the diff, match and patch methods.
39 36
40 37 Also contains the behaviour settings.
41 38 """
42 39
43 40 def __init__(self):
44 41 """Inits a diff_match_patch object with default settings.
45 42 Redefine these in your program to override the defaults.
46 43 """
47 44
48 45 # Number of seconds to map a diff before giving up (0 for infinity).
49 46 self.Diff_Timeout = 1.0
50 47 # Cost of an empty edit operation in terms of edit characters.
51 48 self.Diff_EditCost = 4
52 49 # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
53 50 self.Match_Threshold = 0.5
54 51 # How far to search for a match (0 = exact location, 1000+ = broad match).
55 52 # A match this many characters away from the expected location will add
56 53 # 1.0 to the score (0.0 is a perfect match).
57 54 self.Match_Distance = 1000
58 55 # When deleting a large block of text (over ~64 characters), how close do
59 56 # the contents have to be to match the expected contents. (0.0 = perfection,
60 57 # 1.0 = very loose). Note that Match_Threshold controls how closely the
61 58 # end points of a delete need to match.
62 59 self.Patch_DeleteThreshold = 0.5
63 60 # Chunk size for context length.
64 61 self.Patch_Margin = 4
65 62
66 63 # The number of bits in an int.
67 64 # Python has no maximum, thus to disable patch splitting set to 0.
68 65 # However to avoid long patches in certain pathological cases, use 32.
69 66 # Multiple short patches (using native ints) are much faster than long ones.
70 67 self.Match_MaxBits = 32
71 68
72 69 # DIFF FUNCTIONS
73 70
74 71 # The data structure representing a diff is an array of tuples:
75 72 # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
76 73 # which means: delete "Hello", add "Goodbye" and keep " world."
77 74 DIFF_DELETE = -1
78 75 DIFF_INSERT = 1
79 76 DIFF_EQUAL = 0
80 77
81 78 def diff_main(self, text1, text2, checklines=True, deadline=None):
82 79 """Find the differences between two texts. Simplifies the problem by
83 80 stripping any common prefix or suffix off the texts before diffing.
84 81
85 82 Args:
86 83 text1: Old string to be diffed.
87 84 text2: New string to be diffed.
88 85 checklines: Optional speedup flag. If present and false, then don't run
89 86 a line-level diff first to identify the changed areas.
90 87 Defaults to true, which does a faster, slightly less optimal diff.
91 88 deadline: Optional time when the diff should be complete by. Used
92 89 internally for recursive calls. Users should set DiffTimeout instead.
93 90
94 91 Returns:
95 92 Array of changes.
96 93 """
97 94 # Set a deadline by which time the diff must be complete.
98 95 if deadline is None:
99 96 # Unlike in most languages, Python counts time in seconds.
100 97 if self.Diff_Timeout <= 0:
101 deadline = sys.maxint
98 deadline = sys.maxsize
102 99 else:
103 100 deadline = time.time() + self.Diff_Timeout
104 101
105 102 # Check for null inputs.
106 103 if text1 is None or text2 is None:
107 104 raise ValueError("Null inputs. (diff_main)")
108 105
109 106 # Check for equality (speedup).
110 107 if text1 == text2:
111 108 if text1:
112 109 return [(self.DIFF_EQUAL, text1)]
113 110 return []
114 111
115 112 # Trim off common prefix (speedup).
116 113 commonlength = self.diff_commonPrefix(text1, text2)
117 114 commonprefix = text1[:commonlength]
118 115 text1 = text1[commonlength:]
119 116 text2 = text2[commonlength:]
120 117
121 118 # Trim off common suffix (speedup).
122 119 commonlength = self.diff_commonSuffix(text1, text2)
123 120 if commonlength == 0:
124 commonsuffix = ''
121 commonsuffix = ""
125 122 else:
126 123 commonsuffix = text1[-commonlength:]
127 124 text1 = text1[:-commonlength]
128 125 text2 = text2[:-commonlength]
129 126
130 127 # Compute the diff on the middle block.
131 128 diffs = self.diff_compute(text1, text2, checklines, deadline)
132 129
133 130 # Restore the prefix and suffix.
134 131 if commonprefix:
135 132 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
136 133 if commonsuffix:
137 134 diffs.append((self.DIFF_EQUAL, commonsuffix))
138 135 self.diff_cleanupMerge(diffs)
139 136 return diffs
140 137
141 138 def diff_compute(self, text1, text2, checklines, deadline):
142 139 """Find the differences between two texts. Assumes that the texts do not
143 140 have any common prefix or suffix.
144 141
145 142 Args:
146 143 text1: Old string to be diffed.
147 144 text2: New string to be diffed.
148 145 checklines: Speedup flag. If false, then don't run a line-level diff
149 146 first to identify the changed areas.
150 147 If true, then run a faster, slightly less optimal diff.
151 148 deadline: Time when the diff should be complete by.
152 149
153 150 Returns:
154 151 Array of changes.
155 152 """
156 153 if not text1:
157 154 # Just add some text (speedup).
158 155 return [(self.DIFF_INSERT, text2)]
159 156
160 157 if not text2:
161 158 # Just delete some text (speedup).
162 159 return [(self.DIFF_DELETE, text1)]
163 160
164 161 if len(text1) > len(text2):
165 162 (longtext, shorttext) = (text1, text2)
166 163 else:
167 164 (shorttext, longtext) = (text1, text2)
168 165 i = longtext.find(shorttext)
169 166 if i != -1:
170 167 # Shorter text is inside the longer text (speedup).
171 diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
172 (self.DIFF_INSERT, longtext[i + len(shorttext):])]
168 diffs = [
169 (self.DIFF_INSERT, longtext[:i]),
170 (self.DIFF_EQUAL, shorttext),
171 (self.DIFF_INSERT, longtext[i + len(shorttext) :]),
172 ]
173 173 # Swap insertions for deletions if diff is reversed.
174 174 if len(text1) > len(text2):
175 175 diffs[0] = (self.DIFF_DELETE, diffs[0][1])
176 176 diffs[2] = (self.DIFF_DELETE, diffs[2][1])
177 177 return diffs
178 178
179 179 if len(shorttext) == 1:
180 180 # Single character string.
181 181 # After the previous speedup, the character can't be an equality.
182 182 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
183 183
184 184 # Check to see if the problem can be split in two.
185 185 hm = self.diff_halfMatch(text1, text2)
186 186 if hm:
187 187 # A half-match was found, sort out the return data.
188 188 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
189 189 # Send both pairs off for separate processing.
190 190 diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)
191 191 diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)
192 192 # Merge the results.
193 193 return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
194 194
195 195 if checklines and len(text1) > 100 and len(text2) > 100:
196 196 return self.diff_lineMode(text1, text2, deadline)
197 197
198 198 return self.diff_bisect(text1, text2, deadline)
199 199
200 200 def diff_lineMode(self, text1, text2, deadline):
201 201 """Do a quick line-level diff on both strings, then rediff the parts for
202 202 greater accuracy.
203 203 This speedup can produce non-minimal diffs.
204 204
205 205 Args:
206 206 text1: Old string to be diffed.
207 207 text2: New string to be diffed.
208 208 deadline: Time when the diff should be complete by.
209 209
210 210 Returns:
211 211 Array of changes.
212 212 """
213 213
214 214 # Scan the text on a line-by-line basis first.
215 215 (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
216 216
217 217 diffs = self.diff_main(text1, text2, False, deadline)
218 218
219 219 # Convert the diff back to original text.
220 220 self.diff_charsToLines(diffs, linearray)
221 221 # Eliminate freak matches (e.g. blank lines)
222 222 self.diff_cleanupSemantic(diffs)
223 223
224 224 # Rediff any replacement blocks, this time character-by-character.
225 225 # Add a dummy entry at the end.
226 diffs.append((self.DIFF_EQUAL, ''))
226 diffs.append((self.DIFF_EQUAL, ""))
227 227 pointer = 0
228 228 count_delete = 0
229 229 count_insert = 0
230 text_delete = ''
231 text_insert = ''
230 text_delete = ""
231 text_insert = ""
232 232 while pointer < len(diffs):
233 233 if diffs[pointer][0] == self.DIFF_INSERT:
234 234 count_insert += 1
235 235 text_insert += diffs[pointer][1]
236 236 elif diffs[pointer][0] == self.DIFF_DELETE:
237 237 count_delete += 1
238 238 text_delete += diffs[pointer][1]
239 239 elif diffs[pointer][0] == self.DIFF_EQUAL:
240 240 # Upon reaching an equality, check for prior redundancies.
241 241 if count_delete >= 1 and count_insert >= 1:
242 242 # Delete the offending records and add the merged ones.
243 243 a = self.diff_main(text_delete, text_insert, False, deadline)
244 244 diffs[pointer - count_delete - count_insert : pointer] = a
245 245 pointer = pointer - count_delete - count_insert + len(a)
246 246 count_insert = 0
247 247 count_delete = 0
248 text_delete = ''
249 text_insert = ''
248 text_delete = ""
249 text_insert = ""
250 250
251 251 pointer += 1
252 252
253 253 diffs.pop() # Remove the dummy entry at the end.
254 254
255 255 return diffs
256 256
257 257 def diff_bisect(self, text1, text2, deadline):
258 258 """Find the 'middle snake' of a diff, split the problem in two
259 259 and return the recursively constructed diff.
260 260 See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
261 261
262 262 Args:
263 263 text1: Old string to be diffed.
264 264 text2: New string to be diffed.
265 265 deadline: Time at which to bail if not yet complete.
266 266
267 267 Returns:
268 268 Array of diff tuples.
269 269 """
270 270
271 271 # Cache the text lengths to prevent multiple calls.
272 272 text1_length = len(text1)
273 273 text2_length = len(text2)
274 274 max_d = (text1_length + text2_length + 1) // 2
275 275 v_offset = max_d
276 276 v_length = 2 * max_d
277 277 v1 = [-1] * v_length
278 278 v1[v_offset + 1] = 0
279 279 v2 = v1[:]
280 280 delta = text1_length - text2_length
281 281 # If the total number of characters is odd, then the front path will
282 282 # collide with the reverse path.
283 front = (delta % 2 != 0)
283 front = delta % 2 != 0
284 284 # Offsets for start and end of k loop.
285 285 # Prevents mapping of space beyond the grid.
286 286 k1start = 0
287 287 k1end = 0
288 288 k2start = 0
289 289 k2end = 0
290 290 for d in range(max_d):
291 291 # Bail out if deadline is reached.
292 292 if time.time() > deadline:
293 293 break
294 294
295 295 # Walk the front path one step.
296 296 for k1 in range(-d + k1start, d + 1 - k1end, 2):
297 297 k1_offset = v_offset + k1
298 if k1 == -d or (k1 != d and
299 v1[k1_offset - 1] < v1[k1_offset + 1]):
298 if k1 == -d or (k1 != d and v1[k1_offset - 1] < v1[k1_offset + 1]):
300 299 x1 = v1[k1_offset + 1]
301 300 else:
302 301 x1 = v1[k1_offset - 1] + 1
303 302 y1 = x1 - k1
304 while (x1 < text1_length and y1 < text2_length and
305 text1[x1] == text2[y1]):
303 while (
304 x1 < text1_length and y1 < text2_length and text1[x1] == text2[y1]
305 ):
306 306 x1 += 1
307 307 y1 += 1
308 308 v1[k1_offset] = x1
309 309 if x1 > text1_length:
310 310 # Ran off the right of the graph.
311 311 k1end += 2
312 312 elif y1 > text2_length:
313 313 # Ran off the bottom of the graph.
314 314 k1start += 2
315 315 elif front:
316 316 k2_offset = v_offset + delta - k1
317 317 if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:
318 318 # Mirror x2 onto top-left coordinate system.
319 319 x2 = text1_length - v2[k2_offset]
320 320 if x1 >= x2:
321 321 # Overlap detected.
322 322 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
323 323
324 324 # Walk the reverse path one step.
325 325 for k2 in range(-d + k2start, d + 1 - k2end, 2):
326 326 k2_offset = v_offset + k2
327 if k2 == -d or (k2 != d and
328 v2[k2_offset - 1] < v2[k2_offset + 1]):
327 if k2 == -d or (k2 != d and v2[k2_offset - 1] < v2[k2_offset + 1]):
329 328 x2 = v2[k2_offset + 1]
330 329 else:
331 330 x2 = v2[k2_offset - 1] + 1
332 331 y2 = x2 - k2
333 while (x2 < text1_length and y2 < text2_length and
334 text1[-x2 - 1] == text2[-y2 - 1]):
332 while (
333 x2 < text1_length
334 and y2 < text2_length
335 and text1[-x2 - 1] == text2[-y2 - 1]
336 ):
335 337 x2 += 1
336 338 y2 += 1
337 339 v2[k2_offset] = x2
338 340 if x2 > text1_length:
339 341 # Ran off the left of the graph.
340 342 k2end += 2
341 343 elif y2 > text2_length:
342 344 # Ran off the top of the graph.
343 345 k2start += 2
344 346 elif not front:
345 347 k1_offset = v_offset + delta - k2
346 348 if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:
347 349 x1 = v1[k1_offset]
348 350 y1 = v_offset + x1 - k1_offset
349 351 # Mirror x2 onto top-left coordinate system.
350 352 x2 = text1_length - x2
351 353 if x1 >= x2:
352 354 # Overlap detected.
353 355 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
354 356
355 357 # Diff took too long and hit the deadline or
356 358 # number of diffs equals number of characters, no commonality at all.
357 359 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
358 360
359 361 def diff_bisectSplit(self, text1, text2, x, y, deadline):
360 362 """Given the location of the 'middle snake', split the diff in two parts
361 363 and recurse.
362 364
363 365 Args:
364 366 text1: Old string to be diffed.
365 367 text2: New string to be diffed.
366 368 x: Index of split point in text1.
367 369 y: Index of split point in text2.
368 370 deadline: Time at which to bail if not yet complete.
369 371
370 372 Returns:
371 373 Array of diff tuples.
372 374 """
373 375 text1a = text1[:x]
374 376 text2a = text2[:y]
375 377 text1b = text1[x:]
376 378 text2b = text2[y:]
377 379
378 380 # Compute both diffs serially.
379 381 diffs = self.diff_main(text1a, text2a, False, deadline)
380 382 diffsb = self.diff_main(text1b, text2b, False, deadline)
381 383
382 384 return diffs + diffsb
383 385
384 386 def diff_linesToChars(self, text1, text2):
385 387 """Split two texts into an array of strings. Reduce the texts to a string
386 388 of hashes where each Unicode character represents one line.
387 389
388 390 Args:
389 391 text1: First string.
390 392 text2: Second string.
391 393
392 394 Returns:
393 395 Three element tuple, containing the encoded text1, the encoded text2 and
394 396 the array of unique strings. The zeroth element of the array of unique
395 397 strings is intentionally blank.
396 398 """
397 399 lineArray = [] # e.g. lineArray[4] == "Hello\n"
398 400 lineHash = {} # e.g. lineHash["Hello\n"] == 4
399 401
400 402 # "\x00" is a valid character, but various debuggers don't like it.
401 403 # So we'll insert a junk entry to avoid generating a null character.
402 lineArray.append('')
404 lineArray.append("")
403 405
404 406 def diff_linesToCharsMunge(text):
405 407 """Split a text into an array of strings. Reduce the texts to a string
406 408 of hashes where each Unicode character represents one line.
407 409 Modifies linearray and linehash through being a closure.
408 410
409 411 Args:
410 412 text: String to encode.
411 413
412 414 Returns:
413 415 Encoded string.
414 416 """
415 417 chars = []
416 418 # Walk the text, pulling out a substring for each line.
417 419 # text.split('\n') would would temporarily double our memory footprint.
418 420 # Modifying text would create many large strings to garbage collect.
419 421 lineStart = 0
420 422 lineEnd = -1
421 423 while lineEnd < len(text) - 1:
422 lineEnd = text.find('\n', lineStart)
424 lineEnd = text.find("\n", lineStart)
423 425 if lineEnd == -1:
424 426 lineEnd = len(text) - 1
425 427 line = text[lineStart:lineEnd + 1]
426 428 lineStart = lineEnd + 1
427 429
428 430 if line in lineHash:
429 chars.append(unichr(lineHash[line]))
431 chars.append(chr(lineHash[line]))
430 432 else:
431 433 lineArray.append(line)
432 434 lineHash[line] = len(lineArray) - 1
433 chars.append(unichr(len(lineArray) - 1))
435 chars.append(chr(len(lineArray) - 1))
434 436 return "".join(chars)
435 437
436 438 chars1 = diff_linesToCharsMunge(text1)
437 439 chars2 = diff_linesToCharsMunge(text2)
438 440 return (chars1, chars2, lineArray)
439 441
440 442 def diff_charsToLines(self, diffs, lineArray):
441 443 """Rehydrate the text in a diff from a string of line hashes to real lines
442 444 of text.
443 445
444 446 Args:
445 447 diffs: Array of diff tuples.
446 448 lineArray: Array of unique strings.
447 449 """
448 450 for x in range(len(diffs)):
449 451 text = []
450 452 for char in diffs[x][1]:
451 453 text.append(lineArray[ord(char)])
452 454 diffs[x] = (diffs[x][0], "".join(text))
453 455
454 456 def diff_commonPrefix(self, text1, text2):
455 457 """Determine the common prefix of two strings.
456 458
457 459 Args:
458 460 text1: First string.
459 461 text2: Second string.
460 462
461 463 Returns:
462 464 The number of characters common to the start of each string.
463 465 """
464 466 # Quick check for common null cases.
465 467 if not text1 or not text2 or text1[0] != text2[0]:
466 468 return 0
467 469 # Binary search.
468 470 # Performance analysis: http://neil.fraser.name/news/2007/10/09/
469 471 pointermin = 0
470 472 pointermax = min(len(text1), len(text2))
471 473 pointermid = pointermax
472 474 pointerstart = 0
473 475 while pointermin < pointermid:
474 476 if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
475 477 pointermin = pointermid
476 478 pointerstart = pointermin
477 479 else:
478 480 pointermax = pointermid
479 481 pointermid = (pointermax - pointermin) // 2 + pointermin
480 482 return pointermid
481 483
482 484 def diff_commonSuffix(self, text1, text2):
483 485 """Determine the common suffix of two strings.
484 486
485 487 Args:
486 488 text1: First string.
487 489 text2: Second string.
488 490
489 491 Returns:
490 492 The number of characters common to the end of each string.
491 493 """
492 494 # Quick check for common null cases.
493 495 if not text1 or not text2 or text1[-1] != text2[-1]:
494 496 return 0
495 497 # Binary search.
496 498 # Performance analysis: http://neil.fraser.name/news/2007/10/09/
497 499 pointermin = 0
498 500 pointermax = min(len(text1), len(text2))
499 501 pointermid = pointermax
500 502 pointerend = 0
501 503 while pointermin < pointermid:
502 if (text1[-pointermid:len(text1) - pointerend] ==
503 text2[-pointermid:len(text2) - pointerend]):
504 if (
505 text1[-pointermid : len(text1) - pointerend]
506 == text2[-pointermid : len(text2) - pointerend]
507 ):
504 508 pointermin = pointermid
505 509 pointerend = pointermin
506 510 else:
507 511 pointermax = pointermid
508 512 pointermid = (pointermax - pointermin) // 2 + pointermin
509 513 return pointermid
510 514
511 515 def diff_commonOverlap(self, text1, text2):
512 516 """Determine if the suffix of one string is the prefix of another.
513 517
514 518 Args:
515 519 text1 First string.
516 520 text2 Second string.
517 521
518 522 Returns:
519 523 The number of characters common to the end of the first
520 524 string and the start of the second string.
521 525 """
522 526 # Cache the text lengths to prevent multiple calls.
523 527 text1_length = len(text1)
524 528 text2_length = len(text2)
525 529 # Eliminate the null case.
526 530 if text1_length == 0 or text2_length == 0:
527 531 return 0
528 532 # Truncate the longer string.
529 533 if text1_length > text2_length:
530 534 text1 = text1[-text2_length:]
531 535 elif text1_length < text2_length:
532 536 text2 = text2[:text1_length]
533 537 text_length = min(text1_length, text2_length)
534 538 # Quick check for the worst case.
535 539 if text1 == text2:
536 540 return text_length
537 541
538 542 # Start by looking for a single character match
539 543 # and increase length until no match is found.
540 544 # Performance analysis: http://neil.fraser.name/news/2010/11/04/
541 545 best = 0
542 546 length = 1
543 547 while True:
544 548 pattern = text1[-length:]
545 549 found = text2.find(pattern)
546 550 if found == -1:
547 551 return best
548 552 length += found
549 553 if found == 0 or text1[-length:] == text2[:length]:
550 554 best = length
551 555 length += 1
552 556
553 557 def diff_halfMatch(self, text1, text2):
554 558 """Do the two texts share a substring which is at least half the length of
555 559 the longer text?
556 560 This speedup can produce non-minimal diffs.
557 561
558 562 Args:
559 563 text1: First string.
560 564 text2: Second string.
561 565
562 566 Returns:
563 567 Five element Array, containing the prefix of text1, the suffix of text1,
564 568 the prefix of text2, the suffix of text2 and the common middle. Or None
565 569 if there was no match.
566 570 """
567 571 if self.Diff_Timeout <= 0:
568 572 # Don't risk returning a non-optimal diff if we have unlimited time.
569 573 return None
570 574 if len(text1) > len(text2):
571 575 (longtext, shorttext) = (text1, text2)
572 576 else:
573 577 (shorttext, longtext) = (text1, text2)
574 578 if len(longtext) < 4 or len(shorttext) * 2 < len(longtext):
575 579 return None # Pointless.
576 580
577 581 def diff_halfMatchI(longtext, shorttext, i):
578 582 """Does a substring of shorttext exist within longtext such that the
579 583 substring is at least half the length of longtext?
580 584 Closure, but does not reference any external variables.
581 585
582 586 Args:
583 587 longtext: Longer string.
584 588 shorttext: Shorter string.
585 589 i: Start index of quarter length substring within longtext.
586 590
587 591 Returns:
588 592 Five element Array, containing the prefix of longtext, the suffix of
589 593 longtext, the prefix of shorttext, the suffix of shorttext and the
590 594 common middle. Or None if there was no match.
591 595 """
592 596 seed = longtext[i:i + len(longtext) // 4]
593 best_common = ''
597 best_common = ""
594 598 j = shorttext.find(seed)
595 599 while j != -1:
596 600 prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
597 601 suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
598 602 if len(best_common) < suffixLength + prefixLength:
599 best_common = (shorttext[j - suffixLength:j] +
600 shorttext[j:j + prefixLength])
603 best_common = (
604 shorttext[j - suffixLength : j]
605 + shorttext[j : j + prefixLength]
606 )
601 607 best_longtext_a = longtext[:i - suffixLength]
602 608 best_longtext_b = longtext[i + prefixLength:]
603 609 best_shorttext_a = shorttext[:j - suffixLength]
604 610 best_shorttext_b = shorttext[j + prefixLength:]
605 611 j = shorttext.find(seed, j + 1)
606 612
607 613 if len(best_common) * 2 >= len(longtext):
608 return (best_longtext_a, best_longtext_b,
609 best_shorttext_a, best_shorttext_b, best_common)
614 return (
615 best_longtext_a,
616 best_longtext_b,
617 best_shorttext_a,
618 best_shorttext_b,
619 best_common,
620 )
610 621 else:
611 622 return None
612 623
613 624 # First check if the second quarter is the seed for a half-match.
614 625 hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4)
615 626 # Check again based on the third quarter.
616 627 hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2)
617 628 if not hm1 and not hm2:
618 629 return None
619 630 elif not hm2:
620 631 hm = hm1
621 632 elif not hm1:
622 633 hm = hm2
623 634 else:
624 635 # Both matched. Select the longest.
625 636 if len(hm1[4]) > len(hm2[4]):
626 637 hm = hm1
627 638 else:
628 639 hm = hm2
629 640
630 641 # A half-match was found, sort out the return data.
631 642 if len(text1) > len(text2):
632 643 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
633 644 else:
634 645 (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
635 646 return (text1_a, text1_b, text2_a, text2_b, mid_common)
636 647
637 648 def diff_cleanupSemantic(self, diffs):
638 649 """Reduce the number of edits by eliminating semantically trivial
639 650 equalities.
640 651
641 652 Args:
642 653 diffs: Array of diff tuples.
643 654 """
644 655 changes = False
645 656 equalities = [] # Stack of indices where equalities are found.
646 657 lastequality = None # Always equal to diffs[equalities[-1]][1]
647 658 pointer = 0 # Index of current position.
648 659 # Number of chars that changed prior to the equality.
649 660 length_insertions1, length_deletions1 = 0, 0
650 661 # Number of chars that changed after the equality.
651 662 length_insertions2, length_deletions2 = 0, 0
652 663 while pointer < len(diffs):
653 664 if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
654 665 equalities.append(pointer)
655 666 length_insertions1, length_insertions2 = length_insertions2, 0
656 667 length_deletions1, length_deletions2 = length_deletions2, 0
657 668 lastequality = diffs[pointer][1]
658 669 else: # An insertion or deletion.
659 670 if diffs[pointer][0] == self.DIFF_INSERT:
660 671 length_insertions2 += len(diffs[pointer][1])
661 672 else:
662 673 length_deletions2 += len(diffs[pointer][1])
663 674 # Eliminate an equality that is smaller or equal to the edits on both
664 675 # sides of it.
665 if (lastequality and (len(lastequality) <=
666 max(length_insertions1, length_deletions1)) and
667 (len(lastequality) <= max(length_insertions2, length_deletions2))):
676 if (
677 lastequality
678 and (
679 len(lastequality) <= max(length_insertions1, length_deletions1)
680 )
681 and (
682 len(lastequality) <= max(length_insertions2, length_deletions2)
683 )
684 ):
668 685 # Duplicate record.
669 686 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
670 687 # Change second copy to insert.
671 diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
672 diffs[equalities[-1] + 1][1])
688 diffs[equalities[-1] + 1] = (
689 self.DIFF_INSERT,
690 diffs[equalities[-1] + 1][1],
691 )
673 692 # Throw away the equality we just deleted.
674 693 equalities.pop()
675 694 # Throw away the previous equality (it needs to be reevaluated).
676 695 if len(equalities):
677 696 equalities.pop()
678 697 if len(equalities):
679 698 pointer = equalities[-1]
680 699 else:
681 700 pointer = -1
682 701 # Reset the counters.
683 702 length_insertions1, length_deletions1 = 0, 0
684 703 length_insertions2, length_deletions2 = 0, 0
685 704 lastequality = None
686 705 changes = True
687 706 pointer += 1
688 707
689 708 # Normalize the diff.
690 709 if changes:
691 710 self.diff_cleanupMerge(diffs)
692 711 self.diff_cleanupSemanticLossless(diffs)
693 712
694 713 # Find any overlaps between deletions and insertions.
695 714 # e.g: <del>abcxxx</del><ins>xxxdef</ins>
696 715 # -> <del>abc</del>xxx<ins>def</ins>
697 716 # e.g: <del>xxxabc</del><ins>defxxx</ins>
698 717 # -> <ins>def</ins>xxx<del>abc</del>
699 718 # Only extract an overlap if it is as big as the edit ahead or behind it.
700 719 pointer = 1
701 720 while pointer < len(diffs):
702 if (diffs[pointer - 1][0] == self.DIFF_DELETE and
703 diffs[pointer][0] == self.DIFF_INSERT):
721 if (
722 diffs[pointer - 1][0] == self.DIFF_DELETE
723 and diffs[pointer][0] == self.DIFF_INSERT
724 ):
704 725 deletion = diffs[pointer - 1][1]
705 726 insertion = diffs[pointer][1]
706 727 overlap_length1 = self.diff_commonOverlap(deletion, insertion)
707 728 overlap_length2 = self.diff_commonOverlap(insertion, deletion)
708 729 if overlap_length1 >= overlap_length2:
709 if (overlap_length1 >= len(deletion) / 2.0 or
710 overlap_length1 >= len(insertion) / 2.0):
730 if (
731 overlap_length1 >= len(deletion) / 2.0
732 or overlap_length1 >= len(insertion) / 2.0
733 ):
711 734 # Overlap found. Insert an equality and trim the surrounding edits.
712 diffs.insert(pointer, (self.DIFF_EQUAL,
713 insertion[:overlap_length1]))
714 diffs[pointer - 1] = (self.DIFF_DELETE,
715 deletion[:len(deletion) - overlap_length1])
716 diffs[pointer + 1] = (self.DIFF_INSERT,
717 insertion[overlap_length1:])
735 diffs.insert(
736 pointer, (self.DIFF_EQUAL, insertion[:overlap_length1])
737 )
738 diffs[pointer - 1] = (
739 self.DIFF_DELETE,
740 deletion[: len(deletion) - overlap_length1],
741 )
742 diffs[pointer + 1] = (
743 self.DIFF_INSERT,
744 insertion[overlap_length1:],
745 )
718 746 pointer += 1
719 747 else:
720 if (overlap_length2 >= len(deletion) / 2.0 or
721 overlap_length2 >= len(insertion) / 2.0):
748 if (
749 overlap_length2 >= len(deletion) / 2.0
750 or overlap_length2 >= len(insertion) / 2.0
751 ):
722 752 # Reverse overlap found.
723 753 # Insert an equality and swap and trim the surrounding edits.
724 diffs.insert(pointer, (self.DIFF_EQUAL, deletion[:overlap_length2]))
725 diffs[pointer - 1] = (self.DIFF_INSERT,
726 insertion[:len(insertion) - overlap_length2])
727 diffs[pointer + 1] = (self.DIFF_DELETE, deletion[overlap_length2:])
754 diffs.insert(
755 pointer, (self.DIFF_EQUAL, deletion[:overlap_length2])
756 )
757 diffs[pointer - 1] = (
758 self.DIFF_INSERT,
759 insertion[: len(insertion) - overlap_length2],
760 )
761 diffs[pointer + 1] = (
762 self.DIFF_DELETE,
763 deletion[overlap_length2:],
764 )
728 765 pointer += 1
729 766 pointer += 1
730 767 pointer += 1
731 768
def diff_cleanupSemanticLossless(self, diffs):
    """Look for single edits surrounded on both sides by equalities
    which can be shifted sideways to align the edit to a word boundary.
    e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.

    Args:
      diffs: Array of diff tuples.  Modified in place.
    """

    def semantic_score(one, two):
        """Score how well the boundary between 'one' and 'two' falls on a
        logical boundary.  Range: 6 (best) to 0 (worst).
        Closure, but does not reference any external variables.

        Args:
          one: First string.
          two: Second string.

        Returns:
          The score.
        """
        if not one or not two:
            # Edges are the best.
            return 6

        # Each port of this function behaves slightly differently due to
        # subtle differences in each language's definition of things like
        # 'whitespace'.  Since this function's purpose is largely cosmetic,
        # the choice has been made to use each language's native features
        # rather than force total conformity.
        char1, char2 = one[-1], two[0]
        non_alnum1 = not char1.isalnum()
        non_alnum2 = not char2.isalnum()
        space1 = non_alnum1 and char1.isspace()
        space2 = non_alnum2 and char2.isspace()
        newline1 = space1 and (char1 == "\r" or char1 == "\n")
        newline2 = space2 and (char2 == "\r" or char2 == "\n")
        blank1 = newline1 and self.BLANKLINEEND.search(one)
        blank2 = newline2 and self.BLANKLINESTART.match(two)

        if blank1 or blank2:
            # Five points for blank lines.
            return 5
        elif newline1 or newline2:
            # Four points for line breaks.
            return 4
        elif non_alnum1 and not space1 and space2:
            # Three points for end of sentences.
            return 3
        elif space1 or space2:
            # Two points for whitespace.
            return 2
        elif non_alnum1 or non_alnum2:
            # One point for non-alphanumeric.
            return 1
        return 0

    idx = 1
    # Intentionally ignore the first and last element (don't need checking).
    while idx < len(diffs) - 1:
        if (diffs[idx - 1][0] == self.DIFF_EQUAL
                and diffs[idx + 1][0] == self.DIFF_EQUAL):
            # This is a single edit surrounded by equalities.
            equality1 = diffs[idx - 1][1]
            edit = diffs[idx][1]
            equality2 = diffs[idx + 1][1]

            # First, shift the edit as far left as possible.
            overlap = self.diff_commonSuffix(equality1, edit)
            if overlap:
                tail = edit[-overlap:]
                equality1 = equality1[:-overlap]
                edit = tail + edit[:-overlap]
                equality2 = tail + equality2

            # Second, step character by character right, looking for the
            # best-scoring split point.
            best_eq1, best_edit, best_eq2 = equality1, edit, equality2
            best_score = (semantic_score(equality1, edit)
                          + semantic_score(edit, equality2))
            while edit and equality2 and edit[0] == equality2[0]:
                equality1 += edit[0]
                edit = edit[1:] + equality2[0]
                equality2 = equality2[1:]
                score = (semantic_score(equality1, edit)
                         + semantic_score(edit, equality2))
                # The >= encourages trailing rather than leading whitespace
                # on edits.
                if score >= best_score:
                    best_score = score
                    best_eq1, best_edit, best_eq2 = equality1, edit, equality2

            if diffs[idx - 1][1] != best_eq1:
                # We have an improvement, save it back to the diff.
                if best_eq1:
                    diffs[idx - 1] = (diffs[idx - 1][0], best_eq1)
                else:
                    del diffs[idx - 1]
                    idx -= 1
                diffs[idx] = (diffs[idx][0], best_edit)
                if best_eq2:
                    diffs[idx + 1] = (diffs[idx + 1][0], best_eq2)
                else:
                    del diffs[idx + 1]
                    idx -= 1
        idx += 1
842 883
# Regex patterns used by diff_cleanupSemanticLossless to detect blank-line
# boundaries (a line break followed by an empty line, CRLF tolerated).
BLANKLINEEND = re.compile(r"\n\r?\n$")
BLANKLINESTART = re.compile(r"^\r?\n\r?\n")
846 887
def diff_cleanupEfficiency(self, diffs):
    """Reduce the number of edits by eliminating operationally trivial
    equalities.

    Args:
      diffs: Array of diff tuples.  Modified in place.
    """
    changes = False
    equalities = []  # Stack of indices where equalities are found.
    lastequality = None  # Always equal to diffs[equalities[-1]][1]
    pos = 0  # Index of current position.
    pre_ins = False  # Is there an insertion op before the last equality.
    pre_del = False  # Is there a deletion op before the last equality.
    post_ins = False  # Is there an insertion op after the last equality.
    post_del = False  # Is there a deletion op after the last equality.
    while pos < len(diffs):
        if diffs[pos][0] == self.DIFF_EQUAL:  # Equality found.
            if len(diffs[pos][1]) < self.Diff_EditCost and (post_ins or post_del):
                # Candidate found.
                equalities.append(pos)
                pre_ins, pre_del = post_ins, post_del
                lastequality = diffs[pos][1]
            else:
                # Not a candidate, and can never become one.
                equalities = []
                lastequality = None
            post_ins = post_del = False
        else:  # An insertion or deletion.
            if diffs[pos][0] == self.DIFF_DELETE:
                post_del = True
            else:
                post_ins = True

            # Five types to be split:
            # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
            # <ins>A</ins>X<ins>C</ins><del>D</del>
            # <ins>A</ins><del>B</del>X<ins>C</ins>
            # <ins>A</del>X<ins>C</ins><del>D</del>
            # <ins>A</ins><del>B</del>X<del>C</del>
            if lastequality and (
                (pre_ins and pre_del and post_ins and post_del)
                or (
                    (len(lastequality) < self.Diff_EditCost / 2)
                    and (pre_ins + pre_del + post_ins + post_del) == 3
                )
            ):
                # Duplicate record.
                diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
                # Change second copy to insert.
                diffs[equalities[-1] + 1] = (
                    self.DIFF_INSERT,
                    diffs[equalities[-1] + 1][1],
                )
                equalities.pop()  # Throw away the equality we just deleted.
                lastequality = None
                if pre_ins and pre_del:
                    # No changes made which could affect previous entry,
                    # keep going.
                    post_ins = post_del = True
                    equalities = []
                else:
                    if equalities:
                        equalities.pop()  # Throw away the previous equality.
                    pos = equalities[-1] if equalities else -1
                    post_ins = post_del = False
                changes = True
        pos += 1

    if changes:
        self.diff_cleanupMerge(diffs)
917 965
def diff_cleanupMerge(self, diffs):
    """Reorder and merge like edit sections.  Merge equalities.
    Any edit section can move as long as it doesn't cross an equality.

    Args:
      diffs: Array of diff tuples.  Modified in place.
    """
    diffs.append((self.DIFF_EQUAL, ""))  # Add a dummy entry at the end.
    i = 0
    count_delete = 0
    count_insert = 0
    text_delete = ""
    text_insert = ""
    while i < len(diffs):
        op = diffs[i][0]
        if op == self.DIFF_INSERT:
            count_insert += 1
            text_insert += diffs[i][1]
            i += 1
        elif op == self.DIFF_DELETE:
            count_delete += 1
            text_delete += diffs[i][1]
            i += 1
        elif op == self.DIFF_EQUAL:
            # Upon reaching an equality, check for prior redundancies.
            if count_delete + count_insert > 1:
                if count_delete != 0 and count_insert != 0:
                    # Factor out any common prefixes.
                    k = self.diff_commonPrefix(text_insert, text_delete)
                    if k != 0:
                        x = i - count_delete - count_insert - 1
                        if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
                            diffs[x] = (
                                diffs[x][0],
                                diffs[x][1] + text_insert[:k],
                            )
                        else:
                            diffs.insert(0, (self.DIFF_EQUAL, text_insert[:k]))
                            i += 1
                        text_insert = text_insert[k:]
                        text_delete = text_delete[k:]
                    # Factor out any common suffixes.
                    k = self.diff_commonSuffix(text_insert, text_delete)
                    if k != 0:
                        diffs[i] = (
                            diffs[i][0],
                            text_insert[-k:] + diffs[i][1],
                        )
                        text_insert = text_insert[:-k]
                        text_delete = text_delete[:-k]
                # Delete the offending records and add the merged ones.
                if count_delete == 0:
                    diffs[i - count_insert : i] = [
                        (self.DIFF_INSERT, text_insert)
                    ]
                elif count_insert == 0:
                    diffs[i - count_delete : i] = [
                        (self.DIFF_DELETE, text_delete)
                    ]
                else:
                    diffs[i - count_delete - count_insert : i] = [
                        (self.DIFF_DELETE, text_delete),
                        (self.DIFF_INSERT, text_insert),
                    ]
                i = i - count_delete - count_insert + 1
                if count_delete != 0:
                    i += 1
                if count_insert != 0:
                    i += 1
            elif i != 0 and diffs[i - 1][0] == self.DIFF_EQUAL:
                # Merge this equality with the previous one.
                diffs[i - 1] = (
                    diffs[i - 1][0],
                    diffs[i - 1][1] + diffs[i][1],
                )
                del diffs[i]
            else:
                i += 1

            count_insert = 0
            count_delete = 0
            text_delete = ""
            text_insert = ""

    if diffs[-1][1] == "":
        diffs.pop()  # Remove the dummy entry at the end.

    # Second pass: look for single edits surrounded on both sides by
    # equalities which can be shifted sideways to eliminate an equality.
    # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
    changes = False
    i = 1
    # Intentionally ignore the first and last element (don't need checking).
    while i < len(diffs) - 1:
        if (diffs[i - 1][0] == self.DIFF_EQUAL
                and diffs[i + 1][0] == self.DIFF_EQUAL):
            # This is a single edit surrounded by equalities.
            if diffs[i][1].endswith(diffs[i - 1][1]):
                # Shift the edit over the previous equality.
                diffs[i] = (
                    diffs[i][0],
                    diffs[i - 1][1]
                    + diffs[i][1][: -len(diffs[i - 1][1])],
                )
                diffs[i + 1] = (
                    diffs[i + 1][0],
                    diffs[i - 1][1] + diffs[i + 1][1],
                )
                del diffs[i - 1]
                changes = True
            elif diffs[i][1].startswith(diffs[i + 1][1]):
                # Shift the edit over the next equality.
                diffs[i - 1] = (
                    diffs[i - 1][0],
                    diffs[i - 1][1] + diffs[i + 1][1],
                )
                diffs[i] = (
                    diffs[i][0],
                    diffs[i][1][len(diffs[i + 1][1]) :]
                    + diffs[i + 1][1],
                )
                del diffs[i + 1]
                changes = True
        i += 1

    # If shifts were made, the diff needs reordering and another shift sweep.
    if changes:
        self.diff_cleanupMerge(diffs)
1028 1097
def diff_xIndex(self, diffs, loc):
    """loc is a location in text1, compute and return the equivalent location
    in text2.  e.g. "The cat" vs "The big cat", 1->1, 5->8

    Args:
      diffs: Array of diff tuples.
      loc: Location within text1.

    Returns:
      Location within text2.
    """
    chars1 = chars2 = 0
    last_chars1 = last_chars2 = 0
    for x in range(len(diffs)):
        op, text = diffs[x]
        if op != self.DIFF_INSERT:
            # Equality or deletion advances the text1 cursor.
            chars1 += len(text)
        if op != self.DIFF_DELETE:
            # Equality or insertion advances the text2 cursor.
            chars2 += len(text)
        if chars1 > loc:
            # Overshot the location.
            break
        last_chars1, last_chars2 = chars1, chars2

    if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
        # The location was deleted.
        return last_chars2
    # Add the remaining len(character).
    return last_chars2 + (loc - last_chars1)
1060 1129
def diff_prettyHtml(self, diffs):
    """Convert a diff array into a pretty HTML report.

    Args:
      diffs: Array of diff tuples.

    Returns:
      HTML representation.
    """
    chunks = []
    for op, data in diffs:
        # Escape HTML metacharacters and make newlines visible.
        escaped = (
            data.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace("\n", "&para;<br>")
        )
        if op == self.DIFF_INSERT:
            chunks.append(f'<ins style="background:#e6ffe6;">{escaped}</ins>')
        elif op == self.DIFF_DELETE:
            chunks.append(f'<del style="background:#ffe6e6;">{escaped}</del>')
        elif op == self.DIFF_EQUAL:
            chunks.append(f"<span>{escaped}</span>")
    return "".join(chunks)
1081 1154
def diff_text1(self, diffs):
    """Compute and return the source text (all equalities and deletions).

    Args:
      diffs: Array of diff tuples.

    Returns:
      Source text.
    """
    return "".join(data for op, data in diffs if op != self.DIFF_INSERT)
1096 1169
def diff_text2(self, diffs):
    """Compute and return the destination text (all equalities and
    insertions).

    Args:
      diffs: Array of diff tuples.

    Returns:
      Destination text.
    """
    return "".join(data for op, data in diffs if op != self.DIFF_DELETE)
1111 1184
def diff_levenshtein(self, diffs):
    """Compute the Levenshtein distance; the number of inserted, deleted or
    substituted characters.

    Args:
      diffs: Array of diff tuples.

    Returns:
      Number of changes.
    """
    total = 0
    ins_run = del_run = 0
    for op, data in diffs:
        if op == self.DIFF_INSERT:
            ins_run += len(data)
        elif op == self.DIFF_DELETE:
            del_run += len(data)
        elif op == self.DIFF_EQUAL:
            # A deletion and an insertion is one substitution.
            total += max(ins_run, del_run)
            ins_run = del_run = 0
    return total + max(ins_run, del_run)
1137 1210
def diff_toDelta(self, diffs):
    """Crush the diff into an encoded string which describes the operations
    required to transform text1 into text2.
    E.g. =3\t-2\t+ing  -> Keep 3 chars, delete 2 chars, insert 'ing'.
    Operations are tab-separated.  Inserted text is escaped using %xx
    notation.

    Args:
      diffs: Array of diff tuples.

    Returns:
      Delta text.
    """
    tokens = []
    for op, data in diffs:
        if op == self.DIFF_INSERT:
            # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
            encoded = data.encode("utf-8")
            tokens.append("+" + urllib.parse.quote(encoded, "!~*'();/?:@&=+$,# "))
        elif op == self.DIFF_DELETE:
            tokens.append("-%d" % len(data))
        elif op == self.DIFF_EQUAL:
            tokens.append("=%d" % len(data))
    return "\t".join(tokens)
1161 1234
def diff_fromDelta(self, text1, delta):
    """Given the original text1, and an encoded string which describes the
    operations required to transform text1 into text2, compute the full diff.

    Args:
      text1: Source string for the diff.
      delta: Delta text.

    Returns:
      Array of diff tuples.

    Raises:
      ValueError: If invalid input.
    """
    if isinstance(delta, str):
        # Deltas should be composed of a subset of ascii chars, Unicode not
        # required.  If this encode raises UnicodeEncodeError, delta is
        # invalid.  BUGFIX: validate only -- do NOT rebind 'delta' to the
        # bytes result, otherwise delta.split("\t") below raises TypeError
        # on Python 3 (bytes.split requires a bytes separator).
        delta.encode("ascii")
    diffs = []
    pointer = 0  # Cursor in text1
    for token in delta.split("\t"):
        if token == "":
            # Blank tokens are ok (from a trailing \t).
            continue
        # Each token begins with a one character parameter which specifies
        # the operation of this token (delete, insert, equality).
        param = token[1:]
        if token[0] == "+":
            # Inserted text was %xx-escaped by diff_toDelta.
            diffs.append((self.DIFF_INSERT, urllib.parse.unquote(param)))
        elif token[0] == "-" or token[0] == "=":
            try:
                n = int(param)
            except ValueError:
                raise ValueError("Invalid number in diff_fromDelta: " + param)
            if n < 0:
                raise ValueError("Negative number in diff_fromDelta: " + param)
            text = text1[pointer : pointer + n]
            pointer += n
            if token[0] == "=":
                diffs.append((self.DIFF_EQUAL, text))
            else:
                diffs.append((self.DIFF_DELETE, text))
        else:
            # Anything else is an error.
            raise ValueError(
                "Invalid diff operation in diff_fromDelta: " + token[0]
            )
    if pointer != len(text1):
        raise ValueError(
            "Delta length (%d) does not equal source text length (%d)."
            % (pointer, len(text1))
        )
    return diffs
1215 1290
1216 1291 # MATCH FUNCTIONS
1217 1292
def match_main(self, text, pattern, loc):
    """Locate the best instance of 'pattern' in 'text' near 'loc'.

    Args:
      text: The text to search.
      pattern: The pattern to search for.
      loc: The location to search around.

    Returns:
      Best match index or -1.

    Raises:
      ValueError: If text or pattern is None.
    """
    # Check for null inputs.
    if text is None or pattern is None:
        raise ValueError("Null inputs. (match_main)")

    # Clamp the location into the valid range.
    loc = max(0, min(loc, len(text)))
    if text == pattern:
        # Shortcut (potentially not guaranteed by the algorithm)
        return 0
    if not text:
        # Nothing to match.
        return -1
    if text[loc : loc + len(pattern)] == pattern:
        # Perfect match at the perfect spot!  (Includes case of null pattern)
        return loc
    # Do a fuzzy compare.
    return self.match_bitap(text, pattern, loc)
1247 1322
def match_bitap(self, text, pattern, loc):
    """Locate the best instance of 'pattern' in 'text' near 'loc' using the
    Bitap algorithm.

    Args:
      text: The text to search.
      pattern: The pattern to search for.
      loc: The location to search around.

    Returns:
      Best match index or -1.
    """
    # Python doesn't have a maxint limit, so ignore this check.
    # if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
    #     raise ValueError("Pattern too long for this application.")

    # Initialise the alphabet.
    s = self.match_alphabet(pattern)

    def bitap_score(e, x):
        """Score a match with e errors at location x (0.0 good, 1.0 bad).
        Accesses loc and pattern through being a closure.

        Args:
          e: Number of errors in match.
          x: Location of match.

        Returns:
          Overall score for match (0.0 = good, 1.0 = bad).
        """
        accuracy = float(e) / len(pattern)
        proximity = abs(loc - x)
        if not self.Match_Distance:
            # Dodge divide by zero error.
            return proximity and 1.0 or accuracy
        return accuracy + (proximity / float(self.Match_Distance))

    # Highest score beyond which we give up.
    score_threshold = self.Match_Threshold
    # Is there a nearby exact match? (speedup)
    best_loc = text.find(pattern, loc)
    if best_loc != -1:
        score_threshold = min(bitap_score(0, best_loc), score_threshold)
        # What about in the other direction? (speedup)
        best_loc = text.rfind(pattern, loc + len(pattern))
        if best_loc != -1:
            score_threshold = min(bitap_score(0, best_loc), score_threshold)

    # Initialise the bit arrays.
    matchmask = 1 << (len(pattern) - 1)
    best_loc = -1

    bin_max = len(pattern) + len(text)
    # Empty initialization added to appease pychecker.
    last_rd = None
    for d in range(len(pattern)):
        # Scan for the best match; each iteration allows for one more error.
        # Run a binary search to determine how far from 'loc' we can stray
        # at this error level.
        bin_min = 0
        bin_mid = bin_max
        while bin_min < bin_mid:
            if bitap_score(d, loc + bin_mid) <= score_threshold:
                bin_min = bin_mid
            else:
                bin_max = bin_mid
            bin_mid = (bin_max - bin_min) // 2 + bin_min

        # Use the result from this iteration as the maximum for the next.
        bin_max = bin_mid
        start = max(1, loc - bin_mid + 1)
        finish = min(loc + bin_mid, len(text)) + len(pattern)

        rd = [0] * (finish + 2)
        rd[finish + 1] = (1 << d) - 1
        for j in range(finish, start - 1, -1):
            if len(text) <= j - 1:
                # Out of range.
                char_match = 0
            else:
                char_match = s.get(text[j - 1], 0)
            if d == 0:  # First pass: exact match.
                rd[j] = ((rd[j + 1] << 1) | 1) & char_match
            else:  # Subsequent passes: fuzzy match.
                rd[j] = (
                    (((rd[j + 1] << 1) | 1) & char_match)
                    | (((last_rd[j + 1] | last_rd[j]) << 1) | 1)
                    | last_rd[j + 1]
                )
            if rd[j] & matchmask:
                score = bitap_score(d, j - 1)
                # This match will almost certainly be better than any
                # existing match.  But check anyway.
                if score <= score_threshold:
                    # Told you so.
                    score_threshold = score
                    best_loc = j - 1
                    if best_loc > loc:
                        # When passing loc, don't exceed our current
                        # distance from loc.
                        start = max(1, 2 * loc - best_loc)
                    else:
                        # Already passed loc, downhill from here on in.
                        break
        # No hope for a (better) match at greater error levels.
        if bitap_score(d + 1, loc) > score_threshold:
            break
        last_rd = rd
    return best_loc
1353 1431
def match_alphabet(self, pattern):
    """Initialise the alphabet for the Bitap algorithm.

    Args:
      pattern: The text to encode.

    Returns:
      Hash of character locations: each char maps to a bitmask with a bit
      set for every position the char occupies in the pattern (MSB first).
    """
    masks = {}
    n = len(pattern)
    for i, ch in enumerate(pattern):
        masks[ch] = masks.get(ch, 0) | (1 << (n - i - 1))
    return masks
1369 1447
1370 1448 # PATCH FUNCTIONS
1371 1449
def patch_addContext(self, patch, text):
    """Increase the context until it is unique,
    but don't let the pattern expand beyond Match_MaxBits.

    Args:
      patch: The patch to grow.  Modified in place.
      text: Source text.
    """
    if len(text) == 0:
        return
    pattern = text[patch.start2 : patch.start2 + patch.length1]
    padding = 0

    # Look for the first and last matches of pattern in text.  If two
    # different matches are found, increase the pattern length.
    while text.find(pattern) != text.rfind(pattern) and (
        self.Match_MaxBits == 0
        or len(pattern) < self.Match_MaxBits - 2 * self.Patch_Margin
    ):
        padding += self.Patch_Margin
        pattern = text[
            max(0, patch.start2 - padding) : patch.start2
            + patch.length1
            + padding
        ]
    # Add one chunk for good luck.
    padding += self.Patch_Margin

    # Add the prefix.
    prefix = text[max(0, patch.start2 - padding) : patch.start2]
    if prefix:
        patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
    # Add the suffix.
    suffix = text[
        patch.start2 + patch.length1 : patch.start2 + patch.length1 + padding
    ]
    if suffix:
        patch.diffs.append((self.DIFF_EQUAL, suffix))

    # Roll back the start points.
    patch.start1 -= len(prefix)
    patch.start2 -= len(prefix)
    # Extend lengths.
    patch.length1 += len(prefix) + len(suffix)
    patch.length2 += len(prefix) + len(suffix)
def patch_make(self, a, b=None, c=None):
    """Compute a list of patches to turn text1 into text2.
    Use diffs if provided, otherwise compute it ourselves.
    There are four ways to call this function, depending on what data is
    available to the caller:
    Method 1:
      a = text1, b = text2
    Method 2:
      a = diffs
    Method 3 (optimal):
      a = text1, b = diffs
    Method 4 (deprecated, use method 3):
      a = text1, b = text2, c = diffs

    Args:
      a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
          text2 (method 2).
      b: text2 (methods 1,4) or Array of diff tuples for text1 to
          text2 (method 3) or undefined (method 2).
      c: Array of diff tuples for text1 to text2 (method 4) or
          undefined (methods 1,2,3).

    Returns:
      Array of Patch objects.

    Raises:
      ValueError: If the argument combination matches no method.
    """
    text1 = None
    diffs = None
    # Note that texts may arrive as 'str' or 'unicode'.
    if isinstance(a, str) and isinstance(b, str) and c is None:
        # Method 1: text1, text2 -- compute diffs from the two texts.
        text1 = a
        diffs = self.diff_main(text1, b, True)
        if len(diffs) > 2:
            self.diff_cleanupSemantic(diffs)
            self.diff_cleanupEfficiency(diffs)
    elif isinstance(a, list) and b is None and c is None:
        # Method 2: diffs -- derive text1 from the diffs.
        diffs = a
        text1 = self.diff_text1(diffs)
    elif isinstance(a, str) and isinstance(b, list) and c is None:
        # Method 3: text1, diffs
        text1 = a
        diffs = b
    elif isinstance(a, str) and isinstance(b, str) and isinstance(c, list):
        # Method 4: text1, text2, diffs (text2 is not used).
        text1 = a
        diffs = c
    else:
        raise ValueError("Unknown call format to patch_make.")

    if not diffs:
        return []  # Get rid of the None case.
    patches = []
    patch = patch_obj()
    char_count1 = 0  # Number of characters into the text1 string.
    char_count2 = 0  # Number of characters into the text2 string.
    prepatch_text = text1  # Recreate the patches to determine context info.
    postpatch_text = text1
    for x in range(len(diffs)):
        (diff_type, diff_text) = diffs[x]
        if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
            # A new patch starts here.
            patch.start1 = char_count1
            patch.start2 = char_count2
        if diff_type == self.DIFF_INSERT:
            # Insertion.
            patch.diffs.append(diffs[x])
            patch.length2 += len(diff_text)
            postpatch_text = (
                postpatch_text[:char_count2]
                + diff_text
                + postpatch_text[char_count2:]
            )
        elif diff_type == self.DIFF_DELETE:
            # Deletion.
            patch.length1 += len(diff_text)
            patch.diffs.append(diffs[x])
            postpatch_text = (
                postpatch_text[:char_count2]
                + postpatch_text[char_count2 + len(diff_text) :]
            )
        elif (
            diff_type == self.DIFF_EQUAL
            and len(diff_text) <= 2 * self.Patch_Margin
            and len(patch.diffs) != 0
            and len(diffs) != x + 1
        ):
            # Small equality inside a patch.
            patch.diffs.append(diffs[x])
            patch.length1 += len(diff_text)
            patch.length2 += len(diff_text)

        if diff_type == self.DIFF_EQUAL and len(diff_text) >= 2 * self.Patch_Margin:
            # Time for a new patch.
            if len(patch.diffs) != 0:
                self.patch_addContext(patch, prepatch_text)
                patches.append(patch)
                patch = patch_obj()
                # Unlike Unidiff, our patch lists have a rolling context.
                # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
                # Update prepatch text & pos to reflect the application of
                # the just completed patch.
                prepatch_text = postpatch_text
                char_count1 = char_count2

        # Update the current character count.
        if diff_type != self.DIFF_INSERT:
            char_count1 += len(diff_text)
        if diff_type != self.DIFF_DELETE:
            char_count2 += len(diff_text)

    # Pick up the leftover patch if not empty.
    if len(patch.diffs) != 0:
        self.patch_addContext(patch, prepatch_text)
        patches.append(patch)
    return patches
1526 1613
1527 1614 def patch_deepCopy(self, patches):
1528 1615 """Given an array of patches, return another array that is identical.
1529 1616
1530 1617 Args:
1531 1618 patches: Array of Patch objects.
1532 1619
1533 1620 Returns:
1534 1621 Array of Patch objects.
1535 1622 """
1536 1623 patchesCopy = []
1537 1624 for patch in patches:
1538 1625 patchCopy = patch_obj()
1539 1626 # No need to deep copy the tuples since they are immutable.
1540 1627 patchCopy.diffs = patch.diffs[:]
1541 1628 patchCopy.start1 = patch.start1
1542 1629 patchCopy.start2 = patch.start2
1543 1630 patchCopy.length1 = patch.length1
1544 1631 patchCopy.length2 = patch.length2
1545 1632 patchesCopy.append(patchCopy)
1546 1633 return patchesCopy
1547 1634
1548 1635 def patch_apply(self, patches, text):
1549 1636 """Merge a set of patches onto the text. Return a patched text, as well
1550 1637 as a list of true/false values indicating which patches were applied.
1551 1638
1552 1639 Args:
1553 1640 patches: Array of Patch objects.
1554 1641 text: Old text.
1555 1642
1556 1643 Returns:
1557 1644 Two element Array, containing the new text and an array of boolean values.
1558 1645 """
1559 1646 if not patches:
1560 1647 return (text, [])
1561 1648
1562 1649 # Deep copy the patches so that no changes are made to originals.
1563 1650 patches = self.patch_deepCopy(patches)
1564 1651
1565 1652 nullPadding = self.patch_addPadding(patches)
1566 1653 text = nullPadding + text + nullPadding
1567 1654 self.patch_splitMax(patches)
1568 1655
1569 1656 # delta keeps track of the offset between the expected and actual location
1570 1657 # of the previous patch. If there are patches expected at positions 10 and
1571 1658 # 20, but the first patch was found at 12, delta is 2 and the second patch
1572 1659 # has an effective expected position of 22.
1573 1660 delta = 0
1574 1661 results = []
1575 1662 for patch in patches:
1576 1663 expected_loc = patch.start2 + delta
1577 1664 text1 = self.diff_text1(patch.diffs)
1578 1665 end_loc = -1
1579 1666 if len(text1) > self.Match_MaxBits:
1580 1667 # patch_splitMax will only provide an oversized pattern in the case of
1581 1668 # a monster delete.
1582 start_loc = self.match_main(text, text1[:self.Match_MaxBits],
1583 expected_loc)
1669 start_loc = self.match_main(
1670 text, text1[: self.Match_MaxBits], expected_loc
1671 )
1584 1672 if start_loc != -1:
1585 end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
1586 expected_loc + len(text1) - self.Match_MaxBits)
1673 end_loc = self.match_main(
1674 text,
1675 text1[-self.Match_MaxBits :],
1676 expected_loc + len(text1) - self.Match_MaxBits,
1677 )
1587 1678 if end_loc == -1 or start_loc >= end_loc:
1588 1679 # Can't find valid trailing context. Drop this patch.
1589 1680 start_loc = -1
1590 1681 else:
1591 1682 start_loc = self.match_main(text, text1, expected_loc)
1592 1683 if start_loc == -1:
1593 1684 # No match found. :(
1594 1685 results.append(False)
1595 1686 # Subtract the delta for this failed patch from subsequent patches.
1596 1687 delta -= patch.length2 - patch.length1
1597 1688 else:
1598 1689 # Found a match. :)
1599 1690 results.append(True)
1600 1691 delta = start_loc - expected_loc
1601 1692 if end_loc == -1:
1602 1693 text2 = text[start_loc : start_loc + len(text1)]
1603 1694 else:
1604 1695 text2 = text[start_loc : end_loc + self.Match_MaxBits]
1605 1696 if text1 == text2:
1606 1697 # Perfect match, just shove the replacement text in.
1607 text = (text[:start_loc] + self.diff_text2(patch.diffs) +
1608 text[start_loc + len(text1):])
1698 text = (
1699 text[:start_loc]
1700 + self.diff_text2(patch.diffs)
1701 + text[start_loc + len(text1) :]
1702 )
1609 1703 else:
1610 1704 # Imperfect match.
1611 1705 # Run a diff to get a framework of equivalent indices.
1612 1706 diffs = self.diff_main(text1, text2, False)
1613 if (len(text1) > self.Match_MaxBits and
1614 self.diff_levenshtein(diffs) / float(len(text1)) >
1615 self.Patch_DeleteThreshold):
1707 if (
1708 len(text1) > self.Match_MaxBits
1709 and self.diff_levenshtein(diffs) / float(len(text1))
1710 > self.Patch_DeleteThreshold
1711 ):
1616 1712 # The end points match, but the content is unacceptably bad.
1617 1713 results[-1] = False
1618 1714 else:
1619 1715 self.diff_cleanupSemanticLossless(diffs)
1620 1716 index1 = 0
1621 for (op, data) in patch.diffs:
1717 for op, data in patch.diffs:
1622 1718 if op != self.DIFF_EQUAL:
1623 1719 index2 = self.diff_xIndex(diffs, index1)
1624 1720 if op == self.DIFF_INSERT: # Insertion
1625 text = text[:start_loc + index2] + data + text[start_loc +
1626 index2:]
1721 text = (
1722 text[: start_loc + index2]
1723 + data
1724 + text[start_loc + index2 :]
1725 )
1627 1726 elif op == self.DIFF_DELETE: # Deletion
1628 text = text[:start_loc + index2] + text[start_loc +
1629 self.diff_xIndex(diffs, index1 + len(data)):]
1727 text = (
1728 text[: start_loc + index2]
1729 + text[
1730 start_loc
1731 + self.diff_xIndex(diffs, index1 + len(data)) :
1732 ]
1733 )
1630 1734 if op != self.DIFF_DELETE:
1631 1735 index1 += len(data)
1632 1736 # Strip the padding off.
1633 1737 text = text[len(nullPadding):-len(nullPadding)]
1634 1738 return (text, results)
1635 1739
1636 1740 def patch_addPadding(self, patches):
1637 1741 """Add some padding on text start and end so that edges can match
1638 1742 something. Intended to be called only from within patch_apply.
1639 1743
1640 1744 Args:
1641 1745 patches: Array of Patch objects.
1642 1746
1643 1747 Returns:
1644 1748 The padding string added to each side.
1645 1749 """
1646 1750 paddingLength = self.Patch_Margin
1647 1751 nullPadding = ""
1648 1752 for x in range(1, paddingLength + 1):
1649 1753 nullPadding += chr(x)
1650 1754
1651 1755 # Bump all the patches forward.
1652 1756 for patch in patches:
1653 1757 patch.start1 += paddingLength
1654 1758 patch.start2 += paddingLength
1655 1759
1656 1760 # Add some padding on start of first diff.
1657 1761 patch = patches[0]
1658 1762 diffs = patch.diffs
1659 1763 if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1660 1764 # Add nullPadding equality.
1661 1765 diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1662 1766 patch.start1 -= paddingLength # Should be 0.
1663 1767 patch.start2 -= paddingLength # Should be 0.
1664 1768 patch.length1 += paddingLength
1665 1769 patch.length2 += paddingLength
1666 1770 elif paddingLength > len(diffs[0][1]):
1667 1771 # Grow first equality.
1668 1772 extraLength = paddingLength - len(diffs[0][1])
1669 1773 newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]
1670 1774 diffs[0] = (diffs[0][0], newText)
1671 1775 patch.start1 -= extraLength
1672 1776 patch.start2 -= extraLength
1673 1777 patch.length1 += extraLength
1674 1778 patch.length2 += extraLength
1675 1779
1676 1780 # Add some padding on end of last diff.
1677 1781 patch = patches[-1]
1678 1782 diffs = patch.diffs
1679 1783 if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1680 1784 # Add nullPadding equality.
1681 1785 diffs.append((self.DIFF_EQUAL, nullPadding))
1682 1786 patch.length1 += paddingLength
1683 1787 patch.length2 += paddingLength
1684 1788 elif paddingLength > len(diffs[-1][1]):
1685 1789 # Grow last equality.
1686 1790 extraLength = paddingLength - len(diffs[-1][1])
1687 1791 newText = diffs[-1][1] + nullPadding[:extraLength]
1688 1792 diffs[-1] = (diffs[-1][0], newText)
1689 1793 patch.length1 += extraLength
1690 1794 patch.length2 += extraLength
1691 1795
1692 1796 return nullPadding
1693 1797
1694 1798 def patch_splitMax(self, patches):
1695 1799 """Look through the patches and break up any which are longer than the
1696 1800 maximum limit of the match algorithm.
1697 1801 Intended to be called only from within patch_apply.
1698 1802
1699 1803 Args:
1700 1804 patches: Array of Patch objects.
1701 1805 """
1702 1806 patch_size = self.Match_MaxBits
1703 1807 if patch_size == 0:
1704 1808 # Python has the option of not splitting strings due to its ability
1705 1809 # to handle integers of arbitrary precision.
1706 1810 return
1707 1811 for x in range(len(patches)):
1708 1812 if patches[x].length1 <= patch_size:
1709 1813 continue
1710 1814 bigpatch = patches[x]
1711 1815 # Remove the big old patch.
1712 1816 del patches[x]
1713 1817 x -= 1
1714 1818 start1 = bigpatch.start1
1715 1819 start2 = bigpatch.start2
1716 precontext = ''
1820 precontext = ""
1717 1821 while len(bigpatch.diffs) != 0:
1718 1822 # Create one of several smaller patches.
1719 1823 patch = patch_obj()
1720 1824 empty = True
1721 1825 patch.start1 = start1 - len(precontext)
1722 1826 patch.start2 = start2 - len(precontext)
1723 1827 if precontext:
1724 1828 patch.length1 = patch.length2 = len(precontext)
1725 1829 patch.diffs.append((self.DIFF_EQUAL, precontext))
1726 1830
1727 while (len(bigpatch.diffs) != 0 and
1728 patch.length1 < patch_size - self.Patch_Margin):
1831 while (
1832 len(bigpatch.diffs) != 0
1833 and patch.length1 < patch_size - self.Patch_Margin
1834 ):
1729 1835 (diff_type, diff_text) = bigpatch.diffs[0]
1730 1836 if diff_type == self.DIFF_INSERT:
1731 1837 # Insertions are harmless.
1732 1838 patch.length2 += len(diff_text)
1733 1839 start2 += len(diff_text)
1734 1840 patch.diffs.append(bigpatch.diffs.pop(0))
1735 1841 empty = False
1736 elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
1737 patch.diffs[0][0] == self.DIFF_EQUAL and
1738 len(diff_text) > 2 * patch_size):
1842 elif (
1843 diff_type == self.DIFF_DELETE
1844 and len(patch.diffs) == 1
1845 and patch.diffs[0][0] == self.DIFF_EQUAL
1846 and len(diff_text) > 2 * patch_size
1847 ):
1739 1848 # This is a large deletion. Let it pass in one chunk.
1740 1849 patch.length1 += len(diff_text)
1741 1850 start1 += len(diff_text)
1742 1851 empty = False
1743 1852 patch.diffs.append((diff_type, diff_text))
1744 1853 del bigpatch.diffs[0]
1745 1854 else:
1746 1855 # Deletion or equality. Only take as much as we can stomach.
1747 diff_text = diff_text[:patch_size - patch.length1 -
1748 self.Patch_Margin]
1856 diff_text = diff_text[
1857 : patch_size - patch.length1 - self.Patch_Margin
1858 ]
1749 1859 patch.length1 += len(diff_text)
1750 1860 start1 += len(diff_text)
1751 1861 if diff_type == self.DIFF_EQUAL:
1752 1862 patch.length2 += len(diff_text)
1753 1863 start2 += len(diff_text)
1754 1864 else:
1755 1865 empty = False
1756 1866
1757 1867 patch.diffs.append((diff_type, diff_text))
1758 1868 if diff_text == bigpatch.diffs[0][1]:
1759 1869 del bigpatch.diffs[0]
1760 1870 else:
1761 bigpatch.diffs[0] = (bigpatch.diffs[0][0],
1762 bigpatch.diffs[0][1][len(diff_text):])
1871 bigpatch.diffs[0] = (
1872 bigpatch.diffs[0][0],
1873 bigpatch.diffs[0][1][len(diff_text) :],
1874 )
1763 1875
1764 1876 # Compute the head context for the next patch.
1765 1877 precontext = self.diff_text2(patch.diffs)
1766 1878 precontext = precontext[-self.Patch_Margin:]
1767 1879 # Append the end context for this patch.
1768 1880 postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin]
1769 1881 if postcontext:
1770 1882 patch.length1 += len(postcontext)
1771 1883 patch.length2 += len(postcontext)
1772 1884 if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1773 patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
1774 postcontext)
1885 patch.diffs[-1] = (
1886 self.DIFF_EQUAL,
1887 patch.diffs[-1][1] + postcontext,
1888 )
1775 1889 else:
1776 1890 patch.diffs.append((self.DIFF_EQUAL, postcontext))
1777 1891
1778 1892 if not empty:
1779 1893 x += 1
1780 1894 patches.insert(x, patch)
1781 1895
1782 1896 def patch_toText(self, patches):
1783 1897 """Take a list of patches and return a textual representation.
1784 1898
1785 1899 Args:
1786 1900 patches: Array of Patch objects.
1787 1901
1788 1902 Returns:
1789 1903 Text representation of patches.
1790 1904 """
1791 1905 text = []
1792 1906 for patch in patches:
1793 1907 text.append(str(patch))
1794 1908 return "".join(text)
1795 1909
1796 1910 def patch_fromText(self, textline):
1797 1911 """Parse a textual representation of patches and return a list of patch
1798 1912 objects.
1799 1913
1800 1914 Args:
1801 1915 textline: Text representation of patches.
1802 1916
1803 1917 Returns:
1804 1918 Array of Patch objects.
1805 1919
1806 1920 Raises:
1807 1921 ValueError: If invalid input.
1808 1922 """
1809 1923 if type(textline) == unicode:
1810 1924 # Patches should be composed of a subset of ascii chars, Unicode not
1811 1925 # required. If this encode raises UnicodeEncodeError, patch is invalid.
1812 1926 textline = textline.encode("ascii")
1813 1927 patches = []
1814 1928 if not textline:
1815 1929 return patches
1816 text = textline.split('\n')
1930 text = textline.split("\n")
1817 1931 while len(text) != 0:
1818 1932 m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1819 1933 if not m:
1820 1934 raise ValueError("Invalid patch string: " + text[0])
1821 1935 patch = patch_obj()
1822 1936 patches.append(patch)
1823 1937 patch.start1 = int(m.group(1))
1824 if m.group(2) == '':
1938 if m.group(2) == "":
1825 1939 patch.start1 -= 1
1826 1940 patch.length1 = 1
1827 elif m.group(2) == '0':
1941 elif m.group(2) == "0":
1828 1942 patch.length1 = 0
1829 1943 else:
1830 1944 patch.start1 -= 1
1831 1945 patch.length1 = int(m.group(2))
1832 1946
1833 1947 patch.start2 = int(m.group(3))
1834 if m.group(4) == '':
1948 if m.group(4) == "":
1835 1949 patch.start2 -= 1
1836 1950 patch.length2 = 1
1837 elif m.group(4) == '0':
1951 elif m.group(4) == "0":
1838 1952 patch.length2 = 0
1839 1953 else:
1840 1954 patch.start2 -= 1
1841 1955 patch.length2 = int(m.group(4))
1842 1956
1843 1957 del text[0]
1844 1958
1845 1959 while len(text) != 0:
1846 1960 if text[0]:
1847 1961 sign = text[0][0]
1848 1962 else:
1849 sign = ''
1963 sign = ""
1850 1964 line = urllib.parse.unquote(text[0][1:])
1851 1965 line = line.decode("utf-8")
1852 if sign == '+':
1966 if sign == "+":
1853 1967 # Insertion.
1854 1968 patch.diffs.append((self.DIFF_INSERT, line))
1855 elif sign == '-':
1969 elif sign == "-":
1856 1970 # Deletion.
1857 1971 patch.diffs.append((self.DIFF_DELETE, line))
1858 elif sign == ' ':
1972 elif sign == " ":
1859 1973 # Minor equality.
1860 1974 patch.diffs.append((self.DIFF_EQUAL, line))
1861 elif sign == '@':
1975 elif sign == "@":
1862 1976 # Start of next patch.
1863 1977 break
1864 elif sign == '':
1978 elif sign == "":
1865 1979 # Blank line? Whatever.
1866 1980 pass
1867 1981 else:
1868 1982 # WTF?
1869 1983 raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
1870 1984 del text[0]
1871 1985 return patches
1872 1986
1873 1987
class patch_obj:
    """Class representing one patch operation."""

    def __init__(self):
        """Initializes with an empty list of diffs."""
        self.diffs = []  # List of (op, text) diff tuples.
        self.start1 = None  # Start offset within text1.
        self.start2 = None  # Start offset within text2.
        self.length1 = 0  # Length of the affected span in text1.
        self.length2 = 0  # Length of the affected span in text2.

    @staticmethod
    def _format_coords(start, length):
        # GNU diff prints 1-based indices and omits the count when it is 1.
        if length == 0:
            return str(start) + ",0"
        if length == 1:
            return str(start + 1)
        return str(start + 1) + "," + str(length)

    def __str__(self):
        """Emmulate GNU diff's format.
        Header: @@ -382,8 +481,9 @@
        Indicies are printed as 1-based, not 0-based.

        Returns:
          The GNU diff string.
        """
        coords1 = self._format_coords(self.start1, self.length1)
        coords2 = self._format_coords(self.start2, self.length2)
        pieces = ["@@ -", coords1, " +", coords2, " @@\n"]
        # Escape the body of the patch with %xx notation.
        for op, data in self.diffs:
            if op == diff_match_patch.DIFF_INSERT:
                pieces.append("+")
            elif op == diff_match_patch.DIFF_DELETE:
                pieces.append("-")
            elif op == diff_match_patch.DIFF_EQUAL:
                pieces.append(" ")
            # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
            data = data.encode("utf-8")
            pieces.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
        return "".join(pieces)
General Comments 0
You need to be logged in to leave comments. Login now