diffs: python3 port
super-admin
r5083:875cd526 default
@@ -1,2031 +1,2033 @@
1 1 """Diff Match and Patch
2 2
3 3 Copyright 2006 Google Inc.
4 4 http://code.google.com/p/google-diff-match-patch/
5 5
6 6 Licensed under the Apache License, Version 2.0 (the "License");
7 7 you may not use this file except in compliance with the License.
8 8 You may obtain a copy of the License at
9 9
10 10 http://www.apache.org/licenses/LICENSE-2.0
11 11
12 12 Unless required by applicable law or agreed to in writing, software
13 13 distributed under the License is distributed on an "AS IS" BASIS,
14 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 15 See the License for the specific language governing permissions and
16 16 limitations under the License.
17 17 """
18 18
19 19 """Functions for diff, match and patch.
20 20
21 21 Computes the difference between two texts to create a patch.
22 22 Applies the patch onto another text, allowing for errors.
23 23 """
24 24
25 25 __author__ = "fraser@google.com (Neil Fraser)"
26 26
27 27 import math
28 28 import re
29 29 import sys
30 30 import time
31 import urllib.request, urllib.parse, urllib.error
31 import urllib.request
32 import urllib.parse
33 import urllib.error
32 34
33 35
34 36 class diff_match_patch:
35 37 """Class containing the diff, match and patch methods.
36 38
37 39 Also contains the behaviour settings.
38 40 """
39 41
40 42 def __init__(self):
41 43 """Inits a diff_match_patch object with default settings.
42 44 Redefine these in your program to override the defaults.
43 45 """
44 46
45 47 # Number of seconds to map a diff before giving up (0 for infinity).
46 48 self.Diff_Timeout = 1.0
47 49 # Cost of an empty edit operation in terms of edit characters.
48 50 self.Diff_EditCost = 4
49 51 # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
50 52 self.Match_Threshold = 0.5
51 53 # How far to search for a match (0 = exact location, 1000+ = broad match).
52 54 # A match this many characters away from the expected location will add
53 55 # 1.0 to the score (0.0 is a perfect match).
54 56 self.Match_Distance = 1000
55 57 # When deleting a large block of text (over ~64 characters), how close do
56 58 # the contents have to be to match the expected contents. (0.0 = perfection,
57 59 # 1.0 = very loose). Note that Match_Threshold controls how closely the
58 60 # end points of a delete need to match.
59 61 self.Patch_DeleteThreshold = 0.5
60 62 # Chunk size for context length.
61 63 self.Patch_Margin = 4
62 64
63 65 # The number of bits in an int.
64 66 # Python has no maximum, thus to disable patch splitting set to 0.
65 67 # However to avoid long patches in certain pathological cases, use 32.
66 68 # Multiple short patches (using native ints) are much faster than long ones.
67 69 self.Match_MaxBits = 32
68 70
69 71 # DIFF FUNCTIONS
70 72
71 73 # The data structure representing a diff is an array of tuples:
72 74 # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
73 75 # which means: delete "Hello", add "Goodbye" and keep " world."
74 76 DIFF_DELETE = -1
75 77 DIFF_INSERT = 1
76 78 DIFF_EQUAL = 0
77 79
78 80 def diff_main(self, text1, text2, checklines=True, deadline=None):
79 81 """Find the differences between two texts. Simplifies the problem by
80 82 stripping any common prefix or suffix off the texts before diffing.
81 83
82 84 Args:
83 85 text1: Old string to be diffed.
84 86 text2: New string to be diffed.
85 87 checklines: Optional speedup flag. If present and false, then don't run
86 88 a line-level diff first to identify the changed areas.
87 89 Defaults to true, which does a faster, slightly less optimal diff.
88 90 deadline: Optional time when the diff should be complete by. Used
89 91 internally for recursive calls. Users should set Diff_Timeout instead.
90 92
91 93 Returns:
92 94 Array of changes.
93 95 """
94 96 # Set a deadline by which time the diff must be complete.
95 97 if deadline is None:
96 98 # Unlike in most languages, Python counts time in seconds.
97 99 if self.Diff_Timeout <= 0:
98 100 deadline = sys.maxsize
99 101 else:
100 102 deadline = time.time() + self.Diff_Timeout
101 103
102 104 # Check for null inputs.
103 105 if text1 is None or text2 is None:
104 106 raise ValueError("Null inputs. (diff_main)")
105 107
106 108 # Check for equality (speedup).
107 109 if text1 == text2:
108 110 if text1:
109 111 return [(self.DIFF_EQUAL, text1)]
110 112 return []
111 113
112 114 # Trim off common prefix (speedup).
113 115 commonlength = self.diff_commonPrefix(text1, text2)
114 116 commonprefix = text1[:commonlength]
115 117 text1 = text1[commonlength:]
116 118 text2 = text2[commonlength:]
117 119
118 120 # Trim off common suffix (speedup).
119 121 commonlength = self.diff_commonSuffix(text1, text2)
120 122 if commonlength == 0:
121 123 commonsuffix = ""
122 124 else:
123 125 commonsuffix = text1[-commonlength:]
124 126 text1 = text1[:-commonlength]
125 127 text2 = text2[:-commonlength]
126 128
127 129 # Compute the diff on the middle block.
128 130 diffs = self.diff_compute(text1, text2, checklines, deadline)
129 131
130 132 # Restore the prefix and suffix.
131 133 if commonprefix:
132 134 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
133 135 if commonsuffix:
134 136 diffs.append((self.DIFF_EQUAL, commonsuffix))
135 137 self.diff_cleanupMerge(diffs)
136 138 return diffs
137 139
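
A minimal usage sketch for diff_main (the import path is an assumption; this diff does not show how the module is packaged, but the upstream library is conventionally importable this way). The raw output is character-level and benefits from the semantic cleanup pass defined further down:

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    diffs = dmp.diff_main("Hello World.", "Goodbye World.")
    # Raw output is character-level, e.g.:
    #   [(-1, "Hell"), (1, "G"), (0, "o"), (1, "odbye"), (0, " World.")]
    dmp.diff_cleanupSemantic(diffs)
    print(diffs)  # [(-1, "Hello"), (1, "Goodbye"), (0, " World.")]
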
138 140 def diff_compute(self, text1, text2, checklines, deadline):
139 141 """Find the differences between two texts. Assumes that the texts do not
140 142 have any common prefix or suffix.
141 143
142 144 Args:
143 145 text1: Old string to be diffed.
144 146 text2: New string to be diffed.
145 147 checklines: Speedup flag. If false, then don't run a line-level diff
146 148 first to identify the changed areas.
147 149 If true, then run a faster, slightly less optimal diff.
148 150 deadline: Time when the diff should be complete by.
149 151
150 152 Returns:
151 153 Array of changes.
152 154 """
153 155 if not text1:
154 156 # Just add some text (speedup).
155 157 return [(self.DIFF_INSERT, text2)]
156 158
157 159 if not text2:
158 160 # Just delete some text (speedup).
159 161 return [(self.DIFF_DELETE, text1)]
160 162
161 163 if len(text1) > len(text2):
162 164 (longtext, shorttext) = (text1, text2)
163 165 else:
164 166 (shorttext, longtext) = (text1, text2)
165 167 i = longtext.find(shorttext)
166 168 if i != -1:
167 169 # Shorter text is inside the longer text (speedup).
168 170 diffs = [
169 171 (self.DIFF_INSERT, longtext[:i]),
170 172 (self.DIFF_EQUAL, shorttext),
171 (self.DIFF_INSERT, longtext[i + len(shorttext) :]),
173 (self.DIFF_INSERT, longtext[i + len(shorttext):]),
172 174 ]
173 175 # Swap insertions for deletions if diff is reversed.
174 176 if len(text1) > len(text2):
175 177 diffs[0] = (self.DIFF_DELETE, diffs[0][1])
176 178 diffs[2] = (self.DIFF_DELETE, diffs[2][1])
177 179 return diffs
178 180
179 181 if len(shorttext) == 1:
180 182 # Single character string.
181 183 # After the previous speedup, the character can't be an equality.
182 184 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
183 185
184 186 # Check to see if the problem can be split in two.
185 187 hm = self.diff_halfMatch(text1, text2)
186 188 if hm:
187 189 # A half-match was found, sort out the return data.
188 190 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
189 191 # Send both pairs off for separate processing.
190 192 diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)
191 193 diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)
192 194 # Merge the results.
193 195 return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
194 196
195 197 if checklines and len(text1) > 100 and len(text2) > 100:
196 198 return self.diff_lineMode(text1, text2, deadline)
197 199
198 200 return self.diff_bisect(text1, text2, deadline)
199 201
200 202 def diff_lineMode(self, text1, text2, deadline):
201 203 """Do a quick line-level diff on both strings, then rediff the parts for
202 204 greater accuracy.
203 205 This speedup can produce non-minimal diffs.
204 206
205 207 Args:
206 208 text1: Old string to be diffed.
207 209 text2: New string to be diffed.
208 210 deadline: Time when the diff should be complete by.
209 211
210 212 Returns:
211 213 Array of changes.
212 214 """
213 215
214 216 # Scan the text on a line-by-line basis first.
215 217 (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
216 218
217 219 diffs = self.diff_main(text1, text2, False, deadline)
218 220
219 221 # Convert the diff back to original text.
220 222 self.diff_charsToLines(diffs, linearray)
221 223 # Eliminate freak matches (e.g. blank lines)
222 224 self.diff_cleanupSemantic(diffs)
223 225
224 226 # Rediff any replacement blocks, this time character-by-character.
225 227 # Add a dummy entry at the end.
226 228 diffs.append((self.DIFF_EQUAL, ""))
227 229 pointer = 0
228 230 count_delete = 0
229 231 count_insert = 0
230 232 text_delete = ""
231 233 text_insert = ""
232 234 while pointer < len(diffs):
233 235 if diffs[pointer][0] == self.DIFF_INSERT:
234 236 count_insert += 1
235 237 text_insert += diffs[pointer][1]
236 238 elif diffs[pointer][0] == self.DIFF_DELETE:
237 239 count_delete += 1
238 240 text_delete += diffs[pointer][1]
239 241 elif diffs[pointer][0] == self.DIFF_EQUAL:
240 242 # Upon reaching an equality, check for prior redundancies.
241 243 if count_delete >= 1 and count_insert >= 1:
242 244 # Delete the offending records and add the merged ones.
243 245 a = self.diff_main(text_delete, text_insert, False, deadline)
244 diffs[pointer - count_delete - count_insert : pointer] = a
246 diffs[pointer - count_delete - count_insert: pointer] = a
245 247 pointer = pointer - count_delete - count_insert + len(a)
246 248 count_insert = 0
247 249 count_delete = 0
248 250 text_delete = ""
249 251 text_insert = ""
250 252
251 253 pointer += 1
252 254
253 255 diffs.pop() # Remove the dummy entry at the end.
254 256
255 257 return diffs
256 258
257 259 def diff_bisect(self, text1, text2, deadline):
258 260 """Find the 'middle snake' of a diff, split the problem in two
259 261 and return the recursively constructed diff.
260 262 See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
261 263
262 264 Args:
263 265 text1: Old string to be diffed.
264 266 text2: New string to be diffed.
265 267 deadline: Time at which to bail if not yet complete.
266 268
267 269 Returns:
268 270 Array of diff tuples.
269 271 """
270 272
271 273 # Cache the text lengths to prevent multiple calls.
272 274 text1_length = len(text1)
273 275 text2_length = len(text2)
274 276 max_d = (text1_length + text2_length + 1) // 2
275 277 v_offset = max_d
276 278 v_length = 2 * max_d
277 279 v1 = [-1] * v_length
278 280 v1[v_offset + 1] = 0
279 281 v2 = v1[:]
280 282 delta = text1_length - text2_length
281 283 # If the total number of characters is odd, then the front path will
282 284 # collide with the reverse path.
283 285 front = delta % 2 != 0
284 286 # Offsets for start and end of k loop.
285 287 # Prevents mapping of space beyond the grid.
286 288 k1start = 0
287 289 k1end = 0
288 290 k2start = 0
289 291 k2end = 0
290 292 for d in range(max_d):
291 293 # Bail out if deadline is reached.
292 294 if time.time() > deadline:
293 295 break
294 296
295 297 # Walk the front path one step.
296 298 for k1 in range(-d + k1start, d + 1 - k1end, 2):
297 299 k1_offset = v_offset + k1
298 300 if k1 == -d or (k1 != d and v1[k1_offset - 1] < v1[k1_offset + 1]):
299 301 x1 = v1[k1_offset + 1]
300 302 else:
301 303 x1 = v1[k1_offset - 1] + 1
302 304 y1 = x1 - k1
303 305 while (
304 306 x1 < text1_length and y1 < text2_length and text1[x1] == text2[y1]
305 307 ):
306 308 x1 += 1
307 309 y1 += 1
308 310 v1[k1_offset] = x1
309 311 if x1 > text1_length:
310 312 # Ran off the right of the graph.
311 313 k1end += 2
312 314 elif y1 > text2_length:
313 315 # Ran off the bottom of the graph.
314 316 k1start += 2
315 317 elif front:
316 318 k2_offset = v_offset + delta - k1
317 319 if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:
318 320 # Mirror x2 onto top-left coordinate system.
319 321 x2 = text1_length - v2[k2_offset]
320 322 if x1 >= x2:
321 323 # Overlap detected.
322 324 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
323 325
324 326 # Walk the reverse path one step.
325 327 for k2 in range(-d + k2start, d + 1 - k2end, 2):
326 328 k2_offset = v_offset + k2
327 329 if k2 == -d or (k2 != d and v2[k2_offset - 1] < v2[k2_offset + 1]):
328 330 x2 = v2[k2_offset + 1]
329 331 else:
330 332 x2 = v2[k2_offset - 1] + 1
331 333 y2 = x2 - k2
332 334 while (
333 335 x2 < text1_length
334 336 and y2 < text2_length
335 337 and text1[-x2 - 1] == text2[-y2 - 1]
336 338 ):
337 339 x2 += 1
338 340 y2 += 1
339 341 v2[k2_offset] = x2
340 342 if x2 > text1_length:
341 343 # Ran off the left of the graph.
342 344 k2end += 2
343 345 elif y2 > text2_length:
344 346 # Ran off the top of the graph.
345 347 k2start += 2
346 348 elif not front:
347 349 k1_offset = v_offset + delta - k2
348 350 if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:
349 351 x1 = v1[k1_offset]
350 352 y1 = v_offset + x1 - k1_offset
351 353 # Mirror x2 onto top-left coordinate system.
352 354 x2 = text1_length - x2
353 355 if x1 >= x2:
354 356 # Overlap detected.
355 357 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
356 358
357 359 # Diff took too long and hit the deadline or
358 360 # number of diffs equals number of characters, no commonality at all.
359 361 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
360 362
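
The bisect step can be exercised in isolation; a sketch, with values matching the upstream test suite (import path assumed as before). Before normalization, the insertion/deletion ordering within a pair may vary:

    import sys
    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    # "cat" -> "map": the middle snake splits at the common "a".
    print(dmp.diff_bisect("cat", "map", sys.maxsize))
    # [(-1, "c"), (1, "m"), (0, "a"), (-1, "t"), (1, "p")]
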
361 363 def diff_bisectSplit(self, text1, text2, x, y, deadline):
362 364 """Given the location of the 'middle snake', split the diff in two parts
363 365 and recurse.
364 366
365 367 Args:
366 368 text1: Old string to be diffed.
367 369 text2: New string to be diffed.
368 370 x: Index of split point in text1.
369 371 y: Index of split point in text2.
370 372 deadline: Time at which to bail if not yet complete.
371 373
372 374 Returns:
373 375 Array of diff tuples.
374 376 """
375 377 text1a = text1[:x]
376 378 text2a = text2[:y]
377 379 text1b = text1[x:]
378 380 text2b = text2[y:]
379 381
380 382 # Compute both diffs serially.
381 383 diffs = self.diff_main(text1a, text2a, False, deadline)
382 384 diffsb = self.diff_main(text1b, text2b, False, deadline)
383 385
384 386 return diffs + diffsb
385 387
386 388 def diff_linesToChars(self, text1, text2):
387 389 """Split two texts into an array of strings. Reduce the texts to a string
388 390 of hashes where each Unicode character represents one line.
389 391
390 392 Args:
391 393 text1: First string.
392 394 text2: Second string.
393 395
394 396 Returns:
395 397 Three element tuple, containing the encoded text1, the encoded text2 and
396 398 the array of unique strings. The zeroth element of the array of unique
397 399 strings is intentionally blank.
398 400 """
399 401 lineArray = [] # e.g. lineArray[4] == "Hello\n"
400 402 lineHash = {} # e.g. lineHash["Hello\n"] == 4
401 403
402 404 # "\x00" is a valid character, but various debuggers don't like it.
403 405 # So we'll insert a junk entry to avoid generating a null character.
404 406 lineArray.append("")
405 407
406 408 def diff_linesToCharsMunge(text):
407 409 """Split a text into an array of strings. Reduce the texts to a string
408 410 of hashes where each Unicode character represents one line.
409 411 Modifies linearray and linehash through being a closure.
410 412
411 413 Args:
412 414 text: String to encode.
413 415
414 416 Returns:
415 417 Encoded string.
416 418 """
417 419 chars = []
418 420 # Walk the text, pulling out a substring for each line.
419 421 # text.split('\n') would temporarily double our memory footprint.
420 422 # Modifying text would create many large strings to garbage collect.
421 423 lineStart = 0
422 424 lineEnd = -1
423 425 while lineEnd < len(text) - 1:
424 426 lineEnd = text.find("\n", lineStart)
425 427 if lineEnd == -1:
426 428 lineEnd = len(text) - 1
427 429 line = text[lineStart : lineEnd + 1]
428 430 lineStart = lineEnd + 1
429 431
430 432 if line in lineHash:
431 433 chars.append(chr(lineHash[line]))
432 434 else:
433 435 lineArray.append(line)
434 436 lineHash[line] = len(lineArray) - 1
435 437 chars.append(chr(len(lineArray) - 1))
436 438 return "".join(chars)
437 439
438 440 chars1 = diff_linesToCharsMunge(text1)
439 441 chars2 = diff_linesToCharsMunge(text2)
440 442 return (chars1, chars2, lineArray)
441 443
442 444 def diff_charsToLines(self, diffs, lineArray):
443 445 """Rehydrate the text in a diff from a string of line hashes to real lines
444 446 of text.
445 447
446 448 Args:
447 449 diffs: Array of diff tuples.
448 450 lineArray: Array of unique strings.
449 451 """
450 452 for x in range(len(diffs)):
451 453 text = []
452 454 for char in diffs[x][1]:
453 455 text.append(lineArray[ord(char)])
454 456 diffs[x] = (diffs[x][0], "".join(text))
455 457
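
A sketch of the round trip through the line-mode encoding, with expected values traced from the two functions above (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    chars1, chars2, lineArray = dmp.diff_linesToChars(
        "alpha\nbeta\nalpha\n", "beta\nalpha\nbeta\n")
    assert (chars1, chars2) == ("\x01\x02\x01", "\x02\x01\x02")
    assert lineArray == ["", "alpha\n", "beta\n"]
    # Rehydrate a diff built from the encoded strings.
    diffs = [(dmp.DIFF_DELETE, chars1), (dmp.DIFF_INSERT, chars2)]
    dmp.diff_charsToLines(diffs, lineArray)
    assert diffs == [(dmp.DIFF_DELETE, "alpha\nbeta\nalpha\n"),
                     (dmp.DIFF_INSERT, "beta\nalpha\nbeta\n")]
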
456 458 def diff_commonPrefix(self, text1, text2):
457 459 """Determine the common prefix of two strings.
458 460
459 461 Args:
460 462 text1: First string.
461 463 text2: Second string.
462 464
463 465 Returns:
464 466 The number of characters common to the start of each string.
465 467 """
466 468 # Quick check for common null cases.
467 469 if not text1 or not text2 or text1[0] != text2[0]:
468 470 return 0
469 471 # Binary search.
470 472 # Performance analysis: http://neil.fraser.name/news/2007/10/09/
471 473 pointermin = 0
472 474 pointermax = min(len(text1), len(text2))
473 475 pointermid = pointermax
474 476 pointerstart = 0
475 477 while pointermin < pointermid:
476 478 if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
477 479 pointermin = pointermid
478 480 pointerstart = pointermin
479 481 else:
480 482 pointermax = pointermid
481 483 pointermid = (pointermax - pointermin) // 2 + pointermin
482 484 return pointermid
483 485
484 486 def diff_commonSuffix(self, text1, text2):
485 487 """Determine the common suffix of two strings.
486 488
487 489 Args:
488 490 text1: First string.
489 491 text2: Second string.
490 492
491 493 Returns:
492 494 The number of characters common to the end of each string.
493 495 """
494 496 # Quick check for common null cases.
495 497 if not text1 or not text2 or text1[-1] != text2[-1]:
496 498 return 0
497 499 # Binary search.
498 500 # Performance analysis: http://neil.fraser.name/news/2007/10/09/
499 501 pointermin = 0
500 502 pointermax = min(len(text1), len(text2))
501 503 pointermid = pointermax
502 504 pointerend = 0
503 505 while pointermin < pointermid:
504 506 if (
505 507 text1[-pointermid : len(text1) - pointerend]
506 508 == text2[-pointermid : len(text2) - pointerend]
507 509 ):
508 510 pointermin = pointermid
509 511 pointerend = pointermin
510 512 else:
511 513 pointermax = pointermid
512 514 pointermid = (pointermax - pointermin) // 2 + pointermin
513 515 return pointermid
514 516
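
The binary search is an internal detail; observable behaviour is simply a shared prefix/suffix length (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    assert dmp.diff_commonPrefix("1234abcdef", "1234xyz") == 4
    assert dmp.diff_commonPrefix("abc", "xyz") == 0
    assert dmp.diff_commonSuffix("abcdef1234", "xyz1234") == 4
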
515 517 def diff_commonOverlap(self, text1, text2):
516 518 """Determine if the suffix of one string is the prefix of another.
517 519
518 520 Args:
519 521 text1: First string.
520 522 text2: Second string.
521 523
522 524 Returns:
523 525 The number of characters common to the end of the first
524 526 string and the start of the second string.
525 527 """
526 528 # Cache the text lengths to prevent multiple calls.
527 529 text1_length = len(text1)
528 530 text2_length = len(text2)
529 531 # Eliminate the null case.
530 532 if text1_length == 0 or text2_length == 0:
531 533 return 0
532 534 # Truncate the longer string.
533 535 if text1_length > text2_length:
534 536 text1 = text1[-text2_length:]
535 537 elif text1_length < text2_length:
536 538 text2 = text2[:text1_length]
537 539 text_length = min(text1_length, text2_length)
538 540 # Quick check for the worst case.
539 541 if text1 == text2:
540 542 return text_length
541 543
542 544 # Start by looking for a single character match
543 545 # and increase length until no match is found.
544 546 # Performance analysis: http://neil.fraser.name/news/2010/11/04/
545 547 best = 0
546 548 length = 1
547 549 while True:
548 550 pattern = text1[-length:]
549 551 found = text2.find(pattern)
550 552 if found == -1:
551 553 return best
552 554 length += found
553 555 if found == 0 or text1[-length:] == text2[:length]:
554 556 best = length
555 557 length += 1
556 558
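
For example, traced from the loop above (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    assert dmp.diff_commonOverlap("123456xxx", "xxxabcd") == 3  # suffix "xxx" == prefix "xxx"
    assert dmp.diff_commonOverlap("abc", "abcd") == 3           # whole-string case
    assert dmp.diff_commonOverlap("123456", "abcd") == 0        # no overlap
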
557 559 def diff_halfMatch(self, text1, text2):
558 560 """Do the two texts share a substring which is at least half the length of
559 561 the longer text?
560 562 This speedup can produce non-minimal diffs.
561 563
562 564 Args:
563 565 text1: First string.
564 566 text2: Second string.
565 567
566 568 Returns:
567 569 Five element Array, containing the prefix of text1, the suffix of text1,
568 570 the prefix of text2, the suffix of text2 and the common middle. Or None
569 571 if there was no match.
570 572 """
571 573 if self.Diff_Timeout <= 0:
572 574 # Don't risk returning a non-optimal diff if we have unlimited time.
573 575 return None
574 576 if len(text1) > len(text2):
575 577 (longtext, shorttext) = (text1, text2)
576 578 else:
577 579 (shorttext, longtext) = (text1, text2)
578 580 if len(longtext) < 4 or len(shorttext) * 2 < len(longtext):
579 581 return None # Pointless.
580 582
581 583 def diff_halfMatchI(longtext, shorttext, i):
582 584 """Does a substring of shorttext exist within longtext such that the
583 585 substring is at least half the length of longtext?
584 586 Closure, but does not reference any external variables.
585 587
586 588 Args:
587 589 longtext: Longer string.
588 590 shorttext: Shorter string.
589 591 i: Start index of quarter length substring within longtext.
590 592
591 593 Returns:
592 594 Five element Array, containing the prefix of longtext, the suffix of
593 595 longtext, the prefix of shorttext, the suffix of shorttext and the
594 596 common middle. Or None if there was no match.
595 597 """
596 598 seed = longtext[i : i + len(longtext) // 4]
597 599 best_common = ""
598 600 j = shorttext.find(seed)
599 601 while j != -1:
600 602 prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
601 603 suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
602 604 if len(best_common) < suffixLength + prefixLength:
603 605 best_common = (
604 606 shorttext[j - suffixLength : j]
605 607 + shorttext[j : j + prefixLength]
606 608 )
607 609 best_longtext_a = longtext[: i - suffixLength]
608 610 best_longtext_b = longtext[i + prefixLength :]
609 611 best_shorttext_a = shorttext[: j - suffixLength]
610 612 best_shorttext_b = shorttext[j + prefixLength :]
611 613 j = shorttext.find(seed, j + 1)
612 614
613 615 if len(best_common) * 2 >= len(longtext):
614 616 return (
615 617 best_longtext_a,
616 618 best_longtext_b,
617 619 best_shorttext_a,
618 620 best_shorttext_b,
619 621 best_common,
620 622 )
621 623 else:
622 624 return None
623 625
624 626 # First check if the second quarter is the seed for a half-match.
625 627 hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4)
626 628 # Check again based on the third quarter.
627 629 hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2)
628 630 if not hm1 and not hm2:
629 631 return None
630 632 elif not hm2:
631 633 hm = hm1
632 634 elif not hm1:
633 635 hm = hm2
634 636 else:
635 637 # Both matched. Select the longest.
636 638 if len(hm1[4]) > len(hm2[4]):
637 639 hm = hm1
638 640 else:
639 641 hm = hm2
640 642
641 643 # A half-match was found, sort out the return data.
642 644 if len(text1) > len(text2):
643 645 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
644 646 else:
645 647 (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
646 648 return (text1_a, text1_b, text2_a, text2_b, mid_common)
647 649
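
With the default Diff_Timeout of 1.0 the half-match heuristic is active; a sketch (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()  # Diff_Timeout defaults to 1.0, so the heuristic runs
    assert dmp.diff_halfMatch("1234567890", "a345678z") == \
        ("12", "90", "a", "z", "345678")
    dmp.Diff_Timeout = 0  # unlimited time: never trade optimality for speed
    assert dmp.diff_halfMatch("1234567890", "a345678z") is None
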
648 650 def diff_cleanupSemantic(self, diffs):
649 651 """Reduce the number of edits by eliminating semantically trivial
650 652 equalities.
651 653
652 654 Args:
653 655 diffs: Array of diff tuples.
654 656 """
655 657 changes = False
656 658 equalities = [] # Stack of indices where equalities are found.
657 659 lastequality = None # Always equal to diffs[equalities[-1]][1]
658 660 pointer = 0 # Index of current position.
659 661 # Number of chars that changed prior to the equality.
660 662 length_insertions1, length_deletions1 = 0, 0
661 663 # Number of chars that changed after the equality.
662 664 length_insertions2, length_deletions2 = 0, 0
663 665 while pointer < len(diffs):
664 666 if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
665 667 equalities.append(pointer)
666 668 length_insertions1, length_insertions2 = length_insertions2, 0
667 669 length_deletions1, length_deletions2 = length_deletions2, 0
668 670 lastequality = diffs[pointer][1]
669 671 else: # An insertion or deletion.
670 672 if diffs[pointer][0] == self.DIFF_INSERT:
671 673 length_insertions2 += len(diffs[pointer][1])
672 674 else:
673 675 length_deletions2 += len(diffs[pointer][1])
674 676 # Eliminate an equality that is smaller or equal to the edits on both
675 677 # sides of it.
676 678 if (
677 679 lastequality
678 680 and (
679 681 len(lastequality) <= max(length_insertions1, length_deletions1)
680 682 )
681 683 and (
682 684 len(lastequality) <= max(length_insertions2, length_deletions2)
683 685 )
684 686 ):
685 687 # Duplicate record.
686 688 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
687 689 # Change second copy to insert.
688 690 diffs[equalities[-1] + 1] = (
689 691 self.DIFF_INSERT,
690 692 diffs[equalities[-1] + 1][1],
691 693 )
692 694 # Throw away the equality we just deleted.
693 695 equalities.pop()
694 696 # Throw away the previous equality (it needs to be reevaluated).
695 697 if len(equalities):
696 698 equalities.pop()
697 699 if len(equalities):
698 700 pointer = equalities[-1]
699 701 else:
700 702 pointer = -1
701 703 # Reset the counters.
702 704 length_insertions1, length_deletions1 = 0, 0
703 705 length_insertions2, length_deletions2 = 0, 0
704 706 lastequality = None
705 707 changes = True
706 708 pointer += 1
707 709
708 710 # Normalize the diff.
709 711 if changes:
710 712 self.diff_cleanupMerge(diffs)
711 713 self.diff_cleanupSemanticLossless(diffs)
712 714
713 715 # Find any overlaps between deletions and insertions.
714 716 # e.g: <del>abcxxx</del><ins>xxxdef</ins>
715 717 # -> <del>abc</del>xxx<ins>def</ins>
716 718 # e.g: <del>xxxabc</del><ins>defxxx</ins>
717 719 # -> <ins>def</ins>xxx<del>abc</del>
718 720 # Only extract an overlap if it is as big as the edit ahead or behind it.
719 721 pointer = 1
720 722 while pointer < len(diffs):
721 723 if (
722 724 diffs[pointer - 1][0] == self.DIFF_DELETE
723 725 and diffs[pointer][0] == self.DIFF_INSERT
724 726 ):
725 727 deletion = diffs[pointer - 1][1]
726 728 insertion = diffs[pointer][1]
727 729 overlap_length1 = self.diff_commonOverlap(deletion, insertion)
728 730 overlap_length2 = self.diff_commonOverlap(insertion, deletion)
729 731 if overlap_length1 >= overlap_length2:
730 732 if (
731 733 overlap_length1 >= len(deletion) / 2.0
732 734 or overlap_length1 >= len(insertion) / 2.0
733 735 ):
734 736 # Overlap found. Insert an equality and trim the surrounding edits.
735 737 diffs.insert(
736 738 pointer, (self.DIFF_EQUAL, insertion[:overlap_length1])
737 739 )
738 740 diffs[pointer - 1] = (
739 741 self.DIFF_DELETE,
740 742 deletion[: len(deletion) - overlap_length1],
741 743 )
742 744 diffs[pointer + 1] = (
743 745 self.DIFF_INSERT,
744 746 insertion[overlap_length1:],
745 747 )
746 748 pointer += 1
747 749 else:
748 750 if (
749 751 overlap_length2 >= len(deletion) / 2.0
750 752 or overlap_length2 >= len(insertion) / 2.0
751 753 ):
752 754 # Reverse overlap found.
753 755 # Insert an equality and swap and trim the surrounding edits.
754 756 diffs.insert(
755 757 pointer, (self.DIFF_EQUAL, deletion[:overlap_length2])
756 758 )
757 759 diffs[pointer - 1] = (
758 760 self.DIFF_INSERT,
759 761 insertion[: len(insertion) - overlap_length2],
760 762 )
761 763 diffs[pointer + 1] = (
762 764 self.DIFF_DELETE,
763 765 deletion[overlap_length2:],
764 766 )
765 767 pointer += 1
766 768 pointer += 1
767 769 pointer += 1
768 770
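
Two sketches of what this pass does, the second matching the overlap comment above (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    # A trivial equality "b" is swallowed by the edits around it.
    diffs = [(dmp.DIFF_DELETE, "a"), (dmp.DIFF_EQUAL, "b"), (dmp.DIFF_DELETE, "c")]
    dmp.diff_cleanupSemantic(diffs)
    assert diffs == [(dmp.DIFF_DELETE, "abc"), (dmp.DIFF_INSERT, "b")]
    # <del>abcxxx</del><ins>xxxdef</ins> -> <del>abc</del>xxx<ins>def</ins>
    diffs = [(dmp.DIFF_DELETE, "abcxxx"), (dmp.DIFF_INSERT, "xxxdef")]
    dmp.diff_cleanupSemantic(diffs)
    assert diffs == [(dmp.DIFF_DELETE, "abc"), (dmp.DIFF_EQUAL, "xxx"),
                     (dmp.DIFF_INSERT, "def")]
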
769 771 def diff_cleanupSemanticLossless(self, diffs):
770 772 """Look for single edits surrounded on both sides by equalities
771 773 which can be shifted sideways to align the edit to a word boundary.
772 774 e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
773 775
774 776 Args:
775 777 diffs: Array of diff tuples.
776 778 """
777 779
778 780 def diff_cleanupSemanticScore(one, two):
779 781 """Given two strings, compute a score representing whether the
780 782 internal boundary falls on logical boundaries.
781 783 Scores range from 6 (best) to 0 (worst).
782 784 Closure, but does not reference any external variables.
783 785
784 786 Args:
785 787 one: First string.
786 788 two: Second string.
787 789
788 790 Returns:
789 791 The score.
790 792 """
791 793 if not one or not two:
792 794 # Edges are the best.
793 795 return 6
794 796
795 797 # Each port of this function behaves slightly differently due to
796 798 # subtle differences in each language's definition of things like
797 799 # 'whitespace'. Since this function's purpose is largely cosmetic,
798 800 # the choice has been made to use each language's native features
799 801 # rather than force total conformity.
800 802 char1 = one[-1]
801 803 char2 = two[0]
802 804 nonAlphaNumeric1 = not char1.isalnum()
803 805 nonAlphaNumeric2 = not char2.isalnum()
804 806 whitespace1 = nonAlphaNumeric1 and char1.isspace()
805 807 whitespace2 = nonAlphaNumeric2 and char2.isspace()
806 808 lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n")
807 809 lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n")
808 810 blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one)
809 811 blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two)
810 812
811 813 if blankLine1 or blankLine2:
812 814 # Five points for blank lines.
813 815 return 5
814 816 elif lineBreak1 or lineBreak2:
815 817 # Four points for line breaks.
816 818 return 4
817 819 elif nonAlphaNumeric1 and not whitespace1 and whitespace2:
818 820 # Three points for end of sentences.
819 821 return 3
820 822 elif whitespace1 or whitespace2:
821 823 # Two points for whitespace.
822 824 return 2
823 825 elif nonAlphaNumeric1 or nonAlphaNumeric2:
824 826 # One point for non-alphanumeric.
825 827 return 1
826 828 return 0
827 829
828 830 pointer = 1
829 831 # Intentionally ignore the first and last element (don't need checking).
830 832 while pointer < len(diffs) - 1:
831 833 if (
832 834 diffs[pointer - 1][0] == self.DIFF_EQUAL
833 835 and diffs[pointer + 1][0] == self.DIFF_EQUAL
834 836 ):
835 837 # This is a single edit surrounded by equalities.
836 838 equality1 = diffs[pointer - 1][1]
837 839 edit = diffs[pointer][1]
838 840 equality2 = diffs[pointer + 1][1]
839 841
840 842 # First, shift the edit as far left as possible.
841 843 commonOffset = self.diff_commonSuffix(equality1, edit)
842 844 if commonOffset:
843 845 commonString = edit[-commonOffset:]
844 846 equality1 = equality1[:-commonOffset]
845 847 edit = commonString + edit[:-commonOffset]
846 848 equality2 = commonString + equality2
847 849
848 850 # Second, step character by character right, looking for the best fit.
849 851 bestEquality1 = equality1
850 852 bestEdit = edit
851 853 bestEquality2 = equality2
852 854 bestScore = diff_cleanupSemanticScore(
853 855 equality1, edit
854 856 ) + diff_cleanupSemanticScore(edit, equality2)
855 857 while edit and equality2 and edit[0] == equality2[0]:
856 858 equality1 += edit[0]
857 859 edit = edit[1:] + equality2[0]
858 860 equality2 = equality2[1:]
859 861 score = diff_cleanupSemanticScore(
860 862 equality1, edit
861 863 ) + diff_cleanupSemanticScore(edit, equality2)
862 864 # The >= encourages trailing rather than leading whitespace on edits.
863 865 if score >= bestScore:
864 866 bestScore = score
865 867 bestEquality1 = equality1
866 868 bestEdit = edit
867 869 bestEquality2 = equality2
868 870
869 871 if diffs[pointer - 1][1] != bestEquality1:
870 872 # We have an improvement, save it back to the diff.
871 873 if bestEquality1:
872 874 diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
873 875 else:
874 876 del diffs[pointer - 1]
875 877 pointer -= 1
876 878 diffs[pointer] = (diffs[pointer][0], bestEdit)
877 879 if bestEquality2:
878 880 diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
879 881 else:
880 882 del diffs[pointer + 1]
881 883 pointer -= 1
882 884 pointer += 1
883 885
884 886 # Define some regex patterns for matching boundaries.
885 887 BLANKLINEEND = re.compile(r"\n\r?\n$")
886 888 BLANKLINESTART = re.compile(r"^\r?\n\r?\n")
887 889
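
The docstring's own example, run end to end (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    # "The c<ins>at c</ins>ame." -> "The <ins>cat </ins>came."
    diffs = [(dmp.DIFF_EQUAL, "The c"), (dmp.DIFF_INSERT, "at c"),
             (dmp.DIFF_EQUAL, "ame.")]
    dmp.diff_cleanupSemanticLossless(diffs)
    assert diffs == [(dmp.DIFF_EQUAL, "The "), (dmp.DIFF_INSERT, "cat "),
                     (dmp.DIFF_EQUAL, "came.")]
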
888 890 def diff_cleanupEfficiency(self, diffs):
889 891 """Reduce the number of edits by eliminating operationally trivial
890 892 equalities.
891 893
892 894 Args:
893 895 diffs: Array of diff tuples.
894 896 """
895 897 changes = False
896 898 equalities = [] # Stack of indices where equalities are found.
897 899 lastequality = None # Always equal to diffs[equalities[-1]][1]
898 900 pointer = 0 # Index of current position.
899 901 pre_ins = False # Is there an insertion operation before the last equality.
900 902 pre_del = False # Is there a deletion operation before the last equality.
901 903 post_ins = False # Is there an insertion operation after the last equality.
902 904 post_del = False # Is there a deletion operation after the last equality.
903 905 while pointer < len(diffs):
904 906 if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
905 907 if len(diffs[pointer][1]) < self.Diff_EditCost and (
906 908 post_ins or post_del
907 909 ):
908 910 # Candidate found.
909 911 equalities.append(pointer)
910 912 pre_ins = post_ins
911 913 pre_del = post_del
912 914 lastequality = diffs[pointer][1]
913 915 else:
914 916 # Not a candidate, and can never become one.
915 917 equalities = []
916 918 lastequality = None
917 919
918 920 post_ins = post_del = False
919 921 else: # An insertion or deletion.
920 922 if diffs[pointer][0] == self.DIFF_DELETE:
921 923 post_del = True
922 924 else:
923 925 post_ins = True
924 926
925 927 # Five types to be split:
926 928 # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
927 929 # <ins>A</ins>X<ins>C</ins><del>D</del>
928 930 # <ins>A</ins><del>B</del>X<ins>C</ins>
929 931 # <ins>A</ins>X<ins>C</ins><del>D</del>
930 932 # <ins>A</ins><del>B</del>X<del>C</del>
931 933
932 934 if lastequality and (
933 935 (pre_ins and pre_del and post_ins and post_del)
934 936 or (
935 937 (len(lastequality) < self.Diff_EditCost / 2)
936 938 and (pre_ins + pre_del + post_ins + post_del) == 3
937 939 )
938 940 ):
939 941 # Duplicate record.
940 942 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
941 943 # Change second copy to insert.
942 944 diffs[equalities[-1] + 1] = (
943 945 self.DIFF_INSERT,
944 946 diffs[equalities[-1] + 1][1],
945 947 )
946 948 equalities.pop() # Throw away the equality we just deleted.
947 949 lastequality = None
948 950 if pre_ins and pre_del:
949 951 # No changes made which could affect previous entry, keep going.
950 952 post_ins = post_del = True
951 953 equalities = []
952 954 else:
953 955 if len(equalities):
954 956 equalities.pop() # Throw away the previous equality.
955 957 if len(equalities):
956 958 pointer = equalities[-1]
957 959 else:
958 960 pointer = -1
959 961 post_ins = post_del = False
960 962 changes = True
961 963 pointer += 1
962 964
963 965 if changes:
964 966 self.diff_cleanupMerge(diffs)
965 967
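
With the default Diff_EditCost of 4, a three-character equality flanked by edits on all four sides is cheaper to fold away (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()  # Diff_EditCost defaults to 4
    diffs = [(dmp.DIFF_DELETE, "ab"), (dmp.DIFF_INSERT, "12"),
             (dmp.DIFF_EQUAL, "xyz"), (dmp.DIFF_DELETE, "cd"),
             (dmp.DIFF_INSERT, "34")]
    dmp.diff_cleanupEfficiency(diffs)
    assert diffs == [(dmp.DIFF_DELETE, "abxyzcd"), (dmp.DIFF_INSERT, "12xyz34")]
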
966 968 def diff_cleanupMerge(self, diffs):
967 969 """Reorder and merge like edit sections. Merge equalities.
968 970 Any edit section can move as long as it doesn't cross an equality.
969 971
970 972 Args:
971 973 diffs: Array of diff tuples.
972 974 """
973 975 diffs.append((self.DIFF_EQUAL, "")) # Add a dummy entry at the end.
974 976 pointer = 0
975 977 count_delete = 0
976 978 count_insert = 0
977 979 text_delete = ""
978 980 text_insert = ""
979 981 while pointer < len(diffs):
980 982 if diffs[pointer][0] == self.DIFF_INSERT:
981 983 count_insert += 1
982 984 text_insert += diffs[pointer][1]
983 985 pointer += 1
984 986 elif diffs[pointer][0] == self.DIFF_DELETE:
985 987 count_delete += 1
986 988 text_delete += diffs[pointer][1]
987 989 pointer += 1
988 990 elif diffs[pointer][0] == self.DIFF_EQUAL:
989 991 # Upon reaching an equality, check for prior redundancies.
990 992 if count_delete + count_insert > 1:
991 993 if count_delete != 0 and count_insert != 0:
992 994 # Factor out any common prefixes.
993 995 commonlength = self.diff_commonPrefix(text_insert, text_delete)
994 996 if commonlength != 0:
995 997 x = pointer - count_delete - count_insert - 1
996 998 if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
997 999 diffs[x] = (
998 1000 diffs[x][0],
999 1001 diffs[x][1] + text_insert[:commonlength],
1000 1002 )
1001 1003 else:
1002 1004 diffs.insert(
1003 1005 0, (self.DIFF_EQUAL, text_insert[:commonlength])
1004 1006 )
1005 1007 pointer += 1
1006 1008 text_insert = text_insert[commonlength:]
1007 1009 text_delete = text_delete[commonlength:]
1008 1010 # Factor out any common suffixes.
1009 1011 commonlength = self.diff_commonSuffix(text_insert, text_delete)
1010 1012 if commonlength != 0:
1011 1013 diffs[pointer] = (
1012 1014 diffs[pointer][0],
1013 1015 text_insert[-commonlength:] + diffs[pointer][1],
1014 1016 )
1015 1017 text_insert = text_insert[:-commonlength]
1016 1018 text_delete = text_delete[:-commonlength]
1017 1019 # Delete the offending records and add the merged ones.
1018 1020 if count_delete == 0:
1019 1021 diffs[pointer - count_insert : pointer] = [
1020 1022 (self.DIFF_INSERT, text_insert)
1021 1023 ]
1022 1024 elif count_insert == 0:
1023 1025 diffs[pointer - count_delete : pointer] = [
1024 1026 (self.DIFF_DELETE, text_delete)
1025 1027 ]
1026 1028 else:
1027 1029 diffs[pointer - count_delete - count_insert : pointer] = [
1028 1030 (self.DIFF_DELETE, text_delete),
1029 1031 (self.DIFF_INSERT, text_insert),
1030 1032 ]
1031 1033 pointer = pointer - count_delete - count_insert + 1
1032 1034 if count_delete != 0:
1033 1035 pointer += 1
1034 1036 if count_insert != 0:
1035 1037 pointer += 1
1036 1038 elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
1037 1039 # Merge this equality with the previous one.
1038 1040 diffs[pointer - 1] = (
1039 1041 diffs[pointer - 1][0],
1040 1042 diffs[pointer - 1][1] + diffs[pointer][1],
1041 1043 )
1042 1044 del diffs[pointer]
1043 1045 else:
1044 1046 pointer += 1
1045 1047
1046 1048 count_insert = 0
1047 1049 count_delete = 0
1048 1050 text_delete = ""
1049 1051 text_insert = ""
1050 1052
1051 1053 if diffs[-1][1] == "":
1052 1054 diffs.pop() # Remove the dummy entry at the end.
1053 1055
1054 1056 # Second pass: look for single edits surrounded on both sides by equalities
1055 1057 # which can be shifted sideways to eliminate an equality.
1056 1058 # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
1057 1059 changes = False
1058 1060 pointer = 1
1059 1061 # Intentionally ignore the first and last element (don't need checking).
1060 1062 while pointer < len(diffs) - 1:
1061 1063 if (
1062 1064 diffs[pointer - 1][0] == self.DIFF_EQUAL
1063 1065 and diffs[pointer + 1][0] == self.DIFF_EQUAL
1064 1066 ):
1065 1067 # This is a single edit surrounded by equalities.
1066 1068 if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
1067 1069 # Shift the edit over the previous equality.
1068 1070 diffs[pointer] = (
1069 1071 diffs[pointer][0],
1070 1072 diffs[pointer - 1][1]
1071 1073 + diffs[pointer][1][: -len(diffs[pointer - 1][1])],
1072 1074 )
1073 1075 diffs[pointer + 1] = (
1074 1076 diffs[pointer + 1][0],
1075 1077 diffs[pointer - 1][1] + diffs[pointer + 1][1],
1076 1078 )
1077 1079 del diffs[pointer - 1]
1078 1080 changes = True
1079 1081 elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
1080 1082 # Shift the edit over the next equality.
1081 1083 diffs[pointer - 1] = (
1082 1084 diffs[pointer - 1][0],
1083 1085 diffs[pointer - 1][1] + diffs[pointer + 1][1],
1084 1086 )
1085 1087 diffs[pointer] = (
1086 1088 diffs[pointer][0],
1087 1089 diffs[pointer][1][len(diffs[pointer + 1][1]) :]
1088 1090 + diffs[pointer + 1][1],
1089 1091 )
1090 1092 del diffs[pointer + 1]
1091 1093 changes = True
1092 1094 pointer += 1
1093 1095
1094 1096 # If shifts were made, the diff needs reordering and another shift sweep.
1095 1097 if changes:
1096 1098 self.diff_cleanupMerge(diffs)
1097 1099
1098 1100 def diff_xIndex(self, diffs, loc):
1099 1101 """loc is a location in text1, compute and return the equivalent location
1100 1102 in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
1101 1103
1102 1104 Args:
1103 1105 diffs: Array of diff tuples.
1104 1106 loc: Location within text1.
1105 1107
1106 1108 Returns:
1107 1109 Location within text2.
1108 1110 """
1109 1111 chars1 = 0
1110 1112 chars2 = 0
1111 1113 last_chars1 = 0
1112 1114 last_chars2 = 0
1113 1115 for x in range(len(diffs)):
1114 1116 (op, text) = diffs[x]
1115 1117 if op != self.DIFF_INSERT: # Equality or deletion.
1116 1118 chars1 += len(text)
1117 1119 if op != self.DIFF_DELETE: # Equality or insertion.
1118 1120 chars2 += len(text)
1119 1121 if chars1 > loc: # Overshot the location.
1120 1122 break
1121 1123 last_chars1 = chars1
1122 1124 last_chars2 = chars2
1123 1125
1124 1126 if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
1125 1127 # The location was deleted.
1126 1128 return last_chars2
1127 1129 # Add the remaining character length.
1128 1130 return last_chars2 + (loc - last_chars1)
1129 1131
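
Two quick traces of the index translation (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    # An index inside an equality shifts by the preceding insertion.
    diffs = [(dmp.DIFF_DELETE, "a"), (dmp.DIFF_INSERT, "1234"),
             (dmp.DIFF_EQUAL, "xyz")]
    assert dmp.diff_xIndex(diffs, 2) == 5
    # An index inside a deletion collapses to the deletion point.
    diffs = [(dmp.DIFF_EQUAL, "a"), (dmp.DIFF_DELETE, "1234"),
             (dmp.DIFF_EQUAL, "xyz")]
    assert dmp.diff_xIndex(diffs, 3) == 1
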
1130 1132 def diff_prettyHtml(self, diffs):
1131 1133 """Convert a diff array into a pretty HTML report.
1132 1134
1133 1135 Args:
1134 1136 diffs: Array of diff tuples.
1135 1137
1136 1138 Returns:
1137 1139 HTML representation.
1138 1140 """
1139 1141 html = []
1140 1142 for op, data in diffs:
1141 1143 text = (
1142 1144 data.replace("&", "&amp;")
1143 1145 .replace("<", "&lt;")
1144 1146 .replace(">", "&gt;")
1145 1147 .replace("\n", "&para;<br>")
1146 1148 )
1147 1149 if op == self.DIFF_INSERT:
1148 1150 html.append('<ins style="background:#e6ffe6;">%s</ins>' % text)
1149 1151 elif op == self.DIFF_DELETE:
1150 1152 html.append('<del style="background:#ffe6e6;">%s</del>' % text)
1151 1153 elif op == self.DIFF_EQUAL:
1152 1154 html.append("<span>%s</span>" % text)
1153 1155 return "".join(html)
1154 1156
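
For example, with the escaping rules above (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    html = dmp.diff_prettyHtml([(dmp.DIFF_EQUAL, "a\n"),
                                (dmp.DIFF_DELETE, "<B>b</B>"),
                                (dmp.DIFF_INSERT, "c&d")])
    # Produces, concatenated:
    #   '<span>a&para;<br></span>'
    #   '<del style="background:#ffe6e6;">&lt;B&gt;b&lt;/B&gt;</del>'
    #   '<ins style="background:#e6ffe6;">c&amp;d</ins>'
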
1155 1157 def diff_text1(self, diffs):
1156 1158 """Compute and return the source text (all equalities and deletions).
1157 1159
1158 1160 Args:
1159 1161 diffs: Array of diff tuples.
1160 1162
1161 1163 Returns:
1162 1164 Source text.
1163 1165 """
1164 1166 text = []
1165 1167 for op, data in diffs:
1166 1168 if op != self.DIFF_INSERT:
1167 1169 text.append(data)
1168 1170 return "".join(text)
1169 1171
1170 1172 def diff_text2(self, diffs):
1171 1173 """Compute and return the destination text (all equalities and insertions).
1172 1174
1173 1175 Args:
1174 1176 diffs: Array of diff tuples.
1175 1177
1176 1178 Returns:
1177 1179 Destination text.
1178 1180 """
1179 1181 text = []
1180 1182 for op, data in diffs:
1181 1183 if op != self.DIFF_DELETE:
1182 1184 text.append(data)
1183 1185 return "".join(text)
1184 1186
1185 1187 def diff_levenshtein(self, diffs):
1186 1188 """Compute the Levenshtein distance; the number of inserted, deleted or
1187 1189 substituted characters.
1188 1190
1189 1191 Args:
1190 1192 diffs: Array of diff tuples.
1191 1193
1192 1194 Returns:
1193 1195 Number of changes.
1194 1196 """
1195 1197 levenshtein = 0
1196 1198 insertions = 0
1197 1199 deletions = 0
1198 1200 for op, data in diffs:
1199 1201 if op == self.DIFF_INSERT:
1200 1202 insertions += len(data)
1201 1203 elif op == self.DIFF_DELETE:
1202 1204 deletions += len(data)
1203 1205 elif op == self.DIFF_EQUAL:
1204 1206 # A deletion and an insertion is one substitution.
1205 1207 levenshtein += max(insertions, deletions)
1206 1208 insertions = 0
1207 1209 deletions = 0
1208 1210 levenshtein += max(insertions, deletions)
1209 1211 return levenshtein
1210 1212
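
For example, a deletion plus an insertion between equalities counts as one block of substitutions (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    diffs = [(dmp.DIFF_DELETE, "abc"), (dmp.DIFF_INSERT, "1234"),
             (dmp.DIFF_EQUAL, "xyz")]
    assert dmp.diff_levenshtein(diffs) == 4  # max(3 deleted, 4 inserted)
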
1211 1213 def diff_toDelta(self, diffs):
1212 1214 """Crush the diff into an encoded string which describes the operations
1213 1215 required to transform text1 into text2.
1214 1216 E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1215 1217 Operations are tab-separated. Inserted text is escaped using %xx notation.
1216 1218
1217 1219 Args:
1218 1220 diffs: Array of diff tuples.
1219 1221
1220 1222 Returns:
1221 1223 Delta text.
1222 1224 """
1223 1225 text = []
1224 1226 for op, data in diffs:
1225 1227 if op == self.DIFF_INSERT:
1226 1228 # High ascii will raise UnicodeDecodeError. Use Unicode instead.
1227 1229 data = data.encode("utf-8")
1228 1230 text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
1229 1231 elif op == self.DIFF_DELETE:
1230 1232 text.append("-%d" % len(data))
1231 1233 elif op == self.DIFF_EQUAL:
1232 1234 text.append("=%d" % len(data))
1233 1235 return "\t".join(text)
1234 1236
1235 1237 def diff_fromDelta(self, text1, delta):
1236 1238 """Given the original text1, and an encoded string which describes the
1237 1239 operations required to transform text1 into text2, compute the full diff.
1238 1240
1239 1241 Args:
1240 1242 text1: Source string for the diff.
1241 1243 delta: Delta text.
1242 1244
1243 1245 Returns:
1244 1246 Array of diff tuples.
1245 1247
1246 1248 Raises:
1247 1249 ValueError: If invalid input.
1248 1250 """
1249 1251 if type(delta) == str:
1250 1252 # Deltas should be composed of a subset of ascii chars, Unicode not
1251 1253 # required. If this encode raises UnicodeEncodeError, delta is invalid.
1252 1254 delta.encode("ascii")  # Validate only; keep delta as str for the split below.
1253 1255 diffs = []
1254 1256 pointer = 0 # Cursor in text1
1255 1257 tokens = delta.split("\t")
1256 1258 for token in tokens:
1257 1259 if token == "":
1258 1260 # Blank tokens are ok (from a trailing \t).
1259 1261 continue
1260 1262 # Each token begins with a one character parameter which specifies the
1261 1263 # operation of this token (delete, insert, equality).
1262 1264 param = token[1:]
1263 1265 if token[0] == "+":
1264 1266 param = urllib.parse.unquote(param)
1265 1267 diffs.append((self.DIFF_INSERT, param))
1266 1268 elif token[0] == "-" or token[0] == "=":
1267 1269 try:
1268 1270 n = int(param)
1269 1271 except ValueError:
1270 1272 raise ValueError("Invalid number in diff_fromDelta: " + param)
1271 1273 if n < 0:
1272 1274 raise ValueError("Negative number in diff_fromDelta: " + param)
1273 1275 text = text1[pointer : pointer + n]
1274 1276 pointer += n
1275 1277 if token[0] == "=":
1276 1278 diffs.append((self.DIFF_EQUAL, text))
1277 1279 else:
1278 1280 diffs.append((self.DIFF_DELETE, text))
1279 1281 else:
1280 1282 # Anything else is an error.
1281 1283 raise ValueError(
1282 1284 "Invalid diff operation in diff_fromDelta: " + token[0]
1283 1285 )
1284 1286 if pointer != len(text1):
1285 1287 raise ValueError(
1286 1288 "Delta length (%d) does not equal source text length (%d)."
1287 1289 % (pointer, len(text1))
1288 1290 )
1289 1291 return diffs
1290 1292
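
A round trip through the delta encoding, also exercising diff_text1 (values match the upstream test suite; import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    diffs = [(dmp.DIFF_EQUAL, "jump"), (dmp.DIFF_DELETE, "s"),
             (dmp.DIFF_INSERT, "ed"), (dmp.DIFF_EQUAL, " over "),
             (dmp.DIFF_DELETE, "the"), (dmp.DIFF_INSERT, "a"),
             (dmp.DIFF_EQUAL, " lazy")]
    text1 = dmp.diff_text1(diffs)    # "jumps over the lazy"
    delta = dmp.diff_toDelta(diffs)  # "=4\t-1\t+ed\t=6\t-3\t+a\t=5"
    assert dmp.diff_fromDelta(text1, delta) == diffs
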
1291 1293 # MATCH FUNCTIONS
1292 1294
1293 1295 def match_main(self, text, pattern, loc):
1294 1296 """Locate the best instance of 'pattern' in 'text' near 'loc'.
1295 1297
1296 1298 Args:
1297 1299 text: The text to search.
1298 1300 pattern: The pattern to search for.
1299 1301 loc: The location to search around.
1300 1302
1301 1303 Returns:
1302 1304 Best match index or -1.
1303 1305 """
1304 1306 # Check for null inputs.
1305 1307 if text is None or pattern is None:
1306 1308 raise ValueError("Null inputs. (match_main)")
1307 1309
1308 1310 loc = max(0, min(loc, len(text)))
1309 1311 if text == pattern:
1310 1312 # Shortcut (potentially not guaranteed by the algorithm)
1311 1313 return 0
1312 1314 elif not text:
1313 1315 # Nothing to match.
1314 1316 return -1
1315 1317 elif text[loc : loc + len(pattern)] == pattern:
1316 1318 # Perfect match at the perfect spot! (Includes case of null pattern)
1317 1319 return loc
1318 1320 else:
1319 1321 # Do a fuzzy compare.
1320 1322 match = self.match_bitap(text, pattern, loc)
1321 1323 return match
1322 1324
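
The three shortcut branches above can be exercised directly (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    assert dmp.match_main("abcdef", "abcdef", 1000) == 0  # equality shortcut
    assert dmp.match_main("abcdef", "de", 3) == 3         # exact match at loc
    assert dmp.match_main("", "abcdef", 1) == -1          # nothing to match
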
1323 1325 def match_bitap(self, text, pattern, loc):
1324 1326 """Locate the best instance of 'pattern' in 'text' near 'loc' using the
1325 1327 Bitap algorithm.
1326 1328
1327 1329 Args:
1328 1330 text: The text to search.
1329 1331 pattern: The pattern to search for.
1330 1332 loc: The location to search around.
1331 1333
1332 1334 Returns:
1333 1335 Best match index or -1.
1334 1336 """
1335 1337 # Python doesn't have a maxint limit, so ignore this check.
1336 1338 # if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1337 1339 # raise ValueError("Pattern too long for this application.")
1338 1340
1339 1341 # Initialise the alphabet.
1340 1342 s = self.match_alphabet(pattern)
1341 1343
1342 1344 def match_bitapScore(e, x):
1343 1345 """Compute and return the score for a match with e errors and x location.
1344 1346 Accesses loc and pattern through being a closure.
1345 1347
1346 1348 Args:
1347 1349 e: Number of errors in match.
1348 1350 x: Location of match.
1349 1351
1350 1352 Returns:
1351 1353 Overall score for match (0.0 = good, 1.0 = bad).
1352 1354 """
1353 1355 accuracy = float(e) / len(pattern)
1354 1356 proximity = abs(loc - x)
1355 1357 if not self.Match_Distance:
1356 1358 # Dodge divide by zero error.
1357 1359 return proximity and 1.0 or accuracy
1358 1360 return accuracy + (proximity / float(self.Match_Distance))
1359 1361
1360 1362 # Highest score beyond which we give up.
1361 1363 score_threshold = self.Match_Threshold
1362 1364 # Is there a nearby exact match? (speedup)
1363 1365 best_loc = text.find(pattern, loc)
1364 1366 if best_loc != -1:
1365 1367 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1366 1368 # What about in the other direction? (speedup)
1367 1369 best_loc = text.rfind(pattern, loc + len(pattern))
1368 1370 if best_loc != -1:
1369 1371 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1370 1372
1371 1373 # Initialise the bit arrays.
1372 1374 matchmask = 1 << (len(pattern) - 1)
1373 1375 best_loc = -1
1374 1376
1375 1377 bin_max = len(pattern) + len(text)
1376 1378 # Empty initialization added to appease pychecker.
1377 1379 last_rd = None
1378 1380 for d in range(len(pattern)):
1379 1381 # Scan for the best match each iteration allows for one more error.
1380 1382 # Run a binary search to determine how far from 'loc' we can stray at
1381 1383 # this error level.
1382 1384 bin_min = 0
1383 1385 bin_mid = bin_max
1384 1386 while bin_min < bin_mid:
1385 1387 if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1386 1388 bin_min = bin_mid
1387 1389 else:
1388 1390 bin_max = bin_mid
1389 1391 bin_mid = (bin_max - bin_min) // 2 + bin_min
1390 1392
1391 1393 # Use the result from this iteration as the maximum for the next.
1392 1394 bin_max = bin_mid
1393 1395 start = max(1, loc - bin_mid + 1)
1394 1396 finish = min(loc + bin_mid, len(text)) + len(pattern)
1395 1397
1396 1398 rd = [0] * (finish + 2)
1397 1399 rd[finish + 1] = (1 << d) - 1
1398 1400 for j in range(finish, start - 1, -1):
1399 1401 if len(text) <= j - 1:
1400 1402 # Out of range.
1401 1403 charMatch = 0
1402 1404 else:
1403 1405 charMatch = s.get(text[j - 1], 0)
1404 1406 if d == 0: # First pass: exact match.
1405 1407 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1406 1408 else: # Subsequent passes: fuzzy match.
1407 1409 rd[j] = (
1408 1410 (((rd[j + 1] << 1) | 1) & charMatch)
1409 1411 | (((last_rd[j + 1] | last_rd[j]) << 1) | 1)
1410 1412 | last_rd[j + 1]
1411 1413 )
1412 1414 if rd[j] & matchmask:
1413 1415 score = match_bitapScore(d, j - 1)
1414 1416 # This match will almost certainly be better than any existing match.
1415 1417 # But check anyway.
1416 1418 if score <= score_threshold:
1417 1419 # Told you so.
1418 1420 score_threshold = score
1419 1421 best_loc = j - 1
1420 1422 if best_loc > loc:
1421 1423 # When passing loc, don't exceed our current distance from loc.
1422 1424 start = max(1, 2 * loc - best_loc)
1423 1425 else:
1424 1426 # Already passed loc, downhill from here on in.
1425 1427 break
1426 1428 # No hope for a (better) match at greater error levels.
1427 1429 if match_bitapScore(d + 1, loc) > score_threshold:
1428 1430 break
1429 1431 last_rd = rd
1430 1432 return best_loc
1431 1433
1432 1434 def match_alphabet(self, pattern):
1433 1435 """Initialise the alphabet for the Bitap algorithm.
1434 1436
1435 1437 Args:
1436 1438 pattern: The text to encode.
1437 1439
1438 1440 Returns:
1439 1441 Hash of character locations.
1440 1442 """
1441 1443 s = {}
1442 1444 for char in pattern:
1443 1445 s[char] = 0
1444 1446 for i in range(len(pattern)):
1445 1447 s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1446 1448 return s
1447 1449
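
Each pattern character maps to a bitmask of its positions, most significant bit first (import path assumed as before):

    from diff_match_patch import diff_match_patch  # assumed import path

    dmp = diff_match_patch()
    assert dmp.match_alphabet("abc") == {"a": 4, "b": 2, "c": 1}
    assert dmp.match_alphabet("abcaba") == {"a": 37, "b": 18, "c": 8}
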
1448 1450 # PATCH FUNCTIONS
1449 1451
1450 1452 def patch_addContext(self, patch, text):
1451 1453 """Increase the context until it is unique,
1452 1454 but don't let the pattern expand beyond Match_MaxBits.
1453 1455
1454 1456 Args:
1455 1457 patch: The patch to grow.
1456 1458 text: Source text.
1457 1459 """
1458 1460 if len(text) == 0:
1459 1461 return
1460 1462 pattern = text[patch.start2 : patch.start2 + patch.length1]
1461 1463 padding = 0
1462 1464
1463 1465 # Look for the first and last matches of pattern in text. If two different
1464 1466 # matches are found, increase the pattern length.
1465 1467 while text.find(pattern) != text.rfind(pattern) and (
1466 1468 self.Match_MaxBits == 0
1467 1469 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - self.Patch_Margin
1468 1470 ):
1469 1471 padding += self.Patch_Margin
1470 1472 pattern = text[
1471 1473 max(0, patch.start2 - padding) : patch.start2 + patch.length1 + padding
1472 1474 ]
1473 1475 # Add one chunk for good luck.
1474 1476 padding += self.Patch_Margin
1475 1477
1476 1478 # Add the prefix.
1477 1479 prefix = text[max(0, patch.start2 - padding) : patch.start2]
1478 1480 if prefix:
1479 1481 patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1480 1482 # Add the suffix.
1481 1483 suffix = text[
1482 1484 patch.start2 + patch.length1 : patch.start2 + patch.length1 + padding
1483 1485 ]
1484 1486 if suffix:
1485 1487 patch.diffs.append((self.DIFF_EQUAL, suffix))
1486 1488
1487 1489 # Roll back the start points.
1488 1490 patch.start1 -= len(prefix)
1489 1491 patch.start2 -= len(prefix)
1490 1492 # Extend lengths.
1491 1493 patch.length1 += len(prefix) + len(suffix)
1492 1494 patch.length2 += len(prefix) + len(suffix)
1493 1495
1494 1496 def patch_make(self, a, b=None, c=None):
1495 1497 """Compute a list of patches to turn text1 into text2.
1496 1498 Use diffs if provided, otherwise compute it ourselves.
1497 1499 There are four ways to call this function, depending on what data is
1498 1500 available to the caller:
1499 1501 Method 1:
1500 1502 a = text1, b = text2
1501 1503 Method 2:
1502 1504 a = diffs
1503 1505 Method 3 (optimal):
1504 1506 a = text1, b = diffs
1505 1507 Method 4 (deprecated, use method 3):
1506 1508 a = text1, b = text2, c = diffs
1507 1509
1508 1510 Args:
1509 1511 a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1510 1512 text2 (method 2).
1511 1513 b: text2 (methods 1,4) or Array of diff tuples for text1 to
1512 1514 text2 (method 3) or undefined (method 2).
1513 1515 c: Array of diff tuples for text1 to text2 (method 4) or
1514 1516 undefined (methods 1,2,3).
1515 1517
1516 1518 Returns:
1517 1519 Array of Patch objects.
1518 1520 """
1519 1521 text1 = None
1520 1522 diffs = None
1521 1523 # Note that texts may arrive as 'str' or 'unicode'.
1522 1524 if isinstance(a, str) and isinstance(b, str) and c is None:
1523 1525 # Method 1: text1, text2
1524 1526 # Compute diffs from text1 and text2.
1525 1527 text1 = a
1526 1528 diffs = self.diff_main(text1, b, True)
1527 1529 if len(diffs) > 2:
1528 1530 self.diff_cleanupSemantic(diffs)
1529 1531 self.diff_cleanupEfficiency(diffs)
1530 1532 elif isinstance(a, list) and b is None and c is None:
1531 1533 # Method 2: diffs
1532 1534 # Compute text1 from diffs.
1533 1535 diffs = a
1534 1536 text1 = self.diff_text1(diffs)
1535 1537 elif isinstance(a, str) and isinstance(b, list) and c is None:
1536 1538 # Method 3: text1, diffs
1537 1539 text1 = a
1538 1540 diffs = b
1539 1541 elif isinstance(a, str) and isinstance(b, str) and isinstance(c, list):
1540 1542 # Method 4: text1, text2, diffs
1541 1543 # text2 is not used.
1542 1544 text1 = a
1543 1545 diffs = c
1544 1546 else:
1545 1547 raise ValueError("Unknown call format to patch_make.")
1546 1548
1547 1549 if not diffs:
1548 1550 return [] # Get rid of the None case.
1549 1551 patches = []
1550 1552 patch = patch_obj()
1551 1553 char_count1 = 0 # Number of characters into the text1 string.
1552 1554 char_count2 = 0 # Number of characters into the text2 string.
1553 1555 prepatch_text = text1 # Recreate the patches to determine context info.
1554 1556 postpatch_text = text1
1555 1557 for x in range(len(diffs)):
1556 1558 (diff_type, diff_text) = diffs[x]
1557 1559 if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1558 1560 # A new patch starts here.
1559 1561 patch.start1 = char_count1
1560 1562 patch.start2 = char_count2
1561 1563 if diff_type == self.DIFF_INSERT:
1562 1564 # Insertion
1563 1565 patch.diffs.append(diffs[x])
1564 1566 patch.length2 += len(diff_text)
1565 1567 postpatch_text = (
1566 1568 postpatch_text[:char_count2]
1567 1569 + diff_text
1568 1570 + postpatch_text[char_count2:]
1569 1571 )
1570 1572 elif diff_type == self.DIFF_DELETE:
1571 1573 # Deletion.
1572 1574 patch.length1 += len(diff_text)
1573 1575 patch.diffs.append(diffs[x])
1574 1576 postpatch_text = (
1575 1577 postpatch_text[:char_count2]
1576 1578 + postpatch_text[char_count2 + len(diff_text) :]
1577 1579 )
1578 1580 elif (
1579 1581 diff_type == self.DIFF_EQUAL
1580 1582 and len(diff_text) <= 2 * self.Patch_Margin
1581 1583 and len(patch.diffs) != 0
1582 1584 and len(diffs) != x + 1
1583 1585 ):
1584 1586 # Small equality inside a patch.
1585 1587 patch.diffs.append(diffs[x])
1586 1588 patch.length1 += len(diff_text)
1587 1589 patch.length2 += len(diff_text)
1588 1590
1589 1591 if diff_type == self.DIFF_EQUAL and len(diff_text) >= 2 * self.Patch_Margin:
1590 1592 # Time for a new patch.
1591 1593 if len(patch.diffs) != 0:
1592 1594 self.patch_addContext(patch, prepatch_text)
1593 1595 patches.append(patch)
1594 1596 patch = patch_obj()
1595 1597 # Unlike Unidiff, our patch lists have a rolling context.
1596 1598 # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
1597 1599 # Update prepatch text & pos to reflect the application of the
1598 1600 # just completed patch.
1599 1601 prepatch_text = postpatch_text
1600 1602 char_count1 = char_count2
1601 1603
1602 1604 # Update the current character count.
1603 1605 if diff_type != self.DIFF_INSERT:
1604 1606 char_count1 += len(diff_text)
1605 1607 if diff_type != self.DIFF_DELETE:
1606 1608 char_count2 += len(diff_text)
1607 1609
1608 1610 # Pick up the leftover patch if not empty.
1609 1611 if len(patch.diffs) != 0:
1610 1612 self.patch_addContext(patch, prepatch_text)
1611 1613 patches.append(patch)
1612 1614 return patches
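
As a quick orientation, a minimal usage sketch of the call formats above; the texts and variable names are illustrative, not part of the library:

    dmp = diff_match_patch()
    text1 = "The quick brown fox."
    text2 = "The quick red fox."
    # Method 1: compute everything from the two texts.
    patches = dmp.patch_make(text1, text2)
    # Method 3 (optimal): reuse diffs that were already computed.
    diffs = dmp.diff_main(text1, text2)
    dmp.diff_cleanupSemantic(diffs)
    patches = dmp.patch_make(text1, diffs)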
1613 1615
1614 1616 def patch_deepCopy(self, patches):
1615 1617 """Given an array of patches, return another array that is identical.
1616 1618
1617 1619 Args:
1618 1620 patches: Array of Patch objects.
1619 1621
1620 1622 Returns:
1621 1623 Array of Patch objects.
1622 1624 """
1623 1625 patchesCopy = []
1624 1626 for patch in patches:
1625 1627 patchCopy = patch_obj()
1626 1628 # No need to deep copy the tuples since they are immutable.
1627 1629 patchCopy.diffs = patch.diffs[:]
1628 1630 patchCopy.start1 = patch.start1
1629 1631 patchCopy.start2 = patch.start2
1630 1632 patchCopy.length1 = patch.length1
1631 1633 patchCopy.length2 = patch.length2
1632 1634 patchesCopy.append(patchCopy)
1633 1635 return patchesCopy
1634 1636
1635 1637 def patch_apply(self, patches, text):
1636 1638 """Merge a set of patches onto the text. Return a patched text, as well
1637 1639 as a list of true/false values indicating which patches were applied.
1638 1640
1639 1641 Args:
1640 1642 patches: Array of Patch objects.
1641 1643 text: Old text.
1642 1644
1643 1645 Returns:
1644 1646 Two element Array, containing the new text and an array of boolean values.
1645 1647 """
1646 1648 if not patches:
1647 1649 return (text, [])
1648 1650
1649 1651 # Deep copy the patches so that no changes are made to originals.
1650 1652 patches = self.patch_deepCopy(patches)
1651 1653
1652 1654 nullPadding = self.patch_addPadding(patches)
1653 1655 text = nullPadding + text + nullPadding
1654 1656 self.patch_splitMax(patches)
1655 1657
1656 1658 # delta keeps track of the offset between the expected and actual location
1657 1659 # of the previous patch. If there are patches expected at positions 10 and
1658 1660 # 20, but the first patch was found at 12, delta is 2 and the second patch
1659 1661 # has an effective expected position of 22.
1660 1662 delta = 0
1661 1663 results = []
1662 1664 for patch in patches:
1663 1665 expected_loc = patch.start2 + delta
1664 1666 text1 = self.diff_text1(patch.diffs)
1665 1667 end_loc = -1
1666 1668 if len(text1) > self.Match_MaxBits:
1667 1669 # patch_splitMax will only provide an oversized pattern in the case of
1668 1670 # a monster delete.
1669 1671 start_loc = self.match_main(
1670 1672 text, text1[: self.Match_MaxBits], expected_loc
1671 1673 )
1672 1674 if start_loc != -1:
1673 1675 end_loc = self.match_main(
1674 1676 text,
1675 1677 text1[-self.Match_MaxBits :],
1676 1678 expected_loc + len(text1) - self.Match_MaxBits,
1677 1679 )
1678 1680 if end_loc == -1 or start_loc >= end_loc:
1679 1681 # Can't find valid trailing context. Drop this patch.
1680 1682 start_loc = -1
1681 1683 else:
1682 1684 start_loc = self.match_main(text, text1, expected_loc)
1683 1685 if start_loc == -1:
1684 1686 # No match found. :(
1685 1687 results.append(False)
1686 1688 # Subtract the delta for this failed patch from subsequent patches.
1687 1689 delta -= patch.length2 - patch.length1
1688 1690 else:
1689 1691 # Found a match. :)
1690 1692 results.append(True)
1691 1693 delta = start_loc - expected_loc
1692 1694 if end_loc == -1:
1693 1695 text2 = text[start_loc : start_loc + len(text1)]
1694 1696 else:
1695 1697 text2 = text[start_loc : end_loc + self.Match_MaxBits]
1696 1698 if text1 == text2:
1697 1699 # Perfect match, just shove the replacement text in.
1698 1700 text = (
1699 1701 text[:start_loc]
1700 1702 + self.diff_text2(patch.diffs)
1701 1703 + text[start_loc + len(text1) :]
1702 1704 )
1703 1705 else:
1704 1706 # Imperfect match.
1705 1707 # Run a diff to get a framework of equivalent indices.
1706 1708 diffs = self.diff_main(text1, text2, False)
1707 1709 if (
1708 1710 len(text1) > self.Match_MaxBits
1709 1711 and self.diff_levenshtein(diffs) / float(len(text1))
1710 1712 > self.Patch_DeleteThreshold
1711 1713 ):
1712 1714 # The end points match, but the content is unacceptably bad.
1713 1715 results[-1] = False
1714 1716 else:
1715 1717 self.diff_cleanupSemanticLossless(diffs)
1716 1718 index1 = 0
1717 1719 for op, data in patch.diffs:
1718 1720 if op != self.DIFF_EQUAL:
1719 1721 index2 = self.diff_xIndex(diffs, index1)
1720 1722 if op == self.DIFF_INSERT: # Insertion
1721 1723 text = (
1722 1724 text[: start_loc + index2]
1723 1725 + data
1724 1726 + text[start_loc + index2 :]
1725 1727 )
1726 1728 elif op == self.DIFF_DELETE: # Deletion
1727 1729 text = (
1728 1730 text[: start_loc + index2]
1729 1731 + text[
1730 1732 start_loc
1731 1733 + self.diff_xIndex(diffs, index1 + len(data)) :
1732 1734 ]
1733 1735 )
1734 1736 if op != self.DIFF_DELETE:
1735 1737 index1 += len(data)
1736 1738 # Strip the padding off.
1737 1739 text = text[len(nullPadding) : -len(nullPadding)]
1738 1740 return (text, results)
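
A hedged sketch of how the two-element return value is typically consumed; the texts here are illustrative:

    dmp = diff_match_patch()
    patches = dmp.patch_make("The quick brown fox.", "The quick red fox.")
    # Apply against a slightly different base text; fuzzy matching absorbs it.
    new_text, results = dmp.patch_apply(patches, "The quick brown fox jumps.")
    # results holds one boolean per patch: True where the patch applied.
    for patch, applied in zip(patches, results):
        if not applied:
            print("patch could not be applied:", patch)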
1739 1741
1740 1742 def patch_addPadding(self, patches):
1741 1743 """Add some padding on text start and end so that edges can match
1742 1744 something. Intended to be called only from within patch_apply.
1743 1745
1744 1746 Args:
1745 1747 patches: Array of Patch objects.
1746 1748
1747 1749 Returns:
1748 1750 The padding string added to each side.
1749 1751 """
1750 1752 paddingLength = self.Patch_Margin
1751 1753 nullPadding = ""
1752 1754 for x in range(1, paddingLength + 1):
1753 1755 nullPadding += chr(x)
1754 1756
1755 1757 # Bump all the patches forward.
1756 1758 for patch in patches:
1757 1759 patch.start1 += paddingLength
1758 1760 patch.start2 += paddingLength
1759 1761
1760 1762 # Add some padding on start of first diff.
1761 1763 patch = patches[0]
1762 1764 diffs = patch.diffs
1763 1765 if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1764 1766 # Add nullPadding equality.
1765 1767 diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1766 1768 patch.start1 -= paddingLength # Should be 0.
1767 1769 patch.start2 -= paddingLength # Should be 0.
1768 1770 patch.length1 += paddingLength
1769 1771 patch.length2 += paddingLength
1770 1772 elif paddingLength > len(diffs[0][1]):
1771 1773 # Grow first equality.
1772 1774 extraLength = paddingLength - len(diffs[0][1])
1773 1775 newText = nullPadding[len(diffs[0][1]) :] + diffs[0][1]
1774 1776 diffs[0] = (diffs[0][0], newText)
1775 1777 patch.start1 -= extraLength
1776 1778 patch.start2 -= extraLength
1777 1779 patch.length1 += extraLength
1778 1780 patch.length2 += extraLength
1779 1781
1780 1782 # Add some padding on end of last diff.
1781 1783 patch = patches[-1]
1782 1784 diffs = patch.diffs
1783 1785 if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1784 1786 # Add nullPadding equality.
1785 1787 diffs.append((self.DIFF_EQUAL, nullPadding))
1786 1788 patch.length1 += paddingLength
1787 1789 patch.length2 += paddingLength
1788 1790 elif paddingLength > len(diffs[-1][1]):
1789 1791 # Grow last equality.
1790 1792 extraLength = paddingLength - len(diffs[-1][1])
1791 1793 newText = diffs[-1][1] + nullPadding[:extraLength]
1792 1794 diffs[-1] = (diffs[-1][0], newText)
1793 1795 patch.length1 += extraLength
1794 1796 patch.length2 += extraLength
1795 1797
1796 1798 return nullPadding
1797 1799
1798 1800 def patch_splitMax(self, patches):
1799 1801 """Look through the patches and break up any which are longer than the
1800 1802 maximum limit of the match algorithm.
1801 1803 Intended to be called only from within patch_apply.
1802 1804
1803 1805 Args:
1804 1806 patches: Array of Patch objects.
1805 1807 """
1806 1808 patch_size = self.Match_MaxBits
1807 1809 if patch_size == 0:
1808 1810 # Python has the option of not splitting strings due to its ability
1809 1811 # to handle integers of arbitrary precision.
1810 1812 return
1811 1813 for x in range(len(patches)):
1812 1814 if patches[x].length1 <= patch_size:
1813 1815 continue
1814 1816 bigpatch = patches[x]
1815 1817 # Remove the big old patch.
1816 1818 del patches[x]
1817 1819 x -= 1
1818 1820 start1 = bigpatch.start1
1819 1821 start2 = bigpatch.start2
1820 1822 precontext = ""
1821 1823 while len(bigpatch.diffs) != 0:
1822 1824 # Create one of several smaller patches.
1823 1825 patch = patch_obj()
1824 1826 empty = True
1825 1827 patch.start1 = start1 - len(precontext)
1826 1828 patch.start2 = start2 - len(precontext)
1827 1829 if precontext:
1828 1830 patch.length1 = patch.length2 = len(precontext)
1829 1831 patch.diffs.append((self.DIFF_EQUAL, precontext))
1830 1832
1831 1833 while (
1832 1834 len(bigpatch.diffs) != 0
1833 1835 and patch.length1 < patch_size - self.Patch_Margin
1834 1836 ):
1835 1837 (diff_type, diff_text) = bigpatch.diffs[0]
1836 1838 if diff_type == self.DIFF_INSERT:
1837 1839 # Insertions are harmless.
1838 1840 patch.length2 += len(diff_text)
1839 1841 start2 += len(diff_text)
1840 1842 patch.diffs.append(bigpatch.diffs.pop(0))
1841 1843 empty = False
1842 1844 elif (
1843 1845 diff_type == self.DIFF_DELETE
1844 1846 and len(patch.diffs) == 1
1845 1847 and patch.diffs[0][0] == self.DIFF_EQUAL
1846 1848 and len(diff_text) > 2 * patch_size
1847 1849 ):
1848 1850 # This is a large deletion. Let it pass in one chunk.
1849 1851 patch.length1 += len(diff_text)
1850 1852 start1 += len(diff_text)
1851 1853 empty = False
1852 1854 patch.diffs.append((diff_type, diff_text))
1853 1855 del bigpatch.diffs[0]
1854 1856 else:
1855 1857 # Deletion or equality. Only take as much as we can stomach.
1856 1858 diff_text = diff_text[
1857 1859 : patch_size - patch.length1 - self.Patch_Margin
1858 1860 ]
1859 1861 patch.length1 += len(diff_text)
1860 1862 start1 += len(diff_text)
1861 1863 if diff_type == self.DIFF_EQUAL:
1862 1864 patch.length2 += len(diff_text)
1863 1865 start2 += len(diff_text)
1864 1866 else:
1865 1867 empty = False
1866 1868
1867 1869 patch.diffs.append((diff_type, diff_text))
1868 1870 if diff_text == bigpatch.diffs[0][1]:
1869 1871 del bigpatch.diffs[0]
1870 1872 else:
1871 1873 bigpatch.diffs[0] = (
1872 1874 bigpatch.diffs[0][0],
1873 1875 bigpatch.diffs[0][1][len(diff_text) :],
1874 1876 )
1875 1877
1876 1878 # Compute the head context for the next patch.
1877 1879 precontext = self.diff_text2(patch.diffs)
1878 1880 precontext = precontext[-self.Patch_Margin :]
1879 1881 # Append the end context for this patch.
1880 1882 postcontext = self.diff_text1(bigpatch.diffs)[: self.Patch_Margin]
1881 1883 if postcontext:
1882 1884 patch.length1 += len(postcontext)
1883 1885 patch.length2 += len(postcontext)
1884 1886 if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1885 1887 patch.diffs[-1] = (
1886 1888 self.DIFF_EQUAL,
1887 1889 patch.diffs[-1][1] + postcontext,
1888 1890 )
1889 1891 else:
1890 1892 patch.diffs.append((self.DIFF_EQUAL, postcontext))
1891 1893
1892 1894 if not empty:
1893 1895 x += 1
1894 1896 patches.insert(x, patch)
1895 1897
1896 1898 def patch_toText(self, patches):
1897 1899 """Take a list of patches and return a textual representation.
1898 1900
1899 1901 Args:
1900 1902 patches: Array of Patch objects.
1901 1903
1902 1904 Returns:
1903 1905 Text representation of patches.
1904 1906 """
1905 1907 text = []
1906 1908 for patch in patches:
1907 1909 text.append(str(patch))
1908 1910 return "".join(text)
1909 1911
1910 1912 def patch_fromText(self, textline):
1911 1913 """Parse a textual representation of patches and return a list of patch
1912 1914 objects.
1913 1915
1914 1916 Args:
1915 1917 textline: Text representation of patches.
1916 1918
1917 1919 Returns:
1918 1920 Array of Patch objects.
1919 1921
1920 1922 Raises:
1921 1923 ValueError: If invalid input.
1922 1924 """
1923 1925 if isinstance(textline, str):
1924 1926 # Patches should be composed of a subset of ascii chars, Unicode not
1925 1927 # required. If this encode raises UnicodeEncodeError, patch is invalid.
1926 1928 textline.encode("ascii")  # validate only; keep textline a str for split("\n")
1927 1929 patches = []
1928 1930 if not textline:
1929 1931 return patches
1930 1932 text = textline.split("\n")
1931 1933 while len(text) != 0:
1932 m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1934 m = re.match(r"^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1933 1935 if not m:
1934 1936 raise ValueError("Invalid patch string: " + text[0])
1935 1937 patch = patch_obj()
1936 1938 patches.append(patch)
1937 1939 patch.start1 = int(m.group(1))
1938 1940 if m.group(2) == "":
1939 1941 patch.start1 -= 1
1940 1942 patch.length1 = 1
1941 1943 elif m.group(2) == "0":
1942 1944 patch.length1 = 0
1943 1945 else:
1944 1946 patch.start1 -= 1
1945 1947 patch.length1 = int(m.group(2))
1946 1948
1947 1949 patch.start2 = int(m.group(3))
1948 1950 if m.group(4) == "":
1949 1951 patch.start2 -= 1
1950 1952 patch.length2 = 1
1951 1953 elif m.group(4) == "0":
1952 1954 patch.length2 = 0
1953 1955 else:
1954 1956 patch.start2 -= 1
1955 1957 patch.length2 = int(m.group(4))
1956 1958
1957 1959 del text[0]
1958 1960
1959 1961 while len(text) != 0:
1960 1962 if text[0]:
1961 1963 sign = text[0][0]
1962 1964 else:
1963 1965 sign = ""
1964 1966 line = urllib.parse.unquote(text[0][1:])
1965 1967 # urllib.parse.unquote already returns a str decoded as UTF-8.
1966 1968 if sign == "+":
1967 1969 # Insertion.
1968 1970 patch.diffs.append((self.DIFF_INSERT, line))
1969 1971 elif sign == "-":
1970 1972 # Deletion.
1971 1973 patch.diffs.append((self.DIFF_DELETE, line))
1972 1974 elif sign == " ":
1973 1975 # Minor equality.
1974 1976 patch.diffs.append((self.DIFF_EQUAL, line))
1975 1977 elif sign == "@":
1976 1978 # Start of next patch.
1977 1979 break
1978 1980 elif sign == "":
1979 1981 # Blank line? Whatever.
1980 1982 pass
1981 1983 else:
1982 1984 # WTF?
1983 1985 raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
1984 1986 del text[0]
1985 1987 return patches
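
patch_toText and patch_fromText should round-trip each other; a small sketch with illustrative texts:

    dmp = diff_match_patch()
    patches = dmp.patch_make("foo bar baz", "foo quux baz")
    serialized = dmp.patch_toText(patches)   # GNU-diff-style text, %xx escaped
    restored = dmp.patch_fromText(serialized)
    assert dmp.patch_toText(restored) == serialized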
1986 1988
1987 1989
1988 1990 class patch_obj:
1989 1991 """Class representing one patch operation."""
1990 1992
1991 1993 def __init__(self):
1992 1994 """Initializes with an empty list of diffs."""
1993 1995 self.diffs = []
1994 1996 self.start1 = None
1995 1997 self.start2 = None
1996 1998 self.length1 = 0
1997 1999 self.length2 = 0
1998 2000
1999 2001 def __str__(self):
2000 2002 """Emmulate GNU diff's format.
2001 2003 Header: @@ -382,8 +481,9 @@
2002 2004 Indicies are printed as 1-based, not 0-based.
2003 2005
2004 2006 Returns:
2005 2007 The GNU diff string.
2006 2008 """
2007 2009 if self.length1 == 0:
2008 2010 coords1 = str(self.start1) + ",0"
2009 2011 elif self.length1 == 1:
2010 2012 coords1 = str(self.start1 + 1)
2011 2013 else:
2012 2014 coords1 = str(self.start1 + 1) + "," + str(self.length1)
2013 2015 if self.length2 == 0:
2014 2016 coords2 = str(self.start2) + ",0"
2015 2017 elif self.length2 == 1:
2016 2018 coords2 = str(self.start2 + 1)
2017 2019 else:
2018 2020 coords2 = str(self.start2 + 1) + "," + str(self.length2)
2019 2021 text = ["@@ -", coords1, " +", coords2, " @@\n"]
2020 2022 # Escape the body of the patch with %xx notation.
2021 2023 for op, data in self.diffs:
2022 2024 if op == diff_match_patch.DIFF_INSERT:
2023 2025 text.append("+")
2024 2026 elif op == diff_match_patch.DIFF_DELETE:
2025 2027 text.append("-")
2026 2028 elif op == diff_match_patch.DIFF_EQUAL:
2027 2029 text.append(" ")
2028 2030 # High ascii will raise UnicodeDecodeError. Use Unicode instead.
2029 2031 data = data.encode("utf-8")
2030 2032 text.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
2031 2033 return "".join(text)
@@ -1,1271 +1,1146 b''
1 1
2 2
3 3 # Copyright (C) 2011-2020 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21
22 22 """
23 23 Set of diffing helpers, previously part of vcs
24 24 """
25
25 import dataclasses
26 26 import os
27 27 import re
28 28 import bz2
29 29 import gzip
30 30 import time
31 31
32 import collections
33 32 import difflib
34 33 import logging
35 34 import pickle
36 35 from itertools import tee
37 36
38 37 from rhodecode.lib.vcs.exceptions import VCSError
39 38 from rhodecode.lib.vcs.nodes import FileNode, SubModuleNode
40 from rhodecode.lib.utils2 import safe_unicode, safe_str
39 from rhodecode.lib.vcs.backends import base
40 from rhodecode.lib.str_utils import safe_str
41 41
42 42 log = logging.getLogger(__name__)
43 43
44 44 # define max context, a file with more than this number of lines is unusable
45 45 # in browser anyway
46 46 MAX_CONTEXT = 20 * 1024
47 47 DEFAULT_CONTEXT = 3
48 48
49 49
50 50 def get_diff_context(request):
51 51 return MAX_CONTEXT if request.GET.get('fullcontext', '') == '1' else DEFAULT_CONTEXT
52 52
53 53
54 54 def get_diff_whitespace_flag(request):
55 55 return request.GET.get('ignorews', '') == '1'
56 56
57 57
58 class OPS(object):
59 ADD = 'A'
60 MOD = 'M'
61 DEL = 'D'
58 @dataclasses.dataclass
59 class OPS:
60 ADD: str = 'A'
61 MOD: str = 'M'
62 DEL: str = 'D'
63
64
65 @dataclasses.dataclass
66 class DiffLineNumber:
67 old: int | None
68 new: int | None
69
70 def __iter__(self):
71 yield self.old
72 yield self.new
62 73
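DiffLineNumber replaces the collections.namedtuple of the same name (see the removal near the end of this file); the __iter__ above preserves the tuple-style unpacking and membership tests that callers such as get_context_of_line rely on. A tiny sketch:

    line_no = DiffLineNumber(old=10, new=None)
    old_no, new_no = line_no        # unpacking works thanks to __iter__
    assert None in line_no          # membership checks keep working too
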
63 74
64 75 def get_gitdiff(filenode_old, filenode_new, ignore_whitespace=True, context=3):
65 76 """
66 77 Returns git style diff between given ``filenode_old`` and ``filenode_new``.
67 78
68 79 :param ignore_whitespace: ignore whitespaces in diff
69 80 """
70 81 # make sure we pass in default context
71 82 context = context or 3
72 83 # protect against IntOverflow when passing HUGE context
73 84 if context > MAX_CONTEXT:
74 85 context = MAX_CONTEXT
75 86
76 87 submodules = [o for o in [filenode_new, filenode_old] if isinstance(o, SubModuleNode)]
77 88 if submodules:
78 89 return ''
79 90
80 91 for filenode in (filenode_old, filenode_new):
81 92 if not isinstance(filenode, FileNode):
82 raise VCSError(
83 "Given object should be FileNode object, not %s"
84 % filenode.__class__)
93 raise VCSError(f"Given object should be FileNode object, not {filenode.__class__}")
85 94
86 95 repo = filenode_new.commit.repository
87 96 old_commit = filenode_old.commit or repo.EMPTY_COMMIT
88 97 new_commit = filenode_new.commit
89 98
90 99 vcs_gitdiff = repo.get_diff(
91 100 old_commit, new_commit, filenode_new.path,
92 101 ignore_whitespace, context, path1=filenode_old.path)
93 102 return vcs_gitdiff
94 103
95 104 NEW_FILENODE = 1
96 105 DEL_FILENODE = 2
97 106 MOD_FILENODE = 3
98 107 RENAMED_FILENODE = 4
99 108 COPIED_FILENODE = 5
100 109 CHMOD_FILENODE = 6
101 110 BIN_FILENODE = 7
102 111
103 112
104 113 class LimitedDiffContainer(object):
105 114
106 def __init__(self, diff_limit, cur_diff_size, diff):
115 def __init__(self, diff_limit: int, cur_diff_size, diff):
107 116 self.diff = diff
108 117 self.diff_limit = diff_limit
109 118 self.cur_diff_size = cur_diff_size
110 119
111 120 def __getitem__(self, key):
112 121 return self.diff.__getitem__(key)
113 122
114 123 def __iter__(self):
115 124 for l in self.diff:
116 125 yield l
117 126
118 127
119 128 class Action(object):
120 129 """
121 130 Contains constants for the action value of the lines in a parsed diff.
122 131 """
123 132
124 133 ADD = 'add'
125 134 DELETE = 'del'
126 135 UNMODIFIED = 'unmod'
127 136
128 137 CONTEXT = 'context'
129 138 OLD_NO_NL = 'old-no-nl'
130 139 NEW_NO_NL = 'new-no-nl'
131 140
132 141
133 142 class DiffProcessor(object):
134 143 """
135 Give it a unified or git diff and it returns a list of the files that were
144 Give it a unified or git diff, and it returns a list of the files that were
136 145 mentioned in the diff together with a dict of meta information that
137 can be used to render it in a HTML template.
146 can be used to render it in an HTML template.
138 147
139 148 .. note:: Unicode handling
140 149
141 150 The original diffs are a byte sequence and can contain filenames
142 151 in mixed encodings. This class generally returns `unicode` objects
143 152 since the result is intended for presentation to the user.
144 153
145 154 """
146 _chunk_re = re.compile(r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)')
147 _newline_marker = re.compile(r'^\\ No newline at end of file')
155 _chunk_re = re.compile(br'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)')
156 _newline_marker = re.compile(br'^\\ No newline at end of file')
148 157
149 158 # used for inline highlighter word split
150 _token_re = re.compile(r'()(&gt;|&lt;|&amp;|\W+?)')
159 _token_re = re.compile(br'()(&gt;|&lt;|&amp;|\W+?)')
151 160
152 161 # collapse ranges of commits over given number
153 162 _collapse_commits_over = 5
154 163
155 def __init__(self, diff, format='gitdiff', diff_limit=None,
156 file_limit=None, show_full_diff=True):
164 def __init__(self, diff: base.Diff, diff_format='gitdiff', diff_limit: int = 0,
165 file_limit: int = 0, show_full_diff=True):
157 166 """
158 167 :param diff: A `Diff` object representing a diff from a vcs backend
159 :param format: format of diff passed, `udiff` or `gitdiff`
168 :param diff_format: format of diff passed, `udiff` or `gitdiff`
160 169 :param diff_limit: define the size of diff that is considered "big"
161 170 based on that parameter cut off will be triggered, set to None
162 171 to show full diff
163 172 """
164 173 self._diff = diff
165 self._format = format
174 self._format = diff_format
166 175 self.adds = 0
167 176 self.removes = 0
168 177 # calculate diff size
169 178 self.diff_limit = diff_limit
170 179 self.file_limit = file_limit
171 180 self.show_full_diff = show_full_diff
172 181 self.cur_diff_size = 0
173 182 self.parsed = False
174 183 self.parsed_diff = []
175 184
176 log.debug('Initialized DiffProcessor with %s mode', format)
177 if format == 'gitdiff':
185 log.debug('Initialized DiffProcessor with %s mode', diff_format)
186 self.differ = self._highlight_line_udiff
187 self._parser = self._new_parse_gitdiff
188
189 if diff_format == 'gitdiff':
178 190 self.differ = self._highlight_line_difflib
179 191 self._parser = self._parse_gitdiff
180 else:
181 self.differ = self._highlight_line_udiff
182 self._parser = self._new_parse_gitdiff
192 raise DeprecationWarning('gitdiff usage is deprecated')
183 193
184 194 def _copy_iterator(self):
185 195 """
186 196 make a fresh copy of the generator; we should not iterate through
187 197 the original as it's needed for repeating operations on
188 198 this instance of DiffProcessor
189 199 """
190 200 self.__udiff, iterator_copy = tee(self.__udiff)
191 201 return iterator_copy
192 202
193 def _escaper(self, string):
203 def _escaper(self, diff_string):
194 204 """
195 205 Escaper for diffs: escapes special chars and checks the diff limit
196 206
197 207 :param diff_string:
198 208 """
199 self.cur_diff_size += len(string)
209 self.cur_diff_size += len(diff_string)
200 210
201 211 if not self.show_full_diff and (self.cur_diff_size > self.diff_limit):
202 212 raise DiffLimitExceeded('Diff Limit Exceeded')
203 213
204 return string \
205 .replace('&', '&amp;')\
206 .replace('<', '&lt;')\
207 .replace('>', '&gt;')
214 return diff_string \
215 .replace(b'&', b'&amp;')\
216 .replace(b'<', b'&lt;')\
217 .replace(b'>', b'&gt;')
208 218
209 def _line_counter(self, l):
219 def _line_counter(self, diff_line):
210 220 """
211 221 Checks each line and bumps total adds/removes for this diff
212 222
213 :param l:
223 :param diff_line:
214 224 """
215 if l.startswith('+') and not l.startswith('+++'):
225 if diff_line.startswith(b'+') and not diff_line.startswith(b'+++'):
216 226 self.adds += 1
217 elif l.startswith('-') and not l.startswith('---'):
227 elif diff_line.startswith(b'-') and not diff_line.startswith(b'---'):
218 228 self.removes += 1
219 return safe_unicode(l)
229 return diff_line
220 230
221 231 def _highlight_line_difflib(self, line, next_):
222 232 """
223 233 Highlight inline changes in both lines.
224 234 """
225 235
226 236 if line['action'] == Action.DELETE:
227 237 old, new = line, next_
228 238 else:
229 239 old, new = next_, line
230 240
231 241 oldwords = self._token_re.split(old['line'])
232 242 newwords = self._token_re.split(new['line'])
233 243 sequence = difflib.SequenceMatcher(None, oldwords, newwords)
234 244
235 245 oldfragments, newfragments = [], []
236 246 for tag, i1, i2, j1, j2 in sequence.get_opcodes():
237 247 oldfrag = ''.join(oldwords[i1:i2])
238 248 newfrag = ''.join(newwords[j1:j2])
239 249 if tag != 'equal':
240 250 if oldfrag:
241 oldfrag = '<del>%s</del>' % oldfrag
251 oldfrag = f'<del>{oldfrag}</del>'
242 252 if newfrag:
243 newfrag = '<ins>%s</ins>' % newfrag
253 newfrag = f'<ins>{newfrag}</ins>'
244 254 oldfragments.append(oldfrag)
245 255 newfragments.append(newfrag)
246 256
247 257 old['line'] = "".join(oldfragments)
248 258 new['line'] = "".join(newfragments)
249 259
250 260 def _highlight_line_udiff(self, line, next_):
251 261 """
252 262 Highlight inline changes in both lines.
253 263 """
254 264 start = 0
255 265 limit = min(len(line['line']), len(next_['line']))
256 266 while start < limit and line['line'][start] == next_['line'][start]:
257 267 start += 1
258 268 end = -1
259 269 limit -= start
260 270 while -end <= limit and line['line'][end] == next_['line'][end]:
261 271 end -= 1
262 272 end += 1
263 273 if start or end:
264 274 def do(l):
265 275 last = end + len(l['line'])
266 276 if l['action'] == Action.ADD:
267 277 tag = 'ins'
268 278 else:
269 279 tag = 'del'
270 l['line'] = '%s<%s>%s</%s>%s' % (
271 l['line'][:start],
272 tag,
273 l['line'][start:last],
274 tag,
275 l['line'][last:]
276 )
280 l['line'] = f"{l['line'][:start]}<{tag}>{l['line'][start:last]}</{tag}>{l['line'][last:]}"
277 281 do(line)
278 282 do(next_)
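
To make the prefix/suffix trimming concrete, a hand-run sketch with illustrative str inputs; the method does not touch self, so it can be exercised without a full DiffProcessor instance:

    old = {'action': Action.DELETE, 'line': 'color = "red"'}
    new = {'action': Action.ADD, 'line': 'color = "blue"'}
    DiffProcessor._highlight_line_udiff(None, old, new)
    # The common prefix 'color = "' and suffix '"' stay untouched; only the
    # differing middle is wrapped:
    #   old['line'] == 'color = "<del>red</del>"'
    #   new['line'] == 'color = "<ins>blue</ins>"'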
279 283
280 def _clean_line(self, line, command):
284 def _clean_line(self, line, command: str):
281 285 if command in ['+', '-', ' ']:
282 286 # only modify the line if it's actually a diff thing
283 287 line = line[1:]
284 288 return line
285 289
286 290 def _parse_gitdiff(self, inline_diff=True):
287 291 _files = []
288 diff_container = lambda arg: arg
292
293 def diff_container(arg):
294 return arg
289 295
290 296 for chunk in self._diff.chunks():
291 297 head = chunk.header
292 298
293 299 diff = map(self._escaper, self.diff_splitter(chunk.diff))
294 300 raw_diff = chunk.raw
295 301 limited_diff = False
296 302 exceeds_limit = False
297 303
298 304 op = None
299 305 stats = {
300 306 'added': 0,
301 307 'deleted': 0,
302 308 'binary': False,
303 309 'ops': {},
304 310 }
305 311
306 312 if head['deleted_file_mode']:
307 313 op = OPS.DEL
308 314 stats['binary'] = True
309 315 stats['ops'][DEL_FILENODE] = 'deleted file'
310 316
311 317 elif head['new_file_mode']:
312 318 op = OPS.ADD
313 319 stats['binary'] = True
314 stats['ops'][NEW_FILENODE] = 'new file %s' % head['new_file_mode']
315 else: # modify operation, can be copy, rename or chmod
320 stats['ops'][NEW_FILENODE] = f"new file {safe_str(head['new_file_mode'])}"
321 else: # modify operation, can be: copy, rename or chmod
316 322
317 323 # CHMOD
318 324 if head['new_mode'] and head['old_mode']:
319 325 op = OPS.MOD
320 326 stats['binary'] = True
321 stats['ops'][CHMOD_FILENODE] = (
322 'modified file chmod %s => %s' % (
323 head['old_mode'], head['new_mode']))
327 stats['ops'][CHMOD_FILENODE] = f"modified file chmod {safe_str(head['old_mode'])} => {safe_str(head['new_mode'])}"
324 328 # RENAME
325 329 if head['rename_from'] != head['rename_to']:
326 330 op = OPS.MOD
327 331 stats['binary'] = True
328 stats['ops'][RENAMED_FILENODE] = (
329 'file renamed from %s to %s' % (
330 head['rename_from'], head['rename_to']))
332 stats['ops'][RENAMED_FILENODE] = f"file renamed from {safe_str(head['rename_from'])} to {safe_str(head['rename_to'])}"
331 333 # COPY
332 334 if head.get('copy_from') and head.get('copy_to'):
333 335 op = OPS.MOD
334 336 stats['binary'] = True
335 stats['ops'][COPIED_FILENODE] = (
336 'file copied from %s to %s' % (
337 head['copy_from'], head['copy_to']))
337 stats['ops'][COPIED_FILENODE] = f"file copied from {safe_str(head['copy_from'])} to {safe_str(head['copy_to'])}"
338 338
339 339 # If our new parsed headers didn't match anything, fall back to
340 340 # old style detection
341 341 if op is None:
342 342 if not head['a_file'] and head['b_file']:
343 343 op = OPS.ADD
344 344 stats['binary'] = True
345 345 stats['ops'][NEW_FILENODE] = 'new file'
346 346
347 347 elif head['a_file'] and not head['b_file']:
348 348 op = OPS.DEL
349 349 stats['binary'] = True
350 350 stats['ops'][DEL_FILENODE] = 'deleted file'
351 351
352 352 # it's neither ADD nor DELETE
353 353 if op is None:
354 354 op = OPS.MOD
355 355 stats['binary'] = True
356 356 stats['ops'][MOD_FILENODE] = 'modified file'
357 357
358 358 # a real non-binary diff
359 359 if head['a_file'] or head['b_file']:
360 360 try:
361 361 raw_diff, chunks, _stats = self._parse_lines(diff)
362 362 stats['binary'] = False
363 363 stats['added'] = _stats[0]
364 364 stats['deleted'] = _stats[1]
365 365 # explicit mark that it's a modified file
366 366 if op == OPS.MOD:
367 367 stats['ops'][MOD_FILENODE] = 'modified file'
368 368 exceeds_limit = len(raw_diff) > self.file_limit
369 369
370 370 # changed from _escaper function so we validate size of
371 371 # each file instead of the whole diff
372 372 # diff will hide big files but still show small ones
373 373 # from my tests, big files are fairly safe to be parsed
374 374 # but the browser is the bottleneck
375 375 if not self.show_full_diff and exceeds_limit:
376 376 raise DiffLimitExceeded('File Limit Exceeded')
377 377
378 378 except DiffLimitExceeded:
379 diff_container = lambda _diff: \
380 LimitedDiffContainer(
381 self.diff_limit, self.cur_diff_size, _diff)
379 def diff_container(_diff):
380 return LimitedDiffContainer(self.diff_limit, self.cur_diff_size, _diff)
382 381
383 382 exceeds_limit = len(raw_diff) > self.file_limit
384 383 limited_diff = True
385 384 chunks = []
386 385
387 386 else: # GIT format binary patch, or possibly empty diff
388 387 if head['bin_patch']:
389 388 # we have operation already extracted, but we mark simply
390 # it's a diff we wont show for binary files
389 # it's a diff we won't show for binary files
391 390 stats['ops'][BIN_FILENODE] = 'binary diff hidden'
392 391 chunks = []
393 392
394 393 if chunks and not self.show_full_diff and op == OPS.DEL:
395 394 # if not full diff mode show deleted file contents
396 395 # TODO: anderson: if the view is not too big, there is no way
397 396 # to see the content of the file
398 397 chunks = []
399 398
400 chunks.insert(0, [{
401 'old_lineno': '',
402 'new_lineno': '',
403 'action': Action.CONTEXT,
404 'line': msg,
405 } for _op, msg in stats['ops'].items()
406 if _op not in [MOD_FILENODE]])
399 frag = [{
400 'old_lineno': '',
401 'new_lineno': '',
402 'action': Action.CONTEXT,
403 'line': msg,
404 } for _op, msg in list(stats['ops'].items())
405 if _op not in [MOD_FILENODE]]
406
407 chunks.insert(0, frag)
407 408
408 409 _files.append({
409 'filename': safe_unicode(head['b_path']),
410 'filename': safe_str(head['b_path']),
410 411 'old_revision': head['a_blob_id'],
411 412 'new_revision': head['b_blob_id'],
412 413 'chunks': chunks,
413 'raw_diff': safe_unicode(raw_diff),
414 'raw_diff': safe_str(raw_diff),
414 415 'operation': op,
415 416 'stats': stats,
416 417 'exceeds_limit': exceeds_limit,
417 418 'is_limited_diff': limited_diff,
418 419 })
419 420
420 sorter = lambda info: {OPS.ADD: 0, OPS.MOD: 1,
421 OPS.DEL: 2}.get(info['operation'])
421 def operation_sorter(info):
422 return {OPS.ADD: 0, OPS.MOD: 1, OPS.DEL: 2}.get(info['operation'])
422 423
423 424 if not inline_diff:
424 return diff_container(sorted(_files, key=sorter))
425 return diff_container(sorted(_files, key=operation_sorter))
425 426
426 427 # highlight inline changes
427 428 for diff_data in _files:
428 429 for chunk in diff_data['chunks']:
429 430 lineiter = iter(chunk)
430 431 try:
431 432 while 1:
432 433 line = next(lineiter)
433 434 if line['action'] not in (
434 435 Action.UNMODIFIED, Action.CONTEXT):
435 436 nextline = next(lineiter)
436 437 if nextline['action'] in ['unmod', 'context'] or \
437 438 nextline['action'] == line['action']:
438 439 continue
439 440 self.differ(line, nextline)
440 441 except StopIteration:
441 442 pass
442 443
443 return diff_container(sorted(_files, key=sorter))
444 return diff_container(sorted(_files, key=operation_sorter))
444 445
445 446 def _check_large_diff(self):
446 447 if self.diff_limit:
447 448 log.debug('Checking if diff exceeds current diff_limit of %s', self.diff_limit)
448 449 if not self.show_full_diff and (self.cur_diff_size > self.diff_limit):
449 raise DiffLimitExceeded('Diff Limit `%s` Exceeded', self.diff_limit)
450 raise DiffLimitExceeded(f'Diff Limit `{self.diff_limit}` Exceeded')
450 451
451 452 # FIXME: NEWDIFFS: dan: this replaces _parse_gitdiff
452 453 def _new_parse_gitdiff(self, inline_diff=True):
453 454 _files = []
454 455
455 # this can be overriden later to a LimitedDiffContainer type
456 diff_container = lambda arg: arg
456 # this can be overridden later to a LimitedDiffContainer type
457 def diff_container(arg):
458 return arg
457 459
458 460 for chunk in self._diff.chunks():
459 head = chunk.header
460 log.debug('parsing diff %r', head)
461 head = chunk.header_as_str
462 log.debug('parsing diff chunk %r', chunk)
461 463
462 464 raw_diff = chunk.raw
463 465 limited_diff = False
464 466 exceeds_limit = False
465 467
466 468 op = None
467 469 stats = {
468 470 'added': 0,
469 471 'deleted': 0,
470 472 'binary': False,
471 'old_mode': None,
472 'new_mode': None,
473 'old_mode': '',
474 'new_mode': '',
473 475 'ops': {},
474 476 }
475 477 if head['old_mode']:
476 478 stats['old_mode'] = head['old_mode']
477 479 if head['new_mode']:
478 480 stats['new_mode'] = head['new_mode']
479 481 if head['b_mode']:
480 482 stats['new_mode'] = head['b_mode']
481 483
482 484 # delete file
483 485 if head['deleted_file_mode']:
484 486 op = OPS.DEL
485 487 stats['binary'] = True
486 488 stats['ops'][DEL_FILENODE] = 'deleted file'
487 489
488 490 # new file
489 491 elif head['new_file_mode']:
490 492 op = OPS.ADD
491 493 stats['binary'] = True
492 stats['old_mode'] = None
494 stats['old_mode'] = ''
493 495 stats['new_mode'] = head['new_file_mode']
494 stats['ops'][NEW_FILENODE] = 'new file %s' % head['new_file_mode']
496 stats['ops'][NEW_FILENODE] = f"new file {head['new_file_mode']}"
495 497
496 # modify operation, can be copy, rename or chmod
498 # modify operation, can be: copy, rename or chmod
497 499 else:
498 500 # CHMOD
499 501 if head['new_mode'] and head['old_mode']:
500 502 op = OPS.MOD
501 503 stats['binary'] = True
502 stats['ops'][CHMOD_FILENODE] = (
503 'modified file chmod %s => %s' % (
504 head['old_mode'], head['new_mode']))
504 stats['ops'][CHMOD_FILENODE] = f"modified file chmod {head['old_mode']} => {head['new_mode']}"
505 505
506 506 # RENAME
507 507 if head['rename_from'] != head['rename_to']:
508 508 op = OPS.MOD
509 509 stats['binary'] = True
510 510 stats['renamed'] = (head['rename_from'], head['rename_to'])
511 stats['ops'][RENAMED_FILENODE] = (
512 'file renamed from %s to %s' % (
513 head['rename_from'], head['rename_to']))
511 stats['ops'][RENAMED_FILENODE] = f"file renamed from {head['rename_from']} to {head['rename_to']}"
514 512 # COPY
515 513 if head.get('copy_from') and head.get('copy_to'):
516 514 op = OPS.MOD
517 515 stats['binary'] = True
518 516 stats['copied'] = (head['copy_from'], head['copy_to'])
519 stats['ops'][COPIED_FILENODE] = (
520 'file copied from %s to %s' % (
521 head['copy_from'], head['copy_to']))
517 stats['ops'][COPIED_FILENODE] = f"file copied from {head['copy_from']} to {head['copy_to']}"
522 518
523 519 # If our new parsed headers didn't match anything, fall back to
524 520 # old style detection
525 521 if op is None:
526 522 if not head['a_file'] and head['b_file']:
527 523 op = OPS.ADD
528 524 stats['binary'] = True
529 525 stats['new_file'] = True
530 526 stats['ops'][NEW_FILENODE] = 'new file'
531 527
532 528 elif head['a_file'] and not head['b_file']:
533 529 op = OPS.DEL
534 530 stats['binary'] = True
535 531 stats['ops'][DEL_FILENODE] = 'deleted file'
536 532
537 533 # it's neither ADD nor DELETE
538 534 if op is None:
539 535 op = OPS.MOD
540 536 stats['binary'] = True
541 537 stats['ops'][MOD_FILENODE] = 'modified file'
542 538
543 539 # a real non-binary diff
544 540 if head['a_file'] or head['b_file']:
545 541 # simulate splitlines, so we keep the line end part
546 542 diff = self.diff_splitter(chunk.diff)
547 543
548 544 # append each file to the diff size
549 545 raw_chunk_size = len(raw_diff)
550 546
551 547 exceeds_limit = raw_chunk_size > self.file_limit
552 548 self.cur_diff_size += raw_chunk_size
553 549
554 550 try:
555 551 # Check each file instead of the whole diff.
556 552 # Diff will hide big files but still show small ones.
557 553 # From the tests big files are fairly safe to be parsed
558 554 # but the browser is the bottleneck.
559 555 if not self.show_full_diff and exceeds_limit:
560 556 log.debug('File `%s` exceeds current file_limit of %s',
561 safe_unicode(head['b_path']), self.file_limit)
562 raise DiffLimitExceeded(
563 'File Limit %s Exceeded', self.file_limit)
557 head['b_path'], self.file_limit)
558 raise DiffLimitExceeded(f'File Limit {self.file_limit} Exceeded')
564 559
565 560 self._check_large_diff()
566 561
567 562 raw_diff, chunks, _stats = self._new_parse_lines(diff)
568 563 stats['binary'] = False
569 564 stats['added'] = _stats[0]
570 565 stats['deleted'] = _stats[1]
571 566 # explicit mark that it's a modified file
572 567 if op == OPS.MOD:
573 568 stats['ops'][MOD_FILENODE] = 'modified file'
574 569
575 570 except DiffLimitExceeded:
576 diff_container = lambda _diff: \
577 LimitedDiffContainer(
578 self.diff_limit, self.cur_diff_size, _diff)
571 def limited_diff_container(_diff):
572 return LimitedDiffContainer(self.diff_limit, self.cur_diff_size, _diff)
573
574 # re-definition of our container wrapper
575 diff_container = limited_diff_container
579 576
580 577 limited_diff = True
581 578 chunks = []
582 579
583 580 else: # GIT format binary patch, or possibly empty diff
584 581 if head['bin_patch']:
585 582 # we have operation already extracted, but we mark simply
586 # it's a diff we wont show for binary files
583 # it's a diff we won't show for binary files
587 584 stats['ops'][BIN_FILENODE] = 'binary diff hidden'
588 585 chunks = []
589 586
590 587 # Hide content of deleted node by setting empty chunks
591 588 if chunks and not self.show_full_diff and op == OPS.DEL:
592 589 # if not full diff mode show deleted file contents
593 590 # TODO: anderson: if the view is not too big, there is no way
594 591 # to see the content of the file
595 592 chunks = []
596 593
597 chunks.insert(
598 0, [{'old_lineno': '',
599 'new_lineno': '',
600 'action': Action.CONTEXT,
601 'line': msg,
602 } for _op, msg in stats['ops'].items()
603 if _op not in [MOD_FILENODE]])
594 frag = [
595 {'old_lineno': '',
596 'new_lineno': '',
597 'action': Action.CONTEXT,
598 'line': msg,
599 } for _op, msg in list(stats['ops'].items())
600 if _op not in [MOD_FILENODE]]
604 601
605 original_filename = safe_unicode(head['a_path'])
602 chunks.insert(0, frag)
603
604 original_filename = safe_str(head['a_path'])
606 605 _files.append({
607 606 'original_filename': original_filename,
608 'filename': safe_unicode(head['b_path']),
607 'filename': safe_str(head['b_path']),
609 608 'old_revision': head['a_blob_id'],
610 609 'new_revision': head['b_blob_id'],
611 610 'chunks': chunks,
612 'raw_diff': safe_unicode(raw_diff),
611 'raw_diff': safe_str(raw_diff),
613 612 'operation': op,
614 613 'stats': stats,
615 614 'exceeds_limit': exceeds_limit,
616 615 'is_limited_diff': limited_diff,
617 616 })
618 617
619 sorter = lambda info: {OPS.ADD: 0, OPS.MOD: 1,
620 OPS.DEL: 2}.get(info['operation'])
621
618 def sorter(info):
619 return {OPS.ADD: 0, OPS.MOD: 1, OPS.DEL: 2}.get(info['operation'])
622 620 return diff_container(sorted(_files, key=sorter))
623 621
624 622 # FIXME: NEWDIFFS: dan: this gets replaced by _new_parse_lines
625 623 def _parse_lines(self, diff_iter):
626 624 """
627 625 Parse the diff and return data for the template.
628 626 """
629 627
630 628 stats = [0, 0]
631 629 chunks = []
632 630 raw_diff = []
633 631
634 632 try:
635 633 line = next(diff_iter)
636 634
637 635 while line:
638 636 raw_diff.append(line)
639 637 lines = []
640 638 chunks.append(lines)
641 639
642 640 match = self._chunk_re.match(line)
643 641
644 642 if not match:
645 643 break
646 644
647 645 gr = match.groups()
648 646 (old_line, old_end,
649 647 new_line, new_end) = [int(x or 1) for x in gr[:-1]]
650 648 old_line -= 1
651 649 new_line -= 1
652 650
653 651 context = len(gr) == 5
654 652 old_end += old_line
655 653 new_end += new_line
656 654
657 655 if context:
658 656 # skip context only if it's first line
659 657 if int(gr[0]) > 1:
660 658 lines.append({
661 659 'old_lineno': '...',
662 660 'new_lineno': '...',
663 661 'action': Action.CONTEXT,
664 662 'line': line,
665 663 })
666 664
667 665 line = next(diff_iter)
668 666
669 667 while old_line < old_end or new_line < new_end:
670 command = ' '
668 command = b' '
671 669 if line:
672 670 command = line[0]
673 671
674 672 affects_old = affects_new = False
675 673
676 674 # ignore those if we don't expect them
677 if command in '#@':
675 if command in b'#@':
678 676 continue
679 elif command == '+':
677 elif command == b'+':
680 678 affects_new = True
681 679 action = Action.ADD
682 680 stats[0] += 1
683 elif command == '-':
681 elif command == b'-':
684 682 affects_old = True
685 683 action = Action.DELETE
686 684 stats[1] += 1
687 685 else:
688 686 affects_old = affects_new = True
689 687 action = Action.UNMODIFIED
690 688
691 689 if not self._newline_marker.match(line):
692 690 old_line += affects_old
693 691 new_line += affects_new
694 692 lines.append({
695 'old_lineno': affects_old and old_line or '',
696 'new_lineno': affects_new and new_line or '',
693 'old_lineno': affects_old and old_line or b'',
694 'new_lineno': affects_new and new_line or b'',
697 695 'action': action,
698 696 'line': self._clean_line(line, command)
699 697 })
700 698 raw_diff.append(line)
701 699
702 700 line = next(diff_iter)
703 701
704 702 if self._newline_marker.match(line):
705 703 # we need to append to lines, since this is not
706 704 # counted in the line specs of diff
707 705 lines.append({
708 706 'old_lineno': '...',
709 707 'new_lineno': '...',
710 708 'action': Action.CONTEXT,
711 709 'line': self._clean_line(line, command)
712 710 })
713 711
714 712 except StopIteration:
715 713 pass
716 714 return ''.join(raw_diff), chunks, stats
717 715
718 716 # FIXME: NEWDIFFS: dan: this replaces _parse_lines
719 717 def _new_parse_lines(self, diff_iter):
720 718 """
721 719 Parse the diff and return data for the template.
722 720 """
723 721
724 722 stats = [0, 0]
725 723 chunks = []
726 724 raw_diff = []
727 725
728 726 try:
729 727 line = next(diff_iter)
728 assert isinstance(line, bytes)
730 729
731 730 while line:
732 731 raw_diff.append(line)
733 732 # match header e.g @@ -0,0 +1 @@\n'
734 733 match = self._chunk_re.match(line)
735 734
736 735 if not match:
737 736 break
738 737
739 738 gr = match.groups()
739
740 740 (old_line, old_end,
741 741 new_line, new_end) = [int(x or 1) for x in gr[:-1]]
742 742
743 743 lines = []
744 744 hunk = {
745 745 'section_header': gr[-1],
746 746 'source_start': old_line,
747 747 'source_length': old_end,
748 748 'target_start': new_line,
749 749 'target_length': new_end,
750 750 'lines': lines,
751 751 }
752 752 chunks.append(hunk)
753 753
754 754 old_line -= 1
755 755 new_line -= 1
756 756
757 context = len(gr) == 5
757 # the old 'context = len(gr) == 5' flag is unused in this parser
758 758 old_end += old_line
759 759 new_end += new_line
760 760
761 761 line = next(diff_iter)
762 762
763 763 while old_line < old_end or new_line < new_end:
764 764 command = ' '
765 765 if line:
766 command = line[0]
766 # This is bytes, so we need to convert it to a str
767 command: str = chr(line[0])
767 768
768 769 affects_old = affects_new = False
769 770
770 771 # ignore those if we don't expect them
771 772 if command in '#@':
772 773 continue
773 774 elif command == '+':
774 775 affects_new = True
775 776 action = Action.ADD
776 777 stats[0] += 1
777 778 elif command == '-':
778 779 affects_old = True
779 780 action = Action.DELETE
780 781 stats[1] += 1
781 782 else:
782 783 affects_old = affects_new = True
783 784 action = Action.UNMODIFIED
784 785
785 786 if not self._newline_marker.match(line):
786 787 old_line += affects_old
787 788 new_line += affects_new
788 789 lines.append({
789 'old_lineno': affects_old and old_line or '',
790 'new_lineno': affects_new and new_line or '',
790 'old_lineno': affects_old and old_line or None,
791 'new_lineno': affects_new and new_line or None,
791 792 'action': action,
792 793 'line': self._clean_line(line, command)
793 794 })
794 795 raw_diff.append(line)
795 796
796 797 line = next(diff_iter)
797 798
798 799 if self._newline_marker.match(line):
799 800 # we need to append to lines, since this is not
800 801 # counted in the line specs of diff
801 802 if affects_old:
802 803 action = Action.OLD_NO_NL
803 804 elif affects_new:
804 805 action = Action.NEW_NO_NL
805 806 else:
806 807 raise Exception('invalid context for no newline')
807 808
808 809 lines.append({
809 810 'old_lineno': None,
810 811 'new_lineno': None,
811 812 'action': action,
812 813 'line': self._clean_line(line, command)
813 814 })
814 815
815 816 except StopIteration:
816 817 pass
817 818
818 return ''.join(raw_diff), chunks, stats
819 return b''.join(raw_diff), chunks, stats
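
For orientation, the shape of what _new_parse_lines returns, reconstructed from the hunk dict built above; sample values are illustrative:

    # raw_diff: the hunk bytes re-joined, e.g. b'@@ -1 +1 @@\n-a\n+b\n'
    # stats:    [added, deleted] counters, e.g. [1, 1]
    # chunks:   a list of hunk dicts shaped like this
    hunk = {
        'section_header': b'',
        'source_start': 1, 'source_length': 1,
        'target_start': 1, 'target_length': 1,
        'lines': [
            {'old_lineno': 1, 'new_lineno': None,
             'action': Action.DELETE, 'line': b'a\n'},
            {'old_lineno': None, 'new_lineno': 1,
             'action': Action.ADD, 'line': b'b\n'},
        ],
    }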
819 820
820 821 def _safe_id(self, idstring):
821 822 """Make a string safe for including in an id attribute.
822 823
823 824 The HTML spec says that id attributes 'must begin with
824 825 a letter ([A-Za-z]) and may be followed by any number
825 826 of letters, digits ([0-9]), hyphens ("-"), underscores
826 827 ("_"), colons (":"), and periods (".")'. These regexps
827 828 are slightly over-zealous, in that they remove colons
828 829 and periods unnecessarily.
829 830
830 831 Whitespace is transformed into underscores, and then
831 832 anything which is not a hyphen or a character that
832 833 matches \w (alphanumerics and underscore) is removed.
833 834
834 835 """
835 836 # Transform all whitespace to underscore
836 idstring = re.sub(r'\s', "_", '%s' % idstring)
837 idstring = re.sub(r'\s', "_", f'{idstring}')
837 838 # Remove everything that is not a hyphen or a member of \w
838 839 idstring = re.sub(r'(?!-)\W', "", idstring).lower()
839 840 return idstring
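
A concrete sketch of the transformation (filename illustrative); _safe_id uses no instance state, so the sketch bypasses __init__:

    dp = DiffProcessor.__new__(DiffProcessor)
    assert dp._safe_id('My File (v2).txt') == 'my_file_v2txt'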
840 841
841 842 @classmethod
842 def diff_splitter(cls, string):
843 def diff_splitter(cls, diff_string: bytes):
843 844 """
844 845 Diff split that emulates .splitlines() but works only on \n
845 846 """
846 if not string:
847 if not diff_string:
847 848 return
848 elif string == '\n':
849 yield '\n'
849 elif diff_string == b'\n':
850 yield b'\n'
850 851 else:
851 852
852 has_newline = string.endswith('\n')
853 elements = string.split('\n')
853 has_newline = diff_string.endswith(b'\n')
854 elements = diff_string.split(b'\n')
854 855 if has_newline:
855 856 # skip last element as it's empty string from newlines
856 857 elements = elements[:-1]
857 858
858 859 len_elements = len(elements)
859 860
860 861 for cnt, line in enumerate(elements, start=1):
861 862 last_line = cnt == len_elements
862 863 if last_line and not has_newline:
863 yield safe_unicode(line)
864 yield line
864 865 else:
865 yield safe_unicode(line) + '\n'
866 yield line + b'\n'
866 867
867 868 def prepare(self, inline_diff=True):
868 869 """
869 870 Prepare the passed udiff for HTML rendering.
870 871
871 872 :return: A list of dicts with diff information.
872 873 """
873 874 parsed = self._parser(inline_diff=inline_diff)
874 875 self.parsed = True
875 876 self.parsed_diff = parsed
876 877 return parsed
877 878
878 879 def as_raw(self, diff_lines=None):
879 880 """
880 881 Returns raw diff as a byte string
881 882 """
882 return self._diff.raw
883
884 def as_html(self, table_class='code-difftable', line_class='line',
885 old_lineno_class='lineno old', new_lineno_class='lineno new',
886 code_class='code', enable_comments=False, parsed_lines=None):
887 """
888 Return given diff as html table with customized css classes
889 """
890 # TODO(marcink): not sure how to pass in translator
891 # here in an efficient way, leave the _ for proper gettext extraction
892 _ = lambda s: s
893
894 def _link_to_if(condition, label, url):
895 """
896 Generates a link if condition is meet or just the label if not.
897 """
898
899 if condition:
900 return '''<a href="%(url)s" class="tooltip"
901 title="%(title)s">%(label)s</a>''' % {
902 'title': _('Click to select line'),
903 'url': url,
904 'label': label
905 }
906 else:
907 return label
908 if not self.parsed:
909 self.prepare()
910
911 diff_lines = self.parsed_diff
912 if parsed_lines:
913 diff_lines = parsed_lines
914
915 _html_empty = True
916 _html = []
917 _html.append('''<table class="%(table_class)s">\n''' % {
918 'table_class': table_class
919 })
920
921 for diff in diff_lines:
922 for line in diff['chunks']:
923 _html_empty = False
924 for change in line:
925 _html.append('''<tr class="%(lc)s %(action)s">\n''' % {
926 'lc': line_class,
927 'action': change['action']
928 })
929 anchor_old_id = ''
930 anchor_new_id = ''
931 anchor_old = "%(filename)s_o%(oldline_no)s" % {
932 'filename': self._safe_id(diff['filename']),
933 'oldline_no': change['old_lineno']
934 }
935 anchor_new = "%(filename)s_n%(oldline_no)s" % {
936 'filename': self._safe_id(diff['filename']),
937 'oldline_no': change['new_lineno']
938 }
939 cond_old = (change['old_lineno'] != '...' and
940 change['old_lineno'])
941 cond_new = (change['new_lineno'] != '...' and
942 change['new_lineno'])
943 if cond_old:
944 anchor_old_id = 'id="%s"' % anchor_old
945 if cond_new:
946 anchor_new_id = 'id="%s"' % anchor_new
947
948 if change['action'] != Action.CONTEXT:
949 anchor_link = True
950 else:
951 anchor_link = False
952
953 ###########################################################
954 # COMMENT ICONS
955 ###########################################################
956 _html.append('''\t<td class="add-comment-line"><span class="add-comment-content">''')
957
958 if enable_comments and change['action'] != Action.CONTEXT:
959 _html.append('''<a href="#"><span class="icon-comment-add"></span></a>''')
960
961 _html.append('''</span></td><td class="comment-toggle tooltip" title="Toggle Comment Thread"><i class="icon-comment"></i></td>\n''')
962
963 ###########################################################
964 # OLD LINE NUMBER
965 ###########################################################
966 _html.append('''\t<td %(a_id)s class="%(olc)s">''' % {
967 'a_id': anchor_old_id,
968 'olc': old_lineno_class
969 })
970
971 _html.append('''%(link)s''' % {
972 'link': _link_to_if(anchor_link, change['old_lineno'],
973 '#%s' % anchor_old)
974 })
975 _html.append('''</td>\n''')
976 ###########################################################
977 # NEW LINE NUMBER
978 ###########################################################
979
980 _html.append('''\t<td %(a_id)s class="%(nlc)s">''' % {
981 'a_id': anchor_new_id,
982 'nlc': new_lineno_class
983 })
984
985 _html.append('''%(link)s''' % {
986 'link': _link_to_if(anchor_link, change['new_lineno'],
987 '#%s' % anchor_new)
988 })
989 _html.append('''</td>\n''')
990 ###########################################################
991 # CODE
992 ###########################################################
993 code_classes = [code_class]
994 if (not enable_comments or
995 change['action'] == Action.CONTEXT):
996 code_classes.append('no-comment')
997 _html.append('\t<td class="%s">' % ' '.join(code_classes))
998 _html.append('''\n\t\t<pre>%(code)s</pre>\n''' % {
999 'code': change['line']
1000 })
1001
1002 _html.append('''\t</td>''')
1003 _html.append('''\n</tr>\n''')
1004 _html.append('''</table>''')
1005 if _html_empty:
1006 return None
1007 return ''.join(_html)
883 return self._diff.raw.tobytes()
1008 884
1009 885 def stat(self):
1010 886 """
1011 887 Returns tuple of added, and removed lines for this instance
1012 888 """
1013 889 return self.adds, self.removes
1014 890
1015 891 def get_context_of_line(
1016 self, path, diff_line=None, context_before=3, context_after=3):
892 self, path, diff_line: DiffLineNumber = None, context_before: int = 3, context_after: int = 3):
1017 893 """
1018 894 Returns the context lines for the specified diff line.
1019
1020 :type diff_line: :class:`DiffLineNumber`
1021 895 """
1022 896 assert self.parsed, "DiffProcessor is not initialized."
1023 897
1024 898 if None not in diff_line:
1025 raise ValueError(
1026 "Cannot specify both line numbers: {}".format(diff_line))
899 raise ValueError(f"Cannot specify both line numbers in diff_line: {diff_line}")
1027 900
1028 901 file_diff = self._get_file_diff(path)
1029 902 chunk, idx = self._find_chunk_line_index(file_diff, diff_line)
1030 903
1031 904 first_line_to_include = max(idx - context_before, 0)
1032 905 first_line_after_context = idx + context_after + 1
1033 context_lines = chunk[first_line_to_include:first_line_after_context]
906 context_lines = chunk['lines'][first_line_to_include:first_line_after_context]
1034 907
1035 908 line_contents = [
1036 909 _context_line(line) for line in context_lines
1037 if _is_diff_content(line)]
910 if _is_diff_content(line)
911 ]
912
1038 913 # TODO: johbo: Interim fixup, the diff chunks drop the final newline.
1039 914 # Once they are fixed, we can drop this line here.
1040 915 if line_contents:
1041 916 line_contents[-1] = (
1042 line_contents[-1][0], line_contents[-1][1].rstrip('\n') + '\n')
917 line_contents[-1][0], line_contents[-1][1].rstrip(b'\n') + b'\n')
1043 918 return line_contents
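
A runnable sketch with a hand-built parsed diff (all names and contents illustrative); exactly one side of DiffLineNumber may be set:

    dp = DiffProcessor.__new__(DiffProcessor)   # bypass __init__ for the sketch
    dp.parsed = True
    dp.parsed_diff = [{
        'filename': 'example.txt',
        'chunks': [{'lines': [
            {'old_lineno': 1, 'new_lineno': 1, 'action': Action.UNMODIFIED, 'line': b'a\n'},
            {'old_lineno': 2, 'new_lineno': None, 'action': Action.DELETE, 'line': b'b\n'},
            {'old_lineno': None, 'new_lineno': 2, 'action': Action.ADD, 'line': b'B\n'},
        ]}],
    }]
    ctx = dp.get_context_of_line('example.txt', DiffLineNumber(old=None, new=2))
    # ctx == [(Action.UNMODIFIED, b'a\n'), (Action.DELETE, b'b\n'), (Action.ADD, b'B\n')]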
1044 919
1045 920 def find_context(self, path, context, offset=0):
1046 921 """
1047 922 Finds the given `context` inside of the diff.
1048 923
1049 924 Use the parameter `offset` to specify the position of the target line
1050 925 inside the given `context`, so that the correct diff line is
1051 926 returned.
1052 927
1053 928 :param offset: Offset of the main line within the given
1054 929 `context`.
1055 930 """
1056 931 if offset < 0 or offset >= len(context):
1057 932 raise ValueError(
1058 933 "Only non-negative values smaller than the length "
1059 934 "of the context are allowed.")
1060 935
1061 936 matches = []
1062 937 file_diff = self._get_file_diff(path)
1063 938
1064 939 for chunk in file_diff['chunks']:
940 if not isinstance(chunk, dict):
941 continue
1065 942 context_iter = iter(context)
1066 for line_idx, line in enumerate(chunk):
943 for line_idx, line in enumerate(chunk['lines']):
1067 944 try:
1068 945 if _context_line(line) == next(context_iter):
1069 946 continue
1070 947 except StopIteration:
1071 948 matches.append((line_idx, chunk))
1072 949 context_iter = iter(context)
1073 950
1074 951 # Increment position and trigger StopIteration
1075 952 # if we had a match at the end
1076 953 line_idx += 1
1077 954 try:
1078 955 next(context_iter)
1079 956 except StopIteration:
1080 957 matches.append((line_idx, chunk))
1081 958
1082 959 effective_offset = len(context) - offset
1083 960 found_at_diff_lines = [
1084 _line_to_diff_line_number(chunk[idx - effective_offset])
961 _line_to_diff_line_number(chunk['lines'][idx - effective_offset])
1085 962 for idx, chunk in matches]
1086 963
1087 964 return found_at_diff_lines
1088 965
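To make the offset semantics concrete, a hedged sketch (file name and line contents are made up): `context` is a sequence of (action, line) tuples matching `_context_line`'s output, and `offset` selects which entry of that window the returned diff line number refers to.

    # Hypothetical: locate a three-line context, anchoring on its middle
    # line (offset=1); line contents are bytes in this code base.
    context = [
        (Action.UNMODIFIED, b'def foo():\n'),
        (Action.ADD,        b'    return 1\n'),
        (Action.UNMODIFIED, b'\n'),
    ]
    hits = processor.find_context('rhodecode/lib/diffs.py', context, offset=1)
    # hits: a list of DiffLineNumber tuples for the anchored middle line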
1089 966 def _get_file_diff(self, path):
1090 967 for file_diff in self.parsed_diff:
1091 968 if file_diff['filename'] == path:
1092 969 break
1093 970 else:
1094 raise FileNotInDiffException("File {} not in diff".format(path))
971 raise FileNotInDiffException(f"File {path} not in diff")
1095 972 return file_diff
1096 973
1097 974 def _find_chunk_line_index(self, file_diff, diff_line):
1098 975 for chunk in file_diff['chunks']:
1099 for idx, line in enumerate(chunk):
1100 if line['old_lineno'] == diff_line.old:
1101 return chunk, idx
1102 if line['new_lineno'] == diff_line.new:
1103 return chunk, idx
1104 raise LineNotInDiffException(
1105 "The line {} is not part of the diff.".format(diff_line))
976 if not isinstance(chunk, dict):
977 continue
978 for line_idx, line in enumerate(chunk['lines']):
979 if diff_line.old and line['old_lineno'] == diff_line.old:
980 return chunk, line_idx
981 if diff_line.new and line['new_lineno'] == diff_line.new:
982 return chunk, line_idx
983 raise LineNotInDiffException(f"The line {diff_line} is not part of the diff.")
1106 984
1107 985
1108 986 def _is_diff_content(line):
1109 987 return line['action'] in (
1110 988 Action.UNMODIFIED, Action.ADD, Action.DELETE)
1111 989
1112 990
1113 991 def _context_line(line):
1114 return (line['action'], line['line'])
1115
1116
1117 DiffLineNumber = collections.namedtuple('DiffLineNumber', ['old', 'new'])
992 return line['action'], line['line']
1118 993
1119 994
1120 995 def _line_to_diff_line_number(line):
1121 996 new_line_no = line['new_lineno'] or None
1122 997 old_line_no = line['old_lineno'] or None
1123 998 return DiffLineNumber(old=old_line_no, new=new_line_no)
1124 999
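A short sketch of the conversion above (the line dicts are illustrative): the `or None` folds falsy line numbers such as 0 or '' into None, which is how one-sided diff lines (pure additions or deletions) are represented.

    added   = {'old_lineno': '', 'new_lineno': 12}
    removed = {'old_lineno': 7,  'new_lineno': ''}
    _line_to_diff_line_number(added)    # DiffLineNumber(old=None, new=12)
    _line_to_diff_line_number(removed)  # DiffLineNumber(old=7, new=None)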
1125 1000
1126 1001 class FileNotInDiffException(Exception):
1127 1002 """
1128 1003 Raised when the context for a missing file is requested.
1129 1004
1130 1005 If you request the context for a line in a file which is not part of the
1131 1006 given diff, then this exception is raised.
1132 1007 """
1133 1008
1134 1009
1135 1010 class LineNotInDiffException(Exception):
1136 1011 """
1137 1012 Raised when the context for a missing line is requested.
1138 1013
1139 1014 If you request the context for a line in a file and this line is not
1140 1015 part of the given diff, then this exception is raised.
1141 1016 """
1142 1017
1143 1018
1144 1019 class DiffLimitExceeded(Exception):
1145 1020 pass
1146 1021
1147 1022
1148 1023 # NOTE(marcink): if diffs.mako changes, this probably
1149 1024 # needs a bump to the next version
1150 1025 CURRENT_DIFF_VERSION = 'v5'
1151 1026
1152 1027
1153 1028 def _cleanup_cache_file(cached_diff_file):
1154 1029 # clean up the file so a damaged cache entry is not kept
1155 1030 try:
1156 1031 os.remove(cached_diff_file)
1157 1032 except Exception:
1158 1033 log.exception('Failed to cleanup path %s', cached_diff_file)
1159 1034
1160 1035
1161 1036 def _get_compression_mode(cached_diff_file):
1162 1037 mode = 'bz2'
1163 1038 if 'mode:plain' in cached_diff_file:
1164 1039 mode = 'plain'
1165 1040 elif 'mode:gzip' in cached_diff_file:
1166 1041 mode = 'gzip'
1167 1042 return mode
1168 1043
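The compression mode is inferred purely from markers embedded in the cache file name; for example (file names hypothetical):

    _get_compression_mode('repo_abc123_mode:gzip')   # 'gzip'
    _get_compression_mode('repo_abc123_mode:plain')  # 'plain'
    _get_compression_mode('repo_abc123')             # 'bz2' (the default)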
1169 1044
1170 1045 def cache_diff(cached_diff_file, diff, commits):
1171 1046 compression_mode = _get_compression_mode(cached_diff_file)
1172 1047
1173 1048 struct = {
1174 1049 'version': CURRENT_DIFF_VERSION,
1175 1050 'diff': diff,
1176 1051 'commits': commits
1177 1052 }
1178 1053
1179 1054 start = time.time()
1180 1055 try:
1181 1056 if compression_mode == 'plain':
1182 1057 with open(cached_diff_file, 'wb') as f:
1183 1058 pickle.dump(struct, f)
1184 1059 elif compression_mode == 'gzip':
1185 1060 with gzip.GzipFile(cached_diff_file, 'wb') as f:
1186 1061 pickle.dump(struct, f)
1187 1062 else:
1188 1063 with bz2.BZ2File(cached_diff_file, 'wb') as f:
1189 1064 pickle.dump(struct, f)
1190 1065 except Exception:
1191 log.warn('Failed to save cache', exc_info=True)
1066 log.warning('Failed to save cache', exc_info=True)
1192 1067 _cleanup_cache_file(cached_diff_file)
1193 1068
1194 1069 log.debug('Saved diff cache under %s in %.4fs', cached_diff_file, time.time() - start)
1195 1070
1196 1071
1197 1072 def load_cached_diff(cached_diff_file):
1198 1073 compression_mode = _get_compression_mode(cached_diff_file)
1199 1074
1200 1075 default_struct = {
1201 1076 'version': CURRENT_DIFF_VERSION,
1202 1077 'diff': None,
1203 1078 'commits': None
1204 1079 }
1205 1080
1206 1081 has_cache = os.path.isfile(cached_diff_file)
1207 1082 if not has_cache:
1208 1083 log.debug('Diff cache file %s does not exist', cached_diff_file)
1209 1084 return default_struct
1210 1085
1211 1086 data = None
1212 1087
1213 1088 start = time.time()
1214 1089 try:
1215 1090 if compression_mode == 'plain':
1216 1091 with open(cached_diff_file, 'rb') as f:
1217 1092 data = pickle.load(f)
1218 1093 elif compression_mode == 'gzip':
1219 1094 with gzip.GzipFile(cached_diff_file, 'rb') as f:
1220 1095 data = pickle.load(f)
1221 1096 else:
1222 1097 with bz2.BZ2File(cached_diff_file, 'rb') as f:
1223 1098 data = pickle.load(f)
1224 1099 except Exception:
1225 log.warn('Failed to read diff cache file', exc_info=True)
1100 log.warning('Failed to read diff cache file', exc_info=True)
1226 1101
1227 1102 if not data:
1228 1103 data = default_struct
1229 1104
1230 1105 if not isinstance(data, dict):
1231 1106 # old version of data?
1232 1107 data = default_struct
1233 1108
1234 1109 # check version
1235 1110 if data.get('version') != CURRENT_DIFF_VERSION:
1236 1111 # purge cache
1237 1112 _cleanup_cache_file(cached_diff_file)
1238 1113 return default_struct
1239 1114
1240 1115 log.debug('Loaded diff cache from %s in %.4fs', cached_diff_file, time.time() - start)
1241 1116
1242 1117 return data
1243 1118
1244 1119
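A behavioural note as a sketch (path hypothetical): an unreadable pickle, a non-dict payload, and a stale 'version' all collapse into the default structure, and a stale version additionally purges the file, so callers only need a single check.

    data = load_cached_diff('/var/cache/diffs/repo_abc123_mode:gzip')
    if data['diff'] is None:
        # cache miss, unreadable payload, or purged stale version:
        # recompute the diff and store it again via cache_diff()
        pass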
1245 1120 def generate_diff_cache_key(*args):
1246 1121 """
1247 1122 Helper to generate a cache key using arguments
1248 1123 """
1249 1124 def arg_mapper(input_param):
1250 1125 input_param = safe_str(input_param)
1251 1126 # we cannot allow '/' in arguments since it would allow
1252 1127 # subdirectory usage
1253 1128 input_param = input_param.replace('/', '_')
1254 1129 return input_param or None # prevent empty string arguments
1255 1130
1256 1131 return '_'.join([
1257 '{}' for i in range(len(args))]).format(*map(arg_mapper, args))
1132 '{}' for _i in range(len(args))]).format(*list(map(arg_mapper, args)))
1258 1133
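For example (argument values hypothetical), the key is simply the sanitized arguments joined with underscores, with '/' mapped to '_' so no argument can introduce a subdirectory:

    generate_diff_cache_key('repo-name', 'abc123', 'mode:gzip')
    # -> 'repo-name_abc123_mode:gzip'
    generate_diff_cache_key('a/b', 'c')
    # -> 'a_b_c'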
1259 1134
1260 1135 def diff_cache_exist(cache_storage, *args):
1261 1136 """
1262 1137 Based on all generated arguments check and return a cache path
1263 1138 """
1264 1139 args = list(args) + ['mode:gzip']
1265 1140 cache_key = generate_diff_cache_key(*args)
1266 1141 cache_file_path = os.path.join(cache_storage, cache_key)
1267 1142 # prevent path traversal attacks via params that contain e.g. '../../'
1268 1143 if not os.path.abspath(cache_file_path).startswith(cache_storage):
1269 raise ValueError('Final path must be within {}'.format(cache_storage))
1144 raise ValueError(f'Final path must be within {cache_storage}')
1270 1145
1271 1146 return cache_file_path
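Putting the three cache helpers together, a hedged end-to-end sketch (storage path, key arguments, and the `parsed_diff` payload are made up): `diff_cache_exist` derives the safe path carrying the 'mode:gzip' marker, `cache_diff` pickles the structure to it, and `load_cached_diff` restores it or falls back to the defaults.

    # Hypothetical roundtrip; `parsed_diff` stands in for a real parsed diff.
    cache_file = diff_cache_exist('/var/cache/diffs', 'repo-name', 'abc123')
    cache_diff(cache_file, diff=parsed_diff, commits=['abc123'])

    data = load_cached_diff(cache_file)
    if data['diff'] is not None:
        parsed_diff = data['diff']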