##// END OF EJS Templates
diff-match: python3/code fixes for backport code
super-admin -
r4954:f53817a4 default
parent child Browse files
Show More
@@ -1,1919 +1,2031 b''
1
2
3
4 """Diff Match and Patch
1 """Diff Match and Patch
5
2
6 Copyright 2006 Google Inc.
3 Copyright 2006 Google Inc.
7 http://code.google.com/p/google-diff-match-patch/
4 http://code.google.com/p/google-diff-match-patch/
8
5
9 Licensed under the Apache License, Version 2.0 (the "License");
6 Licensed under the Apache License, Version 2.0 (the "License");
10 you may not use this file except in compliance with the License.
7 you may not use this file except in compliance with the License.
11 You may obtain a copy of the License at
8 You may obtain a copy of the License at
12
9
13 http://www.apache.org/licenses/LICENSE-2.0
10 http://www.apache.org/licenses/LICENSE-2.0
14
11
15 Unless required by applicable law or agreed to in writing, software
12 Unless required by applicable law or agreed to in writing, software
16 distributed under the License is distributed on an "AS IS" BASIS,
13 distributed under the License is distributed on an "AS IS" BASIS,
17 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 See the License for the specific language governing permissions and
15 See the License for the specific language governing permissions and
19 limitations under the License.
16 limitations under the License.
20 """
17 """
21
18
22 """Functions for diff, match and patch.
19 """Functions for diff, match and patch.
23
20
24 Computes the difference between two texts to create a patch.
21 Computes the difference between two texts to create a patch.
25 Applies the patch onto another text, allowing for errors.
22 Applies the patch onto another text, allowing for errors.
26 """
23 """
27
24
28 __author__ = 'fraser@google.com (Neil Fraser)'
25 __author__ = "fraser@google.com (Neil Fraser)"
29
26
30 import math
27 import math
31 import re
28 import re
32 import sys
29 import sys
33 import time
30 import time
34 import urllib.request, urllib.parse, urllib.error
31 import urllib.request, urllib.parse, urllib.error
35
32
36
33
37 class diff_match_patch:
34 class diff_match_patch:
38 """Class containing the diff, match and patch methods.
35 """Class containing the diff, match and patch methods.
39
36
40 Also contains the behaviour settings.
37 Also contains the behaviour settings.
41 """
38 """
42
39
43 def __init__(self):
40 def __init__(self):
44 """Inits a diff_match_patch object with default settings.
41 """Inits a diff_match_patch object with default settings.
45 Redefine these in your program to override the defaults.
42 Redefine these in your program to override the defaults.
46 """
43 """
47
44
48 # Number of seconds to map a diff before giving up (0 for infinity).
45 # Number of seconds to map a diff before giving up (0 for infinity).
49 self.Diff_Timeout = 1.0
46 self.Diff_Timeout = 1.0
50 # Cost of an empty edit operation in terms of edit characters.
47 # Cost of an empty edit operation in terms of edit characters.
51 self.Diff_EditCost = 4
48 self.Diff_EditCost = 4
52 # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
49 # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
53 self.Match_Threshold = 0.5
50 self.Match_Threshold = 0.5
54 # How far to search for a match (0 = exact location, 1000+ = broad match).
51 # How far to search for a match (0 = exact location, 1000+ = broad match).
55 # A match this many characters away from the expected location will add
52 # A match this many characters away from the expected location will add
56 # 1.0 to the score (0.0 is a perfect match).
53 # 1.0 to the score (0.0 is a perfect match).
57 self.Match_Distance = 1000
54 self.Match_Distance = 1000
58 # When deleting a large block of text (over ~64 characters), how close do
55 # When deleting a large block of text (over ~64 characters), how close do
59 # the contents have to be to match the expected contents. (0.0 = perfection,
56 # the contents have to be to match the expected contents. (0.0 = perfection,
60 # 1.0 = very loose). Note that Match_Threshold controls how closely the
57 # 1.0 = very loose). Note that Match_Threshold controls how closely the
61 # end points of a delete need to match.
58 # end points of a delete need to match.
62 self.Patch_DeleteThreshold = 0.5
59 self.Patch_DeleteThreshold = 0.5
63 # Chunk size for context length.
60 # Chunk size for context length.
64 self.Patch_Margin = 4
61 self.Patch_Margin = 4
65
62
66 # The number of bits in an int.
63 # The number of bits in an int.
67 # Python has no maximum, thus to disable patch splitting set to 0.
64 # Python has no maximum, thus to disable patch splitting set to 0.
68 # However to avoid long patches in certain pathological cases, use 32.
65 # However to avoid long patches in certain pathological cases, use 32.
69 # Multiple short patches (using native ints) are much faster than long ones.
66 # Multiple short patches (using native ints) are much faster than long ones.
70 self.Match_MaxBits = 32
67 self.Match_MaxBits = 32
71
68
72 # DIFF FUNCTIONS
69 # DIFF FUNCTIONS
73
70
74 # The data structure representing a diff is an array of tuples:
71 # The data structure representing a diff is an array of tuples:
75 # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
72 # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
76 # which means: delete "Hello", add "Goodbye" and keep " world."
73 # which means: delete "Hello", add "Goodbye" and keep " world."
77 DIFF_DELETE = -1
74 DIFF_DELETE = -1
78 DIFF_INSERT = 1
75 DIFF_INSERT = 1
79 DIFF_EQUAL = 0
76 DIFF_EQUAL = 0
80
77
81 def diff_main(self, text1, text2, checklines=True, deadline=None):
78 def diff_main(self, text1, text2, checklines=True, deadline=None):
82 """Find the differences between two texts. Simplifies the problem by
79 """Find the differences between two texts. Simplifies the problem by
83 stripping any common prefix or suffix off the texts before diffing.
80 stripping any common prefix or suffix off the texts before diffing.
84
81
85 Args:
82 Args:
86 text1: Old string to be diffed.
83 text1: Old string to be diffed.
87 text2: New string to be diffed.
84 text2: New string to be diffed.
88 checklines: Optional speedup flag. If present and false, then don't run
85 checklines: Optional speedup flag. If present and false, then don't run
89 a line-level diff first to identify the changed areas.
86 a line-level diff first to identify the changed areas.
90 Defaults to true, which does a faster, slightly less optimal diff.
87 Defaults to true, which does a faster, slightly less optimal diff.
91 deadline: Optional time when the diff should be complete by. Used
88 deadline: Optional time when the diff should be complete by. Used
92 internally for recursive calls. Users should set DiffTimeout instead.
89 internally for recursive calls. Users should set DiffTimeout instead.
93
90
94 Returns:
91 Returns:
95 Array of changes.
92 Array of changes.
96 """
93 """
97 # Set a deadline by which time the diff must be complete.
94 # Set a deadline by which time the diff must be complete.
98 if deadline is None:
95 if deadline is None:
99 # Unlike in most languages, Python counts time in seconds.
96 # Unlike in most languages, Python counts time in seconds.
100 if self.Diff_Timeout <= 0:
97 if self.Diff_Timeout <= 0:
101 deadline = sys.maxint
98 deadline = sys.maxsize
102 else:
99 else:
103 deadline = time.time() + self.Diff_Timeout
100 deadline = time.time() + self.Diff_Timeout
104
101
105 # Check for null inputs.
102 # Check for null inputs.
106 if text1 is None or text2 is None:
103 if text1 is None or text2 is None:
107 raise ValueError("Null inputs. (diff_main)")
104 raise ValueError("Null inputs. (diff_main)")
108
105
109 # Check for equality (speedup).
106 # Check for equality (speedup).
110 if text1 == text2:
107 if text1 == text2:
111 if text1:
108 if text1:
112 return [(self.DIFF_EQUAL, text1)]
109 return [(self.DIFF_EQUAL, text1)]
113 return []
110 return []
114
111
115 # Trim off common prefix (speedup).
112 # Trim off common prefix (speedup).
116 commonlength = self.diff_commonPrefix(text1, text2)
113 commonlength = self.diff_commonPrefix(text1, text2)
117 commonprefix = text1[:commonlength]
114 commonprefix = text1[:commonlength]
118 text1 = text1[commonlength:]
115 text1 = text1[commonlength:]
119 text2 = text2[commonlength:]
116 text2 = text2[commonlength:]
120
117
121 # Trim off common suffix (speedup).
118 # Trim off common suffix (speedup).
122 commonlength = self.diff_commonSuffix(text1, text2)
119 commonlength = self.diff_commonSuffix(text1, text2)
123 if commonlength == 0:
120 if commonlength == 0:
124 commonsuffix = ''
121 commonsuffix = ""
125 else:
122 else:
126 commonsuffix = text1[-commonlength:]
123 commonsuffix = text1[-commonlength:]
127 text1 = text1[:-commonlength]
124 text1 = text1[:-commonlength]
128 text2 = text2[:-commonlength]
125 text2 = text2[:-commonlength]
129
126
130 # Compute the diff on the middle block.
127 # Compute the diff on the middle block.
131 diffs = self.diff_compute(text1, text2, checklines, deadline)
128 diffs = self.diff_compute(text1, text2, checklines, deadline)
132
129
133 # Restore the prefix and suffix.
130 # Restore the prefix and suffix.
134 if commonprefix:
131 if commonprefix:
135 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
132 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
136 if commonsuffix:
133 if commonsuffix:
137 diffs.append((self.DIFF_EQUAL, commonsuffix))
134 diffs.append((self.DIFF_EQUAL, commonsuffix))
138 self.diff_cleanupMerge(diffs)
135 self.diff_cleanupMerge(diffs)
139 return diffs
136 return diffs
140
137
141 def diff_compute(self, text1, text2, checklines, deadline):
138 def diff_compute(self, text1, text2, checklines, deadline):
142 """Find the differences between two texts. Assumes that the texts do not
139 """Find the differences between two texts. Assumes that the texts do not
143 have any common prefix or suffix.
140 have any common prefix or suffix.
144
141
145 Args:
142 Args:
146 text1: Old string to be diffed.
143 text1: Old string to be diffed.
147 text2: New string to be diffed.
144 text2: New string to be diffed.
148 checklines: Speedup flag. If false, then don't run a line-level diff
145 checklines: Speedup flag. If false, then don't run a line-level diff
149 first to identify the changed areas.
146 first to identify the changed areas.
150 If true, then run a faster, slightly less optimal diff.
147 If true, then run a faster, slightly less optimal diff.
151 deadline: Time when the diff should be complete by.
148 deadline: Time when the diff should be complete by.
152
149
153 Returns:
150 Returns:
154 Array of changes.
151 Array of changes.
155 """
152 """
156 if not text1:
153 if not text1:
157 # Just add some text (speedup).
154 # Just add some text (speedup).
158 return [(self.DIFF_INSERT, text2)]
155 return [(self.DIFF_INSERT, text2)]
159
156
160 if not text2:
157 if not text2:
161 # Just delete some text (speedup).
158 # Just delete some text (speedup).
162 return [(self.DIFF_DELETE, text1)]
159 return [(self.DIFF_DELETE, text1)]
163
160
164 if len(text1) > len(text2):
161 if len(text1) > len(text2):
165 (longtext, shorttext) = (text1, text2)
162 (longtext, shorttext) = (text1, text2)
166 else:
163 else:
167 (shorttext, longtext) = (text1, text2)
164 (shorttext, longtext) = (text1, text2)
168 i = longtext.find(shorttext)
165 i = longtext.find(shorttext)
169 if i != -1:
166 if i != -1:
170 # Shorter text is inside the longer text (speedup).
167 # Shorter text is inside the longer text (speedup).
171 diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
168 diffs = [
172 (self.DIFF_INSERT, longtext[i + len(shorttext):])]
169 (self.DIFF_INSERT, longtext[:i]),
170 (self.DIFF_EQUAL, shorttext),
171 (self.DIFF_INSERT, longtext[i + len(shorttext) :]),
172 ]
173 # Swap insertions for deletions if diff is reversed.
173 # Swap insertions for deletions if diff is reversed.
174 if len(text1) > len(text2):
174 if len(text1) > len(text2):
175 diffs[0] = (self.DIFF_DELETE, diffs[0][1])
175 diffs[0] = (self.DIFF_DELETE, diffs[0][1])
176 diffs[2] = (self.DIFF_DELETE, diffs[2][1])
176 diffs[2] = (self.DIFF_DELETE, diffs[2][1])
177 return diffs
177 return diffs
178
178
179 if len(shorttext) == 1:
179 if len(shorttext) == 1:
180 # Single character string.
180 # Single character string.
181 # After the previous speedup, the character can't be an equality.
181 # After the previous speedup, the character can't be an equality.
182 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
182 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
183
183
184 # Check to see if the problem can be split in two.
184 # Check to see if the problem can be split in two.
185 hm = self.diff_halfMatch(text1, text2)
185 hm = self.diff_halfMatch(text1, text2)
186 if hm:
186 if hm:
187 # A half-match was found, sort out the return data.
187 # A half-match was found, sort out the return data.
188 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
188 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
189 # Send both pairs off for separate processing.
189 # Send both pairs off for separate processing.
190 diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)
190 diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)
191 diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)
191 diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)
192 # Merge the results.
192 # Merge the results.
193 return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
193 return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
194
194
195 if checklines and len(text1) > 100 and len(text2) > 100:
195 if checklines and len(text1) > 100 and len(text2) > 100:
196 return self.diff_lineMode(text1, text2, deadline)
196 return self.diff_lineMode(text1, text2, deadline)
197
197
198 return self.diff_bisect(text1, text2, deadline)
198 return self.diff_bisect(text1, text2, deadline)
199
199
200 def diff_lineMode(self, text1, text2, deadline):
200 def diff_lineMode(self, text1, text2, deadline):
201 """Do a quick line-level diff on both strings, then rediff the parts for
201 """Do a quick line-level diff on both strings, then rediff the parts for
202 greater accuracy.
202 greater accuracy.
203 This speedup can produce non-minimal diffs.
203 This speedup can produce non-minimal diffs.
204
204
205 Args:
205 Args:
206 text1: Old string to be diffed.
206 text1: Old string to be diffed.
207 text2: New string to be diffed.
207 text2: New string to be diffed.
208 deadline: Time when the diff should be complete by.
208 deadline: Time when the diff should be complete by.
209
209
210 Returns:
210 Returns:
211 Array of changes.
211 Array of changes.
212 """
212 """
213
213
214 # Scan the text on a line-by-line basis first.
214 # Scan the text on a line-by-line basis first.
215 (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
215 (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
216
216
217 diffs = self.diff_main(text1, text2, False, deadline)
217 diffs = self.diff_main(text1, text2, False, deadline)
218
218
219 # Convert the diff back to original text.
219 # Convert the diff back to original text.
220 self.diff_charsToLines(diffs, linearray)
220 self.diff_charsToLines(diffs, linearray)
221 # Eliminate freak matches (e.g. blank lines)
221 # Eliminate freak matches (e.g. blank lines)
222 self.diff_cleanupSemantic(diffs)
222 self.diff_cleanupSemantic(diffs)
223
223
224 # Rediff any replacement blocks, this time character-by-character.
224 # Rediff any replacement blocks, this time character-by-character.
225 # Add a dummy entry at the end.
225 # Add a dummy entry at the end.
226 diffs.append((self.DIFF_EQUAL, ''))
226 diffs.append((self.DIFF_EQUAL, ""))
227 pointer = 0
227 pointer = 0
228 count_delete = 0
228 count_delete = 0
229 count_insert = 0
229 count_insert = 0
230 text_delete = ''
230 text_delete = ""
231 text_insert = ''
231 text_insert = ""
232 while pointer < len(diffs):
232 while pointer < len(diffs):
233 if diffs[pointer][0] == self.DIFF_INSERT:
233 if diffs[pointer][0] == self.DIFF_INSERT:
234 count_insert += 1
234 count_insert += 1
235 text_insert += diffs[pointer][1]
235 text_insert += diffs[pointer][1]
236 elif diffs[pointer][0] == self.DIFF_DELETE:
236 elif diffs[pointer][0] == self.DIFF_DELETE:
237 count_delete += 1
237 count_delete += 1
238 text_delete += diffs[pointer][1]
238 text_delete += diffs[pointer][1]
239 elif diffs[pointer][0] == self.DIFF_EQUAL:
239 elif diffs[pointer][0] == self.DIFF_EQUAL:
240 # Upon reaching an equality, check for prior redundancies.
240 # Upon reaching an equality, check for prior redundancies.
241 if count_delete >= 1 and count_insert >= 1:
241 if count_delete >= 1 and count_insert >= 1:
242 # Delete the offending records and add the merged ones.
242 # Delete the offending records and add the merged ones.
243 a = self.diff_main(text_delete, text_insert, False, deadline)
243 a = self.diff_main(text_delete, text_insert, False, deadline)
244 diffs[pointer - count_delete - count_insert : pointer] = a
244 diffs[pointer - count_delete - count_insert : pointer] = a
245 pointer = pointer - count_delete - count_insert + len(a)
245 pointer = pointer - count_delete - count_insert + len(a)
246 count_insert = 0
246 count_insert = 0
247 count_delete = 0
247 count_delete = 0
248 text_delete = ''
248 text_delete = ""
249 text_insert = ''
249 text_insert = ""
250
250
251 pointer += 1
251 pointer += 1
252
252
253 diffs.pop() # Remove the dummy entry at the end.
253 diffs.pop() # Remove the dummy entry at the end.
254
254
255 return diffs
255 return diffs
256
256
257 def diff_bisect(self, text1, text2, deadline):
257 def diff_bisect(self, text1, text2, deadline):
258 """Find the 'middle snake' of a diff, split the problem in two
258 """Find the 'middle snake' of a diff, split the problem in two
259 and return the recursively constructed diff.
259 and return the recursively constructed diff.
260 See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
260 See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
261
261
262 Args:
262 Args:
263 text1: Old string to be diffed.
263 text1: Old string to be diffed.
264 text2: New string to be diffed.
264 text2: New string to be diffed.
265 deadline: Time at which to bail if not yet complete.
265 deadline: Time at which to bail if not yet complete.
266
266
267 Returns:
267 Returns:
268 Array of diff tuples.
268 Array of diff tuples.
269 """
269 """
270
270
271 # Cache the text lengths to prevent multiple calls.
271 # Cache the text lengths to prevent multiple calls.
272 text1_length = len(text1)
272 text1_length = len(text1)
273 text2_length = len(text2)
273 text2_length = len(text2)
274 max_d = (text1_length + text2_length + 1) // 2
274 max_d = (text1_length + text2_length + 1) // 2
275 v_offset = max_d
275 v_offset = max_d
276 v_length = 2 * max_d
276 v_length = 2 * max_d
277 v1 = [-1] * v_length
277 v1 = [-1] * v_length
278 v1[v_offset + 1] = 0
278 v1[v_offset + 1] = 0
279 v2 = v1[:]
279 v2 = v1[:]
280 delta = text1_length - text2_length
280 delta = text1_length - text2_length
281 # If the total number of characters is odd, then the front path will
281 # If the total number of characters is odd, then the front path will
282 # collide with the reverse path.
282 # collide with the reverse path.
283 front = (delta % 2 != 0)
283 front = delta % 2 != 0
284 # Offsets for start and end of k loop.
284 # Offsets for start and end of k loop.
285 # Prevents mapping of space beyond the grid.
285 # Prevents mapping of space beyond the grid.
286 k1start = 0
286 k1start = 0
287 k1end = 0
287 k1end = 0
288 k2start = 0
288 k2start = 0
289 k2end = 0
289 k2end = 0
290 for d in range(max_d):
290 for d in range(max_d):
291 # Bail out if deadline is reached.
291 # Bail out if deadline is reached.
292 if time.time() > deadline:
292 if time.time() > deadline:
293 break
293 break
294
294
295 # Walk the front path one step.
295 # Walk the front path one step.
296 for k1 in range(-d + k1start, d + 1 - k1end, 2):
296 for k1 in range(-d + k1start, d + 1 - k1end, 2):
297 k1_offset = v_offset + k1
297 k1_offset = v_offset + k1
298 if k1 == -d or (k1 != d and
298 if k1 == -d or (k1 != d and v1[k1_offset - 1] < v1[k1_offset + 1]):
299 v1[k1_offset - 1] < v1[k1_offset + 1]):
300 x1 = v1[k1_offset + 1]
299 x1 = v1[k1_offset + 1]
301 else:
300 else:
302 x1 = v1[k1_offset - 1] + 1
301 x1 = v1[k1_offset - 1] + 1
303 y1 = x1 - k1
302 y1 = x1 - k1
304 while (x1 < text1_length and y1 < text2_length and
303 while (
305 text1[x1] == text2[y1]):
304 x1 < text1_length and y1 < text2_length and text1[x1] == text2[y1]
305 ):
306 x1 += 1
306 x1 += 1
307 y1 += 1
307 y1 += 1
308 v1[k1_offset] = x1
308 v1[k1_offset] = x1
309 if x1 > text1_length:
309 if x1 > text1_length:
310 # Ran off the right of the graph.
310 # Ran off the right of the graph.
311 k1end += 2
311 k1end += 2
312 elif y1 > text2_length:
312 elif y1 > text2_length:
313 # Ran off the bottom of the graph.
313 # Ran off the bottom of the graph.
314 k1start += 2
314 k1start += 2
315 elif front:
315 elif front:
316 k2_offset = v_offset + delta - k1
316 k2_offset = v_offset + delta - k1
317 if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:
317 if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:
318 # Mirror x2 onto top-left coordinate system.
318 # Mirror x2 onto top-left coordinate system.
319 x2 = text1_length - v2[k2_offset]
319 x2 = text1_length - v2[k2_offset]
320 if x1 >= x2:
320 if x1 >= x2:
321 # Overlap detected.
321 # Overlap detected.
322 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
322 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
323
323
324 # Walk the reverse path one step.
324 # Walk the reverse path one step.
325 for k2 in range(-d + k2start, d + 1 - k2end, 2):
325 for k2 in range(-d + k2start, d + 1 - k2end, 2):
326 k2_offset = v_offset + k2
326 k2_offset = v_offset + k2
327 if k2 == -d or (k2 != d and
327 if k2 == -d or (k2 != d and v2[k2_offset - 1] < v2[k2_offset + 1]):
328 v2[k2_offset - 1] < v2[k2_offset + 1]):
329 x2 = v2[k2_offset + 1]
328 x2 = v2[k2_offset + 1]
330 else:
329 else:
331 x2 = v2[k2_offset - 1] + 1
330 x2 = v2[k2_offset - 1] + 1
332 y2 = x2 - k2
331 y2 = x2 - k2
333 while (x2 < text1_length and y2 < text2_length and
332 while (
334 text1[-x2 - 1] == text2[-y2 - 1]):
333 x2 < text1_length
334 and y2 < text2_length
335 and text1[-x2 - 1] == text2[-y2 - 1]
336 ):
335 x2 += 1
337 x2 += 1
336 y2 += 1
338 y2 += 1
337 v2[k2_offset] = x2
339 v2[k2_offset] = x2
338 if x2 > text1_length:
340 if x2 > text1_length:
339 # Ran off the left of the graph.
341 # Ran off the left of the graph.
340 k2end += 2
342 k2end += 2
341 elif y2 > text2_length:
343 elif y2 > text2_length:
342 # Ran off the top of the graph.
344 # Ran off the top of the graph.
343 k2start += 2
345 k2start += 2
344 elif not front:
346 elif not front:
345 k1_offset = v_offset + delta - k2
347 k1_offset = v_offset + delta - k2
346 if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:
348 if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:
347 x1 = v1[k1_offset]
349 x1 = v1[k1_offset]
348 y1 = v_offset + x1 - k1_offset
350 y1 = v_offset + x1 - k1_offset
349 # Mirror x2 onto top-left coordinate system.
351 # Mirror x2 onto top-left coordinate system.
350 x2 = text1_length - x2
352 x2 = text1_length - x2
351 if x1 >= x2:
353 if x1 >= x2:
352 # Overlap detected.
354 # Overlap detected.
353 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
355 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
354
356
355 # Diff took too long and hit the deadline or
357 # Diff took too long and hit the deadline or
356 # number of diffs equals number of characters, no commonality at all.
358 # number of diffs equals number of characters, no commonality at all.
357 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
359 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
358
360
359 def diff_bisectSplit(self, text1, text2, x, y, deadline):
361 def diff_bisectSplit(self, text1, text2, x, y, deadline):
360 """Given the location of the 'middle snake', split the diff in two parts
362 """Given the location of the 'middle snake', split the diff in two parts
361 and recurse.
363 and recurse.
362
364
363 Args:
365 Args:
364 text1: Old string to be diffed.
366 text1: Old string to be diffed.
365 text2: New string to be diffed.
367 text2: New string to be diffed.
366 x: Index of split point in text1.
368 x: Index of split point in text1.
367 y: Index of split point in text2.
369 y: Index of split point in text2.
368 deadline: Time at which to bail if not yet complete.
370 deadline: Time at which to bail if not yet complete.
369
371
370 Returns:
372 Returns:
371 Array of diff tuples.
373 Array of diff tuples.
372 """
374 """
373 text1a = text1[:x]
375 text1a = text1[:x]
374 text2a = text2[:y]
376 text2a = text2[:y]
375 text1b = text1[x:]
377 text1b = text1[x:]
376 text2b = text2[y:]
378 text2b = text2[y:]
377
379
378 # Compute both diffs serially.
380 # Compute both diffs serially.
379 diffs = self.diff_main(text1a, text2a, False, deadline)
381 diffs = self.diff_main(text1a, text2a, False, deadline)
380 diffsb = self.diff_main(text1b, text2b, False, deadline)
382 diffsb = self.diff_main(text1b, text2b, False, deadline)
381
383
382 return diffs + diffsb
384 return diffs + diffsb
383
385
384 def diff_linesToChars(self, text1, text2):
386 def diff_linesToChars(self, text1, text2):
385 """Split two texts into an array of strings. Reduce the texts to a string
387 """Split two texts into an array of strings. Reduce the texts to a string
386 of hashes where each Unicode character represents one line.
388 of hashes where each Unicode character represents one line.
387
389
388 Args:
390 Args:
389 text1: First string.
391 text1: First string.
390 text2: Second string.
392 text2: Second string.
391
393
392 Returns:
394 Returns:
393 Three element tuple, containing the encoded text1, the encoded text2 and
395 Three element tuple, containing the encoded text1, the encoded text2 and
394 the array of unique strings. The zeroth element of the array of unique
396 the array of unique strings. The zeroth element of the array of unique
395 strings is intentionally blank.
397 strings is intentionally blank.
396 """
398 """
397 lineArray = [] # e.g. lineArray[4] == "Hello\n"
399 lineArray = [] # e.g. lineArray[4] == "Hello\n"
398 lineHash = {} # e.g. lineHash["Hello\n"] == 4
400 lineHash = {} # e.g. lineHash["Hello\n"] == 4
399
401
400 # "\x00" is a valid character, but various debuggers don't like it.
402 # "\x00" is a valid character, but various debuggers don't like it.
401 # So we'll insert a junk entry to avoid generating a null character.
403 # So we'll insert a junk entry to avoid generating a null character.
402 lineArray.append('')
404 lineArray.append("")
403
405
404 def diff_linesToCharsMunge(text):
406 def diff_linesToCharsMunge(text):
405 """Split a text into an array of strings. Reduce the texts to a string
407 """Split a text into an array of strings. Reduce the texts to a string
406 of hashes where each Unicode character represents one line.
408 of hashes where each Unicode character represents one line.
407 Modifies linearray and linehash through being a closure.
409 Modifies linearray and linehash through being a closure.
408
410
409 Args:
411 Args:
410 text: String to encode.
412 text: String to encode.
411
413
412 Returns:
414 Returns:
413 Encoded string.
415 Encoded string.
414 """
416 """
415 chars = []
417 chars = []
416 # Walk the text, pulling out a substring for each line.
418 # Walk the text, pulling out a substring for each line.
417 # text.split('\n') would would temporarily double our memory footprint.
419 # text.split('\n') would would temporarily double our memory footprint.
418 # Modifying text would create many large strings to garbage collect.
420 # Modifying text would create many large strings to garbage collect.
419 lineStart = 0
421 lineStart = 0
420 lineEnd = -1
422 lineEnd = -1
421 while lineEnd < len(text) - 1:
423 while lineEnd < len(text) - 1:
422 lineEnd = text.find('\n', lineStart)
424 lineEnd = text.find("\n", lineStart)
423 if lineEnd == -1:
425 if lineEnd == -1:
424 lineEnd = len(text) - 1
426 lineEnd = len(text) - 1
425 line = text[lineStart:lineEnd + 1]
427 line = text[lineStart : lineEnd + 1]
426 lineStart = lineEnd + 1
428 lineStart = lineEnd + 1
427
429
428 if line in lineHash:
430 if line in lineHash:
429 chars.append(unichr(lineHash[line]))
431 chars.append(chr(lineHash[line]))
430 else:
432 else:
431 lineArray.append(line)
433 lineArray.append(line)
432 lineHash[line] = len(lineArray) - 1
434 lineHash[line] = len(lineArray) - 1
433 chars.append(unichr(len(lineArray) - 1))
435 chars.append(chr(len(lineArray) - 1))
434 return "".join(chars)
436 return "".join(chars)
435
437
436 chars1 = diff_linesToCharsMunge(text1)
438 chars1 = diff_linesToCharsMunge(text1)
437 chars2 = diff_linesToCharsMunge(text2)
439 chars2 = diff_linesToCharsMunge(text2)
438 return (chars1, chars2, lineArray)
440 return (chars1, chars2, lineArray)
439
441
440 def diff_charsToLines(self, diffs, lineArray):
442 def diff_charsToLines(self, diffs, lineArray):
441 """Rehydrate the text in a diff from a string of line hashes to real lines
443 """Rehydrate the text in a diff from a string of line hashes to real lines
442 of text.
444 of text.
443
445
444 Args:
446 Args:
445 diffs: Array of diff tuples.
447 diffs: Array of diff tuples.
446 lineArray: Array of unique strings.
448 lineArray: Array of unique strings.
447 """
449 """
448 for x in range(len(diffs)):
450 for x in range(len(diffs)):
449 text = []
451 text = []
450 for char in diffs[x][1]:
452 for char in diffs[x][1]:
451 text.append(lineArray[ord(char)])
453 text.append(lineArray[ord(char)])
452 diffs[x] = (diffs[x][0], "".join(text))
454 diffs[x] = (diffs[x][0], "".join(text))
453
455
454 def diff_commonPrefix(self, text1, text2):
456 def diff_commonPrefix(self, text1, text2):
455 """Determine the common prefix of two strings.
457 """Determine the common prefix of two strings.
456
458
457 Args:
459 Args:
458 text1: First string.
460 text1: First string.
459 text2: Second string.
461 text2: Second string.
460
462
461 Returns:
463 Returns:
462 The number of characters common to the start of each string.
464 The number of characters common to the start of each string.
463 """
465 """
464 # Quick check for common null cases.
466 # Quick check for common null cases.
465 if not text1 or not text2 or text1[0] != text2[0]:
467 if not text1 or not text2 or text1[0] != text2[0]:
466 return 0
468 return 0
467 # Binary search.
469 # Binary search.
468 # Performance analysis: http://neil.fraser.name/news/2007/10/09/
470 # Performance analysis: http://neil.fraser.name/news/2007/10/09/
469 pointermin = 0
471 pointermin = 0
470 pointermax = min(len(text1), len(text2))
472 pointermax = min(len(text1), len(text2))
471 pointermid = pointermax
473 pointermid = pointermax
472 pointerstart = 0
474 pointerstart = 0
473 while pointermin < pointermid:
475 while pointermin < pointermid:
474 if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
476 if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
475 pointermin = pointermid
477 pointermin = pointermid
476 pointerstart = pointermin
478 pointerstart = pointermin
477 else:
479 else:
478 pointermax = pointermid
480 pointermax = pointermid
479 pointermid = (pointermax - pointermin) // 2 + pointermin
481 pointermid = (pointermax - pointermin) // 2 + pointermin
480 return pointermid
482 return pointermid
481
483
def diff_commonSuffix(self, text1, text2):
    """Return the length of the longest common suffix of two strings.

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      The number of characters common to the end of each string.
    """
    # Fast exit: empty input or an immediate last-character mismatch.
    if not text1 or not text2 or text1[-1] != text2[-1]:
        return 0
    # Binary search over the candidate suffix length.
    # Performance analysis: http://neil.fraser.name/news/2007/10/09/
    lo = 0
    hi = min(len(text1), len(text2))
    mid = hi
    end = 0
    while lo < mid:
        if text1[-mid : len(text1) - end] == text2[-mid : len(text2) - end]:
            # The trailing mid characters match; skip them on the next pass.
            lo = mid
            end = lo
        else:
            hi = mid
        mid = (hi - lo) // 2 + lo
    return mid
510
514
def diff_commonOverlap(self, text1, text2):
    """Determine if the suffix of one string is the prefix of another.

    Args:
      text1 First string.
      text2 Second string.

    Returns:
      The number of characters common to the end of the first
      string and the start of the second string.
    """
    # Cache the lengths; the originals are needed for min() below.
    n1 = len(text1)
    n2 = len(text2)
    # Nothing overlaps with an empty string.
    if n1 == 0 or n2 == 0:
        return 0
    # Truncate so both working copies have the same length.
    if n1 > n2:
        text1 = text1[-n2:]
    elif n2 > n1:
        text2 = text2[:n1]
    shortest = min(n1, n2)
    # Quick check for the worst (fully overlapping) case.
    if text1 == text2:
        return shortest
    # Start from a one-character candidate and grow it until the suffix of
    # text1 can no longer be found at the start of text2.
    # Performance analysis: http://neil.fraser.name/news/2010/11/04/
    best = 0
    length = 1
    while True:
        found = text2.find(text1[-length:])
        if found == -1:
            return best
        length += found
        if found == 0 or text1[-length:] == text2[:length]:
            best = length
            length += 1
552
556
def diff_halfMatch(self, text1, text2):
    """Do the two texts share a substring which is at least half the length of
    the longer text?
    This speedup can produce non-minimal diffs.

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      Five element Array, containing the prefix of text1, the suffix of text1,
      the prefix of text2, the suffix of text2 and the common middle.  Or None
      if there was no match.
    """
    if self.Diff_Timeout <= 0:
        # Don't risk returning a non-optimal diff if we have unlimited time.
        return None
    if len(text1) > len(text2):
        (longtext, shorttext) = (text1, text2)
    else:
        (shorttext, longtext) = (text1, text2)
    if len(longtext) < 4 or len(shorttext) * 2 < len(longtext):
        return None  # Pointless.

    def diff_halfMatchI(longtext, shorttext, i):
        """Does a substring of shorttext exist within longtext such that the
        substring is at least half the length of longtext?
        Closure; uses self.diff_commonPrefix and self.diff_commonSuffix from
        the enclosing instance.

        Args:
          longtext: Longer string.
          shorttext: Shorter string.
          i: Start index of quarter length substring within longtext.

        Returns:
          Five element Array, containing the prefix of longtext, the suffix of
          longtext, the prefix of shorttext, the suffix of shorttext and the
          common middle.  Or None if there was no match.
        """
        # Seed with a quarter-length substring of longtext starting at i.
        seed = longtext[i : i + len(longtext) // 4]
        best_common = ""
        j = shorttext.find(seed)
        while j != -1:
            # Grow the match outward from the seed occurrence in both
            # directions and keep the longest combined match found.
            prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
            suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
            if len(best_common) < suffixLength + prefixLength:
                best_common = (
                    shorttext[j - suffixLength : j]
                    + shorttext[j : j + prefixLength]
                )
                best_longtext_a = longtext[: i - suffixLength]
                best_longtext_b = longtext[i + prefixLength :]
                best_shorttext_a = shorttext[: j - suffixLength]
                best_shorttext_b = shorttext[j + prefixLength :]
            j = shorttext.find(seed, j + 1)

        # NOTE: the best_* locals are only read here when best_common is
        # non-empty, which implies the assigning branch above executed.
        if len(best_common) * 2 >= len(longtext):
            return (
                best_longtext_a,
                best_longtext_b,
                best_shorttext_a,
                best_shorttext_b,
                best_common,
            )
        else:
            return None

    # First check if the second quarter is the seed for a half-match.
    hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4)
    # Check again based on the third quarter.
    hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2)
    if not hm1 and not hm2:
        return None
    elif not hm2:
        hm = hm1
    elif not hm1:
        hm = hm2
    else:
        # Both matched.  Select the longest.
        if len(hm1[4]) > len(hm2[4]):
            hm = hm1
        else:
            hm = hm2

    # A half-match was found, sort out the return data.
    # The closure returns (long_a, long_b, short_a, short_b, mid); map the
    # long/short halves back onto text1/text2 depending on which was longer.
    if len(text1) > len(text2):
        (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
    else:
        (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
    return (text1_a, text1_b, text2_a, text2_b, mid_common)
636
647
def diff_cleanupSemantic(self, diffs):
    """Reduce the number of edits by eliminating semantically trivial
    equalities.

    Mutates `diffs` in place; returns None.

    Args:
      diffs: Array of diff tuples.
    """
    changes = False
    equalities = []  # Stack of indices where equalities are found.
    lastequality = None  # Always equal to diffs[equalities[-1]][1]
    pointer = 0  # Index of current position.
    # Number of chars that changed prior to the equality.
    length_insertions1, length_deletions1 = 0, 0
    # Number of chars that changed after the equality.
    length_insertions2, length_deletions2 = 0, 0
    while pointer < len(diffs):
        if diffs[pointer][0] == self.DIFF_EQUAL:  # Equality found.
            equalities.append(pointer)
            # Shift the "after" counters into the "before" slots and restart
            # counting edits that follow this equality.
            length_insertions1, length_insertions2 = length_insertions2, 0
            length_deletions1, length_deletions2 = length_deletions2, 0
            lastequality = diffs[pointer][1]
        else:  # An insertion or deletion.
            if diffs[pointer][0] == self.DIFF_INSERT:
                length_insertions2 += len(diffs[pointer][1])
            else:
                length_deletions2 += len(diffs[pointer][1])
            # Eliminate an equality that is smaller or equal to the edits on both
            # sides of it.
            if (
                lastequality
                and (
                    len(lastequality) <= max(length_insertions1, length_deletions1)
                )
                and (
                    len(lastequality) <= max(length_insertions2, length_deletions2)
                )
            ):
                # Duplicate record.
                diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
                # Change second copy to insert.
                diffs[equalities[-1] + 1] = (
                    self.DIFF_INSERT,
                    diffs[equalities[-1] + 1][1],
                )
                # Throw away the equality we just deleted.
                equalities.pop()
                # Throw away the previous equality (it needs to be reevaluated).
                if len(equalities):
                    equalities.pop()
                if len(equalities):
                    # Rewind to the remaining equality so it gets rechecked.
                    pointer = equalities[-1]
                else:
                    # Rewind to the start (-1 becomes 0 after the increment).
                    pointer = -1
                # Reset the counters.
                length_insertions1, length_deletions1 = 0, 0
                length_insertions2, length_deletions2 = 0, 0
                lastequality = None
                changes = True
        pointer += 1

    # Normalize the diff.
    if changes:
        self.diff_cleanupMerge(diffs)
    self.diff_cleanupSemanticLossless(diffs)

    # Find any overlaps between deletions and insertions.
    # e.g: <del>abcxxx</del><ins>xxxdef</ins>
    #   -> <del>abc</del>xxx<ins>def</ins>
    # e.g: <del>xxxabc</del><ins>defxxx</ins>
    #   -> <ins>def</ins>xxx<del>abc</del>
    # Only extract an overlap if it is as big as the edit ahead or behind it.
    pointer = 1
    while pointer < len(diffs):
        if (
            diffs[pointer - 1][0] == self.DIFF_DELETE
            and diffs[pointer][0] == self.DIFF_INSERT
        ):
            deletion = diffs[pointer - 1][1]
            insertion = diffs[pointer][1]
            # Overlap in both orientations: deletion-suffix/insertion-prefix
            # and insertion-suffix/deletion-prefix.
            overlap_length1 = self.diff_commonOverlap(deletion, insertion)
            overlap_length2 = self.diff_commonOverlap(insertion, deletion)
            if overlap_length1 >= overlap_length2:
                if (
                    overlap_length1 >= len(deletion) / 2.0
                    or overlap_length1 >= len(insertion) / 2.0
                ):
                    # Overlap found.  Insert an equality and trim the surrounding edits.
                    diffs.insert(
                        pointer, (self.DIFF_EQUAL, insertion[:overlap_length1])
                    )
                    diffs[pointer - 1] = (
                        self.DIFF_DELETE,
                        deletion[: len(deletion) - overlap_length1],
                    )
                    diffs[pointer + 1] = (
                        self.DIFF_INSERT,
                        insertion[overlap_length1:],
                    )
                    pointer += 1
            else:
                if (
                    overlap_length2 >= len(deletion) / 2.0
                    or overlap_length2 >= len(insertion) / 2.0
                ):
                    # Reverse overlap found.
                    # Insert an equality and swap and trim the surrounding edits.
                    diffs.insert(
                        pointer, (self.DIFF_EQUAL, deletion[:overlap_length2])
                    )
                    diffs[pointer - 1] = (
                        self.DIFF_INSERT,
                        insertion[: len(insertion) - overlap_length2],
                    )
                    diffs[pointer + 1] = (
                        self.DIFF_DELETE,
                        deletion[overlap_length2:],
                    )
                    pointer += 1
            pointer += 1
        pointer += 1
731
768
def diff_cleanupSemanticLossless(self, diffs):
    """Look for single edits surrounded on both sides by equalities
    which can be shifted sideways to align the edit to a word boundary.
    e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.

    Mutates `diffs` in place; returns None.

    Args:
      diffs: Array of diff tuples.
    """

    def diff_cleanupSemanticScore(one, two):
        """Given two strings, compute a score representing whether the
        internal boundary falls on logical boundaries.
        Scores range from 6 (best) to 0 (worst).
        Closure; reads self.BLANKLINEEND / self.BLANKLINESTART from the
        enclosing instance.

        Args:
          one: First string.
          two: Second string.

        Returns:
          The score.
        """
        if not one or not two:
            # Edges are the best.
            return 6

        # Each port of this function behaves slightly differently due to
        # subtle differences in each language's definition of things like
        # 'whitespace'.  Since this function's purpose is largely cosmetic,
        # the choice has been made to use each language's native features
        # rather than force total conformity.
        char1 = one[-1]
        char2 = two[0]
        nonAlphaNumeric1 = not char1.isalnum()
        nonAlphaNumeric2 = not char2.isalnum()
        whitespace1 = nonAlphaNumeric1 and char1.isspace()
        whitespace2 = nonAlphaNumeric2 and char2.isspace()
        lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n")
        lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n")
        blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one)
        blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two)

        if blankLine1 or blankLine2:
            # Five points for blank lines.
            return 5
        elif lineBreak1 or lineBreak2:
            # Four points for line breaks.
            return 4
        elif nonAlphaNumeric1 and not whitespace1 and whitespace2:
            # Three points for end of sentences.
            return 3
        elif whitespace1 or whitespace2:
            # Two points for whitespace.
            return 2
        elif nonAlphaNumeric1 or nonAlphaNumeric2:
            # One point for non-alphanumeric.
            return 1
        return 0

    pointer = 1
    # Intentionally ignore the first and last element (don't need checking).
    while pointer < len(diffs) - 1:
        if (
            diffs[pointer - 1][0] == self.DIFF_EQUAL
            and diffs[pointer + 1][0] == self.DIFF_EQUAL
        ):
            # This is a single edit surrounded by equalities.
            equality1 = diffs[pointer - 1][1]
            edit = diffs[pointer][1]
            equality2 = diffs[pointer + 1][1]

            # First, shift the edit as far left as possible.
            commonOffset = self.diff_commonSuffix(equality1, edit)
            if commonOffset:
                commonString = edit[-commonOffset:]
                equality1 = equality1[:-commonOffset]
                edit = commonString + edit[:-commonOffset]
                equality2 = commonString + equality2

            # Second, step character by character right, looking for the best fit.
            bestEquality1 = equality1
            bestEdit = edit
            bestEquality2 = equality2
            bestScore = diff_cleanupSemanticScore(
                equality1, edit
            ) + diff_cleanupSemanticScore(edit, equality2)
            # Slide the edit rightward one char at a time while its first
            # char matches the following equality's first char.
            while edit and equality2 and edit[0] == equality2[0]:
                equality1 += edit[0]
                edit = edit[1:] + equality2[0]
                equality2 = equality2[1:]
                score = diff_cleanupSemanticScore(
                    equality1, edit
                ) + diff_cleanupSemanticScore(edit, equality2)
                # The >= encourages trailing rather than leading whitespace on edits.
                if score >= bestScore:
                    bestScore = score
                    bestEquality1 = equality1
                    bestEdit = edit
                    bestEquality2 = equality2

            if diffs[pointer - 1][1] != bestEquality1:
                # We have an improvement, save it back to the diff.
                if bestEquality1:
                    diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
                else:
                    # The leading equality shrank to nothing; drop it.
                    del diffs[pointer - 1]
                    pointer -= 1
                diffs[pointer] = (diffs[pointer][0], bestEdit)
                if bestEquality2:
                    diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
                else:
                    # The trailing equality shrank to nothing; drop it.
                    del diffs[pointer + 1]
                    pointer -= 1
        pointer += 1
842
883
# Define some regex patterns for matching boundaries.
# BLANKLINEEND: a blank line terminating a string (searched against the text
# before an edit boundary in diff_cleanupSemanticScore).
BLANKLINEEND = re.compile(r"\n\r?\n$")
# BLANKLINESTART: a blank line opening a string (matched against the text
# after an edit boundary in diff_cleanupSemanticScore).
BLANKLINESTART = re.compile(r"^\r?\n\r?\n")
846
887
def diff_cleanupEfficiency(self, diffs):
    """Reduce the number of edits by eliminating operationally trivial
    equalities.

    Mutates `diffs` in place; returns None.

    Args:
      diffs: Array of diff tuples.
    """
    changes = False
    equalities = []  # Stack of indices where equalities are found.
    lastequality = None  # Always equal to diffs[equalities[-1]][1]
    pointer = 0  # Index of current position.
    pre_ins = False  # Is there an insertion operation before the last equality.
    pre_del = False  # Is there a deletion operation before the last equality.
    post_ins = False  # Is there an insertion operation after the last equality.
    post_del = False  # Is there a deletion operation after the last equality.
    while pointer < len(diffs):
        if diffs[pointer][0] == self.DIFF_EQUAL:  # Equality found.
            # An equality is a removal candidate only if it is shorter than
            # the edit cost and is preceded by at least one edit.
            if len(diffs[pointer][1]) < self.Diff_EditCost and (
                post_ins or post_del
            ):
                # Candidate found.
                equalities.append(pointer)
                pre_ins = post_ins
                pre_del = post_del
                lastequality = diffs[pointer][1]
            else:
                # Not a candidate, and can never become one.
                equalities = []
                lastequality = None

            post_ins = post_del = False
        else:  # An insertion or deletion.
            if diffs[pointer][0] == self.DIFF_DELETE:
                post_del = True
            else:
                post_ins = True

            # Five types to be split:
            # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
            # <ins>A</ins>X<ins>C</ins><del>D</del>
            # <ins>A</ins><del>B</del>X<ins>C</ins>
            # <ins>A</del>X<ins>C</ins><del>D</del>
            # <ins>A</ins><del>B</del>X<del>C</del>

            # Split when surrounded by edits on all four sides, or when the
            # equality is very short and surrounded on three of the four
            # (bools sum as ints here).
            if lastequality and (
                (pre_ins and pre_del and post_ins and post_del)
                or (
                    (len(lastequality) < self.Diff_EditCost / 2)
                    and (pre_ins + pre_del + post_ins + post_del) == 3
                )
            ):
                # Duplicate record.
                diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
                # Change second copy to insert.
                diffs[equalities[-1] + 1] = (
                    self.DIFF_INSERT,
                    diffs[equalities[-1] + 1][1],
                )
                equalities.pop()  # Throw away the equality we just deleted.
                lastequality = None
                if pre_ins and pre_del:
                    # No changes made which could affect previous entry, keep going.
                    post_ins = post_del = True
                    equalities = []
                else:
                    if len(equalities):
                        equalities.pop()  # Throw away the previous equality.
                    if len(equalities):
                        # Rewind to recheck the remaining candidate.
                        pointer = equalities[-1]
                    else:
                        # Rewind to the start (-1 becomes 0 after increment).
                        pointer = -1
                    post_ins = post_del = False
                changes = True
        pointer += 1

    # Re-merge adjacent edits produced by the splits above.
    if changes:
        self.diff_cleanupMerge(diffs)
917
965
918 def diff_cleanupMerge(self, diffs):
966 def diff_cleanupMerge(self, diffs):
919 """Reorder and merge like edit sections. Merge equalities.
967 """Reorder and merge like edit sections. Merge equalities.
920 Any edit section can move as long as it doesn't cross an equality.
968 Any edit section can move as long as it doesn't cross an equality.
921
969
922 Args:
970 Args:
923 diffs: Array of diff tuples.
971 diffs: Array of diff tuples.
924 """
972 """
925 diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end.
973 diffs.append((self.DIFF_EQUAL, "")) # Add a dummy entry at the end.
926 pointer = 0
974 pointer = 0
927 count_delete = 0
975 count_delete = 0
928 count_insert = 0
976 count_insert = 0
929 text_delete = ''
977 text_delete = ""
930 text_insert = ''
978 text_insert = ""
931 while pointer < len(diffs):
979 while pointer < len(diffs):
932 if diffs[pointer][0] == self.DIFF_INSERT:
980 if diffs[pointer][0] == self.DIFF_INSERT:
933 count_insert += 1
981 count_insert += 1
934 text_insert += diffs[pointer][1]
982 text_insert += diffs[pointer][1]
935 pointer += 1
983 pointer += 1
936 elif diffs[pointer][0] == self.DIFF_DELETE:
984 elif diffs[pointer][0] == self.DIFF_DELETE:
937 count_delete += 1
985 count_delete += 1
938 text_delete += diffs[pointer][1]
986 text_delete += diffs[pointer][1]
939 pointer += 1
987 pointer += 1
940 elif diffs[pointer][0] == self.DIFF_EQUAL:
988 elif diffs[pointer][0] == self.DIFF_EQUAL:
941 # Upon reaching an equality, check for prior redundancies.
989 # Upon reaching an equality, check for prior redundancies.
942 if count_delete + count_insert > 1:
990 if count_delete + count_insert > 1:
943 if count_delete != 0 and count_insert != 0:
991 if count_delete != 0 and count_insert != 0:
944 # Factor out any common prefixies.
992 # Factor out any common prefixies.
945 commonlength = self.diff_commonPrefix(text_insert, text_delete)
993 commonlength = self.diff_commonPrefix(text_insert, text_delete)
946 if commonlength != 0:
994 if commonlength != 0:
947 x = pointer - count_delete - count_insert - 1
995 x = pointer - count_delete - count_insert - 1
948 if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
996 if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
949 diffs[x] = (diffs[x][0], diffs[x][1] +
997 diffs[x] = (
950 text_insert[:commonlength])
998 diffs[x][0],
999 diffs[x][1] + text_insert[:commonlength],
1000 )
951 else:
1001 else:
952 diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))
1002 diffs.insert(
1003 0, (self.DIFF_EQUAL, text_insert[:commonlength])
1004 )
953 pointer += 1
1005 pointer += 1
954 text_insert = text_insert[commonlength:]
1006 text_insert = text_insert[commonlength:]
955 text_delete = text_delete[commonlength:]
1007 text_delete = text_delete[commonlength:]
956 # Factor out any common suffixies.
1008 # Factor out any common suffixies.
957 commonlength = self.diff_commonSuffix(text_insert, text_delete)
1009 commonlength = self.diff_commonSuffix(text_insert, text_delete)
958 if commonlength != 0:
1010 if commonlength != 0:
959 diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
1011 diffs[pointer] = (
960 diffs[pointer][1])
1012 diffs[pointer][0],
1013 text_insert[-commonlength:] + diffs[pointer][1],
1014 )
961 text_insert = text_insert[:-commonlength]
1015 text_insert = text_insert[:-commonlength]
962 text_delete = text_delete[:-commonlength]
1016 text_delete = text_delete[:-commonlength]
963 # Delete the offending records and add the merged ones.
1017 # Delete the offending records and add the merged ones.
964 if count_delete == 0:
1018 if count_delete == 0:
965 diffs[pointer - count_insert : pointer] = [
1019 diffs[pointer - count_insert : pointer] = [
966 (self.DIFF_INSERT, text_insert)]
1020 (self.DIFF_INSERT, text_insert)
1021 ]
967 elif count_insert == 0:
1022 elif count_insert == 0:
968 diffs[pointer - count_delete : pointer] = [
1023 diffs[pointer - count_delete : pointer] = [
969 (self.DIFF_DELETE, text_delete)]
1024 (self.DIFF_DELETE, text_delete)
1025 ]
970 else:
1026 else:
971 diffs[pointer - count_delete - count_insert : pointer] = [
1027 diffs[pointer - count_delete - count_insert : pointer] = [
972 (self.DIFF_DELETE, text_delete),
1028 (self.DIFF_DELETE, text_delete),
973 (self.DIFF_INSERT, text_insert)]
1029 (self.DIFF_INSERT, text_insert),
1030 ]
974 pointer = pointer - count_delete - count_insert + 1
1031 pointer = pointer - count_delete - count_insert + 1
975 if count_delete != 0:
1032 if count_delete != 0:
976 pointer += 1
1033 pointer += 1
977 if count_insert != 0:
1034 if count_insert != 0:
978 pointer += 1
1035 pointer += 1
979 elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
1036 elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
980 # Merge this equality with the previous one.
1037 # Merge this equality with the previous one.
981 diffs[pointer - 1] = (diffs[pointer - 1][0],
1038 diffs[pointer - 1] = (
982 diffs[pointer - 1][1] + diffs[pointer][1])
1039 diffs[pointer - 1][0],
1040 diffs[pointer - 1][1] + diffs[pointer][1],
1041 )
983 del diffs[pointer]
1042 del diffs[pointer]
984 else:
1043 else:
985 pointer += 1
1044 pointer += 1
986
1045
987 count_insert = 0
1046 count_insert = 0
988 count_delete = 0
1047 count_delete = 0
989 text_delete = ''
1048 text_delete = ""
990 text_insert = ''
1049 text_insert = ""
991
1050
992 if diffs[-1][1] == '':
1051 if diffs[-1][1] == "":
993 diffs.pop() # Remove the dummy entry at the end.
1052 diffs.pop() # Remove the dummy entry at the end.
994
1053
995 # Second pass: look for single edits surrounded on both sides by equalities
1054 # Second pass: look for single edits surrounded on both sides by equalities
996 # which can be shifted sideways to eliminate an equality.
1055 # which can be shifted sideways to eliminate an equality.
997 # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
1056 # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
998 changes = False
1057 changes = False
999 pointer = 1
1058 pointer = 1
1000 # Intentionally ignore the first and last element (don't need checking).
1059 # Intentionally ignore the first and last element (don't need checking).
1001 while pointer < len(diffs) - 1:
1060 while pointer < len(diffs) - 1:
1002 if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
1061 if (
1003 diffs[pointer + 1][0] == self.DIFF_EQUAL):
1062 diffs[pointer - 1][0] == self.DIFF_EQUAL
1063 and diffs[pointer + 1][0] == self.DIFF_EQUAL
1064 ):
1004 # This is a single edit surrounded by equalities.
1065 # This is a single edit surrounded by equalities.
1005 if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
1066 if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
1006 # Shift the edit over the previous equality.
1067 # Shift the edit over the previous equality.
1007 diffs[pointer] = (diffs[pointer][0],
1068 diffs[pointer] = (
1008 diffs[pointer - 1][1] +
1069 diffs[pointer][0],
1009 diffs[pointer][1][:-len(diffs[pointer - 1][1])])
1070 diffs[pointer - 1][1]
1010 diffs[pointer + 1] = (diffs[pointer + 1][0],
1071 + diffs[pointer][1][: -len(diffs[pointer - 1][1])],
1011 diffs[pointer - 1][1] + diffs[pointer + 1][1])
1072 )
1073 diffs[pointer + 1] = (
1074 diffs[pointer + 1][0],
1075 diffs[pointer - 1][1] + diffs[pointer + 1][1],
1076 )
1012 del diffs[pointer - 1]
1077 del diffs[pointer - 1]
1013 changes = True
1078 changes = True
1014 elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
1079 elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
1015 # Shift the edit over the next equality.
1080 # Shift the edit over the next equality.
1016 diffs[pointer - 1] = (diffs[pointer - 1][0],
1081 diffs[pointer - 1] = (
1017 diffs[pointer - 1][1] + diffs[pointer + 1][1])
1082 diffs[pointer - 1][0],
1018 diffs[pointer] = (diffs[pointer][0],
1083 diffs[pointer - 1][1] + diffs[pointer + 1][1],
1019 diffs[pointer][1][len(diffs[pointer + 1][1]):] +
1084 )
1020 diffs[pointer + 1][1])
1085 diffs[pointer] = (
1086 diffs[pointer][0],
1087 diffs[pointer][1][len(diffs[pointer + 1][1]) :]
1088 + diffs[pointer + 1][1],
1089 )
1021 del diffs[pointer + 1]
1090 del diffs[pointer + 1]
1022 changes = True
1091 changes = True
1023 pointer += 1
1092 pointer += 1
1024
1093
1025 # If shifts were made, the diff needs reordering and another shift sweep.
1094 # If shifts were made, the diff needs reordering and another shift sweep.
1026 if changes:
1095 if changes:
1027 self.diff_cleanupMerge(diffs)
1096 self.diff_cleanupMerge(diffs)
1028
1097
1029 def diff_xIndex(self, diffs, loc):
1098 def diff_xIndex(self, diffs, loc):
1030 """loc is a location in text1, compute and return the equivalent location
1099 """loc is a location in text1, compute and return the equivalent location
1031 in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
1100 in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
1032
1101
1033 Args:
1102 Args:
1034 diffs: Array of diff tuples.
1103 diffs: Array of diff tuples.
1035 loc: Location within text1.
1104 loc: Location within text1.
1036
1105
1037 Returns:
1106 Returns:
1038 Location within text2.
1107 Location within text2.
1039 """
1108 """
1040 chars1 = 0
1109 chars1 = 0
1041 chars2 = 0
1110 chars2 = 0
1042 last_chars1 = 0
1111 last_chars1 = 0
1043 last_chars2 = 0
1112 last_chars2 = 0
1044 for x in range(len(diffs)):
1113 for x in range(len(diffs)):
1045 (op, text) = diffs[x]
1114 (op, text) = diffs[x]
1046 if op != self.DIFF_INSERT: # Equality or deletion.
1115 if op != self.DIFF_INSERT: # Equality or deletion.
1047 chars1 += len(text)
1116 chars1 += len(text)
1048 if op != self.DIFF_DELETE: # Equality or insertion.
1117 if op != self.DIFF_DELETE: # Equality or insertion.
1049 chars2 += len(text)
1118 chars2 += len(text)
1050 if chars1 > loc: # Overshot the location.
1119 if chars1 > loc: # Overshot the location.
1051 break
1120 break
1052 last_chars1 = chars1
1121 last_chars1 = chars1
1053 last_chars2 = chars2
1122 last_chars2 = chars2
1054
1123
1055 if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
1124 if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
1056 # The location was deleted.
1125 # The location was deleted.
1057 return last_chars2
1126 return last_chars2
1058 # Add the remaining len(character).
1127 # Add the remaining len(character).
1059 return last_chars2 + (loc - last_chars1)
1128 return last_chars2 + (loc - last_chars1)
1060
1129
1061 def diff_prettyHtml(self, diffs):
1130 def diff_prettyHtml(self, diffs):
1062 """Convert a diff array into a pretty HTML report.
1131 """Convert a diff array into a pretty HTML report.
1063
1132
1064 Args:
1133 Args:
1065 diffs: Array of diff tuples.
1134 diffs: Array of diff tuples.
1066
1135
1067 Returns:
1136 Returns:
1068 HTML representation.
1137 HTML representation.
1069 """
1138 """
1070 html = []
1139 html = []
1071 for (op, data) in diffs:
1140 for op, data in diffs:
1072 text = (data.replace("&", "&amp;").replace("<", "&lt;")
1141 text = (
1073 .replace(">", "&gt;").replace("\n", "&para;<br>"))
1142 data.replace("&", "&amp;")
1143 .replace("<", "&lt;")
1144 .replace(">", "&gt;")
1145 .replace("\n", "&para;<br>")
1146 )
1074 if op == self.DIFF_INSERT:
1147 if op == self.DIFF_INSERT:
1075 html.append("<ins style=\"background:#e6ffe6;\">%s</ins>" % text)
1148 html.append('<ins style="background:#e6ffe6;">%s</ins>' % text)
1076 elif op == self.DIFF_DELETE:
1149 elif op == self.DIFF_DELETE:
1077 html.append("<del style=\"background:#ffe6e6;\">%s</del>" % text)
1150 html.append('<del style="background:#ffe6e6;">%s</del>' % text)
1078 elif op == self.DIFF_EQUAL:
1151 elif op == self.DIFF_EQUAL:
1079 html.append("<span>%s</span>" % text)
1152 html.append("<span>%s</span>" % text)
1080 return "".join(html)
1153 return "".join(html)
1081
1154
1082 def diff_text1(self, diffs):
1155 def diff_text1(self, diffs):
1083 """Compute and return the source text (all equalities and deletions).
1156 """Compute and return the source text (all equalities and deletions).
1084
1157
1085 Args:
1158 Args:
1086 diffs: Array of diff tuples.
1159 diffs: Array of diff tuples.
1087
1160
1088 Returns:
1161 Returns:
1089 Source text.
1162 Source text.
1090 """
1163 """
1091 text = []
1164 text = []
1092 for (op, data) in diffs:
1165 for op, data in diffs:
1093 if op != self.DIFF_INSERT:
1166 if op != self.DIFF_INSERT:
1094 text.append(data)
1167 text.append(data)
1095 return "".join(text)
1168 return "".join(text)
1096
1169
1097 def diff_text2(self, diffs):
1170 def diff_text2(self, diffs):
1098 """Compute and return the destination text (all equalities and insertions).
1171 """Compute and return the destination text (all equalities and insertions).
1099
1172
1100 Args:
1173 Args:
1101 diffs: Array of diff tuples.
1174 diffs: Array of diff tuples.
1102
1175
1103 Returns:
1176 Returns:
1104 Destination text.
1177 Destination text.
1105 """
1178 """
1106 text = []
1179 text = []
1107 for (op, data) in diffs:
1180 for op, data in diffs:
1108 if op != self.DIFF_DELETE:
1181 if op != self.DIFF_DELETE:
1109 text.append(data)
1182 text.append(data)
1110 return "".join(text)
1183 return "".join(text)
1111
1184
1112 def diff_levenshtein(self, diffs):
1185 def diff_levenshtein(self, diffs):
1113 """Compute the Levenshtein distance; the number of inserted, deleted or
1186 """Compute the Levenshtein distance; the number of inserted, deleted or
1114 substituted characters.
1187 substituted characters.
1115
1188
1116 Args:
1189 Args:
1117 diffs: Array of diff tuples.
1190 diffs: Array of diff tuples.
1118
1191
1119 Returns:
1192 Returns:
1120 Number of changes.
1193 Number of changes.
1121 """
1194 """
1122 levenshtein = 0
1195 levenshtein = 0
1123 insertions = 0
1196 insertions = 0
1124 deletions = 0
1197 deletions = 0
1125 for (op, data) in diffs:
1198 for op, data in diffs:
1126 if op == self.DIFF_INSERT:
1199 if op == self.DIFF_INSERT:
1127 insertions += len(data)
1200 insertions += len(data)
1128 elif op == self.DIFF_DELETE:
1201 elif op == self.DIFF_DELETE:
1129 deletions += len(data)
1202 deletions += len(data)
1130 elif op == self.DIFF_EQUAL:
1203 elif op == self.DIFF_EQUAL:
1131 # A deletion and an insertion is one substitution.
1204 # A deletion and an insertion is one substitution.
1132 levenshtein += max(insertions, deletions)
1205 levenshtein += max(insertions, deletions)
1133 insertions = 0
1206 insertions = 0
1134 deletions = 0
1207 deletions = 0
1135 levenshtein += max(insertions, deletions)
1208 levenshtein += max(insertions, deletions)
1136 return levenshtein
1209 return levenshtein
1137
1210
1138 def diff_toDelta(self, diffs):
1211 def diff_toDelta(self, diffs):
1139 """Crush the diff into an encoded string which describes the operations
1212 """Crush the diff into an encoded string which describes the operations
1140 required to transform text1 into text2.
1213 required to transform text1 into text2.
1141 E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1214 E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1142 Operations are tab-separated. Inserted text is escaped using %xx notation.
1215 Operations are tab-separated. Inserted text is escaped using %xx notation.
1143
1216
1144 Args:
1217 Args:
1145 diffs: Array of diff tuples.
1218 diffs: Array of diff tuples.
1146
1219
1147 Returns:
1220 Returns:
1148 Delta text.
1221 Delta text.
1149 """
1222 """
1150 text = []
1223 text = []
1151 for (op, data) in diffs:
1224 for op, data in diffs:
1152 if op == self.DIFF_INSERT:
1225 if op == self.DIFF_INSERT:
1153 # High ascii will raise UnicodeDecodeError. Use Unicode instead.
1226 # High ascii will raise UnicodeDecodeError. Use Unicode instead.
1154 data = data.encode("utf-8")
1227 data = data.encode("utf-8")
1155 text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
1228 text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
1156 elif op == self.DIFF_DELETE:
1229 elif op == self.DIFF_DELETE:
1157 text.append("-%d" % len(data))
1230 text.append("-%d" % len(data))
1158 elif op == self.DIFF_EQUAL:
1231 elif op == self.DIFF_EQUAL:
1159 text.append("=%d" % len(data))
1232 text.append("=%d" % len(data))
1160 return "\t".join(text)
1233 return "\t".join(text)
1161
1234
1162 def diff_fromDelta(self, text1, delta):
1235 def diff_fromDelta(self, text1, delta):
1163 """Given the original text1, and an encoded string which describes the
1236 """Given the original text1, and an encoded string which describes the
1164 operations required to transform text1 into text2, compute the full diff.
1237 operations required to transform text1 into text2, compute the full diff.
1165
1238
1166 Args:
1239 Args:
1167 text1: Source string for the diff.
1240 text1: Source string for the diff.
1168 delta: Delta text.
1241 delta: Delta text.
1169
1242
1170 Returns:
1243 Returns:
1171 Array of diff tuples.
1244 Array of diff tuples.
1172
1245
1173 Raises:
1246 Raises:
1174 ValueError: If invalid input.
1247 ValueError: If invalid input.
1175 """
1248 """
1176 if type(delta) == unicode:
1249 if type(delta) == str:
1177 # Deltas should be composed of a subset of ascii chars, Unicode not
1250 # Deltas should be composed of a subset of ascii chars, Unicode not
1178 # required. If this encode raises UnicodeEncodeError, delta is invalid.
1251 # required. If this encode raises UnicodeEncodeError, delta is invalid.
1179 delta = delta.encode("ascii")
1252 delta = delta.encode("ascii")
1180 diffs = []
1253 diffs = []
1181 pointer = 0 # Cursor in text1
1254 pointer = 0 # Cursor in text1
1182 tokens = delta.split("\t")
1255 tokens = delta.split("\t")
1183 for token in tokens:
1256 for token in tokens:
1184 if token == "":
1257 if token == "":
1185 # Blank tokens are ok (from a trailing \t).
1258 # Blank tokens are ok (from a trailing \t).
1186 continue
1259 continue
1187 # Each token begins with a one character parameter which specifies the
1260 # Each token begins with a one character parameter which specifies the
1188 # operation of this token (delete, insert, equality).
1261 # operation of this token (delete, insert, equality).
1189 param = token[1:]
1262 param = token[1:]
1190 if token[0] == "+":
1263 if token[0] == "+":
1191 param = urllib.parse.unquote(param).decode("utf-8")
1264 param = urllib.parse.unquote(param)
1192 diffs.append((self.DIFF_INSERT, param))
1265 diffs.append((self.DIFF_INSERT, param))
1193 elif token[0] == "-" or token[0] == "=":
1266 elif token[0] == "-" or token[0] == "=":
1194 try:
1267 try:
1195 n = int(param)
1268 n = int(param)
1196 except ValueError:
1269 except ValueError:
1197 raise ValueError("Invalid number in diff_fromDelta: " + param)
1270 raise ValueError("Invalid number in diff_fromDelta: " + param)
1198 if n < 0:
1271 if n < 0:
1199 raise ValueError("Negative number in diff_fromDelta: " + param)
1272 raise ValueError("Negative number in diff_fromDelta: " + param)
1200 text = text1[pointer : pointer + n]
1273 text = text1[pointer : pointer + n]
1201 pointer += n
1274 pointer += n
1202 if token[0] == "=":
1275 if token[0] == "=":
1203 diffs.append((self.DIFF_EQUAL, text))
1276 diffs.append((self.DIFF_EQUAL, text))
1204 else:
1277 else:
1205 diffs.append((self.DIFF_DELETE, text))
1278 diffs.append((self.DIFF_DELETE, text))
1206 else:
1279 else:
1207 # Anything else is an error.
1280 # Anything else is an error.
1208 raise ValueError("Invalid diff operation in diff_fromDelta: " +
1281 raise ValueError(
1209 token[0])
1282 "Invalid diff operation in diff_fromDelta: " + token[0]
1283 )
1210 if pointer != len(text1):
1284 if pointer != len(text1):
1211 raise ValueError(
1285 raise ValueError(
1212 "Delta length (%d) does not equal source text length (%d)." %
1286 "Delta length (%d) does not equal source text length (%d)."
1213 (pointer, len(text1)))
1287 % (pointer, len(text1))
1288 )
1214 return diffs
1289 return diffs
1215
1290
1216 # MATCH FUNCTIONS
1291 # MATCH FUNCTIONS
1217
1292
1218 def match_main(self, text, pattern, loc):
1293 def match_main(self, text, pattern, loc):
1219 """Locate the best instance of 'pattern' in 'text' near 'loc'.
1294 """Locate the best instance of 'pattern' in 'text' near 'loc'.
1220
1295
1221 Args:
1296 Args:
1222 text: The text to search.
1297 text: The text to search.
1223 pattern: The pattern to search for.
1298 pattern: The pattern to search for.
1224 loc: The location to search around.
1299 loc: The location to search around.
1225
1300
1226 Returns:
1301 Returns:
1227 Best match index or -1.
1302 Best match index or -1.
1228 """
1303 """
1229 # Check for null inputs.
1304 # Check for null inputs.
1230 if text is None or pattern is None:
1305 if text is None or pattern is None:
1231 raise ValueError("Null inputs. (match_main)")
1306 raise ValueError("Null inputs. (match_main)")
1232
1307
1233 loc = max(0, min(loc, len(text)))
1308 loc = max(0, min(loc, len(text)))
1234 if text == pattern:
1309 if text == pattern:
1235 # Shortcut (potentially not guaranteed by the algorithm)
1310 # Shortcut (potentially not guaranteed by the algorithm)
1236 return 0
1311 return 0
1237 elif not text:
1312 elif not text:
1238 # Nothing to match.
1313 # Nothing to match.
1239 return -1
1314 return -1
1240 elif text[loc:loc + len(pattern)] == pattern:
1315 elif text[loc : loc + len(pattern)] == pattern:
1241 # Perfect match at the perfect spot! (Includes case of null pattern)
1316 # Perfect match at the perfect spot! (Includes case of null pattern)
1242 return loc
1317 return loc
1243 else:
1318 else:
1244 # Do a fuzzy compare.
1319 # Do a fuzzy compare.
1245 match = self.match_bitap(text, pattern, loc)
1320 match = self.match_bitap(text, pattern, loc)
1246 return match
1321 return match
1247
1322
1248 def match_bitap(self, text, pattern, loc):
1323 def match_bitap(self, text, pattern, loc):
1249 """Locate the best instance of 'pattern' in 'text' near 'loc' using the
1324 """Locate the best instance of 'pattern' in 'text' near 'loc' using the
1250 Bitap algorithm.
1325 Bitap algorithm.
1251
1326
1252 Args:
1327 Args:
1253 text: The text to search.
1328 text: The text to search.
1254 pattern: The pattern to search for.
1329 pattern: The pattern to search for.
1255 loc: The location to search around.
1330 loc: The location to search around.
1256
1331
1257 Returns:
1332 Returns:
1258 Best match index or -1.
1333 Best match index or -1.
1259 """
1334 """
1260 # Python doesn't have a maxint limit, so ignore this check.
1335 # Python doesn't have a maxint limit, so ignore this check.
1261 #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1336 # if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1262 # raise ValueError("Pattern too long for this application.")
1337 # raise ValueError("Pattern too long for this application.")
1263
1338
1264 # Initialise the alphabet.
1339 # Initialise the alphabet.
1265 s = self.match_alphabet(pattern)
1340 s = self.match_alphabet(pattern)
1266
1341
1267 def match_bitapScore(e, x):
1342 def match_bitapScore(e, x):
1268 """Compute and return the score for a match with e errors and x location.
1343 """Compute and return the score for a match with e errors and x location.
1269 Accesses loc and pattern through being a closure.
1344 Accesses loc and pattern through being a closure.
1270
1345
1271 Args:
1346 Args:
1272 e: Number of errors in match.
1347 e: Number of errors in match.
1273 x: Location of match.
1348 x: Location of match.
1274
1349
1275 Returns:
1350 Returns:
1276 Overall score for match (0.0 = good, 1.0 = bad).
1351 Overall score for match (0.0 = good, 1.0 = bad).
1277 """
1352 """
1278 accuracy = float(e) / len(pattern)
1353 accuracy = float(e) / len(pattern)
1279 proximity = abs(loc - x)
1354 proximity = abs(loc - x)
1280 if not self.Match_Distance:
1355 if not self.Match_Distance:
1281 # Dodge divide by zero error.
1356 # Dodge divide by zero error.
1282 return proximity and 1.0 or accuracy
1357 return proximity and 1.0 or accuracy
1283 return accuracy + (proximity / float(self.Match_Distance))
1358 return accuracy + (proximity / float(self.Match_Distance))
1284
1359
1285 # Highest score beyond which we give up.
1360 # Highest score beyond which we give up.
1286 score_threshold = self.Match_Threshold
1361 score_threshold = self.Match_Threshold
1287 # Is there a nearby exact match? (speedup)
1362 # Is there a nearby exact match? (speedup)
1288 best_loc = text.find(pattern, loc)
1363 best_loc = text.find(pattern, loc)
1289 if best_loc != -1:
1364 if best_loc != -1:
1290 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1365 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1291 # What about in the other direction? (speedup)
1366 # What about in the other direction? (speedup)
1292 best_loc = text.rfind(pattern, loc + len(pattern))
1367 best_loc = text.rfind(pattern, loc + len(pattern))
1293 if best_loc != -1:
1368 if best_loc != -1:
1294 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1369 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1295
1370
1296 # Initialise the bit arrays.
1371 # Initialise the bit arrays.
1297 matchmask = 1 << (len(pattern) - 1)
1372 matchmask = 1 << (len(pattern) - 1)
1298 best_loc = -1
1373 best_loc = -1
1299
1374
1300 bin_max = len(pattern) + len(text)
1375 bin_max = len(pattern) + len(text)
1301 # Empty initialization added to appease pychecker.
1376 # Empty initialization added to appease pychecker.
1302 last_rd = None
1377 last_rd = None
1303 for d in range(len(pattern)):
1378 for d in range(len(pattern)):
1304 # Scan for the best match each iteration allows for one more error.
1379 # Scan for the best match each iteration allows for one more error.
1305 # Run a binary search to determine how far from 'loc' we can stray at
1380 # Run a binary search to determine how far from 'loc' we can stray at
1306 # this error level.
1381 # this error level.
1307 bin_min = 0
1382 bin_min = 0
1308 bin_mid = bin_max
1383 bin_mid = bin_max
1309 while bin_min < bin_mid:
1384 while bin_min < bin_mid:
1310 if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1385 if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1311 bin_min = bin_mid
1386 bin_min = bin_mid
1312 else:
1387 else:
1313 bin_max = bin_mid
1388 bin_max = bin_mid
1314 bin_mid = (bin_max - bin_min) // 2 + bin_min
1389 bin_mid = (bin_max - bin_min) // 2 + bin_min
1315
1390
1316 # Use the result from this iteration as the maximum for the next.
1391 # Use the result from this iteration as the maximum for the next.
1317 bin_max = bin_mid
1392 bin_max = bin_mid
1318 start = max(1, loc - bin_mid + 1)
1393 start = max(1, loc - bin_mid + 1)
1319 finish = min(loc + bin_mid, len(text)) + len(pattern)
1394 finish = min(loc + bin_mid, len(text)) + len(pattern)
1320
1395
1321 rd = [0] * (finish + 2)
1396 rd = [0] * (finish + 2)
1322 rd[finish + 1] = (1 << d) - 1
1397 rd[finish + 1] = (1 << d) - 1
1323 for j in range(finish, start - 1, -1):
1398 for j in range(finish, start - 1, -1):
1324 if len(text) <= j - 1:
1399 if len(text) <= j - 1:
1325 # Out of range.
1400 # Out of range.
1326 charMatch = 0
1401 charMatch = 0
1327 else:
1402 else:
1328 charMatch = s.get(text[j - 1], 0)
1403 charMatch = s.get(text[j - 1], 0)
1329 if d == 0: # First pass: exact match.
1404 if d == 0: # First pass: exact match.
1330 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1405 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1331 else: # Subsequent passes: fuzzy match.
1406 else: # Subsequent passes: fuzzy match.
1332 rd[j] = (((rd[j + 1] << 1) | 1) & charMatch) | (
1407 rd[j] = (
1333 ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
1408 (((rd[j + 1] << 1) | 1) & charMatch)
1409 | (((last_rd[j + 1] | last_rd[j]) << 1) | 1)
1410 | last_rd[j + 1]
1411 )
1334 if rd[j] & matchmask:
1412 if rd[j] & matchmask:
1335 score = match_bitapScore(d, j - 1)
1413 score = match_bitapScore(d, j - 1)
1336 # This match will almost certainly be better than any existing match.
1414 # This match will almost certainly be better than any existing match.
1337 # But check anyway.
1415 # But check anyway.
1338 if score <= score_threshold:
1416 if score <= score_threshold:
1339 # Told you so.
1417 # Told you so.
1340 score_threshold = score
1418 score_threshold = score
1341 best_loc = j - 1
1419 best_loc = j - 1
1342 if best_loc > loc:
1420 if best_loc > loc:
1343 # When passing loc, don't exceed our current distance from loc.
1421 # When passing loc, don't exceed our current distance from loc.
1344 start = max(1, 2 * loc - best_loc)
1422 start = max(1, 2 * loc - best_loc)
1345 else:
1423 else:
1346 # Already passed loc, downhill from here on in.
1424 # Already passed loc, downhill from here on in.
1347 break
1425 break
1348 # No hope for a (better) match at greater error levels.
1426 # No hope for a (better) match at greater error levels.
1349 if match_bitapScore(d + 1, loc) > score_threshold:
1427 if match_bitapScore(d + 1, loc) > score_threshold:
1350 break
1428 break
1351 last_rd = rd
1429 last_rd = rd
1352 return best_loc
1430 return best_loc
1353
1431
1354 def match_alphabet(self, pattern):
1432 def match_alphabet(self, pattern):
1355 """Initialise the alphabet for the Bitap algorithm.
1433 """Initialise the alphabet for the Bitap algorithm.
1356
1434
1357 Args:
1435 Args:
1358 pattern: The text to encode.
1436 pattern: The text to encode.
1359
1437
1360 Returns:
1438 Returns:
1361 Hash of character locations.
1439 Hash of character locations.
1362 """
1440 """
1363 s = {}
1441 s = {}
1364 for char in pattern:
1442 for char in pattern:
1365 s[char] = 0
1443 s[char] = 0
1366 for i in range(len(pattern)):
1444 for i in range(len(pattern)):
1367 s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1445 s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1368 return s
1446 return s
1369
1447
1370 # PATCH FUNCTIONS
1448 # PATCH FUNCTIONS
1371
1449
1372 def patch_addContext(self, patch, text):
1450 def patch_addContext(self, patch, text):
1373 """Increase the context until it is unique,
1451 """Increase the context until it is unique,
1374 but don't let the pattern expand beyond Match_MaxBits.
1452 but don't let the pattern expand beyond Match_MaxBits.
1375
1453
1376 Args:
1454 Args:
1377 patch: The patch to grow.
1455 patch: The patch to grow.
1378 text: Source text.
1456 text: Source text.
1379 """
1457 """
1380 if len(text) == 0:
1458 if len(text) == 0:
1381 return
1459 return
1382 pattern = text[patch.start2 : patch.start2 + patch.length1]
1460 pattern = text[patch.start2 : patch.start2 + patch.length1]
1383 padding = 0
1461 padding = 0
1384
1462
1385 # Look for the first and last matches of pattern in text. If two different
1463 # Look for the first and last matches of pattern in text. If two different
1386 # matches are found, increase the pattern length.
1464 # matches are found, increase the pattern length.
1387 while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits ==
1465 while text.find(pattern) != text.rfind(pattern) and (
1388 0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin -
1466 self.Match_MaxBits == 0
1389 self.Patch_Margin)):
1467 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - self.Patch_Margin
1468 ):
1390 padding += self.Patch_Margin
1469 padding += self.Patch_Margin
1391 pattern = text[max(0, patch.start2 - padding) :
1470 pattern = text[
1392 patch.start2 + patch.length1 + padding]
1471 max(0, patch.start2 - padding) : patch.start2 + patch.length1 + padding
1472 ]
1393 # Add one chunk for good luck.
1473 # Add one chunk for good luck.
1394 padding += self.Patch_Margin
1474 padding += self.Patch_Margin
1395
1475
1396 # Add the prefix.
1476 # Add the prefix.
1397 prefix = text[max(0, patch.start2 - padding) : patch.start2]
1477 prefix = text[max(0, patch.start2 - padding) : patch.start2]
1398 if prefix:
1478 if prefix:
1399 patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1479 patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1400 # Add the suffix.
1480 # Add the suffix.
1401 suffix = text[patch.start2 + patch.length1 :
1481 suffix = text[
1402 patch.start2 + patch.length1 + padding]
1482 patch.start2 + patch.length1 : patch.start2 + patch.length1 + padding
1483 ]
1403 if suffix:
1484 if suffix:
1404 patch.diffs.append((self.DIFF_EQUAL, suffix))
1485 patch.diffs.append((self.DIFF_EQUAL, suffix))
1405
1486
1406 # Roll back the start points.
1487 # Roll back the start points.
1407 patch.start1 -= len(prefix)
1488 patch.start1 -= len(prefix)
1408 patch.start2 -= len(prefix)
1489 patch.start2 -= len(prefix)
1409 # Extend lengths.
1490 # Extend lengths.
1410 patch.length1 += len(prefix) + len(suffix)
1491 patch.length1 += len(prefix) + len(suffix)
1411 patch.length2 += len(prefix) + len(suffix)
1492 patch.length2 += len(prefix) + len(suffix)
1412
1493
def patch_make(self, a, b=None, c=None):
    """Compute a list of patches to turn text1 into text2.

    Use diffs if provided, otherwise compute them ourselves.
    There are four ways to call this function, depending on what data is
    available to the caller:
    Method 1: a = text1, b = text2
    Method 2: a = diffs
    Method 3 (optimal): a = text1, b = diffs
    Method 4 (deprecated, use method 3): a = text1, b = text2, c = diffs

    Args:
      a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
          text2 (method 2).
      b: text2 (methods 1,4) or Array of diff tuples for text1 to
          text2 (method 3) or undefined (method 2).
      c: Array of diff tuples for text1 to text2 (method 4) or
          undefined (methods 1,2,3).

    Returns:
      Array of Patch objects.

    Raises:
      ValueError: If the arguments match none of the four call formats.
    """
    # Decode which calling convention was used.
    if isinstance(a, str) and isinstance(b, str) and c is None:
        # Method 1: compute the diffs from the two texts.
        text1 = a
        diffs = self.diff_main(text1, b, True)
        if len(diffs) > 2:
            self.diff_cleanupSemantic(diffs)
            self.diff_cleanupEfficiency(diffs)
    elif isinstance(a, list) and b is None and c is None:
        # Method 2: reconstruct text1 from the supplied diffs.
        diffs = a
        text1 = self.diff_text1(diffs)
    elif isinstance(a, str) and isinstance(b, list) and c is None:
        # Method 3: text1 plus precomputed diffs.
        text1 = a
        diffs = b
    elif isinstance(a, str) and isinstance(b, str) and isinstance(c, list):
        # Method 4: text2 is ignored; only text1 and the diffs matter.
        text1 = a
        diffs = c
    else:
        raise ValueError("Unknown call format to patch_make.")

    if not diffs:
        return []  # No changes means no patches.

    patches = []
    patch = patch_obj()
    char_count1 = 0  # Cursor position within text1.
    char_count2 = 0  # Cursor position within text2.
    prepatch_text = text1  # Recreate the patches to determine context info.
    postpatch_text = text1
    for x, (diff_type, diff_text) in enumerate(diffs):
        if not patch.diffs and diff_type != self.DIFF_EQUAL:
            # A new patch starts here.
            patch.start1 = char_count1
            patch.start2 = char_count2
        if diff_type == self.DIFF_INSERT:
            # Insertion.
            patch.diffs.append(diffs[x])
            patch.length2 += len(diff_text)
            postpatch_text = (
                postpatch_text[:char_count2]
                + diff_text
                + postpatch_text[char_count2:]
            )
        elif diff_type == self.DIFF_DELETE:
            # Deletion.
            patch.length1 += len(diff_text)
            patch.diffs.append(diffs[x])
            postpatch_text = (
                postpatch_text[:char_count2]
                + postpatch_text[char_count2 + len(diff_text):]
            )
        elif (
            diff_type == self.DIFF_EQUAL
            and len(diff_text) <= 2 * self.Patch_Margin
            and patch.diffs
            and x + 1 != len(diffs)
        ):
            # Small equality inside a patch: keep it as context.
            patch.diffs.append(diffs[x])
            patch.length1 += len(diff_text)
            patch.length2 += len(diff_text)

        if diff_type == self.DIFF_EQUAL and len(diff_text) >= 2 * self.Patch_Margin:
            # Equality big enough to close out the current patch.
            if patch.diffs:
                self.patch_addContext(patch, prepatch_text)
                patches.append(patch)
                patch = patch_obj()
                # Unlike Unidiff, our patch lists have a rolling context.
                # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
                # Update prepatch text & pos to reflect the application of
                # the just completed patch.
                prepatch_text = postpatch_text
                char_count1 = char_count2

        # Advance the cursors.
        if diff_type != self.DIFF_INSERT:
            char_count1 += len(diff_text)
        if diff_type != self.DIFF_DELETE:
            char_count2 += len(diff_text)

    # Pick up the leftover patch if not empty.
    if patch.diffs:
        self.patch_addContext(patch, prepatch_text)
        patches.append(patch)
    return patches
1526
1613
def patch_deepCopy(self, patches):
    """Given an array of patches, return another array that is identical.

    Args:
      patches: Array of Patch objects.

    Returns:
      Array of Patch objects.
    """
    copies = []
    for original in patches:
        duplicate = patch_obj()
        # Diff tuples are immutable, so a shallow list copy is deep enough.
        duplicate.diffs = list(original.diffs)
        duplicate.start1 = original.start1
        duplicate.start2 = original.start2
        duplicate.length1 = original.length1
        duplicate.length2 = original.length2
        copies.append(duplicate)
    return copies
1547
1634
def patch_apply(self, patches, text):
    """Merge a set of patches onto the text.  Return a patched text, as
    well as a list of true/false values indicating which patches were
    applied.

    Args:
      patches: Array of Patch objects.
      text: Old text.

    Returns:
      Two element Array, containing the new text and an array of
      boolean values.
    """
    if not patches:
        return (text, [])

    # Work on copies so no changes are made to the caller's patches.
    patches = self.patch_deepCopy(patches)

    nullPadding = self.patch_addPadding(patches)
    text = nullPadding + text + nullPadding
    self.patch_splitMax(patches)

    # delta tracks the offset between the expected and actual location of
    # the previous patch.  If patches are expected at positions 10 and 20
    # but the first one matched at 12, delta becomes 2 and the second
    # patch is searched for at an effective position of 22.
    delta = 0
    results = []
    for patch in patches:
        expected_loc = patch.start2 + delta
        text1 = self.diff_text1(patch.diffs)
        end_loc = -1
        if len(text1) > self.Match_MaxBits:
            # patch_splitMax only leaves an oversized pattern behind for a
            # monster delete; match the head and tail separately.
            start_loc = self.match_main(
                text, text1[: self.Match_MaxBits], expected_loc
            )
            if start_loc != -1:
                end_loc = self.match_main(
                    text,
                    text1[-self.Match_MaxBits :],
                    expected_loc + len(text1) - self.Match_MaxBits,
                )
                if end_loc == -1 or start_loc >= end_loc:
                    # Can't find valid trailing context.  Drop this patch.
                    start_loc = -1
        else:
            start_loc = self.match_main(text, text1, expected_loc)
        if start_loc == -1:
            # No match found. :(
            results.append(False)
            # Subtract the delta for this failed patch from subsequent ones.
            delta -= patch.length2 - patch.length1
            continue
        # Found a match. :)
        results.append(True)
        delta = start_loc - expected_loc
        if end_loc == -1:
            text2 = text[start_loc : start_loc + len(text1)]
        else:
            text2 = text[start_loc : end_loc + self.Match_MaxBits]
        if text1 == text2:
            # Perfect match: splice the replacement text straight in.
            text = (
                text[:start_loc]
                + self.diff_text2(patch.diffs)
                + text[start_loc + len(text1) :]
            )
            continue
        # Imperfect match.
        # Run a diff to get a framework of equivalent indices.
        diffs = self.diff_main(text1, text2, False)
        if (
            len(text1) > self.Match_MaxBits
            and self.diff_levenshtein(diffs) / float(len(text1))
            > self.Patch_DeleteThreshold
        ):
            # The end points match, but the content is unacceptably bad.
            results[-1] = False
            continue
        self.diff_cleanupSemanticLossless(diffs)
        index1 = 0
        for op, data in patch.diffs:
            if op != self.DIFF_EQUAL:
                index2 = self.diff_xIndex(diffs, index1)
                if op == self.DIFF_INSERT:
                    # Insertion.
                    text = (
                        text[: start_loc + index2]
                        + data
                        + text[start_loc + index2 :]
                    )
                elif op == self.DIFF_DELETE:
                    # Deletion.
                    text = (
                        text[: start_loc + index2]
                        + text[
                            start_loc
                            + self.diff_xIndex(diffs, index1 + len(data)) :
                        ]
                    )
            if op != self.DIFF_DELETE:
                index1 += len(data)
    # Strip the padding off.
    text = text[len(nullPadding) : -len(nullPadding)]
    return (text, results)
1635
1739
def patch_addPadding(self, patches):
    """Add some padding on text start and end so that edges can match
    something.  Intended to be called only from within patch_apply.

    Args:
      patches: Array of Patch objects (modified in place).

    Returns:
      The padding string added to each side.
    """
    paddingLength = self.Patch_Margin
    # The padding is the control characters \x01..\x{margin}, which are
    # very unlikely to occur in real text.
    nullPadding = "".join(chr(x) for x in range(1, paddingLength + 1))

    # Bump all the patches forward to account for the left padding.
    for patch in patches:
        patch.start1 += paddingLength
        patch.start2 += paddingLength

    # Add some padding on start of first diff.
    patch = patches[0]
    diffs = patch.diffs
    if not diffs or diffs[0][0] != self.DIFF_EQUAL:
        # Add nullPadding equality.
        diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
        patch.start1 -= paddingLength  # Should be 0.
        patch.start2 -= paddingLength  # Should be 0.
        patch.length1 += paddingLength
        patch.length2 += paddingLength
    elif paddingLength > len(diffs[0][1]):
        # Grow first equality with the tail end of the padding.
        extraLength = paddingLength - len(diffs[0][1])
        diffs[0] = (diffs[0][0], nullPadding[len(diffs[0][1]):] + diffs[0][1])
        patch.start1 -= extraLength
        patch.start2 -= extraLength
        patch.length1 += extraLength
        patch.length2 += extraLength

    # Add some padding on end of last diff.
    patch = patches[-1]
    diffs = patch.diffs
    if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
        # Add nullPadding equality.
        diffs.append((self.DIFF_EQUAL, nullPadding))
        patch.length1 += paddingLength
        patch.length2 += paddingLength
    elif paddingLength > len(diffs[-1][1]):
        # Grow last equality with the head of the padding.
        extraLength = paddingLength - len(diffs[-1][1])
        diffs[-1] = (diffs[-1][0], diffs[-1][1] + nullPadding[:extraLength])
        patch.length1 += extraLength
        patch.length2 += extraLength

    return nullPadding
1693
1797
def patch_splitMax(self, patches):
    """Look through the patches and break up any which are longer than
    the maximum limit of the match algorithm.
    Intended to be called only from within patch_apply.

    Args:
      patches: Array of Patch objects (modified in place).
    """
    patch_size = self.Match_MaxBits
    if patch_size == 0:
        # Python can match with integers of arbitrary precision, so a
        # limit of zero means "never split".
        return
    # NOTE: the range is computed once; x is adjusted in place when a big
    # patch is replaced by several smaller ones (historic behaviour of
    # the reference implementation).
    for x in range(len(patches)):
        if patches[x].length1 <= patch_size:
            continue
        bigpatch = patches[x]
        # Remove the big old patch.
        del patches[x]
        x -= 1
        start1 = bigpatch.start1
        start2 = bigpatch.start2
        precontext = ""
        while bigpatch.diffs:
            # Create one of several smaller patches.
            patch = patch_obj()
            empty = True
            patch.start1 = start1 - len(precontext)
            patch.start2 = start2 - len(precontext)
            if precontext:
                patch.length1 = patch.length2 = len(precontext)
                patch.diffs.append((self.DIFF_EQUAL, precontext))

            while (
                bigpatch.diffs
                and patch.length1 < patch_size - self.Patch_Margin
            ):
                (diff_type, diff_text) = bigpatch.diffs[0]
                if diff_type == self.DIFF_INSERT:
                    # Insertions are harmless.
                    patch.length2 += len(diff_text)
                    start2 += len(diff_text)
                    patch.diffs.append(bigpatch.diffs.pop(0))
                    empty = False
                elif (
                    diff_type == self.DIFF_DELETE
                    and len(patch.diffs) == 1
                    and patch.diffs[0][0] == self.DIFF_EQUAL
                    and len(diff_text) > 2 * patch_size
                ):
                    # This is a large deletion.  Let it pass in one chunk.
                    patch.length1 += len(diff_text)
                    start1 += len(diff_text)
                    empty = False
                    patch.diffs.append((diff_type, diff_text))
                    del bigpatch.diffs[0]
                else:
                    # Deletion or equality.
                    # Only take as much as we can stomach.
                    diff_text = diff_text[
                        : patch_size - patch.length1 - self.Patch_Margin
                    ]
                    patch.length1 += len(diff_text)
                    start1 += len(diff_text)
                    if diff_type == self.DIFF_EQUAL:
                        patch.length2 += len(diff_text)
                        start2 += len(diff_text)
                    else:
                        empty = False

                    patch.diffs.append((diff_type, diff_text))
                    if diff_text == bigpatch.diffs[0][1]:
                        del bigpatch.diffs[0]
                    else:
                        # Consumed only a prefix; leave the remainder.
                        bigpatch.diffs[0] = (
                            bigpatch.diffs[0][0],
                            bigpatch.diffs[0][1][len(diff_text):],
                        )

            # Compute the head context for the next patch.
            precontext = self.diff_text2(patch.diffs)[-self.Patch_Margin:]
            # Append the end context for this patch.
            postcontext = self.diff_text1(bigpatch.diffs)[: self.Patch_Margin]
            if postcontext:
                patch.length1 += len(postcontext)
                patch.length2 += len(postcontext)
                if patch.diffs and patch.diffs[-1][0] == self.DIFF_EQUAL:
                    patch.diffs[-1] = (
                        self.DIFF_EQUAL,
                        patch.diffs[-1][1] + postcontext,
                    )
                else:
                    patch.diffs.append((self.DIFF_EQUAL, postcontext))

            if not empty:
                x += 1
                patches.insert(x, patch)
1781
1895
def patch_toText(self, patches):
    """Take a list of patches and return a textual representation.

    Args:
      patches: Array of Patch objects.

    Returns:
      Text representation of patches (each patch stringified via str()).
    """
    return "".join(str(patch) for patch in patches)
1795
1909
def patch_fromText(self, textline):
    """Parse a textual representation of patches and return a list of
    patch objects.

    Args:
      textline: Text representation of patches (str, or ASCII bytes).

    Returns:
      Array of Patch objects.

    Raises:
      ValueError: If invalid input.
      UnicodeDecodeError: If bytes input is not valid ASCII.
    """
    # Py3 fix: the old code tested `type(textline) == unicode`, which
    # raises NameError on Python 3.  Patches are composed of a subset of
    # ASCII, so accept bytes by decoding as ASCII (invalid bytes raise).
    if isinstance(textline, bytes):
        textline = textline.decode("ascii")
    patches = []
    if not textline:
        return patches
    text = textline.split("\n")
    while len(text) != 0:
        # Raw string: "\d" in a plain literal is an invalid escape
        # sequence on modern Python.
        m = re.match(r"^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
        if not m:
            raise ValueError("Invalid patch string: " + text[0])
        patch = patch_obj()
        patches.append(patch)
        patch.start1 = int(m.group(1))
        if m.group(2) == "":
            # Elided length means a single-line range.
            patch.start1 -= 1
            patch.length1 = 1
        elif m.group(2) == "0":
            patch.length1 = 0
        else:
            patch.start1 -= 1
            patch.length1 = int(m.group(2))

        patch.start2 = int(m.group(3))
        if m.group(4) == "":
            patch.start2 -= 1
            patch.length2 = 1
        elif m.group(4) == "0":
            patch.length2 = 0
        else:
            patch.start2 -= 1
            patch.length2 = int(m.group(4))

        del text[0]

        while len(text) != 0:
            if text[0]:
                sign = text[0][0]
            else:
                sign = ""
            # Py3 fix: urllib.parse.unquote already returns str; the old
            # `line.decode("utf-8")` call raised AttributeError.
            line = urllib.parse.unquote(text[0][1:])
            if sign == "+":
                # Insertion.
                patch.diffs.append((self.DIFF_INSERT, line))
            elif sign == "-":
                # Deletion.
                patch.diffs.append((self.DIFF_DELETE, line))
            elif sign == " ":
                # Minor equality.
                patch.diffs.append((self.DIFF_EQUAL, line))
            elif sign == "@":
                # Start of next patch.
                break
            elif sign == "":
                # Blank line?  Whatever.
                pass
            else:
                # WTF?
                raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
            del text[0]
    return patches
1872
1986
1873
1987
class patch_obj:
    """Class representing one patch operation."""

    def __init__(self):
        """Initializes with an empty list of diffs."""
        self.diffs = []     # List of (op, text) diff tuples.
        self.start1 = None  # Start offset within text1.
        self.start2 = None  # Start offset within text2.
        self.length1 = 0    # Length of the affected range in text1.
        self.length2 = 0    # Length of the affected range in text2.

    def __str__(self):
        """Emulate GNU diff's format.
        Header: @@ -382,8 +481,9 @@
        Indices are printed as 1-based, not 0-based.

        Returns:
          The GNU diff string.
        """
        def coords(start, length):
            # Render one "start,length" pair using GNU diff's 1-based,
            # length-elided conventions.
            if length == 0:
                return str(start) + ",0"
            if length == 1:
                return str(start + 1)
            return str(start + 1) + "," + str(length)

        text = [
            "@@ -",
            coords(self.start1, self.length1),
            " +",
            coords(self.start2, self.length2),
            " @@\n",
        ]
        # Escape the body of the patch with %xx notation.
        for op, data in self.diffs:
            if op == diff_match_patch.DIFF_INSERT:
                text.append("+")
            elif op == diff_match_patch.DIFF_DELETE:
                text.append("-")
            elif op == diff_match_patch.DIFF_EQUAL:
                text.append(" ")
            # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
            data = data.encode("utf-8")
            text.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
        return "".join(text)
General Comments 0
You need to be logged in to leave comments. Login now