##// END OF EJS Templates
diffs: python3 port
super-admin -
r5083:875cd526 default
parent child Browse files
Show More
@@ -1,2031 +1,2033 b''
1 """Diff Match and Patch
1 """Diff Match and Patch
2
2
3 Copyright 2006 Google Inc.
3 Copyright 2006 Google Inc.
4 http://code.google.com/p/google-diff-match-patch/
4 http://code.google.com/p/google-diff-match-patch/
5
5
6 Licensed under the Apache License, Version 2.0 (the "License");
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
8 You may obtain a copy of the License at
9
9
10 http://www.apache.org/licenses/LICENSE-2.0
10 http://www.apache.org/licenses/LICENSE-2.0
11
11
12 Unless required by applicable law or agreed to in writing, software
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
15 See the License for the specific language governing permissions and
16 limitations under the License.
16 limitations under the License.
17 """
17 """
18
18
19 """Functions for diff, match and patch.
19 """Functions for diff, match and patch.
20
20
21 Computes the difference between two texts to create a patch.
21 Computes the difference between two texts to create a patch.
22 Applies the patch onto another text, allowing for errors.
22 Applies the patch onto another text, allowing for errors.
23 """
23 """
24
24
25 __author__ = "fraser@google.com (Neil Fraser)"
25 __author__ = "fraser@google.com (Neil Fraser)"
26
26
27 import math
27 import math
28 import re
28 import re
29 import sys
29 import sys
30 import time
30 import time
31 import urllib.request, urllib.parse, urllib.error
31 import urllib.request
32 import urllib.parse
33 import urllib.error
32
34
33
35
34 class diff_match_patch:
36 class diff_match_patch:
35 """Class containing the diff, match and patch methods.
37 """Class containing the diff, match and patch methods.
36
38
37 Also contains the behaviour settings.
39 Also contains the behaviour settings.
38 """
40 """
39
41
40 def __init__(self):
42 def __init__(self):
41 """Inits a diff_match_patch object with default settings.
43 """Inits a diff_match_patch object with default settings.
42 Redefine these in your program to override the defaults.
44 Redefine these in your program to override the defaults.
43 """
45 """
44
46
45 # Number of seconds to map a diff before giving up (0 for infinity).
47 # Number of seconds to map a diff before giving up (0 for infinity).
46 self.Diff_Timeout = 1.0
48 self.Diff_Timeout = 1.0
47 # Cost of an empty edit operation in terms of edit characters.
49 # Cost of an empty edit operation in terms of edit characters.
48 self.Diff_EditCost = 4
50 self.Diff_EditCost = 4
49 # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
51 # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
50 self.Match_Threshold = 0.5
52 self.Match_Threshold = 0.5
51 # How far to search for a match (0 = exact location, 1000+ = broad match).
53 # How far to search for a match (0 = exact location, 1000+ = broad match).
52 # A match this many characters away from the expected location will add
54 # A match this many characters away from the expected location will add
53 # 1.0 to the score (0.0 is a perfect match).
55 # 1.0 to the score (0.0 is a perfect match).
54 self.Match_Distance = 1000
56 self.Match_Distance = 1000
55 # When deleting a large block of text (over ~64 characters), how close do
57 # When deleting a large block of text (over ~64 characters), how close do
56 # the contents have to be to match the expected contents. (0.0 = perfection,
58 # the contents have to be to match the expected contents. (0.0 = perfection,
57 # 1.0 = very loose). Note that Match_Threshold controls how closely the
59 # 1.0 = very loose). Note that Match_Threshold controls how closely the
58 # end points of a delete need to match.
60 # end points of a delete need to match.
59 self.Patch_DeleteThreshold = 0.5
61 self.Patch_DeleteThreshold = 0.5
60 # Chunk size for context length.
62 # Chunk size for context length.
61 self.Patch_Margin = 4
63 self.Patch_Margin = 4
62
64
63 # The number of bits in an int.
65 # The number of bits in an int.
64 # Python has no maximum, thus to disable patch splitting set to 0.
66 # Python has no maximum, thus to disable patch splitting set to 0.
65 # However to avoid long patches in certain pathological cases, use 32.
67 # However to avoid long patches in certain pathological cases, use 32.
66 # Multiple short patches (using native ints) are much faster than long ones.
68 # Multiple short patches (using native ints) are much faster than long ones.
67 self.Match_MaxBits = 32
69 self.Match_MaxBits = 32
68
70
69 # DIFF FUNCTIONS
71 # DIFF FUNCTIONS
70
72
71 # The data structure representing a diff is an array of tuples:
73 # The data structure representing a diff is an array of tuples:
72 # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
74 # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
73 # which means: delete "Hello", add "Goodbye" and keep " world."
75 # which means: delete "Hello", add "Goodbye" and keep " world."
74 DIFF_DELETE = -1
76 DIFF_DELETE = -1
75 DIFF_INSERT = 1
77 DIFF_INSERT = 1
76 DIFF_EQUAL = 0
78 DIFF_EQUAL = 0
77
79
78 def diff_main(self, text1, text2, checklines=True, deadline=None):
80 def diff_main(self, text1, text2, checklines=True, deadline=None):
79 """Find the differences between two texts. Simplifies the problem by
81 """Find the differences between two texts. Simplifies the problem by
80 stripping any common prefix or suffix off the texts before diffing.
82 stripping any common prefix or suffix off the texts before diffing.
81
83
82 Args:
84 Args:
83 text1: Old string to be diffed.
85 text1: Old string to be diffed.
84 text2: New string to be diffed.
86 text2: New string to be diffed.
85 checklines: Optional speedup flag. If present and false, then don't run
87 checklines: Optional speedup flag. If present and false, then don't run
86 a line-level diff first to identify the changed areas.
88 a line-level diff first to identify the changed areas.
87 Defaults to true, which does a faster, slightly less optimal diff.
89 Defaults to true, which does a faster, slightly less optimal diff.
88 deadline: Optional time when the diff should be complete by. Used
90 deadline: Optional time when the diff should be complete by. Used
89 internally for recursive calls. Users should set DiffTimeout instead.
91 internally for recursive calls. Users should set DiffTimeout instead.
90
92
91 Returns:
93 Returns:
92 Array of changes.
94 Array of changes.
93 """
95 """
94 # Set a deadline by which time the diff must be complete.
96 # Set a deadline by which time the diff must be complete.
95 if deadline is None:
97 if deadline is None:
96 # Unlike in most languages, Python counts time in seconds.
98 # Unlike in most languages, Python counts time in seconds.
97 if self.Diff_Timeout <= 0:
99 if self.Diff_Timeout <= 0:
98 deadline = sys.maxsize
100 deadline = sys.maxsize
99 else:
101 else:
100 deadline = time.time() + self.Diff_Timeout
102 deadline = time.time() + self.Diff_Timeout
101
103
102 # Check for null inputs.
104 # Check for null inputs.
103 if text1 is None or text2 is None:
105 if text1 is None or text2 is None:
104 raise ValueError("Null inputs. (diff_main)")
106 raise ValueError("Null inputs. (diff_main)")
105
107
106 # Check for equality (speedup).
108 # Check for equality (speedup).
107 if text1 == text2:
109 if text1 == text2:
108 if text1:
110 if text1:
109 return [(self.DIFF_EQUAL, text1)]
111 return [(self.DIFF_EQUAL, text1)]
110 return []
112 return []
111
113
112 # Trim off common prefix (speedup).
114 # Trim off common prefix (speedup).
113 commonlength = self.diff_commonPrefix(text1, text2)
115 commonlength = self.diff_commonPrefix(text1, text2)
114 commonprefix = text1[:commonlength]
116 commonprefix = text1[:commonlength]
115 text1 = text1[commonlength:]
117 text1 = text1[commonlength:]
116 text2 = text2[commonlength:]
118 text2 = text2[commonlength:]
117
119
118 # Trim off common suffix (speedup).
120 # Trim off common suffix (speedup).
119 commonlength = self.diff_commonSuffix(text1, text2)
121 commonlength = self.diff_commonSuffix(text1, text2)
120 if commonlength == 0:
122 if commonlength == 0:
121 commonsuffix = ""
123 commonsuffix = ""
122 else:
124 else:
123 commonsuffix = text1[-commonlength:]
125 commonsuffix = text1[-commonlength:]
124 text1 = text1[:-commonlength]
126 text1 = text1[:-commonlength]
125 text2 = text2[:-commonlength]
127 text2 = text2[:-commonlength]
126
128
127 # Compute the diff on the middle block.
129 # Compute the diff on the middle block.
128 diffs = self.diff_compute(text1, text2, checklines, deadline)
130 diffs = self.diff_compute(text1, text2, checklines, deadline)
129
131
130 # Restore the prefix and suffix.
132 # Restore the prefix and suffix.
131 if commonprefix:
133 if commonprefix:
132 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
134 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
133 if commonsuffix:
135 if commonsuffix:
134 diffs.append((self.DIFF_EQUAL, commonsuffix))
136 diffs.append((self.DIFF_EQUAL, commonsuffix))
135 self.diff_cleanupMerge(diffs)
137 self.diff_cleanupMerge(diffs)
136 return diffs
138 return diffs
137
139
138 def diff_compute(self, text1, text2, checklines, deadline):
140 def diff_compute(self, text1, text2, checklines, deadline):
139 """Find the differences between two texts. Assumes that the texts do not
141 """Find the differences between two texts. Assumes that the texts do not
140 have any common prefix or suffix.
142 have any common prefix or suffix.
141
143
142 Args:
144 Args:
143 text1: Old string to be diffed.
145 text1: Old string to be diffed.
144 text2: New string to be diffed.
146 text2: New string to be diffed.
145 checklines: Speedup flag. If false, then don't run a line-level diff
147 checklines: Speedup flag. If false, then don't run a line-level diff
146 first to identify the changed areas.
148 first to identify the changed areas.
147 If true, then run a faster, slightly less optimal diff.
149 If true, then run a faster, slightly less optimal diff.
148 deadline: Time when the diff should be complete by.
150 deadline: Time when the diff should be complete by.
149
151
150 Returns:
152 Returns:
151 Array of changes.
153 Array of changes.
152 """
154 """
153 if not text1:
155 if not text1:
154 # Just add some text (speedup).
156 # Just add some text (speedup).
155 return [(self.DIFF_INSERT, text2)]
157 return [(self.DIFF_INSERT, text2)]
156
158
157 if not text2:
159 if not text2:
158 # Just delete some text (speedup).
160 # Just delete some text (speedup).
159 return [(self.DIFF_DELETE, text1)]
161 return [(self.DIFF_DELETE, text1)]
160
162
161 if len(text1) > len(text2):
163 if len(text1) > len(text2):
162 (longtext, shorttext) = (text1, text2)
164 (longtext, shorttext) = (text1, text2)
163 else:
165 else:
164 (shorttext, longtext) = (text1, text2)
166 (shorttext, longtext) = (text1, text2)
165 i = longtext.find(shorttext)
167 i = longtext.find(shorttext)
166 if i != -1:
168 if i != -1:
167 # Shorter text is inside the longer text (speedup).
169 # Shorter text is inside the longer text (speedup).
168 diffs = [
170 diffs = [
169 (self.DIFF_INSERT, longtext[:i]),
171 (self.DIFF_INSERT, longtext[:i]),
170 (self.DIFF_EQUAL, shorttext),
172 (self.DIFF_EQUAL, shorttext),
171 (self.DIFF_INSERT, longtext[i + len(shorttext) :]),
173 (self.DIFF_INSERT, longtext[i + len(shorttext):]),
172 ]
174 ]
173 # Swap insertions for deletions if diff is reversed.
175 # Swap insertions for deletions if diff is reversed.
174 if len(text1) > len(text2):
176 if len(text1) > len(text2):
175 diffs[0] = (self.DIFF_DELETE, diffs[0][1])
177 diffs[0] = (self.DIFF_DELETE, diffs[0][1])
176 diffs[2] = (self.DIFF_DELETE, diffs[2][1])
178 diffs[2] = (self.DIFF_DELETE, diffs[2][1])
177 return diffs
179 return diffs
178
180
179 if len(shorttext) == 1:
181 if len(shorttext) == 1:
180 # Single character string.
182 # Single character string.
181 # After the previous speedup, the character can't be an equality.
183 # After the previous speedup, the character can't be an equality.
182 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
184 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
183
185
184 # Check to see if the problem can be split in two.
186 # Check to see if the problem can be split in two.
185 hm = self.diff_halfMatch(text1, text2)
187 hm = self.diff_halfMatch(text1, text2)
186 if hm:
188 if hm:
187 # A half-match was found, sort out the return data.
189 # A half-match was found, sort out the return data.
188 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
190 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
189 # Send both pairs off for separate processing.
191 # Send both pairs off for separate processing.
190 diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)
192 diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)
191 diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)
193 diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)
192 # Merge the results.
194 # Merge the results.
193 return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
195 return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
194
196
195 if checklines and len(text1) > 100 and len(text2) > 100:
197 if checklines and len(text1) > 100 and len(text2) > 100:
196 return self.diff_lineMode(text1, text2, deadline)
198 return self.diff_lineMode(text1, text2, deadline)
197
199
198 return self.diff_bisect(text1, text2, deadline)
200 return self.diff_bisect(text1, text2, deadline)
199
201
200 def diff_lineMode(self, text1, text2, deadline):
202 def diff_lineMode(self, text1, text2, deadline):
201 """Do a quick line-level diff on both strings, then rediff the parts for
203 """Do a quick line-level diff on both strings, then rediff the parts for
202 greater accuracy.
204 greater accuracy.
203 This speedup can produce non-minimal diffs.
205 This speedup can produce non-minimal diffs.
204
206
205 Args:
207 Args:
206 text1: Old string to be diffed.
208 text1: Old string to be diffed.
207 text2: New string to be diffed.
209 text2: New string to be diffed.
208 deadline: Time when the diff should be complete by.
210 deadline: Time when the diff should be complete by.
209
211
210 Returns:
212 Returns:
211 Array of changes.
213 Array of changes.
212 """
214 """
213
215
214 # Scan the text on a line-by-line basis first.
216 # Scan the text on a line-by-line basis first.
215 (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
217 (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
216
218
217 diffs = self.diff_main(text1, text2, False, deadline)
219 diffs = self.diff_main(text1, text2, False, deadline)
218
220
219 # Convert the diff back to original text.
221 # Convert the diff back to original text.
220 self.diff_charsToLines(diffs, linearray)
222 self.diff_charsToLines(diffs, linearray)
221 # Eliminate freak matches (e.g. blank lines)
223 # Eliminate freak matches (e.g. blank lines)
222 self.diff_cleanupSemantic(diffs)
224 self.diff_cleanupSemantic(diffs)
223
225
224 # Rediff any replacement blocks, this time character-by-character.
226 # Rediff any replacement blocks, this time character-by-character.
225 # Add a dummy entry at the end.
227 # Add a dummy entry at the end.
226 diffs.append((self.DIFF_EQUAL, ""))
228 diffs.append((self.DIFF_EQUAL, ""))
227 pointer = 0
229 pointer = 0
228 count_delete = 0
230 count_delete = 0
229 count_insert = 0
231 count_insert = 0
230 text_delete = ""
232 text_delete = ""
231 text_insert = ""
233 text_insert = ""
232 while pointer < len(diffs):
234 while pointer < len(diffs):
233 if diffs[pointer][0] == self.DIFF_INSERT:
235 if diffs[pointer][0] == self.DIFF_INSERT:
234 count_insert += 1
236 count_insert += 1
235 text_insert += diffs[pointer][1]
237 text_insert += diffs[pointer][1]
236 elif diffs[pointer][0] == self.DIFF_DELETE:
238 elif diffs[pointer][0] == self.DIFF_DELETE:
237 count_delete += 1
239 count_delete += 1
238 text_delete += diffs[pointer][1]
240 text_delete += diffs[pointer][1]
239 elif diffs[pointer][0] == self.DIFF_EQUAL:
241 elif diffs[pointer][0] == self.DIFF_EQUAL:
240 # Upon reaching an equality, check for prior redundancies.
242 # Upon reaching an equality, check for prior redundancies.
241 if count_delete >= 1 and count_insert >= 1:
243 if count_delete >= 1 and count_insert >= 1:
242 # Delete the offending records and add the merged ones.
244 # Delete the offending records and add the merged ones.
243 a = self.diff_main(text_delete, text_insert, False, deadline)
245 a = self.diff_main(text_delete, text_insert, False, deadline)
244 diffs[pointer - count_delete - count_insert : pointer] = a
246 diffs[pointer - count_delete - count_insert: pointer] = a
245 pointer = pointer - count_delete - count_insert + len(a)
247 pointer = pointer - count_delete - count_insert + len(a)
246 count_insert = 0
248 count_insert = 0
247 count_delete = 0
249 count_delete = 0
248 text_delete = ""
250 text_delete = ""
249 text_insert = ""
251 text_insert = ""
250
252
251 pointer += 1
253 pointer += 1
252
254
253 diffs.pop() # Remove the dummy entry at the end.
255 diffs.pop() # Remove the dummy entry at the end.
254
256
255 return diffs
257 return diffs
256
258
257 def diff_bisect(self, text1, text2, deadline):
259 def diff_bisect(self, text1, text2, deadline):
258 """Find the 'middle snake' of a diff, split the problem in two
260 """Find the 'middle snake' of a diff, split the problem in two
259 and return the recursively constructed diff.
261 and return the recursively constructed diff.
260 See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
262 See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
261
263
262 Args:
264 Args:
263 text1: Old string to be diffed.
265 text1: Old string to be diffed.
264 text2: New string to be diffed.
266 text2: New string to be diffed.
265 deadline: Time at which to bail if not yet complete.
267 deadline: Time at which to bail if not yet complete.
266
268
267 Returns:
269 Returns:
268 Array of diff tuples.
270 Array of diff tuples.
269 """
271 """
270
272
271 # Cache the text lengths to prevent multiple calls.
273 # Cache the text lengths to prevent multiple calls.
272 text1_length = len(text1)
274 text1_length = len(text1)
273 text2_length = len(text2)
275 text2_length = len(text2)
274 max_d = (text1_length + text2_length + 1) // 2
276 max_d = (text1_length + text2_length + 1) // 2
275 v_offset = max_d
277 v_offset = max_d
276 v_length = 2 * max_d
278 v_length = 2 * max_d
277 v1 = [-1] * v_length
279 v1 = [-1] * v_length
278 v1[v_offset + 1] = 0
280 v1[v_offset + 1] = 0
279 v2 = v1[:]
281 v2 = v1[:]
280 delta = text1_length - text2_length
282 delta = text1_length - text2_length
281 # If the total number of characters is odd, then the front path will
283 # If the total number of characters is odd, then the front path will
282 # collide with the reverse path.
284 # collide with the reverse path.
283 front = delta % 2 != 0
285 front = delta % 2 != 0
284 # Offsets for start and end of k loop.
286 # Offsets for start and end of k loop.
285 # Prevents mapping of space beyond the grid.
287 # Prevents mapping of space beyond the grid.
286 k1start = 0
288 k1start = 0
287 k1end = 0
289 k1end = 0
288 k2start = 0
290 k2start = 0
289 k2end = 0
291 k2end = 0
290 for d in range(max_d):
292 for d in range(max_d):
291 # Bail out if deadline is reached.
293 # Bail out if deadline is reached.
292 if time.time() > deadline:
294 if time.time() > deadline:
293 break
295 break
294
296
295 # Walk the front path one step.
297 # Walk the front path one step.
296 for k1 in range(-d + k1start, d + 1 - k1end, 2):
298 for k1 in range(-d + k1start, d + 1 - k1end, 2):
297 k1_offset = v_offset + k1
299 k1_offset = v_offset + k1
298 if k1 == -d or (k1 != d and v1[k1_offset - 1] < v1[k1_offset + 1]):
300 if k1 == -d or (k1 != d and v1[k1_offset - 1] < v1[k1_offset + 1]):
299 x1 = v1[k1_offset + 1]
301 x1 = v1[k1_offset + 1]
300 else:
302 else:
301 x1 = v1[k1_offset - 1] + 1
303 x1 = v1[k1_offset - 1] + 1
302 y1 = x1 - k1
304 y1 = x1 - k1
303 while (
305 while (
304 x1 < text1_length and y1 < text2_length and text1[x1] == text2[y1]
306 x1 < text1_length and y1 < text2_length and text1[x1] == text2[y1]
305 ):
307 ):
306 x1 += 1
308 x1 += 1
307 y1 += 1
309 y1 += 1
308 v1[k1_offset] = x1
310 v1[k1_offset] = x1
309 if x1 > text1_length:
311 if x1 > text1_length:
310 # Ran off the right of the graph.
312 # Ran off the right of the graph.
311 k1end += 2
313 k1end += 2
312 elif y1 > text2_length:
314 elif y1 > text2_length:
313 # Ran off the bottom of the graph.
315 # Ran off the bottom of the graph.
314 k1start += 2
316 k1start += 2
315 elif front:
317 elif front:
316 k2_offset = v_offset + delta - k1
318 k2_offset = v_offset + delta - k1
317 if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:
319 if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:
318 # Mirror x2 onto top-left coordinate system.
320 # Mirror x2 onto top-left coordinate system.
319 x2 = text1_length - v2[k2_offset]
321 x2 = text1_length - v2[k2_offset]
320 if x1 >= x2:
322 if x1 >= x2:
321 # Overlap detected.
323 # Overlap detected.
322 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
324 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
323
325
324 # Walk the reverse path one step.
326 # Walk the reverse path one step.
325 for k2 in range(-d + k2start, d + 1 - k2end, 2):
327 for k2 in range(-d + k2start, d + 1 - k2end, 2):
326 k2_offset = v_offset + k2
328 k2_offset = v_offset + k2
327 if k2 == -d or (k2 != d and v2[k2_offset - 1] < v2[k2_offset + 1]):
329 if k2 == -d or (k2 != d and v2[k2_offset - 1] < v2[k2_offset + 1]):
328 x2 = v2[k2_offset + 1]
330 x2 = v2[k2_offset + 1]
329 else:
331 else:
330 x2 = v2[k2_offset - 1] + 1
332 x2 = v2[k2_offset - 1] + 1
331 y2 = x2 - k2
333 y2 = x2 - k2
332 while (
334 while (
333 x2 < text1_length
335 x2 < text1_length
334 and y2 < text2_length
336 and y2 < text2_length
335 and text1[-x2 - 1] == text2[-y2 - 1]
337 and text1[-x2 - 1] == text2[-y2 - 1]
336 ):
338 ):
337 x2 += 1
339 x2 += 1
338 y2 += 1
340 y2 += 1
339 v2[k2_offset] = x2
341 v2[k2_offset] = x2
340 if x2 > text1_length:
342 if x2 > text1_length:
341 # Ran off the left of the graph.
343 # Ran off the left of the graph.
342 k2end += 2
344 k2end += 2
343 elif y2 > text2_length:
345 elif y2 > text2_length:
344 # Ran off the top of the graph.
346 # Ran off the top of the graph.
345 k2start += 2
347 k2start += 2
346 elif not front:
348 elif not front:
347 k1_offset = v_offset + delta - k2
349 k1_offset = v_offset + delta - k2
348 if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:
350 if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:
349 x1 = v1[k1_offset]
351 x1 = v1[k1_offset]
350 y1 = v_offset + x1 - k1_offset
352 y1 = v_offset + x1 - k1_offset
351 # Mirror x2 onto top-left coordinate system.
353 # Mirror x2 onto top-left coordinate system.
352 x2 = text1_length - x2
354 x2 = text1_length - x2
353 if x1 >= x2:
355 if x1 >= x2:
354 # Overlap detected.
356 # Overlap detected.
355 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
357 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
356
358
357 # Diff took too long and hit the deadline or
359 # Diff took too long and hit the deadline or
358 # number of diffs equals number of characters, no commonality at all.
360 # number of diffs equals number of characters, no commonality at all.
359 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
361 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
360
362
361 def diff_bisectSplit(self, text1, text2, x, y, deadline):
363 def diff_bisectSplit(self, text1, text2, x, y, deadline):
362 """Given the location of the 'middle snake', split the diff in two parts
364 """Given the location of the 'middle snake', split the diff in two parts
363 and recurse.
365 and recurse.
364
366
365 Args:
367 Args:
366 text1: Old string to be diffed.
368 text1: Old string to be diffed.
367 text2: New string to be diffed.
369 text2: New string to be diffed.
368 x: Index of split point in text1.
370 x: Index of split point in text1.
369 y: Index of split point in text2.
371 y: Index of split point in text2.
370 deadline: Time at which to bail if not yet complete.
372 deadline: Time at which to bail if not yet complete.
371
373
372 Returns:
374 Returns:
373 Array of diff tuples.
375 Array of diff tuples.
374 """
376 """
375 text1a = text1[:x]
377 text1a = text1[:x]
376 text2a = text2[:y]
378 text2a = text2[:y]
377 text1b = text1[x:]
379 text1b = text1[x:]
378 text2b = text2[y:]
380 text2b = text2[y:]
379
381
380 # Compute both diffs serially.
382 # Compute both diffs serially.
381 diffs = self.diff_main(text1a, text2a, False, deadline)
383 diffs = self.diff_main(text1a, text2a, False, deadline)
382 diffsb = self.diff_main(text1b, text2b, False, deadline)
384 diffsb = self.diff_main(text1b, text2b, False, deadline)
383
385
384 return diffs + diffsb
386 return diffs + diffsb
385
387
386 def diff_linesToChars(self, text1, text2):
388 def diff_linesToChars(self, text1, text2):
387 """Split two texts into an array of strings. Reduce the texts to a string
389 """Split two texts into an array of strings. Reduce the texts to a string
388 of hashes where each Unicode character represents one line.
390 of hashes where each Unicode character represents one line.
389
391
390 Args:
392 Args:
391 text1: First string.
393 text1: First string.
392 text2: Second string.
394 text2: Second string.
393
395
394 Returns:
396 Returns:
395 Three element tuple, containing the encoded text1, the encoded text2 and
397 Three element tuple, containing the encoded text1, the encoded text2 and
396 the array of unique strings. The zeroth element of the array of unique
398 the array of unique strings. The zeroth element of the array of unique
397 strings is intentionally blank.
399 strings is intentionally blank.
398 """
400 """
399 lineArray = [] # e.g. lineArray[4] == "Hello\n"
401 lineArray = [] # e.g. lineArray[4] == "Hello\n"
400 lineHash = {} # e.g. lineHash["Hello\n"] == 4
402 lineHash = {} # e.g. lineHash["Hello\n"] == 4
401
403
402 # "\x00" is a valid character, but various debuggers don't like it.
404 # "\x00" is a valid character, but various debuggers don't like it.
403 # So we'll insert a junk entry to avoid generating a null character.
405 # So we'll insert a junk entry to avoid generating a null character.
404 lineArray.append("")
406 lineArray.append("")
405
407
406 def diff_linesToCharsMunge(text):
408 def diff_linesToCharsMunge(text):
407 """Split a text into an array of strings. Reduce the texts to a string
409 """Split a text into an array of strings. Reduce the texts to a string
408 of hashes where each Unicode character represents one line.
410 of hashes where each Unicode character represents one line.
409 Modifies linearray and linehash through being a closure.
411 Modifies linearray and linehash through being a closure.
410
412
411 Args:
413 Args:
412 text: String to encode.
414 text: String to encode.
413
415
414 Returns:
416 Returns:
415 Encoded string.
417 Encoded string.
416 """
418 """
417 chars = []
419 chars = []
418 # Walk the text, pulling out a substring for each line.
420 # Walk the text, pulling out a substring for each line.
419 # text.split('\n') would would temporarily double our memory footprint.
421 # text.split('\n') would would temporarily double our memory footprint.
420 # Modifying text would create many large strings to garbage collect.
422 # Modifying text would create many large strings to garbage collect.
421 lineStart = 0
423 lineStart = 0
422 lineEnd = -1
424 lineEnd = -1
423 while lineEnd < len(text) - 1:
425 while lineEnd < len(text) - 1:
424 lineEnd = text.find("\n", lineStart)
426 lineEnd = text.find("\n", lineStart)
425 if lineEnd == -1:
427 if lineEnd == -1:
426 lineEnd = len(text) - 1
428 lineEnd = len(text) - 1
427 line = text[lineStart : lineEnd + 1]
429 line = text[lineStart : lineEnd + 1]
428 lineStart = lineEnd + 1
430 lineStart = lineEnd + 1
429
431
430 if line in lineHash:
432 if line in lineHash:
431 chars.append(chr(lineHash[line]))
433 chars.append(chr(lineHash[line]))
432 else:
434 else:
433 lineArray.append(line)
435 lineArray.append(line)
434 lineHash[line] = len(lineArray) - 1
436 lineHash[line] = len(lineArray) - 1
435 chars.append(chr(len(lineArray) - 1))
437 chars.append(chr(len(lineArray) - 1))
436 return "".join(chars)
438 return "".join(chars)
437
439
438 chars1 = diff_linesToCharsMunge(text1)
440 chars1 = diff_linesToCharsMunge(text1)
439 chars2 = diff_linesToCharsMunge(text2)
441 chars2 = diff_linesToCharsMunge(text2)
440 return (chars1, chars2, lineArray)
442 return (chars1, chars2, lineArray)
441
443
442 def diff_charsToLines(self, diffs, lineArray):
444 def diff_charsToLines(self, diffs, lineArray):
443 """Rehydrate the text in a diff from a string of line hashes to real lines
445 """Rehydrate the text in a diff from a string of line hashes to real lines
444 of text.
446 of text.
445
447
446 Args:
448 Args:
447 diffs: Array of diff tuples.
449 diffs: Array of diff tuples.
448 lineArray: Array of unique strings.
450 lineArray: Array of unique strings.
449 """
451 """
450 for x in range(len(diffs)):
452 for x in range(len(diffs)):
451 text = []
453 text = []
452 for char in diffs[x][1]:
454 for char in diffs[x][1]:
453 text.append(lineArray[ord(char)])
455 text.append(lineArray[ord(char)])
454 diffs[x] = (diffs[x][0], "".join(text))
456 diffs[x] = (diffs[x][0], "".join(text))
455
457
def diff_commonPrefix(self, text1, text2):
    """Determine the common prefix of two strings.

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      The number of characters common to the start of each string.
    """
    # Fast path: an empty input or differing first characters means
    # there is no common prefix at all.
    if not text1 or not text2 or text1[0] != text2[0]:
        return 0
    # Binary search for the longest matching prefix length.  Comparing
    # growing slices runs in C and beats a per-character Python loop.
    # Performance analysis: http://neil.fraser.name/news/2007/10/09/
    low = 0
    high = min(len(text1), len(text2))
    mid = high
    start = 0
    while low < mid:
        if text1[start:mid] == text2[start:mid]:
            # Everything up to mid is common; search the upper half next.
            low = mid
            start = low
        else:
            # Mismatch inside the slice; search the lower half.
            high = mid
        mid = low + (high - low) // 2
    return mid
def diff_commonSuffix(self, text1, text2):
    """Determine the common suffix of two strings.

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      The number of characters common to the end of each string.
    """
    # Fast path: an empty input or differing last characters means
    # there is no common suffix at all.
    if not text1 or not text2 or text1[-1] != text2[-1]:
        return 0
    # Binary search for the longest matching suffix length, mirroring
    # diff_commonPrefix but working from the ends of the strings.
    # Performance analysis: http://neil.fraser.name/news/2007/10/09/
    low = 0
    high = min(len(text1), len(text2))
    mid = high
    end = 0
    while low < mid:
        if (text1[len(text1) - mid : len(text1) - end]
                == text2[len(text2) - mid : len(text2) - end]):
            # The tail slice matches; search for a longer suffix.
            low = mid
            end = low
        else:
            high = mid
        mid = low + (high - low) // 2
    return mid
def diff_commonOverlap(self, text1, text2):
    """Determine if the suffix of one string is the prefix of another.

    Args:
      text1 First string.
      text2 Second string.

    Returns:
      The number of characters common to the end of the first
      string and the start of the second string.
    """
    # Cache lengths once; they are consulted several times below.
    n1 = len(text1)
    n2 = len(text2)
    # An empty string cannot overlap anything.
    if n1 == 0 or n2 == 0:
        return 0
    # Trim both strings to the same length: only the tail of text1 and
    # the head of text2 can ever participate in an overlap.
    if n1 > n2:
        text1 = text1[-n2:]
    elif n1 < n2:
        text2 = text2[:n1]
    shortest = min(n1, n2)
    # Quick check for the degenerate case: after trimming, the strings
    # are identical, so the whole shorter string overlaps.
    if text1 == text2:
        return shortest

    # Grow a candidate overlap one hit at a time: take the current
    # suffix of text1, locate it in text2, and extend the length by the
    # offset of the hit until no match remains.
    # Performance analysis: http://neil.fraser.name/news/2010/11/04/
    best = 0
    length = 1
    while True:
        suffix = text1[-length:]
        hit = text2.find(suffix)
        if hit == -1:
            return best
        length += hit
        # A hit at offset 0, or a re-verified full slice match, is a
        # genuine suffix/prefix overlap.
        if hit == 0 or text1[-length:] == text2[:length]:
            best = length
        length += 1
def diff_halfMatch(self, text1, text2):
    """Do the two texts share a substring which is at least half the length of
    the longer text?
    This speedup can produce non-minimal diffs.

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      Five element Array, containing the prefix of text1, the suffix of text1,
      the prefix of text2, the suffix of text2 and the common middle.  Or None
      if there was no match.
    """
    if self.Diff_Timeout <= 0:
        # Don't risk returning a non-optimal diff if we have unlimited time.
        return None
    if len(text1) > len(text2):
        (longtext, shorttext) = (text1, text2)
    else:
        (shorttext, longtext) = (text1, text2)
    # A half-match is impossible when the longer text is tiny or more than
    # twice the length of the shorter one.
    if len(longtext) < 4 or len(shorttext) * 2 < len(longtext):
        return None  # Pointless.

    def diff_halfMatchI(longtext, shorttext, i):
        """Does a substring of shorttext exist within longtext such that the
        substring is at least half the length of longtext?
        Closure; in this Python port it also references `self` for the
        common-prefix/suffix helpers.

        Args:
          longtext: Longer string.
          shorttext: Shorter string.
          i: Start index of quarter length substring within longtext.

        Returns:
          Five element Array, containing the prefix of longtext, the suffix of
          longtext, the prefix of shorttext, the suffix of shorttext and the
          common middle.  Or None if there was no match.
        """
        # Use a quarter-length seed; any half-length match must contain it.
        seed = longtext[i : i + len(longtext) // 4]
        best_common = ""
        j = shorttext.find(seed)
        while j != -1:
            # Extend each seed hit in both directions and keep the longest
            # combined span found so far.
            prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
            suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
            if len(best_common) < suffixLength + prefixLength:
                best_common = (
                    shorttext[j - suffixLength : j]
                    + shorttext[j : j + prefixLength]
                )
                best_longtext_a = longtext[: i - suffixLength]
                best_longtext_b = longtext[i + prefixLength :]
                best_shorttext_a = shorttext[: j - suffixLength]
                best_shorttext_b = shorttext[j + prefixLength :]
            j = shorttext.find(seed, j + 1)

        # Only accept the match if it covers at least half of longtext.
        # (best_longtext_a etc. are guaranteed bound here, because a
        # non-empty best_common implies the assignment branch ran.)
        if len(best_common) * 2 >= len(longtext):
            return (
                best_longtext_a,
                best_longtext_b,
                best_shorttext_a,
                best_shorttext_b,
                best_common,
            )
        else:
            return None

    # First check if the second quarter is the seed for a half-match.
    hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4)
    # Check again based on the third quarter.
    hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2)
    if not hm1 and not hm2:
        return None
    elif not hm2:
        hm = hm1
    elif not hm1:
        hm = hm2
    else:
        # Both matched.  Select the longest common middle.
        if len(hm1[4]) > len(hm2[4]):
            hm = hm1
        else:
            hm = hm2

    # A half-match was found, sort out the return data so the tuple is
    # always ordered (text1 parts, text2 parts, middle) regardless of
    # which input was longer.
    if len(text1) > len(text2):
        (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
    else:
        (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
    return (text1_a, text1_b, text2_a, text2_b, mid_common)
def diff_cleanupSemantic(self, diffs):
    """Reduce the number of edits by eliminating semantically trivial
    equalities.

    Operates on `diffs` in place.

    Args:
      diffs: Array of diff tuples.
    """
    changes = False
    equalities = []  # Stack of indices where equalities are found.
    lastequality = None  # Always equal to diffs[equalities[-1]][1]
    pointer = 0  # Index of current position.
    # Number of chars that changed prior to the equality.
    length_insertions1, length_deletions1 = 0, 0
    # Number of chars that changed after the equality.
    length_insertions2, length_deletions2 = 0, 0
    while pointer < len(diffs):
        if diffs[pointer][0] == self.DIFF_EQUAL:  # Equality found.
            equalities.append(pointer)
            # Shift the "after" counters into the "before" slot and start
            # counting afresh on the far side of this equality.
            length_insertions1, length_insertions2 = length_insertions2, 0
            length_deletions1, length_deletions2 = length_deletions2, 0
            lastequality = diffs[pointer][1]
        else:  # An insertion or deletion.
            if diffs[pointer][0] == self.DIFF_INSERT:
                length_insertions2 += len(diffs[pointer][1])
            else:
                length_deletions2 += len(diffs[pointer][1])
            # Eliminate an equality that is smaller or equal to the edits on both
            # sides of it.
            if (
                lastequality
                and (
                    len(lastequality) <= max(length_insertions1, length_deletions1)
                )
                and (
                    len(lastequality) <= max(length_insertions2, length_deletions2)
                )
            ):
                # Duplicate record: turn the equality into a delete+insert pair.
                diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
                # Change second copy to insert.
                diffs[equalities[-1] + 1] = (
                    self.DIFF_INSERT,
                    diffs[equalities[-1] + 1][1],
                )
                # Throw away the equality we just deleted.
                equalities.pop()
                # Throw away the previous equality (it needs to be reevaluated).
                if len(equalities):
                    equalities.pop()
                if len(equalities):
                    # Rewind the scan to the last surviving equality.
                    pointer = equalities[-1]
                else:
                    # No equalities left; restart from the top (-1 + 1 == 0).
                    pointer = -1
                # Reset the counters.
                length_insertions1, length_deletions1 = 0, 0
                length_insertions2, length_deletions2 = 0, 0
                lastequality = None
                changes = True
        pointer += 1

    # Normalize the diff.
    if changes:
        self.diff_cleanupMerge(diffs)
    self.diff_cleanupSemanticLossless(diffs)

    # Find any overlaps between deletions and insertions.
    # e.g: <del>abcxxx</del><ins>xxxdef</ins>
    #   -> <del>abc</del>xxx<ins>def</ins>
    # e.g: <del>xxxabc</del><ins>defxxx</ins>
    #   -> <ins>def</ins>xxx<del>abc</del>
    # Only extract an overlap if it is as big as the edit ahead or behind it.
    pointer = 1
    while pointer < len(diffs):
        if (
            diffs[pointer - 1][0] == self.DIFF_DELETE
            and diffs[pointer][0] == self.DIFF_INSERT
        ):
            deletion = diffs[pointer - 1][1]
            insertion = diffs[pointer][1]
            # overlap_length1: tail of deletion == head of insertion.
            # overlap_length2: tail of insertion == head of deletion.
            overlap_length1 = self.diff_commonOverlap(deletion, insertion)
            overlap_length2 = self.diff_commonOverlap(insertion, deletion)
            if overlap_length1 >= overlap_length2:
                if (
                    overlap_length1 >= len(deletion) / 2.0
                    or overlap_length1 >= len(insertion) / 2.0
                ):
                    # Overlap found.  Insert an equality and trim the surrounding edits.
                    diffs.insert(
                        pointer, (self.DIFF_EQUAL, insertion[:overlap_length1])
                    )
                    diffs[pointer - 1] = (
                        self.DIFF_DELETE,
                        deletion[: len(deletion) - overlap_length1],
                    )
                    diffs[pointer + 1] = (
                        self.DIFF_INSERT,
                        insertion[overlap_length1:],
                    )
                    pointer += 1
            else:
                if (
                    overlap_length2 >= len(deletion) / 2.0
                    or overlap_length2 >= len(insertion) / 2.0
                ):
                    # Reverse overlap found.
                    # Insert an equality and swap and trim the surrounding edits.
                    diffs.insert(
                        pointer, (self.DIFF_EQUAL, deletion[:overlap_length2])
                    )
                    diffs[pointer - 1] = (
                        self.DIFF_INSERT,
                        insertion[: len(insertion) - overlap_length2],
                    )
                    diffs[pointer + 1] = (
                        self.DIFF_DELETE,
                        deletion[overlap_length2:],
                    )
                    pointer += 1
            pointer += 1
        pointer += 1
def diff_cleanupSemanticLossless(self, diffs):
    """Look for single edits surrounded on both sides by equalities
    which can be shifted sideways to align the edit to a word boundary.
    e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.

    Operates on `diffs` in place without changing the reconstructed texts.

    Args:
      diffs: Array of diff tuples.
    """

    def diff_cleanupSemanticScore(one, two):
        """Given two strings, compute a score representing whether the
        internal boundary falls on logical boundaries.
        Scores range from 6 (best) to 0 (worst).
        Closure; in this Python port it also references `self` for the
        blank-line regex constants.

        Args:
          one: First string.
          two: Second string.

        Returns:
          The score.
        """
        if not one or not two:
            # Edges are the best.
            return 6

        # Each port of this function behaves slightly differently due to
        # subtle differences in each language's definition of things like
        # 'whitespace'.  Since this function's purpose is largely cosmetic,
        # the choice has been made to use each language's native features
        # rather than force total conformity.
        char1 = one[-1]
        char2 = two[0]
        nonAlphaNumeric1 = not char1.isalnum()
        nonAlphaNumeric2 = not char2.isalnum()
        whitespace1 = nonAlphaNumeric1 and char1.isspace()
        whitespace2 = nonAlphaNumeric2 and char2.isspace()
        lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n")
        lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n")
        blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one)
        blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two)

        if blankLine1 or blankLine2:
            # Five points for blank lines.
            return 5
        elif lineBreak1 or lineBreak2:
            # Four points for line breaks.
            return 4
        elif nonAlphaNumeric1 and not whitespace1 and whitespace2:
            # Three points for end of sentences.
            return 3
        elif whitespace1 or whitespace2:
            # Two points for whitespace.
            return 2
        elif nonAlphaNumeric1 or nonAlphaNumeric2:
            # One point for non-alphanumeric.
            return 1
        return 0

    pointer = 1
    # Intentionally ignore the first and last element (don't need checking).
    while pointer < len(diffs) - 1:
        if (
            diffs[pointer - 1][0] == self.DIFF_EQUAL
            and diffs[pointer + 1][0] == self.DIFF_EQUAL
        ):
            # This is a single edit surrounded by equalities.
            equality1 = diffs[pointer - 1][1]
            edit = diffs[pointer][1]
            equality2 = diffs[pointer + 1][1]

            # First, shift the edit as far left as possible.
            commonOffset = self.diff_commonSuffix(equality1, edit)
            if commonOffset:
                commonString = edit[-commonOffset:]
                equality1 = equality1[:-commonOffset]
                edit = commonString + edit[:-commonOffset]
                equality2 = commonString + equality2

            # Second, step character by character right, looking for the best fit.
            bestEquality1 = equality1
            bestEdit = edit
            bestEquality2 = equality2
            bestScore = diff_cleanupSemanticScore(
                equality1, edit
            ) + diff_cleanupSemanticScore(edit, equality2)
            # Slide the edit right one character at a time while its first
            # character matches the start of the following equality.
            while edit and equality2 and edit[0] == equality2[0]:
                equality1 += edit[0]
                edit = edit[1:] + equality2[0]
                equality2 = equality2[1:]
                score = diff_cleanupSemanticScore(
                    equality1, edit
                ) + diff_cleanupSemanticScore(edit, equality2)
                # The >= encourages trailing rather than leading whitespace on edits.
                if score >= bestScore:
                    bestScore = score
                    bestEquality1 = equality1
                    bestEdit = edit
                    bestEquality2 = equality2

            if diffs[pointer - 1][1] != bestEquality1:
                # We have an improvement, save it back to the diff.
                if bestEquality1:
                    diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
                else:
                    # The leading equality shifted away entirely; remove it.
                    del diffs[pointer - 1]
                    pointer -= 1
                diffs[pointer] = (diffs[pointer][0], bestEdit)
                if bestEquality2:
                    diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
                else:
                    # The trailing equality shifted away entirely; remove it.
                    del diffs[pointer + 1]
                    pointer -= 1
        pointer += 1
# Define some regex patterns for matching boundaries.
# Used by diff_cleanupSemanticScore (in diff_cleanupSemanticLossless):
# BLANKLINEEND matches text ending in a blank line (newline, optional
# carriage return, newline at the end of the string).
BLANKLINEEND = re.compile(r"\n\r?\n$")
# BLANKLINESTART matches text starting with a blank line (an optional
# \r\n or \n followed immediately by another line ending).
BLANKLINESTART = re.compile(r"^\r?\n\r?\n")
def diff_cleanupEfficiency(self, diffs):
    """Reduce the number of edits by eliminating operationally trivial
    equalities.

    Operates on `diffs` in place.  An equality shorter than
    self.Diff_EditCost that is hemmed in by edits costs more to keep than
    to fold into the surrounding delete/insert pair.

    Args:
      diffs: Array of diff tuples.
    """
    changes = False
    equalities = []  # Stack of indices where equalities are found.
    lastequality = None  # Always equal to diffs[equalities[-1]][1]
    pointer = 0  # Index of current position.
    pre_ins = False  # Is there an insertion operation before the last equality.
    pre_del = False  # Is there a deletion operation before the last equality.
    post_ins = False  # Is there an insertion operation after the last equality.
    post_del = False  # Is there a deletion operation after the last equality.
    while pointer < len(diffs):
        if diffs[pointer][0] == self.DIFF_EQUAL:  # Equality found.
            if len(diffs[pointer][1]) < self.Diff_EditCost and (
                post_ins or post_del
            ):
                # Candidate found: short equality preceded by an edit.
                equalities.append(pointer)
                pre_ins = post_ins
                pre_del = post_del
                lastequality = diffs[pointer][1]
            else:
                # Not a candidate, and can never become one.
                equalities = []
                lastequality = None

            post_ins = post_del = False
        else:  # An insertion or deletion.
            if diffs[pointer][0] == self.DIFF_DELETE:
                post_del = True
            else:
                post_ins = True

            # Five types to be split:
            # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
            # <ins>A</ins>X<ins>C</ins><del>D</del>
            # <ins>A</ins><del>B</del>X<ins>C</ins>
            # <ins>A</del>X<ins>C</ins><del>D</del>
            # <ins>A</ins><del>B</del>X<del>C</del>

            # The second clause relies on Python bools summing as ints:
            # exactly three of the four surrounding-edit flags are set.
            if lastequality and (
                (pre_ins and pre_del and post_ins and post_del)
                or (
                    (len(lastequality) < self.Diff_EditCost / 2)
                    and (pre_ins + pre_del + post_ins + post_del) == 3
                )
            ):
                # Duplicate record: turn the equality into a delete+insert pair.
                diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
                # Change second copy to insert.
                diffs[equalities[-1] + 1] = (
                    self.DIFF_INSERT,
                    diffs[equalities[-1] + 1][1],
                )
                equalities.pop()  # Throw away the equality we just deleted.
                lastequality = None
                if pre_ins and pre_del:
                    # No changes made which could affect previous entry, keep going.
                    post_ins = post_del = True
                    equalities = []
                else:
                    if len(equalities):
                        equalities.pop()  # Throw away the previous equality.
                    if len(equalities):
                        # Rewind the scan to the last surviving equality.
                        pointer = equalities[-1]
                    else:
                        # No equalities left; restart from the top (-1 + 1 == 0).
                        pointer = -1
                    post_ins = post_del = False
                changes = True
        pointer += 1

    if changes:
        self.diff_cleanupMerge(diffs)
def diff_cleanupMerge(self, diffs):
    """Reorder and merge like edit sections.  Merge equalities.
    Any edit section can move as long as it doesn't cross an equality.

    Operates on `diffs` in place; may recurse once if shifts were made.

    Args:
      diffs: Array of diff tuples.
    """
    diffs.append((self.DIFF_EQUAL, ""))  # Add a dummy entry at the end.
    pointer = 0
    # Counts and concatenated text of the current run of delete/insert entries.
    count_delete = 0
    count_insert = 0
    text_delete = ""
    text_insert = ""
    while pointer < len(diffs):
        if diffs[pointer][0] == self.DIFF_INSERT:
            count_insert += 1
            text_insert += diffs[pointer][1]
            pointer += 1
        elif diffs[pointer][0] == self.DIFF_DELETE:
            count_delete += 1
            text_delete += diffs[pointer][1]
            pointer += 1
        elif diffs[pointer][0] == self.DIFF_EQUAL:
            # Upon reaching an equality, check for prior redundancies.
            if count_delete + count_insert > 1:
                if count_delete != 0 and count_insert != 0:
                    # Factor out any common prefixies.
                    commonlength = self.diff_commonPrefix(text_insert, text_delete)
                    if commonlength != 0:
                        # x indexes the entry just before the delete/insert run.
                        x = pointer - count_delete - count_insert - 1
                        if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
                            # Append the common prefix to the preceding equality.
                            diffs[x] = (
                                diffs[x][0],
                                diffs[x][1] + text_insert[:commonlength],
                            )
                        else:
                            # No preceding equality: create one at the front.
                            diffs.insert(
                                0, (self.DIFF_EQUAL, text_insert[:commonlength])
                            )
                            pointer += 1  # Everything shifted right by one.
                        text_insert = text_insert[commonlength:]
                        text_delete = text_delete[commonlength:]
                    # Factor out any common suffixies.
                    commonlength = self.diff_commonSuffix(text_insert, text_delete)
                    if commonlength != 0:
                        # Prepend the common suffix onto the equality at pointer.
                        diffs[pointer] = (
                            diffs[pointer][0],
                            text_insert[-commonlength:] + diffs[pointer][1],
                        )
                        text_insert = text_insert[:-commonlength]
                        text_delete = text_delete[:-commonlength]
                # Delete the offending records and add the merged ones.
                if count_delete == 0:
                    diffs[pointer - count_insert : pointer] = [
                        (self.DIFF_INSERT, text_insert)
                    ]
                elif count_insert == 0:
                    diffs[pointer - count_delete : pointer] = [
                        (self.DIFF_DELETE, text_delete)
                    ]
                else:
                    diffs[pointer - count_delete - count_insert : pointer] = [
                        (self.DIFF_DELETE, text_delete),
                        (self.DIFF_INSERT, text_insert),
                    ]
                # Reposition just after the newly merged records.
                pointer = pointer - count_delete - count_insert + 1
                if count_delete != 0:
                    pointer += 1
                if count_insert != 0:
                    pointer += 1
            elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
                # Merge this equality with the previous one.
                diffs[pointer - 1] = (
                    diffs[pointer - 1][0],
                    diffs[pointer - 1][1] + diffs[pointer][1],
                )
                del diffs[pointer]
            else:
                pointer += 1

            # Reset the run accumulators after every equality.
            count_insert = 0
            count_delete = 0
            text_delete = ""
            text_insert = ""

    if diffs[-1][1] == "":
        diffs.pop()  # Remove the dummy entry at the end.

    # Second pass: look for single edits surrounded on both sides by equalities
    # which can be shifted sideways to eliminate an equality.
    # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
    changes = False
    pointer = 1
    # Intentionally ignore the first and last element (don't need checking).
    while pointer < len(diffs) - 1:
        if (
            diffs[pointer - 1][0] == self.DIFF_EQUAL
            and diffs[pointer + 1][0] == self.DIFF_EQUAL
        ):
            # This is a single edit surrounded by equalities.
            if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
                # Shift the edit over the previous equality.
                diffs[pointer] = (
                    diffs[pointer][0],
                    diffs[pointer - 1][1]
                    + diffs[pointer][1][: -len(diffs[pointer - 1][1])],
                )
                diffs[pointer + 1] = (
                    diffs[pointer + 1][0],
                    diffs[pointer - 1][1] + diffs[pointer + 1][1],
                )
                del diffs[pointer - 1]
                changes = True
            elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
                # Shift the edit over the next equality.
                diffs[pointer - 1] = (
                    diffs[pointer - 1][0],
                    diffs[pointer - 1][1] + diffs[pointer + 1][1],
                )
                diffs[pointer] = (
                    diffs[pointer][0],
                    diffs[pointer][1][len(diffs[pointer + 1][1]) :]
                    + diffs[pointer + 1][1],
                )
                del diffs[pointer + 1]
                changes = True
        pointer += 1

    # If shifts were made, the diff needs reordering and another shift sweep.
    if changes:
        self.diff_cleanupMerge(diffs)
1097
1099
def diff_xIndex(self, diffs, loc):
    """loc is a location in text1, compute and return the equivalent location
    in text2.  e.g. "The cat" vs "The big cat", 1->1, 5->8

    Args:
      diffs: Array of diff tuples.
      loc: Location within text1.

    Returns:
      Location within text2.
    """
    chars1 = 0
    chars2 = 0
    last_chars1 = 0
    last_chars2 = 0
    # Fix: pre-bind x so an empty diffs list no longer raises NameError at
    # the post-loop check below; an empty diff maps loc onto itself.
    x = len(diffs)
    for x in range(len(diffs)):
        (op, text) = diffs[x]
        if op != self.DIFF_INSERT:  # Equality or deletion.
            chars1 += len(text)
        if op != self.DIFF_DELETE:  # Equality or insertion.
            chars2 += len(text)
        if chars1 > loc:  # Overshot the location.
            break
        last_chars1 = chars1
        last_chars2 = chars2

    if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
        # The location was deleted.
        return last_chars2
    # Add the remaining len(character).
    return last_chars2 + (loc - last_chars1)
1129
1131
def diff_prettyHtml(self, diffs):
    """Convert a diff array into a pretty HTML report.

    Args:
      diffs: Array of diff tuples.

    Returns:
      HTML representation.
    """
    # Per-operation HTML wrapper templates; an unknown op produces no
    # output, matching the behavior of the original if/elif chain.
    wrappers = {
        self.DIFF_INSERT: '<ins style="background:#e6ffe6;">%s</ins>',
        self.DIFF_DELETE: '<del style="background:#ffe6e6;">%s</del>',
        self.DIFF_EQUAL: "<span>%s</span>",
    }
    pieces = []
    for op, data in diffs:
        escaped = (
            data.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace("\n", "&para;<br>")
        )
        template = wrappers.get(op)
        if template is not None:
            pieces.append(template % escaped)
    return "".join(pieces)
1154
1156
def diff_text1(self, diffs):
    """Compute and return the source text (all equalities and deletions).

    Args:
      diffs: Array of diff tuples.

    Returns:
      Source text.
    """
    # Insertions exist only in the destination text, so skip them.
    return "".join(data for op, data in diffs if op != self.DIFF_INSERT)
1169
1171
def diff_text2(self, diffs):
    """Compute and return the destination text (all equalities and insertions).

    Args:
      diffs: Array of diff tuples.

    Returns:
      Destination text.
    """
    # Deletions exist only in the source text, so skip them.
    return "".join(data for op, data in diffs if op != self.DIFF_DELETE)
1184
1186
def diff_levenshtein(self, diffs):
    """Compute the Levenshtein distance; the number of inserted, deleted or
    substituted characters.

    Args:
      diffs: Array of diff tuples.

    Returns:
      Number of changes.
    """
    distance = 0
    pending_insertions = 0
    pending_deletions = 0
    for op, data in diffs:
        if op == self.DIFF_INSERT:
            pending_insertions += len(data)
        elif op == self.DIFF_DELETE:
            pending_deletions += len(data)
        elif op == self.DIFF_EQUAL:
            # A deletion and an insertion is one substitution, so a mixed
            # run costs only the larger of the two totals.
            distance += max(pending_insertions, pending_deletions)
            pending_insertions = 0
            pending_deletions = 0
    # Account for a trailing run not terminated by an equality.
    return distance + max(pending_insertions, pending_deletions)
1210
1212
def diff_toDelta(self, diffs):
    """Crush the diff into an encoded string which describes the operations
    required to transform text1 into text2.
    E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
    Operations are tab-separated. Inserted text is escaped using %xx notation.

    Args:
      diffs: Array of diff tuples.

    Returns:
      Delta text.
    """
    tokens = []
    for op, data in diffs:
        if op == self.DIFF_INSERT:
            # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
            payload = data.encode("utf-8")
            tokens.append("+" + urllib.parse.quote(payload, "!~*'();/?:@&=+$,# "))
        elif op == self.DIFF_DELETE:
            tokens.append("-%d" % len(data))
        elif op == self.DIFF_EQUAL:
            tokens.append("=%d" % len(data))
    return "\t".join(tokens)
1234
1236
def diff_fromDelta(self, text1, delta):
    """Given the original text1, and an encoded string which describes the
    operations required to transform text1 into text2, compute the full diff.

    Args:
      text1: Source string for the diff.
      delta: Delta text.

    Returns:
      Array of diff tuples.

    Raises:
      ValueError: If invalid input.
    """
    if type(delta) == str:
        # Deltas should be composed of a subset of ascii chars, Unicode not
        # required.  If this encode raises UnicodeEncodeError, delta is invalid.
        # Fix: validate only -- do NOT rebind delta to the bytes result, or the
        # str-typed split("\t") and token slicing below raise TypeError (py3).
        delta.encode("ascii")
    diffs = []
    pointer = 0  # Cursor in text1
    tokens = delta.split("\t")
    for token in tokens:
        if token == "":
            # Blank tokens are ok (from a trailing \t).
            continue
        # Each token begins with a one character parameter which specifies the
        # operation of this token (delete, insert, equality).
        param = token[1:]
        if token[0] == "+":
            param = urllib.parse.unquote(param)
            diffs.append((self.DIFF_INSERT, param))
        elif token[0] == "-" or token[0] == "=":
            try:
                n = int(param)
            except ValueError:
                raise ValueError("Invalid number in diff_fromDelta: " + param)
            if n < 0:
                raise ValueError("Negative number in diff_fromDelta: " + param)
            text = text1[pointer : pointer + n]
            pointer += n
            if token[0] == "=":
                diffs.append((self.DIFF_EQUAL, text))
            else:
                diffs.append((self.DIFF_DELETE, text))
        else:
            # Anything else is an error.
            raise ValueError(
                "Invalid diff operation in diff_fromDelta: " + token[0]
            )
    if pointer != len(text1):
        raise ValueError(
            "Delta length (%d) does not equal source text length (%d)."
            % (pointer, len(text1))
        )
    return diffs
1290
1292
1291 # MATCH FUNCTIONS
1293 # MATCH FUNCTIONS
1292
1294
def match_main(self, text, pattern, loc):
    """Locate the best instance of 'pattern' in 'text' near 'loc'.

    Args:
      text: The text to search.
      pattern: The pattern to search for.
      loc: The location to search around.

    Returns:
      Best match index or -1.

    Raises:
      ValueError: If either input is None.
    """
    # Check for null inputs.
    if text is None or pattern is None:
        raise ValueError("Null inputs. (match_main)")

    # Clamp the requested location into the text's bounds.
    loc = max(0, min(loc, len(text)))

    if text == pattern:
        # Shortcut (potentially not guaranteed by the algorithm)
        return 0
    if not text:
        # Nothing to match.
        return -1
    if text[loc : loc + len(pattern)] == pattern:
        # Perfect match at the perfect spot!  (Includes case of null pattern)
        return loc
    # Do a fuzzy compare.
    return self.match_bitap(text, pattern, loc)
1322
1324
def match_bitap(self, text, pattern, loc):
    """Locate the best instance of 'pattern' in 'text' near 'loc' using the
    Bitap algorithm.

    Args:
      text: The text to search.
      pattern: The pattern to search for.
      loc: The location to search around.

    Returns:
      Best match index or -1.
    """
    # Python doesn't have a maxint limit, so ignore this check.
    # if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
    #   raise ValueError("Pattern too long for this application.")

    # Initialise the alphabet (per-character bitmasks of pattern positions).
    s = self.match_alphabet(pattern)

    def match_bitapScore(e, x):
        """Compute and return the score for a match with e errors and x location.
        Accesses loc and pattern through being a closure.

        Args:
          e: Number of errors in match.
          x: Location of match.

        Returns:
          Overall score for match (0.0 = good, 1.0 = bad).
        """
        accuracy = float(e) / len(pattern)
        proximity = abs(loc - x)
        if not self.Match_Distance:
            # Dodge divide by zero error.
            return proximity and 1.0 or accuracy
        return accuracy + (proximity / float(self.Match_Distance))

    # Highest score beyond which we give up.
    score_threshold = self.Match_Threshold
    # Is there a nearby exact match? (speedup)
    best_loc = text.find(pattern, loc)
    if best_loc != -1:
        score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
        # What about in the other direction? (speedup)
        best_loc = text.rfind(pattern, loc + len(pattern))
        if best_loc != -1:
            score_threshold = min(match_bitapScore(0, best_loc), score_threshold)

    # Initialise the bit arrays.
    matchmask = 1 << (len(pattern) - 1)
    best_loc = -1

    bin_max = len(pattern) + len(text)
    # Empty initialization added to appease pychecker.
    last_rd = None
    for d in range(len(pattern)):
        # Scan for the best match each iteration allows for one more error.
        # Run a binary search to determine how far from 'loc' we can stray at
        # this error level.
        bin_min = 0
        bin_mid = bin_max
        while bin_min < bin_mid:
            if match_bitapScore(d, loc + bin_mid) <= score_threshold:
                bin_min = bin_mid
            else:
                bin_max = bin_mid
            bin_mid = (bin_max - bin_min) // 2 + bin_min

        # Use the result from this iteration as the maximum for the next.
        bin_max = bin_mid
        start = max(1, loc - bin_mid + 1)
        finish = min(loc + bin_mid, len(text)) + len(pattern)

        # rd[j] holds the bit-parallel match state at text position j - 1;
        # scanned right-to-left so shifts propagate toward the match start.
        rd = [0] * (finish + 2)
        rd[finish + 1] = (1 << d) - 1
        for j in range(finish, start - 1, -1):
            if len(text) <= j - 1:
                # Out of range.
                charMatch = 0
            else:
                charMatch = s.get(text[j - 1], 0)
            if d == 0:  # First pass: exact match.
                rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
            else:  # Subsequent passes: fuzzy match.
                # Combine match, substitution/insertion, and deletion states
                # from the previous error level (last_rd).
                rd[j] = (
                    (((rd[j + 1] << 1) | 1) & charMatch)
                    | (((last_rd[j + 1] | last_rd[j]) << 1) | 1)
                    | last_rd[j + 1]
                )
            if rd[j] & matchmask:
                score = match_bitapScore(d, j - 1)
                # This match will almost certainly be better than any existing
                # match.  But check anyway.
                if score <= score_threshold:
                    # Told you so.
                    score_threshold = score
                    best_loc = j - 1
                    if best_loc > loc:
                        # When passing loc, don't exceed our current distance from loc.
                        start = max(1, 2 * loc - best_loc)
                    else:
                        # Already passed loc, downhill from here on in.
                        break
        # No hope for a (better) match at greater error levels.
        if match_bitapScore(d + 1, loc) > score_threshold:
            break
        last_rd = rd
    return best_loc
1431
1433
def match_alphabet(self, pattern):
    """Initialise the alphabet for the Bitap algorithm.

    Args:
      pattern: The text to encode.

    Returns:
      Hash of character locations.
    """
    # One bitmask per distinct character: bit k is set when that character
    # occurs k positions from the end of the pattern.
    bitmasks = dict.fromkeys(pattern, 0)
    highest_bit = len(pattern) - 1
    for index, char in enumerate(pattern):
        bitmasks[char] |= 1 << (highest_bit - index)
    return bitmasks
1447
1449
1448 # PATCH FUNCTIONS
1450 # PATCH FUNCTIONS
1449
1451
def patch_addContext(self, patch, text):
    """Increase the context until it is unique,
    but don't let the pattern expand beyond Match_MaxBits.

    Mutates `patch` in place (diffs, start points and lengths).

    Args:
      patch: The patch to grow.
      text: Source text.
    """
    if not text:
        return
    pattern = text[patch.start2 : patch.start2 + patch.length1]
    padding = 0

    # Grow the pattern until it occurs at most once in the text, but stop
    # once it would no longer fit in Match_MaxBits with margin on each side
    # (Match_MaxBits == 0 means no size limit).
    while text.find(pattern) != text.rfind(pattern):
        if self.Match_MaxBits != 0 and len(pattern) >= (
            self.Match_MaxBits - self.Patch_Margin - self.Patch_Margin
        ):
            break
        padding += self.Patch_Margin
        window_start = max(0, patch.start2 - padding)
        window_end = patch.start2 + patch.length1 + padding
        pattern = text[window_start:window_end]
    # Add one chunk for good luck.
    padding += self.Patch_Margin

    # Add the prefix.
    prefix = text[max(0, patch.start2 - padding) : patch.start2]
    if prefix:
        patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
    # Add the suffix.
    suffix = text[
        patch.start2 + patch.length1 : patch.start2 + patch.length1 + padding
    ]
    if suffix:
        patch.diffs.append((self.DIFF_EQUAL, suffix))

    # Roll back the start points and extend the lengths over the context.
    context_size = len(prefix) + len(suffix)
    patch.start1 -= len(prefix)
    patch.start2 -= len(prefix)
    patch.length1 += context_size
    patch.length2 += context_size
1493
1495
1494 def patch_make(self, a, b=None, c=None):
1496 def patch_make(self, a, b=None, c=None):
1495 """Compute a list of patches to turn text1 into text2.
1497 """Compute a list of patches to turn text1 into text2.
1496 Use diffs if provided, otherwise compute it ourselves.
1498 Use diffs if provided, otherwise compute it ourselves.
1497 There are four ways to call this function, depending on what data is
1499 There are four ways to call this function, depending on what data is
1498 available to the caller:
1500 available to the caller:
1499 Method 1:
1501 Method 1:
1500 a = text1, b = text2
1502 a = text1, b = text2
1501 Method 2:
1503 Method 2:
1502 a = diffs
1504 a = diffs
1503 Method 3 (optimal):
1505 Method 3 (optimal):
1504 a = text1, b = diffs
1506 a = text1, b = diffs
1505 Method 4 (deprecated, use method 3):
1507 Method 4 (deprecated, use method 3):
1506 a = text1, b = text2, c = diffs
1508 a = text1, b = text2, c = diffs
1507
1509
1508 Args:
1510 Args:
1509 a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1511 a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1510 text2 (method 2).
1512 text2 (method 2).
1511 b: text2 (methods 1,4) or Array of diff tuples for text1 to
1513 b: text2 (methods 1,4) or Array of diff tuples for text1 to
1512 text2 (method 3) or undefined (method 2).
1514 text2 (method 3) or undefined (method 2).
1513 c: Array of diff tuples for text1 to text2 (method 4) or
1515 c: Array of diff tuples for text1 to text2 (method 4) or
1514 undefined (methods 1,2,3).
1516 undefined (methods 1,2,3).
1515
1517
1516 Returns:
1518 Returns:
1517 Array of Patch objects.
1519 Array of Patch objects.
1518 """
1520 """
1519 text1 = None
1521 text1 = None
1520 diffs = None
1522 diffs = None
1521 # Note that texts may arrive as 'str' or 'unicode'.
1523 # Note that texts may arrive as 'str' or 'unicode'.
1522 if isinstance(a, str) and isinstance(b, str) and c is None:
1524 if isinstance(a, str) and isinstance(b, str) and c is None:
1523 # Method 1: text1, text2
1525 # Method 1: text1, text2
1524 # Compute diffs from text1 and text2.
1526 # Compute diffs from text1 and text2.
1525 text1 = a
1527 text1 = a
1526 diffs = self.diff_main(text1, b, True)
1528 diffs = self.diff_main(text1, b, True)
1527 if len(diffs) > 2:
1529 if len(diffs) > 2:
1528 self.diff_cleanupSemantic(diffs)
1530 self.diff_cleanupSemantic(diffs)
1529 self.diff_cleanupEfficiency(diffs)
1531 self.diff_cleanupEfficiency(diffs)
1530 elif isinstance(a, list) and b is None and c is None:
1532 elif isinstance(a, list) and b is None and c is None:
1531 # Method 2: diffs
1533 # Method 2: diffs
1532 # Compute text1 from diffs.
1534 # Compute text1 from diffs.
1533 diffs = a
1535 diffs = a
1534 text1 = self.diff_text1(diffs)
1536 text1 = self.diff_text1(diffs)
1535 elif isinstance(a, str) and isinstance(b, list) and c is None:
1537 elif isinstance(a, str) and isinstance(b, list) and c is None:
1536 # Method 3: text1, diffs
1538 # Method 3: text1, diffs
1537 text1 = a
1539 text1 = a
1538 diffs = b
1540 diffs = b
1539 elif isinstance(a, str) and isinstance(b, str) and isinstance(c, list):
1541 elif isinstance(a, str) and isinstance(b, str) and isinstance(c, list):
1540 # Method 4: text1, text2, diffs
1542 # Method 4: text1, text2, diffs
1541 # text2 is not used.
1543 # text2 is not used.
1542 text1 = a
1544 text1 = a
1543 diffs = c
1545 diffs = c
1544 else:
1546 else:
1545 raise ValueError("Unknown call format to patch_make.")
1547 raise ValueError("Unknown call format to patch_make.")
1546
1548
1547 if not diffs:
1549 if not diffs:
1548 return [] # Get rid of the None case.
1550 return [] # Get rid of the None case.
1549 patches = []
1551 patches = []
1550 patch = patch_obj()
1552 patch = patch_obj()
1551 char_count1 = 0 # Number of characters into the text1 string.
1553 char_count1 = 0 # Number of characters into the text1 string.
1552 char_count2 = 0 # Number of characters into the text2 string.
1554 char_count2 = 0 # Number of characters into the text2 string.
1553 prepatch_text = text1 # Recreate the patches to determine context info.
1555 prepatch_text = text1 # Recreate the patches to determine context info.
1554 postpatch_text = text1
1556 postpatch_text = text1
1555 for x in range(len(diffs)):
1557 for x in range(len(diffs)):
1556 (diff_type, diff_text) = diffs[x]
1558 (diff_type, diff_text) = diffs[x]
1557 if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1559 if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1558 # A new patch starts here.
1560 # A new patch starts here.
1559 patch.start1 = char_count1
1561 patch.start1 = char_count1
1560 patch.start2 = char_count2
1562 patch.start2 = char_count2
1561 if diff_type == self.DIFF_INSERT:
1563 if diff_type == self.DIFF_INSERT:
1562 # Insertion
1564 # Insertion
1563 patch.diffs.append(diffs[x])
1565 patch.diffs.append(diffs[x])
1564 patch.length2 += len(diff_text)
1566 patch.length2 += len(diff_text)
1565 postpatch_text = (
1567 postpatch_text = (
1566 postpatch_text[:char_count2]
1568 postpatch_text[:char_count2]
1567 + diff_text
1569 + diff_text
1568 + postpatch_text[char_count2:]
1570 + postpatch_text[char_count2:]
1569 )
1571 )
1570 elif diff_type == self.DIFF_DELETE:
1572 elif diff_type == self.DIFF_DELETE:
1571 # Deletion.
1573 # Deletion.
1572 patch.length1 += len(diff_text)
1574 patch.length1 += len(diff_text)
1573 patch.diffs.append(diffs[x])
1575 patch.diffs.append(diffs[x])
1574 postpatch_text = (
1576 postpatch_text = (
1575 postpatch_text[:char_count2]
1577 postpatch_text[:char_count2]
1576 + postpatch_text[char_count2 + len(diff_text) :]
1578 + postpatch_text[char_count2 + len(diff_text) :]
1577 )
1579 )
1578 elif (
1580 elif (
1579 diff_type == self.DIFF_EQUAL
1581 diff_type == self.DIFF_EQUAL
1580 and len(diff_text) <= 2 * self.Patch_Margin
1582 and len(diff_text) <= 2 * self.Patch_Margin
1581 and len(patch.diffs) != 0
1583 and len(patch.diffs) != 0
1582 and len(diffs) != x + 1
1584 and len(diffs) != x + 1
1583 ):
1585 ):
1584 # Small equality inside a patch.
1586 # Small equality inside a patch.
1585 patch.diffs.append(diffs[x])
1587 patch.diffs.append(diffs[x])
1586 patch.length1 += len(diff_text)
1588 patch.length1 += len(diff_text)
1587 patch.length2 += len(diff_text)
1589 patch.length2 += len(diff_text)
1588
1590
1589 if diff_type == self.DIFF_EQUAL and len(diff_text) >= 2 * self.Patch_Margin:
1591 if diff_type == self.DIFF_EQUAL and len(diff_text) >= 2 * self.Patch_Margin:
1590 # Time for a new patch.
1592 # Time for a new patch.
1591 if len(patch.diffs) != 0:
1593 if len(patch.diffs) != 0:
1592 self.patch_addContext(patch, prepatch_text)
1594 self.patch_addContext(patch, prepatch_text)
1593 patches.append(patch)
1595 patches.append(patch)
1594 patch = patch_obj()
1596 patch = patch_obj()
1595 # Unlike Unidiff, our patch lists have a rolling context.
1597 # Unlike Unidiff, our patch lists have a rolling context.
1596 # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
1598 # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
1597 # Update prepatch text & pos to reflect the application of the
1599 # Update prepatch text & pos to reflect the application of the
1598 # just completed patch.
1600 # just completed patch.
1599 prepatch_text = postpatch_text
1601 prepatch_text = postpatch_text
1600 char_count1 = char_count2
1602 char_count1 = char_count2
1601
1603
1602 # Update the current character count.
1604 # Update the current character count.
1603 if diff_type != self.DIFF_INSERT:
1605 if diff_type != self.DIFF_INSERT:
1604 char_count1 += len(diff_text)
1606 char_count1 += len(diff_text)
1605 if diff_type != self.DIFF_DELETE:
1607 if diff_type != self.DIFF_DELETE:
1606 char_count2 += len(diff_text)
1608 char_count2 += len(diff_text)
1607
1609
1608 # Pick up the leftover patch if not empty.
1610 # Pick up the leftover patch if not empty.
1609 if len(patch.diffs) != 0:
1611 if len(patch.diffs) != 0:
1610 self.patch_addContext(patch, prepatch_text)
1612 self.patch_addContext(patch, prepatch_text)
1611 patches.append(patch)
1613 patches.append(patch)
1612 return patches
1614 return patches
1613
1615
1614 def patch_deepCopy(self, patches):
1616 def patch_deepCopy(self, patches):
1615 """Given an array of patches, return another array that is identical.
1617 """Given an array of patches, return another array that is identical.
1616
1618
1617 Args:
1619 Args:
1618 patches: Array of Patch objects.
1620 patches: Array of Patch objects.
1619
1621
1620 Returns:
1622 Returns:
1621 Array of Patch objects.
1623 Array of Patch objects.
1622 """
1624 """
1623 patchesCopy = []
1625 patchesCopy = []
1624 for patch in patches:
1626 for patch in patches:
1625 patchCopy = patch_obj()
1627 patchCopy = patch_obj()
1626 # No need to deep copy the tuples since they are immutable.
1628 # No need to deep copy the tuples since they are immutable.
1627 patchCopy.diffs = patch.diffs[:]
1629 patchCopy.diffs = patch.diffs[:]
1628 patchCopy.start1 = patch.start1
1630 patchCopy.start1 = patch.start1
1629 patchCopy.start2 = patch.start2
1631 patchCopy.start2 = patch.start2
1630 patchCopy.length1 = patch.length1
1632 patchCopy.length1 = patch.length1
1631 patchCopy.length2 = patch.length2
1633 patchCopy.length2 = patch.length2
1632 patchesCopy.append(patchCopy)
1634 patchesCopy.append(patchCopy)
1633 return patchesCopy
1635 return patchesCopy
1634
1636
1635 def patch_apply(self, patches, text):
1637 def patch_apply(self, patches, text):
1636 """Merge a set of patches onto the text. Return a patched text, as well
1638 """Merge a set of patches onto the text. Return a patched text, as well
1637 as a list of true/false values indicating which patches were applied.
1639 as a list of true/false values indicating which patches were applied.
1638
1640
1639 Args:
1641 Args:
1640 patches: Array of Patch objects.
1642 patches: Array of Patch objects.
1641 text: Old text.
1643 text: Old text.
1642
1644
1643 Returns:
1645 Returns:
1644 Two element Array, containing the new text and an array of boolean values.
1646 Two element Array, containing the new text and an array of boolean values.
1645 """
1647 """
1646 if not patches:
1648 if not patches:
1647 return (text, [])
1649 return (text, [])
1648
1650
1649 # Deep copy the patches so that no changes are made to originals.
1651 # Deep copy the patches so that no changes are made to originals.
1650 patches = self.patch_deepCopy(patches)
1652 patches = self.patch_deepCopy(patches)
1651
1653
1652 nullPadding = self.patch_addPadding(patches)
1654 nullPadding = self.patch_addPadding(patches)
1653 text = nullPadding + text + nullPadding
1655 text = nullPadding + text + nullPadding
1654 self.patch_splitMax(patches)
1656 self.patch_splitMax(patches)
1655
1657
1656 # delta keeps track of the offset between the expected and actual location
1658 # delta keeps track of the offset between the expected and actual location
1657 # of the previous patch. If there are patches expected at positions 10 and
1659 # of the previous patch. If there are patches expected at positions 10 and
1658 # 20, but the first patch was found at 12, delta is 2 and the second patch
1660 # 20, but the first patch was found at 12, delta is 2 and the second patch
1659 # has an effective expected position of 22.
1661 # has an effective expected position of 22.
1660 delta = 0
1662 delta = 0
1661 results = []
1663 results = []
1662 for patch in patches:
1664 for patch in patches:
1663 expected_loc = patch.start2 + delta
1665 expected_loc = patch.start2 + delta
1664 text1 = self.diff_text1(patch.diffs)
1666 text1 = self.diff_text1(patch.diffs)
1665 end_loc = -1
1667 end_loc = -1
1666 if len(text1) > self.Match_MaxBits:
1668 if len(text1) > self.Match_MaxBits:
1667 # patch_splitMax will only provide an oversized pattern in the case of
1669 # patch_splitMax will only provide an oversized pattern in the case of
1668 # a monster delete.
1670 # a monster delete.
1669 start_loc = self.match_main(
1671 start_loc = self.match_main(
1670 text, text1[: self.Match_MaxBits], expected_loc
1672 text, text1[: self.Match_MaxBits], expected_loc
1671 )
1673 )
1672 if start_loc != -1:
1674 if start_loc != -1:
1673 end_loc = self.match_main(
1675 end_loc = self.match_main(
1674 text,
1676 text,
1675 text1[-self.Match_MaxBits :],
1677 text1[-self.Match_MaxBits :],
1676 expected_loc + len(text1) - self.Match_MaxBits,
1678 expected_loc + len(text1) - self.Match_MaxBits,
1677 )
1679 )
1678 if end_loc == -1 or start_loc >= end_loc:
1680 if end_loc == -1 or start_loc >= end_loc:
1679 # Can't find valid trailing context. Drop this patch.
1681 # Can't find valid trailing context. Drop this patch.
1680 start_loc = -1
1682 start_loc = -1
1681 else:
1683 else:
1682 start_loc = self.match_main(text, text1, expected_loc)
1684 start_loc = self.match_main(text, text1, expected_loc)
1683 if start_loc == -1:
1685 if start_loc == -1:
1684 # No match found. :(
1686 # No match found. :(
1685 results.append(False)
1687 results.append(False)
1686 # Subtract the delta for this failed patch from subsequent patches.
1688 # Subtract the delta for this failed patch from subsequent patches.
1687 delta -= patch.length2 - patch.length1
1689 delta -= patch.length2 - patch.length1
1688 else:
1690 else:
1689 # Found a match. :)
1691 # Found a match. :)
1690 results.append(True)
1692 results.append(True)
1691 delta = start_loc - expected_loc
1693 delta = start_loc - expected_loc
1692 if end_loc == -1:
1694 if end_loc == -1:
1693 text2 = text[start_loc : start_loc + len(text1)]
1695 text2 = text[start_loc : start_loc + len(text1)]
1694 else:
1696 else:
1695 text2 = text[start_loc : end_loc + self.Match_MaxBits]
1697 text2 = text[start_loc : end_loc + self.Match_MaxBits]
1696 if text1 == text2:
1698 if text1 == text2:
1697 # Perfect match, just shove the replacement text in.
1699 # Perfect match, just shove the replacement text in.
1698 text = (
1700 text = (
1699 text[:start_loc]
1701 text[:start_loc]
1700 + self.diff_text2(patch.diffs)
1702 + self.diff_text2(patch.diffs)
1701 + text[start_loc + len(text1) :]
1703 + text[start_loc + len(text1) :]
1702 )
1704 )
1703 else:
1705 else:
1704 # Imperfect match.
1706 # Imperfect match.
1705 # Run a diff to get a framework of equivalent indices.
1707 # Run a diff to get a framework of equivalent indices.
1706 diffs = self.diff_main(text1, text2, False)
1708 diffs = self.diff_main(text1, text2, False)
1707 if (
1709 if (
1708 len(text1) > self.Match_MaxBits
1710 len(text1) > self.Match_MaxBits
1709 and self.diff_levenshtein(diffs) / float(len(text1))
1711 and self.diff_levenshtein(diffs) / float(len(text1))
1710 > self.Patch_DeleteThreshold
1712 > self.Patch_DeleteThreshold
1711 ):
1713 ):
1712 # The end points match, but the content is unacceptably bad.
1714 # The end points match, but the content is unacceptably bad.
1713 results[-1] = False
1715 results[-1] = False
1714 else:
1716 else:
1715 self.diff_cleanupSemanticLossless(diffs)
1717 self.diff_cleanupSemanticLossless(diffs)
1716 index1 = 0
1718 index1 = 0
1717 for op, data in patch.diffs:
1719 for op, data in patch.diffs:
1718 if op != self.DIFF_EQUAL:
1720 if op != self.DIFF_EQUAL:
1719 index2 = self.diff_xIndex(diffs, index1)
1721 index2 = self.diff_xIndex(diffs, index1)
1720 if op == self.DIFF_INSERT: # Insertion
1722 if op == self.DIFF_INSERT: # Insertion
1721 text = (
1723 text = (
1722 text[: start_loc + index2]
1724 text[: start_loc + index2]
1723 + data
1725 + data
1724 + text[start_loc + index2 :]
1726 + text[start_loc + index2 :]
1725 )
1727 )
1726 elif op == self.DIFF_DELETE: # Deletion
1728 elif op == self.DIFF_DELETE: # Deletion
1727 text = (
1729 text = (
1728 text[: start_loc + index2]
1730 text[: start_loc + index2]
1729 + text[
1731 + text[
1730 start_loc
1732 start_loc
1731 + self.diff_xIndex(diffs, index1 + len(data)) :
1733 + self.diff_xIndex(diffs, index1 + len(data)) :
1732 ]
1734 ]
1733 )
1735 )
1734 if op != self.DIFF_DELETE:
1736 if op != self.DIFF_DELETE:
1735 index1 += len(data)
1737 index1 += len(data)
1736 # Strip the padding off.
1738 # Strip the padding off.
1737 text = text[len(nullPadding) : -len(nullPadding)]
1739 text = text[len(nullPadding) : -len(nullPadding)]
1738 return (text, results)
1740 return (text, results)
1739
1741
1740 def patch_addPadding(self, patches):
1742 def patch_addPadding(self, patches):
1741 """Add some padding on text start and end so that edges can match
1743 """Add some padding on text start and end so that edges can match
1742 something. Intended to be called only from within patch_apply.
1744 something. Intended to be called only from within patch_apply.
1743
1745
1744 Args:
1746 Args:
1745 patches: Array of Patch objects.
1747 patches: Array of Patch objects.
1746
1748
1747 Returns:
1749 Returns:
1748 The padding string added to each side.
1750 The padding string added to each side.
1749 """
1751 """
1750 paddingLength = self.Patch_Margin
1752 paddingLength = self.Patch_Margin
1751 nullPadding = ""
1753 nullPadding = ""
1752 for x in range(1, paddingLength + 1):
1754 for x in range(1, paddingLength + 1):
1753 nullPadding += chr(x)
1755 nullPadding += chr(x)
1754
1756
1755 # Bump all the patches forward.
1757 # Bump all the patches forward.
1756 for patch in patches:
1758 for patch in patches:
1757 patch.start1 += paddingLength
1759 patch.start1 += paddingLength
1758 patch.start2 += paddingLength
1760 patch.start2 += paddingLength
1759
1761
1760 # Add some padding on start of first diff.
1762 # Add some padding on start of first diff.
1761 patch = patches[0]
1763 patch = patches[0]
1762 diffs = patch.diffs
1764 diffs = patch.diffs
1763 if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1765 if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1764 # Add nullPadding equality.
1766 # Add nullPadding equality.
1765 diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1767 diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1766 patch.start1 -= paddingLength # Should be 0.
1768 patch.start1 -= paddingLength # Should be 0.
1767 patch.start2 -= paddingLength # Should be 0.
1769 patch.start2 -= paddingLength # Should be 0.
1768 patch.length1 += paddingLength
1770 patch.length1 += paddingLength
1769 patch.length2 += paddingLength
1771 patch.length2 += paddingLength
1770 elif paddingLength > len(diffs[0][1]):
1772 elif paddingLength > len(diffs[0][1]):
1771 # Grow first equality.
1773 # Grow first equality.
1772 extraLength = paddingLength - len(diffs[0][1])
1774 extraLength = paddingLength - len(diffs[0][1])
1773 newText = nullPadding[len(diffs[0][1]) :] + diffs[0][1]
1775 newText = nullPadding[len(diffs[0][1]) :] + diffs[0][1]
1774 diffs[0] = (diffs[0][0], newText)
1776 diffs[0] = (diffs[0][0], newText)
1775 patch.start1 -= extraLength
1777 patch.start1 -= extraLength
1776 patch.start2 -= extraLength
1778 patch.start2 -= extraLength
1777 patch.length1 += extraLength
1779 patch.length1 += extraLength
1778 patch.length2 += extraLength
1780 patch.length2 += extraLength
1779
1781
1780 # Add some padding on end of last diff.
1782 # Add some padding on end of last diff.
1781 patch = patches[-1]
1783 patch = patches[-1]
1782 diffs = patch.diffs
1784 diffs = patch.diffs
1783 if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1785 if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1784 # Add nullPadding equality.
1786 # Add nullPadding equality.
1785 diffs.append((self.DIFF_EQUAL, nullPadding))
1787 diffs.append((self.DIFF_EQUAL, nullPadding))
1786 patch.length1 += paddingLength
1788 patch.length1 += paddingLength
1787 patch.length2 += paddingLength
1789 patch.length2 += paddingLength
1788 elif paddingLength > len(diffs[-1][1]):
1790 elif paddingLength > len(diffs[-1][1]):
1789 # Grow last equality.
1791 # Grow last equality.
1790 extraLength = paddingLength - len(diffs[-1][1])
1792 extraLength = paddingLength - len(diffs[-1][1])
1791 newText = diffs[-1][1] + nullPadding[:extraLength]
1793 newText = diffs[-1][1] + nullPadding[:extraLength]
1792 diffs[-1] = (diffs[-1][0], newText)
1794 diffs[-1] = (diffs[-1][0], newText)
1793 patch.length1 += extraLength
1795 patch.length1 += extraLength
1794 patch.length2 += extraLength
1796 patch.length2 += extraLength
1795
1797
1796 return nullPadding
1798 return nullPadding
1797
1799
1798 def patch_splitMax(self, patches):
1800 def patch_splitMax(self, patches):
1799 """Look through the patches and break up any which are longer than the
1801 """Look through the patches and break up any which are longer than the
1800 maximum limit of the match algorithm.
1802 maximum limit of the match algorithm.
1801 Intended to be called only from within patch_apply.
1803 Intended to be called only from within patch_apply.
1802
1804
1803 Args:
1805 Args:
1804 patches: Array of Patch objects.
1806 patches: Array of Patch objects.
1805 """
1807 """
1806 patch_size = self.Match_MaxBits
1808 patch_size = self.Match_MaxBits
1807 if patch_size == 0:
1809 if patch_size == 0:
1808 # Python has the option of not splitting strings due to its ability
1810 # Python has the option of not splitting strings due to its ability
1809 # to handle integers of arbitrary precision.
1811 # to handle integers of arbitrary precision.
1810 return
1812 return
1811 for x in range(len(patches)):
1813 for x in range(len(patches)):
1812 if patches[x].length1 <= patch_size:
1814 if patches[x].length1 <= patch_size:
1813 continue
1815 continue
1814 bigpatch = patches[x]
1816 bigpatch = patches[x]
1815 # Remove the big old patch.
1817 # Remove the big old patch.
1816 del patches[x]
1818 del patches[x]
1817 x -= 1
1819 x -= 1
1818 start1 = bigpatch.start1
1820 start1 = bigpatch.start1
1819 start2 = bigpatch.start2
1821 start2 = bigpatch.start2
1820 precontext = ""
1822 precontext = ""
1821 while len(bigpatch.diffs) != 0:
1823 while len(bigpatch.diffs) != 0:
1822 # Create one of several smaller patches.
1824 # Create one of several smaller patches.
1823 patch = patch_obj()
1825 patch = patch_obj()
1824 empty = True
1826 empty = True
1825 patch.start1 = start1 - len(precontext)
1827 patch.start1 = start1 - len(precontext)
1826 patch.start2 = start2 - len(precontext)
1828 patch.start2 = start2 - len(precontext)
1827 if precontext:
1829 if precontext:
1828 patch.length1 = patch.length2 = len(precontext)
1830 patch.length1 = patch.length2 = len(precontext)
1829 patch.diffs.append((self.DIFF_EQUAL, precontext))
1831 patch.diffs.append((self.DIFF_EQUAL, precontext))
1830
1832
1831 while (
1833 while (
1832 len(bigpatch.diffs) != 0
1834 len(bigpatch.diffs) != 0
1833 and patch.length1 < patch_size - self.Patch_Margin
1835 and patch.length1 < patch_size - self.Patch_Margin
1834 ):
1836 ):
1835 (diff_type, diff_text) = bigpatch.diffs[0]
1837 (diff_type, diff_text) = bigpatch.diffs[0]
1836 if diff_type == self.DIFF_INSERT:
1838 if diff_type == self.DIFF_INSERT:
1837 # Insertions are harmless.
1839 # Insertions are harmless.
1838 patch.length2 += len(diff_text)
1840 patch.length2 += len(diff_text)
1839 start2 += len(diff_text)
1841 start2 += len(diff_text)
1840 patch.diffs.append(bigpatch.diffs.pop(0))
1842 patch.diffs.append(bigpatch.diffs.pop(0))
1841 empty = False
1843 empty = False
1842 elif (
1844 elif (
1843 diff_type == self.DIFF_DELETE
1845 diff_type == self.DIFF_DELETE
1844 and len(patch.diffs) == 1
1846 and len(patch.diffs) == 1
1845 and patch.diffs[0][0] == self.DIFF_EQUAL
1847 and patch.diffs[0][0] == self.DIFF_EQUAL
1846 and len(diff_text) > 2 * patch_size
1848 and len(diff_text) > 2 * patch_size
1847 ):
1849 ):
1848 # This is a large deletion. Let it pass in one chunk.
1850 # This is a large deletion. Let it pass in one chunk.
1849 patch.length1 += len(diff_text)
1851 patch.length1 += len(diff_text)
1850 start1 += len(diff_text)
1852 start1 += len(diff_text)
1851 empty = False
1853 empty = False
1852 patch.diffs.append((diff_type, diff_text))
1854 patch.diffs.append((diff_type, diff_text))
1853 del bigpatch.diffs[0]
1855 del bigpatch.diffs[0]
1854 else:
1856 else:
1855 # Deletion or equality. Only take as much as we can stomach.
1857 # Deletion or equality. Only take as much as we can stomach.
1856 diff_text = diff_text[
1858 diff_text = diff_text[
1857 : patch_size - patch.length1 - self.Patch_Margin
1859 : patch_size - patch.length1 - self.Patch_Margin
1858 ]
1860 ]
1859 patch.length1 += len(diff_text)
1861 patch.length1 += len(diff_text)
1860 start1 += len(diff_text)
1862 start1 += len(diff_text)
1861 if diff_type == self.DIFF_EQUAL:
1863 if diff_type == self.DIFF_EQUAL:
1862 patch.length2 += len(diff_text)
1864 patch.length2 += len(diff_text)
1863 start2 += len(diff_text)
1865 start2 += len(diff_text)
1864 else:
1866 else:
1865 empty = False
1867 empty = False
1866
1868
1867 patch.diffs.append((diff_type, diff_text))
1869 patch.diffs.append((diff_type, diff_text))
1868 if diff_text == bigpatch.diffs[0][1]:
1870 if diff_text == bigpatch.diffs[0][1]:
1869 del bigpatch.diffs[0]
1871 del bigpatch.diffs[0]
1870 else:
1872 else:
1871 bigpatch.diffs[0] = (
1873 bigpatch.diffs[0] = (
1872 bigpatch.diffs[0][0],
1874 bigpatch.diffs[0][0],
1873 bigpatch.diffs[0][1][len(diff_text) :],
1875 bigpatch.diffs[0][1][len(diff_text) :],
1874 )
1876 )
1875
1877
1876 # Compute the head context for the next patch.
1878 # Compute the head context for the next patch.
1877 precontext = self.diff_text2(patch.diffs)
1879 precontext = self.diff_text2(patch.diffs)
1878 precontext = precontext[-self.Patch_Margin :]
1880 precontext = precontext[-self.Patch_Margin :]
1879 # Append the end context for this patch.
1881 # Append the end context for this patch.
1880 postcontext = self.diff_text1(bigpatch.diffs)[: self.Patch_Margin]
1882 postcontext = self.diff_text1(bigpatch.diffs)[: self.Patch_Margin]
1881 if postcontext:
1883 if postcontext:
1882 patch.length1 += len(postcontext)
1884 patch.length1 += len(postcontext)
1883 patch.length2 += len(postcontext)
1885 patch.length2 += len(postcontext)
1884 if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1886 if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1885 patch.diffs[-1] = (
1887 patch.diffs[-1] = (
1886 self.DIFF_EQUAL,
1888 self.DIFF_EQUAL,
1887 patch.diffs[-1][1] + postcontext,
1889 patch.diffs[-1][1] + postcontext,
1888 )
1890 )
1889 else:
1891 else:
1890 patch.diffs.append((self.DIFF_EQUAL, postcontext))
1892 patch.diffs.append((self.DIFF_EQUAL, postcontext))
1891
1893
1892 if not empty:
1894 if not empty:
1893 x += 1
1895 x += 1
1894 patches.insert(x, patch)
1896 patches.insert(x, patch)
1895
1897
1896 def patch_toText(self, patches):
1898 def patch_toText(self, patches):
1897 """Take a list of patches and return a textual representation.
1899 """Take a list of patches and return a textual representation.
1898
1900
1899 Args:
1901 Args:
1900 patches: Array of Patch objects.
1902 patches: Array of Patch objects.
1901
1903
1902 Returns:
1904 Returns:
1903 Text representation of patches.
1905 Text representation of patches.
1904 """
1906 """
1905 text = []
1907 text = []
1906 for patch in patches:
1908 for patch in patches:
1907 text.append(str(patch))
1909 text.append(str(patch))
1908 return "".join(text)
1910 return "".join(text)
1909
1911
1910 def patch_fromText(self, textline):
1912 def patch_fromText(self, textline):
1911 """Parse a textual representation of patches and return a list of patch
1913 """Parse a textual representation of patches and return a list of patch
1912 objects.
1914 objects.
1913
1915
1914 Args:
1916 Args:
1915 textline: Text representation of patches.
1917 textline: Text representation of patches.
1916
1918
1917 Returns:
1919 Returns:
1918 Array of Patch objects.
1920 Array of Patch objects.
1919
1921
1920 Raises:
1922 Raises:
1921 ValueError: If invalid input.
1923 ValueError: If invalid input.
1922 """
1924 """
1923 if type(textline) == str:
1925 if type(textline) == str:
1924 # Patches should be composed of a subset of ascii chars, Unicode not
1926 # Patches should be composed of a subset of ascii chars, Unicode not
1925 # required. If this encode raises UnicodeEncodeError, patch is invalid.
1927 # required. If this encode raises UnicodeEncodeError, patch is invalid.
1926 textline = textline.encode("ascii")
1928 textline = textline.encode("ascii")
1927 patches = []
1929 patches = []
1928 if not textline:
1930 if not textline:
1929 return patches
1931 return patches
1930 text = textline.split("\n")
1932 text = textline.split("\n")
1931 while len(text) != 0:
1933 while len(text) != 0:
1932 m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1934 m = re.match(r"^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1933 if not m:
1935 if not m:
1934 raise ValueError("Invalid patch string: " + text[0])
1936 raise ValueError("Invalid patch string: " + text[0])
1935 patch = patch_obj()
1937 patch = patch_obj()
1936 patches.append(patch)
1938 patches.append(patch)
1937 patch.start1 = int(m.group(1))
1939 patch.start1 = int(m.group(1))
1938 if m.group(2) == "":
1940 if m.group(2) == "":
1939 patch.start1 -= 1
1941 patch.start1 -= 1
1940 patch.length1 = 1
1942 patch.length1 = 1
1941 elif m.group(2) == "0":
1943 elif m.group(2) == "0":
1942 patch.length1 = 0
1944 patch.length1 = 0
1943 else:
1945 else:
1944 patch.start1 -= 1
1946 patch.start1 -= 1
1945 patch.length1 = int(m.group(2))
1947 patch.length1 = int(m.group(2))
1946
1948
1947 patch.start2 = int(m.group(3))
1949 patch.start2 = int(m.group(3))
1948 if m.group(4) == "":
1950 if m.group(4) == "":
1949 patch.start2 -= 1
1951 patch.start2 -= 1
1950 patch.length2 = 1
1952 patch.length2 = 1
1951 elif m.group(4) == "0":
1953 elif m.group(4) == "0":
1952 patch.length2 = 0
1954 patch.length2 = 0
1953 else:
1955 else:
1954 patch.start2 -= 1
1956 patch.start2 -= 1
1955 patch.length2 = int(m.group(4))
1957 patch.length2 = int(m.group(4))
1956
1958
1957 del text[0]
1959 del text[0]
1958
1960
1959 while len(text) != 0:
1961 while len(text) != 0:
1960 if text[0]:
1962 if text[0]:
1961 sign = text[0][0]
1963 sign = text[0][0]
1962 else:
1964 else:
1963 sign = ""
1965 sign = ""
1964 line = urllib.parse.unquote(text[0][1:])
1966 line = urllib.parse.unquote(text[0][1:])
1965 line = line.decode("utf-8")
1967 line = line.decode("utf-8")
1966 if sign == "+":
1968 if sign == "+":
1967 # Insertion.
1969 # Insertion.
1968 patch.diffs.append((self.DIFF_INSERT, line))
1970 patch.diffs.append((self.DIFF_INSERT, line))
1969 elif sign == "-":
1971 elif sign == "-":
1970 # Deletion.
1972 # Deletion.
1971 patch.diffs.append((self.DIFF_DELETE, line))
1973 patch.diffs.append((self.DIFF_DELETE, line))
1972 elif sign == " ":
1974 elif sign == " ":
1973 # Minor equality.
1975 # Minor equality.
1974 patch.diffs.append((self.DIFF_EQUAL, line))
1976 patch.diffs.append((self.DIFF_EQUAL, line))
1975 elif sign == "@":
1977 elif sign == "@":
1976 # Start of next patch.
1978 # Start of next patch.
1977 break
1979 break
1978 elif sign == "":
1980 elif sign == "":
1979 # Blank line? Whatever.
1981 # Blank line? Whatever.
1980 pass
1982 pass
1981 else:
1983 else:
1982 # WTF?
1984 # WTF?
1983 raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
1985 raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
1984 del text[0]
1986 del text[0]
1985 return patches
1987 return patches
1986
1988
1987
1989
class patch_obj:
    """Class representing one patch operation."""

    def __init__(self):
        """Initializes with an empty list of diffs."""
        self.diffs = []  # List of (op, text) diff tuples.
        self.start1 = None  # Start offset of the patch in text1.
        self.start2 = None  # Start offset of the patch in text2.
        self.length1 = 0  # Length of the affected span in text1.
        self.length2 = 0  # Length of the affected span in text2.

    def __str__(self):
        """Emulate GNU diff's format.
        Header: @@ -382,8 +481,9 @@
        Indices are printed as 1-based, not 0-based.

        Returns:
          The GNU diff string.
        """

        def coords(start, length):
            # Render "start,length" per GNU diff conventions (1-based;
            # a length of 1 is implicit; a length of 0 keeps the 0-based
            # start).
            if length == 0:
                return str(start) + ",0"
            if length == 1:
                return str(start + 1)
            return str(start + 1) + "," + str(length)

        text = [
            "@@ -",
            coords(self.start1, self.length1),
            " +",
            coords(self.start2, self.length2),
            " @@\n",
        ]
        # Escape the body of the patch with %xx notation.
        for op, data in self.diffs:
            if op == diff_match_patch.DIFF_INSERT:
                text.append("+")
            elif op == diff_match_patch.DIFF_DELETE:
                text.append("-")
            elif op == diff_match_patch.DIFF_EQUAL:
                text.append(" ")
            # Encode to UTF-8 bytes first so high characters survive quoting.
            data = data.encode("utf-8")
            text.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
        return "".join(text)
@@ -1,1271 +1,1146 b''
1
1
2
2
3 # Copyright (C) 2011-2020 RhodeCode GmbH
3 # Copyright (C) 2011-2020 RhodeCode GmbH
4 #
4 #
5 # This program is free software: you can redistribute it and/or modify
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License, version 3
6 # it under the terms of the GNU Affero General Public License, version 3
7 # (only), as published by the Free Software Foundation.
7 # (only), as published by the Free Software Foundation.
8 #
8 #
9 # This program is distributed in the hope that it will be useful,
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
12 # GNU General Public License for more details.
13 #
13 #
14 # You should have received a copy of the GNU Affero General Public License
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 #
16 #
17 # This program is dual-licensed. If you wish to learn more about the
17 # This program is dual-licensed. If you wish to learn more about the
18 # RhodeCode Enterprise Edition, including its added features, Support services,
18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20
20
21
21
22 """
22 """
23 Set of diffing helpers, previously part of vcs
23 Set of diffing helpers, previously part of vcs
24 """
24 """
25
25 import dataclasses
26 import os
26 import os
27 import re
27 import re
28 import bz2
28 import bz2
29 import gzip
29 import gzip
30 import time
30 import time
31
31
32 import collections
33 import difflib
32 import difflib
34 import logging
33 import logging
35 import pickle
34 import pickle
36 from itertools import tee
35 from itertools import tee
37
36
38 from rhodecode.lib.vcs.exceptions import VCSError
37 from rhodecode.lib.vcs.exceptions import VCSError
39 from rhodecode.lib.vcs.nodes import FileNode, SubModuleNode
38 from rhodecode.lib.vcs.nodes import FileNode, SubModuleNode
40 from rhodecode.lib.utils2 import safe_unicode, safe_str
39 from rhodecode.lib.vcs.backends import base
40 from rhodecode.lib.str_utils import safe_str
41
41
42 log = logging.getLogger(__name__)
42 log = logging.getLogger(__name__)
43
43
44 # define max context, a file with more than this numbers of lines is unusable
44 # define max context, a file with more than this numbers of lines is unusable
45 # in browser anyway
45 # in browser anyway
46 MAX_CONTEXT = 20 * 1024
46 MAX_CONTEXT = 20 * 1024
47 DEFAULT_CONTEXT = 3
47 DEFAULT_CONTEXT = 3
48
48
49
49
50 def get_diff_context(request):
50 def get_diff_context(request):
51 return MAX_CONTEXT if request.GET.get('fullcontext', '') == '1' else DEFAULT_CONTEXT
51 return MAX_CONTEXT if request.GET.get('fullcontext', '') == '1' else DEFAULT_CONTEXT
52
52
53
53
54 def get_diff_whitespace_flag(request):
54 def get_diff_whitespace_flag(request):
55 return request.GET.get('ignorews', '') == '1'
55 return request.GET.get('ignorews', '') == '1'
56
56
57
57
58 class OPS(object):
58 @dataclasses.dataclass
59 ADD = 'A'
59 class OPS:
60 MOD = 'M'
60 ADD: str = 'A'
61 DEL = 'D'
61 MOD: str = 'M'
62 DEL: str = 'D'
63
64
65 @dataclasses.dataclass
66 class DiffLineNumber:
67 old: int | None
68 new: int | None
69
70 def __iter__(self):
71 yield self.old
72 yield self.new
62
73
63
74
64 def get_gitdiff(filenode_old, filenode_new, ignore_whitespace=True, context=3):
75 def get_gitdiff(filenode_old, filenode_new, ignore_whitespace=True, context=3):
65 """
76 """
66 Returns git style diff between given ``filenode_old`` and ``filenode_new``.
77 Returns git style diff between given ``filenode_old`` and ``filenode_new``.
67
78
68 :param ignore_whitespace: ignore whitespaces in diff
79 :param ignore_whitespace: ignore whitespaces in diff
69 """
80 """
70 # make sure we pass in default context
81 # make sure we pass in default context
71 context = context or 3
82 context = context or 3
72 # protect against IntOverflow when passing HUGE context
83 # protect against IntOverflow when passing HUGE context
73 if context > MAX_CONTEXT:
84 if context > MAX_CONTEXT:
74 context = MAX_CONTEXT
85 context = MAX_CONTEXT
75
86
76 submodules = [o for o in [filenode_new, filenode_old] if isinstance(o, SubModuleNode)]
87 submodules = [o for o in [filenode_new, filenode_old] if isinstance(o, SubModuleNode)]
77 if submodules:
88 if submodules:
78 return ''
89 return ''
79
90
80 for filenode in (filenode_old, filenode_new):
91 for filenode in (filenode_old, filenode_new):
81 if not isinstance(filenode, FileNode):
92 if not isinstance(filenode, FileNode):
82 raise VCSError(
93 raise VCSError(f"Given object should be FileNode object, not {filenode.__class__}")
83 "Given object should be FileNode object, not %s"
84 % filenode.__class__)
85
94
86 repo = filenode_new.commit.repository
95 repo = filenode_new.commit.repository
87 old_commit = filenode_old.commit or repo.EMPTY_COMMIT
96 old_commit = filenode_old.commit or repo.EMPTY_COMMIT
88 new_commit = filenode_new.commit
97 new_commit = filenode_new.commit
89
98
90 vcs_gitdiff = repo.get_diff(
99 vcs_gitdiff = repo.get_diff(
91 old_commit, new_commit, filenode_new.path,
100 old_commit, new_commit, filenode_new.path,
92 ignore_whitespace, context, path1=filenode_old.path)
101 ignore_whitespace, context, path1=filenode_old.path)
93 return vcs_gitdiff
102 return vcs_gitdiff
94
103
95 NEW_FILENODE = 1
104 NEW_FILENODE = 1
96 DEL_FILENODE = 2
105 DEL_FILENODE = 2
97 MOD_FILENODE = 3
106 MOD_FILENODE = 3
98 RENAMED_FILENODE = 4
107 RENAMED_FILENODE = 4
99 COPIED_FILENODE = 5
108 COPIED_FILENODE = 5
100 CHMOD_FILENODE = 6
109 CHMOD_FILENODE = 6
101 BIN_FILENODE = 7
110 BIN_FILENODE = 7
102
111
103
112
104 class LimitedDiffContainer(object):
113 class LimitedDiffContainer(object):
105
114
106 def __init__(self, diff_limit, cur_diff_size, diff):
115 def __init__(self, diff_limit: int, cur_diff_size, diff):
107 self.diff = diff
116 self.diff = diff
108 self.diff_limit = diff_limit
117 self.diff_limit = diff_limit
109 self.cur_diff_size = cur_diff_size
118 self.cur_diff_size = cur_diff_size
110
119
111 def __getitem__(self, key):
120 def __getitem__(self, key):
112 return self.diff.__getitem__(key)
121 return self.diff.__getitem__(key)
113
122
114 def __iter__(self):
123 def __iter__(self):
115 for l in self.diff:
124 for l in self.diff:
116 yield l
125 yield l
117
126
118
127
119 class Action(object):
128 class Action(object):
120 """
129 """
121 Contains constants for the action value of the lines in a parsed diff.
130 Contains constants for the action value of the lines in a parsed diff.
122 """
131 """
123
132
124 ADD = 'add'
133 ADD = 'add'
125 DELETE = 'del'
134 DELETE = 'del'
126 UNMODIFIED = 'unmod'
135 UNMODIFIED = 'unmod'
127
136
128 CONTEXT = 'context'
137 CONTEXT = 'context'
129 OLD_NO_NL = 'old-no-nl'
138 OLD_NO_NL = 'old-no-nl'
130 NEW_NO_NL = 'new-no-nl'
139 NEW_NO_NL = 'new-no-nl'
131
140
132
141
133 class DiffProcessor(object):
142 class DiffProcessor(object):
134 """
143 """
135 Give it a unified or git diff and it returns a list of the files that were
144 Give it a unified or git diff, and it returns a list of the files that were
136 mentioned in the diff together with a dict of meta information that
145 mentioned in the diff together with a dict of meta information that
137 can be used to render it in a HTML template.
146 can be used to render it in an HTML template.
138
147
139 .. note:: Unicode handling
148 .. note:: Unicode handling
140
149
141 The original diffs are a byte sequence and can contain filenames
150 The original diffs are a byte sequence and can contain filenames
142 in mixed encodings. This class generally returns `unicode` objects
151 in mixed encodings. This class generally returns `unicode` objects
143 since the result is intended for presentation to the user.
152 since the result is intended for presentation to the user.
144
153
145 """
154 """
146 _chunk_re = re.compile(r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)')
155 _chunk_re = re.compile(br'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)')
147 _newline_marker = re.compile(r'^\\ No newline at end of file')
156 _newline_marker = re.compile(br'^\\ No newline at end of file')
148
157
149 # used for inline highlighter word split
158 # used for inline highlighter word split
150 _token_re = re.compile(r'()(&gt;|&lt;|&amp;|\W+?)')
159 _token_re = re.compile(br'()(&gt;|&lt;|&amp;|\W+?)')
151
160
152 # collapse ranges of commits over given number
161 # collapse ranges of commits over given number
153 _collapse_commits_over = 5
162 _collapse_commits_over = 5
154
163
155 def __init__(self, diff, format='gitdiff', diff_limit=None,
164 def __init__(self, diff: base.Diff, diff_format='gitdiff', diff_limit: int = 0,
156 file_limit=None, show_full_diff=True):
165 file_limit: int = 0, show_full_diff=True):
157 """
166 """
158 :param diff: A `Diff` object representing a diff from a vcs backend
167 :param diff: A `Diff` object representing a diff from a vcs backend
159 :param format: format of diff passed, `udiff` or `gitdiff`
168 :param diff_format: format of diff passed, `udiff` or `gitdiff`
160 :param diff_limit: define the size of diff that is considered "big"
169 :param diff_limit: define the size of diff that is considered "big"
161 based on that parameter cut off will be triggered, set to None
170 based on that parameter cut off will be triggered, set to None
162 to show full diff
171 to show full diff
163 """
172 """
164 self._diff = diff
173 self._diff = diff
165 self._format = format
174 self._format = diff_format
166 self.adds = 0
175 self.adds = 0
167 self.removes = 0
176 self.removes = 0
168 # calculate diff size
177 # calculate diff size
169 self.diff_limit = diff_limit
178 self.diff_limit = diff_limit
170 self.file_limit = file_limit
179 self.file_limit = file_limit
171 self.show_full_diff = show_full_diff
180 self.show_full_diff = show_full_diff
172 self.cur_diff_size = 0
181 self.cur_diff_size = 0
173 self.parsed = False
182 self.parsed = False
174 self.parsed_diff = []
183 self.parsed_diff = []
175
184
176 log.debug('Initialized DiffProcessor with %s mode', format)
185 log.debug('Initialized DiffProcessor with %s mode', diff_format)
177 if format == 'gitdiff':
186 self.differ = self._highlight_line_udiff
187 self._parser = self._new_parse_gitdiff
188
189 if diff_format == 'gitdiff':
178 self.differ = self._highlight_line_difflib
190 self.differ = self._highlight_line_difflib
179 self._parser = self._parse_gitdiff
191 self._parser = self._parse_gitdiff
180 else:
192 raise DeprecationWarning('gitdiff usage is deprecated')
181 self.differ = self._highlight_line_udiff
182 self._parser = self._new_parse_gitdiff
183
193
184 def _copy_iterator(self):
194 def _copy_iterator(self):
185 """
195 """
186 make a fresh copy of generator, we should not iterate thru
196 make a fresh copy of generator, we should not iterate thru
187 an original as it's needed for repeating operations on
197 an original as it's needed for repeating operations on
188 this instance of DiffProcessor
198 this instance of DiffProcessor
189 """
199 """
190 self.__udiff, iterator_copy = tee(self.__udiff)
200 self.__udiff, iterator_copy = tee(self.__udiff)
191 return iterator_copy
201 return iterator_copy
192
202
193 def _escaper(self, string):
203 def _escaper(self, diff_string):
194 """
204 """
195 Escaper for diff escapes special chars and checks the diff limit
205 Escaper for diff escapes special chars and checks the diff limit
196
206
197 :param string:
207 :param string:
198 """
208 """
199 self.cur_diff_size += len(string)
209 self.cur_diff_size += len(diff_string)
200
210
201 if not self.show_full_diff and (self.cur_diff_size > self.diff_limit):
211 if not self.show_full_diff and (self.cur_diff_size > self.diff_limit):
202 raise DiffLimitExceeded('Diff Limit Exceeded')
212 raise DiffLimitExceeded('Diff Limit Exceeded')
203
213
204 return string \
214 return diff_string \
205 .replace('&', '&amp;')\
215 .replace(b'&', b'&amp;')\
206 .replace('<', '&lt;')\
216 .replace(b'<', b'&lt;')\
207 .replace('>', '&gt;')
217 .replace(b'>', b'&gt;')
208
218
209 def _line_counter(self, l):
219 def _line_counter(self, diff_line):
210 """
220 """
211 Checks each line and bumps total adds/removes for this diff
221 Checks each line and bumps total adds/removes for this diff
212
222
213 :param l:
223 :param diff_line:
214 """
224 """
215 if l.startswith('+') and not l.startswith('+++'):
225 if diff_line.startswith(b'+') and not diff_line.startswith(b'+++'):
216 self.adds += 1
226 self.adds += 1
217 elif l.startswith('-') and not l.startswith('---'):
227 elif diff_line.startswith(b'-') and not diff_line.startswith(b'---'):
218 self.removes += 1
228 self.removes += 1
219 return safe_unicode(l)
229 return diff_line
220
230
221 def _highlight_line_difflib(self, line, next_):
231 def _highlight_line_difflib(self, line, next_):
222 """
232 """
223 Highlight inline changes in both lines.
233 Highlight inline changes in both lines.
224 """
234 """
225
235
226 if line['action'] == Action.DELETE:
236 if line['action'] == Action.DELETE:
227 old, new = line, next_
237 old, new = line, next_
228 else:
238 else:
229 old, new = next_, line
239 old, new = next_, line
230
240
231 oldwords = self._token_re.split(old['line'])
241 oldwords = self._token_re.split(old['line'])
232 newwords = self._token_re.split(new['line'])
242 newwords = self._token_re.split(new['line'])
233 sequence = difflib.SequenceMatcher(None, oldwords, newwords)
243 sequence = difflib.SequenceMatcher(None, oldwords, newwords)
234
244
235 oldfragments, newfragments = [], []
245 oldfragments, newfragments = [], []
236 for tag, i1, i2, j1, j2 in sequence.get_opcodes():
246 for tag, i1, i2, j1, j2 in sequence.get_opcodes():
237 oldfrag = ''.join(oldwords[i1:i2])
247 oldfrag = ''.join(oldwords[i1:i2])
238 newfrag = ''.join(newwords[j1:j2])
248 newfrag = ''.join(newwords[j1:j2])
239 if tag != 'equal':
249 if tag != 'equal':
240 if oldfrag:
250 if oldfrag:
241 oldfrag = '<del>%s</del>' % oldfrag
251 oldfrag = f'<del>{oldfrag}</del>'
242 if newfrag:
252 if newfrag:
243 newfrag = '<ins>%s</ins>' % newfrag
253 newfrag = f'<ins>{newfrag}</ins>'
244 oldfragments.append(oldfrag)
254 oldfragments.append(oldfrag)
245 newfragments.append(newfrag)
255 newfragments.append(newfrag)
246
256
247 old['line'] = "".join(oldfragments)
257 old['line'] = "".join(oldfragments)
248 new['line'] = "".join(newfragments)
258 new['line'] = "".join(newfragments)
249
259
250 def _highlight_line_udiff(self, line, next_):
260 def _highlight_line_udiff(self, line, next_):
251 """
261 """
252 Highlight inline changes in both lines.
262 Highlight inline changes in both lines.
253 """
263 """
254 start = 0
264 start = 0
255 limit = min(len(line['line']), len(next_['line']))
265 limit = min(len(line['line']), len(next_['line']))
256 while start < limit and line['line'][start] == next_['line'][start]:
266 while start < limit and line['line'][start] == next_['line'][start]:
257 start += 1
267 start += 1
258 end = -1
268 end = -1
259 limit -= start
269 limit -= start
260 while -end <= limit and line['line'][end] == next_['line'][end]:
270 while -end <= limit and line['line'][end] == next_['line'][end]:
261 end -= 1
271 end -= 1
262 end += 1
272 end += 1
263 if start or end:
273 if start or end:
264 def do(l):
274 def do(l):
265 last = end + len(l['line'])
275 last = end + len(l['line'])
266 if l['action'] == Action.ADD:
276 if l['action'] == Action.ADD:
267 tag = 'ins'
277 tag = 'ins'
268 else:
278 else:
269 tag = 'del'
279 tag = 'del'
270 l['line'] = '%s<%s>%s</%s>%s' % (
280 l['line'] = f"{l['line'][:start]}<{tag}>{l['line'][start:last]}</{tag}>{l['line'][last:]}"
271 l['line'][:start],
272 tag,
273 l['line'][start:last],
274 tag,
275 l['line'][last:]
276 )
277 do(line)
281 do(line)
278 do(next_)
282 do(next_)
279
283
280 def _clean_line(self, line, command):
284 def _clean_line(self, line, command: str):
281 if command in ['+', '-', ' ']:
285 if command in ['+', '-', ' ']:
282 # only modify the line if it's actually a diff thing
286 # only modify the line if it's actually a diff thing
283 line = line[1:]
287 line = line[1:]
284 return line
288 return line
285
289
286 def _parse_gitdiff(self, inline_diff=True):
290 def _parse_gitdiff(self, inline_diff=True):
287 _files = []
291 _files = []
288 diff_container = lambda arg: arg
292
293 def diff_container(arg):
294 return arg
289
295
290 for chunk in self._diff.chunks():
296 for chunk in self._diff.chunks():
291 head = chunk.header
297 head = chunk.header
292
298
293 diff = map(self._escaper, self.diff_splitter(chunk.diff))
299 diff = map(self._escaper, self.diff_splitter(chunk.diff))
294 raw_diff = chunk.raw
300 raw_diff = chunk.raw
295 limited_diff = False
301 limited_diff = False
296 exceeds_limit = False
302 exceeds_limit = False
297
303
298 op = None
304 op = None
299 stats = {
305 stats = {
300 'added': 0,
306 'added': 0,
301 'deleted': 0,
307 'deleted': 0,
302 'binary': False,
308 'binary': False,
303 'ops': {},
309 'ops': {},
304 }
310 }
305
311
306 if head['deleted_file_mode']:
312 if head['deleted_file_mode']:
307 op = OPS.DEL
313 op = OPS.DEL
308 stats['binary'] = True
314 stats['binary'] = True
309 stats['ops'][DEL_FILENODE] = 'deleted file'
315 stats['ops'][DEL_FILENODE] = 'deleted file'
310
316
311 elif head['new_file_mode']:
317 elif head['new_file_mode']:
312 op = OPS.ADD
318 op = OPS.ADD
313 stats['binary'] = True
319 stats['binary'] = True
314 stats['ops'][NEW_FILENODE] = 'new file %s' % head['new_file_mode']
320 stats['ops'][NEW_FILENODE] = f"new file {safe_str(head['new_file_mode'])}"
315 else: # modify operation, can be copy, rename or chmod
321 else: # modify operation, can be: copy, rename or chmod
316
322
317 # CHMOD
323 # CHMOD
318 if head['new_mode'] and head['old_mode']:
324 if head['new_mode'] and head['old_mode']:
319 op = OPS.MOD
325 op = OPS.MOD
320 stats['binary'] = True
326 stats['binary'] = True
321 stats['ops'][CHMOD_FILENODE] = (
327 stats['ops'][CHMOD_FILENODE] = f"modified file chmod {safe_str(head['old_mode'])} => {safe_str(head['new_mode'])}"
322 'modified file chmod %s => %s' % (
323 head['old_mode'], head['new_mode']))
324 # RENAME
328 # RENAME
325 if head['rename_from'] != head['rename_to']:
329 if head['rename_from'] != head['rename_to']:
326 op = OPS.MOD
330 op = OPS.MOD
327 stats['binary'] = True
331 stats['binary'] = True
328 stats['ops'][RENAMED_FILENODE] = (
332 stats['ops'][RENAMED_FILENODE] = f"file renamed from {safe_str(head['rename_from'])} to {safe_str(head['rename_to'])}"
329 'file renamed from %s to %s' % (
330 head['rename_from'], head['rename_to']))
331 # COPY
333 # COPY
332 if head.get('copy_from') and head.get('copy_to'):
334 if head.get('copy_from') and head.get('copy_to'):
333 op = OPS.MOD
335 op = OPS.MOD
334 stats['binary'] = True
336 stats['binary'] = True
335 stats['ops'][COPIED_FILENODE] = (
337 stats['ops'][COPIED_FILENODE] = f"file copied from {safe_str(head['copy_from'])} to {safe_str(head['copy_to'])}"
336 'file copied from %s to %s' % (
337 head['copy_from'], head['copy_to']))
338
338
339 # If our new parsed headers didn't match anything fallback to
339 # If our new parsed headers didn't match anything fallback to
340 # old style detection
340 # old style detection
341 if op is None:
341 if op is None:
342 if not head['a_file'] and head['b_file']:
342 if not head['a_file'] and head['b_file']:
343 op = OPS.ADD
343 op = OPS.ADD
344 stats['binary'] = True
344 stats['binary'] = True
345 stats['ops'][NEW_FILENODE] = 'new file'
345 stats['ops'][NEW_FILENODE] = 'new file'
346
346
347 elif head['a_file'] and not head['b_file']:
347 elif head['a_file'] and not head['b_file']:
348 op = OPS.DEL
348 op = OPS.DEL
349 stats['binary'] = True
349 stats['binary'] = True
350 stats['ops'][DEL_FILENODE] = 'deleted file'
350 stats['ops'][DEL_FILENODE] = 'deleted file'
351
351
352 # it's not ADD not DELETE
352 # it's not ADD not DELETE
353 if op is None:
353 if op is None:
354 op = OPS.MOD
354 op = OPS.MOD
355 stats['binary'] = True
355 stats['binary'] = True
356 stats['ops'][MOD_FILENODE] = 'modified file'
356 stats['ops'][MOD_FILENODE] = 'modified file'
357
357
358 # a real non-binary diff
358 # a real non-binary diff
359 if head['a_file'] or head['b_file']:
359 if head['a_file'] or head['b_file']:
360 try:
360 try:
361 raw_diff, chunks, _stats = self._parse_lines(diff)
361 raw_diff, chunks, _stats = self._parse_lines(diff)
362 stats['binary'] = False
362 stats['binary'] = False
363 stats['added'] = _stats[0]
363 stats['added'] = _stats[0]
364 stats['deleted'] = _stats[1]
364 stats['deleted'] = _stats[1]
365 # explicit mark that it's a modified file
365 # explicit mark that it's a modified file
366 if op == OPS.MOD:
366 if op == OPS.MOD:
367 stats['ops'][MOD_FILENODE] = 'modified file'
367 stats['ops'][MOD_FILENODE] = 'modified file'
368 exceeds_limit = len(raw_diff) > self.file_limit
368 exceeds_limit = len(raw_diff) > self.file_limit
369
369
370 # changed from _escaper function so we validate size of
370 # changed from _escaper function so we validate size of
371 # each file instead of the whole diff
371 # each file instead of the whole diff
372 # diff will hide big files but still show small ones
372 # diff will hide big files but still show small ones
373 # from my tests, big files are fairly safe to be parsed
373 # from my tests, big files are fairly safe to be parsed
374 # but the browser is the bottleneck
374 # but the browser is the bottleneck
375 if not self.show_full_diff and exceeds_limit:
375 if not self.show_full_diff and exceeds_limit:
376 raise DiffLimitExceeded('File Limit Exceeded')
376 raise DiffLimitExceeded('File Limit Exceeded')
377
377
378 except DiffLimitExceeded:
378 except DiffLimitExceeded:
379 diff_container = lambda _diff: \
379 def diff_container(_diff):
380 LimitedDiffContainer(
380 return LimitedDiffContainer(self.diff_limit, self.cur_diff_size, _diff)
381 self.diff_limit, self.cur_diff_size, _diff)
382
381
383 exceeds_limit = len(raw_diff) > self.file_limit
382 exceeds_limit = len(raw_diff) > self.file_limit
384 limited_diff = True
383 limited_diff = True
385 chunks = []
384 chunks = []
386
385
387 else: # GIT format binary patch, or possibly empty diff
386 else: # GIT format binary patch, or possibly empty diff
388 if head['bin_patch']:
387 if head['bin_patch']:
389 # we have operation already extracted, but we mark simply
388 # we have operation already extracted, but we mark simply
390 # it's a diff we wont show for binary files
389 # it's a diff we won't show for binary files
391 stats['ops'][BIN_FILENODE] = 'binary diff hidden'
390 stats['ops'][BIN_FILENODE] = 'binary diff hidden'
392 chunks = []
391 chunks = []
393
392
394 if chunks and not self.show_full_diff and op == OPS.DEL:
393 if chunks and not self.show_full_diff and op == OPS.DEL:
395 # if not full diff mode show deleted file contents
394 # if not full diff mode show deleted file contents
396 # TODO: anderson: if the view is not too big, there is no way
395 # TODO: anderson: if the view is not too big, there is no way
397 # to see the content of the file
396 # to see the content of the file
398 chunks = []
397 chunks = []
399
398
400 chunks.insert(0, [{
399 frag = [{
401 'old_lineno': '',
400 'old_lineno': '',
402 'new_lineno': '',
401 'new_lineno': '',
403 'action': Action.CONTEXT,
402 'action': Action.CONTEXT,
404 'line': msg,
403 'line': msg,
405 } for _op, msg in stats['ops'].items()
404 } for _op, msg in list(stats['ops'].items())
406 if _op not in [MOD_FILENODE]])
405 if _op not in [MOD_FILENODE]]
406
407 chunks.insert(0, frag)
407
408
408 _files.append({
409 _files.append({
409 'filename': safe_unicode(head['b_path']),
410 'filename': safe_str(head['b_path']),
410 'old_revision': head['a_blob_id'],
411 'old_revision': head['a_blob_id'],
411 'new_revision': head['b_blob_id'],
412 'new_revision': head['b_blob_id'],
412 'chunks': chunks,
413 'chunks': chunks,
413 'raw_diff': safe_unicode(raw_diff),
414 'raw_diff': safe_str(raw_diff),
414 'operation': op,
415 'operation': op,
415 'stats': stats,
416 'stats': stats,
416 'exceeds_limit': exceeds_limit,
417 'exceeds_limit': exceeds_limit,
417 'is_limited_diff': limited_diff,
418 'is_limited_diff': limited_diff,
418 })
419 })
419
420
420 sorter = lambda info: {OPS.ADD: 0, OPS.MOD: 1,
421 def operation_sorter(info):
421 OPS.DEL: 2}.get(info['operation'])
422 return {OPS.ADD: 0, OPS.MOD: 1, OPS.DEL: 2}.get(info['operation'])
422
423
423 if not inline_diff:
424 if not inline_diff:
424 return diff_container(sorted(_files, key=sorter))
425 return diff_container(sorted(_files, key=operation_sorter))
425
426
426 # highlight inline changes
427 # highlight inline changes
427 for diff_data in _files:
428 for diff_data in _files:
428 for chunk in diff_data['chunks']:
429 for chunk in diff_data['chunks']:
429 lineiter = iter(chunk)
430 lineiter = iter(chunk)
430 try:
431 try:
431 while 1:
432 while 1:
432 line = next(lineiter)
433 line = next(lineiter)
433 if line['action'] not in (
434 if line['action'] not in (
434 Action.UNMODIFIED, Action.CONTEXT):
435 Action.UNMODIFIED, Action.CONTEXT):
435 nextline = next(lineiter)
436 nextline = next(lineiter)
436 if nextline['action'] in ['unmod', 'context'] or \
437 if nextline['action'] in ['unmod', 'context'] or \
437 nextline['action'] == line['action']:
438 nextline['action'] == line['action']:
438 continue
439 continue
439 self.differ(line, nextline)
440 self.differ(line, nextline)
440 except StopIteration:
441 except StopIteration:
441 pass
442 pass
442
443
443 return diff_container(sorted(_files, key=sorter))
444 return diff_container(sorted(_files, key=operation_sorter))
444
445
445 def _check_large_diff(self):
446 def _check_large_diff(self):
446 if self.diff_limit:
447 if self.diff_limit:
447 log.debug('Checking if diff exceeds current diff_limit of %s', self.diff_limit)
448 log.debug('Checking if diff exceeds current diff_limit of %s', self.diff_limit)
448 if not self.show_full_diff and (self.cur_diff_size > self.diff_limit):
449 if not self.show_full_diff and (self.cur_diff_size > self.diff_limit):
449 raise DiffLimitExceeded('Diff Limit `%s` Exceeded', self.diff_limit)
450 raise DiffLimitExceeded(f'Diff Limit `{self.diff_limit}` Exceeded')
450
451
451 # FIXME: NEWDIFFS: dan: this replaces _parse_gitdiff
452 # FIXME: NEWDIFFS: dan: this replaces _parse_gitdiff
452 def _new_parse_gitdiff(self, inline_diff=True):
453 def _new_parse_gitdiff(self, inline_diff=True):
453 _files = []
454 _files = []
454
455
455 # this can be overriden later to a LimitedDiffContainer type
456 # this can be overridden later to a LimitedDiffContainer type
456 diff_container = lambda arg: arg
457 def diff_container(arg):
458 return arg
457
459
458 for chunk in self._diff.chunks():
460 for chunk in self._diff.chunks():
459 head = chunk.header
461 head = chunk.header_as_str
460 log.debug('parsing diff %r', head)
462 log.debug('parsing diff chunk %r', chunk)
461
463
462 raw_diff = chunk.raw
464 raw_diff = chunk.raw
463 limited_diff = False
465 limited_diff = False
464 exceeds_limit = False
466 exceeds_limit = False
465
467
466 op = None
468 op = None
467 stats = {
469 stats = {
468 'added': 0,
470 'added': 0,
469 'deleted': 0,
471 'deleted': 0,
470 'binary': False,
472 'binary': False,
471 'old_mode': None,
473 'old_mode': '',
472 'new_mode': None,
474 'new_mode': '',
473 'ops': {},
475 'ops': {},
474 }
476 }
475 if head['old_mode']:
477 if head['old_mode']:
476 stats['old_mode'] = head['old_mode']
478 stats['old_mode'] = head['old_mode']
477 if head['new_mode']:
479 if head['new_mode']:
478 stats['new_mode'] = head['new_mode']
480 stats['new_mode'] = head['new_mode']
479 if head['b_mode']:
481 if head['b_mode']:
480 stats['new_mode'] = head['b_mode']
482 stats['new_mode'] = head['b_mode']
481
483
482 # delete file
484 # delete file
483 if head['deleted_file_mode']:
485 if head['deleted_file_mode']:
484 op = OPS.DEL
486 op = OPS.DEL
485 stats['binary'] = True
487 stats['binary'] = True
486 stats['ops'][DEL_FILENODE] = 'deleted file'
488 stats['ops'][DEL_FILENODE] = 'deleted file'
487
489
488 # new file
490 # new file
489 elif head['new_file_mode']:
491 elif head['new_file_mode']:
490 op = OPS.ADD
492 op = OPS.ADD
491 stats['binary'] = True
493 stats['binary'] = True
492 stats['old_mode'] = None
494 stats['old_mode'] = ''
493 stats['new_mode'] = head['new_file_mode']
495 stats['new_mode'] = head['new_file_mode']
494 stats['ops'][NEW_FILENODE] = 'new file %s' % head['new_file_mode']
496 stats['ops'][NEW_FILENODE] = f"new file {head['new_file_mode']}"
495
497
496 # modify operation, can be copy, rename or chmod
498 # modify operation, can be: copy, rename or chmod
497 else:
499 else:
498 # CHMOD
500 # CHMOD
499 if head['new_mode'] and head['old_mode']:
501 if head['new_mode'] and head['old_mode']:
500 op = OPS.MOD
502 op = OPS.MOD
501 stats['binary'] = True
503 stats['binary'] = True
502 stats['ops'][CHMOD_FILENODE] = (
504 stats['ops'][CHMOD_FILENODE] = f"modified file chmod {head['old_mode']} => {head['new_mode']}"
503 'modified file chmod %s => %s' % (
504 head['old_mode'], head['new_mode']))
505
505
506 # RENAME
506 # RENAME
507 if head['rename_from'] != head['rename_to']:
507 if head['rename_from'] != head['rename_to']:
508 op = OPS.MOD
508 op = OPS.MOD
509 stats['binary'] = True
509 stats['binary'] = True
510 stats['renamed'] = (head['rename_from'], head['rename_to'])
510 stats['renamed'] = (head['rename_from'], head['rename_to'])
511 stats['ops'][RENAMED_FILENODE] = (
511 stats['ops'][RENAMED_FILENODE] = f"file renamed from {head['rename_from']} to {head['rename_to']}"
512 'file renamed from %s to %s' % (
513 head['rename_from'], head['rename_to']))
514 # COPY
512 # COPY
515 if head.get('copy_from') and head.get('copy_to'):
513 if head.get('copy_from') and head.get('copy_to'):
516 op = OPS.MOD
514 op = OPS.MOD
517 stats['binary'] = True
515 stats['binary'] = True
518 stats['copied'] = (head['copy_from'], head['copy_to'])
516 stats['copied'] = (head['copy_from'], head['copy_to'])
519 stats['ops'][COPIED_FILENODE] = (
517 stats['ops'][COPIED_FILENODE] = f"file copied from {head['copy_from']} to {head['copy_to']}"
520 'file copied from %s to %s' % (
521 head['copy_from'], head['copy_to']))
522
518
523 # If our new parsed headers didn't match anything fallback to
519 # If our new parsed headers didn't match anything fallback to
524 # old style detection
520 # old style detection
525 if op is None:
521 if op is None:
526 if not head['a_file'] and head['b_file']:
522 if not head['a_file'] and head['b_file']:
527 op = OPS.ADD
523 op = OPS.ADD
528 stats['binary'] = True
524 stats['binary'] = True
529 stats['new_file'] = True
525 stats['new_file'] = True
530 stats['ops'][NEW_FILENODE] = 'new file'
526 stats['ops'][NEW_FILENODE] = 'new file'
531
527
532 elif head['a_file'] and not head['b_file']:
528 elif head['a_file'] and not head['b_file']:
533 op = OPS.DEL
529 op = OPS.DEL
534 stats['binary'] = True
530 stats['binary'] = True
535 stats['ops'][DEL_FILENODE] = 'deleted file'
531 stats['ops'][DEL_FILENODE] = 'deleted file'
536
532
537 # it's not ADD not DELETE
533 # it's not ADD not DELETE
538 if op is None:
534 if op is None:
539 op = OPS.MOD
535 op = OPS.MOD
540 stats['binary'] = True
536 stats['binary'] = True
541 stats['ops'][MOD_FILENODE] = 'modified file'
537 stats['ops'][MOD_FILENODE] = 'modified file'
542
538
543 # a real non-binary diff
539 # a real non-binary diff
544 if head['a_file'] or head['b_file']:
540 if head['a_file'] or head['b_file']:
545 # simulate splitlines, so we keep the line end part
541 # simulate splitlines, so we keep the line end part
546 diff = self.diff_splitter(chunk.diff)
542 diff = self.diff_splitter(chunk.diff)
547
543
548 # append each file to the diff size
544 # append each file to the diff size
549 raw_chunk_size = len(raw_diff)
545 raw_chunk_size = len(raw_diff)
550
546
551 exceeds_limit = raw_chunk_size > self.file_limit
547 exceeds_limit = raw_chunk_size > self.file_limit
552 self.cur_diff_size += raw_chunk_size
548 self.cur_diff_size += raw_chunk_size
553
549
554 try:
550 try:
555 # Check each file instead of the whole diff.
551 # Check each file instead of the whole diff.
556 # Diff will hide big files but still show small ones.
552 # Diff will hide big files but still show small ones.
557 # From the tests big files are fairly safe to be parsed
553 # From the tests big files are fairly safe to be parsed
558 # but the browser is the bottleneck.
554 # but the browser is the bottleneck.
559 if not self.show_full_diff and exceeds_limit:
555 if not self.show_full_diff and exceeds_limit:
560 log.debug('File `%s` exceeds current file_limit of %s',
556 log.debug('File `%s` exceeds current file_limit of %s',
561 safe_unicode(head['b_path']), self.file_limit)
557 head['b_path'], self.file_limit)
562 raise DiffLimitExceeded(
558 raise DiffLimitExceeded(f'File Limit {self.file_limit} Exceeded')
563 'File Limit %s Exceeded', self.file_limit)
564
559
565 self._check_large_diff()
560 self._check_large_diff()
566
561
567 raw_diff, chunks, _stats = self._new_parse_lines(diff)
562 raw_diff, chunks, _stats = self._new_parse_lines(diff)
568 stats['binary'] = False
563 stats['binary'] = False
569 stats['added'] = _stats[0]
564 stats['added'] = _stats[0]
570 stats['deleted'] = _stats[1]
565 stats['deleted'] = _stats[1]
571 # explicit mark that it's a modified file
566 # explicit mark that it's a modified file
572 if op == OPS.MOD:
567 if op == OPS.MOD:
573 stats['ops'][MOD_FILENODE] = 'modified file'
568 stats['ops'][MOD_FILENODE] = 'modified file'
574
569
575 except DiffLimitExceeded:
570 except DiffLimitExceeded:
576 diff_container = lambda _diff: \
571 def limited_diff_container(_diff):
577 LimitedDiffContainer(
572 return LimitedDiffContainer(self.diff_limit, self.cur_diff_size, _diff)
578 self.diff_limit, self.cur_diff_size, _diff)
573
574 # re-definition of our container wrapper
575 diff_container = limited_diff_container
579
576
580 limited_diff = True
577 limited_diff = True
581 chunks = []
578 chunks = []
582
579
583 else: # GIT format binary patch, or possibly empty diff
580 else: # GIT format binary patch, or possibly empty diff
584 if head['bin_patch']:
581 if head['bin_patch']:
585 # we have operation already extracted, but we mark simply
582 # we have operation already extracted, but we mark simply
586 # it's a diff we wont show for binary files
583 # it's a diff we won't show for binary files
587 stats['ops'][BIN_FILENODE] = 'binary diff hidden'
584 stats['ops'][BIN_FILENODE] = 'binary diff hidden'
588 chunks = []
585 chunks = []
589
586
590 # Hide content of deleted node by setting empty chunks
587 # Hide content of deleted node by setting empty chunks
591 if chunks and not self.show_full_diff and op == OPS.DEL:
588 if chunks and not self.show_full_diff and op == OPS.DEL:
592 # if not full diff mode show deleted file contents
589 # if not full diff mode show deleted file contents
593 # TODO: anderson: if the view is not too big, there is no way
590 # TODO: anderson: if the view is not too big, there is no way
594 # to see the content of the file
591 # to see the content of the file
595 chunks = []
592 chunks = []
596
593
597 chunks.insert(
594 frag = [
598 0, [{'old_lineno': '',
595 {'old_lineno': '',
599 'new_lineno': '',
596 'new_lineno': '',
600 'action': Action.CONTEXT,
597 'action': Action.CONTEXT,
601 'line': msg,
598 'line': msg,
602 } for _op, msg in stats['ops'].items()
599 } for _op, msg in list(stats['ops'].items())
603 if _op not in [MOD_FILENODE]])
600 if _op not in [MOD_FILENODE]]
604
601
605 original_filename = safe_unicode(head['a_path'])
602 chunks.insert(0, frag)
603
604 original_filename = safe_str(head['a_path'])
606 _files.append({
605 _files.append({
607 'original_filename': original_filename,
606 'original_filename': original_filename,
608 'filename': safe_unicode(head['b_path']),
607 'filename': safe_str(head['b_path']),
609 'old_revision': head['a_blob_id'],
608 'old_revision': head['a_blob_id'],
610 'new_revision': head['b_blob_id'],
609 'new_revision': head['b_blob_id'],
611 'chunks': chunks,
610 'chunks': chunks,
612 'raw_diff': safe_unicode(raw_diff),
611 'raw_diff': safe_str(raw_diff),
613 'operation': op,
612 'operation': op,
614 'stats': stats,
613 'stats': stats,
615 'exceeds_limit': exceeds_limit,
614 'exceeds_limit': exceeds_limit,
616 'is_limited_diff': limited_diff,
615 'is_limited_diff': limited_diff,
617 })
616 })
618
617
619 sorter = lambda info: {OPS.ADD: 0, OPS.MOD: 1,
618 def sorter(info):
620 OPS.DEL: 2}.get(info['operation'])
619 return {OPS.ADD: 0, OPS.MOD: 1, OPS.DEL: 2}.get(info['operation'])
621
622 return diff_container(sorted(_files, key=sorter))
620 return diff_container(sorted(_files, key=sorter))
623
621
624 # FIXME: NEWDIFFS: dan: this gets replaced by _new_parse_lines
622 # FIXME: NEWDIFFS: dan: this gets replaced by _new_parse_lines
625 def _parse_lines(self, diff_iter):
623 def _parse_lines(self, diff_iter):
626 """
624 """
627 Parse the diff an return data for the template.
625 Parse the diff an return data for the template.
628 """
626 """
629
627
630 stats = [0, 0]
628 stats = [0, 0]
631 chunks = []
629 chunks = []
632 raw_diff = []
630 raw_diff = []
633
631
634 try:
632 try:
635 line = next(diff_iter)
633 line = next(diff_iter)
636
634
637 while line:
635 while line:
638 raw_diff.append(line)
636 raw_diff.append(line)
639 lines = []
637 lines = []
640 chunks.append(lines)
638 chunks.append(lines)
641
639
642 match = self._chunk_re.match(line)
640 match = self._chunk_re.match(line)
643
641
644 if not match:
642 if not match:
645 break
643 break
646
644
647 gr = match.groups()
645 gr = match.groups()
648 (old_line, old_end,
646 (old_line, old_end,
649 new_line, new_end) = [int(x or 1) for x in gr[:-1]]
647 new_line, new_end) = [int(x or 1) for x in gr[:-1]]
650 old_line -= 1
648 old_line -= 1
651 new_line -= 1
649 new_line -= 1
652
650
653 context = len(gr) == 5
651 context = len(gr) == 5
654 old_end += old_line
652 old_end += old_line
655 new_end += new_line
653 new_end += new_line
656
654
657 if context:
655 if context:
658 # skip context only if it's first line
656 # skip context only if it's first line
659 if int(gr[0]) > 1:
657 if int(gr[0]) > 1:
660 lines.append({
658 lines.append({
661 'old_lineno': '...',
659 'old_lineno': '...',
662 'new_lineno': '...',
660 'new_lineno': '...',
663 'action': Action.CONTEXT,
661 'action': Action.CONTEXT,
664 'line': line,
662 'line': line,
665 })
663 })
666
664
667 line = next(diff_iter)
665 line = next(diff_iter)
668
666
669 while old_line < old_end or new_line < new_end:
667 while old_line < old_end or new_line < new_end:
670 command = ' '
668 command = b' '
671 if line:
669 if line:
672 command = line[0]
670 command = line[0]
673
671
674 affects_old = affects_new = False
672 affects_old = affects_new = False
675
673
676 # ignore those if we don't expect them
674 # ignore those if we don't expect them
677 if command in '#@':
675 if command in b'#@':
678 continue
676 continue
679 elif command == '+':
677 elif command == b'+':
680 affects_new = True
678 affects_new = True
681 action = Action.ADD
679 action = Action.ADD
682 stats[0] += 1
680 stats[0] += 1
683 elif command == '-':
681 elif command == b'-':
684 affects_old = True
682 affects_old = True
685 action = Action.DELETE
683 action = Action.DELETE
686 stats[1] += 1
684 stats[1] += 1
687 else:
685 else:
688 affects_old = affects_new = True
686 affects_old = affects_new = True
689 action = Action.UNMODIFIED
687 action = Action.UNMODIFIED
690
688
691 if not self._newline_marker.match(line):
689 if not self._newline_marker.match(line):
692 old_line += affects_old
690 old_line += affects_old
693 new_line += affects_new
691 new_line += affects_new
694 lines.append({
692 lines.append({
695 'old_lineno': affects_old and old_line or '',
693 'old_lineno': affects_old and old_line or b'',
696 'new_lineno': affects_new and new_line or '',
694 'new_lineno': affects_new and new_line or b'',
697 'action': action,
695 'action': action,
698 'line': self._clean_line(line, command)
696 'line': self._clean_line(line, command)
699 })
697 })
700 raw_diff.append(line)
698 raw_diff.append(line)
701
699
702 line = next(diff_iter)
700 line = next(diff_iter)
703
701
704 if self._newline_marker.match(line):
702 if self._newline_marker.match(line):
705 # we need to append to lines, since this is not
703 # we need to append to lines, since this is not
706 # counted in the line specs of diff
704 # counted in the line specs of diff
707 lines.append({
705 lines.append({
708 'old_lineno': '...',
706 'old_lineno': '...',
709 'new_lineno': '...',
707 'new_lineno': '...',
710 'action': Action.CONTEXT,
708 'action': Action.CONTEXT,
711 'line': self._clean_line(line, command)
709 'line': self._clean_line(line, command)
712 })
710 })
713
711
714 except StopIteration:
712 except StopIteration:
715 pass
713 pass
716 return ''.join(raw_diff), chunks, stats
714 return ''.join(raw_diff), chunks, stats
717
715
718 # FIXME: NEWDIFFS: dan: this replaces _parse_lines
716 # FIXME: NEWDIFFS: dan: this replaces _parse_lines
719 def _new_parse_lines(self, diff_iter):
717 def _new_parse_lines(self, diff_iter):
720 """
718 """
721 Parse the diff an return data for the template.
719 Parse the diff an return data for the template.
722 """
720 """
723
721
724 stats = [0, 0]
722 stats = [0, 0]
725 chunks = []
723 chunks = []
726 raw_diff = []
724 raw_diff = []
727
725
728 try:
726 try:
729 line = next(diff_iter)
727 line = next(diff_iter)
728 assert isinstance(line, bytes)
730
729
731 while line:
730 while line:
732 raw_diff.append(line)
731 raw_diff.append(line)
733 # match header e.g @@ -0,0 +1 @@\n'
732 # match header e.g @@ -0,0 +1 @@\n'
734 match = self._chunk_re.match(line)
733 match = self._chunk_re.match(line)
735
734
736 if not match:
735 if not match:
737 break
736 break
738
737
739 gr = match.groups()
738 gr = match.groups()
739
740 (old_line, old_end,
740 (old_line, old_end,
741 new_line, new_end) = [int(x or 1) for x in gr[:-1]]
741 new_line, new_end) = [int(x or 1) for x in gr[:-1]]
742
742
743 lines = []
743 lines = []
744 hunk = {
744 hunk = {
745 'section_header': gr[-1],
745 'section_header': gr[-1],
746 'source_start': old_line,
746 'source_start': old_line,
747 'source_length': old_end,
747 'source_length': old_end,
748 'target_start': new_line,
748 'target_start': new_line,
749 'target_length': new_end,
749 'target_length': new_end,
750 'lines': lines,
750 'lines': lines,
751 }
751 }
752 chunks.append(hunk)
752 chunks.append(hunk)
753
753
754 old_line -= 1
754 old_line -= 1
755 new_line -= 1
755 new_line -= 1
756
756
757 context = len(gr) == 5
757 len(gr) == 5
758 old_end += old_line
758 old_end += old_line
759 new_end += new_line
759 new_end += new_line
760
760
761 line = next(diff_iter)
761 line = next(diff_iter)
762
762
763 while old_line < old_end or new_line < new_end:
763 while old_line < old_end or new_line < new_end:
764 command = ' '
764 command = ' '
765 if line:
765 if line:
766 command = line[0]
766 # This is bytes, so we need to convert it to a str
767 command: str = chr(line[0])
767
768
768 affects_old = affects_new = False
769 affects_old = affects_new = False
769
770
770 # ignore those if we don't expect them
771 # ignore those if we don't expect them
771 if command in '#@':
772 if command in '#@':
772 continue
773 continue
773 elif command == '+':
774 elif command == '+':
774 affects_new = True
775 affects_new = True
775 action = Action.ADD
776 action = Action.ADD
776 stats[0] += 1
777 stats[0] += 1
777 elif command == '-':
778 elif command == '-':
778 affects_old = True
779 affects_old = True
779 action = Action.DELETE
780 action = Action.DELETE
780 stats[1] += 1
781 stats[1] += 1
781 else:
782 else:
782 affects_old = affects_new = True
783 affects_old = affects_new = True
783 action = Action.UNMODIFIED
784 action = Action.UNMODIFIED
784
785
785 if not self._newline_marker.match(line):
786 if not self._newline_marker.match(line):
786 old_line += affects_old
787 old_line += affects_old
787 new_line += affects_new
788 new_line += affects_new
788 lines.append({
789 lines.append({
789 'old_lineno': affects_old and old_line or '',
790 'old_lineno': affects_old and old_line or None,
790 'new_lineno': affects_new and new_line or '',
791 'new_lineno': affects_new and new_line or None,
791 'action': action,
792 'action': action,
792 'line': self._clean_line(line, command)
793 'line': self._clean_line(line, command)
793 })
794 })
794 raw_diff.append(line)
795 raw_diff.append(line)
795
796
796 line = next(diff_iter)
797 line = next(diff_iter)
797
798
798 if self._newline_marker.match(line):
799 if self._newline_marker.match(line):
799 # we need to append to lines, since this is not
800 # we need to append to lines, since this is not
800 # counted in the line specs of diff
801 # counted in the line specs of diff
801 if affects_old:
802 if affects_old:
802 action = Action.OLD_NO_NL
803 action = Action.OLD_NO_NL
803 elif affects_new:
804 elif affects_new:
804 action = Action.NEW_NO_NL
805 action = Action.NEW_NO_NL
805 else:
806 else:
806 raise Exception('invalid context for no newline')
807 raise Exception('invalid context for no newline')
807
808
808 lines.append({
809 lines.append({
809 'old_lineno': None,
810 'old_lineno': None,
810 'new_lineno': None,
811 'new_lineno': None,
811 'action': action,
812 'action': action,
812 'line': self._clean_line(line, command)
813 'line': self._clean_line(line, command)
813 })
814 })
814
815
815 except StopIteration:
816 except StopIteration:
816 pass
817 pass
817
818
818 return ''.join(raw_diff), chunks, stats
819 return b''.join(raw_diff), chunks, stats
819
820
820 def _safe_id(self, idstring):
821 def _safe_id(self, idstring):
821 """Make a string safe for including in an id attribute.
822 """Make a string safe for including in an id attribute.
822
823
823 The HTML spec says that id attributes 'must begin with
824 The HTML spec says that id attributes 'must begin with
824 a letter ([A-Za-z]) and may be followed by any number
825 a letter ([A-Za-z]) and may be followed by any number
825 of letters, digits ([0-9]), hyphens ("-"), underscores
826 of letters, digits ([0-9]), hyphens ("-"), underscores
826 ("_"), colons (":"), and periods (".")'. These regexps
827 ("_"), colons (":"), and periods (".")'. These regexps
827 are slightly over-zealous, in that they remove colons
828 are slightly over-zealous, in that they remove colons
828 and periods unnecessarily.
829 and periods unnecessarily.
829
830
830 Whitespace is transformed into underscores, and then
831 Whitespace is transformed into underscores, and then
831 anything which is not a hyphen or a character that
832 anything which is not a hyphen or a character that
832 matches \w (alphanumerics and underscore) is removed.
833 matches \w (alphanumerics and underscore) is removed.
833
834
834 """
835 """
835 # Transform all whitespace to underscore
836 # Transform all whitespace to underscore
836 idstring = re.sub(r'\s', "_", '%s' % idstring)
837 idstring = re.sub(r'\s', "_", f'{idstring}')
837 # Remove everything that is not a hyphen or a member of \w
838 # Remove everything that is not a hyphen or a member of \w
838 idstring = re.sub(r'(?!-)\W', "", idstring).lower()
839 idstring = re.sub(r'(?!-)\W', "", idstring).lower()
839 return idstring
840 return idstring
840
841
841 @classmethod
842 @classmethod
842 def diff_splitter(cls, string):
843 def diff_splitter(cls, diff_string: bytes):
843 """
844 """
844 Diff split that emulates .splitlines() but works only on \n
845 Diff split that emulates .splitlines() but works only on \n
845 """
846 """
846 if not string:
847 if not diff_string:
847 return
848 return
848 elif string == '\n':
849 elif diff_string == b'\n':
849 yield '\n'
850 yield b'\n'
850 else:
851 else:
851
852
852 has_newline = string.endswith('\n')
853 has_newline = diff_string.endswith(b'\n')
853 elements = string.split('\n')
854 elements = diff_string.split(b'\n')
854 if has_newline:
855 if has_newline:
855 # skip last element as it's empty string from newlines
856 # skip last element as it's empty string from newlines
856 elements = elements[:-1]
857 elements = elements[:-1]
857
858
858 len_elements = len(elements)
859 len_elements = len(elements)
859
860
860 for cnt, line in enumerate(elements, start=1):
861 for cnt, line in enumerate(elements, start=1):
861 last_line = cnt == len_elements
862 last_line = cnt == len_elements
862 if last_line and not has_newline:
863 if last_line and not has_newline:
863 yield safe_unicode(line)
864 yield line
864 else:
865 else:
865 yield safe_unicode(line) + '\n'
866 yield line + b'\n'
866
867
867 def prepare(self, inline_diff=True):
868 def prepare(self, inline_diff=True):
868 """
869 """
869 Prepare the passed udiff for HTML rendering.
870 Prepare the passed udiff for HTML rendering.
870
871
871 :return: A list of dicts with diff information.
872 :return: A list of dicts with diff information.
872 """
873 """
873 parsed = self._parser(inline_diff=inline_diff)
874 parsed = self._parser(inline_diff=inline_diff)
874 self.parsed = True
875 self.parsed = True
875 self.parsed_diff = parsed
876 self.parsed_diff = parsed
876 return parsed
877 return parsed
877
878
878 def as_raw(self, diff_lines=None):
879 def as_raw(self, diff_lines=None):
879 """
880 """
880 Returns raw diff as a byte string
881 Returns raw diff as a byte string
881 """
882 """
882 return self._diff.raw
883 return self._diff.raw.tobytes()
883
884 def as_html(self, table_class='code-difftable', line_class='line',
885 old_lineno_class='lineno old', new_lineno_class='lineno new',
886 code_class='code', enable_comments=False, parsed_lines=None):
887 """
888 Return given diff as html table with customized css classes
889 """
890 # TODO(marcink): not sure how to pass in translator
891 # here in an efficient way, leave the _ for proper gettext extraction
892 _ = lambda s: s
893
894 def _link_to_if(condition, label, url):
895 """
896 Generates a link if condition is meet or just the label if not.
897 """
898
899 if condition:
900 return '''<a href="%(url)s" class="tooltip"
901 title="%(title)s">%(label)s</a>''' % {
902 'title': _('Click to select line'),
903 'url': url,
904 'label': label
905 }
906 else:
907 return label
908 if not self.parsed:
909 self.prepare()
910
911 diff_lines = self.parsed_diff
912 if parsed_lines:
913 diff_lines = parsed_lines
914
915 _html_empty = True
916 _html = []
917 _html.append('''<table class="%(table_class)s">\n''' % {
918 'table_class': table_class
919 })
920
921 for diff in diff_lines:
922 for line in diff['chunks']:
923 _html_empty = False
924 for change in line:
925 _html.append('''<tr class="%(lc)s %(action)s">\n''' % {
926 'lc': line_class,
927 'action': change['action']
928 })
929 anchor_old_id = ''
930 anchor_new_id = ''
931 anchor_old = "%(filename)s_o%(oldline_no)s" % {
932 'filename': self._safe_id(diff['filename']),
933 'oldline_no': change['old_lineno']
934 }
935 anchor_new = "%(filename)s_n%(oldline_no)s" % {
936 'filename': self._safe_id(diff['filename']),
937 'oldline_no': change['new_lineno']
938 }
939 cond_old = (change['old_lineno'] != '...' and
940 change['old_lineno'])
941 cond_new = (change['new_lineno'] != '...' and
942 change['new_lineno'])
943 if cond_old:
944 anchor_old_id = 'id="%s"' % anchor_old
945 if cond_new:
946 anchor_new_id = 'id="%s"' % anchor_new
947
948 if change['action'] != Action.CONTEXT:
949 anchor_link = True
950 else:
951 anchor_link = False
952
953 ###########################################################
954 # COMMENT ICONS
955 ###########################################################
956 _html.append('''\t<td class="add-comment-line"><span class="add-comment-content">''')
957
958 if enable_comments and change['action'] != Action.CONTEXT:
959 _html.append('''<a href="#"><span class="icon-comment-add"></span></a>''')
960
961 _html.append('''</span></td><td class="comment-toggle tooltip" title="Toggle Comment Thread"><i class="icon-comment"></i></td>\n''')
962
963 ###########################################################
964 # OLD LINE NUMBER
965 ###########################################################
966 _html.append('''\t<td %(a_id)s class="%(olc)s">''' % {
967 'a_id': anchor_old_id,
968 'olc': old_lineno_class
969 })
970
971 _html.append('''%(link)s''' % {
972 'link': _link_to_if(anchor_link, change['old_lineno'],
973 '#%s' % anchor_old)
974 })
975 _html.append('''</td>\n''')
976 ###########################################################
977 # NEW LINE NUMBER
978 ###########################################################
979
980 _html.append('''\t<td %(a_id)s class="%(nlc)s">''' % {
981 'a_id': anchor_new_id,
982 'nlc': new_lineno_class
983 })
984
985 _html.append('''%(link)s''' % {
986 'link': _link_to_if(anchor_link, change['new_lineno'],
987 '#%s' % anchor_new)
988 })
989 _html.append('''</td>\n''')
990 ###########################################################
991 # CODE
992 ###########################################################
993 code_classes = [code_class]
994 if (not enable_comments or
995 change['action'] == Action.CONTEXT):
996 code_classes.append('no-comment')
997 _html.append('\t<td class="%s">' % ' '.join(code_classes))
998 _html.append('''\n\t\t<pre>%(code)s</pre>\n''' % {
999 'code': change['line']
1000 })
1001
1002 _html.append('''\t</td>''')
1003 _html.append('''\n</tr>\n''')
1004 _html.append('''</table>''')
1005 if _html_empty:
1006 return None
1007 return ''.join(_html)
1008
884
1009 def stat(self):
885 def stat(self):
1010 """
886 """
1011 Returns tuple of added, and removed lines for this instance
887 Returns tuple of added, and removed lines for this instance
1012 """
888 """
1013 return self.adds, self.removes
889 return self.adds, self.removes
1014
890
1015 def get_context_of_line(
891 def get_context_of_line(
1016 self, path, diff_line=None, context_before=3, context_after=3):
892 self, path, diff_line: DiffLineNumber = None, context_before: int = 3, context_after: int = 3):
1017 """
893 """
1018 Returns the context lines for the specified diff line.
894 Returns the context lines for the specified diff line.
1019
1020 :type diff_line: :class:`DiffLineNumber`
1021 """
895 """
1022 assert self.parsed, "DiffProcessor is not initialized."
896 assert self.parsed, "DiffProcessor is not initialized."
1023
897
1024 if None not in diff_line:
898 if None not in diff_line:
1025 raise ValueError(
899 raise ValueError(f"Cannot specify both line numbers in diff_line: {diff_line}")
1026 "Cannot specify both line numbers: {}".format(diff_line))
1027
900
1028 file_diff = self._get_file_diff(path)
901 file_diff = self._get_file_diff(path)
1029 chunk, idx = self._find_chunk_line_index(file_diff, diff_line)
902 chunk, idx = self._find_chunk_line_index(file_diff, diff_line)
1030
903
1031 first_line_to_include = max(idx - context_before, 0)
904 first_line_to_include = max(idx - context_before, 0)
1032 first_line_after_context = idx + context_after + 1
905 first_line_after_context = idx + context_after + 1
1033 context_lines = chunk[first_line_to_include:first_line_after_context]
906 context_lines = chunk['lines'][first_line_to_include:first_line_after_context]
1034
907
1035 line_contents = [
908 line_contents = [
1036 _context_line(line) for line in context_lines
909 _context_line(line) for line in context_lines
1037 if _is_diff_content(line)]
910 if _is_diff_content(line)
911 ]
912
1038 # TODO: johbo: Interim fixup, the diff chunks drop the final newline.
913 # TODO: johbo: Interim fixup, the diff chunks drop the final newline.
1039 # Once they are fixed, we can drop this line here.
914 # Once they are fixed, we can drop this line here.
1040 if line_contents:
915 if line_contents:
1041 line_contents[-1] = (
916 line_contents[-1] = (
1042 line_contents[-1][0], line_contents[-1][1].rstrip('\n') + '\n')
917 line_contents[-1][0], line_contents[-1][1].rstrip(b'\n') + b'\n')
1043 return line_contents
918 return line_contents
1044
919
1045 def find_context(self, path, context, offset=0):
920 def find_context(self, path, context, offset=0):
1046 """
921 """
1047 Finds the given `context` inside of the diff.
922 Finds the given `context` inside of the diff.
1048
923
1049 Use the parameter `offset` to specify which offset the target line has
924 Use the parameter `offset` to specify which offset the target line has
1050 inside of the given `context`. This way the correct diff line will be
925 inside of the given `context`. This way the correct diff line will be
1051 returned.
926 returned.
1052
927
1053 :param offset: Shall be used to specify the offset of the main line
928 :param offset: Shall be used to specify the offset of the main line
1054 within the given `context`.
929 within the given `context`.
1055 """
930 """
1056 if offset < 0 or offset >= len(context):
931 if offset < 0 or offset >= len(context):
1057 raise ValueError(
932 raise ValueError(
1058 "Only positive values up to the length of the context "
933 "Only positive values up to the length of the context "
1059 "minus one are allowed.")
934 "minus one are allowed.")
1060
935
1061 matches = []
936 matches = []
1062 file_diff = self._get_file_diff(path)
937 file_diff = self._get_file_diff(path)
1063
938
1064 for chunk in file_diff['chunks']:
939 for chunk in file_diff['chunks']:
940 if not isinstance(chunk, dict):
941 continue
1065 context_iter = iter(context)
942 context_iter = iter(context)
1066 for line_idx, line in enumerate(chunk):
943 for line_idx, line in enumerate(chunk['lines']):
1067 try:
944 try:
1068 if _context_line(line) == next(context_iter):
945 if _context_line(line) == next(context_iter):
1069 continue
946 continue
1070 except StopIteration:
947 except StopIteration:
1071 matches.append((line_idx, chunk))
948 matches.append((line_idx, chunk))
1072 context_iter = iter(context)
949 context_iter = iter(context)
1073
950
1074 # Increment position and triger StopIteration
951 # Increment position and triger StopIteration
1075 # if we had a match at the end
952 # if we had a match at the end
1076 line_idx += 1
953 line_idx += 1
1077 try:
954 try:
1078 next(context_iter)
955 next(context_iter)
1079 except StopIteration:
956 except StopIteration:
1080 matches.append((line_idx, chunk))
957 matches.append((line_idx, chunk))
1081
958
1082 effective_offset = len(context) - offset
959 effective_offset = len(context) - offset
1083 found_at_diff_lines = [
960 found_at_diff_lines = [
1084 _line_to_diff_line_number(chunk[idx - effective_offset])
961 _line_to_diff_line_number(chunk['lines'][idx - effective_offset])
1085 for idx, chunk in matches]
962 for idx, chunk in matches]
1086
963
1087 return found_at_diff_lines
964 return found_at_diff_lines
1088
965
1089 def _get_file_diff(self, path):
966 def _get_file_diff(self, path):
1090 for file_diff in self.parsed_diff:
967 for file_diff in self.parsed_diff:
1091 if file_diff['filename'] == path:
968 if file_diff['filename'] == path:
1092 break
969 break
1093 else:
970 else:
1094 raise FileNotInDiffException("File {} not in diff".format(path))
971 raise FileNotInDiffException(f"File {path} not in diff")
1095 return file_diff
972 return file_diff
1096
973
1097 def _find_chunk_line_index(self, file_diff, diff_line):
974 def _find_chunk_line_index(self, file_diff, diff_line):
1098 for chunk in file_diff['chunks']:
975 for chunk in file_diff['chunks']:
1099 for idx, line in enumerate(chunk):
976 if not isinstance(chunk, dict):
1100 if line['old_lineno'] == diff_line.old:
977 continue
1101 return chunk, idx
978 for line_idx, line in enumerate(chunk['lines']):
1102 if line['new_lineno'] == diff_line.new:
979 if diff_line.old and line['old_lineno'] == diff_line.old:
1103 return chunk, idx
980 return chunk, line_idx
1104 raise LineNotInDiffException(
981 if diff_line.new and line['new_lineno'] == diff_line.new:
1105 "The line {} is not part of the diff.".format(diff_line))
982 return chunk, line_idx
983 raise LineNotInDiffException(f"The line {diff_line} is not part of the diff.")
1106
984
1107
985
1108 def _is_diff_content(line):
986 def _is_diff_content(line):
1109 return line['action'] in (
987 return line['action'] in (
1110 Action.UNMODIFIED, Action.ADD, Action.DELETE)
988 Action.UNMODIFIED, Action.ADD, Action.DELETE)
1111
989
1112
990
1113 def _context_line(line):
991 def _context_line(line):
1114 return (line['action'], line['line'])
992 return line['action'], line['line']
1115
1116
1117 DiffLineNumber = collections.namedtuple('DiffLineNumber', ['old', 'new'])
1118
993
1119
994
1120 def _line_to_diff_line_number(line):
995 def _line_to_diff_line_number(line):
1121 new_line_no = line['new_lineno'] or None
996 new_line_no = line['new_lineno'] or None
1122 old_line_no = line['old_lineno'] or None
997 old_line_no = line['old_lineno'] or None
1123 return DiffLineNumber(old=old_line_no, new=new_line_no)
998 return DiffLineNumber(old=old_line_no, new=new_line_no)
1124
999
1125
1000
1126 class FileNotInDiffException(Exception):
1001 class FileNotInDiffException(Exception):
1127 """
1002 """
1128 Raised when the context for a missing file is requested.
1003 Raised when the context for a missing file is requested.
1129
1004
1130 If you request the context for a line in a file which is not part of the
1005 If you request the context for a line in a file which is not part of the
1131 given diff, then this exception is raised.
1006 given diff, then this exception is raised.
1132 """
1007 """
1133
1008
1134
1009
1135 class LineNotInDiffException(Exception):
1010 class LineNotInDiffException(Exception):
1136 """
1011 """
1137 Raised when the context for a missing line is requested.
1012 Raised when the context for a missing line is requested.
1138
1013
1139 If you request the context for a line in a file and this line is not
1014 If you request the context for a line in a file and this line is not
1140 part of the given diff, then this exception is raised.
1015 part of the given diff, then this exception is raised.
1141 """
1016 """
1142
1017
1143
1018
1144 class DiffLimitExceeded(Exception):
1019 class DiffLimitExceeded(Exception):
1145 pass
1020 pass
1146
1021
1147
1022
1148 # NOTE(marcink): if diffs.mako change, probably this
1023 # NOTE(marcink): if diffs.mako change, probably this
1149 # needs a bump to next version
1024 # needs a bump to next version
1150 CURRENT_DIFF_VERSION = 'v5'
1025 CURRENT_DIFF_VERSION = 'v5'
1151
1026
1152
1027
1153 def _cleanup_cache_file(cached_diff_file):
1028 def _cleanup_cache_file(cached_diff_file):
1154 # cleanup file to not store it "damaged"
1029 # cleanup file to not store it "damaged"
1155 try:
1030 try:
1156 os.remove(cached_diff_file)
1031 os.remove(cached_diff_file)
1157 except Exception:
1032 except Exception:
1158 log.exception('Failed to cleanup path %s', cached_diff_file)
1033 log.exception('Failed to cleanup path %s', cached_diff_file)
1159
1034
1160
1035
1161 def _get_compression_mode(cached_diff_file):
1036 def _get_compression_mode(cached_diff_file):
1162 mode = 'bz2'
1037 mode = 'bz2'
1163 if 'mode:plain' in cached_diff_file:
1038 if 'mode:plain' in cached_diff_file:
1164 mode = 'plain'
1039 mode = 'plain'
1165 elif 'mode:gzip' in cached_diff_file:
1040 elif 'mode:gzip' in cached_diff_file:
1166 mode = 'gzip'
1041 mode = 'gzip'
1167 return mode
1042 return mode
1168
1043
1169
1044
def cache_diff(cached_diff_file, diff, commits):
    """Pickle ``diff`` and ``commits`` into ``cached_diff_file``.

    The payload is wrapped in a dict tagged with ``CURRENT_DIFF_VERSION``
    so stale caches can be detected on load. The on-disk compression
    (plain / gzip / bz2) is selected from markers embedded in the file
    name. On any write failure the partial file is removed so a damaged
    cache is never left behind; errors are logged, never raised.
    """
    compression_mode = _get_compression_mode(cached_diff_file)

    payload = {
        'version': CURRENT_DIFF_VERSION,
        'diff': diff,
        'commits': commits
    }

    # map each mode onto the matching file opener; bz2 is the fallback,
    # mirroring _get_compression_mode's default
    openers = {
        'plain': open,
        'gzip': gzip.GzipFile,
    }
    opener = openers.get(compression_mode, bz2.BZ2File)

    start = time.time()
    try:
        with opener(cached_diff_file, 'wb') as f:
            pickle.dump(payload, f)
    except Exception:
        log.warning('Failed to save cache', exc_info=True)
        # do not keep a half-written (damaged) cache file around
        _cleanup_cache_file(cached_diff_file)

    log.debug('Saved diff cache under %s in %.4fs', cached_diff_file, time.time() - start)
1195
1070
1196
1071
def load_cached_diff(cached_diff_file):
    """Read a previously cached diff structure from disk.

    Returns a dict with ``version``, ``diff`` and ``commits`` keys. When
    the cache file is missing, unreadable, of an unexpected shape, or was
    written under a different ``CURRENT_DIFF_VERSION``, a default
    structure with ``None`` payloads is returned instead (and a stale
    file is purged). Never raises.
    """
    compression_mode = _get_compression_mode(cached_diff_file)

    default_struct = {
        'version': CURRENT_DIFF_VERSION,
        'diff': None,
        'commits': None
    }

    if not os.path.isfile(cached_diff_file):
        log.debug('Reading diff cache file failed %s', cached_diff_file)
        return default_struct

    # mirror cache_diff's opener selection; bz2 is the fallback
    openers = {
        'plain': open,
        'gzip': gzip.GzipFile,
    }
    opener = openers.get(compression_mode, bz2.BZ2File)

    data = None
    start = time.time()
    try:
        # NOTE(review): pickle.load of a file written by cache_diff —
        # assumes the cache directory holds trusted, locally produced data
        with opener(cached_diff_file, 'rb') as f:
            data = pickle.load(f)
    except Exception:
        log.warning('Failed to read diff cache file', exc_info=True)

    # fall back to defaults on empty payloads or old (non-dict) formats
    if not data or not isinstance(data, dict):
        data = default_struct

    if data.get('version') != CURRENT_DIFF_VERSION:
        # stale cache format: purge it and report defaults
        _cleanup_cache_file(cached_diff_file)
        return default_struct

    log.debug('Loaded diff cache from %s in %.4fs', cached_diff_file, time.time() - start)

    return data
1243
1118
1244
1119
def generate_diff_cache_key(*args):
    """
    Helper to generate a cache key using arguments.

    Each argument is stringified, has ``/`` replaced with ``_`` (so a key
    can never address a subdirectory), and all parts are joined with
    ``_``. Empty-string arguments are mapped to ``None`` so they stay
    visible in the generated key.
    """
    def arg_mapper(input_param):
        input_param = safe_str(input_param)
        # we cannot allow '/' in arguments since it would allow
        # subdirectory usage. BUGFIX: str.replace returns a new string;
        # the previous code discarded the result, so '/' was never
        # actually stripped from the key.
        input_param = input_param.replace('/', '_')
        return input_param or None  # prevent empty string arguments

    return '_'.join('{}' for _ in args).format(*map(arg_mapper, args))
1258
1133
1259
1134
def diff_cache_exist(cache_storage, *args):
    """
    Based on all generated arguments check and return a cache path.

    Builds a cache key from ``args`` (forcing ``mode:gzip``) and joins it
    under ``cache_storage``.

    :raises ValueError: if the resolved path would escape
        ``cache_storage`` (path traversal protection).
    """
    args = list(args) + ['mode:gzip']
    cache_key = generate_diff_cache_key(*args)
    cache_file_path = os.path.join(cache_storage, cache_key)

    # prevent path traversal attacks using some param that have e.g '../../'
    # SECURITY FIX: compare against the absolute storage root with a
    # trailing separator, so a sibling directory such as
    # '<storage>-evil' can no longer satisfy a bare startswith() check.
    storage_root = os.path.join(os.path.abspath(cache_storage), '')
    if not os.path.abspath(cache_file_path).startswith(storage_root):
        raise ValueError(f'Final path must be within {cache_storage}')

    return cache_file_path
General Comments 0
You need to be logged in to leave comments. Login now