diffs: use custom lexer extraction in diffs so it behaves consistently with...
marcink - r1591:9abd8b35 default
@@ -1,703 +1,707 b''
1 # -*- coding: utf-8 -*-
1 # -*- coding: utf-8 -*-
2
2
3 # Copyright (C) 2011-2017 RhodeCode GmbH
3 # Copyright (C) 2011-2017 RhodeCode GmbH
4 #
4 #
5 # This program is free software: you can redistribute it and/or modify
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License, version 3
6 # it under the terms of the GNU Affero General Public License, version 3
7 # (only), as published by the Free Software Foundation.
7 # (only), as published by the Free Software Foundation.
8 #
8 #
9 # This program is distributed in the hope that it will be useful,
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
12 # GNU General Public License for more details.
13 #
13 #
14 # You should have received a copy of the GNU Affero General Public License
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 #
16 #
17 # This program is dual-licensed. If you wish to learn more about the
17 # This program is dual-licensed. If you wish to learn more about the
18 # RhodeCode Enterprise Edition, including its added features, Support services,
18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20
20
21 import logging
21 import logging
22 import difflib
22 import difflib
23 from itertools import groupby
23 from itertools import groupby
24
24
25 from pygments import lex
25 from pygments import lex
26 from pygments.formatters.html import _get_ttype_class as pygment_token_class
26 from pygments.formatters.html import _get_ttype_class as pygment_token_class
27 from rhodecode.lib.helpers import (
27 from rhodecode.lib.helpers import (
28 get_lexer_for_filenode, html_escape)
28 get_lexer_for_filenode, html_escape, get_custom_lexer)
29 from rhodecode.lib.utils2 import AttributeDict
29 from rhodecode.lib.utils2 import AttributeDict
30 from rhodecode.lib.vcs.nodes import FileNode
30 from rhodecode.lib.vcs.nodes import FileNode
31 from rhodecode.lib.diff_match_patch import diff_match_patch
31 from rhodecode.lib.diff_match_patch import diff_match_patch
32 from rhodecode.lib.diffs import LimitedDiffContainer
32 from rhodecode.lib.diffs import LimitedDiffContainer
33 from pygments.lexers import get_lexer_by_name
33 from pygments.lexers import get_lexer_by_name
34
34
35 plain_text_lexer = get_lexer_by_name(
35 plain_text_lexer = get_lexer_by_name(
36 'text', stripall=False, stripnl=False, ensurenl=False)
36 'text', stripall=False, stripnl=False, ensurenl=False)
37
37
38
38
39 log = logging.getLogger()
39 log = logging.getLogger()
40
40
41
41
42 def filenode_as_lines_tokens(filenode, lexer=None):
42 def filenode_as_lines_tokens(filenode, lexer=None):
43 org_lexer = lexer
43 org_lexer = lexer
44 lexer = lexer or get_lexer_for_filenode(filenode)
44 lexer = lexer or get_lexer_for_filenode(filenode)
45 log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s',
45 log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s',
46 lexer, filenode, org_lexer)
46 lexer, filenode, org_lexer)
47 tokens = tokenize_string(filenode.content, lexer)
47 tokens = tokenize_string(filenode.content, lexer)
48 lines = split_token_stream(tokens, split_string='\n')
48 lines = split_token_stream(tokens, split_string='\n')
49 rv = list(lines)
49 rv = list(lines)
50 return rv
50 return rv
51
51
52
52
53 def tokenize_string(content, lexer):
53 def tokenize_string(content, lexer):
54 """
54 """
55 Use pygments to tokenize some content based on a lexer
55 Use pygments to tokenize some content based on a lexer
56 ensuring all original new lines and whitespace is preserved
56 ensuring all original new lines and whitespace is preserved
57 """
57 """
58
58
59 lexer.stripall = False
59 lexer.stripall = False
60 lexer.stripnl = False
60 lexer.stripnl = False
61 lexer.ensurenl = False
61 lexer.ensurenl = False
62 for token_type, token_text in lex(content, lexer):
62 for token_type, token_text in lex(content, lexer):
63 yield pygment_token_class(token_type), token_text
63 yield pygment_token_class(token_type), token_text
64
64
65
65
66 def split_token_stream(tokens, split_string=u'\n'):
66 def split_token_stream(tokens, split_string=u'\n'):
67 """
67 """
68 Take a list of (TokenType, text) tuples and split them by a string
68 Take a list of (TokenType, text) tuples and split them by a string
69
69
70 >>> split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')])
70 >>> split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')])
71 [[(TEXT, 'some')], [(TEXT, 'text'), (TEXT, 'more')],
71 [[(TEXT, 'some')], [(TEXT, 'text'), (TEXT, 'more')],
72 [(TEXT, '')]]
72 [(TEXT, '')]]
73 """
73 """
74
74
75 buffer = []
75 buffer = []
76 for token_class, token_text in tokens:
76 for token_class, token_text in tokens:
77 parts = token_text.split(split_string)
77 parts = token_text.split(split_string)
78 for part in parts[:-1]:
78 for part in parts[:-1]:
79 buffer.append((token_class, part))
79 buffer.append((token_class, part))
80 yield buffer
80 yield buffer
81 buffer = []
81 buffer = []
82
82
83 buffer.append((token_class, parts[-1]))
83 buffer.append((token_class, parts[-1]))
84
84
85 if buffer:
85 if buffer:
86 yield buffer
86 yield buffer
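A standalone sketch (not part of the diff) of how tokenize_string and split_token_stream compose: lex a small snippet with pygments and regroup the flat token stream into one token list per source line, which is the shape filenode_as_lines_tokens returns. It uses only the pygments helpers this module already imports.

from pygments import lex
from pygments.formatters.html import _get_ttype_class
from pygments.lexers import get_lexer_by_name

def demo_lines_tokens(content, lexer_name='python'):
    lexer = get_lexer_by_name(
        lexer_name, stripall=False, stripnl=False, ensurenl=False)
    tokens = ((_get_ttype_class(ttype), text)
              for ttype, text in lex(content, lexer))
    lines, buf = [], []
    for token_class, token_text in tokens:
        parts = token_text.split(u'\n')
        for part in parts[:-1]:
            buf.append((token_class, part))
            lines.append(buf)   # one finished source line
            buf = []
        buf.append((token_class, parts[-1]))
    if buf:
        lines.append(buf)
    return lines

# demo_lines_tokens(u'x = 1\ny = 2\n') returns one (css_class, text) token
# list per source line of the input.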
87
87
88
88
89 def filenode_as_annotated_lines_tokens(filenode):
89 def filenode_as_annotated_lines_tokens(filenode):
90 """
90 """
91 Take a file node and return a list of annotation => lines pairs; if no
91 Take a file node and return a list of annotation => lines pairs; if no
92 annotation is found, it will be None.
92 annotation is found, it will be None.
93
93
94 eg:
94 eg:
95
95
96 [
96 [
97 (annotation1, [
97 (annotation1, [
98 (1, line1_tokens_list),
98 (1, line1_tokens_list),
99 (2, line2_tokens_list),
99 (2, line2_tokens_list),
100 ]),
100 ]),
101 (annotation2, [
101 (annotation2, [
102 (3, line1_tokens_list),
102 (3, line1_tokens_list),
103 ]),
103 ]),
104 (None, [
104 (None, [
105 (4, line1_tokens_list),
105 (4, line1_tokens_list),
106 ]),
106 ]),
107 (annotation1, [
107 (annotation1, [
108 (5, line1_tokens_list),
108 (5, line1_tokens_list),
109 (6, line2_tokens_list),
109 (6, line2_tokens_list),
110 ])
110 ])
111 ]
111 ]
112 """
112 """
113
113
114 commit_cache = {} # cache commit_getter lookups
114 commit_cache = {} # cache commit_getter lookups
115
115
116 def _get_annotation(commit_id, commit_getter):
116 def _get_annotation(commit_id, commit_getter):
117 if commit_id not in commit_cache:
117 if commit_id not in commit_cache:
118 commit_cache[commit_id] = commit_getter()
118 commit_cache[commit_id] = commit_getter()
119 return commit_cache[commit_id]
119 return commit_cache[commit_id]
120
120
121 annotation_lookup = {
121 annotation_lookup = {
122 line_no: _get_annotation(commit_id, commit_getter)
122 line_no: _get_annotation(commit_id, commit_getter)
123 for line_no, commit_id, commit_getter, line_content
123 for line_no, commit_id, commit_getter, line_content
124 in filenode.annotate
124 in filenode.annotate
125 }
125 }
126
126
127 annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens)
127 annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens)
128 for line_no, tokens
128 for line_no, tokens
129 in enumerate(filenode_as_lines_tokens(filenode), 1))
129 in enumerate(filenode_as_lines_tokens(filenode), 1))
130
130
131 grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0])
131 grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0])
132
132
133 for annotation, group in grouped_annotations_lines:
133 for annotation, group in grouped_annotations_lines:
134 yield (
134 yield (
135 annotation, [(line_no, tokens)
135 annotation, [(line_no, tokens)
136 for (_, line_no, tokens) in group]
136 for (_, line_no, tokens) in group]
137 )
137 )
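A sketch of the grouping step above on literal data (the tuples stand in for what filenode.annotate and the tokenizer would yield): consecutive lines sharing an annotation collapse into a single (annotation, [(line_no, tokens), ...]) group.

from itertools import groupby

annotated = [('commit1', 1, ['line1-tokens']),
             ('commit1', 2, ['line2-tokens']),
             (None,      3, ['line3-tokens']),
             ('commit1', 4, ['line4-tokens'])]

grouped = [(annotation, [(line_no, tokens) for _, line_no, tokens in group])
           for annotation, group in groupby(annotated, lambda x: x[0])]

# grouped == [('commit1', [(1, ['line1-tokens']), (2, ['line2-tokens'])]),
#             (None,      [(3, ['line3-tokens'])]),
#             ('commit1', [(4, ['line4-tokens'])])]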
138
138
139
139
140 def render_tokenstream(tokenstream):
140 def render_tokenstream(tokenstream):
141 result = []
141 result = []
142 for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
142 for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
143
143
144 if token_class:
144 if token_class:
145 result.append(u'<span class="%s">' % token_class)
145 result.append(u'<span class="%s">' % token_class)
146 else:
146 else:
147 result.append(u'<span>')
147 result.append(u'<span>')
148
148
149 for op_tag, token_text in token_ops_texts:
149 for op_tag, token_text in token_ops_texts:
150
150
151 if op_tag:
151 if op_tag:
152 result.append(u'<%s>' % op_tag)
152 result.append(u'<%s>' % op_tag)
153
153
154 escaped_text = html_escape(token_text)
154 escaped_text = html_escape(token_text)
155
155
156 # TODO: dan: investigate showing hidden characters like space/nl/tab
156 # TODO: dan: investigate showing hidden characters like space/nl/tab
157 # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
157 # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
158 # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
158 # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
159 # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')
159 # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')
160
160
161 result.append(escaped_text)
161 result.append(escaped_text)
162
162
163 if op_tag:
163 if op_tag:
164 result.append(u'</%s>' % op_tag)
164 result.append(u'</%s>' % op_tag)
165
165
166 result.append(u'</span>')
166 result.append(u'</span>')
167
167
168 html = ''.join(result)
168 html = ''.join(result)
169 return html
169 return html
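For reference, two tiny hand-traced examples of what the renderer above should produce; they are derived by reading the code, so treat them as illustrative rather than authoritative.

# render_tokenstream([('k', '', 'def'), ('k', 'ins', ' foo')])
# -> u'<span class="k">def<ins> foo</ins></span>'

# render_tokenstream([('k', 'def'), ('nf', ' foo')])  # 2-tuples: op defaults to ''
# -> u'<span class="k">def</span><span class="nf"> foo</span>'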
170
170
171
171
172 def rollup_tokenstream(tokenstream):
172 def rollup_tokenstream(tokenstream):
173 """
173 """
174 Group a token stream of the format:
174 Group a token stream of the format:
175
175
176 ('class', 'op', 'text')
176 ('class', 'op', 'text')
177 or
177 or
178 ('class', 'text')
178 ('class', 'text')
179
179
180 into
180 into
181
181
182 [('class1',
182 [('class1',
183 [('op1', 'text'),
183 [('op1', 'text'),
184 ('op2', 'text')]),
184 ('op2', 'text')]),
185 ('class2',
185 ('class2',
186 [('op3', 'text')])]
186 [('op3', 'text')])]
187
187
188 This is used to get the minimal tags necessary when
188 This is used to get the minimal tags necessary when
189 rendering to html, e.g. for a token stream it produces:
189 rendering to html, e.g. for a token stream it produces:
190
190
191 <span class="A"><ins>he</ins>llo</span>
191 <span class="A"><ins>he</ins>llo</span>
192 vs
192 vs
193 <span class="A"><ins>he</ins></span><span class="A">llo</span>
193 <span class="A"><ins>he</ins></span><span class="A">llo</span>
194
194
195 If a 2 tuple is passed in, the output op will be an empty string.
195 If a 2 tuple is passed in, the output op will be an empty string.
196
196
197 eg:
197 eg:
198
198
199 >>> rollup_tokenstream([('classA', '', 'h'),
199 >>> rollup_tokenstream([('classA', '', 'h'),
200 ('classA', 'del', 'ell'),
200 ('classA', 'del', 'ell'),
201 ('classA', '', 'o'),
201 ('classA', '', 'o'),
202 ('classB', '', ' '),
202 ('classB', '', ' '),
203 ('classA', '', 'the'),
203 ('classA', '', 'the'),
204 ('classA', '', 're'),
204 ('classA', '', 're'),
205 ])
205 ])
206
206
207 [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')]),
207 [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')]),
208 ('classB', [('', ' ')]),
208 ('classB', [('', ' ')]),
209 ('classA', [('', 'there')])]
209 ('classA', [('', 'there')])]
210
210
211 """
211 """
212 if tokenstream and len(tokenstream[0]) == 2:
212 if tokenstream and len(tokenstream[0]) == 2:
213 tokenstream = ((t[0], '', t[1]) for t in tokenstream)
213 tokenstream = ((t[0], '', t[1]) for t in tokenstream)
214
214
215 result = []
215 result = []
216 for token_class, op_list in groupby(tokenstream, lambda t: t[0]):
216 for token_class, op_list in groupby(tokenstream, lambda t: t[0]):
217 ops = []
217 ops = []
218 for token_op, token_text_list in groupby(op_list, lambda o: o[1]):
218 for token_op, token_text_list in groupby(op_list, lambda o: o[1]):
219 text_buffer = []
219 text_buffer = []
220 for t_class, t_op, t_text in token_text_list:
220 for t_class, t_op, t_text in token_text_list:
221 text_buffer.append(t_text)
221 text_buffer.append(t_text)
222 ops.append((token_op, ''.join(text_buffer)))
222 ops.append((token_op, ''.join(text_buffer)))
223 result.append((token_class, ops))
223 result.append((token_class, ops))
224 return result
224 return result
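The same rollup can be reproduced with two nested groupby calls on literal tuples, which may make the loop above easier to follow (data is illustrative).

from itertools import groupby

stream = [('classA', '', 'h'), ('classA', 'del', 'ell'), ('classA', '', 'o'),
          ('classB', '', ' '), ('classA', '', 'the'), ('classA', '', 're')]

rolled = []
for token_class, by_class in groupby(stream, lambda t: t[0]):
    ops = [(op, ''.join(t[2] for t in chunk))
           for op, chunk in groupby(by_class, lambda t: t[1])]
    rolled.append((token_class, ops))

# rolled == [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')]),
#            ('classB', [('', ' ')]),
#            ('classA', [('', 'there')])]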
225
225
226
226
227 def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
227 def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
228 """
228 """
229 Converts a list of (token_class, token_text) tuples to a list of
229 Converts a list of (token_class, token_text) tuples to a list of
230 (token_class, token_op, token_text) tuples where token_op is one of
230 (token_class, token_op, token_text) tuples where token_op is one of
231 ('ins', 'del', '')
231 ('ins', 'del', '')
232
232
233 :param old_tokens: list of (token_class, token_text) tuples of old line
233 :param old_tokens: list of (token_class, token_text) tuples of old line
234 :param new_tokens: list of (token_class, token_text) tuples of new line
234 :param new_tokens: list of (token_class, token_text) tuples of new line
235 :param use_diff_match_patch: boolean, will use google's diff match patch
235 :param use_diff_match_patch: boolean, will use google's diff match patch
236 library which has options to 'smooth' out the character by character
236 library which has options to 'smooth' out the character by character
237 differences making nicer ins/del blocks
237 differences making nicer ins/del blocks
238 """
238 """
239
239
240 old_tokens_result = []
240 old_tokens_result = []
241 new_tokens_result = []
241 new_tokens_result = []
242
242
243 similarity = difflib.SequenceMatcher(None,
243 similarity = difflib.SequenceMatcher(None,
244 ''.join(token_text for token_class, token_text in old_tokens),
244 ''.join(token_text for token_class, token_text in old_tokens),
245 ''.join(token_text for token_class, token_text in new_tokens)
245 ''.join(token_text for token_class, token_text in new_tokens)
246 ).ratio()
246 ).ratio()
247
247
248 if similarity < 0.6: # return, the blocks are too different
248 if similarity < 0.6: # return, the blocks are too different
249 for token_class, token_text in old_tokens:
249 for token_class, token_text in old_tokens:
250 old_tokens_result.append((token_class, '', token_text))
250 old_tokens_result.append((token_class, '', token_text))
251 for token_class, token_text in new_tokens:
251 for token_class, token_text in new_tokens:
252 new_tokens_result.append((token_class, '', token_text))
252 new_tokens_result.append((token_class, '', token_text))
253 return old_tokens_result, new_tokens_result, similarity
253 return old_tokens_result, new_tokens_result, similarity
254
254
255 token_sequence_matcher = difflib.SequenceMatcher(None,
255 token_sequence_matcher = difflib.SequenceMatcher(None,
256 [x[1] for x in old_tokens],
256 [x[1] for x in old_tokens],
257 [x[1] for x in new_tokens])
257 [x[1] for x in new_tokens])
258
258
259 for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
259 for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
260 # check the differences by token block types first to give a
260 # check the differences by token block types first to give a
261 # nicer "block" level replacement vs character diffs
261 # nicer "block" level replacement vs character diffs
262
262
263 if tag == 'equal':
263 if tag == 'equal':
264 for token_class, token_text in old_tokens[o1:o2]:
264 for token_class, token_text in old_tokens[o1:o2]:
265 old_tokens_result.append((token_class, '', token_text))
265 old_tokens_result.append((token_class, '', token_text))
266 for token_class, token_text in new_tokens[n1:n2]:
266 for token_class, token_text in new_tokens[n1:n2]:
267 new_tokens_result.append((token_class, '', token_text))
267 new_tokens_result.append((token_class, '', token_text))
268 elif tag == 'delete':
268 elif tag == 'delete':
269 for token_class, token_text in old_tokens[o1:o2]:
269 for token_class, token_text in old_tokens[o1:o2]:
270 old_tokens_result.append((token_class, 'del', token_text))
270 old_tokens_result.append((token_class, 'del', token_text))
271 elif tag == 'insert':
271 elif tag == 'insert':
272 for token_class, token_text in new_tokens[n1:n2]:
272 for token_class, token_text in new_tokens[n1:n2]:
273 new_tokens_result.append((token_class, 'ins', token_text))
273 new_tokens_result.append((token_class, 'ins', token_text))
274 elif tag == 'replace':
274 elif tag == 'replace':
275 # if same type token blocks must be replaced, do a diff on the
275 # if same type token blocks must be replaced, do a diff on the
276 # characters in the token blocks to show individual changes
276 # characters in the token blocks to show individual changes
277
277
278 old_char_tokens = []
278 old_char_tokens = []
279 new_char_tokens = []
279 new_char_tokens = []
280 for token_class, token_text in old_tokens[o1:o2]:
280 for token_class, token_text in old_tokens[o1:o2]:
281 for char in token_text:
281 for char in token_text:
282 old_char_tokens.append((token_class, char))
282 old_char_tokens.append((token_class, char))
283
283
284 for token_class, token_text in new_tokens[n1:n2]:
284 for token_class, token_text in new_tokens[n1:n2]:
285 for char in token_text:
285 for char in token_text:
286 new_char_tokens.append((token_class, char))
286 new_char_tokens.append((token_class, char))
287
287
288 old_string = ''.join([token_text for
288 old_string = ''.join([token_text for
289 token_class, token_text in old_char_tokens])
289 token_class, token_text in old_char_tokens])
290 new_string = ''.join([token_text for
290 new_string = ''.join([token_text for
291 token_class, token_text in new_char_tokens])
291 token_class, token_text in new_char_tokens])
292
292
293 char_sequence = difflib.SequenceMatcher(
293 char_sequence = difflib.SequenceMatcher(
294 None, old_string, new_string)
294 None, old_string, new_string)
295 copcodes = char_sequence.get_opcodes()
295 copcodes = char_sequence.get_opcodes()
296 obuffer, nbuffer = [], []
296 obuffer, nbuffer = [], []
297
297
298 if use_diff_match_patch:
298 if use_diff_match_patch:
299 dmp = diff_match_patch()
299 dmp = diff_match_patch()
300 dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting
300 dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting
301 reps = dmp.diff_main(old_string, new_string)
301 reps = dmp.diff_main(old_string, new_string)
302 dmp.diff_cleanupEfficiency(reps)
302 dmp.diff_cleanupEfficiency(reps)
303
303
304 a, b = 0, 0
304 a, b = 0, 0
305 for op, rep in reps:
305 for op, rep in reps:
306 l = len(rep)
306 l = len(rep)
307 if op == 0:
307 if op == 0:
308 for i, c in enumerate(rep):
308 for i, c in enumerate(rep):
309 obuffer.append((old_char_tokens[a+i][0], '', c))
309 obuffer.append((old_char_tokens[a+i][0], '', c))
310 nbuffer.append((new_char_tokens[b+i][0], '', c))
310 nbuffer.append((new_char_tokens[b+i][0], '', c))
311 a += l
311 a += l
312 b += l
312 b += l
313 elif op == -1:
313 elif op == -1:
314 for i, c in enumerate(rep):
314 for i, c in enumerate(rep):
315 obuffer.append((old_char_tokens[a+i][0], 'del', c))
315 obuffer.append((old_char_tokens[a+i][0], 'del', c))
316 a += l
316 a += l
317 elif op == 1:
317 elif op == 1:
318 for i, c in enumerate(rep):
318 for i, c in enumerate(rep):
319 nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
319 nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
320 b += l
320 b += l
321 else:
321 else:
322 for ctag, co1, co2, cn1, cn2 in copcodes:
322 for ctag, co1, co2, cn1, cn2 in copcodes:
323 if ctag == 'equal':
323 if ctag == 'equal':
324 for token_class, token_text in old_char_tokens[co1:co2]:
324 for token_class, token_text in old_char_tokens[co1:co2]:
325 obuffer.append((token_class, '', token_text))
325 obuffer.append((token_class, '', token_text))
326 for token_class, token_text in new_char_tokens[cn1:cn2]:
326 for token_class, token_text in new_char_tokens[cn1:cn2]:
327 nbuffer.append((token_class, '', token_text))
327 nbuffer.append((token_class, '', token_text))
328 elif ctag == 'delete':
328 elif ctag == 'delete':
329 for token_class, token_text in old_char_tokens[co1:co2]:
329 for token_class, token_text in old_char_tokens[co1:co2]:
330 obuffer.append((token_class, 'del', token_text))
330 obuffer.append((token_class, 'del', token_text))
331 elif ctag == 'insert':
331 elif ctag == 'insert':
332 for token_class, token_text in new_char_tokens[cn1:cn2]:
332 for token_class, token_text in new_char_tokens[cn1:cn2]:
333 nbuffer.append((token_class, 'ins', token_text))
333 nbuffer.append((token_class, 'ins', token_text))
334 elif ctag == 'replace':
334 elif ctag == 'replace':
335 for token_class, token_text in old_char_tokens[co1:co2]:
335 for token_class, token_text in old_char_tokens[co1:co2]:
336 obuffer.append((token_class, 'del', token_text))
336 obuffer.append((token_class, 'del', token_text))
337 for token_class, token_text in new_char_tokens[cn1:cn2]:
337 for token_class, token_text in new_char_tokens[cn1:cn2]:
338 nbuffer.append((token_class, 'ins', token_text))
338 nbuffer.append((token_class, 'ins', token_text))
339
339
340 old_tokens_result.extend(obuffer)
340 old_tokens_result.extend(obuffer)
341 new_tokens_result.extend(nbuffer)
341 new_tokens_result.extend(nbuffer)
342
342
343 return old_tokens_result, new_tokens_result, similarity
343 return old_tokens_result, new_tokens_result, similarity
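A reduced, standalone sketch of the opcode handling above: it keeps the 'equal'/'delete'/'insert' branches but, unlike the real function, treats 'replace' as whole-token del/ins instead of recursing into the per-character diff (token values are made up).

import difflib

old = [('k', 'def'), ('nf', 'foo'), ('p', '():')]
new = [('k', 'def'), ('nf', 'bar'), ('p', '():')]

matcher = difflib.SequenceMatcher(None, [t[1] for t in old], [t[1] for t in new])
old_out, new_out = [], []
for tag, o1, o2, n1, n2 in matcher.get_opcodes():
    if tag == 'equal':
        old_out.extend((cls, '', text) for cls, text in old[o1:o2])
        new_out.extend((cls, '', text) for cls, text in new[n1:n2])
    if tag in ('delete', 'replace'):
        old_out.extend((cls, 'del', text) for cls, text in old[o1:o2])
    if tag in ('insert', 'replace'):
        new_out.extend((cls, 'ins', text) for cls, text in new[n1:n2])

# old_out == [('k', '', 'def'), ('nf', 'del', 'foo'), ('p', '', '():')]
# new_out == [('k', '', 'def'), ('nf', 'ins', 'bar'), ('p', '', '():')]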
344
344
345
345
346 class DiffSet(object):
346 class DiffSet(object):
347 """
347 """
348 An object for parsing the diff result from diffs.DiffProcessor and
348 An object for parsing the diff result from diffs.DiffProcessor and
349 adding highlighting, side by side/unified renderings and line diffs
349 adding highlighting, side by side/unified renderings and line diffs
350 """
350 """
351
351
352 HL_REAL = 'REAL' # highlights using original file, slow
352 HL_REAL = 'REAL' # highlights using original file, slow
353 HL_FAST = 'FAST' # highlights using just the line, fast but not correct
353 HL_FAST = 'FAST' # highlights using just the line, fast but not correct
354 # in the case of multiline code
354 # in the case of multiline code
355 HL_NONE = 'NONE' # no highlighting, fastest
355 HL_NONE = 'NONE' # no highlighting, fastest
356
356
357 def __init__(self, highlight_mode=HL_REAL, repo_name=None,
357 def __init__(self, highlight_mode=HL_REAL, repo_name=None,
358 source_repo_name=None,
358 source_repo_name=None,
359 source_node_getter=lambda filename: None,
359 source_node_getter=lambda filename: None,
360 target_node_getter=lambda filename: None,
360 target_node_getter=lambda filename: None,
361 source_nodes=None, target_nodes=None,
361 source_nodes=None, target_nodes=None,
362 max_file_size_limit=150 * 1024, # files over this size will
362 max_file_size_limit=150 * 1024, # files over this size will
363 # use fast highlighting
363 # use fast highlighting
364 comments=None,
364 comments=None,
365 ):
365 ):
366
366
367 self.highlight_mode = highlight_mode
367 self.highlight_mode = highlight_mode
368 self.highlighted_filenodes = {}
368 self.highlighted_filenodes = {}
369 self.source_node_getter = source_node_getter
369 self.source_node_getter = source_node_getter
370 self.target_node_getter = target_node_getter
370 self.target_node_getter = target_node_getter
371 self.source_nodes = source_nodes or {}
371 self.source_nodes = source_nodes or {}
372 self.target_nodes = target_nodes or {}
372 self.target_nodes = target_nodes or {}
373 self.repo_name = repo_name
373 self.repo_name = repo_name
374 self.source_repo_name = source_repo_name or repo_name
374 self.source_repo_name = source_repo_name or repo_name
375 self.comments = comments or {}
375 self.comments = comments or {}
376 self.comments_store = self.comments.copy()
376 self.comments_store = self.comments.copy()
377 self.max_file_size_limit = max_file_size_limit
377 self.max_file_size_limit = max_file_size_limit
378
378
379 def render_patchset(self, patchset, source_ref=None, target_ref=None):
379 def render_patchset(self, patchset, source_ref=None, target_ref=None):
380 diffset = AttributeDict(dict(
380 diffset = AttributeDict(dict(
381 lines_added=0,
381 lines_added=0,
382 lines_deleted=0,
382 lines_deleted=0,
383 changed_files=0,
383 changed_files=0,
384 files=[],
384 files=[],
385 file_stats={},
385 file_stats={},
386 limited_diff=isinstance(patchset, LimitedDiffContainer),
386 limited_diff=isinstance(patchset, LimitedDiffContainer),
387 repo_name=self.repo_name,
387 repo_name=self.repo_name,
388 source_repo_name=self.source_repo_name,
388 source_repo_name=self.source_repo_name,
389 source_ref=source_ref,
389 source_ref=source_ref,
390 target_ref=target_ref,
390 target_ref=target_ref,
391 ))
391 ))
392 for patch in patchset:
392 for patch in patchset:
393 diffset.file_stats[patch['filename']] = patch['stats']
393 diffset.file_stats[patch['filename']] = patch['stats']
394 filediff = self.render_patch(patch)
394 filediff = self.render_patch(patch)
395 filediff.diffset = diffset
395 filediff.diffset = diffset
396 diffset.files.append(filediff)
396 diffset.files.append(filediff)
397 diffset.changed_files += 1
397 diffset.changed_files += 1
398 if not patch['stats']['binary']:
398 if not patch['stats']['binary']:
399 diffset.lines_added += patch['stats']['added']
399 diffset.lines_added += patch['stats']['added']
400 diffset.lines_deleted += patch['stats']['deleted']
400 diffset.lines_deleted += patch['stats']['deleted']
401
401
402 return diffset
402 return diffset
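The bookkeeping in render_patchset boils down to a simple accumulation over the patches; a sketch with made-up stats dictionaries of the same shape.

patchset = [
    {'filename': 'setup.py', 'stats': {'binary': False, 'added': 3, 'deleted': 1}},
    {'filename': 'logo.png', 'stats': {'binary': True, 'added': 0, 'deleted': 0}},
]

changed_files = len(patchset)
lines_added = sum(p['stats']['added'] for p in patchset
                  if not p['stats']['binary'])
lines_deleted = sum(p['stats']['deleted'] for p in patchset
                    if not p['stats']['binary'])

# changed_files == 2, lines_added == 3, lines_deleted == 1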
403
403
404 _lexer_cache = {}
404 _lexer_cache = {}
405 def _get_lexer_for_filename(self, filename, filenode=None):
405 def _get_lexer_for_filename(self, filename, filenode=None):
406 # cached because we might need to call it twice for source/target
406 # cached because we might need to call it twice for source/target
407 if filename not in self._lexer_cache:
407 if filename not in self._lexer_cache:
408 if filenode:
408 if filenode:
409 lexer = filenode.lexer
409 lexer = filenode.lexer
410 extension = filenode.extension
410 else:
411 else:
411 lexer = FileNode.get_lexer(filename=filename)
412 lexer = FileNode.get_lexer(filename=filename)
413 extension = filename.split('.')[-1]
414
415 lexer = get_custom_lexer(extension) or lexer
412 self._lexer_cache[filename] = lexer
416 self._lexer_cache[filename] = lexer
413 return self._lexer_cache[filename]
417 return self._lexer_cache[filename]
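This method carries the actual change of the commit: the lexer derived from the filenode or filename can now be overridden per file extension. get_custom_lexer lives in rhodecode.lib.helpers and presumably resolves user-configured extension-to-lexer mappings; a hypothetical standalone equivalent of the fallback looks like this.

from pygments.lexers import get_lexer_by_name

# illustrative mapping only - the real one comes from RhodeCode configuration
_CUSTOM_EXTENSION_MAP = {'tt': 'html', 'xyz': 'text'}

def custom_lexer_or(extension, fallback_lexer):
    lexer_name = _CUSTOM_EXTENSION_MAP.get(extension)
    if lexer_name:
        return get_lexer_by_name(lexer_name)
    # same precedence as `lexer = get_custom_lexer(extension) or lexer`
    return fallback_lexer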
414
418
415 def render_patch(self, patch):
419 def render_patch(self, patch):
416 log.debug('rendering diff for %r' % patch['filename'])
420 log.debug('rendering diff for %r' % patch['filename'])
417
421
418 source_filename = patch['original_filename']
422 source_filename = patch['original_filename']
419 target_filename = patch['filename']
423 target_filename = patch['filename']
420
424
421 source_lexer = plain_text_lexer
425 source_lexer = plain_text_lexer
422 target_lexer = plain_text_lexer
426 target_lexer = plain_text_lexer
423
427
424 if not patch['stats']['binary']:
428 if not patch['stats']['binary']:
425 if self.highlight_mode == self.HL_REAL:
429 if self.highlight_mode == self.HL_REAL:
426 if (source_filename and patch['operation'] in ('D', 'M')
430 if (source_filename and patch['operation'] in ('D', 'M')
427 and source_filename not in self.source_nodes):
431 and source_filename not in self.source_nodes):
428 self.source_nodes[source_filename] = (
432 self.source_nodes[source_filename] = (
429 self.source_node_getter(source_filename))
433 self.source_node_getter(source_filename))
430
434
431 if (target_filename and patch['operation'] in ('A', 'M')
435 if (target_filename and patch['operation'] in ('A', 'M')
432 and target_filename not in self.target_nodes):
436 and target_filename not in self.target_nodes):
433 self.target_nodes[target_filename] = (
437 self.target_nodes[target_filename] = (
434 self.target_node_getter(target_filename))
438 self.target_node_getter(target_filename))
435
439
436 elif self.highlight_mode == self.HL_FAST:
440 elif self.highlight_mode == self.HL_FAST:
437 source_lexer = self._get_lexer_for_filename(source_filename)
441 source_lexer = self._get_lexer_for_filename(source_filename)
438 target_lexer = self._get_lexer_for_filename(target_filename)
442 target_lexer = self._get_lexer_for_filename(target_filename)
439
443
440 source_file = self.source_nodes.get(source_filename, source_filename)
444 source_file = self.source_nodes.get(source_filename, source_filename)
441 target_file = self.target_nodes.get(target_filename, target_filename)
445 target_file = self.target_nodes.get(target_filename, target_filename)
442
446
443 source_filenode, target_filenode = None, None
447 source_filenode, target_filenode = None, None
444
448
445 # TODO: dan: FileNode.lexer works on the content of the file - which
449 # TODO: dan: FileNode.lexer works on the content of the file - which
446 # can be slow - issue #4289 explains a lexer clean up - which once
450 # can be slow - issue #4289 explains a lexer clean up - which once
447 # done can allow caching a lexer for a filenode to avoid the file lookup
451 # done can allow caching a lexer for a filenode to avoid the file lookup
448 if isinstance(source_file, FileNode):
452 if isinstance(source_file, FileNode):
449 source_filenode = source_file
453 source_filenode = source_file
450 #source_lexer = source_file.lexer
454 #source_lexer = source_file.lexer
451 source_lexer = self._get_lexer_for_filename(source_filename)
455 source_lexer = self._get_lexer_for_filename(source_filename)
452 source_file.lexer = source_lexer
456 source_file.lexer = source_lexer
453
457
454 if isinstance(target_file, FileNode):
458 if isinstance(target_file, FileNode):
455 target_filenode = target_file
459 target_filenode = target_file
456 #target_lexer = target_file.lexer
460 #target_lexer = target_file.lexer
457 target_lexer = self._get_lexer_for_filename(target_filename)
461 target_lexer = self._get_lexer_for_filename(target_filename)
458 target_file.lexer = target_lexer
462 target_file.lexer = target_lexer
459
463
460 source_file_path, target_file_path = None, None
464 source_file_path, target_file_path = None, None
461
465
462 if source_filename != '/dev/null':
466 if source_filename != '/dev/null':
463 source_file_path = source_filename
467 source_file_path = source_filename
464 if target_filename != '/dev/null':
468 if target_filename != '/dev/null':
465 target_file_path = target_filename
469 target_file_path = target_filename
466
470
467 source_file_type = source_lexer.name
471 source_file_type = source_lexer.name
468 target_file_type = target_lexer.name
472 target_file_type = target_lexer.name
469
473
470 op_hunks = patch['chunks'][0]
474 op_hunks = patch['chunks'][0]
471 hunks = patch['chunks'][1:]
475 hunks = patch['chunks'][1:]
472
476
473 filediff = AttributeDict({
477 filediff = AttributeDict({
474 'source_file_path': source_file_path,
478 'source_file_path': source_file_path,
475 'target_file_path': target_file_path,
479 'target_file_path': target_file_path,
476 'source_filenode': source_filenode,
480 'source_filenode': source_filenode,
477 'target_filenode': target_filenode,
481 'target_filenode': target_filenode,
478 'hunks': [],
482 'hunks': [],
479 'source_file_type': target_file_type,
483 'source_file_type': target_file_type,
480 'target_file_type': source_file_type,
484 'target_file_type': source_file_type,
481 'patch': patch,
485 'patch': patch,
482 'source_mode': patch['stats']['old_mode'],
486 'source_mode': patch['stats']['old_mode'],
483 'target_mode': patch['stats']['new_mode'],
487 'target_mode': patch['stats']['new_mode'],
484 'limited_diff': isinstance(patch, LimitedDiffContainer),
488 'limited_diff': isinstance(patch, LimitedDiffContainer),
485 'diffset': self,
489 'diffset': self,
486 })
490 })
487
491
488 for hunk in hunks:
492 for hunk in hunks:
489 hunkbit = self.parse_hunk(hunk, source_file, target_file)
493 hunkbit = self.parse_hunk(hunk, source_file, target_file)
490 hunkbit.filediff = filediff
494 hunkbit.filediff = filediff
491 filediff.hunks.append(hunkbit)
495 filediff.hunks.append(hunkbit)
492
496
493 left_comments = {}
497 left_comments = {}
494
498
495 if source_file_path in self.comments_store:
499 if source_file_path in self.comments_store:
496 for lineno, comments in self.comments_store[source_file_path].items():
500 for lineno, comments in self.comments_store[source_file_path].items():
497 left_comments[lineno] = comments
501 left_comments[lineno] = comments
498
502
499 if target_file_path in self.comments_store:
503 if target_file_path in self.comments_store:
500 for lineno, comments in self.comments_store[target_file_path].items():
504 for lineno, comments in self.comments_store[target_file_path].items():
501 left_comments[lineno] = comments
505 left_comments[lineno] = comments
502
506
503 filediff.left_comments = left_comments
507 filediff.left_comments = left_comments
504 return filediff
508 return filediff
505
509
506 def parse_hunk(self, hunk, source_file, target_file):
510 def parse_hunk(self, hunk, source_file, target_file):
507 result = AttributeDict(dict(
511 result = AttributeDict(dict(
508 source_start=hunk['source_start'],
512 source_start=hunk['source_start'],
509 source_length=hunk['source_length'],
513 source_length=hunk['source_length'],
510 target_start=hunk['target_start'],
514 target_start=hunk['target_start'],
511 target_length=hunk['target_length'],
515 target_length=hunk['target_length'],
512 section_header=hunk['section_header'],
516 section_header=hunk['section_header'],
513 lines=[],
517 lines=[],
514 ))
518 ))
515 before, after = [], []
519 before, after = [], []
516
520
517 for line in hunk['lines']:
521 for line in hunk['lines']:
518 if line['action'] == 'unmod':
522 if line['action'] == 'unmod':
519 result.lines.extend(
523 result.lines.extend(
520 self.parse_lines(before, after, source_file, target_file))
524 self.parse_lines(before, after, source_file, target_file))
521 after.append(line)
525 after.append(line)
522 before.append(line)
526 before.append(line)
523 elif line['action'] == 'add':
527 elif line['action'] == 'add':
524 after.append(line)
528 after.append(line)
525 elif line['action'] == 'del':
529 elif line['action'] == 'del':
526 before.append(line)
530 before.append(line)
527 elif line['action'] == 'old-no-nl':
531 elif line['action'] == 'old-no-nl':
528 before.append(line)
532 before.append(line)
529 elif line['action'] == 'new-no-nl':
533 elif line['action'] == 'new-no-nl':
530 after.append(line)
534 after.append(line)
531
535
532 result.lines.extend(
536 result.lines.extend(
533 self.parse_lines(before, after, source_file, target_file))
537 self.parse_lines(before, after, source_file, target_file))
534 result.unified = self.as_unified(result.lines)
538 result.unified = self.as_unified(result.lines)
535 result.sideside = result.lines
539 result.sideside = result.lines
536
540
537 return result
541 return result
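A sketch of the buffering above with a made-up hunk: 'del'/'add' lines collect into the before/after buffers and are paired up (padding the shorter side with None) whenever an unmodified line closes the block, which is what keeps the side-by-side view aligned.

hunk_lines = [
    {'action': 'unmod', 'line': 'a'},
    {'action': 'del',   'line': 'b'},
    {'action': 'add',   'line': 'B'},
    {'action': 'add',   'line': 'C'},
    {'action': 'unmod', 'line': 'd'},
]

def flush(before, after, rows):
    # pair pending removals/additions, padding the shorter side with None
    while before or after:
        rows.append((before.pop(0) if before else None,
                     after.pop(0) if after else None))

rows, before, after = [], [], []
for line in hunk_lines:
    if line['action'] == 'unmod':
        flush(before, after, rows)
        before.append(line)
        after.append(line)
    elif line['action'] == 'del':
        before.append(line)
    elif line['action'] == 'add':
        after.append(line)
flush(before, after, rows)

# rows pairs 'b' with 'B', leaves 'C' paired with None,
# and keeps the unmodified 'a'/'d' lines aligned with themselves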
538
542
539 def parse_lines(self, before_lines, after_lines, source_file, target_file):
543 def parse_lines(self, before_lines, after_lines, source_file, target_file):
540 # TODO: dan: investigate doing the diff comparison and fast highlighting
544 # TODO: dan: investigate doing the diff comparison and fast highlighting
541 # on the entire before and after buffered block lines rather than by
545 # on the entire before and after buffered block lines rather than by
542 # line, this means we can get better 'fast' highlighting if the context
546 # line, this means we can get better 'fast' highlighting if the context
543 # allows it - eg.
547 # allows it - eg.
544 # line 4: """
548 # line 4: """
545 # line 5: this gets highlighted as a string
549 # line 5: this gets highlighted as a string
546 # line 6: """
550 # line 6: """
547
551
548 lines = []
552 lines = []
549 while before_lines or after_lines:
553 while before_lines or after_lines:
550 before, after = None, None
554 before, after = None, None
551 before_tokens, after_tokens = None, None
555 before_tokens, after_tokens = None, None
552
556
553 if before_lines:
557 if before_lines:
554 before = before_lines.pop(0)
558 before = before_lines.pop(0)
555 if after_lines:
559 if after_lines:
556 after = after_lines.pop(0)
560 after = after_lines.pop(0)
557
561
558 original = AttributeDict()
562 original = AttributeDict()
559 modified = AttributeDict()
563 modified = AttributeDict()
560
564
561 if before:
565 if before:
562 if before['action'] == 'old-no-nl':
566 if before['action'] == 'old-no-nl':
563 before_tokens = [('nonl', before['line'])]
567 before_tokens = [('nonl', before['line'])]
564 else:
568 else:
565 before_tokens = self.get_line_tokens(
569 before_tokens = self.get_line_tokens(
566 line_text=before['line'], line_number=before['old_lineno'],
570 line_text=before['line'], line_number=before['old_lineno'],
567 file=source_file)
571 file=source_file)
568 original.lineno = before['old_lineno']
572 original.lineno = before['old_lineno']
569 original.content = before['line']
573 original.content = before['line']
570 original.action = self.action_to_op(before['action'])
574 original.action = self.action_to_op(before['action'])
571 original.comments = self.get_comments_for('old',
575 original.comments = self.get_comments_for('old',
572 source_file, before['old_lineno'])
576 source_file, before['old_lineno'])
573
577
574 if after:
578 if after:
575 if after['action'] == 'new-no-nl':
579 if after['action'] == 'new-no-nl':
576 after_tokens = [('nonl', after['line'])]
580 after_tokens = [('nonl', after['line'])]
577 else:
581 else:
578 after_tokens = self.get_line_tokens(
582 after_tokens = self.get_line_tokens(
579 line_text=after['line'], line_number=after['new_lineno'],
583 line_text=after['line'], line_number=after['new_lineno'],
580 file=target_file)
584 file=target_file)
581 modified.lineno = after['new_lineno']
585 modified.lineno = after['new_lineno']
582 modified.content = after['line']
586 modified.content = after['line']
583 modified.action = self.action_to_op(after['action'])
587 modified.action = self.action_to_op(after['action'])
584 modified.comments = self.get_comments_for('new',
588 modified.comments = self.get_comments_for('new',
585 target_file, after['new_lineno'])
589 target_file, after['new_lineno'])
586
590
587 # diff the lines
591 # diff the lines
588 if before_tokens and after_tokens:
592 if before_tokens and after_tokens:
589 o_tokens, m_tokens, similarity = tokens_diff(
593 o_tokens, m_tokens, similarity = tokens_diff(
590 before_tokens, after_tokens)
594 before_tokens, after_tokens)
591 original.content = render_tokenstream(o_tokens)
595 original.content = render_tokenstream(o_tokens)
592 modified.content = render_tokenstream(m_tokens)
596 modified.content = render_tokenstream(m_tokens)
593 elif before_tokens:
597 elif before_tokens:
594 original.content = render_tokenstream(
598 original.content = render_tokenstream(
595 [(x[0], '', x[1]) for x in before_tokens])
599 [(x[0], '', x[1]) for x in before_tokens])
596 elif after_tokens:
600 elif after_tokens:
597 modified.content = render_tokenstream(
601 modified.content = render_tokenstream(
598 [(x[0], '', x[1]) for x in after_tokens])
602 [(x[0], '', x[1]) for x in after_tokens])
599
603
600 lines.append(AttributeDict({
604 lines.append(AttributeDict({
601 'original': original,
605 'original': original,
602 'modified': modified,
606 'modified': modified,
603 }))
607 }))
604
608
605 return lines
609 return lines
606
610
607 def get_comments_for(self, version, file, line_number):
611 def get_comments_for(self, version, file, line_number):
608 if hasattr(file, 'unicode_path'):
612 if hasattr(file, 'unicode_path'):
609 file = file.unicode_path
613 file = file.unicode_path
610
614
611 if not isinstance(file, basestring):
615 if not isinstance(file, basestring):
612 return None
616 return None
613
617
614 line_key = {
618 line_key = {
615 'old': 'o',
619 'old': 'o',
616 'new': 'n',
620 'new': 'n',
617 }[version] + str(line_number)
621 }[version] + str(line_number)
618
622
619 if file in self.comments_store:
623 if file in self.comments_store:
620 file_comments = self.comments_store[file]
624 file_comments = self.comments_store[file]
621 if line_key in file_comments:
625 if line_key in file_comments:
622 return file_comments.pop(line_key)
626 return file_comments.pop(line_key)
623
627
624 def get_line_tokens(self, line_text, line_number, file=None):
628 def get_line_tokens(self, line_text, line_number, file=None):
625 filenode = None
629 filenode = None
626 filename = None
630 filename = None
627
631
628 if isinstance(file, basestring):
632 if isinstance(file, basestring):
629 filename = file
633 filename = file
630 elif isinstance(file, FileNode):
634 elif isinstance(file, FileNode):
631 filenode = file
635 filenode = file
632 filename = file.unicode_path
636 filename = file.unicode_path
633
637
634 if self.highlight_mode == self.HL_REAL and filenode:
638 if self.highlight_mode == self.HL_REAL and filenode:
635 lexer = self._get_lexer_for_filename(filename)
639 lexer = self._get_lexer_for_filename(filename)
636 file_size_allowed = file.size < self.max_file_size_limit
640 file_size_allowed = file.size < self.max_file_size_limit
637 if line_number and file_size_allowed:
641 if line_number and file_size_allowed:
638 return self.get_tokenized_filenode_line(
642 return self.get_tokenized_filenode_line(
639 file, line_number, lexer)
643 file, line_number, lexer)
640
644
641 if self.highlight_mode in (self.HL_REAL, self.HL_FAST) and filename:
645 if self.highlight_mode in (self.HL_REAL, self.HL_FAST) and filename:
642 lexer = self._get_lexer_for_filename(filename)
646 lexer = self._get_lexer_for_filename(filename)
643 return list(tokenize_string(line_text, lexer))
647 return list(tokenize_string(line_text, lexer))
644
648
645 return list(tokenize_string(line_text, plain_text_lexer))
649 return list(tokenize_string(line_text, plain_text_lexer))
646
650
647 def get_tokenized_filenode_line(self, filenode, line_number, lexer=None):
651 def get_tokenized_filenode_line(self, filenode, line_number, lexer=None):
648
652
649 if filenode not in self.highlighted_filenodes:
653 if filenode not in self.highlighted_filenodes:
650 tokenized_lines = filenode_as_lines_tokens(filenode, lexer)
654 tokenized_lines = filenode_as_lines_tokens(filenode, lexer)
651 self.highlighted_filenodes[filenode] = tokenized_lines
655 self.highlighted_filenodes[filenode] = tokenized_lines
652 return self.highlighted_filenodes[filenode][line_number - 1]
656 return self.highlighted_filenodes[filenode][line_number - 1]
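A sketch of the HL_REAL caching idea used above, with a plain dict standing in for highlighted_filenodes and a path string instead of a FileNode (names are illustrative, not the real API); it reuses the module-level tokenize_string/split_token_stream helpers.

class LineTokenCache(object):
    def __init__(self):
        self._cache = {}

    def line_tokens(self, path, content, lexer, line_number):
        if path not in self._cache:
            # the expensive step happens once per file: lex it all, split by line
            self._cache[path] = list(
                split_token_stream(tokenize_string(content, lexer)))
        return self._cache[path][line_number - 1]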
653
657
654 def action_to_op(self, action):
658 def action_to_op(self, action):
655 return {
659 return {
656 'add': '+',
660 'add': '+',
657 'del': '-',
661 'del': '-',
658 'unmod': ' ',
662 'unmod': ' ',
659 'old-no-nl': ' ',
663 'old-no-nl': ' ',
660 'new-no-nl': ' ',
664 'new-no-nl': ' ',
661 }.get(action, action)
665 }.get(action, action)
662
666
663 def as_unified(self, lines):
667 def as_unified(self, lines):
664 """
668 """
665 Return a generator that yields the lines of a diff in unified order
669 Return a generator that yields the lines of a diff in unified order
666 """
670 """
667 def generator():
671 def generator():
668 buf = []
672 buf = []
669 for line in lines:
673 for line in lines:
670
674
671 if buf and not line.original or line.original.action == ' ':
675 if buf and not line.original or line.original.action == ' ':
672 for b in buf:
676 for b in buf:
673 yield b
677 yield b
674 buf = []
678 buf = []
675
679
676 if line.original:
680 if line.original:
677 if line.original.action == ' ':
681 if line.original.action == ' ':
678 yield (line.original.lineno, line.modified.lineno,
682 yield (line.original.lineno, line.modified.lineno,
679 line.original.action, line.original.content,
683 line.original.action, line.original.content,
680 line.original.comments)
684 line.original.comments)
681 continue
685 continue
682
686
683 if line.original.action == '-':
687 if line.original.action == '-':
684 yield (line.original.lineno, None,
688 yield (line.original.lineno, None,
685 line.original.action, line.original.content,
689 line.original.action, line.original.content,
686 line.original.comments)
690 line.original.comments)
687
691
688 if line.modified.action == '+':
692 if line.modified.action == '+':
689 buf.append((
693 buf.append((
690 None, line.modified.lineno,
694 None, line.modified.lineno,
691 line.modified.action, line.modified.content,
695 line.modified.action, line.modified.content,
692 line.modified.comments))
696 line.modified.comments))
693 continue
697 continue
694
698
695 if line.modified:
699 if line.modified:
696 yield (None, line.modified.lineno,
700 yield (None, line.modified.lineno,
697 line.modified.action, line.modified.content,
701 line.modified.action, line.modified.content,
698 line.modified.comments)
702 line.modified.comments)
699
703
700 for b in buf:
704 for b in buf:
701 yield b
705 yield b
702
706
703 return generator()
707 return generator()
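A sketch of the ordering the generator above produces: removals are yielded as soon as they are seen, additions are buffered and flushed when the changed block ends, so a modified block reads as all '-' lines followed by all '+' lines, the conventional unified layout (the actions below are illustrative stand-ins for the line objects).

rows = [(' ', ' '), ('-', '+'), ('-', '+'), (' ', ' ')]

out, buf = [], []
for old_action, new_action in rows:
    if old_action == ' ':
        out.extend(buf)          # flush buffered additions
        buf = []
        out.append(old_action)   # then emit the context line
        continue
    out.append(old_action)       # '-' goes out immediately
    buf.append(new_action)       # '+' waits for the end of the block
out.extend(buf)

# out == [' ', '-', '-', '+', '+', ' ']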