##// END OF EJS Templates
diffs: switched lexer extraction to use single function in all places.
marcink -
r1358:f0122102 default
parent child Browse files
Show More
@@ -1,701 +1,701 b''
1 # -*- coding: utf-8 -*-
1 # -*- coding: utf-8 -*-
2
2
3 # Copyright (C) 2011-2017 RhodeCode GmbH
3 # Copyright (C) 2011-2017 RhodeCode GmbH
4 #
4 #
5 # This program is free software: you can redistribute it and/or modify
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License, version 3
6 # it under the terms of the GNU Affero General Public License, version 3
7 # (only), as published by the Free Software Foundation.
7 # (only), as published by the Free Software Foundation.
8 #
8 #
9 # This program is distributed in the hope that it will be useful,
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
12 # GNU General Public License for more details.
13 #
13 #
14 # You should have received a copy of the GNU Affero General Public License
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 #
16 #
17 # This program is dual-licensed. If you wish to learn more about the
17 # This program is dual-licensed. If you wish to learn more about the
18 # RhodeCode Enterprise Edition, including its added features, Support services,
18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20
20
21 import logging
21 import logging
22 import difflib
22 import difflib
23 from itertools import groupby
23 from itertools import groupby
24
24
25 from pygments import lex
25 from pygments import lex
26 from pygments.formatters.html import _get_ttype_class as pygment_token_class
26 from pygments.formatters.html import _get_ttype_class as pygment_token_class
27 from rhodecode.lib.helpers import (
27 from rhodecode.lib.helpers import (
28 get_lexer_for_filenode, get_lexer_safe, html_escape)
28 get_lexer_for_filenode, html_escape)
29 from rhodecode.lib.utils2 import AttributeDict
29 from rhodecode.lib.utils2 import AttributeDict
30 from rhodecode.lib.vcs.nodes import FileNode
30 from rhodecode.lib.vcs.nodes import FileNode
31 from rhodecode.lib.diff_match_patch import diff_match_patch
31 from rhodecode.lib.diff_match_patch import diff_match_patch
32 from rhodecode.lib.diffs import LimitedDiffContainer
32 from rhodecode.lib.diffs import LimitedDiffContainer
33 from pygments.lexers import get_lexer_by_name
33 from pygments.lexers import get_lexer_by_name
34
34
# Fallback lexer used when no highlighting lexer applies: the pygments 'text'
# lexer with stripall/stripnl/ensurenl all switched off.
plain_text_lexer = get_lexer_by_name(
    'text', stripall=False, stripnl=False, ensurenl=False)


# Use a module-named logger rather than the root logger, so records carry this
# module's name and follow the logger hierarchy configuration.
log = logging.getLogger(__name__)
40
40
41
41
def filenode_as_lines_tokens(filenode, lexer=None):
    """
    Tokenize the content of a file node and return the result line by line.

    :param filenode: node whose ``content`` attribute is tokenized
    :param lexer: optional lexer to use; when it is not given (or falsy) the
        lexer is looked up with ``get_lexer_for_filenode``
    :return: list with one entry per line, each entry being the list of
        tokens produced by ``split_token_stream`` for that line
    """
    requested_lexer = lexer
    if not lexer:
        lexer = get_lexer_for_filenode(filenode)
    log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s',
              lexer, filenode, requested_lexer)
    token_stream = tokenize_string(filenode.content, lexer)
    return list(split_token_stream(token_stream, split_string='\n'))
51
51
52
52
def tokenize_string(content, lexer):
    """
    Lazily tokenize ``content`` with pygments using the given lexer.

    The ``stripall``, ``stripnl`` and ``ensurenl`` options are switched off
    on the passed lexer object itself, so that original new lines and
    whitespace are preserved.

    :param content: text to tokenize
    :param lexer: pygments lexer instance, modified in place
    :return: generator of (token_class, token_text) tuples
    """
    for option in ('stripall', 'stripnl', 'ensurenl'):
        setattr(lexer, option, False)

    for ttype, text in lex(content, lexer):
        yield pygment_token_class(ttype), text
64
64
65
65
def split_token_stream(tokens, split_string=u'\n'):
    """
    Take an iterable of (token_class, text) tuples and regroup it into lines.

    Every occurrence of ``split_string`` inside a token text ends the current
    line; the split string itself is dropped. A token spanning several lines
    is cut into one piece per line, each piece keeping the token class.

    >>> list(split_token_stream([('t', 'some\\ntext'), ('t', 'more\\n')]))
    [[('t', 'some')], [('t', 'text'), ('t', 'more')], [('t', '')]]

    :param tokens: iterable of (token_class, token_text) tuples
    :param split_string: separator that terminates a line
    :return: generator yielding one list of (token_class, text) tuples per
        line; an empty input yields nothing
    """
    line_tokens = []
    for token_class, token_text in tokens:
        parts = token_text.split(split_string)
        # every part but the last one is terminated by the separator and
        # therefore completes the line collected so far
        for part in parts[:-1]:
            line_tokens.append((token_class, part))
            yield line_tokens
            line_tokens = []

        # the remainder (possibly an empty string) starts the next line
        line_tokens.append((token_class, parts[-1]))

    if line_tokens:
        yield line_tokens
87
87
88
88
def filenode_as_annotated_lines_tokens(filenode):
    """
    Yield the tokenized lines of a file node grouped by their annotation.

    Consecutive lines sharing the same annotation are emitted together; a
    line without an entry in ``filenode.annotate`` gets ``None`` as its
    annotation.

    eg:

    [
        (annotation1, [
            (1, line1_tokens_list),
            (2, line2_tokens_list),
        ]),
        (annotation2, [
            (3, line1_tokens_list),
        ]),
        (None, [
            (4, line1_tokens_list),
        ]),
        (annotation1, [
            (5, line1_tokens_list),
            (6, line2_tokens_list),
        ])
    ]
    """

    # commit_getter results keyed by commit id, so each getter is called
    # at most once per commit
    resolved_commits = {}

    annotation_by_line = {}
    for line_no, commit_id, commit_getter, _content in filenode.annotate:
        if commit_id not in resolved_commits:
            resolved_commits[commit_id] = commit_getter()
        annotation_by_line[line_no] = resolved_commits[commit_id]

    numbered_lines = enumerate(filenode_as_lines_tokens(filenode), 1)
    annotated_lines = (
        (annotation_by_line.get(line_no), line_no, tokens)
        for line_no, tokens in numbered_lines)

    for annotation, members in groupby(annotated_lines, lambda x: x[0]):
        yield (
            annotation,
            [(line_no, tokens) for (_, line_no, tokens) in members],
        )
138
138
139
139
def render_tokenstream(tokenstream):
    """
    Render a token stream to an html string.

    The stream is first grouped with ``rollup_tokenstream``. Each token class
    group becomes one ``<span>`` (carrying a ``class`` attribute when the
    token class is truthy). Inside it every text is html escaped and, when it
    has an op tag, wrapped in an element named after that tag.

    :param tokenstream: token stream accepted by ``rollup_tokenstream``
    :return: html string
    """
    html_parts = []
    for token_class, ops_and_texts in rollup_tokenstream(tokenstream):
        if token_class:
            opening = u'<span class="%s">' % token_class
        else:
            opening = u'<span>'
        html_parts.append(opening)

        for op_tag, token_text in ops_and_texts:
            # TODO: dan: investigate showing hidden characters like
            # space/nl/tab
            escaped_text = html_escape(token_text)
            if op_tag:
                html_parts.extend(
                    (u'<%s>' % op_tag, escaped_text, u'</%s>' % op_tag))
            else:
                html_parts.append(escaped_text)

        html_parts.append(u'</span>')

    return ''.join(html_parts)
170
170
171
171
def rollup_tokenstream(tokenstream):
    """
    Group a token stream of the format:

        ('class', 'op', 'text')
    or
        ('class', 'text')

    into

        [('class1',
            [('op1', 'text'),
             ('op2', 'text')]),
         ('class2',
            [('op3', 'text')])]

    Consecutive tokens of the same class are merged into one entry, and
    inside it the texts of consecutive tokens with the same op are joined.
    This is used to get the minimal tags necessary when rendering to html,
    eg for a token stream ie.

        <span class="A"><ins>he</ins>llo</span>
    vs
        <span class="A"><ins>he</ins></span><span class="A">llo</span>

    If a 2 tuple is passed in, the output op will be an empty string.

    eg:

    >>> rollup_tokenstream([('A', '', 'h'), ('A', 'del', 'i'), ('B', '', '!')])
    [('A', [('', 'h'), ('del', 'i')]), ('B', [('', '!')])]

    :param tokenstream: any iterable (list, generator, ...) of 2 or 3 tuples
    :return: list of (token_class, [(op, text), ...]) tuples
    """

    def _as_triple(token):
        # normalise each token on its own, so the stream never needs to be
        # indexed and works for one-shot iterators as well
        if len(token) == 2:
            return token[0], '', token[1]
        return token

    normalized = (_as_triple(token) for token in tokenstream)

    result = []
    for token_class, class_group in groupby(normalized, lambda t: t[0]):
        ops = [
            (token_op, ''.join(token[2] for token in op_group))
            for token_op, op_group in groupby(class_group, lambda t: t[1])
        ]
        result.append((token_class, ops))
    return result
225
225
226
226
def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
    """
    Converts a list of (token_class, token_text) tuples to a list of
    (token_class, token_op, token_text) tuples where token_op is one of
    ('ins', 'del', '')

    The lines are first compared as whole strings; when their similarity
    ratio is below 0.6 no ops are marked at all. Otherwise the token texts
    are diffed, and token blocks reported as replaced are diffed again
    character by character.

    :param old_tokens: list of (token_class, token_text) tuples of old line
    :param new_tokens: list of (token_class, token_text) tuples of new line
    :param use_diff_match_patch: boolean, will use google's diff match patch
        library which has options to 'smooth' out the character by character
        differences making nicer ins/del blocks
    :return: tuple of (old_tokens_result, new_tokens_result, similarity)
    """

    similarity = difflib.SequenceMatcher(
        None,
        ''.join(token_text for _, token_text in old_tokens),
        ''.join(token_text for _, token_text in new_tokens)
    ).ratio()

    if similarity < 0.6:  # return, the blocks are too different
        return (
            [(token_class, '', text) for token_class, text in old_tokens],
            [(token_class, '', text) for token_class, text in new_tokens],
            similarity,
        )

    old_tokens_result = []
    new_tokens_result = []

    token_sequence_matcher = difflib.SequenceMatcher(
        None,
        [x[1] for x in old_tokens],
        [x[1] for x in new_tokens])

    for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
        # check the differences by token block types first to give a more
        # nicer "block" level replacement vs character diffs
        old_block = old_tokens[o1:o2]
        new_block = new_tokens[n1:n2]

        if tag == 'equal':
            old_tokens_result.extend(
                (token_class, '', text) for token_class, text in old_block)
            new_tokens_result.extend(
                (token_class, '', text) for token_class, text in new_block)
        elif tag == 'delete':
            old_tokens_result.extend(
                (token_class, 'del', text) for token_class, text in old_block)
        elif tag == 'insert':
            new_tokens_result.extend(
                (token_class, 'ins', text) for token_class, text in new_block)
        elif tag == 'replace':
            # if same type token blocks must be replaced, do a diff on the
            # characters in the token blocks to show individual changes
            old_char_tokens = [
                (token_class, char)
                for token_class, text in old_block for char in text]
            new_char_tokens = [
                (token_class, char)
                for token_class, text in new_block for char in text]

            # only the selected backend runs; the difflib opcodes are not
            # computed when diff match patch is used
            if use_diff_match_patch:
                obuffer, nbuffer = _char_diff_with_dmp(
                    old_char_tokens, new_char_tokens)
            else:
                obuffer, nbuffer = _char_diff_with_difflib(
                    old_char_tokens, new_char_tokens)

            old_tokens_result.extend(obuffer)
            new_tokens_result.extend(nbuffer)

    return old_tokens_result, new_tokens_result, similarity


def _char_diff_with_dmp(old_char_tokens, new_char_tokens):
    """Character level diff of two (token_class, char) lists using dmp."""
    old_string = ''.join(char for _, char in old_char_tokens)
    new_string = ''.join(char for _, char in new_char_tokens)

    dmp = diff_match_patch()
    dmp.Diff_EditCost = 11  # TODO: dan: extract this to a setting
    reps = dmp.diff_main(old_string, new_string)
    dmp.diff_cleanupEfficiency(reps)

    obuffer, nbuffer = [], []
    # positions inside the old / new character lists, used to look up the
    # token class each diffed character belongs to
    old_pos, new_pos = 0, 0
    for op, rep in reps:
        if op == 0:
            for char in rep:
                obuffer.append((old_char_tokens[old_pos][0], '', char))
                nbuffer.append((new_char_tokens[new_pos][0], '', char))
                old_pos += 1
                new_pos += 1
        elif op == -1:
            for char in rep:
                obuffer.append((old_char_tokens[old_pos][0], 'del', char))
                old_pos += 1
        elif op == 1:
            for char in rep:
                nbuffer.append((new_char_tokens[new_pos][0], 'ins', char))
                new_pos += 1
    return obuffer, nbuffer


def _char_diff_with_difflib(old_char_tokens, new_char_tokens):
    """Character level diff of two (token_class, char) lists using difflib."""
    char_sequence = difflib.SequenceMatcher(
        None,
        ''.join(char for _, char in old_char_tokens),
        ''.join(char for _, char in new_char_tokens))

    obuffer, nbuffer = [], []
    for ctag, co1, co2, cn1, cn2 in char_sequence.get_opcodes():
        # 'delete' has an empty new range and 'insert' an empty old range,
        # so both sides can be handled uniformly: unchanged for 'equal',
        # del/ins for everything else
        if ctag == 'equal':
            old_op, new_op = '', ''
        else:
            old_op, new_op = 'del', 'ins'
        obuffer.extend(
            (token_class, old_op, char)
            for token_class, char in old_char_tokens[co1:co2])
        nbuffer.extend(
            (token_class, new_op, char)
            for token_class, char in new_char_tokens[cn1:cn2])
    return obuffer, nbuffer
344
344
345
345
346 class DiffSet(object):
346 class DiffSet(object):
347 """
347 """
348 An object for parsing the diff result from diffs.DiffProcessor and
348 An object for parsing the diff result from diffs.DiffProcessor and
349 adding highlighting, side by side/unified renderings and line diffs
349 adding highlighting, side by side/unified renderings and line diffs
350 """
350 """
351
351
352 HL_REAL = 'REAL' # highlights using original file, slow
352 HL_REAL = 'REAL' # highlights using original file, slow
353 HL_FAST = 'FAST' # highlights using just the line, fast but not correct
353 HL_FAST = 'FAST' # highlights using just the line, fast but not correct
354 # in the case of multiline code
354 # in the case of multiline code
355 HL_NONE = 'NONE' # no highlighting, fastest
355 HL_NONE = 'NONE' # no highlighting, fastest
356
356
def __init__(self, highlight_mode=HL_REAL, repo_name=None,
             source_repo_name=None,
             source_node_getter=lambda filename: None,
             target_node_getter=lambda filename: None,
             source_nodes=None, target_nodes=None,
             max_file_size_limit=150 * 1024,  # files over this size will
                                              # use fast highlighting
             comments=None,
             ):
    """
    Store the rendering configuration on the instance.

    :param highlight_mode: one of the ``HL_*`` constants of the class
    :param repo_name: stored as ``repo_name``; also the fallback for
        ``source_repo_name``
    :param source_repo_name: stored as ``source_repo_name``, defaults to
        ``repo_name`` when falsy
    :param source_node_getter: callable taking a filename, used to fetch
        source nodes
    :param target_node_getter: callable taking a filename, used to fetch
        target nodes
    :param source_nodes: mapping of filename to already known source nodes
    :param target_nodes: mapping of filename to already known target nodes
    :param max_file_size_limit: size limit stored as ``max_file_size_limit``
    :param comments: mapping of comments; a shallow copy of it is kept as
        ``comments_store``
    """

    # rendering options
    self.highlight_mode = highlight_mode
    self.max_file_size_limit = max_file_size_limit
    self.highlighted_filenodes = {}

    # repository names, the source one falls back to the target one
    self.repo_name = repo_name
    self.source_repo_name = (
        source_repo_name if source_repo_name else repo_name)

    # file node access: lazy getters plus the nodes known so far
    self.source_node_getter = source_node_getter
    self.target_node_getter = target_node_getter
    self.source_nodes = source_nodes if source_nodes else {}
    self.target_nodes = target_nodes if target_nodes else {}

    self.comments = comments if comments else {}
    self.comments_store = self.comments.copy()
378
378
def render_patchset(self, patchset, source_ref=None, target_ref=None):
    """
    Render every patch of a patchset and collect the results in a diffset.

    :param patchset: iterable of patch dicts, each rendered with
        ``render_patch``
    :param source_ref: stored unchanged on the returned diffset
    :param target_ref: stored unchanged on the returned diffset
    :return: AttributeDict holding the rendered ``files`` together with the
        ``changed_files``, ``lines_added`` and ``lines_deleted`` counters
    """
    diffset = AttributeDict({
        'lines_added': 0,
        'lines_deleted': 0,
        'changed_files': 0,
        'files': [],
        'limited_diff': isinstance(patchset, LimitedDiffContainer),
        'repo_name': self.repo_name,
        'source_repo_name': self.source_repo_name,
        'source_ref': source_ref,
        'target_ref': target_ref,
    })

    for patch in patchset:
        filediff = self.render_patch(patch)
        filediff.diffset = diffset
        diffset.files.append(filediff)
        diffset.changed_files += 1

        stats = patch['stats']
        if stats['binary']:
            # binary patches do not contribute to the line counters
            continue
        diffset.lines_added += stats['added']
        diffset.lines_deleted += stats['deleted']

    return diffset
401
401
402 _lexer_cache = {}
402 _lexer_cache = {}
def _get_lexer_for_filename(self, filename, filenode=None):
    """
    Return the lexer for a filename, looking it up only once per filename.

    :param filename: name used as the cache key
    :param filenode: optional node; when truthy its ``lexer`` attribute is
        used instead of ``FileNode.get_lexer``
    """
    # cached because we might need to call it twice for source/target
    cache = self._lexer_cache
    if filename in cache:
        return cache[filename]

    if filenode:
        lexer = filenode.lexer
    else:
        lexer = FileNode.get_lexer(filename=filename)
    cache[filename] = lexer
    return lexer
412
412
def render_patch(self, patch):
    """
    Render a single patch into a filediff.

    Depending on ``highlight_mode`` the source/target file nodes are fetched
    (``HL_REAL``) or only lexers are picked by filename (``HL_FAST``); binary
    patches keep the plain text lexer. Every chunk after the first one is
    parsed with ``parse_hunk`` and attached to the result.

    :param patch: patch dict; the keys read here are ``filename``,
        ``original_filename``, ``operation``, ``stats`` and ``chunks``
    :return: AttributeDict describing the file diff, including its ``hunks``
        and ``left_comments``
    """
    log.debug('rendering diff for %r', patch['filename'])

    source_filename = patch['original_filename']
    target_filename = patch['filename']

    source_lexer = plain_text_lexer
    target_lexer = plain_text_lexer

    if not patch['stats']['binary']:
        if self.highlight_mode == self.HL_REAL:
            # fetch the nodes that exist for this operation, once per file
            if (source_filename and patch['operation'] in ('D', 'M')
                    and source_filename not in self.source_nodes):
                self.source_nodes[source_filename] = (
                    self.source_node_getter(source_filename))

            if (target_filename and patch['operation'] in ('A', 'M')
                    and target_filename not in self.target_nodes):
                self.target_nodes[target_filename] = (
                    self.target_node_getter(target_filename))

        elif self.highlight_mode == self.HL_FAST:
            source_lexer = self._get_lexer_for_filename(source_filename)
            target_lexer = self._get_lexer_for_filename(target_filename)

    # falls back to the plain filename when no node is known for the file
    source_file = self.source_nodes.get(source_filename, source_filename)
    target_file = self.target_nodes.get(target_filename, target_filename)

    source_filenode, target_filenode = None, None

    # TODO: dan: FileNode.lexer works on the content of the file - which
    # can be slow - issue #4289 explains a lexer clean up - which once
    # done can allow caching a lexer for a filenode to avoid the file lookup
    if isinstance(source_file, FileNode):
        source_filenode = source_file
        source_lexer = self._get_lexer_for_filename(source_filename)
        source_file.lexer = source_lexer

    if isinstance(target_file, FileNode):
        target_filenode = target_file
        target_lexer = self._get_lexer_for_filename(target_filename)
        target_file.lexer = target_lexer

    source_file_path, target_file_path = None, None

    if source_filename != '/dev/null':
        source_file_path = source_filename
    if target_filename != '/dev/null':
        target_file_path = target_filename

    source_file_type = source_lexer.name
    target_file_type = target_lexer.name

    # The first chunk is not rendered as a hunk
    # NOTE(review): presumably it holds operation-level entries - confirm
    # against diffs.DiffProcessor
    hunks = patch['chunks'][1:]

    filediff = AttributeDict({
        'source_file_path': source_file_path,
        'target_file_path': target_file_path,
        'source_filenode': source_filenode,
        'target_filenode': target_filenode,
        'hunks': [],
        # each side reports the name of its own lexer
        'source_file_type': source_file_type,
        'target_file_type': target_file_type,
        'patch': patch,
        'source_mode': patch['stats']['old_mode'],
        'target_mode': patch['stats']['new_mode'],
        'limited_diff': isinstance(patch, LimitedDiffContainer),
        'diffset': self,
    })

    for hunk in hunks:
        hunkbit = self.parse_hunk(hunk, source_file, target_file)
        hunkbit.filediff = filediff
        filediff.hunks.append(hunkbit)

    # Comments stored under the source path and under the target path both
    # end up in left_comments; target entries win for a shared line number.
    # NOTE(review): looks intentional (leftover comments) - confirm
    left_comments = {}
    for file_path in (source_file_path, target_file_path):
        if file_path in self.comments_store:
            for lineno, comments in self.comments_store[file_path].items():
                left_comments[lineno] = comments

    filediff.left_comments = left_comments
    return filediff
503
503
def parse_hunk(self, hunk, source_file, target_file):
    """
    Turn one raw hunk dict into an AttributeDict ready for rendering.

    The hunk header fields are copied over as-is.  The hunk lines are
    collected into an old-side and a new-side buffer; each unmodified line
    first flushes what has been buffered so far through ``parse_lines``
    and is then placed in both buffers itself.  Whatever is still buffered
    after the last line is flushed the same way.

    The result carries the paired rows as ``lines`` and ``sideside`` and a
    unified-order generator over the same rows as ``unified``.
    """
    parsed = AttributeDict(dict(
        source_start=hunk['source_start'],
        source_length=hunk['source_length'],
        target_start=hunk['target_start'],
        target_length=hunk['target_length'],
        section_header=hunk['section_header'],
        lines=[],
    ))

    old_side, new_side = [], []
    old_only_actions = ('del', 'old-no-nl')
    new_only_actions = ('add', 'new-no-nl')

    for entry in hunk['lines']:
        kind = entry['action']
        if kind == 'unmod':
            # parse_lines empties both buffers, so the context line below
            # starts a fresh group on each side.
            parsed.lines.extend(self.parse_lines(
                old_side, new_side, source_file, target_file))
            new_side.append(entry)
            old_side.append(entry)
        elif kind in new_only_actions:
            new_side.append(entry)
        elif kind in old_only_actions:
            old_side.append(entry)
        # any other action is ignored

    parsed.lines.extend(self.parse_lines(
        old_side, new_side, source_file, target_file))
    parsed.unified = self.as_unified(parsed.lines)
    parsed.sideside = parsed.lines

    return parsed
536
536
def parse_lines(self, before_lines, after_lines, source_file, target_file):
    """
    Pair buffered old-side and new-side hunk lines into side-by-side rows.

    Lines are matched by position: the first entry of ``before_lines``
    goes with the first entry of ``after_lines`` and so on.  When one
    buffer is shorter than the other, the remaining rows get an empty
    ``original`` or ``modified`` side.

    ``source_file`` / ``target_file`` are handed to ``get_line_tokens``
    and ``get_comments_for`` for the old and new side respectively.

    Returns a list of AttributeDicts, each with an ``original`` and a
    ``modified`` AttributeDict (``lineno``, ``content``, ``action``,
    ``comments``; all unset on an empty side).

    Both input lists are emptied in place before returning; ``parse_hunk``
    keeps appending to the very same lists between calls.
    """
    # TODO: dan: investigate doing the diff comparison and fast highlighting
    # on the entire before and after buffered block lines rather than by
    # line, this means we can get better 'fast' highlighting if the context
    # allows it - eg.
    # line 4: """
    # line 5: this gets highlighted as a string
    # line 6: """

    lines = []
    before_count = len(before_lines)
    after_count = len(after_lines)

    # Walk both buffers by index.  Repeatedly calling pop(0) shifts the
    # whole list each time, which is quadratic for large hunks.
    for idx in range(max(before_count, after_count)):
        before = before_lines[idx] if idx < before_count else None
        after = after_lines[idx] if idx < after_count else None
        before_tokens, after_tokens = None, None

        original = AttributeDict()
        modified = AttributeDict()

        if before:
            if before['action'] == 'old-no-nl':
                # the "no newline" marker gets a fixed 'nonl' token
                # instead of going through the lexer
                before_tokens = [('nonl', before['line'])]
            else:
                before_tokens = self.get_line_tokens(
                    line_text=before['line'],
                    line_number=before['old_lineno'],
                    file=source_file)
            original.lineno = before['old_lineno']
            original.content = before['line']
            original.action = self.action_to_op(before['action'])
            original.comments = self.get_comments_for(
                'old', source_file, before['old_lineno'])

        if after:
            if after['action'] == 'new-no-nl':
                after_tokens = [('nonl', after['line'])]
            else:
                after_tokens = self.get_line_tokens(
                    line_text=after['line'],
                    line_number=after['new_lineno'],
                    file=target_file)
            modified.lineno = after['new_lineno']
            modified.content = after['line']
            modified.action = self.action_to_op(after['action'])
            modified.comments = self.get_comments_for(
                'new', target_file, after['new_lineno'])

        # diff the lines
        if before_tokens and after_tokens:
            o_tokens, m_tokens, similarity = tokens_diff(
                before_tokens, after_tokens)
            original.content = render_tokenstream(o_tokens)
            modified.content = render_tokenstream(m_tokens)
        elif before_tokens:
            # only one side present: widen the 2-tuples to the 3-tuple
            # form render_tokenstream is given above, with an empty
            # middle element (presumably the diff-op slot - confirm
            # against tokens_diff)
            original.content = render_tokenstream(
                [(x[0], '', x[1]) for x in before_tokens])
        elif after_tokens:
            modified.content = render_tokenstream(
                [(x[0], '', x[1]) for x in after_tokens])

        lines.append(AttributeDict({
            'original': original,
            'modified': modified,
        }))

    # Leave the caller's buffers empty, exactly as draining them would.
    del before_lines[:]
    del after_lines[:]

    return lines
604
604
def get_comments_for(self, version, file, line_number):
    """
    Take the comments stored for one line of one file out of the store.

    ``file`` may be a path string or any object exposing ``unicode_path``;
    anything else yields ``None``.  ``version`` must be ``'old'`` or
    ``'new'`` and selects the ``o`` / ``n`` prefix of the line key
    (e.g. ``o12``, ``n7``).

    Matching comments are removed from ``self.comments_store`` and
    returned; ``None`` is returned when nothing is stored for that line.
    """
    path = file.unicode_path if hasattr(file, 'unicode_path') else file
    if not isinstance(path, basestring):
        return None

    prefix = {'old': 'o', 'new': 'n'}[version]
    key = prefix + str(line_number)

    if path not in self.comments_store:
        return None

    per_file = self.comments_store[path]
    if key not in per_file:
        return None

    # pop, so that a comment is handed out only once
    return per_file.pop(key)
621
621
def get_line_tokens(self, line_text, line_number, file=None):
    """
    Tokenize a single diff line for highlighting.

    ``file`` may be a path string, a FileNode, or anything else (treated
    as "no file").  Three strategies are tried in order:

    * in HL_REAL mode with a FileNode, a known line number and a file
      smaller than ``self.max_file_size_limit``: take the line from the
      fully tokenized file;
    * in HL_REAL or HL_FAST mode with a known filename: tokenize just
      ``line_text`` with the lexer chosen for that filename;
    * otherwise: tokenize ``line_text`` with the plain text lexer.
    """
    filenode, filename = None, None
    if isinstance(file, basestring):
        filename = file
    elif isinstance(file, FileNode):
        filenode, filename = file, file.unicode_path

    mode = self.highlight_mode

    if mode == self.HL_REAL and filenode:
        lexer = self._get_lexer_for_filename(filename)
        within_limit = file.size < self.max_file_size_limit
        if line_number and within_limit:
            return self.get_tokenized_filenode_line(
                file, line_number, lexer)

    if mode in (self.HL_REAL, self.HL_FAST) and filename:
        lexer = self._get_lexer_for_filename(filename)
    else:
        lexer = plain_text_lexer

    return list(tokenize_string(line_text, lexer))
644
644
def get_tokenized_filenode_line(self, filenode, line_number, lexer=None):
    """
    Return the tokens of line ``line_number`` (1-based) of ``filenode``.

    The whole file is tokenized with ``filenode_as_lines_tokens`` the
    first time a filenode is requested and the result is kept in
    ``self.highlighted_filenodes`` for later lookups.
    """
    cache = self.highlighted_filenodes
    if filenode not in cache:
        cache[filenode] = filenode_as_lines_tokens(filenode, lexer)
    return cache[filenode][line_number - 1]
651
651
def action_to_op(self, action):
    """
    Map a diff line action to its one-character operation marker.

    ``add`` becomes ``+``, ``del`` becomes ``-``; ``unmod`` and both
    no-newline actions become a single space.  Any other value is
    returned unchanged.
    """
    ops = dict.fromkeys(('unmod', 'old-no-nl', 'new-no-nl'), ' ')
    ops['add'] = '+'
    ops['del'] = '-'
    try:
        return ops[action]
    except KeyError:
        return action
660
660
def as_unified(self, lines):
    """
    Return a generator that yields the lines of a diff in unified order

    Each yielded item is a tuple
    ``(old_lineno, new_lineno, action, content, comments)``.  Within a
    changed block the removed lines come first; the added lines paired
    with them are held back and emitted once the block ends.
    """
    def old_row(old, new_lineno):
        return (old.lineno, new_lineno, old.action, old.content,
                old.comments)

    def new_row(new):
        return (None, new.lineno, new.action, new.content, new.comments)

    def rows():
        pending = []  # added lines waiting for their removed block to end
        for pair in lines:
            old, new = pair.original, pair.modified

            # A row without an old side, or a context row, ends the
            # current block: release the held-back additions first.
            if (pending and not old) or old.action == ' ':
                for held in pending:
                    yield held
                pending = []

            if not old:
                if new:
                    yield new_row(new)
                continue

            if old.action == ' ':
                yield old_row(old, new.lineno)
                continue

            if old.action == '-':
                yield old_row(old, None)

            if new.action == '+':
                pending.append(new_row(new))

        for held in pending:
            yield held

    return rows()
General Comments 0
You need to be logged in to leave comments. Login now