diffs: optimize how lexer is fetched for rich highlight mode...
marcink
r1356:1e4a47eb default
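The commit replaces per-node lexer detection with a small per-filename cache: in rich ('REAL') highlight mode the old code called FileNode.lexer for every file, which makes Pygments guess the lexer from the file's content and is slow, while the new _get_lexer_for_filename(filename, filenode=None) helper resolves a lexer once per filename, preferring a lexer already attached to the node when one is passed and otherwise guessing from the path. Below is a minimal standalone sketch of that caching idea using plain Pygments; the guess_lexer_cached name and the node argument are illustrative only, not RhodeCode API.

# Minimal illustrative sketch of the lexer caching introduced by this commit;
# not the RhodeCode implementation. A lexer is resolved once per filename and
# reused for both sides of a diff instead of being re-derived from content.
from pygments.lexers import get_lexer_for_filename
from pygments.lexers.special import TextLexer
from pygments.util import ClassNotFound

_lexer_cache = {}

def guess_lexer_cached(filename, node=None):
    # Prefer a lexer already attached to the file node (as FileNode.lexer
    # would be), otherwise guess from the filename, falling back to plain text.
    if filename not in _lexer_cache:
        if node is not None and getattr(node, 'lexer', None) is not None:
            lexer = node.lexer
        else:
            try:
                lexer = get_lexer_for_filename(filename, stripnl=False)
            except ClassNotFound:
                lexer = TextLexer(stripnl=False)
        _lexer_cache[filename] = lexer
    return _lexer_cache[filename]

# Repeated lookups of the same filename return the same lexer instance, so
# highlighting the old and the new version of a file costs one lookup.
assert guess_lexer_cached('setup.py') is guess_lexer_cached('setup.py')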
@@ -1,687 +1,701 @@ b''
1 # -*- coding: utf-8 -*-
1 # -*- coding: utf-8 -*-
2
2
3 # Copyright (C) 2011-2017 RhodeCode GmbH
3 # Copyright (C) 2011-2017 RhodeCode GmbH
4 #
4 #
5 # This program is free software: you can redistribute it and/or modify
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License, version 3
6 # it under the terms of the GNU Affero General Public License, version 3
7 # (only), as published by the Free Software Foundation.
7 # (only), as published by the Free Software Foundation.
8 #
8 #
9 # This program is distributed in the hope that it will be useful,
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
12 # GNU General Public License for more details.
13 #
13 #
14 # You should have received a copy of the GNU Affero General Public License
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 #
16 #
17 # This program is dual-licensed. If you wish to learn more about the
17 # This program is dual-licensed. If you wish to learn more about the
18 # RhodeCode Enterprise Edition, including its added features, Support services,
18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20
20
21 import logging
21 import logging
22 import difflib
22 import difflib
23 from itertools import groupby
23 from itertools import groupby
24
24
25 from pygments import lex
25 from pygments import lex
26 from pygments.formatters.html import _get_ttype_class as pygment_token_class
26 from pygments.formatters.html import _get_ttype_class as pygment_token_class
27 from rhodecode.lib.helpers import (
27 from rhodecode.lib.helpers import (
28 get_lexer_for_filenode, get_lexer_safe, html_escape)
28 get_lexer_for_filenode, get_lexer_safe, html_escape)
29 from rhodecode.lib.utils2 import AttributeDict
29 from rhodecode.lib.utils2 import AttributeDict
30 from rhodecode.lib.vcs.nodes import FileNode
30 from rhodecode.lib.vcs.nodes import FileNode
31 from rhodecode.lib.diff_match_patch import diff_match_patch
31 from rhodecode.lib.diff_match_patch import diff_match_patch
32 from rhodecode.lib.diffs import LimitedDiffContainer
32 from rhodecode.lib.diffs import LimitedDiffContainer
33 from pygments.lexers import get_lexer_by_name
33 from pygments.lexers import get_lexer_by_name
34
34
35 plain_text_lexer = get_lexer_by_name(
35 plain_text_lexer = get_lexer_by_name(
36 'text', stripall=False, stripnl=False, ensurenl=False)
36 'text', stripall=False, stripnl=False, ensurenl=False)
37
37
38
38
39 log = logging.getLogger()
39 log = logging.getLogger()
40
40
41
41
42 def filenode_as_lines_tokens(filenode, lexer=None):
42 def filenode_as_lines_tokens(filenode, lexer=None):
43 org_lexer = lexer
43 lexer = lexer or get_lexer_for_filenode(filenode)
44 lexer = lexer or get_lexer_for_filenode(filenode)
44 log.debug('Generating file node pygment tokens for %s, %s', lexer, filenode)
45 log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s',
46 lexer, filenode, org_lexer)
45 tokens = tokenize_string(filenode.content, lexer)
47 tokens = tokenize_string(filenode.content, lexer)
46 lines = split_token_stream(tokens, split_string='\n')
48 lines = split_token_stream(tokens, split_string='\n')
47 rv = list(lines)
49 rv = list(lines)
48 return rv
50 return rv
49
51
50
52
51 def tokenize_string(content, lexer):
53 def tokenize_string(content, lexer):
52 """
54 """
53 Use pygments to tokenize some content based on a lexer
55 Use pygments to tokenize some content based on a lexer
54 ensuring all original new lines and whitespace is preserved
56 ensuring all original new lines and whitespace is preserved
55 """
57 """
56
58
57 lexer.stripall = False
59 lexer.stripall = False
58 lexer.stripnl = False
60 lexer.stripnl = False
59 lexer.ensurenl = False
61 lexer.ensurenl = False
60 for token_type, token_text in lex(content, lexer):
62 for token_type, token_text in lex(content, lexer):
61 yield pygment_token_class(token_type), token_text
63 yield pygment_token_class(token_type), token_text
62
64
63
65
64 def split_token_stream(tokens, split_string=u'\n'):
66 def split_token_stream(tokens, split_string=u'\n'):
65 """
67 """
66 Take a list of (TokenType, text) tuples and split them by a string
68 Take a list of (TokenType, text) tuples and split them by a string
67
69
68 >>> split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')])
70 >>> split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')])
69 [(TEXT, 'some'), (TEXT, 'text'),
71 [(TEXT, 'some'), (TEXT, 'text'),
70 (TEXT, 'more'), (TEXT, 'text')]
72 (TEXT, 'more'), (TEXT, 'text')]
71 """
73 """
72
74
73 buffer = []
75 buffer = []
74 for token_class, token_text in tokens:
76 for token_class, token_text in tokens:
75 parts = token_text.split(split_string)
77 parts = token_text.split(split_string)
76 for part in parts[:-1]:
78 for part in parts[:-1]:
77 buffer.append((token_class, part))
79 buffer.append((token_class, part))
78 yield buffer
80 yield buffer
79 buffer = []
81 buffer = []
80
82
81 buffer.append((token_class, parts[-1]))
83 buffer.append((token_class, parts[-1]))
82
84
83 if buffer:
85 if buffer:
84 yield buffer
86 yield buffer
85
87
86
88
87 def filenode_as_annotated_lines_tokens(filenode):
89 def filenode_as_annotated_lines_tokens(filenode):
88 """
90 """
89 Take a file node and return a list of annotations => lines, if no annotation
91 Take a file node and return a list of annotations => lines, if no annotation
90 is found, it will be None.
92 is found, it will be None.
91
93
92 eg:
94 eg:
93
95
94 [
96 [
95 (annotation1, [
97 (annotation1, [
96 (1, line1_tokens_list),
98 (1, line1_tokens_list),
97 (2, line2_tokens_list),
99 (2, line2_tokens_list),
98 ]),
100 ]),
99 (annotation2, [
101 (annotation2, [
100 (3, line1_tokens_list),
102 (3, line1_tokens_list),
101 ]),
103 ]),
102 (None, [
104 (None, [
103 (4, line1_tokens_list),
105 (4, line1_tokens_list),
104 ]),
106 ]),
105 (annotation1, [
107 (annotation1, [
106 (5, line1_tokens_list),
108 (5, line1_tokens_list),
107 (6, line2_tokens_list),
109 (6, line2_tokens_list),
108 ])
110 ])
109 ]
111 ]
110 """
112 """
111
113
112 commit_cache = {} # cache commit_getter lookups
114 commit_cache = {} # cache commit_getter lookups
113
115
114 def _get_annotation(commit_id, commit_getter):
116 def _get_annotation(commit_id, commit_getter):
115 if commit_id not in commit_cache:
117 if commit_id not in commit_cache:
116 commit_cache[commit_id] = commit_getter()
118 commit_cache[commit_id] = commit_getter()
117 return commit_cache[commit_id]
119 return commit_cache[commit_id]
118
120
119 annotation_lookup = {
121 annotation_lookup = {
120 line_no: _get_annotation(commit_id, commit_getter)
122 line_no: _get_annotation(commit_id, commit_getter)
121 for line_no, commit_id, commit_getter, line_content
123 for line_no, commit_id, commit_getter, line_content
122 in filenode.annotate
124 in filenode.annotate
123 }
125 }
124
126
125 annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens)
127 annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens)
126 for line_no, tokens
128 for line_no, tokens
127 in enumerate(filenode_as_lines_tokens(filenode), 1))
129 in enumerate(filenode_as_lines_tokens(filenode), 1))
128
130
129 grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0])
131 grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0])
130
132
131 for annotation, group in grouped_annotations_lines:
133 for annotation, group in grouped_annotations_lines:
132 yield (
134 yield (
133 annotation, [(line_no, tokens)
135 annotation, [(line_no, tokens)
134 for (_, line_no, tokens) in group]
136 for (_, line_no, tokens) in group]
135 )
137 )
136
138
137
139
138 def render_tokenstream(tokenstream):
140 def render_tokenstream(tokenstream):
139 result = []
141 result = []
140 for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
142 for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
141
143
142 if token_class:
144 if token_class:
143 result.append(u'<span class="%s">' % token_class)
145 result.append(u'<span class="%s">' % token_class)
144 else:
146 else:
145 result.append(u'<span>')
147 result.append(u'<span>')
146
148
147 for op_tag, token_text in token_ops_texts:
149 for op_tag, token_text in token_ops_texts:
148
150
149 if op_tag:
151 if op_tag:
150 result.append(u'<%s>' % op_tag)
152 result.append(u'<%s>' % op_tag)
151
153
152 escaped_text = html_escape(token_text)
154 escaped_text = html_escape(token_text)
153
155
154 # TODO: dan: investigate showing hidden characters like space/nl/tab
156 # TODO: dan: investigate showing hidden characters like space/nl/tab
155 # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
157 # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
156 # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
158 # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
157 # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')
159 # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')
158
160
159 result.append(escaped_text)
161 result.append(escaped_text)
160
162
161 if op_tag:
163 if op_tag:
162 result.append(u'</%s>' % op_tag)
164 result.append(u'</%s>' % op_tag)
163
165
164 result.append(u'</span>')
166 result.append(u'</span>')
165
167
166 html = ''.join(result)
168 html = ''.join(result)
167 return html
169 return html
168
170
169
171
170 def rollup_tokenstream(tokenstream):
172 def rollup_tokenstream(tokenstream):
171 """
173 """
172 Group a token stream of the format:
174 Group a token stream of the format:
173
175
174 ('class', 'op', 'text')
176 ('class', 'op', 'text')
175 or
177 or
176 ('class', 'text')
178 ('class', 'text')
177
179
178 into
180 into
179
181
180 [('class1',
182 [('class1',
181 [('op1', 'text'),
183 [('op1', 'text'),
182 ('op2', 'text')]),
184 ('op2', 'text')]),
183 ('class2',
185 ('class2',
184 [('op3', 'text')])]
186 [('op3', 'text')])]
185
187
186 This is used to get the minimal tags necessary when
188 This is used to get the minimal tags necessary when
187 rendering to html eg for a token stream ie.
189 rendering to html eg for a token stream ie.
188
190
189 <span class="A"><ins>he</ins>llo</span>
191 <span class="A"><ins>he</ins>llo</span>
190 vs
192 vs
191 <span class="A"><ins>he</ins></span><span class="A">llo</span>
193 <span class="A"><ins>he</ins></span><span class="A">llo</span>
192
194
193 If a 2 tuple is passed in, the output op will be an empty string.
195 If a 2 tuple is passed in, the output op will be an empty string.
194
196
195 eg:
197 eg:
196
198
197 >>> rollup_tokenstream([('classA', '', 'h'),
199 >>> rollup_tokenstream([('classA', '', 'h'),
198 ('classA', 'del', 'ell'),
200 ('classA', 'del', 'ell'),
199 ('classA', '', 'o'),
201 ('classA', '', 'o'),
200 ('classB', '', ' '),
202 ('classB', '', ' '),
201 ('classA', '', 'the'),
203 ('classA', '', 'the'),
202 ('classA', '', 're'),
204 ('classA', '', 're'),
203 ])
205 ])
204
206
205 [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')],
207 [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')],
206 ('classB', [('', ' ')],
208 ('classB', [('', ' ')],
207 ('classA', [('', 'there')]]
209 ('classA', [('', 'there')]]
208
210
209 """
211 """
210 if tokenstream and len(tokenstream[0]) == 2:
212 if tokenstream and len(tokenstream[0]) == 2:
211 tokenstream = ((t[0], '', t[1]) for t in tokenstream)
213 tokenstream = ((t[0], '', t[1]) for t in tokenstream)
212
214
213 result = []
215 result = []
214 for token_class, op_list in groupby(tokenstream, lambda t: t[0]):
216 for token_class, op_list in groupby(tokenstream, lambda t: t[0]):
215 ops = []
217 ops = []
216 for token_op, token_text_list in groupby(op_list, lambda o: o[1]):
218 for token_op, token_text_list in groupby(op_list, lambda o: o[1]):
217 text_buffer = []
219 text_buffer = []
218 for t_class, t_op, t_text in token_text_list:
220 for t_class, t_op, t_text in token_text_list:
219 text_buffer.append(t_text)
221 text_buffer.append(t_text)
220 ops.append((token_op, ''.join(text_buffer)))
222 ops.append((token_op, ''.join(text_buffer)))
221 result.append((token_class, ops))
223 result.append((token_class, ops))
222 return result
224 return result
223
225
224
226
225 def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
227 def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
226 """
228 """
227 Converts a list of (token_class, token_text) tuples to a list of
229 Converts a list of (token_class, token_text) tuples to a list of
228 (token_class, token_op, token_text) tuples where token_op is one of
230 (token_class, token_op, token_text) tuples where token_op is one of
229 ('ins', 'del', '')
231 ('ins', 'del', '')
230
232
231 :param old_tokens: list of (token_class, token_text) tuples of old line
233 :param old_tokens: list of (token_class, token_text) tuples of old line
232 :param new_tokens: list of (token_class, token_text) tuples of new line
234 :param new_tokens: list of (token_class, token_text) tuples of new line
233 :param use_diff_match_patch: boolean, will use google's diff match patch
235 :param use_diff_match_patch: boolean, will use google's diff match patch
234 library which has options to 'smooth' out the character by character
236 library which has options to 'smooth' out the character by character
235 differences making nicer ins/del blocks
237 differences making nicer ins/del blocks
236 """
238 """
237
239
238 old_tokens_result = []
240 old_tokens_result = []
239 new_tokens_result = []
241 new_tokens_result = []
240
242
241 similarity = difflib.SequenceMatcher(None,
243 similarity = difflib.SequenceMatcher(None,
242 ''.join(token_text for token_class, token_text in old_tokens),
244 ''.join(token_text for token_class, token_text in old_tokens),
243 ''.join(token_text for token_class, token_text in new_tokens)
245 ''.join(token_text for token_class, token_text in new_tokens)
244 ).ratio()
246 ).ratio()
245
247
246 if similarity < 0.6: # return, the blocks are too different
248 if similarity < 0.6: # return, the blocks are too different
247 for token_class, token_text in old_tokens:
249 for token_class, token_text in old_tokens:
248 old_tokens_result.append((token_class, '', token_text))
250 old_tokens_result.append((token_class, '', token_text))
249 for token_class, token_text in new_tokens:
251 for token_class, token_text in new_tokens:
250 new_tokens_result.append((token_class, '', token_text))
252 new_tokens_result.append((token_class, '', token_text))
251 return old_tokens_result, new_tokens_result, similarity
253 return old_tokens_result, new_tokens_result, similarity
252
254
253 token_sequence_matcher = difflib.SequenceMatcher(None,
255 token_sequence_matcher = difflib.SequenceMatcher(None,
254 [x[1] for x in old_tokens],
256 [x[1] for x in old_tokens],
255 [x[1] for x in new_tokens])
257 [x[1] for x in new_tokens])
256
258
257 for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
259 for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
258 # check the differences by token block types first to give a more
260 # check the differences by token block types first to give a more
259 # nicer "block" level replacement vs character diffs
261 # nicer "block" level replacement vs character diffs
260
262
261 if tag == 'equal':
263 if tag == 'equal':
262 for token_class, token_text in old_tokens[o1:o2]:
264 for token_class, token_text in old_tokens[o1:o2]:
263 old_tokens_result.append((token_class, '', token_text))
265 old_tokens_result.append((token_class, '', token_text))
264 for token_class, token_text in new_tokens[n1:n2]:
266 for token_class, token_text in new_tokens[n1:n2]:
265 new_tokens_result.append((token_class, '', token_text))
267 new_tokens_result.append((token_class, '', token_text))
266 elif tag == 'delete':
268 elif tag == 'delete':
267 for token_class, token_text in old_tokens[o1:o2]:
269 for token_class, token_text in old_tokens[o1:o2]:
268 old_tokens_result.append((token_class, 'del', token_text))
270 old_tokens_result.append((token_class, 'del', token_text))
269 elif tag == 'insert':
271 elif tag == 'insert':
270 for token_class, token_text in new_tokens[n1:n2]:
272 for token_class, token_text in new_tokens[n1:n2]:
271 new_tokens_result.append((token_class, 'ins', token_text))
273 new_tokens_result.append((token_class, 'ins', token_text))
272 elif tag == 'replace':
274 elif tag == 'replace':
273 # if same type token blocks must be replaced, do a diff on the
275 # if same type token blocks must be replaced, do a diff on the
274 # characters in the token blocks to show individual changes
276 # characters in the token blocks to show individual changes
275
277
276 old_char_tokens = []
278 old_char_tokens = []
277 new_char_tokens = []
279 new_char_tokens = []
278 for token_class, token_text in old_tokens[o1:o2]:
280 for token_class, token_text in old_tokens[o1:o2]:
279 for char in token_text:
281 for char in token_text:
280 old_char_tokens.append((token_class, char))
282 old_char_tokens.append((token_class, char))
281
283
282 for token_class, token_text in new_tokens[n1:n2]:
284 for token_class, token_text in new_tokens[n1:n2]:
283 for char in token_text:
285 for char in token_text:
284 new_char_tokens.append((token_class, char))
286 new_char_tokens.append((token_class, char))
285
287
286 old_string = ''.join([token_text for
288 old_string = ''.join([token_text for
287 token_class, token_text in old_char_tokens])
289 token_class, token_text in old_char_tokens])
288 new_string = ''.join([token_text for
290 new_string = ''.join([token_text for
289 token_class, token_text in new_char_tokens])
291 token_class, token_text in new_char_tokens])
290
292
291 char_sequence = difflib.SequenceMatcher(
293 char_sequence = difflib.SequenceMatcher(
292 None, old_string, new_string)
294 None, old_string, new_string)
293 copcodes = char_sequence.get_opcodes()
295 copcodes = char_sequence.get_opcodes()
294 obuffer, nbuffer = [], []
296 obuffer, nbuffer = [], []
295
297
296 if use_diff_match_patch:
298 if use_diff_match_patch:
297 dmp = diff_match_patch()
299 dmp = diff_match_patch()
298 dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting
300 dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting
299 reps = dmp.diff_main(old_string, new_string)
301 reps = dmp.diff_main(old_string, new_string)
300 dmp.diff_cleanupEfficiency(reps)
302 dmp.diff_cleanupEfficiency(reps)
301
303
302 a, b = 0, 0
304 a, b = 0, 0
303 for op, rep in reps:
305 for op, rep in reps:
304 l = len(rep)
306 l = len(rep)
305 if op == 0:
307 if op == 0:
306 for i, c in enumerate(rep):
308 for i, c in enumerate(rep):
307 obuffer.append((old_char_tokens[a+i][0], '', c))
309 obuffer.append((old_char_tokens[a+i][0], '', c))
308 nbuffer.append((new_char_tokens[b+i][0], '', c))
310 nbuffer.append((new_char_tokens[b+i][0], '', c))
309 a += l
311 a += l
310 b += l
312 b += l
311 elif op == -1:
313 elif op == -1:
312 for i, c in enumerate(rep):
314 for i, c in enumerate(rep):
313 obuffer.append((old_char_tokens[a+i][0], 'del', c))
315 obuffer.append((old_char_tokens[a+i][0], 'del', c))
314 a += l
316 a += l
315 elif op == 1:
317 elif op == 1:
316 for i, c in enumerate(rep):
318 for i, c in enumerate(rep):
317 nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
319 nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
318 b += l
320 b += l
319 else:
321 else:
320 for ctag, co1, co2, cn1, cn2 in copcodes:
322 for ctag, co1, co2, cn1, cn2 in copcodes:
321 if ctag == 'equal':
323 if ctag == 'equal':
322 for token_class, token_text in old_char_tokens[co1:co2]:
324 for token_class, token_text in old_char_tokens[co1:co2]:
323 obuffer.append((token_class, '', token_text))
325 obuffer.append((token_class, '', token_text))
324 for token_class, token_text in new_char_tokens[cn1:cn2]:
326 for token_class, token_text in new_char_tokens[cn1:cn2]:
325 nbuffer.append((token_class, '', token_text))
327 nbuffer.append((token_class, '', token_text))
326 elif ctag == 'delete':
328 elif ctag == 'delete':
327 for token_class, token_text in old_char_tokens[co1:co2]:
329 for token_class, token_text in old_char_tokens[co1:co2]:
328 obuffer.append((token_class, 'del', token_text))
330 obuffer.append((token_class, 'del', token_text))
329 elif ctag == 'insert':
331 elif ctag == 'insert':
330 for token_class, token_text in new_char_tokens[cn1:cn2]:
332 for token_class, token_text in new_char_tokens[cn1:cn2]:
331 nbuffer.append((token_class, 'ins', token_text))
333 nbuffer.append((token_class, 'ins', token_text))
332 elif ctag == 'replace':
334 elif ctag == 'replace':
333 for token_class, token_text in old_char_tokens[co1:co2]:
335 for token_class, token_text in old_char_tokens[co1:co2]:
334 obuffer.append((token_class, 'del', token_text))
336 obuffer.append((token_class, 'del', token_text))
335 for token_class, token_text in new_char_tokens[cn1:cn2]:
337 for token_class, token_text in new_char_tokens[cn1:cn2]:
336 nbuffer.append((token_class, 'ins', token_text))
338 nbuffer.append((token_class, 'ins', token_text))
337
339
338 old_tokens_result.extend(obuffer)
340 old_tokens_result.extend(obuffer)
339 new_tokens_result.extend(nbuffer)
341 new_tokens_result.extend(nbuffer)
340
342
341 return old_tokens_result, new_tokens_result, similarity
343 return old_tokens_result, new_tokens_result, similarity
342
344
343
345
344 class DiffSet(object):
346 class DiffSet(object):
345 """
347 """
346 An object for parsing the diff result from diffs.DiffProcessor and
348 An object for parsing the diff result from diffs.DiffProcessor and
347 adding highlighting, side by side/unified renderings and line diffs
349 adding highlighting, side by side/unified renderings and line diffs
348 """
350 """
349
351
350 HL_REAL = 'REAL' # highlights using original file, slow
352 HL_REAL = 'REAL' # highlights using original file, slow
351 HL_FAST = 'FAST' # highlights using just the line, fast but not correct
353 HL_FAST = 'FAST' # highlights using just the line, fast but not correct
352 # in the case of multiline code
354 # in the case of multiline code
353 HL_NONE = 'NONE' # no highlighting, fastest
355 HL_NONE = 'NONE' # no highlighting, fastest
354
356
355 def __init__(self, highlight_mode=HL_REAL, repo_name=None,
357 def __init__(self, highlight_mode=HL_REAL, repo_name=None,
356 source_repo_name=None,
358 source_repo_name=None,
357 source_node_getter=lambda filename: None,
359 source_node_getter=lambda filename: None,
358 target_node_getter=lambda filename: None,
360 target_node_getter=lambda filename: None,
359 source_nodes=None, target_nodes=None,
361 source_nodes=None, target_nodes=None,
360 max_file_size_limit=150 * 1024, # files over this size will
362 max_file_size_limit=150 * 1024, # files over this size will
361 # use fast highlighting
363 # use fast highlighting
362 comments=None,
364 comments=None,
363 ):
365 ):
364
366
365 self.highlight_mode = highlight_mode
367 self.highlight_mode = highlight_mode
366 self.highlighted_filenodes = {}
368 self.highlighted_filenodes = {}
367 self.source_node_getter = source_node_getter
369 self.source_node_getter = source_node_getter
368 self.target_node_getter = target_node_getter
370 self.target_node_getter = target_node_getter
369 self.source_nodes = source_nodes or {}
371 self.source_nodes = source_nodes or {}
370 self.target_nodes = target_nodes or {}
372 self.target_nodes = target_nodes or {}
371 self.repo_name = repo_name
373 self.repo_name = repo_name
372 self.source_repo_name = source_repo_name or repo_name
374 self.source_repo_name = source_repo_name or repo_name
373 self.comments = comments or {}
375 self.comments = comments or {}
374 self.comments_store = self.comments.copy()
376 self.comments_store = self.comments.copy()
375 self.max_file_size_limit = max_file_size_limit
377 self.max_file_size_limit = max_file_size_limit
376
378
377 def render_patchset(self, patchset, source_ref=None, target_ref=None):
379 def render_patchset(self, patchset, source_ref=None, target_ref=None):
378 diffset = AttributeDict(dict(
380 diffset = AttributeDict(dict(
379 lines_added=0,
381 lines_added=0,
380 lines_deleted=0,
382 lines_deleted=0,
381 changed_files=0,
383 changed_files=0,
382 files=[],
384 files=[],
383 limited_diff=isinstance(patchset, LimitedDiffContainer),
385 limited_diff=isinstance(patchset, LimitedDiffContainer),
384 repo_name=self.repo_name,
386 repo_name=self.repo_name,
385 source_repo_name=self.source_repo_name,
387 source_repo_name=self.source_repo_name,
386 source_ref=source_ref,
388 source_ref=source_ref,
387 target_ref=target_ref,
389 target_ref=target_ref,
388 ))
390 ))
389 for patch in patchset:
391 for patch in patchset:
390 filediff = self.render_patch(patch)
392 filediff = self.render_patch(patch)
391 filediff.diffset = diffset
393 filediff.diffset = diffset
392 diffset.files.append(filediff)
394 diffset.files.append(filediff)
393 diffset.changed_files += 1
395 diffset.changed_files += 1
394 if not patch['stats']['binary']:
396 if not patch['stats']['binary']:
395 diffset.lines_added += patch['stats']['added']
397 diffset.lines_added += patch['stats']['added']
396 diffset.lines_deleted += patch['stats']['deleted']
398 diffset.lines_deleted += patch['stats']['deleted']
397
399
398 return diffset
400 return diffset
399
401
400 _lexer_cache = {}
402 _lexer_cache = {}
401 def _get_lexer_for_filename(self, filename):
403 def _get_lexer_for_filename(self, filename, filenode=None):
402 # cached because we might need to call it twice for source/target
404 # cached because we might need to call it twice for source/target
403 if filename not in self._lexer_cache:
405 if filename not in self._lexer_cache:
404 self._lexer_cache[filename] = get_lexer_safe(filepath=filename)
406 if filenode:
407 lexer = filenode.lexer
408 else:
409 lexer = get_lexer_safe(filepath=filename)
410 self._lexer_cache[filename] = lexer
405 return self._lexer_cache[filename]
411 return self._lexer_cache[filename]
406
412
407 def render_patch(self, patch):
413 def render_patch(self, patch):
408 log.debug('rendering diff for %r' % patch['filename'])
414 log.debug('rendering diff for %r' % patch['filename'])
409
415
410 source_filename = patch['original_filename']
416 source_filename = patch['original_filename']
411 target_filename = patch['filename']
417 target_filename = patch['filename']
412
418
413 source_lexer = plain_text_lexer
419 source_lexer = plain_text_lexer
414 target_lexer = plain_text_lexer
420 target_lexer = plain_text_lexer
415
421
416 if not patch['stats']['binary']:
422 if not patch['stats']['binary']:
417 if self.highlight_mode == self.HL_REAL:
423 if self.highlight_mode == self.HL_REAL:
418 if (source_filename and patch['operation'] in ('D', 'M')
424 if (source_filename and patch['operation'] in ('D', 'M')
419 and source_filename not in self.source_nodes):
425 and source_filename not in self.source_nodes):
420 self.source_nodes[source_filename] = (
426 self.source_nodes[source_filename] = (
421 self.source_node_getter(source_filename))
427 self.source_node_getter(source_filename))
422
428
423 if (target_filename and patch['operation'] in ('A', 'M')
429 if (target_filename and patch['operation'] in ('A', 'M')
424 and target_filename not in self.target_nodes):
430 and target_filename not in self.target_nodes):
425 self.target_nodes[target_filename] = (
431 self.target_nodes[target_filename] = (
426 self.target_node_getter(target_filename))
432 self.target_node_getter(target_filename))
427
433
428 elif self.highlight_mode == self.HL_FAST:
434 elif self.highlight_mode == self.HL_FAST:
429 source_lexer = self._get_lexer_for_filename(source_filename)
435 source_lexer = self._get_lexer_for_filename(source_filename)
430 target_lexer = self._get_lexer_for_filename(target_filename)
436 target_lexer = self._get_lexer_for_filename(target_filename)
431
437
432 source_file = self.source_nodes.get(source_filename, source_filename)
438 source_file = self.source_nodes.get(source_filename, source_filename)
433 target_file = self.target_nodes.get(target_filename, target_filename)
439 target_file = self.target_nodes.get(target_filename, target_filename)
434
440
435 source_filenode, target_filenode = None, None
441 source_filenode, target_filenode = None, None
436
442
437 # TODO: dan: FileNode.lexer works on the content of the file - which
443 # TODO: dan: FileNode.lexer works on the content of the file - which
438 # can be slow - issue #4289 explains a lexer clean up - which once
444 # can be slow - issue #4289 explains a lexer clean up - which once
439 # done can allow caching a lexer for a filenode to avoid the file lookup
445 # done can allow caching a lexer for a filenode to avoid the file lookup
440 if isinstance(source_file, FileNode):
446 if isinstance(source_file, FileNode):
441 source_filenode = source_file
447 source_filenode = source_file
442 source_lexer = source_file.lexer
448 #source_lexer = source_file.lexer
449 source_lexer = self._get_lexer_for_filename(source_filename)
450 source_file.lexer = source_lexer
451
443 if isinstance(target_file, FileNode):
452 if isinstance(target_file, FileNode):
444 target_filenode = target_file
453 target_filenode = target_file
445 target_lexer = target_file.lexer
454 #target_lexer = target_file.lexer
455 target_lexer = self._get_lexer_for_filename(target_filename)
456 target_file.lexer = target_lexer
446
457
447 source_file_path, target_file_path = None, None
458 source_file_path, target_file_path = None, None
448
459
449 if source_filename != '/dev/null':
460 if source_filename != '/dev/null':
450 source_file_path = source_filename
461 source_file_path = source_filename
451 if target_filename != '/dev/null':
462 if target_filename != '/dev/null':
452 target_file_path = target_filename
463 target_file_path = target_filename
453
464
454 source_file_type = source_lexer.name
465 source_file_type = source_lexer.name
455 target_file_type = target_lexer.name
466 target_file_type = target_lexer.name
456
467
457 op_hunks = patch['chunks'][0]
468 op_hunks = patch['chunks'][0]
458 hunks = patch['chunks'][1:]
469 hunks = patch['chunks'][1:]
459
470
460 filediff = AttributeDict({
471 filediff = AttributeDict({
461 'source_file_path': source_file_path,
472 'source_file_path': source_file_path,
462 'target_file_path': target_file_path,
473 'target_file_path': target_file_path,
463 'source_filenode': source_filenode,
474 'source_filenode': source_filenode,
464 'target_filenode': target_filenode,
475 'target_filenode': target_filenode,
465 'hunks': [],
476 'hunks': [],
466 'source_file_type': target_file_type,
477 'source_file_type': target_file_type,
467 'target_file_type': source_file_type,
478 'target_file_type': source_file_type,
468 'patch': patch,
479 'patch': patch,
469 'source_mode': patch['stats']['old_mode'],
480 'source_mode': patch['stats']['old_mode'],
470 'target_mode': patch['stats']['new_mode'],
481 'target_mode': patch['stats']['new_mode'],
471 'limited_diff': isinstance(patch, LimitedDiffContainer),
482 'limited_diff': isinstance(patch, LimitedDiffContainer),
472 'diffset': self,
483 'diffset': self,
473 })
484 })
474
485
475 for hunk in hunks:
486 for hunk in hunks:
476 hunkbit = self.parse_hunk(hunk, source_file, target_file)
487 hunkbit = self.parse_hunk(hunk, source_file, target_file)
477 hunkbit.filediff = filediff
488 hunkbit.filediff = filediff
478 filediff.hunks.append(hunkbit)
489 filediff.hunks.append(hunkbit)
479
490
480 left_comments = {}
491 left_comments = {}
481
492
482 if source_file_path in self.comments_store:
493 if source_file_path in self.comments_store:
483 for lineno, comments in self.comments_store[source_file_path].items():
494 for lineno, comments in self.comments_store[source_file_path].items():
484 left_comments[lineno] = comments
495 left_comments[lineno] = comments
485
496
486 if target_file_path in self.comments_store:
497 if target_file_path in self.comments_store:
487 for lineno, comments in self.comments_store[target_file_path].items():
498 for lineno, comments in self.comments_store[target_file_path].items():
488 left_comments[lineno] = comments
499 left_comments[lineno] = comments
489
500
490 filediff.left_comments = left_comments
501 filediff.left_comments = left_comments
491 return filediff
502 return filediff
492
503
493 def parse_hunk(self, hunk, source_file, target_file):
504 def parse_hunk(self, hunk, source_file, target_file):
494 result = AttributeDict(dict(
505 result = AttributeDict(dict(
495 source_start=hunk['source_start'],
506 source_start=hunk['source_start'],
496 source_length=hunk['source_length'],
507 source_length=hunk['source_length'],
497 target_start=hunk['target_start'],
508 target_start=hunk['target_start'],
498 target_length=hunk['target_length'],
509 target_length=hunk['target_length'],
499 section_header=hunk['section_header'],
510 section_header=hunk['section_header'],
500 lines=[],
511 lines=[],
501 ))
512 ))
502 before, after = [], []
513 before, after = [], []
503
514
504 for line in hunk['lines']:
515 for line in hunk['lines']:
505 if line['action'] == 'unmod':
516 if line['action'] == 'unmod':
506 result.lines.extend(
517 result.lines.extend(
507 self.parse_lines(before, after, source_file, target_file))
518 self.parse_lines(before, after, source_file, target_file))
508 after.append(line)
519 after.append(line)
509 before.append(line)
520 before.append(line)
510 elif line['action'] == 'add':
521 elif line['action'] == 'add':
511 after.append(line)
522 after.append(line)
512 elif line['action'] == 'del':
523 elif line['action'] == 'del':
513 before.append(line)
524 before.append(line)
514 elif line['action'] == 'old-no-nl':
525 elif line['action'] == 'old-no-nl':
515 before.append(line)
526 before.append(line)
516 elif line['action'] == 'new-no-nl':
527 elif line['action'] == 'new-no-nl':
517 after.append(line)
528 after.append(line)
518
529
519 result.lines.extend(
530 result.lines.extend(
520 self.parse_lines(before, after, source_file, target_file))
531 self.parse_lines(before, after, source_file, target_file))
521 result.unified = self.as_unified(result.lines)
532 result.unified = self.as_unified(result.lines)
522 result.sideside = result.lines
533 result.sideside = result.lines
523
534
524 return result
535 return result
525
536
526 def parse_lines(self, before_lines, after_lines, source_file, target_file):
537 def parse_lines(self, before_lines, after_lines, source_file, target_file):
527 # TODO: dan: investigate doing the diff comparison and fast highlighting
538 # TODO: dan: investigate doing the diff comparison and fast highlighting
528 # on the entire before and after buffered block lines rather than by
539 # on the entire before and after buffered block lines rather than by
529 # line, this means we can get better 'fast' highlighting if the context
540 # line, this means we can get better 'fast' highlighting if the context
530 # allows it - eg.
541 # allows it - eg.
531 # line 4: """
542 # line 4: """
532 # line 5: this gets highlighted as a string
543 # line 5: this gets highlighted as a string
533 # line 6: """
544 # line 6: """
534
545
535 lines = []
546 lines = []
536 while before_lines or after_lines:
547 while before_lines or after_lines:
537 before, after = None, None
548 before, after = None, None
538 before_tokens, after_tokens = None, None
549 before_tokens, after_tokens = None, None
539
550
540 if before_lines:
551 if before_lines:
541 before = before_lines.pop(0)
552 before = before_lines.pop(0)
542 if after_lines:
553 if after_lines:
543 after = after_lines.pop(0)
554 after = after_lines.pop(0)
544
555
545 original = AttributeDict()
556 original = AttributeDict()
546 modified = AttributeDict()
557 modified = AttributeDict()
547
558
548 if before:
559 if before:
549 if before['action'] == 'old-no-nl':
560 if before['action'] == 'old-no-nl':
550 before_tokens = [('nonl', before['line'])]
561 before_tokens = [('nonl', before['line'])]
551 else:
562 else:
552 before_tokens = self.get_line_tokens(
563 before_tokens = self.get_line_tokens(
553 line_text=before['line'], line_number=before['old_lineno'],
564 line_text=before['line'], line_number=before['old_lineno'],
554 file=source_file)
565 file=source_file)
555 original.lineno = before['old_lineno']
566 original.lineno = before['old_lineno']
556 original.content = before['line']
567 original.content = before['line']
557 original.action = self.action_to_op(before['action'])
568 original.action = self.action_to_op(before['action'])
558 original.comments = self.get_comments_for('old',
569 original.comments = self.get_comments_for('old',
559 source_file, before['old_lineno'])
570 source_file, before['old_lineno'])
560
571
561 if after:
572 if after:
562 if after['action'] == 'new-no-nl':
573 if after['action'] == 'new-no-nl':
563 after_tokens = [('nonl', after['line'])]
574 after_tokens = [('nonl', after['line'])]
564 else:
575 else:
565 after_tokens = self.get_line_tokens(
576 after_tokens = self.get_line_tokens(
566 line_text=after['line'], line_number=after['new_lineno'],
577 line_text=after['line'], line_number=after['new_lineno'],
567 file=target_file)
578 file=target_file)
568 modified.lineno = after['new_lineno']
579 modified.lineno = after['new_lineno']
569 modified.content = after['line']
580 modified.content = after['line']
570 modified.action = self.action_to_op(after['action'])
581 modified.action = self.action_to_op(after['action'])
571 modified.comments = self.get_comments_for('new',
582 modified.comments = self.get_comments_for('new',
572 target_file, after['new_lineno'])
583 target_file, after['new_lineno'])
573
584
574 # diff the lines
585 # diff the lines
575 if before_tokens and after_tokens:
586 if before_tokens and after_tokens:
576 o_tokens, m_tokens, similarity = tokens_diff(
587 o_tokens, m_tokens, similarity = tokens_diff(
577 before_tokens, after_tokens)
588 before_tokens, after_tokens)
578 original.content = render_tokenstream(o_tokens)
589 original.content = render_tokenstream(o_tokens)
579 modified.content = render_tokenstream(m_tokens)
590 modified.content = render_tokenstream(m_tokens)
580 elif before_tokens:
591 elif before_tokens:
581 original.content = render_tokenstream(
592 original.content = render_tokenstream(
582 [(x[0], '', x[1]) for x in before_tokens])
593 [(x[0], '', x[1]) for x in before_tokens])
583 elif after_tokens:
594 elif after_tokens:
584 modified.content = render_tokenstream(
595 modified.content = render_tokenstream(
585 [(x[0], '', x[1]) for x in after_tokens])
596 [(x[0], '', x[1]) for x in after_tokens])
586
597
587 lines.append(AttributeDict({
598 lines.append(AttributeDict({
588 'original': original,
599 'original': original,
589 'modified': modified,
600 'modified': modified,
590 }))
601 }))
591
602
592 return lines
603 return lines
593
604
594 def get_comments_for(self, version, file, line_number):
605 def get_comments_for(self, version, file, line_number):
595 if hasattr(file, 'unicode_path'):
606 if hasattr(file, 'unicode_path'):
596 file = file.unicode_path
607 file = file.unicode_path
597
608
598 if not isinstance(file, basestring):
609 if not isinstance(file, basestring):
599 return None
610 return None
600
611
601 line_key = {
612 line_key = {
602 'old': 'o',
613 'old': 'o',
603 'new': 'n',
614 'new': 'n',
604 }[version] + str(line_number)
615 }[version] + str(line_number)
605
616
606 if file in self.comments_store:
617 if file in self.comments_store:
607 file_comments = self.comments_store[file]
618 file_comments = self.comments_store[file]
608 if line_key in file_comments:
619 if line_key in file_comments:
609 return file_comments.pop(line_key)
620 return file_comments.pop(line_key)
610
621
611 def get_line_tokens(self, line_text, line_number, file=None):
622 def get_line_tokens(self, line_text, line_number, file=None):
612 filenode = None
623 filenode = None
613 filename = None
624 filename = None
614
625
615 if isinstance(file, basestring):
626 if isinstance(file, basestring):
616 filename = file
627 filename = file
617 elif isinstance(file, FileNode):
628 elif isinstance(file, FileNode):
618 filenode = file
629 filenode = file
619 filename = file.unicode_path
630 filename = file.unicode_path
620
631
621 if self.highlight_mode == self.HL_REAL and filenode:
632 if self.highlight_mode == self.HL_REAL and filenode:
622 if line_number and file.size < self.max_file_size_limit:
633 lexer = self._get_lexer_for_filename(filename)
623 return self.get_tokenized_filenode_line(file, line_number)
634 file_size_allowed = file.size < self.max_file_size_limit
635 if line_number and file_size_allowed:
636 return self.get_tokenized_filenode_line(
637 file, line_number, lexer)
624
638
625 if self.highlight_mode in (self.HL_REAL, self.HL_FAST) and filename:
639 if self.highlight_mode in (self.HL_REAL, self.HL_FAST) and filename:
626 lexer = self._get_lexer_for_filename(filename)
640 lexer = self._get_lexer_for_filename(filename)
627 return list(tokenize_string(line_text, lexer))
641 return list(tokenize_string(line_text, lexer))
628
642
629 return list(tokenize_string(line_text, plain_text_lexer))
643 return list(tokenize_string(line_text, plain_text_lexer))
630
644
631 def get_tokenized_filenode_line(self, filenode, line_number):
645 def get_tokenized_filenode_line(self, filenode, line_number, lexer=None):
632
646
633 if filenode not in self.highlighted_filenodes:
647 if filenode not in self.highlighted_filenodes:
634 tokenized_lines = filenode_as_lines_tokens(filenode, filenode.lexer)
648 tokenized_lines = filenode_as_lines_tokens(filenode, lexer)
635 self.highlighted_filenodes[filenode] = tokenized_lines
649 self.highlighted_filenodes[filenode] = tokenized_lines
636 return self.highlighted_filenodes[filenode][line_number - 1]
650 return self.highlighted_filenodes[filenode][line_number - 1]
637
651
638 def action_to_op(self, action):
652 def action_to_op(self, action):
639 return {
653 return {
640 'add': '+',
654 'add': '+',
641 'del': '-',
655 'del': '-',
642 'unmod': ' ',
656 'unmod': ' ',
643 'old-no-nl': ' ',
657 'old-no-nl': ' ',
644 'new-no-nl': ' ',
658 'new-no-nl': ' ',
645 }.get(action, action)
659 }.get(action, action)
646
660
647 def as_unified(self, lines):
661 def as_unified(self, lines):
648 """
662 """
649 Return a generator that yields the lines of a diff in unified order
663 Return a generator that yields the lines of a diff in unified order
650 """
664 """
651 def generator():
665 def generator():
652 buf = []
666 buf = []
653 for line in lines:
667 for line in lines:
654
668
655 if buf and not line.original or line.original.action == ' ':
669 if buf and not line.original or line.original.action == ' ':
656 for b in buf:
670 for b in buf:
657 yield b
671 yield b
658 buf = []
672 buf = []
659
673
660 if line.original:
674 if line.original:
661 if line.original.action == ' ':
675 if line.original.action == ' ':
662 yield (line.original.lineno, line.modified.lineno,
676 yield (line.original.lineno, line.modified.lineno,
663 line.original.action, line.original.content,
677 line.original.action, line.original.content,
664 line.original.comments)
678 line.original.comments)
665 continue
679 continue
666
680
667 if line.original.action == '-':
681 if line.original.action == '-':
668 yield (line.original.lineno, None,
682 yield (line.original.lineno, None,
669 line.original.action, line.original.content,
683 line.original.action, line.original.content,
670 line.original.comments)
684 line.original.comments)
671
685
672 if line.modified.action == '+':
686 if line.modified.action == '+':
673 buf.append((
687 buf.append((
674 None, line.modified.lineno,
688 None, line.modified.lineno,
675 line.modified.action, line.modified.content,
689 line.modified.action, line.modified.content,
676 line.modified.comments))
690 line.modified.comments))
677 continue
691 continue
678
692
679 if line.modified:
693 if line.modified:
680 yield (None, line.modified.lineno,
694 yield (None, line.modified.lineno,
681 line.modified.action, line.modified.content,
695 line.modified.action, line.modified.content,
682 line.modified.comments)
696 line.modified.comments)
683
697
684 for b in buf:
698 for b in buf:
685 yield b
699 yield b
686
700
687 return generator()
701 return generator()