diffs: fixed case of bogus files diff rendering...
r3444:e5ce0962 default
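This commit threads the raw file content into split_token_stream() so that, when Pygments tokenization yields nothing at all (the added comment cites files with known extensions like .css whose content is bogus unicode with no newline characters), the renderer falls back to emitting the raw content as a single plain-text line instead of rendering an empty file body. A minimal illustration of that fallback, mirroring the test added further down; the empty token list stands in for a lexer run that produced no output:

# sketch of the fixed edge case: the lexer returned no tokens,
# but we still have the original content (here, a lone BOM character)
from rhodecode.lib.codeblocks import split_token_stream

content = u'\ufeff'
tokens = []

lines = list(split_token_stream(tokens, content))
# the new fallback path yields the content back as one plain-text line
assert lines == [[('', content)]]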
@@ -1,776 +1,786 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2019 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21 import logging
22 22 import difflib
23 23 from itertools import groupby
24 24
25 25 from pygments import lex
26 26 from pygments.formatters.html import _get_ttype_class as pygment_token_class
27 27 from pygments.lexers.special import TextLexer, Token
28 28 from pygments.lexers import get_lexer_by_name
29 29 from pyramid import compat
30 30
31 31 from rhodecode.lib.helpers import (
32 32 get_lexer_for_filenode, html_escape, get_custom_lexer)
33 33 from rhodecode.lib.utils2 import AttributeDict, StrictAttributeDict, safe_unicode
34 34 from rhodecode.lib.vcs.nodes import FileNode
35 35 from rhodecode.lib.vcs.exceptions import VCSError, NodeDoesNotExistError
36 36 from rhodecode.lib.diff_match_patch import diff_match_patch
37 37 from rhodecode.lib.diffs import LimitedDiffContainer, DEL_FILENODE, BIN_FILENODE
38 38
39 39
40 40 plain_text_lexer = get_lexer_by_name(
41 41 'text', stripall=False, stripnl=False, ensurenl=False)
42 42
43 43
44 44 log = logging.getLogger(__name__)
45 45
46 46
47 47 def filenode_as_lines_tokens(filenode, lexer=None):
48 48 org_lexer = lexer
49 49 lexer = lexer or get_lexer_for_filenode(filenode)
50 50 log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s',
51 51 lexer, filenode, org_lexer)
52 tokens = tokenize_string(filenode.content, lexer)
53 lines = split_token_stream(tokens)
52 content = filenode.content
53 tokens = tokenize_string(content, lexer)
54 lines = split_token_stream(tokens, content)
54 55 rv = list(lines)
55 56 return rv
56 57
57 58
58 59 def tokenize_string(content, lexer):
59 60 """
60 61 Use pygments to tokenize some content based on a lexer
61 62 ensuring all original new lines and whitespace is preserved
62 63 """
63 64
64 65 lexer.stripall = False
65 66 lexer.stripnl = False
66 67 lexer.ensurenl = False
67 68
68 69 if isinstance(lexer, TextLexer):
69 70 lexed = [(Token.Text, content)]
70 71 else:
71 72 lexed = lex(content, lexer)
72 73
73 74 for token_type, token_text in lexed:
74 75 yield pygment_token_class(token_type), token_text
75 76
76 77
77 def split_token_stream(tokens):
78 def split_token_stream(tokens, content):
78 79 """
79 80 Take a list of (TokenType, text) tuples and split them by a string
80 81
81 82 split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')])
82 83 [(TEXT, 'some'), (TEXT, 'text'),
83 84 (TEXT, 'more'), (TEXT, 'text')]
84 85 """
85 86
86 buffer = []
87 token_buffer = []
87 88 for token_class, token_text in tokens:
88 89 parts = token_text.split('\n')
89 90 for part in parts[:-1]:
90 buffer.append((token_class, part))
91 yield buffer
92 buffer = []
91 token_buffer.append((token_class, part))
92 yield token_buffer
93 token_buffer = []
94
95 token_buffer.append((token_class, parts[-1]))
93 96
94 buffer.append((token_class, parts[-1]))
95
96 if buffer:
97 yield buffer
97 if token_buffer:
98 yield token_buffer
99 elif content:
100 # this is a special case, we have the content, but tokenization didn't produce
101 # any results. This can happen if known file extensions like .css have some bogus
102 # unicode content without any newline characters
103 yield [(pygment_token_class(Token.Text), content)]
98 104
99 105
100 106 def filenode_as_annotated_lines_tokens(filenode):
101 107 """
102 108 Take a file node and return a list of annotations => lines, if no annotation
103 109 is found, it will be None.
104 110
105 111 eg:
106 112
107 113 [
108 114 (annotation1, [
109 115 (1, line1_tokens_list),
110 116 (2, line2_tokens_list),
111 117 ]),
112 118 (annotation2, [
113 119 (3, line1_tokens_list),
114 120 ]),
115 121 (None, [
116 122 (4, line1_tokens_list),
117 123 ]),
118 124 (annotation1, [
119 125 (5, line1_tokens_list),
120 126 (6, line2_tokens_list),
121 127 ])
122 128 ]
123 129 """
124 130
125 131 commit_cache = {} # cache commit_getter lookups
126 132
127 133 def _get_annotation(commit_id, commit_getter):
128 134 if commit_id not in commit_cache:
129 135 commit_cache[commit_id] = commit_getter()
130 136 return commit_cache[commit_id]
131 137
132 138 annotation_lookup = {
133 139 line_no: _get_annotation(commit_id, commit_getter)
134 140 for line_no, commit_id, commit_getter, line_content
135 141 in filenode.annotate
136 142 }
137 143
138 144 annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens)
139 145 for line_no, tokens
140 146 in enumerate(filenode_as_lines_tokens(filenode), 1))
141 147
142 148 grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0])
143 149
144 150 for annotation, group in grouped_annotations_lines:
145 151 yield (
146 152 annotation, [(line_no, tokens)
147 153 for (_, line_no, tokens) in group]
148 154 )
149 155
150 156
151 157 def render_tokenstream(tokenstream):
152 158 result = []
153 159 for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
154 160
155 161 if token_class:
156 162 result.append(u'<span class="%s">' % token_class)
157 163 else:
158 164 result.append(u'<span>')
159 165
160 166 for op_tag, token_text in token_ops_texts:
161 167
162 168 if op_tag:
163 169 result.append(u'<%s>' % op_tag)
164 170
165 171 escaped_text = html_escape(token_text)
166 172
167 173 # TODO: dan: investigate showing hidden characters like space/nl/tab
168 174 # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
169 175 # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
170 176 # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')
171 177
172 178 result.append(escaped_text)
173 179
174 180 if op_tag:
175 181 result.append(u'</%s>' % op_tag)
176 182
177 183 result.append(u'</span>')
178 184
179 185 html = ''.join(result)
180 186 return html
181 187
182 188
183 189 def rollup_tokenstream(tokenstream):
184 190 """
185 191 Group a token stream of the format:
186 192
187 193 ('class', 'op', 'text')
188 194 or
189 195 ('class', 'text')
190 196
191 197 into
192 198
193 199 [('class1',
194 200 [('op1', 'text'),
195 201 ('op2', 'text')]),
196 202 ('class2',
197 203 [('op3', 'text')])]
198 204
199 205 This is used to get the minimal tags necessary when
200 206 rendering to html eg for a token stream ie.
201 207
202 208 <span class="A"><ins>he</ins>llo</span>
203 209 vs
204 210 <span class="A"><ins>he</ins></span><span class="A">llo</span>
205 211
206 212 If a 2 tuple is passed in, the output op will be an empty string.
207 213
208 214 eg:
209 215
210 216 >>> rollup_tokenstream([('classA', '', 'h'),
211 217 ('classA', 'del', 'ell'),
212 218 ('classA', '', 'o'),
213 219 ('classB', '', ' '),
214 220 ('classA', '', 'the'),
215 221 ('classA', '', 're'),
216 222 ])
217 223
218 224 [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')],
219 225 ('classB', [('', ' ')],
220 226 ('classA', [('', 'there')]]
221 227
222 228 """
223 229 if tokenstream and len(tokenstream[0]) == 2:
224 230 tokenstream = ((t[0], '', t[1]) for t in tokenstream)
225 231
226 232 result = []
227 233 for token_class, op_list in groupby(tokenstream, lambda t: t[0]):
228 234 ops = []
229 235 for token_op, token_text_list in groupby(op_list, lambda o: o[1]):
230 236 text_buffer = []
231 237 for t_class, t_op, t_text in token_text_list:
232 238 text_buffer.append(t_text)
233 239 ops.append((token_op, ''.join(text_buffer)))
234 240 result.append((token_class, ops))
235 241 return result
236 242
237 243
238 244 def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
239 245 """
240 246 Converts a list of (token_class, token_text) tuples to a list of
241 247 (token_class, token_op, token_text) tuples where token_op is one of
242 248 ('ins', 'del', '')
243 249
244 250 :param old_tokens: list of (token_class, token_text) tuples of old line
245 251 :param new_tokens: list of (token_class, token_text) tuples of new line
246 252 :param use_diff_match_patch: boolean, will use google's diff match patch
247 253 library which has options to 'smooth' out the character by character
248 254 differences making nicer ins/del blocks
249 255 """
250 256
251 257 old_tokens_result = []
252 258 new_tokens_result = []
253 259
254 260 similarity = difflib.SequenceMatcher(None,
255 261 ''.join(token_text for token_class, token_text in old_tokens),
256 262 ''.join(token_text for token_class, token_text in new_tokens)
257 263 ).ratio()
258 264
259 265 if similarity < 0.6: # return, the blocks are too different
260 266 for token_class, token_text in old_tokens:
261 267 old_tokens_result.append((token_class, '', token_text))
262 268 for token_class, token_text in new_tokens:
263 269 new_tokens_result.append((token_class, '', token_text))
264 270 return old_tokens_result, new_tokens_result, similarity
265 271
266 272 token_sequence_matcher = difflib.SequenceMatcher(None,
267 273 [x[1] for x in old_tokens],
268 274 [x[1] for x in new_tokens])
269 275
270 276 for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
271 277 # check the differences by token block types first to give a more
272 278 # nicer "block" level replacement vs character diffs
273 279
274 280 if tag == 'equal':
275 281 for token_class, token_text in old_tokens[o1:o2]:
276 282 old_tokens_result.append((token_class, '', token_text))
277 283 for token_class, token_text in new_tokens[n1:n2]:
278 284 new_tokens_result.append((token_class, '', token_text))
279 285 elif tag == 'delete':
280 286 for token_class, token_text in old_tokens[o1:o2]:
281 287 old_tokens_result.append((token_class, 'del', token_text))
282 288 elif tag == 'insert':
283 289 for token_class, token_text in new_tokens[n1:n2]:
284 290 new_tokens_result.append((token_class, 'ins', token_text))
285 291 elif tag == 'replace':
286 292 # if same type token blocks must be replaced, do a diff on the
287 293 # characters in the token blocks to show individual changes
288 294
289 295 old_char_tokens = []
290 296 new_char_tokens = []
291 297 for token_class, token_text in old_tokens[o1:o2]:
292 298 for char in token_text:
293 299 old_char_tokens.append((token_class, char))
294 300
295 301 for token_class, token_text in new_tokens[n1:n2]:
296 302 for char in token_text:
297 303 new_char_tokens.append((token_class, char))
298 304
299 305 old_string = ''.join([token_text for
300 306 token_class, token_text in old_char_tokens])
301 307 new_string = ''.join([token_text for
302 308 token_class, token_text in new_char_tokens])
303 309
304 310 char_sequence = difflib.SequenceMatcher(
305 311 None, old_string, new_string)
306 312 copcodes = char_sequence.get_opcodes()
307 313 obuffer, nbuffer = [], []
308 314
309 315 if use_diff_match_patch:
310 316 dmp = diff_match_patch()
311 317 dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting
312 318 reps = dmp.diff_main(old_string, new_string)
313 319 dmp.diff_cleanupEfficiency(reps)
314 320
315 321 a, b = 0, 0
316 322 for op, rep in reps:
317 323 l = len(rep)
318 324 if op == 0:
319 325 for i, c in enumerate(rep):
320 326 obuffer.append((old_char_tokens[a+i][0], '', c))
321 327 nbuffer.append((new_char_tokens[b+i][0], '', c))
322 328 a += l
323 329 b += l
324 330 elif op == -1:
325 331 for i, c in enumerate(rep):
326 332 obuffer.append((old_char_tokens[a+i][0], 'del', c))
327 333 a += l
328 334 elif op == 1:
329 335 for i, c in enumerate(rep):
330 336 nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
331 337 b += l
332 338 else:
333 339 for ctag, co1, co2, cn1, cn2 in copcodes:
334 340 if ctag == 'equal':
335 341 for token_class, token_text in old_char_tokens[co1:co2]:
336 342 obuffer.append((token_class, '', token_text))
337 343 for token_class, token_text in new_char_tokens[cn1:cn2]:
338 344 nbuffer.append((token_class, '', token_text))
339 345 elif ctag == 'delete':
340 346 for token_class, token_text in old_char_tokens[co1:co2]:
341 347 obuffer.append((token_class, 'del', token_text))
342 348 elif ctag == 'insert':
343 349 for token_class, token_text in new_char_tokens[cn1:cn2]:
344 350 nbuffer.append((token_class, 'ins', token_text))
345 351 elif ctag == 'replace':
346 352 for token_class, token_text in old_char_tokens[co1:co2]:
347 353 obuffer.append((token_class, 'del', token_text))
348 354 for token_class, token_text in new_char_tokens[cn1:cn2]:
349 355 nbuffer.append((token_class, 'ins', token_text))
350 356
351 357 old_tokens_result.extend(obuffer)
352 358 new_tokens_result.extend(nbuffer)
353 359
354 360 return old_tokens_result, new_tokens_result, similarity
355 361
356 362
357 363 def diffset_node_getter(commit):
358 364 def get_node(fname):
359 365 try:
360 366 return commit.get_node(fname)
361 367 except NodeDoesNotExistError:
362 368 return None
363 369
364 370 return get_node
365 371
366 372
367 373 class DiffSet(object):
368 374 """
369 375 An object for parsing the diff result from diffs.DiffProcessor and
370 376 adding highlighting, side by side/unified renderings and line diffs
371 377 """
372 378
373 379 HL_REAL = 'REAL' # highlights using original file, slow
374 380 HL_FAST = 'FAST' # highlights using just the line, fast but not correct
375 381 # in the case of multiline code
376 382 HL_NONE = 'NONE' # no highlighting, fastest
377 383
378 384 def __init__(self, highlight_mode=HL_REAL, repo_name=None,
379 385 source_repo_name=None,
380 386 source_node_getter=lambda filename: None,
381 387 target_repo_name=None,
382 388 target_node_getter=lambda filename: None,
383 389 source_nodes=None, target_nodes=None,
384 390 # files over this size will use fast highlighting
385 391 max_file_size_limit=150 * 1024,
386 392 ):
387 393
388 394 self.highlight_mode = highlight_mode
389 395 self.highlighted_filenodes = {}
390 396 self.source_node_getter = source_node_getter
391 397 self.target_node_getter = target_node_getter
392 398 self.source_nodes = source_nodes or {}
393 399 self.target_nodes = target_nodes or {}
394 400 self.repo_name = repo_name
395 401 self.target_repo_name = target_repo_name or repo_name
396 402 self.source_repo_name = source_repo_name or repo_name
397 403 self.max_file_size_limit = max_file_size_limit
398 404
399 405 def render_patchset(self, patchset, source_ref=None, target_ref=None):
400 406 diffset = AttributeDict(dict(
401 407 lines_added=0,
402 408 lines_deleted=0,
403 409 changed_files=0,
404 410 files=[],
405 411 file_stats={},
406 412 limited_diff=isinstance(patchset, LimitedDiffContainer),
407 413 repo_name=self.repo_name,
408 414 target_repo_name=self.target_repo_name,
409 415 source_repo_name=self.source_repo_name,
410 416 source_ref=source_ref,
411 417 target_ref=target_ref,
412 418 ))
413 419 for patch in patchset:
414 420 diffset.file_stats[patch['filename']] = patch['stats']
415 421 filediff = self.render_patch(patch)
416 422 filediff.diffset = StrictAttributeDict(dict(
417 423 source_ref=diffset.source_ref,
418 424 target_ref=diffset.target_ref,
419 425 repo_name=diffset.repo_name,
420 426 source_repo_name=diffset.source_repo_name,
421 427 target_repo_name=diffset.target_repo_name,
422 428 ))
423 429 diffset.files.append(filediff)
424 430 diffset.changed_files += 1
425 431 if not patch['stats']['binary']:
426 432 diffset.lines_added += patch['stats']['added']
427 433 diffset.lines_deleted += patch['stats']['deleted']
428 434
429 435 return diffset
430 436
431 437 _lexer_cache = {}
432 438
433 439 def _get_lexer_for_filename(self, filename, filenode=None):
434 440 # cached because we might need to call it twice for source/target
435 441 if filename not in self._lexer_cache:
436 442 if filenode:
437 443 lexer = filenode.lexer
438 444 extension = filenode.extension
439 445 else:
440 446 lexer = FileNode.get_lexer(filename=filename)
441 447 extension = filename.split('.')[-1]
442 448
443 449 lexer = get_custom_lexer(extension) or lexer
444 450 self._lexer_cache[filename] = lexer
445 451 return self._lexer_cache[filename]
446 452
447 453 def render_patch(self, patch):
448 454 log.debug('rendering diff for %r', patch['filename'])
449 455
450 456 source_filename = patch['original_filename']
451 457 target_filename = patch['filename']
452 458
453 459 source_lexer = plain_text_lexer
454 460 target_lexer = plain_text_lexer
455 461
456 462 if not patch['stats']['binary']:
457 463 node_hl_mode = self.HL_NONE if patch['chunks'] == [] else None
458 464 hl_mode = node_hl_mode or self.highlight_mode
459 465
460 466 if hl_mode == self.HL_REAL:
461 467 if (source_filename and patch['operation'] in ('D', 'M')
462 468 and source_filename not in self.source_nodes):
463 469 self.source_nodes[source_filename] = (
464 470 self.source_node_getter(source_filename))
465 471
466 472 if (target_filename and patch['operation'] in ('A', 'M')
467 473 and target_filename not in self.target_nodes):
468 474 self.target_nodes[target_filename] = (
469 475 self.target_node_getter(target_filename))
470 476
471 477 elif hl_mode == self.HL_FAST:
472 478 source_lexer = self._get_lexer_for_filename(source_filename)
473 479 target_lexer = self._get_lexer_for_filename(target_filename)
474 480
475 481 source_file = self.source_nodes.get(source_filename, source_filename)
476 482 target_file = self.target_nodes.get(target_filename, target_filename)
477 483 raw_id_uid = ''
478 484 if self.source_nodes.get(source_filename):
479 485 raw_id_uid = self.source_nodes[source_filename].commit.raw_id
480 486
481 487 if not raw_id_uid and self.target_nodes.get(target_filename):
482 488 # in case this is a new file we only have it in target
483 489 raw_id_uid = self.target_nodes[target_filename].commit.raw_id
484 490
485 491 source_filenode, target_filenode = None, None
486 492
487 493 # TODO: dan: FileNode.lexer works on the content of the file - which
488 494 # can be slow - issue #4289 explains a lexer clean up - which once
489 495 # done can allow caching a lexer for a filenode to avoid the file lookup
490 496 if isinstance(source_file, FileNode):
491 497 source_filenode = source_file
492 498 #source_lexer = source_file.lexer
493 499 source_lexer = self._get_lexer_for_filename(source_filename)
494 500 source_file.lexer = source_lexer
495 501
496 502 if isinstance(target_file, FileNode):
497 503 target_filenode = target_file
498 504 #target_lexer = target_file.lexer
499 505 target_lexer = self._get_lexer_for_filename(target_filename)
500 506 target_file.lexer = target_lexer
501 507
502 508 source_file_path, target_file_path = None, None
503 509
504 510 if source_filename != '/dev/null':
505 511 source_file_path = source_filename
506 512 if target_filename != '/dev/null':
507 513 target_file_path = target_filename
508 514
509 515 source_file_type = source_lexer.name
510 516 target_file_type = target_lexer.name
511 517
512 518 filediff = AttributeDict({
513 519 'source_file_path': source_file_path,
514 520 'target_file_path': target_file_path,
515 521 'source_filenode': source_filenode,
516 522 'target_filenode': target_filenode,
517 523 'source_file_type': target_file_type,
518 524 'target_file_type': source_file_type,
519 525 'patch': {'filename': patch['filename'], 'stats': patch['stats']},
520 526 'operation': patch['operation'],
521 527 'source_mode': patch['stats']['old_mode'],
522 528 'target_mode': patch['stats']['new_mode'],
523 529 'limited_diff': patch['is_limited_diff'],
524 530 'hunks': [],
525 531 'hunk_ops': None,
526 532 'diffset': self,
527 533 'raw_id': raw_id_uid,
528 534 })
529 535
530 536 file_chunks = patch['chunks'][1:]
531 537 for hunk in file_chunks:
532 538 hunkbit = self.parse_hunk(hunk, source_file, target_file)
533 539 hunkbit.source_file_path = source_file_path
534 540 hunkbit.target_file_path = target_file_path
535 541 filediff.hunks.append(hunkbit)
536 542
537 543 # Simulate hunk on OPS type line which doesn't really contain any diff
538 544 # this allows commenting on those
539 545 if not file_chunks:
540 546 actions = []
541 547 for op_id, op_text in filediff.patch['stats']['ops'].items():
542 548 if op_id == DEL_FILENODE:
543 549 actions.append(u'file was removed')
544 550 elif op_id == BIN_FILENODE:
545 551 actions.append(u'binary diff hidden')
546 552 else:
547 553 actions.append(safe_unicode(op_text))
548 554 action_line = u'NO CONTENT: ' + \
549 555 u', '.join(actions) or u'UNDEFINED_ACTION'
550 556
551 557 hunk_ops = {'source_length': 0, 'source_start': 0,
552 558 'lines': [
553 559 {'new_lineno': 0, 'old_lineno': 1,
554 560 'action': 'unmod-no-hl', 'line': action_line}
555 561 ],
556 562 'section_header': u'', 'target_start': 1, 'target_length': 1}
557 563
558 564 hunkbit = self.parse_hunk(hunk_ops, source_file, target_file)
559 565 hunkbit.source_file_path = source_file_path
560 566 hunkbit.target_file_path = target_file_path
561 567 filediff.hunk_ops = hunkbit
562 568 return filediff
563 569
564 570 def parse_hunk(self, hunk, source_file, target_file):
565 571 result = AttributeDict(dict(
566 572 source_start=hunk['source_start'],
567 573 source_length=hunk['source_length'],
568 574 target_start=hunk['target_start'],
569 575 target_length=hunk['target_length'],
570 576 section_header=hunk['section_header'],
571 577 lines=[],
572 578 ))
573 579 before, after = [], []
574 580
575 581 for line in hunk['lines']:
576 582 if line['action'] in ['unmod', 'unmod-no-hl']:
577 583 no_hl = line['action'] == 'unmod-no-hl'
578 584 result.lines.extend(
579 585 self.parse_lines(before, after, source_file, target_file, no_hl=no_hl))
580 586 after.append(line)
581 587 before.append(line)
582 588 elif line['action'] == 'add':
583 589 after.append(line)
584 590 elif line['action'] == 'del':
585 591 before.append(line)
586 592 elif line['action'] == 'old-no-nl':
587 593 before.append(line)
588 594 elif line['action'] == 'new-no-nl':
589 595 after.append(line)
590 596
591 597 all_actions = [x['action'] for x in after] + [x['action'] for x in before]
592 598 no_hl = {x for x in all_actions} == {'unmod-no-hl'}
593 599 result.lines.extend(
594 600 self.parse_lines(before, after, source_file, target_file, no_hl=no_hl))
595 601 # NOTE(marcink): we must keep list() call here so we can cache the result...
596 602 result.unified = list(self.as_unified(result.lines))
597 603 result.sideside = result.lines
598 604
599 605 return result
600 606
601 607 def parse_lines(self, before_lines, after_lines, source_file, target_file,
602 608 no_hl=False):
603 609 # TODO: dan: investigate doing the diff comparison and fast highlighting
604 610 # on the entire before and after buffered block lines rather than by
605 611 # line, this means we can get better 'fast' highlighting if the context
606 612 # allows it - eg.
607 613 # line 4: """
608 614 # line 5: this gets highlighted as a string
609 615 # line 6: """
610 616
611 617 lines = []
612 618
613 619 before_newline = AttributeDict()
614 620 after_newline = AttributeDict()
615 621 if before_lines and before_lines[-1]['action'] == 'old-no-nl':
616 622 before_newline_line = before_lines.pop(-1)
617 623 before_newline.content = '\n {}'.format(
618 624 render_tokenstream(
619 625 [(x[0], '', x[1])
620 626 for x in [('nonl', before_newline_line['line'])]]))
621 627
622 628 if after_lines and after_lines[-1]['action'] == 'new-no-nl':
623 629 after_newline_line = after_lines.pop(-1)
624 630 after_newline.content = '\n {}'.format(
625 631 render_tokenstream(
626 632 [(x[0], '', x[1])
627 633 for x in [('nonl', after_newline_line['line'])]]))
628 634
629 635 while before_lines or after_lines:
630 636 before, after = None, None
631 637 before_tokens, after_tokens = None, None
632 638
633 639 if before_lines:
634 640 before = before_lines.pop(0)
635 641 if after_lines:
636 642 after = after_lines.pop(0)
637 643
638 644 original = AttributeDict()
639 645 modified = AttributeDict()
640 646
641 647 if before:
642 648 if before['action'] == 'old-no-nl':
643 649 before_tokens = [('nonl', before['line'])]
644 650 else:
645 651 before_tokens = self.get_line_tokens(
646 652 line_text=before['line'], line_number=before['old_lineno'],
647 653 input_file=source_file, no_hl=no_hl)
648 654 original.lineno = before['old_lineno']
649 655 original.content = before['line']
650 656 original.action = self.action_to_op(before['action'])
651 657
652 658 original.get_comment_args = (
653 659 source_file, 'o', before['old_lineno'])
654 660
655 661 if after:
656 662 if after['action'] == 'new-no-nl':
657 663 after_tokens = [('nonl', after['line'])]
658 664 else:
659 665 after_tokens = self.get_line_tokens(
660 666 line_text=after['line'], line_number=after['new_lineno'],
661 667 input_file=target_file, no_hl=no_hl)
662 668 modified.lineno = after['new_lineno']
663 669 modified.content = after['line']
664 670 modified.action = self.action_to_op(after['action'])
665 671
666 672 modified.get_comment_args = (target_file, 'n', after['new_lineno'])
667 673
668 674 # diff the lines
669 675 if before_tokens and after_tokens:
670 676 o_tokens, m_tokens, similarity = tokens_diff(
671 677 before_tokens, after_tokens)
672 678 original.content = render_tokenstream(o_tokens)
673 679 modified.content = render_tokenstream(m_tokens)
674 680 elif before_tokens:
675 681 original.content = render_tokenstream(
676 682 [(x[0], '', x[1]) for x in before_tokens])
677 683 elif after_tokens:
678 684 modified.content = render_tokenstream(
679 685 [(x[0], '', x[1]) for x in after_tokens])
680 686
681 687 if not before_lines and before_newline:
682 688 original.content += before_newline.content
683 689 before_newline = None
684 690 if not after_lines and after_newline:
685 691 modified.content += after_newline.content
686 692 after_newline = None
687 693
688 694 lines.append(AttributeDict({
689 695 'original': original,
690 696 'modified': modified,
691 697 }))
692 698
693 699 return lines
694 700
695 701 def get_line_tokens(self, line_text, line_number, input_file=None, no_hl=False):
696 702 filenode = None
697 703 filename = None
698 704
699 705 if isinstance(input_file, compat.string_types):
700 706 filename = input_file
701 707 elif isinstance(input_file, FileNode):
702 708 filenode = input_file
703 709 filename = input_file.unicode_path
704 710
705 711 hl_mode = self.HL_NONE if no_hl else self.highlight_mode
706 712 if hl_mode == self.HL_REAL and filenode:
707 713 lexer = self._get_lexer_for_filename(filename)
708 714 file_size_allowed = input_file.size < self.max_file_size_limit
709 715 if line_number and file_size_allowed:
710 716 return self.get_tokenized_filenode_line(
711 717 input_file, line_number, lexer)
712 718
713 719 if hl_mode in (self.HL_REAL, self.HL_FAST) and filename:
714 720 lexer = self._get_lexer_for_filename(filename)
715 721 return list(tokenize_string(line_text, lexer))
716 722
717 723 return list(tokenize_string(line_text, plain_text_lexer))
718 724
719 725 def get_tokenized_filenode_line(self, filenode, line_number, lexer=None):
720 726
721 727 if filenode not in self.highlighted_filenodes:
722 728 tokenized_lines = filenode_as_lines_tokens(filenode, lexer)
723 729 self.highlighted_filenodes[filenode] = tokenized_lines
730
731 try:
724 732 return self.highlighted_filenodes[filenode][line_number - 1]
733 except Exception:
734 return [('', u'rhodecode diff rendering error')]
725 735
726 736 def action_to_op(self, action):
727 737 return {
728 738 'add': '+',
729 739 'del': '-',
730 740 'unmod': ' ',
731 741 'unmod-no-hl': ' ',
732 742 'old-no-nl': ' ',
733 743 'new-no-nl': ' ',
734 744 }.get(action, action)
735 745
736 746 def as_unified(self, lines):
737 747 """
738 748 Return a generator that yields the lines of a diff in unified order
739 749 """
740 750 def generator():
741 751 buf = []
742 752 for line in lines:
743 753
744 754 if buf and not line.original or line.original.action == ' ':
745 755 for b in buf:
746 756 yield b
747 757 buf = []
748 758
749 759 if line.original:
750 760 if line.original.action == ' ':
751 761 yield (line.original.lineno, line.modified.lineno,
752 762 line.original.action, line.original.content,
753 763 line.original.get_comment_args)
754 764 continue
755 765
756 766 if line.original.action == '-':
757 767 yield (line.original.lineno, None,
758 768 line.original.action, line.original.content,
759 769 line.original.get_comment_args)
760 770
761 771 if line.modified.action == '+':
762 772 buf.append((
763 773 None, line.modified.lineno,
764 774 line.modified.action, line.modified.content,
765 775 line.modified.get_comment_args))
766 776 continue
767 777
768 778 if line.modified:
769 779 yield (None, line.modified.lineno,
770 780 line.modified.action, line.modified.content,
771 781 line.modified.get_comment_args)
772 782
773 783 for b in buf:
774 784 yield b
775 785
776 786 return generator()
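A second, related hardening appears above in get_tokenized_filenode_line(): the lookup of a single line in the cached tokenized file is now wrapped in a try/except, so an out-of-range or otherwise bogus line number renders a placeholder token instead of failing the whole diff page. The guarded lookup, reduced to a standalone sketch with simplified names for illustration:

def get_line_or_placeholder(tokenized_lines, line_number):
    # tokenized_lines is the cached per-line token list built by
    # filenode_as_lines_tokens(); line_number is 1-based
    try:
        return tokenized_lines[line_number - 1]
    except Exception:
        # same fallback token the commit returns when the lookup fails
        return [('', u'rhodecode diff rendering error')]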
@@ -1,312 +1,335 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2016-2019 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21 import pytest
22 22 from pygments.lexers import get_lexer_by_name
23 23
24 24 from rhodecode.tests import no_newline_id_generator
25 25 from rhodecode.lib.codeblocks import (
26 26 tokenize_string, split_token_stream, rollup_tokenstream,
27 27 render_tokenstream)
28 28
29 29
30 30 class TestTokenizeString(object):
31 31
32 32 python_code = '''
33 33 import this
34 34
35 35 var = 6
36 36 print("this")
37 37
38 38 '''
39 39
40 40 def test_tokenize_as_python(self):
41 41 lexer = get_lexer_by_name('python')
42 42 tokens = list(tokenize_string(self.python_code, lexer))
43 43
44 44 assert tokens == [
45 45 ('', u'\n'),
46 46 ('', u' '),
47 47 ('kn', u'import'),
48 48 ('', u' '),
49 49 ('nn', u'this'),
50 50 ('', u'\n'),
51 51 ('', u'\n'),
52 52 ('', u' '),
53 53 ('n', u'var'),
54 54 ('', u' '),
55 55 ('o', u'='),
56 56 ('', u' '),
57 57 ('mi', u'6'),
58 58 ('', u'\n'),
59 59 ('', u' '),
60 60 ('k', u'print'),
61 61 ('p', u'('),
62 62 ('s2', u'"'),
63 63 ('s2', u'this'),
64 64 ('s2', u'"'),
65 65 ('p', u')'),
66 66 ('', u'\n'),
67 67 ('', u'\n'),
68 68 ('', u' ')
69 69 ]
70 70
71 71 def test_tokenize_as_text(self):
72 72 lexer = get_lexer_by_name('text')
73 73 tokens = list(tokenize_string(self.python_code, lexer))
74 74
75 75 assert tokens == [
76 76 ('',
77 77 u'\n import this\n\n var = 6\n print("this")\n\n ')
78 78 ]
79 79
80 80
81 81 class TestSplitTokenStream(object):
82 82
83 83 def test_split_token_stream(self):
84 lines = list(split_token_stream(
85 [('type1', 'some\ntext'), ('type2', 'more\n')]))
84 tokens = [('type1', 'some\ntext'), ('type2', 'more\n')]
85 content = [x + y for x, y in tokens]
86 lines = list(split_token_stream(tokens, content))
86 87
87 88 assert lines == [
88 89 [('type1', u'some')],
89 90 [('type1', u'text'), ('type2', u'more')],
90 91 [('type2', u'')],
91 92 ]
92 93
93 94 def test_split_token_stream_single(self):
94 lines = list(split_token_stream(
95 [('type1', '\n')]))
96
95 tokens = [('type1', '\n')]
96 content = [x + y for x, y in tokens]
97 lines = list(split_token_stream(tokens, content))
97 98 assert lines == [
98 99 [('type1', '')],
99 100 [('type1', '')],
100 101 ]
101 102
102 103 def test_split_token_stream_single_repeat(self):
103 lines = list(split_token_stream(
104 [('type1', '\n\n\n')]))
105
104 tokens = [('type1', '\n\n\n')]
105 content = [x + y for x, y in tokens]
106 lines = list(split_token_stream(tokens, content))
106 107 assert lines == [
107 108 [('type1', '')],
108 109 [('type1', '')],
109 110 [('type1', '')],
110 111 [('type1', '')],
111 112 ]
112 113
113 114 def test_split_token_stream_multiple_repeat(self):
114 lines = list(split_token_stream(
115 [('type1', '\n\n'), ('type2', '\n\n')]))
115 tokens = [('type1', '\n\n'), ('type2', '\n\n')]
116 content = [x + y for x, y in tokens]
116 117
118 lines = list(split_token_stream(tokens, content))
117 119 assert lines == [
118 120 [('type1', '')],
119 121 [('type1', '')],
120 122 [('type1', ''), ('type2', '')],
121 123 [('type2', '')],
122 124 [('type2', '')],
123 125 ]
124 126
127 def test_no_tokens_by_content(self):
128 tokens = []
129 content = u'\ufeff'
130 lines = list(split_token_stream(tokens, content))
131 assert lines == [
132 [('', content)],
133 ]
134
135 def test_no_tokens_by_valid_content(self):
136 from pygments.lexers.css import CssLexer
137 content = u'\ufeff table.dataTable'
138 tokens = tokenize_string(content, CssLexer())
139
140 lines = list(split_token_stream(tokens, content))
141 assert lines == [
142 [('', u' '),
143 ('nt', u'table'),
144 ('p', u'.'),
145 ('nc', u'dataTable')],
146 ]
147
125 148
126 149 class TestRollupTokens(object):
127 150
128 151 @pytest.mark.parametrize('tokenstream,output', [
129 152 ([],
130 153 []),
131 154 ([('A', 'hell'), ('A', 'o')], [
132 155 ('A', [
133 156 ('', 'hello')]),
134 157 ]),
135 158 ([('A', 'hell'), ('B', 'o')], [
136 159 ('A', [
137 160 ('', 'hell')]),
138 161 ('B', [
139 162 ('', 'o')]),
140 163 ]),
141 164 ([('A', 'hel'), ('A', 'lo'), ('B', ' '), ('A', 'there')], [
142 165 ('A', [
143 166 ('', 'hello')]),
144 167 ('B', [
145 168 ('', ' ')]),
146 169 ('A', [
147 170 ('', 'there')]),
148 171 ]),
149 172 ])
150 173 def test_rollup_tokenstream_without_ops(self, tokenstream, output):
151 174 assert list(rollup_tokenstream(tokenstream)) == output
152 175
153 176 @pytest.mark.parametrize('tokenstream,output', [
154 177 ([],
155 178 []),
156 179 ([('A', '', 'hell'), ('A', '', 'o')], [
157 180 ('A', [
158 181 ('', 'hello')]),
159 182 ]),
160 183 ([('A', '', 'hell'), ('B', '', 'o')], [
161 184 ('A', [
162 185 ('', 'hell')]),
163 186 ('B', [
164 187 ('', 'o')]),
165 188 ]),
166 189 ([('A', '', 'h'), ('B', '', 'e'), ('C', '', 'y')], [
167 190 ('A', [
168 191 ('', 'h')]),
169 192 ('B', [
170 193 ('', 'e')]),
171 194 ('C', [
172 195 ('', 'y')]),
173 196 ]),
174 197 ([('A', '', 'h'), ('A', '', 'e'), ('C', '', 'y')], [
175 198 ('A', [
176 199 ('', 'he')]),
177 200 ('C', [
178 201 ('', 'y')]),
179 202 ]),
180 203 ([('A', 'ins', 'h'), ('A', 'ins', 'e')], [
181 204 ('A', [
182 205 ('ins', 'he')
183 206 ]),
184 207 ]),
185 208 ([('A', 'ins', 'h'), ('A', 'del', 'e')], [
186 209 ('A', [
187 210 ('ins', 'h'),
188 211 ('del', 'e')
189 212 ]),
190 213 ]),
191 214 ([('A', 'ins', 'h'), ('B', 'del', 'e'), ('B', 'del', 'y')], [
192 215 ('A', [
193 216 ('ins', 'h'),
194 217 ]),
195 218 ('B', [
196 219 ('del', 'ey'),
197 220 ]),
198 221 ]),
199 222 ([('A', 'ins', 'h'), ('A', 'del', 'e'), ('B', 'del', 'y')], [
200 223 ('A', [
201 224 ('ins', 'h'),
202 225 ('del', 'e'),
203 226 ]),
204 227 ('B', [
205 228 ('del', 'y'),
206 229 ]),
207 230 ]),
208 231 ([('A', '', 'some'), ('A', 'ins', 'new'), ('A', '', 'name')], [
209 232 ('A', [
210 233 ('', 'some'),
211 234 ('ins', 'new'),
212 235 ('', 'name'),
213 236 ]),
214 237 ]),
215 238 ])
216 239 def test_rollup_tokenstream_with_ops(self, tokenstream, output):
217 240 assert list(rollup_tokenstream(tokenstream)) == output
218 241
219 242
220 243 class TestRenderTokenStream(object):
221 244
222 245 @pytest.mark.parametrize('tokenstream,output', [
223 246 (
224 247 [],
225 248 '',
226 249 ),
227 250 (
228 251 [('', '', u'')],
229 252 '<span></span>',
230 253 ),
231 254 (
232 255 [('', '', u'text')],
233 256 '<span>text</span>',
234 257 ),
235 258 (
236 259 [('A', '', u'')],
237 260 '<span class="A"></span>',
238 261 ),
239 262 (
240 263 [('A', '', u'hello')],
241 264 '<span class="A">hello</span>',
242 265 ),
243 266 (
244 267 [('A', '', u'hel'), ('A', '', u'lo')],
245 268 '<span class="A">hello</span>',
246 269 ),
247 270 (
248 271 [('A', '', u'two\n'), ('A', '', u'lines')],
249 272 '<span class="A">two\nlines</span>',
250 273 ),
251 274 (
252 275 [('A', '', u'\nthree\n'), ('A', '', u'lines')],
253 276 '<span class="A">\nthree\nlines</span>',
254 277 ),
255 278 (
256 279 [('', '', u'\n'), ('A', '', u'line')],
257 280 '<span>\n</span><span class="A">line</span>',
258 281 ),
259 282 (
260 283 [('', 'ins', u'\n'), ('A', '', u'line')],
261 284 '<span><ins>\n</ins></span><span class="A">line</span>',
262 285 ),
263 286 (
264 287 [('A', '', u'hel'), ('A', 'ins', u'lo')],
265 288 '<span class="A">hel<ins>lo</ins></span>',
266 289 ),
267 290 (
268 291 [('A', '', u'hel'), ('A', 'ins', u'l'), ('A', 'ins', u'o')],
269 292 '<span class="A">hel<ins>lo</ins></span>',
270 293 ),
271 294 (
272 295 [('A', '', u'hel'), ('A', 'ins', u'l'), ('A', 'del', u'o')],
273 296 '<span class="A">hel<ins>l</ins><del>o</del></span>',
274 297 ),
275 298 (
276 299 [('A', '', u'hel'), ('B', '', u'lo')],
277 300 '<span class="A">hel</span><span class="B">lo</span>',
278 301 ),
279 302 (
280 303 [('A', '', u'hel'), ('B', 'ins', u'lo')],
281 304 '<span class="A">hel</span><span class="B"><ins>lo</ins></span>',
282 305 ),
283 306 ], ids=no_newline_id_generator)
284 307 def test_render_tokenstream_with_ops(self, tokenstream, output):
285 308 html = render_tokenstream(tokenstream)
286 309 assert html == output
287 310
288 311 @pytest.mark.parametrize('tokenstream,output', [
289 312 (
290 313 [('A', u'hel'), ('A', u'lo')],
291 314 '<span class="A">hello</span>',
292 315 ),
293 316 (
294 317 [('A', u'hel'), ('A', u'l'), ('A', u'o')],
295 318 '<span class="A">hello</span>',
296 319 ),
297 320 (
298 321 [('A', u'hel'), ('A', u'l'), ('A', u'o')],
299 322 '<span class="A">hello</span>',
300 323 ),
301 324 (
302 325 [('A', u'hel'), ('B', u'lo')],
303 326 '<span class="A">hel</span><span class="B">lo</span>',
304 327 ),
305 328 (
306 329 [('A', u'hel'), ('B', u'lo')],
307 330 '<span class="A">hel</span><span class="B">lo</span>',
308 331 ),
309 332 ])
310 333 def test_render_tokenstream_without_ops(self, tokenstream, output):
311 334 html = render_tokenstream(tokenstream)
312 335 assert html == output