diffs: switched bz2 to gzip since it can be 10x faster in some cases, with only a slight size penalty
marcink - r3854:7b87073e default
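The change below is motivated by compression speed: gzip (zlib's DEFLATE) typically compresses pickled diff structures much faster than bz2, at the cost of a somewhat larger file. A rough timing sketch of that comparison is given here; it is not part of the commit, and the helper name _time_compression and the sample payload are illustrative only.

import bz2
import gzip
import os
import tempfile
import time
import cPickle as pickle

def _time_compression(struct, workdir):
    """Pickle the same structure through bz2 and gzip; return {name: (seconds, bytes)}."""
    results = {}
    for name, opener in [('bz2', bz2.BZ2File), ('gzip', gzip.GzipFile)]:
        target = os.path.join(workdir, 'diff_cache.' + name)
        start = time.time()
        with opener(target, 'wb') as f:
            pickle.dump(struct, f)
        results[name] = (time.time() - start, os.path.getsize(target))
    return results

# a large list of diff-like lines standing in for a parsed diff structure
sample = {'diff': ['+line %d\n' % i for i in range(200000)], 'commits': []}
print(_time_compression(sample, tempfile.mkdtemp()))

On payloads of this shape gzip usually finishes several times faster than bz2 while the output stays only modestly larger, which is consistent with the "10x faster in some cases" claim above; the exact ratio depends on the data.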
@@ -1,1254 +1,1271 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2019 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21
22 22 """
23 23 Set of diffing helpers, previously part of vcs
24 24 """
25 25
26 26 import os
27 27 import re
28 28 import bz2
29 import gzip
29 30 import time
30 31
31 32 import collections
32 33 import difflib
33 34 import logging
34 35 import cPickle as pickle
35 36 from itertools import tee, imap
36 37
37 38 from rhodecode.lib.vcs.exceptions import VCSError
38 39 from rhodecode.lib.vcs.nodes import FileNode, SubModuleNode
39 40 from rhodecode.lib.utils2 import safe_unicode, safe_str
40 41
41 42 log = logging.getLogger(__name__)
42 43
43 44 # define max context, a file with more than this number of lines is unusable
44 45 # in browser anyway
45 46 MAX_CONTEXT = 20 * 1024
46 47 DEFAULT_CONTEXT = 3
47 48
48 49
49 50 def get_diff_context(request):
50 51 return MAX_CONTEXT if request.GET.get('fullcontext', '') == '1' else DEFAULT_CONTEXT
51 52
52 53
53 54 def get_diff_whitespace_flag(request):
54 55 return request.GET.get('ignorews', '') == '1'
55 56
56 57
57 58 class OPS(object):
58 59 ADD = 'A'
59 60 MOD = 'M'
60 61 DEL = 'D'
61 62
62 63
63 64 def get_gitdiff(filenode_old, filenode_new, ignore_whitespace=True, context=3):
64 65 """
65 66 Returns git style diff between given ``filenode_old`` and ``filenode_new``.
66 67
67 68 :param ignore_whitespace: ignore whitespaces in diff
68 69 """
69 70 # make sure we pass in default context
70 71 context = context or 3
71 72 # protect against IntOverflow when passing HUGE context
72 73 if context > MAX_CONTEXT:
73 74 context = MAX_CONTEXT
74 75
75 76 submodules = filter(lambda o: isinstance(o, SubModuleNode),
76 77 [filenode_new, filenode_old])
77 78 if submodules:
78 79 return ''
79 80
80 81 for filenode in (filenode_old, filenode_new):
81 82 if not isinstance(filenode, FileNode):
82 83 raise VCSError(
83 84 "Given object should be FileNode object, not %s"
84 85 % filenode.__class__)
85 86
86 87 repo = filenode_new.commit.repository
87 88 old_commit = filenode_old.commit or repo.EMPTY_COMMIT
88 89 new_commit = filenode_new.commit
89 90
90 91 vcs_gitdiff = repo.get_diff(
91 92 old_commit, new_commit, filenode_new.path,
92 93 ignore_whitespace, context, path1=filenode_old.path)
93 94 return vcs_gitdiff
94 95
95 96 NEW_FILENODE = 1
96 97 DEL_FILENODE = 2
97 98 MOD_FILENODE = 3
98 99 RENAMED_FILENODE = 4
99 100 COPIED_FILENODE = 5
100 101 CHMOD_FILENODE = 6
101 102 BIN_FILENODE = 7
102 103
103 104
104 105 class LimitedDiffContainer(object):
105 106
106 107 def __init__(self, diff_limit, cur_diff_size, diff):
107 108 self.diff = diff
108 109 self.diff_limit = diff_limit
109 110 self.cur_diff_size = cur_diff_size
110 111
111 112 def __getitem__(self, key):
112 113 return self.diff.__getitem__(key)
113 114
114 115 def __iter__(self):
115 116 for l in self.diff:
116 117 yield l
117 118
118 119
119 120 class Action(object):
120 121 """
121 122 Contains constants for the action value of the lines in a parsed diff.
122 123 """
123 124
124 125 ADD = 'add'
125 126 DELETE = 'del'
126 127 UNMODIFIED = 'unmod'
127 128
128 129 CONTEXT = 'context'
129 130 OLD_NO_NL = 'old-no-nl'
130 131 NEW_NO_NL = 'new-no-nl'
131 132
132 133
133 134 class DiffProcessor(object):
134 135 """
135 136 Give it a unified or git diff and it returns a list of the files that were
136 137 mentioned in the diff together with a dict of meta information that
137 138 can be used to render it in an HTML template.
138 139
139 140 .. note:: Unicode handling
140 141
141 142 The original diffs are a byte sequence and can contain filenames
142 143 in mixed encodings. This class generally returns `unicode` objects
143 144 since the result is intended for presentation to the user.
144 145
145 146 """
146 147 _chunk_re = re.compile(r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)')
147 148 _newline_marker = re.compile(r'^\\ No newline at end of file')
148 149
149 150 # used for inline highlighter word split
150 151 _token_re = re.compile(r'()(&gt;|&lt;|&amp;|\W+?)')
151 152
152 153 # collapse ranges of commits over given number
153 154 _collapse_commits_over = 5
154 155
155 156 def __init__(self, diff, format='gitdiff', diff_limit=None,
156 157 file_limit=None, show_full_diff=True):
157 158 """
158 159 :param diff: A `Diff` object representing a diff from a vcs backend
159 160 :param format: format of diff passed, `udiff` or `gitdiff`
160 161 :param diff_limit: defines the size of diff that is considered "big";
161 162 based on that parameter a cut-off will be triggered, set to None
162 163 to show the full diff
163 164 """
164 165 self._diff = diff
165 166 self._format = format
166 167 self.adds = 0
167 168 self.removes = 0
168 169 # calculate diff size
169 170 self.diff_limit = diff_limit
170 171 self.file_limit = file_limit
171 172 self.show_full_diff = show_full_diff
172 173 self.cur_diff_size = 0
173 174 self.parsed = False
174 175 self.parsed_diff = []
175 176
176 177 log.debug('Initialized DiffProcessor with %s mode', format)
177 178 if format == 'gitdiff':
178 179 self.differ = self._highlight_line_difflib
179 180 self._parser = self._parse_gitdiff
180 181 else:
181 182 self.differ = self._highlight_line_udiff
182 183 self._parser = self._new_parse_gitdiff
183 184
184 185 def _copy_iterator(self):
185 186 """
186 187 make a fresh copy of the generator; we should not iterate through
187 188 the original as it's needed for repeating operations on
188 189 this instance of DiffProcessor
189 190 """
190 191 self.__udiff, iterator_copy = tee(self.__udiff)
191 192 return iterator_copy
192 193
193 194 def _escaper(self, string):
194 195 """
195 196 Escaper for diff escapes special chars and checks the diff limit
196 197
197 198 :param string:
198 199 """
199 200 self.cur_diff_size += len(string)
200 201
201 202 if not self.show_full_diff and (self.cur_diff_size > self.diff_limit):
202 203 raise DiffLimitExceeded('Diff Limit Exceeded')
203 204
204 205 return string \
205 206 .replace('&', '&amp;')\
206 207 .replace('<', '&lt;')\
207 208 .replace('>', '&gt;')
208 209
209 210 def _line_counter(self, l):
210 211 """
211 212 Checks each line and bumps total adds/removes for this diff
212 213
213 214 :param l:
214 215 """
215 216 if l.startswith('+') and not l.startswith('+++'):
216 217 self.adds += 1
217 218 elif l.startswith('-') and not l.startswith('---'):
218 219 self.removes += 1
219 220 return safe_unicode(l)
220 221
221 222 def _highlight_line_difflib(self, line, next_):
222 223 """
223 224 Highlight inline changes in both lines.
224 225 """
225 226
226 227 if line['action'] == Action.DELETE:
227 228 old, new = line, next_
228 229 else:
229 230 old, new = next_, line
230 231
231 232 oldwords = self._token_re.split(old['line'])
232 233 newwords = self._token_re.split(new['line'])
233 234 sequence = difflib.SequenceMatcher(None, oldwords, newwords)
234 235
235 236 oldfragments, newfragments = [], []
236 237 for tag, i1, i2, j1, j2 in sequence.get_opcodes():
237 238 oldfrag = ''.join(oldwords[i1:i2])
238 239 newfrag = ''.join(newwords[j1:j2])
239 240 if tag != 'equal':
240 241 if oldfrag:
241 242 oldfrag = '<del>%s</del>' % oldfrag
242 243 if newfrag:
243 244 newfrag = '<ins>%s</ins>' % newfrag
244 245 oldfragments.append(oldfrag)
245 246 newfragments.append(newfrag)
246 247
247 248 old['line'] = "".join(oldfragments)
248 249 new['line'] = "".join(newfragments)
249 250
250 251 def _highlight_line_udiff(self, line, next_):
251 252 """
252 253 Highlight inline changes in both lines.
253 254 """
254 255 start = 0
255 256 limit = min(len(line['line']), len(next_['line']))
256 257 while start < limit and line['line'][start] == next_['line'][start]:
257 258 start += 1
258 259 end = -1
259 260 limit -= start
260 261 while -end <= limit and line['line'][end] == next_['line'][end]:
261 262 end -= 1
262 263 end += 1
263 264 if start or end:
264 265 def do(l):
265 266 last = end + len(l['line'])
266 267 if l['action'] == Action.ADD:
267 268 tag = 'ins'
268 269 else:
269 270 tag = 'del'
270 271 l['line'] = '%s<%s>%s</%s>%s' % (
271 272 l['line'][:start],
272 273 tag,
273 274 l['line'][start:last],
274 275 tag,
275 276 l['line'][last:]
276 277 )
277 278 do(line)
278 279 do(next_)
279 280
280 281 def _clean_line(self, line, command):
281 282 if command in ['+', '-', ' ']:
282 283 # only modify the line if it's actually a diff thing
283 284 line = line[1:]
284 285 return line
285 286
286 287 def _parse_gitdiff(self, inline_diff=True):
287 288 _files = []
288 289 diff_container = lambda arg: arg
289 290
290 291 for chunk in self._diff.chunks():
291 292 head = chunk.header
292 293
293 294 diff = imap(self._escaper, self.diff_splitter(chunk.diff))
294 295 raw_diff = chunk.raw
295 296 limited_diff = False
296 297 exceeds_limit = False
297 298
298 299 op = None
299 300 stats = {
300 301 'added': 0,
301 302 'deleted': 0,
302 303 'binary': False,
303 304 'ops': {},
304 305 }
305 306
306 307 if head['deleted_file_mode']:
307 308 op = OPS.DEL
308 309 stats['binary'] = True
309 310 stats['ops'][DEL_FILENODE] = 'deleted file'
310 311
311 312 elif head['new_file_mode']:
312 313 op = OPS.ADD
313 314 stats['binary'] = True
314 315 stats['ops'][NEW_FILENODE] = 'new file %s' % head['new_file_mode']
315 316 else: # modify operation, can be copy, rename or chmod
316 317
317 318 # CHMOD
318 319 if head['new_mode'] and head['old_mode']:
319 320 op = OPS.MOD
320 321 stats['binary'] = True
321 322 stats['ops'][CHMOD_FILENODE] = (
322 323 'modified file chmod %s => %s' % (
323 324 head['old_mode'], head['new_mode']))
324 325 # RENAME
325 326 if head['rename_from'] != head['rename_to']:
326 327 op = OPS.MOD
327 328 stats['binary'] = True
328 329 stats['ops'][RENAMED_FILENODE] = (
329 330 'file renamed from %s to %s' % (
330 331 head['rename_from'], head['rename_to']))
331 332 # COPY
332 333 if head.get('copy_from') and head.get('copy_to'):
333 334 op = OPS.MOD
334 335 stats['binary'] = True
335 336 stats['ops'][COPIED_FILENODE] = (
336 337 'file copied from %s to %s' % (
337 338 head['copy_from'], head['copy_to']))
338 339
339 340 # If our new parsed headers didn't match anything fallback to
340 341 # old style detection
341 342 if op is None:
342 343 if not head['a_file'] and head['b_file']:
343 344 op = OPS.ADD
344 345 stats['binary'] = True
345 346 stats['ops'][NEW_FILENODE] = 'new file'
346 347
347 348 elif head['a_file'] and not head['b_file']:
348 349 op = OPS.DEL
349 350 stats['binary'] = True
350 351 stats['ops'][DEL_FILENODE] = 'deleted file'
351 352
352 353 # it's neither ADD nor DELETE
353 354 if op is None:
354 355 op = OPS.MOD
355 356 stats['binary'] = True
356 357 stats['ops'][MOD_FILENODE] = 'modified file'
357 358
358 359 # a real non-binary diff
359 360 if head['a_file'] or head['b_file']:
360 361 try:
361 362 raw_diff, chunks, _stats = self._parse_lines(diff)
362 363 stats['binary'] = False
363 364 stats['added'] = _stats[0]
364 365 stats['deleted'] = _stats[1]
365 366 # explicit mark that it's a modified file
366 367 if op == OPS.MOD:
367 368 stats['ops'][MOD_FILENODE] = 'modified file'
368 369 exceeds_limit = len(raw_diff) > self.file_limit
369 370
370 371 # changed from _escaper function so we validate size of
371 372 # each file instead of the whole diff
372 373 # diff will hide big files but still show small ones
373 374 # from my tests, big files are fairly safe to be parsed
374 375 # but the browser is the bottleneck
375 376 if not self.show_full_diff and exceeds_limit:
376 377 raise DiffLimitExceeded('File Limit Exceeded')
377 378
378 379 except DiffLimitExceeded:
379 380 diff_container = lambda _diff: \
380 381 LimitedDiffContainer(
381 382 self.diff_limit, self.cur_diff_size, _diff)
382 383
383 384 exceeds_limit = len(raw_diff) > self.file_limit
384 385 limited_diff = True
385 386 chunks = []
386 387
387 388 else: # GIT format binary patch, or possibly empty diff
388 389 if head['bin_patch']:
389 390 # we have the operation already extracted, but we simply mark
390 391 # it as a diff we won't show for binary files
391 392 stats['ops'][BIN_FILENODE] = 'binary diff hidden'
392 393 chunks = []
393 394
394 395 if chunks and not self.show_full_diff and op == OPS.DEL:
395 396 # if not full diff mode show deleted file contents
396 397 # TODO: anderson: if the view is not too big, there is no way
397 398 # to see the content of the file
398 399 chunks = []
399 400
400 401 chunks.insert(0, [{
401 402 'old_lineno': '',
402 403 'new_lineno': '',
403 404 'action': Action.CONTEXT,
404 405 'line': msg,
405 406 } for _op, msg in stats['ops'].iteritems()
406 407 if _op not in [MOD_FILENODE]])
407 408
408 409 _files.append({
409 410 'filename': safe_unicode(head['b_path']),
410 411 'old_revision': head['a_blob_id'],
411 412 'new_revision': head['b_blob_id'],
412 413 'chunks': chunks,
413 414 'raw_diff': safe_unicode(raw_diff),
414 415 'operation': op,
415 416 'stats': stats,
416 417 'exceeds_limit': exceeds_limit,
417 418 'is_limited_diff': limited_diff,
418 419 })
419 420
420 421 sorter = lambda info: {OPS.ADD: 0, OPS.MOD: 1,
421 422 OPS.DEL: 2}.get(info['operation'])
422 423
423 424 if not inline_diff:
424 425 return diff_container(sorted(_files, key=sorter))
425 426
426 427 # highlight inline changes
427 428 for diff_data in _files:
428 429 for chunk in diff_data['chunks']:
429 430 lineiter = iter(chunk)
430 431 try:
431 432 while 1:
432 433 line = lineiter.next()
433 434 if line['action'] not in (
434 435 Action.UNMODIFIED, Action.CONTEXT):
435 436 nextline = lineiter.next()
436 437 if nextline['action'] in ['unmod', 'context'] or \
437 438 nextline['action'] == line['action']:
438 439 continue
439 440 self.differ(line, nextline)
440 441 except StopIteration:
441 442 pass
442 443
443 444 return diff_container(sorted(_files, key=sorter))
444 445
445 446 def _check_large_diff(self):
446 447 log.debug('Diff exceeds current diff_limit of %s', self.diff_limit)
447 448 if not self.show_full_diff and (self.cur_diff_size > self.diff_limit):
448 449 raise DiffLimitExceeded('Diff Limit `%s` Exceeded', self.diff_limit)
449 450
450 451 # FIXME: NEWDIFFS: dan: this replaces _parse_gitdiff
451 452 def _new_parse_gitdiff(self, inline_diff=True):
452 453 _files = []
453 454
454 455 # this can be overridden later to a LimitedDiffContainer type
455 456 diff_container = lambda arg: arg
456 457
457 458 for chunk in self._diff.chunks():
458 459 head = chunk.header
459 460 log.debug('parsing diff %r', head)
460 461
461 462 raw_diff = chunk.raw
462 463 limited_diff = False
463 464 exceeds_limit = False
464 465
465 466 op = None
466 467 stats = {
467 468 'added': 0,
468 469 'deleted': 0,
469 470 'binary': False,
470 471 'old_mode': None,
471 472 'new_mode': None,
472 473 'ops': {},
473 474 }
474 475 if head['old_mode']:
475 476 stats['old_mode'] = head['old_mode']
476 477 if head['new_mode']:
477 478 stats['new_mode'] = head['new_mode']
478 479 if head['b_mode']:
479 480 stats['new_mode'] = head['b_mode']
480 481
481 482 # delete file
482 483 if head['deleted_file_mode']:
483 484 op = OPS.DEL
484 485 stats['binary'] = True
485 486 stats['ops'][DEL_FILENODE] = 'deleted file'
486 487
487 488 # new file
488 489 elif head['new_file_mode']:
489 490 op = OPS.ADD
490 491 stats['binary'] = True
491 492 stats['old_mode'] = None
492 493 stats['new_mode'] = head['new_file_mode']
493 494 stats['ops'][NEW_FILENODE] = 'new file %s' % head['new_file_mode']
494 495
495 496 # modify operation, can be copy, rename or chmod
496 497 else:
497 498 # CHMOD
498 499 if head['new_mode'] and head['old_mode']:
499 500 op = OPS.MOD
500 501 stats['binary'] = True
501 502 stats['ops'][CHMOD_FILENODE] = (
502 503 'modified file chmod %s => %s' % (
503 504 head['old_mode'], head['new_mode']))
504 505
505 506 # RENAME
506 507 if head['rename_from'] != head['rename_to']:
507 508 op = OPS.MOD
508 509 stats['binary'] = True
509 510 stats['renamed'] = (head['rename_from'], head['rename_to'])
510 511 stats['ops'][RENAMED_FILENODE] = (
511 512 'file renamed from %s to %s' % (
512 513 head['rename_from'], head['rename_to']))
513 514 # COPY
514 515 if head.get('copy_from') and head.get('copy_to'):
515 516 op = OPS.MOD
516 517 stats['binary'] = True
517 518 stats['copied'] = (head['copy_from'], head['copy_to'])
518 519 stats['ops'][COPIED_FILENODE] = (
519 520 'file copied from %s to %s' % (
520 521 head['copy_from'], head['copy_to']))
521 522
522 523 # If our new parsed headers didn't match anything fallback to
523 524 # old style detection
524 525 if op is None:
525 526 if not head['a_file'] and head['b_file']:
526 527 op = OPS.ADD
527 528 stats['binary'] = True
528 529 stats['new_file'] = True
529 530 stats['ops'][NEW_FILENODE] = 'new file'
530 531
531 532 elif head['a_file'] and not head['b_file']:
532 533 op = OPS.DEL
533 534 stats['binary'] = True
534 535 stats['ops'][DEL_FILENODE] = 'deleted file'
535 536
536 537 # it's neither ADD nor DELETE
537 538 if op is None:
538 539 op = OPS.MOD
539 540 stats['binary'] = True
540 541 stats['ops'][MOD_FILENODE] = 'modified file'
541 542
542 543 # a real non-binary diff
543 544 if head['a_file'] or head['b_file']:
544 545 # simulate splitlines, so we keep the line end part
545 546 diff = self.diff_splitter(chunk.diff)
546 547
547 548 # append each file to the diff size
548 549 raw_chunk_size = len(raw_diff)
549 550
550 551 exceeds_limit = raw_chunk_size > self.file_limit
551 552 self.cur_diff_size += raw_chunk_size
552 553
553 554 try:
554 555 # Check each file instead of the whole diff.
555 556 # Diff will hide big files but still show small ones.
556 557 # From the tests big files are fairly safe to be parsed
557 558 # but the browser is the bottleneck.
558 559 if not self.show_full_diff and exceeds_limit:
559 560 log.debug('File `%s` exceeds current file_limit of %s',
560 561 safe_unicode(head['b_path']), self.file_limit)
561 562 raise DiffLimitExceeded(
562 563 'File Limit %s Exceeded', self.file_limit)
563 564
564 565 self._check_large_diff()
565 566
566 567 raw_diff, chunks, _stats = self._new_parse_lines(diff)
567 568 stats['binary'] = False
568 569 stats['added'] = _stats[0]
569 570 stats['deleted'] = _stats[1]
570 571 # explicit mark that it's a modified file
571 572 if op == OPS.MOD:
572 573 stats['ops'][MOD_FILENODE] = 'modified file'
573 574
574 575 except DiffLimitExceeded:
575 576 diff_container = lambda _diff: \
576 577 LimitedDiffContainer(
577 578 self.diff_limit, self.cur_diff_size, _diff)
578 579
579 580 limited_diff = True
580 581 chunks = []
581 582
582 583 else: # GIT format binary patch, or possibly empty diff
583 584 if head['bin_patch']:
584 585 # we have the operation already extracted, but we simply mark
585 586 # it as a diff we won't show for binary files
586 587 stats['ops'][BIN_FILENODE] = 'binary diff hidden'
587 588 chunks = []
588 589
589 590 # Hide content of deleted node by setting empty chunks
590 591 if chunks and not self.show_full_diff and op == OPS.DEL:
591 592 # if not full diff mode show deleted file contents
592 593 # TODO: anderson: if the view is not too big, there is no way
593 594 # to see the content of the file
594 595 chunks = []
595 596
596 597 chunks.insert(
597 598 0, [{'old_lineno': '',
598 599 'new_lineno': '',
599 600 'action': Action.CONTEXT,
600 601 'line': msg,
601 602 } for _op, msg in stats['ops'].iteritems()
602 603 if _op not in [MOD_FILENODE]])
603 604
604 605 original_filename = safe_unicode(head['a_path'])
605 606 _files.append({
606 607 'original_filename': original_filename,
607 608 'filename': safe_unicode(head['b_path']),
608 609 'old_revision': head['a_blob_id'],
609 610 'new_revision': head['b_blob_id'],
610 611 'chunks': chunks,
611 612 'raw_diff': safe_unicode(raw_diff),
612 613 'operation': op,
613 614 'stats': stats,
614 615 'exceeds_limit': exceeds_limit,
615 616 'is_limited_diff': limited_diff,
616 617 })
617 618
618 619 sorter = lambda info: {OPS.ADD: 0, OPS.MOD: 1,
619 620 OPS.DEL: 2}.get(info['operation'])
620 621
621 622 return diff_container(sorted(_files, key=sorter))
622 623
623 624 # FIXME: NEWDIFFS: dan: this gets replaced by _new_parse_lines
624 625 def _parse_lines(self, diff_iter):
625 626 """
626 627 Parse the diff and return data for the template.
627 628 """
628 629
629 630 stats = [0, 0]
630 631 chunks = []
631 632 raw_diff = []
632 633
633 634 try:
634 635 line = diff_iter.next()
635 636
636 637 while line:
637 638 raw_diff.append(line)
638 639 lines = []
639 640 chunks.append(lines)
640 641
641 642 match = self._chunk_re.match(line)
642 643
643 644 if not match:
644 645 break
645 646
646 647 gr = match.groups()
647 648 (old_line, old_end,
648 649 new_line, new_end) = [int(x or 1) for x in gr[:-1]]
649 650 old_line -= 1
650 651 new_line -= 1
651 652
652 653 context = len(gr) == 5
653 654 old_end += old_line
654 655 new_end += new_line
655 656
656 657 if context:
657 658 # skip context only if it's first line
658 659 if int(gr[0]) > 1:
659 660 lines.append({
660 661 'old_lineno': '...',
661 662 'new_lineno': '...',
662 663 'action': Action.CONTEXT,
663 664 'line': line,
664 665 })
665 666
666 667 line = diff_iter.next()
667 668
668 669 while old_line < old_end or new_line < new_end:
669 670 command = ' '
670 671 if line:
671 672 command = line[0]
672 673
673 674 affects_old = affects_new = False
674 675
675 676 # ignore those if we don't expect them
676 677 if command in '#@':
677 678 continue
678 679 elif command == '+':
679 680 affects_new = True
680 681 action = Action.ADD
681 682 stats[0] += 1
682 683 elif command == '-':
683 684 affects_old = True
684 685 action = Action.DELETE
685 686 stats[1] += 1
686 687 else:
687 688 affects_old = affects_new = True
688 689 action = Action.UNMODIFIED
689 690
690 691 if not self._newline_marker.match(line):
691 692 old_line += affects_old
692 693 new_line += affects_new
693 694 lines.append({
694 695 'old_lineno': affects_old and old_line or '',
695 696 'new_lineno': affects_new and new_line or '',
696 697 'action': action,
697 698 'line': self._clean_line(line, command)
698 699 })
699 700 raw_diff.append(line)
700 701
701 702 line = diff_iter.next()
702 703
703 704 if self._newline_marker.match(line):
704 705 # we need to append to lines, since this is not
705 706 # counted in the line specs of diff
706 707 lines.append({
707 708 'old_lineno': '...',
708 709 'new_lineno': '...',
709 710 'action': Action.CONTEXT,
710 711 'line': self._clean_line(line, command)
711 712 })
712 713
713 714 except StopIteration:
714 715 pass
715 716 return ''.join(raw_diff), chunks, stats
716 717
717 718 # FIXME: NEWDIFFS: dan: this replaces _parse_lines
718 719 def _new_parse_lines(self, diff_iter):
719 720 """
720 721 Parse the diff and return data for the template.
721 722 """
722 723
723 724 stats = [0, 0]
724 725 chunks = []
725 726 raw_diff = []
726 727
727 728 try:
728 729 line = diff_iter.next()
729 730
730 731 while line:
731 732 raw_diff.append(line)
732 733 # match header e.g @@ -0,0 +1 @@\n'
733 734 match = self._chunk_re.match(line)
734 735
735 736 if not match:
736 737 break
737 738
738 739 gr = match.groups()
739 740 (old_line, old_end,
740 741 new_line, new_end) = [int(x or 1) for x in gr[:-1]]
741 742
742 743 lines = []
743 744 hunk = {
744 745 'section_header': gr[-1],
745 746 'source_start': old_line,
746 747 'source_length': old_end,
747 748 'target_start': new_line,
748 749 'target_length': new_end,
749 750 'lines': lines,
750 751 }
751 752 chunks.append(hunk)
752 753
753 754 old_line -= 1
754 755 new_line -= 1
755 756
756 757 context = len(gr) == 5
757 758 old_end += old_line
758 759 new_end += new_line
759 760
760 761 line = diff_iter.next()
761 762
762 763 while old_line < old_end or new_line < new_end:
763 764 command = ' '
764 765 if line:
765 766 command = line[0]
766 767
767 768 affects_old = affects_new = False
768 769
769 770 # ignore those if we don't expect them
770 771 if command in '#@':
771 772 continue
772 773 elif command == '+':
773 774 affects_new = True
774 775 action = Action.ADD
775 776 stats[0] += 1
776 777 elif command == '-':
777 778 affects_old = True
778 779 action = Action.DELETE
779 780 stats[1] += 1
780 781 else:
781 782 affects_old = affects_new = True
782 783 action = Action.UNMODIFIED
783 784
784 785 if not self._newline_marker.match(line):
785 786 old_line += affects_old
786 787 new_line += affects_new
787 788 lines.append({
788 789 'old_lineno': affects_old and old_line or '',
789 790 'new_lineno': affects_new and new_line or '',
790 791 'action': action,
791 792 'line': self._clean_line(line, command)
792 793 })
793 794 raw_diff.append(line)
794 795
795 796 line = diff_iter.next()
796 797
797 798 if self._newline_marker.match(line):
798 799 # we need to append to lines, since this is not
799 800 # counted in the line specs of diff
800 801 if affects_old:
801 802 action = Action.OLD_NO_NL
802 803 elif affects_new:
803 804 action = Action.NEW_NO_NL
804 805 else:
805 806 raise Exception('invalid context for no newline')
806 807
807 808 lines.append({
808 809 'old_lineno': None,
809 810 'new_lineno': None,
810 811 'action': action,
811 812 'line': self._clean_line(line, command)
812 813 })
813 814
814 815 except StopIteration:
815 816 pass
816 817
817 818 return ''.join(raw_diff), chunks, stats
818 819
819 820 def _safe_id(self, idstring):
820 821 """Make a string safe for including in an id attribute.
821 822
822 823 The HTML spec says that id attributes 'must begin with
823 824 a letter ([A-Za-z]) and may be followed by any number
824 825 of letters, digits ([0-9]), hyphens ("-"), underscores
825 826 ("_"), colons (":"), and periods (".")'. These regexps
826 827 are slightly over-zealous, in that they remove colons
827 828 and periods unnecessarily.
828 829
829 830 Whitespace is transformed into underscores, and then
830 831 anything which is not a hyphen or a character that
831 832 matches \w (alphanumerics and underscore) is removed.
832 833
833 834 """
834 835 # Transform all whitespace to underscore
835 836 idstring = re.sub(r'\s', "_", '%s' % idstring)
836 837 # Remove everything that is not a hyphen or a member of \w
837 838 idstring = re.sub(r'(?!-)\W', "", idstring).lower()
838 839 return idstring
839 840
840 841 @classmethod
841 842 def diff_splitter(cls, string):
842 843 """
843 844 Diff split that emulates .splitlines() but works only on \n
844 845 """
845 846 if not string:
846 847 return
847 848 elif string == '\n':
848 849 yield u'\n'
849 850 else:
850 851
851 852 has_newline = string.endswith('\n')
852 853 elements = string.split('\n')
853 854 if has_newline:
854 855 # skip last element as it's empty string from newlines
855 856 elements = elements[:-1]
856 857
857 858 len_elements = len(elements)
858 859
859 860 for cnt, line in enumerate(elements, start=1):
860 861 last_line = cnt == len_elements
861 862 if last_line and not has_newline:
862 863 yield safe_unicode(line)
863 864 else:
864 865 yield safe_unicode(line) + '\n'
865 866
866 867 def prepare(self, inline_diff=True):
867 868 """
868 869 Prepare the passed udiff for HTML rendering.
869 870
870 871 :return: A list of dicts with diff information.
871 872 """
872 873 parsed = self._parser(inline_diff=inline_diff)
873 874 self.parsed = True
874 875 self.parsed_diff = parsed
875 876 return parsed
876 877
877 878 def as_raw(self, diff_lines=None):
878 879 """
879 880 Returns raw diff as a byte string
880 881 """
881 882 return self._diff.raw
882 883
883 884 def as_html(self, table_class='code-difftable', line_class='line',
884 885 old_lineno_class='lineno old', new_lineno_class='lineno new',
885 886 code_class='code', enable_comments=False, parsed_lines=None):
886 887 """
887 888 Return given diff as html table with customized css classes
888 889 """
889 890 # TODO(marcink): not sure how to pass in translator
890 891 # here in an efficient way, leave the _ for proper gettext extraction
891 892 _ = lambda s: s
892 893
893 894 def _link_to_if(condition, label, url):
894 895 """
895 896 Generates a link if the condition is met, or just the label if not.
896 897 """
897 898
898 899 if condition:
899 900 return '''<a href="%(url)s" class="tooltip"
900 901 title="%(title)s">%(label)s</a>''' % {
901 902 'title': _('Click to select line'),
902 903 'url': url,
903 904 'label': label
904 905 }
905 906 else:
906 907 return label
907 908 if not self.parsed:
908 909 self.prepare()
909 910
910 911 diff_lines = self.parsed_diff
911 912 if parsed_lines:
912 913 diff_lines = parsed_lines
913 914
914 915 _html_empty = True
915 916 _html = []
916 917 _html.append('''<table class="%(table_class)s">\n''' % {
917 918 'table_class': table_class
918 919 })
919 920
920 921 for diff in diff_lines:
921 922 for line in diff['chunks']:
922 923 _html_empty = False
923 924 for change in line:
924 925 _html.append('''<tr class="%(lc)s %(action)s">\n''' % {
925 926 'lc': line_class,
926 927 'action': change['action']
927 928 })
928 929 anchor_old_id = ''
929 930 anchor_new_id = ''
930 931 anchor_old = "%(filename)s_o%(oldline_no)s" % {
931 932 'filename': self._safe_id(diff['filename']),
932 933 'oldline_no': change['old_lineno']
933 934 }
934 935 anchor_new = "%(filename)s_n%(oldline_no)s" % {
935 936 'filename': self._safe_id(diff['filename']),
936 937 'oldline_no': change['new_lineno']
937 938 }
938 939 cond_old = (change['old_lineno'] != '...' and
939 940 change['old_lineno'])
940 941 cond_new = (change['new_lineno'] != '...' and
941 942 change['new_lineno'])
942 943 if cond_old:
943 944 anchor_old_id = 'id="%s"' % anchor_old
944 945 if cond_new:
945 946 anchor_new_id = 'id="%s"' % anchor_new
946 947
947 948 if change['action'] != Action.CONTEXT:
948 949 anchor_link = True
949 950 else:
950 951 anchor_link = False
951 952
952 953 ###########################################################
953 954 # COMMENT ICONS
954 955 ###########################################################
955 956 _html.append('''\t<td class="add-comment-line"><span class="add-comment-content">''')
956 957
957 958 if enable_comments and change['action'] != Action.CONTEXT:
958 959 _html.append('''<a href="#"><span class="icon-comment-add"></span></a>''')
959 960
960 961 _html.append('''</span></td><td class="comment-toggle tooltip" title="Toggle Comment Thread"><i class="icon-comment"></i></td>\n''')
961 962
962 963 ###########################################################
963 964 # OLD LINE NUMBER
964 965 ###########################################################
965 966 _html.append('''\t<td %(a_id)s class="%(olc)s">''' % {
966 967 'a_id': anchor_old_id,
967 968 'olc': old_lineno_class
968 969 })
969 970
970 971 _html.append('''%(link)s''' % {
971 972 'link': _link_to_if(anchor_link, change['old_lineno'],
972 973 '#%s' % anchor_old)
973 974 })
974 975 _html.append('''</td>\n''')
975 976 ###########################################################
976 977 # NEW LINE NUMBER
977 978 ###########################################################
978 979
979 980 _html.append('''\t<td %(a_id)s class="%(nlc)s">''' % {
980 981 'a_id': anchor_new_id,
981 982 'nlc': new_lineno_class
982 983 })
983 984
984 985 _html.append('''%(link)s''' % {
985 986 'link': _link_to_if(anchor_link, change['new_lineno'],
986 987 '#%s' % anchor_new)
987 988 })
988 989 _html.append('''</td>\n''')
989 990 ###########################################################
990 991 # CODE
991 992 ###########################################################
992 993 code_classes = [code_class]
993 994 if (not enable_comments or
994 995 change['action'] == Action.CONTEXT):
995 996 code_classes.append('no-comment')
996 997 _html.append('\t<td class="%s">' % ' '.join(code_classes))
997 998 _html.append('''\n\t\t<pre>%(code)s</pre>\n''' % {
998 999 'code': change['line']
999 1000 })
1000 1001
1001 1002 _html.append('''\t</td>''')
1002 1003 _html.append('''\n</tr>\n''')
1003 1004 _html.append('''</table>''')
1004 1005 if _html_empty:
1005 1006 return None
1006 1007 return ''.join(_html)
1007 1008
1008 1009 def stat(self):
1009 1010 """
1010 1011 Returns tuple of added, and removed lines for this instance
1011 1012 """
1012 1013 return self.adds, self.removes
1013 1014
1014 1015 def get_context_of_line(
1015 1016 self, path, diff_line=None, context_before=3, context_after=3):
1016 1017 """
1017 1018 Returns the context lines for the specified diff line.
1018 1019
1019 1020 :type diff_line: :class:`DiffLineNumber`
1020 1021 """
1021 1022 assert self.parsed, "DiffProcessor is not initialized."
1022 1023
1023 1024 if None not in diff_line:
1024 1025 raise ValueError(
1025 1026 "Cannot specify both line numbers: {}".format(diff_line))
1026 1027
1027 1028 file_diff = self._get_file_diff(path)
1028 1029 chunk, idx = self._find_chunk_line_index(file_diff, diff_line)
1029 1030
1030 1031 first_line_to_include = max(idx - context_before, 0)
1031 1032 first_line_after_context = idx + context_after + 1
1032 1033 context_lines = chunk[first_line_to_include:first_line_after_context]
1033 1034
1034 1035 line_contents = [
1035 1036 _context_line(line) for line in context_lines
1036 1037 if _is_diff_content(line)]
1037 1038 # TODO: johbo: Interim fixup, the diff chunks drop the final newline.
1038 1039 # Once they are fixed, we can drop this line here.
1039 1040 if line_contents:
1040 1041 line_contents[-1] = (
1041 1042 line_contents[-1][0], line_contents[-1][1].rstrip('\n') + '\n')
1042 1043 return line_contents
1043 1044
1044 1045 def find_context(self, path, context, offset=0):
1045 1046 """
1046 1047 Finds the given `context` inside of the diff.
1047 1048
1048 1049 Use the parameter `offset` to specify which offset the target line has
1049 1050 inside of the given `context`. This way the correct diff line will be
1050 1051 returned.
1051 1052
1052 1053 :param offset: Shall be used to specify the offset of the main line
1053 1054 within the given `context`.
1054 1055 """
1055 1056 if offset < 0 or offset >= len(context):
1056 1057 raise ValueError(
1057 1058 "Only positive values up to the length of the context "
1058 1059 "minus one are allowed.")
1059 1060
1060 1061 matches = []
1061 1062 file_diff = self._get_file_diff(path)
1062 1063
1063 1064 for chunk in file_diff['chunks']:
1064 1065 context_iter = iter(context)
1065 1066 for line_idx, line in enumerate(chunk):
1066 1067 try:
1067 1068 if _context_line(line) == context_iter.next():
1068 1069 continue
1069 1070 except StopIteration:
1070 1071 matches.append((line_idx, chunk))
1071 1072 context_iter = iter(context)
1072 1073
1073 1074 # Increment position and trigger StopIteration
1074 1075 # if we had a match at the end
1075 1076 line_idx += 1
1076 1077 try:
1077 1078 context_iter.next()
1078 1079 except StopIteration:
1079 1080 matches.append((line_idx, chunk))
1080 1081
1081 1082 effective_offset = len(context) - offset
1082 1083 found_at_diff_lines = [
1083 1084 _line_to_diff_line_number(chunk[idx - effective_offset])
1084 1085 for idx, chunk in matches]
1085 1086
1086 1087 return found_at_diff_lines
1087 1088
1088 1089 def _get_file_diff(self, path):
1089 1090 for file_diff in self.parsed_diff:
1090 1091 if file_diff['filename'] == path:
1091 1092 break
1092 1093 else:
1093 1094 raise FileNotInDiffException("File {} not in diff".format(path))
1094 1095 return file_diff
1095 1096
1096 1097 def _find_chunk_line_index(self, file_diff, diff_line):
1097 1098 for chunk in file_diff['chunks']:
1098 1099 for idx, line in enumerate(chunk):
1099 1100 if line['old_lineno'] == diff_line.old:
1100 1101 return chunk, idx
1101 1102 if line['new_lineno'] == diff_line.new:
1102 1103 return chunk, idx
1103 1104 raise LineNotInDiffException(
1104 1105 "The line {} is not part of the diff.".format(diff_line))
1105 1106
1106 1107
1107 1108 def _is_diff_content(line):
1108 1109 return line['action'] in (
1109 1110 Action.UNMODIFIED, Action.ADD, Action.DELETE)
1110 1111
1111 1112
1112 1113 def _context_line(line):
1113 1114 return (line['action'], line['line'])
1114 1115
1115 1116
1116 1117 DiffLineNumber = collections.namedtuple('DiffLineNumber', ['old', 'new'])
1117 1118
1118 1119
1119 1120 def _line_to_diff_line_number(line):
1120 1121 new_line_no = line['new_lineno'] or None
1121 1122 old_line_no = line['old_lineno'] or None
1122 1123 return DiffLineNumber(old=old_line_no, new=new_line_no)
1123 1124
1124 1125
1125 1126 class FileNotInDiffException(Exception):
1126 1127 """
1127 1128 Raised when the context for a missing file is requested.
1128 1129
1129 1130 If you request the context for a line in a file which is not part of the
1130 1131 given diff, then this exception is raised.
1131 1132 """
1132 1133
1133 1134
1134 1135 class LineNotInDiffException(Exception):
1135 1136 """
1136 1137 Raised when the context for a missing line is requested.
1137 1138
1138 1139 If you request the context for a line in a file and this line is not
1139 1140 part of the given diff, then this exception is raised.
1140 1141 """
1141 1142
1142 1143
1143 1144 class DiffLimitExceeded(Exception):
1144 1145 pass
1145 1146
1146 1147
1147 1148 # NOTE(marcink): if diffs.mako change, probably this
1148 1149 # needs a bump to next version
1149 1150 CURRENT_DIFF_VERSION = 'v4'
1150 1151
1151 1152
1152 1153 def _cleanup_cache_file(cached_diff_file):
1153 1154 # cleanup file to not store it "damaged"
1154 1155 try:
1155 1156 os.remove(cached_diff_file)
1156 1157 except Exception:
1157 1158 log.exception('Failed to cleanup path %s', cached_diff_file)
1158 1159
1159 1160
1161 def _get_compression_mode(cached_diff_file):
1162 mode = 'bz2'
1163 if 'mode:plain' in cached_diff_file:
1164 mode = 'plain'
1165 elif 'mode:gzip' in cached_diff_file:
1166 mode = 'gzip'
1167 return mode
1168
1169
1160 1170 def cache_diff(cached_diff_file, diff, commits):
1161 mode = 'plain' if 'mode:plain' in cached_diff_file else ''
1171 compression_mode = _get_compression_mode(cached_diff_file)
1162 1172
1163 1173 struct = {
1164 1174 'version': CURRENT_DIFF_VERSION,
1165 1175 'diff': diff,
1166 1176 'commits': commits
1167 1177 }
1168 1178
1169 1179 start = time.time()
1170 1180 try:
1171 if mode == 'plain':
1181 if compression_mode == 'plain':
1172 1182 with open(cached_diff_file, 'wb') as f:
1173 1183 pickle.dump(struct, f)
1184 elif compression_mode == 'gzip':
1185 with gzip.GzipFile(cached_diff_file, 'wb') as f:
1186 pickle.dump(struct, f)
1174 1187 else:
1175 1188 with bz2.BZ2File(cached_diff_file, 'wb') as f:
1176 1189 pickle.dump(struct, f)
1177 1190 except Exception:
1178 1191 log.warn('Failed to save cache', exc_info=True)
1179 1192 _cleanup_cache_file(cached_diff_file)
1180 1193
1181 1194 log.debug('Saved diff cache under %s in %.4fs', cached_diff_file, time.time() - start)
1182 1195
1183 1196
1184 1197 def load_cached_diff(cached_diff_file):
1185 mode = 'plain' if 'mode:plain' in cached_diff_file else ''
1198 compression_mode = _get_compression_mode(cached_diff_file)
1186 1199
1187 1200 default_struct = {
1188 1201 'version': CURRENT_DIFF_VERSION,
1189 1202 'diff': None,
1190 1203 'commits': None
1191 1204 }
1192 1205
1193 1206 has_cache = os.path.isfile(cached_diff_file)
1194 1207 if not has_cache:
1195 1208 log.debug('Reading diff cache file failed %s', cached_diff_file)
1196 1209 return default_struct
1197 1210
1198 1211 data = None
1199 1212
1200 1213 start = time.time()
1201 1214 try:
1202 if mode == 'plain':
1215 if compression_mode == 'plain':
1203 1216 with open(cached_diff_file, 'rb') as f:
1204 1217 data = pickle.load(f)
1218 elif compression_mode == 'gzip':
1219 with gzip.GzipFile(cached_diff_file, 'rb') as f:
1220 data = pickle.load(f)
1205 1221 else:
1206 1222 with bz2.BZ2File(cached_diff_file, 'rb') as f:
1207 1223 data = pickle.load(f)
1208 1224 except Exception:
1209 1225 log.warn('Failed to read diff cache file', exc_info=True)
1210 1226
1211 1227 if not data:
1212 1228 data = default_struct
1213 1229
1214 1230 if not isinstance(data, dict):
1215 1231 # old version of data ?
1216 1232 data = default_struct
1217 1233
1218 1234 # check version
1219 1235 if data.get('version') != CURRENT_DIFF_VERSION:
1220 1236 # purge cache
1221 1237 _cleanup_cache_file(cached_diff_file)
1222 1238 return default_struct
1223 1239
1224 1240 log.debug('Loaded diff cache from %s in %.4fs', cached_diff_file, time.time() - start)
1225 1241
1226 1242 return data
1227 1243
1228 1244
1229 1245 def generate_diff_cache_key(*args):
1230 1246 """
1231 1247 Helper to generate a cache key using arguments
1232 1248 """
1233 1249 def arg_mapper(input_param):
1234 1250 input_param = safe_str(input_param)
1235 1251 # we cannot allow '/' in arguments since it would allow
1236 1252 # subdirectory usage
1237 1253 input_param = input_param.replace('/', '_')
1238 1254 return input_param or None # prevent empty string arguments
1239 1255
1240 1256 return '_'.join([
1241 1257 '{}' for i in range(len(args))]).format(*map(arg_mapper, args))
1242 1258
1243 1259
1244 1260 def diff_cache_exist(cache_storage, *args):
1245 1261 """
1246 1262 Based on all generated arguments check and return a cache path
1247 1263 """
1264 args = list(args) + ['mode:gzip']
1248 1265 cache_key = generate_diff_cache_key(*args)
1249 1266 cache_file_path = os.path.join(cache_storage, cache_key)
1250 1267 # prevent path traversal attacks using some param that have e.g '../../'
1251 1268 if not os.path.abspath(cache_file_path).startswith(cache_storage):
1252 1269 raise ValueError('Final path must be within {}'.format(cache_storage))
1253 1270
1254 1271 return cache_file_path
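
With this change every cache key generated by diff_cache_exist carries a 'mode:gzip' marker, so new cache files are written and read through gzip.GzipFile; paths containing 'mode:plain' stay uncompressed, and anything without a marker still falls back to bz2, which keeps older bz2 files readable when their original path is used. A minimal round-trip sketch follows; it assumes the module shown above is importable as rhodecode.lib.diffs (the module path is not shown on this page) and uses made-up repository and commit identifiers.

import tempfile

from rhodecode.lib.diffs import cache_diff, diff_cache_exist, load_cached_diff

storage = tempfile.mkdtemp()

# arguments are joined with '_' and the 'mode:gzip' marker is appended
cache_file = diff_cache_exist(storage, 'some-repo', 'pull-request-1', 'abcdef0')
assert cache_file.endswith('mode:gzip')

# written via gzip.GzipFile and read back through the same branch
cache_diff(cache_file, diff={'files': []}, commits=['abcdef0'])
cached = load_cached_diff(cache_file)
assert cached['version'] == 'v4'
assert cached['commits'] == ['abcdef0']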