##// END OF EJS Templates
diff-cache: use bz2 to reduce diff-cache size.
marcink -
r2690:01439ec4 default
parent child Browse files
Show More
@@ -1,1212 +1,1213 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2018 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21
22 22 """
23 23 Set of diffing helpers, previously part of vcs
24 24 """
25 25
26 26 import os
27 27 import re
28 import bz2
29
28 30 import collections
29 31 import difflib
30 32 import logging
31 33 import cPickle as pickle
32
33 34 from itertools import tee, imap
34 35
35 36 from rhodecode.lib.vcs.exceptions import VCSError
36 37 from rhodecode.lib.vcs.nodes import FileNode, SubModuleNode
37 38 from rhodecode.lib.utils2 import safe_unicode, safe_str
38 39
# module-level logger for the diffing helpers in this module
log = logging.getLogger(__name__)

# define max context, a file with more than this numbers of lines is unusable
# in browser anyway
# NOTE(review): `1024 * 1014` looks like a typo for `1024 * 1024`; the value
# is only used as an upper clamp on diff context, so the difference is
# harmless — confirm before changing.
MAX_CONTEXT = 1024 * 1014
44 45
45 46
class OPS(object):
    """Single-letter codes describing the operation performed on a file."""

    ADD = 'A'  # file was added
    MOD = 'M'  # file was modified (includes rename/copy/chmod)
    DEL = 'D'  # file was deleted
50 51
51 52
def get_gitdiff(filenode_old, filenode_new, ignore_whitespace=True, context=3):
    """
    Return a git-style diff between ``filenode_old`` and ``filenode_new``.

    :param ignore_whitespace: ignore whitespaces in diff
    :param context: number of context lines, clamped to ``MAX_CONTEXT``
    """
    # fall back to the default context size, then clamp huge values which
    # would otherwise trigger an int overflow in the backend
    context = min(context or 3, MAX_CONTEXT)

    # submodules cannot be diffed like regular file nodes
    if any(isinstance(node, SubModuleNode)
           for node in (filenode_new, filenode_old)):
        return ''

    for node in (filenode_old, filenode_new):
        if not isinstance(node, FileNode):
            raise VCSError(
                "Given object should be FileNode object, not %s"
                % node.__class__)

    repo = filenode_new.commit.repository
    new_commit = filenode_new.commit
    # an absent old commit means the file is new; diff against empty commit
    old_commit = filenode_old.commit or repo.EMPTY_COMMIT

    return repo.get_diff(
        old_commit, new_commit, filenode_new.path,
        ignore_whitespace, context, path1=filenode_old.path)
83 84
# numeric codes for the kind of change detected on a file node; used as keys
# of the per-file ``stats['ops']`` dict built by the diff parsers below
NEW_FILENODE = 1
DEL_FILENODE = 2
MOD_FILENODE = 3
RENAMED_FILENODE = 4
COPIED_FILENODE = 5
CHMOD_FILENODE = 6
BIN_FILENODE = 7
91 92
92 93
class LimitedDiffContainer(object):
    """Wrapper marking a parsed diff as truncated by the diff size limit.

    Behaves like the wrapped diff list for indexing and iteration, while
    also carrying the limit that was hit and the diff size seen so far.
    """

    def __init__(self, diff_limit, cur_diff_size, diff):
        self.diff = diff
        self.diff_limit = diff_limit
        self.cur_diff_size = cur_diff_size

    def __getitem__(self, key):
        return self.diff[key]

    def __iter__(self):
        return iter(self.diff)
106 107
107 108
class Action(object):
    """
    Contains constants for the action value of the lines in a parsed diff.
    """

    # regular line states
    ADD = 'add'
    DELETE = 'del'
    UNMODIFIED = 'unmod'

    # special markers (hunk context, "no newline at end of file")
    CONTEXT = 'context'
    OLD_NO_NL = 'old-no-nl'
    NEW_NO_NL = 'new-no-nl'
120 121
121 122
class DiffProcessor(object):
    """
    Give it a unified or git diff and it returns a list of the files that were
    mentioned in the diff together with a dict of meta information that
    can be used to render it in a HTML template.

    .. note:: Unicode handling

        The original diffs are a byte sequence and can contain filenames
        in mixed encodings. This class generally returns `unicode` objects
        since the result is intended for presentation to the user.

    """
    # matches a hunk header, e.g. '@@ -1,5 +2,6 @@ section'; groups are
    # (old_start, old_len, new_start, new_len, trailing section header)
    _chunk_re = re.compile(r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)')
    # matches the '\ No newline at end of file' marker line
    _newline_marker = re.compile(r'^\\ No newline at end of file')

    # used for inline highlighter word split
    _token_re = re.compile(r'()(&gt;|&lt;|&amp;|\W+?)')

    # collapse ranges of commits over given number
    _collapse_commits_over = 5
143 144
144 145 def __init__(self, diff, format='gitdiff', diff_limit=None,
145 146 file_limit=None, show_full_diff=True):
146 147 """
147 148 :param diff: A `Diff` object representing a diff from a vcs backend
148 149 :param format: format of diff passed, `udiff` or `gitdiff`
149 150 :param diff_limit: define the size of diff that is considered "big"
150 151 based on that parameter cut off will be triggered, set to None
151 152 to show full diff
152 153 """
153 154 self._diff = diff
154 155 self._format = format
155 156 self.adds = 0
156 157 self.removes = 0
157 158 # calculate diff size
158 159 self.diff_limit = diff_limit
159 160 self.file_limit = file_limit
160 161 self.show_full_diff = show_full_diff
161 162 self.cur_diff_size = 0
162 163 self.parsed = False
163 164 self.parsed_diff = []
164 165
165 166 log.debug('Initialized DiffProcessor with %s mode', format)
166 167 if format == 'gitdiff':
167 168 self.differ = self._highlight_line_difflib
168 169 self._parser = self._parse_gitdiff
169 170 else:
170 171 self.differ = self._highlight_line_udiff
171 172 self._parser = self._new_parse_gitdiff
172 173
173 174 def _copy_iterator(self):
174 175 """
175 176 make a fresh copy of generator, we should not iterate thru
176 177 an original as it's needed for repeating operations on
177 178 this instance of DiffProcessor
178 179 """
179 180 self.__udiff, iterator_copy = tee(self.__udiff)
180 181 return iterator_copy
181 182
182 183 def _escaper(self, string):
183 184 """
184 185 Escaper for diff escapes special chars and checks the diff limit
185 186
186 187 :param string:
187 188 """
188 189 self.cur_diff_size += len(string)
189 190
190 191 if not self.show_full_diff and (self.cur_diff_size > self.diff_limit):
191 192 raise DiffLimitExceeded('Diff Limit Exceeded')
192 193
193 194 return string \
194 195 .replace('&', '&amp;')\
195 196 .replace('<', '&lt;')\
196 197 .replace('>', '&gt;')
197 198
198 199 def _line_counter(self, l):
199 200 """
200 201 Checks each line and bumps total adds/removes for this diff
201 202
202 203 :param l:
203 204 """
204 205 if l.startswith('+') and not l.startswith('+++'):
205 206 self.adds += 1
206 207 elif l.startswith('-') and not l.startswith('---'):
207 208 self.removes += 1
208 209 return safe_unicode(l)
209 210
    def _highlight_line_difflib(self, line, next_):
        """
        Highlight inline changes in both lines.

        Mutates ``line`` and ``next_`` in place, wrapping changed fragments
        in ``<del>`` (old side) / ``<ins>`` (new side) tags.
        """

        # normalize the pair so `old` is the deleted side and `new` the added
        if line['action'] == Action.DELETE:
            old, new = line, next_
        else:
            old, new = next_, line

        # split both lines into "words" (escaped entities / non-word runs)
        # and diff the word sequences
        oldwords = self._token_re.split(old['line'])
        newwords = self._token_re.split(new['line'])
        sequence = difflib.SequenceMatcher(None, oldwords, newwords)

        oldfragments, newfragments = [], []
        for tag, i1, i2, j1, j2 in sequence.get_opcodes():
            oldfrag = ''.join(oldwords[i1:i2])
            newfrag = ''.join(newwords[j1:j2])
            if tag != 'equal':
                # only changed fragments get wrapped in markup
                if oldfrag:
                    oldfrag = '<del>%s</del>' % oldfrag
                if newfrag:
                    newfrag = '<ins>%s</ins>' % newfrag
            oldfragments.append(oldfrag)
            newfragments.append(newfrag)

        # write the marked-up text back into the line dicts
        old['line'] = "".join(oldfragments)
        new['line'] = "".join(newfragments)
238 239
    def _highlight_line_udiff(self, line, next_):
        """
        Highlight inline changes in both lines.

        Finds the common prefix/suffix of the two lines and wraps only the
        differing middle section in ``<ins>``/``<del>``; mutates in place.
        """
        # length of the common prefix of both lines
        start = 0
        limit = min(len(line['line']), len(next_['line']))
        while start < limit and line['line'][start] == next_['line'][start]:
            start += 1
        # length of the common suffix, scanned with negative indices;
        # `end` finishes as a non-positive offset from the line end
        end = -1
        limit -= start
        while -end <= limit and line['line'][end] == next_['line'][end]:
            end -= 1
        end += 1
        if start or end:
            def do(l):
                # `last` converts the negative suffix offset into an
                # absolute slice index for this particular line
                last = end + len(l['line'])
                if l['action'] == Action.ADD:
                    tag = 'ins'
                else:
                    tag = 'del'
                l['line'] = '%s<%s>%s</%s>%s' % (
                    l['line'][:start],
                    tag,
                    l['line'][start:last],
                    tag,
                    l['line'][last:]
                )
            do(line)
            do(next_)
268 269
269 270 def _clean_line(self, line, command):
270 271 if command in ['+', '-', ' ']:
271 272 # only modify the line if it's actually a diff thing
272 273 line = line[1:]
273 274 return line
274 275
    def _parse_gitdiff(self, inline_diff=True):
        """
        Legacy parser: turn the backend diff into a list of per-file dicts
        (filename, chunks, stats, operation, limit flags), optionally with
        inline word-level highlighting applied.

        NOTE: Python 2 only (`imap`, `iteritems`, `iterator.next()`).
        """
        _files = []
        # identity wrapper; replaced with a LimitedDiffContainer factory
        # once any size limit is exceeded
        diff_container = lambda arg: arg

        for chunk in self._diff.chunks():
            head = chunk.header

            # lazily escaped line stream; _escaper also tracks diff size
            diff = imap(self._escaper, self.diff_splitter(chunk.diff))
            raw_diff = chunk.raw
            limited_diff = False
            exceeds_limit = False

            op = None
            stats = {
                'added': 0,
                'deleted': 0,
                'binary': False,
                'ops': {},
            }

            # classify the operation from the parsed git headers
            if head['deleted_file_mode']:
                op = OPS.DEL
                stats['binary'] = True
                stats['ops'][DEL_FILENODE] = 'deleted file'

            elif head['new_file_mode']:
                op = OPS.ADD
                stats['binary'] = True
                stats['ops'][NEW_FILENODE] = 'new file %s' % head['new_file_mode']
            else:  # modify operation, can be copy, rename or chmod

                # CHMOD
                if head['new_mode'] and head['old_mode']:
                    op = OPS.MOD
                    stats['binary'] = True
                    stats['ops'][CHMOD_FILENODE] = (
                        'modified file chmod %s => %s' % (
                            head['old_mode'], head['new_mode']))
                # RENAME
                if head['rename_from'] != head['rename_to']:
                    op = OPS.MOD
                    stats['binary'] = True
                    stats['ops'][RENAMED_FILENODE] = (
                        'file renamed from %s to %s' % (
                            head['rename_from'], head['rename_to']))
                # COPY
                if head.get('copy_from') and head.get('copy_to'):
                    op = OPS.MOD
                    stats['binary'] = True
                    stats['ops'][COPIED_FILENODE] = (
                        'file copied from %s to %s' % (
                            head['copy_from'], head['copy_to']))

            # If our new parsed headers didn't match anything fallback to
            # old style detection
            if op is None:
                if not head['a_file'] and head['b_file']:
                    op = OPS.ADD
                    stats['binary'] = True
                    stats['ops'][NEW_FILENODE] = 'new file'

                elif head['a_file'] and not head['b_file']:
                    op = OPS.DEL
                    stats['binary'] = True
                    stats['ops'][DEL_FILENODE] = 'deleted file'

                # it's not ADD not DELETE
                if op is None:
                    op = OPS.MOD
                    stats['binary'] = True
                    stats['ops'][MOD_FILENODE] = 'modified file'

            # a real non-binary diff
            if head['a_file'] or head['b_file']:
                try:
                    raw_diff, chunks, _stats = self._parse_lines(diff)
                    stats['binary'] = False
                    stats['added'] = _stats[0]
                    stats['deleted'] = _stats[1]
                    # explicit mark that it's a modified file
                    if op == OPS.MOD:
                        stats['ops'][MOD_FILENODE] = 'modified file'
                    exceeds_limit = len(raw_diff) > self.file_limit

                    # changed from _escaper function so we validate size of
                    # each file instead of the whole diff
                    # diff will hide big files but still show small ones
                    # from my tests, big files are fairly safe to be parsed
                    # but the browser is the bottleneck
                    if not self.show_full_diff and exceeds_limit:
                        raise DiffLimitExceeded('File Limit Exceeded')

                except DiffLimitExceeded:
                    diff_container = lambda _diff: \
                        LimitedDiffContainer(
                            self.diff_limit, self.cur_diff_size, _diff)

                    exceeds_limit = len(raw_diff) > self.file_limit
                    limited_diff = True
                    chunks = []

            else:  # GIT format binary patch, or possibly empty diff
                if head['bin_patch']:
                    # we have operation already extracted, but we mark simply
                    # it's a diff we wont show for binary files
                    stats['ops'][BIN_FILENODE] = 'binary diff hidden'
                chunks = []

            if chunks and not self.show_full_diff and op == OPS.DEL:
                # if not full diff mode show deleted file contents
                # TODO: anderson: if the view is not too big, there is no way
                # to see the content of the file
                chunks = []

            # prepend a pseudo-chunk describing the non-trivial operations
            # (new/deleted/renamed/...) as context lines
            chunks.insert(0, [{
                'old_lineno': '',
                'new_lineno': '',
                'action': Action.CONTEXT,
                'line': msg,
            } for _op, msg in stats['ops'].iteritems()
                if _op not in [MOD_FILENODE]])

            _files.append({
                'filename': safe_unicode(head['b_path']),
                'old_revision': head['a_blob_id'],
                'new_revision': head['b_blob_id'],
                'chunks': chunks,
                'raw_diff': safe_unicode(raw_diff),
                'operation': op,
                'stats': stats,
                'exceeds_limit': exceeds_limit,
                'is_limited_diff': limited_diff,
            })

        # order files: added first, then modified, then deleted
        sorter = lambda info: {OPS.ADD: 0, OPS.MOD: 1,
                               OPS.DEL: 2}.get(info['operation'])

        if not inline_diff:
            return diff_container(sorted(_files, key=sorter))

        # highlight inline changes on adjacent del/add line pairs
        for diff_data in _files:
            for chunk in diff_data['chunks']:
                lineiter = iter(chunk)
                try:
                    while 1:
                        line = lineiter.next()
                        if line['action'] not in (
                                Action.UNMODIFIED, Action.CONTEXT):
                            nextline = lineiter.next()
                            if nextline['action'] in ['unmod', 'context'] or \
                                    nextline['action'] == line['action']:
                                continue
                            self.differ(line, nextline)
                except StopIteration:
                    pass

        return diff_container(sorted(_files, key=sorter))
433 434
434 435 def _check_large_diff(self):
435 436 log.debug('Diff exceeds current diff_limit of %s', self.diff_limit)
436 437 if not self.show_full_diff and (self.cur_diff_size > self.diff_limit):
437 438 raise DiffLimitExceeded('Diff Limit `%s` Exceeded', self.diff_limit)
438 439
439 440 # FIXME: NEWDIFFS: dan: this replaces _parse_gitdiff
    def _new_parse_gitdiff(self, inline_diff=True):
        """
        Replacement for :meth:`_parse_gitdiff`: turn the backend diff into a
        sorted list of per-file dicts with hunk data, per-file stats and
        mode/rename/copy information, honouring file and diff size limits.

        NOTE: Python 2 only (`iteritems`); `inline_diff` is accepted for
        interface compatibility but not used here.
        """
        _files = []

        # this can be overriden later to a LimitedDiffContainer type
        diff_container = lambda arg: arg

        for chunk in self._diff.chunks():
            head = chunk.header
            log.debug('parsing diff %r' % head)

            raw_diff = chunk.raw
            limited_diff = False
            exceeds_limit = False

            op = None
            stats = {
                'added': 0,
                'deleted': 0,
                'binary': False,
                'old_mode': None,
                'new_mode': None,
                'ops': {},
            }
            # record file modes; b_mode wins over new_mode when both present
            if head['old_mode']:
                stats['old_mode'] = head['old_mode']
            if head['new_mode']:
                stats['new_mode'] = head['new_mode']
            if head['b_mode']:
                stats['new_mode'] = head['b_mode']

            # delete file
            if head['deleted_file_mode']:
                op = OPS.DEL
                stats['binary'] = True
                stats['ops'][DEL_FILENODE] = 'deleted file'

            # new file
            elif head['new_file_mode']:
                op = OPS.ADD
                stats['binary'] = True
                stats['old_mode'] = None
                stats['new_mode'] = head['new_file_mode']
                stats['ops'][NEW_FILENODE] = 'new file %s' % head['new_file_mode']

            # modify operation, can be copy, rename or chmod
            else:
                # CHMOD
                if head['new_mode'] and head['old_mode']:
                    op = OPS.MOD
                    stats['binary'] = True
                    stats['ops'][CHMOD_FILENODE] = (
                        'modified file chmod %s => %s' % (
                            head['old_mode'], head['new_mode']))

                # RENAME
                if head['rename_from'] != head['rename_to']:
                    op = OPS.MOD
                    stats['binary'] = True
                    stats['renamed'] = (head['rename_from'], head['rename_to'])
                    stats['ops'][RENAMED_FILENODE] = (
                        'file renamed from %s to %s' % (
                            head['rename_from'], head['rename_to']))
                # COPY
                if head.get('copy_from') and head.get('copy_to'):
                    op = OPS.MOD
                    stats['binary'] = True
                    stats['copied'] = (head['copy_from'], head['copy_to'])
                    stats['ops'][COPIED_FILENODE] = (
                        'file copied from %s to %s' % (
                            head['copy_from'], head['copy_to']))

            # If our new parsed headers didn't match anything fallback to
            # old style detection
            if op is None:
                if not head['a_file'] and head['b_file']:
                    op = OPS.ADD
                    stats['binary'] = True
                    stats['new_file'] = True
                    stats['ops'][NEW_FILENODE] = 'new file'

                elif head['a_file'] and not head['b_file']:
                    op = OPS.DEL
                    stats['binary'] = True
                    stats['ops'][DEL_FILENODE] = 'deleted file'

                # it's not ADD not DELETE
                if op is None:
                    op = OPS.MOD
                    stats['binary'] = True
                    stats['ops'][MOD_FILENODE] = 'modified file'

            # a real non-binary diff
            if head['a_file'] or head['b_file']:
                # simulate splitlines, so we keep the line end part
                diff = self.diff_splitter(chunk.diff)

                # append each file to the diff size
                raw_chunk_size = len(raw_diff)

                exceeds_limit = raw_chunk_size > self.file_limit
                self.cur_diff_size += raw_chunk_size

                try:
                    # Check each file instead of the whole diff.
                    # Diff will hide big files but still show small ones.
                    # From the tests big files are fairly safe to be parsed
                    # but the browser is the bottleneck.
                    if not self.show_full_diff and exceeds_limit:
                        log.debug('File `%s` exceeds current file_limit of %s',
                                  safe_unicode(head['b_path']), self.file_limit)
                        raise DiffLimitExceeded(
                            'File Limit %s Exceeded', self.file_limit)

                    self._check_large_diff()

                    raw_diff, chunks, _stats = self._new_parse_lines(diff)
                    stats['binary'] = False
                    stats['added'] = _stats[0]
                    stats['deleted'] = _stats[1]
                    # explicit mark that it's a modified file
                    if op == OPS.MOD:
                        stats['ops'][MOD_FILENODE] = 'modified file'

                except DiffLimitExceeded:
                    # from now on wrap the result to signal truncation
                    diff_container = lambda _diff: \
                        LimitedDiffContainer(
                            self.diff_limit, self.cur_diff_size, _diff)

                    limited_diff = True
                    chunks = []

            else:  # GIT format binary patch, or possibly empty diff
                if head['bin_patch']:
                    # we have operation already extracted, but we mark simply
                    # it's a diff we wont show for binary files
                    stats['ops'][BIN_FILENODE] = 'binary diff hidden'
                chunks = []

            # Hide content of deleted node by setting empty chunks
            if chunks and not self.show_full_diff and op == OPS.DEL:
                # if not full diff mode show deleted file contents
                # TODO: anderson: if the view is not too big, there is no way
                # to see the content of the file
                chunks = []

            # prepend a pseudo-chunk describing the non-trivial operations
            # (new/deleted/renamed/...) as context lines
            chunks.insert(
                0, [{'old_lineno': '',
                     'new_lineno': '',
                     'action': Action.CONTEXT,
                     'line': msg,
                     } for _op, msg in stats['ops'].iteritems()
                    if _op not in [MOD_FILENODE]])

            original_filename = safe_unicode(head['a_path'])
            _files.append({
                'original_filename': original_filename,
                'filename': safe_unicode(head['b_path']),
                'old_revision': head['a_blob_id'],
                'new_revision': head['b_blob_id'],
                'chunks': chunks,
                'raw_diff': safe_unicode(raw_diff),
                'operation': op,
                'stats': stats,
                'exceeds_limit': exceeds_limit,
                'is_limited_diff': limited_diff,
            })

        # order files: added first, then modified, then deleted
        sorter = lambda info: {OPS.ADD: 0, OPS.MOD: 1,
                               OPS.DEL: 2}.get(info['operation'])

        return diff_container(sorted(_files, key=sorter))
611 612
612 613 # FIXME: NEWDIFFS: dan: this gets replaced by _new_parse_lines
    def _parse_lines(self, diff_iter):
        """
        Parse the diff an return data for the template.

        :param diff_iter: iterator of escaped diff lines (py2 `.next()` API)
        :return: tuple of (joined raw diff string, list of chunks where each
            chunk is a list of line dicts, [added, deleted] stats)
        """

        stats = [0, 0]
        chunks = []
        raw_diff = []

        try:
            line = diff_iter.next()

            while line:
                raw_diff.append(line)
                lines = []
                chunks.append(lines)

                match = self._chunk_re.match(line)

                if not match:
                    break

                # groups: old_start, old_len, new_start, new_len, section
                gr = match.groups()
                (old_line, old_end,
                 new_line, new_end) = [int(x or 1) for x in gr[:-1]]
                old_line -= 1
                new_line -= 1

                context = len(gr) == 5
                old_end += old_line
                new_end += new_line

                if context:
                    # skip context only if it's first line
                    if int(gr[0]) > 1:
                        lines.append({
                            'old_lineno': '...',
                            'new_lineno': '...',
                            'action': Action.CONTEXT,
                            'line': line,
                        })

                line = diff_iter.next()

                # walk the hunk body until both line counters are exhausted
                while old_line < old_end or new_line < new_end:
                    command = ' '
                    if line:
                        command = line[0]

                    affects_old = affects_new = False

                    # ignore those if we don't expect them
                    # NOTE(review): this `continue` does not advance
                    # `diff_iter`, so a '#'/'@' line inside a hunk would loop
                    # forever - presumably never produced by the backends;
                    # confirm before relying on this path
                    if command in '#@':
                        continue
                    elif command == '+':
                        affects_new = True
                        action = Action.ADD
                        stats[0] += 1
                    elif command == '-':
                        affects_old = True
                        action = Action.DELETE
                        stats[1] += 1
                    else:
                        affects_old = affects_new = True
                        action = Action.UNMODIFIED

                    if not self._newline_marker.match(line):
                        old_line += affects_old
                        new_line += affects_new
                        lines.append({
                            'old_lineno': affects_old and old_line or '',
                            'new_lineno': affects_new and new_line or '',
                            'action': action,
                            'line': self._clean_line(line, command)
                        })
                        raw_diff.append(line)

                    line = diff_iter.next()

                    if self._newline_marker.match(line):
                        # we need to append to lines, since this is not
                        # counted in the line specs of diff
                        lines.append({
                            'old_lineno': '...',
                            'new_lineno': '...',
                            'action': Action.CONTEXT,
                            'line': self._clean_line(line, command)
                        })

        except StopIteration:
            pass
        return ''.join(raw_diff), chunks, stats
705 706
706 707 # FIXME: NEWDIFFS: dan: this replaces _parse_lines
    def _new_parse_lines(self, diff_iter):
        """
        Parse the diff an return data for the template.

        Unlike :meth:`_parse_lines`, each chunk is a dict carrying the hunk
        header metadata (source/target start and length, section header)
        plus its list of line dicts.

        :param diff_iter: iterator of diff lines (py2 `.next()` API)
        :return: tuple of (joined raw diff string, list of hunk dicts,
            [added, deleted] stats)
        """

        stats = [0, 0]
        chunks = []
        raw_diff = []

        try:
            line = diff_iter.next()

            while line:
                raw_diff.append(line)
                # match header e.g @@ -0,0 +1 @@\n'
                match = self._chunk_re.match(line)

                if not match:
                    break

                # groups: old_start, old_len, new_start, new_len, section
                gr = match.groups()
                (old_line, old_end,
                 new_line, new_end) = [int(x or 1) for x in gr[:-1]]

                lines = []
                hunk = {
                    'section_header': gr[-1],
                    'source_start': old_line,
                    'source_length': old_end,
                    'target_start': new_line,
                    'target_length': new_end,
                    'lines': lines,
                }
                chunks.append(hunk)

                old_line -= 1
                new_line -= 1

                context = len(gr) == 5
                old_end += old_line
                new_end += new_line

                line = diff_iter.next()

                # walk the hunk body until both line counters are exhausted
                while old_line < old_end or new_line < new_end:
                    command = ' '
                    if line:
                        command = line[0]

                    affects_old = affects_new = False

                    # ignore those if we don't expect them
                    # NOTE(review): this `continue` does not advance
                    # `diff_iter`, so a '#'/'@' line inside a hunk would loop
                    # forever - presumably never produced by the backends;
                    # confirm before relying on this path
                    if command in '#@':
                        continue
                    elif command == '+':
                        affects_new = True
                        action = Action.ADD
                        stats[0] += 1
                    elif command == '-':
                        affects_old = True
                        action = Action.DELETE
                        stats[1] += 1
                    else:
                        affects_old = affects_new = True
                        action = Action.UNMODIFIED

                    if not self._newline_marker.match(line):
                        old_line += affects_old
                        new_line += affects_new
                        lines.append({
                            'old_lineno': affects_old and old_line or '',
                            'new_lineno': affects_new and new_line or '',
                            'action': action,
                            'line': self._clean_line(line, command)
                        })
                        raw_diff.append(line)

                    line = diff_iter.next()

                    if self._newline_marker.match(line):
                        # we need to append to lines, since this is not
                        # counted in the line specs of diff
                        if affects_old:
                            action = Action.OLD_NO_NL
                        elif affects_new:
                            action = Action.NEW_NO_NL
                        else:
                            raise Exception('invalid context for no newline')

                        lines.append({
                            'old_lineno': None,
                            'new_lineno': None,
                            'action': action,
                            'line': self._clean_line(line, command)
                        })

        except StopIteration:
            pass

        return ''.join(raw_diff), chunks, stats
807 808
808 809 def _safe_id(self, idstring):
809 810 """Make a string safe for including in an id attribute.
810 811
811 812 The HTML spec says that id attributes 'must begin with
812 813 a letter ([A-Za-z]) and may be followed by any number
813 814 of letters, digits ([0-9]), hyphens ("-"), underscores
814 815 ("_"), colons (":"), and periods (".")'. These regexps
815 816 are slightly over-zealous, in that they remove colons
816 817 and periods unnecessarily.
817 818
818 819 Whitespace is transformed into underscores, and then
819 820 anything which is not a hyphen or a character that
820 821 matches \w (alphanumerics and underscore) is removed.
821 822
822 823 """
823 824 # Transform all whitespace to underscore
824 825 idstring = re.sub(r'\s', "_", '%s' % idstring)
825 826 # Remove everything that is not a hyphen or a member of \w
826 827 idstring = re.sub(r'(?!-)\W', "", idstring).lower()
827 828 return idstring
828 829
829 830 @classmethod
830 831 def diff_splitter(cls, string):
831 832 """
832 833 Diff split that emulates .splitlines() but works only on \n
833 834 """
834 835 if not string:
835 836 return
836 837 elif string == '\n':
837 838 yield u'\n'
838 839 else:
839 840
840 841 has_newline = string.endswith('\n')
841 842 elements = string.split('\n')
842 843 if has_newline:
843 844 # skip last element as it's empty string from newlines
844 845 elements = elements[:-1]
845 846
846 847 len_elements = len(elements)
847 848
848 849 for cnt, line in enumerate(elements, start=1):
849 850 last_line = cnt == len_elements
850 851 if last_line and not has_newline:
851 852 yield safe_unicode(line)
852 853 else:
853 854 yield safe_unicode(line) + '\n'
854 855
855 856 def prepare(self, inline_diff=True):
856 857 """
857 858 Prepare the passed udiff for HTML rendering.
858 859
859 860 :return: A list of dicts with diff information.
860 861 """
861 862 parsed = self._parser(inline_diff=inline_diff)
862 863 self.parsed = True
863 864 self.parsed_diff = parsed
864 865 return parsed
865 866
866 867 def as_raw(self, diff_lines=None):
867 868 """
868 869 Returns raw diff as a byte string
869 870 """
870 871 return self._diff.raw
871 872
    def as_html(self, table_class='code-difftable', line_class='line',
                old_lineno_class='lineno old', new_lineno_class='lineno new',
                code_class='code', enable_comments=False, parsed_lines=None):
        """
        Return given diff as html table with customized css classes

        :param parsed_lines: optional pre-parsed diff data; falls back to
            the cached result of :meth:`prepare`
        :return: HTML string, or ``None`` when the diff has no chunks
        """
        # TODO(marcink): not sure how to pass in translator
        # here in an efficient way, leave the _ for proper gettext extraction
        _ = lambda s: s

        def _link_to_if(condition, label, url):
            """
            Generates a link if condition is meet or just the label if not.
            """

            if condition:
                return '''<a href="%(url)s" class="tooltip"
                          title="%(title)s">%(label)s</a>''' % {
                    'title': _('Click to select line'),
                    'url': url,
                    'label': label
                }
            else:
                return label
        # lazily parse on first rendering
        if not self.parsed:
            self.prepare()

        diff_lines = self.parsed_diff
        if parsed_lines:
            diff_lines = parsed_lines

        _html_empty = True
        _html = []
        _html.append('''<table class="%(table_class)s">\n''' % {
            'table_class': table_class
        })

        for diff in diff_lines:
            for line in diff['chunks']:
                _html_empty = False
                for change in line:
                    _html.append('''<tr class="%(lc)s %(action)s">\n''' % {
                        'lc': line_class,
                        'action': change['action']
                    })
                    # anchors for linking to a specific old/new line number
                    anchor_old_id = ''
                    anchor_new_id = ''
                    anchor_old = "%(filename)s_o%(oldline_no)s" % {
                        'filename': self._safe_id(diff['filename']),
                        'oldline_no': change['old_lineno']
                    }
                    anchor_new = "%(filename)s_n%(oldline_no)s" % {
                        'filename': self._safe_id(diff['filename']),
                        'oldline_no': change['new_lineno']
                    }
                    # '...' placeholders and empty linenos get no anchor id
                    cond_old = (change['old_lineno'] != '...' and
                                change['old_lineno'])
                    cond_new = (change['new_lineno'] != '...' and
                                change['new_lineno'])
                    if cond_old:
                        anchor_old_id = 'id="%s"' % anchor_old
                    if cond_new:
                        anchor_new_id = 'id="%s"' % anchor_new

                    if change['action'] != Action.CONTEXT:
                        anchor_link = True
                    else:
                        anchor_link = False

                    ###########################################################
                    # COMMENT ICONS
                    ###########################################################
                    _html.append('''\t<td class="add-comment-line"><span class="add-comment-content">''')

                    if enable_comments and change['action'] != Action.CONTEXT:
                        _html.append('''<a href="#"><span class="icon-comment-add"></span></a>''')

                    _html.append('''</span></td><td class="comment-toggle tooltip" title="Toggle Comment Thread"><i class="icon-comment"></i></td>\n''')

                    ###########################################################
                    # OLD LINE NUMBER
                    ###########################################################
                    _html.append('''\t<td %(a_id)s class="%(olc)s">''' % {
                        'a_id': anchor_old_id,
                        'olc': old_lineno_class
                    })

                    _html.append('''%(link)s''' % {
                        'link': _link_to_if(anchor_link, change['old_lineno'],
                                            '#%s' % anchor_old)
                    })
                    _html.append('''</td>\n''')
                    ###########################################################
                    # NEW LINE NUMBER
                    ###########################################################

                    _html.append('''\t<td %(a_id)s class="%(nlc)s">''' % {
                        'a_id': anchor_new_id,
                        'nlc': new_lineno_class
                    })

                    _html.append('''%(link)s''' % {
                        'link': _link_to_if(anchor_link, change['new_lineno'],
                                            '#%s' % anchor_new)
                    })
                    _html.append('''</td>\n''')
                    ###########################################################
                    # CODE
                    ###########################################################
                    code_classes = [code_class]
                    if (not enable_comments or
                            change['action'] == Action.CONTEXT):
                        code_classes.append('no-comment')
                    _html.append('\t<td class="%s">' % ' '.join(code_classes))
                    _html.append('''\n\t\t<pre>%(code)s</pre>\n''' % {
                        'code': change['line']
                    })

                    _html.append('''\t</td>''')
                    _html.append('''\n</tr>\n''')
        _html.append('''</table>''')
        if _html_empty:
            return None
        return ''.join(_html)
996 997
997 998 def stat(self):
998 999 """
999 1000 Returns tuple of added, and removed lines for this instance
1000 1001 """
1001 1002 return self.adds, self.removes
1002 1003
1003 1004 def get_context_of_line(
1004 1005 self, path, diff_line=None, context_before=3, context_after=3):
1005 1006 """
1006 1007 Returns the context lines for the specified diff line.
1007 1008
1008 1009 :type diff_line: :class:`DiffLineNumber`
1009 1010 """
1010 1011 assert self.parsed, "DiffProcessor is not initialized."
1011 1012
1012 1013 if None not in diff_line:
1013 1014 raise ValueError(
1014 1015 "Cannot specify both line numbers: {}".format(diff_line))
1015 1016
1016 1017 file_diff = self._get_file_diff(path)
1017 1018 chunk, idx = self._find_chunk_line_index(file_diff, diff_line)
1018 1019
1019 1020 first_line_to_include = max(idx - context_before, 0)
1020 1021 first_line_after_context = idx + context_after + 1
1021 1022 context_lines = chunk[first_line_to_include:first_line_after_context]
1022 1023
1023 1024 line_contents = [
1024 1025 _context_line(line) for line in context_lines
1025 1026 if _is_diff_content(line)]
1026 1027 # TODO: johbo: Interim fixup, the diff chunks drop the final newline.
1027 1028 # Once they are fixed, we can drop this line here.
1028 1029 if line_contents:
1029 1030 line_contents[-1] = (
1030 1031 line_contents[-1][0], line_contents[-1][1].rstrip('\n') + '\n')
1031 1032 return line_contents
1032 1033
    def find_context(self, path, context, offset=0):
        """
        Finds the given `context` inside of the diff.

        Use the parameter `offset` to specify which offset the target line has
        inside of the given `context`. This way the correct diff line will be
        returned.

        :param offset: Shall be used to specify the offset of the main line
            within the given `context`.
        """
        if offset < 0 or offset >= len(context):
            raise ValueError(
                "Only positive values up to the length of the context "
                "minus one are allowed.")

        matches = []
        file_diff = self._get_file_diff(path)

        for chunk in file_diff['chunks']:
            # Stream each chunk against `context`: the iterator advances one
            # element per matching line; exhausting it (StopIteration) means
            # the full context was matched ending just before this line.
            context_iter = iter(context)
            for line_idx, line in enumerate(chunk):
                try:
                    if _context_line(line) == context_iter.next():
                        continue
                except StopIteration:
                    matches.append((line_idx, chunk))
                # Reached on a mismatch or right after recording a match:
                # restart matching from scratch at the next line.
                # NOTE(review): overlapping candidate matches are not
                # re-tried from intermediate positions — confirm this is
                # acceptable for the contexts passed in.
                context_iter = iter(context)

            # Increment position and trigger StopIteration
            # if we had a match at the end
            # NOTE(review): relies on `line_idx` leaking out of the for loop;
            # an empty chunk would leave it undefined (or stale from the
            # previous chunk) — presumably chunks are never empty.
            line_idx += 1
            try:
                context_iter.next()
            except StopIteration:
                matches.append((line_idx, chunk))

        # A match is recorded at the line *after* its last context element;
        # step back to the line `offset` positions into the context.
        effective_offset = len(context) - offset
        found_at_diff_lines = [
            _line_to_diff_line_number(chunk[idx - effective_offset])
            for idx, chunk in matches]

        return found_at_diff_lines
1076 1077
1077 1078 def _get_file_diff(self, path):
1078 1079 for file_diff in self.parsed_diff:
1079 1080 if file_diff['filename'] == path:
1080 1081 break
1081 1082 else:
1082 1083 raise FileNotInDiffException("File {} not in diff".format(path))
1083 1084 return file_diff
1084 1085
1085 1086 def _find_chunk_line_index(self, file_diff, diff_line):
1086 1087 for chunk in file_diff['chunks']:
1087 1088 for idx, line in enumerate(chunk):
1088 1089 if line['old_lineno'] == diff_line.old:
1089 1090 return chunk, idx
1090 1091 if line['new_lineno'] == diff_line.new:
1091 1092 return chunk, idx
1092 1093 raise LineNotInDiffException(
1093 1094 "The line {} is not part of the diff.".format(diff_line))
1094 1095
1095 1096
def _is_diff_content(line):
    """
    Tell whether ``line`` carries actual diff content, i.e. its action is
    one of unmodified/add/delete.
    """
    content_actions = (Action.UNMODIFIED, Action.ADD, Action.DELETE)
    return line['action'] in content_actions
1099 1100
1100 1101
1101 1102 def _context_line(line):
1102 1103 return (line['action'], line['line'])
1103 1104
1104 1105
1105 1106 DiffLineNumber = collections.namedtuple('DiffLineNumber', ['old', 'new'])
1106 1107
1107 1108
1108 1109 def _line_to_diff_line_number(line):
1109 1110 new_line_no = line['new_lineno'] or None
1110 1111 old_line_no = line['old_lineno'] or None
1111 1112 return DiffLineNumber(old=old_line_no, new=new_line_no)
1112 1113
1113 1114
class FileNotInDiffException(Exception):
    """
    Raised when context is requested for a file missing from the diff.

    Asking for the context of a line in a file that is not part of the
    given diff triggers this exception.
    """
1121 1122
1122 1123
class LineNotInDiffException(Exception):
    """
    Raised when context is requested for a line missing from the diff.

    Asking for the context of a line of a file when that line is not part
    of the given diff triggers this exception.
    """
1130 1131
1131 1132
class DiffLimitExceeded(Exception):
    """Raised when a diff goes over the allowed processing limit."""
1134 1135
1135 1136
def cache_diff(cached_diff_file, diff, commits):
    """
    Persist a computed diff and its commits to ``cached_diff_file``.

    The payload is pickled with the highest available protocol and
    bz2-compressed to keep cache files small. Saving is best-effort: any
    failure is logged and a partially written file is removed so a
    damaged cache is never left on disk.
    """
    struct = {
        'version': 'v1',
        'diff': diff,
        'commits': commits
    }

    try:
        with bz2.BZ2File(cached_diff_file, 'wb') as f:
            # highest protocol is smaller/faster than the default text
            # protocol; pickle.load() auto-detects it on read
            pickle.dump(struct, f, pickle.HIGHEST_PROTOCOL)
        log.debug('Saved diff cache under %s', cached_diff_file)
    except Exception:
        log.warn('Failed to save cache', exc_info=True)
        # cleanup file to not store it "damaged"; the file may not exist
        # at all when opening it was what failed
        try:
            if os.path.isfile(cached_diff_file):
                os.remove(cached_diff_file)
        except Exception:
            log.exception('Failed to cleanup path %s', cached_diff_file)
1155 1156
1156 1157
def load_cached_diff(cached_diff_file):
    """
    Read a previously cached diff structure from ``cached_diff_file``.

    Returns a dict with ``version``, ``diff`` and ``commits`` keys. When
    the cache file is missing, unreadable or has an unexpected layout, a
    default structure with ``None`` payloads is returned instead.
    """
    default_struct = {
        'version': 'v1',
        'diff': None,
        'commits': None
    }

    if not os.path.isfile(cached_diff_file):
        return default_struct

    data = None
    try:
        with bz2.BZ2File(cached_diff_file, 'rb') as f:
            data = pickle.load(f)
        log.debug('Loaded diff cache from %s', cached_diff_file)
    except Exception:
        log.warn('Failed to read diff cache file', exc_info=True)

    if not data or not isinstance(data, dict):
        # unreadable cache, or an old pre-dict cache format
        data = default_struct

    return data
1185 1186
1186 1187
def generate_diff_cache_key(*args):
    """
    Helper to generate a cache key using arguments.

    Each argument is stringified, has ``/`` replaced (to prevent
    subdirectory usage in the resulting file name) and the results are
    joined with ``_``. Empty arguments are mapped to ``None`` so they
    remain visible in the key.
    """
    def arg_mapper(input_param):
        input_param = safe_str(input_param)
        # we cannot allow '/' in arguments since it would allow
        # subdirectory usage.
        # BUGFIX: str.replace returns a new string; the previous code
        # discarded the result, so '/' was never actually stripped.
        input_param = input_param.replace('/', '_')
        return input_param or None  # prevent empty string arguments

    return '_'.join([
        '{}' for i in range(len(args))]).format(*map(arg_mapper, args))
1200 1201
1201 1202
def diff_cache_exist(cache_storage, *args):
    """
    Based on all generated arguments check and return a cache path.

    :return: path of the cache file inside ``cache_storage`` (the file
        itself may or may not exist yet).
    :raises ValueError: if the generated key would escape
        ``cache_storage`` (path traversal via e.g. ``../`` in a param).
    """
    cache_key = generate_diff_cache_key(*args)
    cache_file_path = os.path.join(cache_storage, cache_key)
    # prevent path traversal attacks using some param that have e.g '../../'
    # compare against the normalized storage root including a trailing
    # separator, so sibling dirs sharing the prefix (e.g. '/cache' vs
    # '/cache2') cannot slip through a plain startswith() check
    storage_root = os.path.abspath(cache_storage)
    if not os.path.abspath(cache_file_path).startswith(
            storage_root + os.sep):
        raise ValueError('Final path must be within {}'.format(cache_storage))

    return cache_file_path
General Comments 0
You need to be logged in to leave comments. Login now