diffs.py
1143 lines
| 39.3 KiB
| text/x-python
|
PythonLexer
r5088 | # Copyright (C) 2011-2023 RhodeCode GmbH | |||
r1 | # | |||
# This program is free software: you can redistribute it and/or modify | ||||
# it under the terms of the GNU Affero General Public License, version 3 | ||||
# (only), as published by the Free Software Foundation. | ||||
# | ||||
# This program is distributed in the hope that it will be useful, | ||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||
# GNU General Public License for more details. | ||||
# | ||||
# You should have received a copy of the GNU Affero General Public License | ||||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||||
# | ||||
# This program is dual-licensed. If you wish to learn more about the | ||||
# RhodeCode Enterprise Edition, including its added features, Support services, | ||||
# and proprietary license terms, please see https://rhodecode.com/licenses/ | ||||
""" | ||||
Set of diffing helpers, previously part of vcs | ||||
""" | ||||
r5083 | import dataclasses | |||
Bartłomiej Wołyńczyk
|
r2685 | import os | ||
r2358 | import re | |||
r2690 | import bz2 | |||
r3854 | import gzip | |||
r3838 | import time | |||
r2690 | ||||
r1 | import difflib | |||
import logging | ||||
r4930 | import pickle | |||
from itertools import tee | ||||
r1 | ||||
from rhodecode.lib.vcs.exceptions import VCSError | ||||
from rhodecode.lib.vcs.nodes import FileNode, SubModuleNode | ||||
r5083 | from rhodecode.lib.vcs.backends import base | |||
from rhodecode.lib.str_utils import safe_str | ||||
r1 | ||||
log = logging.getLogger(__name__)

# Upper bound for the number of diff context lines; a file with more than
# this number of lines is unusable in a browser anyway.
MAX_CONTEXT = 20 * 1024
# Number of diff context lines used when the full context is not requested.
DEFAULT_CONTEXT = 3
def get_diff_context(request):
    """
    Return the number of diff context lines selected by ``request``.

    ``MAX_CONTEXT`` is returned when the ``fullcontext`` GET parameter equals
    ``'1'``; a missing parameter or any other value yields ``DEFAULT_CONTEXT``.
    """
    full_context_requested = request.GET.get('fullcontext', '') == '1'
    if full_context_requested:
        return MAX_CONTEXT
    return DEFAULT_CONTEXT
def get_diff_whitespace_flag(request):
    """Tell whether the ``ignorews`` GET parameter of ``request`` equals ``'1'``."""
    flag = request.GET.get('ignorews', '')
    return flag == '1'
r679 | ||||
r1 | ||||
@dataclasses.dataclass
class OPS:
    """Single-letter codes naming the operation performed on a file."""
    ADD: str = 'A'
    MOD: str = 'M'
    DEL: str = 'D'
@dataclasses.dataclass | ||||
class DiffLineNumber: | ||||
old: int | None | ||||
new: int | None | ||||
def __iter__(self): | ||||
yield self.old | ||||
yield self.new | ||||
r1 | ||||
r678 | ||||
def get_gitdiff(filenode_old, filenode_new, ignore_whitespace=True, context=3):
    """
    Returns git style diff between given ``filenode_old`` and ``filenode_new``.

    :param filenode_old: node holding the old side of the comparison
    :param filenode_new: node holding the new side of the comparison
    :param ignore_whitespace: ignore whitespaces in diff
    :param context: number of context lines; a falsy value falls back to 3
        and values above ``MAX_CONTEXT`` are reduced to ``MAX_CONTEXT``
    :return: the result of ``get_diff`` of the repository of ``filenode_new``,
        or an empty string when either node is a ``SubModuleNode``
    :raises VCSError: when one of the nodes is not a ``FileNode``
    """
    # make sure we pass in default context, and protect against IntOverflow
    # when passing HUGE context
    context = min(context or 3, MAX_CONTEXT)

    if any(isinstance(node, SubModuleNode) for node in (filenode_new, filenode_old)):
        return ''

    for node in (filenode_old, filenode_new):
        if isinstance(node, FileNode):
            continue
        raise VCSError(f"Given object should be FileNode object, not {node.__class__}")

    repo = filenode_new.commit.repository
    old_commit = filenode_old.commit or repo.EMPTY_COMMIT
    new_commit = filenode_new.commit

    return repo.get_diff(
        old_commit, new_commit, filenode_new.path,
        ignore_whitespace, context, path1=filenode_old.path)
# Keys of the ``stats['ops']`` mapping filled while a diff is parsed; every
# key names one kind of change recorded for a file.
(
    NEW_FILENODE,
    DEL_FILENODE,
    MOD_FILENODE,
    RENAMED_FILENODE,
    COPIED_FILENODE,
    CHMOD_FILENODE,
    BIN_FILENODE,
) = range(1, 8)
class LimitedDiffContainer:
    """
    Wrapper around parsed diff data that also records the limit values.

    Indexing and iteration are delegated to the wrapped ``diff`` object.
    """

    def __init__(self, diff_limit: int, cur_diff_size, diff):
        self.diff_limit = diff_limit
        self.cur_diff_size = cur_diff_size
        self.diff = diff

    def __getitem__(self, key):
        return self.diff[key]

    def __iter__(self):
        for entry in self.diff:
            yield entry
r1 | ||||
class Action:
    """
    Names of the ``action`` values carried by the lines of a parsed diff.
    """
    ADD = 'add'
    DELETE = 'del'
    UNMODIFIED = 'unmod'
    CONTEXT = 'context'
    OLD_NO_NL = 'old-no-nl'
    NEW_NO_NL = 'new-no-nl'
r1 | ||||
class DiffProcessor(object):
    """
    Give it a unified or git diff, and it returns a list of the files that were
    mentioned in the diff together with a dict of meta information that
    can be used to render it in an HTML template.

    .. note:: Unicode handling

       The original diffs are a byte sequence and can contain filenames
       in mixed encodings. This class generally returns `unicode` objects
       since the result is intended for presentation to the user.
    """
    # hunk header, e.g. `@@ -1,2 +3,4 @@ section`; both lengths are optional
    _chunk_re = re.compile(br'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)')
    _newline_marker = re.compile(br'^\\ No newline at end of file')

    # used for inline highlighter word split; the escaped HTML entities are
    # listed explicitly so that each of them stays a single token, otherwise
    # a highlight tag could end up in the middle of an entity
    _token_re = re.compile(br'()(&gt;|&lt;|&amp;|\W+?)')

    # collapse ranges of commits over given number
    _collapse_commits_over = 5
def __init__(self, diff: base.Diff, diff_format='gitdiff', diff_limit: int = 0,
             file_limit: int = 0, show_full_diff=True):
    """
    :param diff: A `Diff` object representing a diff from a vcs backend
    :param diff_format: format of diff passed, `udiff` or `gitdiff`
    :param diff_limit: define the size of diff that is considered "big";
        the cut off is checked against it unless `show_full_diff` is set
    :param file_limit: size the raw diff of a single file is compared with
    :param show_full_diff: when true, the limit checks never cut the diff
    :raises DeprecationWarning: when `diff_format` is `gitdiff`
    """
    self._diff = diff
    self._format = diff_format

    # counters of added and removed lines
    self.adds = 0
    self.removes = 0

    # limits and the running size of the processed diff
    self.diff_limit = diff_limit
    self.file_limit = file_limit
    self.show_full_diff = show_full_diff
    self.cur_diff_size = 0

    # parsing state, filled in by `prepare`
    self.parsed = False
    self.parsed_diff = []

    log.debug('Initialized DiffProcessor with %s mode', diff_format)

    if diff_format != 'gitdiff':
        self.differ = self._highlight_line_udiff
        self._parser = self._new_parse_gitdiff
        return

    # legacy mode: the handlers are still assigned, but creating the
    # instance is refused
    self.differ = self._highlight_line_difflib
    self._parser = self._parse_gitdiff
    raise DeprecationWarning('gitdiff usage is deprecated')
r1 | ||||
def _copy_iterator(self): | ||||
""" | ||||
make a fresh copy of generator, we should not iterate thru | ||||
an original as it's needed for repeating operations on | ||||
this instance of DiffProcessor | ||||
""" | ||||
self.__udiff, iterator_copy = tee(self.__udiff) | ||||
return iterator_copy | ||||
r5083 | def _escaper(self, diff_string): | |||
r1 | """ | |||
Escaper for diff escapes special chars and checks the diff limit | ||||
:param string: | ||||
""" | ||||
r5083 | self.cur_diff_size += len(diff_string) | |||
r1 | ||||
if not self.show_full_diff and (self.cur_diff_size > self.diff_limit): | ||||
raise DiffLimitExceeded('Diff Limit Exceeded') | ||||
r5083 | return diff_string \ | |||
.replace(b'&', b'&')\ | ||||
.replace(b'<', b'<')\ | ||||
.replace(b'>', b'>') | ||||
r1 | ||||
r5083 | def _line_counter(self, diff_line): | |||
r1 | """ | |||
Checks each line and bumps total adds/removes for this diff | ||||
r5083 | :param diff_line: | |||
r1 | """ | |||
r5083 | if diff_line.startswith(b'+') and not diff_line.startswith(b'+++'): | |||
r1 | self.adds += 1 | |||
r5083 | elif diff_line.startswith(b'-') and not diff_line.startswith(b'---'): | |||
r1 | self.removes += 1 | |||
r5083 | return diff_line | |||
r1 | ||||
def _highlight_line_difflib(self, line, next_):
    """
    Highlight inline changes in both lines.

    Both lines are split into tokens with ``_token_re``, the token lists are
    compared with ``difflib.SequenceMatcher`` and every fragment that is not
    equal is wrapped in ``<del>`` (old line) or ``<ins>`` (new line). The
    ``line`` entry of both dicts is replaced in place.

    :param line: parsed diff line, a dict with ``action`` and a bytes ``line``
    :param next_: the parsed diff line paired with ``line``
    """
    if line['action'] == Action.DELETE:
        old, new = line, next_
    else:
        old, new = next_, line

    # `_token_re` is a bytes pattern, so the tokens are bytes and the
    # fragments have to be assembled as bytes as well
    oldwords = self._token_re.split(old['line'])
    newwords = self._token_re.split(new['line'])

    sequence = difflib.SequenceMatcher(None, oldwords, newwords)

    oldfragments, newfragments = [], []
    for tag, i1, i2, j1, j2 in sequence.get_opcodes():
        oldfrag = b''.join(oldwords[i1:i2])
        newfrag = b''.join(newwords[j1:j2])
        if tag != 'equal':
            # empty fragments are left alone so no empty tags are emitted
            if oldfrag:
                oldfrag = b'<del>' + oldfrag + b'</del>'
            if newfrag:
                newfrag = b'<ins>' + newfrag + b'</ins>'
        oldfragments.append(oldfrag)
        newfragments.append(newfrag)

    old['line'] = b"".join(oldfragments)
    new['line'] = b"".join(newfragments)
def _highlight_line_udiff(self, line, next_): | ||||
""" | ||||
Highlight inline changes in both lines. | ||||
""" | ||||
start = 0 | ||||
limit = min(len(line['line']), len(next_['line'])) | ||||
while start < limit and line['line'][start] == next_['line'][start]: | ||||
start += 1 | ||||
end = -1 | ||||
limit -= start | ||||
while -end <= limit and line['line'][end] == next_['line'][end]: | ||||
end -= 1 | ||||
end += 1 | ||||
if start or end: | ||||
def do(l): | ||||
last = end + len(l['line']) | ||||
if l['action'] == Action.ADD: | ||||
tag = 'ins' | ||||
else: | ||||
tag = 'del' | ||||
r5083 | l['line'] = f"{l['line'][:start]}<{tag}>{l['line'][start:last]}</{tag}>{l['line'][last:]}" | |||
r1 | do(line) | |||
do(next_) | ||||
r5083 | def _clean_line(self, line, command: str): | |||
r1 | if command in ['+', '-', ' ']: | |||
# only modify the line if it's actually a diff thing | ||||
line = line[1:] | ||||
return line | ||||
def _parse_gitdiff(self, inline_diff=True):
    """
    Legacy parser, assigned to ``self._parser`` for the `gitdiff` format.

    Every chunk of ``self._diff`` becomes a dict describing one file
    (filename, revisions, operation, stats, chunks and raw diff).

    :param inline_diff: when true, ``self.differ`` is applied to pairs of
        consecutive changed lines before the result is returned
    :return: the file dicts sorted by operation (add, modify, delete); they
        are wrapped in a ``LimitedDiffContainer`` once a limit was exceeded
    """
    _files = []

    # identity wrapper, redefined below once a diff limit is exceeded
    def diff_container(arg):
        return arg

    for chunk in self._diff.chunks():
        head = chunk.header

        # `map` is lazy: the lines are escaped (and the diff limit is
        # checked) only while `_parse_lines` consumes them
        diff = map(self._escaper, self.diff_splitter(chunk.diff))
        raw_diff = chunk.raw
        limited_diff = False
        exceeds_limit = False

        op = None
        stats = {
            'added': 0,
            'deleted': 0,
            'binary': False,
            'ops': {},
        }

        # `binary` is set for every detected operation and reset further
        # down once the lines of the diff were parsed
        if head['deleted_file_mode']:
            op = OPS.DEL
            stats['binary'] = True
            stats['ops'][DEL_FILENODE] = 'deleted file'

        elif head['new_file_mode']:
            op = OPS.ADD
            stats['binary'] = True
            stats['ops'][NEW_FILENODE] = f"new file {safe_str(head['new_file_mode'])}"
        else:  # modify operation, can be: copy, rename or chmod

            # CHMOD
            if head['new_mode'] and head['old_mode']:
                op = OPS.MOD
                stats['binary'] = True
                stats['ops'][CHMOD_FILENODE] = f"modified file chmod {safe_str(head['old_mode'])} => {safe_str(head['new_mode'])}"
            # RENAME
            if head['rename_from'] != head['rename_to']:
                op = OPS.MOD
                stats['binary'] = True
                stats['ops'][RENAMED_FILENODE] = f"file renamed from {safe_str(head['rename_from'])} to {safe_str(head['rename_to'])}"
            # COPY
            if head.get('copy_from') and head.get('copy_to'):
                op = OPS.MOD
                stats['binary'] = True
                stats['ops'][COPIED_FILENODE] = f"file copied from {safe_str(head['copy_from'])} to {safe_str(head['copy_to'])}"

        # If our new parsed headers didn't match anything fallback to
        # old style detection
        if op is None:
            if not head['a_file'] and head['b_file']:
                op = OPS.ADD
                stats['binary'] = True
                stats['ops'][NEW_FILENODE] = 'new file'

            elif head['a_file'] and not head['b_file']:
                op = OPS.DEL
                stats['binary'] = True
                stats['ops'][DEL_FILENODE] = 'deleted file'

        # it's not ADD not DELETE
        if op is None:
            op = OPS.MOD
            stats['binary'] = True
            stats['ops'][MOD_FILENODE] = 'modified file'

        # a real non-binary diff
        if head['a_file'] or head['b_file']:
            try:
                raw_diff, chunks, _stats = self._parse_lines(diff)
                stats['binary'] = False
                stats['added'] = _stats[0]
                stats['deleted'] = _stats[1]
                # explicit mark that it's a modified file
                if op == OPS.MOD:
                    stats['ops'][MOD_FILENODE] = 'modified file'
                exceeds_limit = len(raw_diff) > self.file_limit

                # changed from _escaper function so we validate size of
                # each file instead of the whole diff
                # diff will hide big files but still show small ones
                # from my tests, big files are fairly safe to be parsed
                # but the browser is the bottleneck
                if not self.show_full_diff and exceeds_limit:
                    raise DiffLimitExceeded('File Limit Exceeded')

            except DiffLimitExceeded:
                # from now on the final result is wrapped in a
                # LimitedDiffContainer
                def diff_container(_diff):
                    return LimitedDiffContainer(self.diff_limit, self.cur_diff_size, _diff)

                exceeds_limit = len(raw_diff) > self.file_limit
                limited_diff = True
                chunks = []

        else:  # GIT format binary patch, or possibly empty diff
            if head['bin_patch']:
                # we have operation already extracted, but we mark simply
                # it's a diff we won't show for binary files
                stats['ops'][BIN_FILENODE] = 'binary diff hidden'
            chunks = []

        if chunks and not self.show_full_diff and op == OPS.DEL:
            # if not full diff mode show deleted file contents
            # TODO: anderson: if the view is not too big, there is no way
            # to see the content of the file
            chunks = []

        # one context line per recorded operation (except the plain
        # "modified file" one), placed in front of the parsed chunks
        frag = [{
            'old_lineno': '',
            'new_lineno': '',
            'action': Action.CONTEXT,
            'line': msg,
        } for _op, msg in list(stats['ops'].items())
            if _op not in [MOD_FILENODE]]

        chunks.insert(0, frag)

        _files.append({
            'filename': safe_str(head['b_path']),
            'old_revision': head['a_blob_id'],
            'new_revision': head['b_blob_id'],
            'chunks': chunks,
            'raw_diff': safe_str(raw_diff),
            'operation': op,
            'stats': stats,
            'exceeds_limit': exceeds_limit,
            'is_limited_diff': limited_diff,
        })

    # sort key: added files first, then modified ones, deleted ones last
    def operation_sorter(info):
        return {OPS.ADD: 0, OPS.MOD: 1, OPS.DEL: 2}.get(info['operation'])

    if not inline_diff:
        return diff_container(sorted(_files, key=operation_sorter))

    # highlight inline changes
    for diff_data in _files:
        for chunk in diff_data['chunks']:
            lineiter = iter(chunk)
            try:
                while 1:
                    line = next(lineiter)
                    if line['action'] not in (
                            Action.UNMODIFIED, Action.CONTEXT):
                        # a changed line is paired with the line that
                        # follows it; the pair is highlighted only when
                        # that line is a change of a different kind
                        nextline = next(lineiter)
                        if nextline['action'] in ['unmod', 'context'] or \
                                nextline['action'] == line['action']:
                            continue
                        self.differ(line, nextline)
            except StopIteration:
                # raised by `next` once the lines of the chunk are used up
                pass

    return diff_container(sorted(_files, key=operation_sorter))
r1 | ||||
r2070 | def _check_large_diff(self): | |||
r4324 | if self.diff_limit: | |||
log.debug('Checking if diff exceeds current diff_limit of %s', self.diff_limit) | ||||
r1030 | if not self.show_full_diff and (self.cur_diff_size > self.diff_limit): | |||
r5083 | raise DiffLimitExceeded(f'Diff Limit `{self.diff_limit}` Exceeded') | |||
r1030 | ||||
# FIXME: NEWDIFFS: dan: this replaces _parse_gitdiff
def _new_parse_gitdiff(self, inline_diff=True):
    """
    Parse every chunk of ``self._diff`` into a dict describing one file.

    Each dict holds the original and the new filename, the blob ids, the
    operation, the stats, the parsed hunks and the raw diff of the file.

    :param inline_diff: not used by this parser
    :return: the file dicts sorted by operation (add, modify, delete); they
        are wrapped in a ``LimitedDiffContainer`` once a limit was exceeded
    """
    _files = []

    # this can be overridden later to a LimitedDiffContainer type
    def diff_container(arg):
        return arg

    for chunk in self._diff.chunks():
        head = chunk.header_as_str
        log.debug('parsing diff chunk %r', chunk)

        raw_diff = chunk.raw
        limited_diff = False
        exceeds_limit = False

        op = None
        stats = {
            'added': 0,
            'deleted': 0,
            'binary': False,
            'old_mode': '',
            'new_mode': '',
            'ops': {},
        }
        if head['old_mode']:
            stats['old_mode'] = head['old_mode']
        if head['new_mode']:
            stats['new_mode'] = head['new_mode']
        # checked last, so `b_mode` wins over `new_mode` when both are set
        if head['b_mode']:
            stats['new_mode'] = head['b_mode']

        # `binary` is set for every detected operation and reset further
        # down once the lines of the diff were parsed

        # delete file
        if head['deleted_file_mode']:
            op = OPS.DEL
            stats['binary'] = True
            stats['ops'][DEL_FILENODE] = 'deleted file'

        # new file
        elif head['new_file_mode']:
            op = OPS.ADD
            stats['binary'] = True
            stats['old_mode'] = ''
            stats['new_mode'] = head['new_file_mode']
            stats['ops'][NEW_FILENODE] = f"new file {head['new_file_mode']}"

        # modify operation, can be: copy, rename or chmod
        else:
            # CHMOD
            if head['new_mode'] and head['old_mode']:
                op = OPS.MOD
                stats['binary'] = True
                stats['ops'][CHMOD_FILENODE] = f"modified file chmod {head['old_mode']} => {head['new_mode']}"

            # RENAME
            if head['rename_from'] != head['rename_to']:
                op = OPS.MOD
                stats['binary'] = True
                stats['renamed'] = (head['rename_from'], head['rename_to'])
                stats['ops'][RENAMED_FILENODE] = f"file renamed from {head['rename_from']} to {head['rename_to']}"
            # COPY
            if head.get('copy_from') and head.get('copy_to'):
                op = OPS.MOD
                stats['binary'] = True
                stats['copied'] = (head['copy_from'], head['copy_to'])
                stats['ops'][COPIED_FILENODE] = f"file copied from {head['copy_from']} to {head['copy_to']}"

        # If our new parsed headers didn't match anything fallback to
        # old style detection
        if op is None:
            if not head['a_file'] and head['b_file']:
                op = OPS.ADD
                stats['binary'] = True
                stats['new_file'] = True
                stats['ops'][NEW_FILENODE] = 'new file'

            elif head['a_file'] and not head['b_file']:
                op = OPS.DEL
                stats['binary'] = True
                stats['ops'][DEL_FILENODE] = 'deleted file'

        # it's not ADD not DELETE
        if op is None:
            op = OPS.MOD
            stats['binary'] = True
            stats['ops'][MOD_FILENODE] = 'modified file'

        # a real non-binary diff
        if head['a_file'] or head['b_file']:
            # simulate splitlines, so we keep the line end part
            diff = self.diff_splitter(chunk.diff)

            # append each file to the diff size
            raw_chunk_size = len(raw_diff)

            exceeds_limit = raw_chunk_size > self.file_limit
            self.cur_diff_size += raw_chunk_size

            try:
                # Check each file instead of the whole diff.
                # Diff will hide big files but still show small ones.
                # From the tests big files are fairly safe to be parsed
                # but the browser is the bottleneck.
                if not self.show_full_diff and exceeds_limit:
                    log.debug('File `%s` exceeds current file_limit of %s',
                              head['b_path'], self.file_limit)
                    raise DiffLimitExceeded(f'File Limit {self.file_limit} Exceeded')

                # the accumulated size of all files parsed so far is
                # checked as well
                self._check_large_diff()

                raw_diff, chunks, _stats = self._new_parse_lines(diff)
                stats['binary'] = False
                stats['added'] = _stats[0]
                stats['deleted'] = _stats[1]
                # explicit mark that it's a modified file
                if op == OPS.MOD:
                    stats['ops'][MOD_FILENODE] = 'modified file'

            except DiffLimitExceeded:
                # `self.cur_diff_size` is read when the wrapper is called
                # at the end of this method, not at this point
                def limited_diff_container(_diff):
                    return LimitedDiffContainer(self.diff_limit, self.cur_diff_size, _diff)

                # re-definition of our container wrapper
                diff_container = limited_diff_container

                limited_diff = True
                chunks = []

        else:  # GIT format binary patch, or possibly empty diff
            if head['bin_patch']:
                # we have operation already extracted, but we mark simply
                # it's a diff we won't show for binary files
                stats['ops'][BIN_FILENODE] = 'binary diff hidden'
            chunks = []

        # Hide content of deleted node by setting empty chunks
        if chunks and not self.show_full_diff and op == OPS.DEL:
            # if not full diff mode show deleted file contents
            # TODO: anderson: if the view is not too big, there is no way
            # to see the content of the file
            chunks = []

        # one context line per recorded operation (except the plain
        # "modified file" one); unlike the parsed hunks this first chunk
        # is a list, not a dict
        frag = [
            {'old_lineno': '',
             'new_lineno': '',
             'action': Action.CONTEXT,
             'line': msg,
             } for _op, msg in list(stats['ops'].items())
            if _op not in [MOD_FILENODE]]

        chunks.insert(0, frag)

        original_filename = safe_str(head['a_path'])
        _files.append({
            'original_filename': original_filename,
            'filename': safe_str(head['b_path']),
            'old_revision': head['a_blob_id'],
            'new_revision': head['b_blob_id'],
            'chunks': chunks,
            'raw_diff': safe_str(raw_diff),
            'operation': op,
            'stats': stats,
            'exceeds_limit': exceeds_limit,
            'is_limited_diff': limited_diff,
        })

    # sort key: added files first, then modified ones, deleted ones last
    def sorter(info):
        return {OPS.ADD: 0, OPS.MOD: 1, OPS.DEL: 2}.get(info['operation'])

    return diff_container(sorted(_files, key=sorter))
# FIXME: NEWDIFFS: dan: this gets replaced by _new_parse_lines
def _parse_lines(self, diff_iter):
    """
    Parse the diff and return data for the template.

    :param diff_iter: iterator over the bytes lines of the diff of one file
    :return: tuple ``(raw_diff, chunks, stats)``: the joined bytes of the
        collected lines, one list of line dicts per hunk, and the
        ``[added, deleted]`` line counters
    """
    stats = [0, 0]
    chunks = []
    raw_diff = []

    try:
        line = next(diff_iter)

        while line:
            raw_diff.append(line)
            lines = []
            chunks.append(lines)

            match = self._chunk_re.match(line)
            if not match:
                break

            gr = match.groups()
            (old_line, old_end,
             new_line, new_end) = [int(x or 1) for x in gr[:-1]]
            old_line -= 1
            new_line -= 1
            old_end += old_line
            new_end += new_line

            # the hunk header is shown as a context line, unless the hunk
            # starts at the first line of the file
            if int(gr[0]) > 1:
                lines.append({
                    'old_lineno': '...',
                    'new_lineno': '...',
                    'action': Action.CONTEXT,
                    'line': line,
                })

            line = next(diff_iter)

            while old_line < old_end or new_line < new_end:
                command = ' '
                if line:
                    # indexing bytes gives an int, the marker has to be
                    # converted before it is compared with str values
                    command = chr(line[0])

                affects_old = affects_new = False

                # ignore those if we don't expect them
                if command in '#@':
                    # move on to the next line, otherwise the same line
                    # would be examined forever
                    line = next(diff_iter)
                    continue
                elif command == '+':
                    affects_new = True
                    action = Action.ADD
                    stats[0] += 1
                elif command == '-':
                    affects_old = True
                    action = Action.DELETE
                    stats[1] += 1
                else:
                    affects_old = affects_new = True
                    action = Action.UNMODIFIED

                if not self._newline_marker.match(line):
                    old_line += affects_old
                    new_line += affects_new
                    lines.append({
                        'old_lineno': affects_old and old_line or b'',
                        'new_lineno': affects_new and new_line or b'',
                        'action': action,
                        'line': self._clean_line(line, command)
                    })
                    raw_diff.append(line)

                line = next(diff_iter)

            if self._newline_marker.match(line):
                # we need to append to lines, since this is not
                # counted in the line specs of diff
                lines.append({
                    'old_lineno': '...',
                    'new_lineno': '...',
                    'action': Action.CONTEXT,
                    'line': self._clean_line(line, command)
                })

    except StopIteration:
        # the iterator is used up, everything was parsed
        pass

    # the collected lines are bytes
    return b''.join(raw_diff), chunks, stats
# FIXME: NEWDIFFS: dan: this replaces _parse_lines
def _new_parse_lines(self, diff_iter):
    """
    Parse the diff and return data for the template.

    :param diff_iter: iterator over the bytes lines of the diff of one file
    :return: tuple ``(raw_diff, chunks, stats)``: the joined bytes of the
        collected lines, one dict per hunk (section header, source and
        target start/length, parsed lines) and the ``[added, deleted]``
        line counters
    :raises Exception: when a trailing "no newline" marker follows a line
        that affects neither the old nor the new side
    """
    stats = [0, 0]
    chunks = []
    raw_diff = []

    try:
        line = next(diff_iter)
        assert isinstance(line, bytes)

        while line:
            raw_diff.append(line)
            # match header e.g @@ -0,0 +1 @@\n'
            match = self._chunk_re.match(line)

            if not match:
                break

            gr = match.groups()

            # a length that is missing in the header counts as 1
            (old_line, old_end,
             new_line, new_end) = [int(x or 1) for x in gr[:-1]]

            lines = []
            hunk = {
                'section_header': gr[-1],
                'source_start': old_line,
                'source_length': old_end,
                'target_start': new_line,
                'target_length': new_end,
                'lines': lines,
            }
            chunks.append(hunk)

            # turn the start/length pairs into a running counter and the
            # value it has once the hunk is complete
            old_line -= 1
            new_line -= 1
            old_end += old_line
            new_end += new_line

            line = next(diff_iter)

            while old_line < old_end or new_line < new_end:
                command = ' '
                if line:
                    # This is bytes, so we need to convert it to a str
                    command = chr(line[0])

                affects_old = affects_new = False

                # ignore those if we don't expect them
                if command in '#@':
                    # move on to the next line, otherwise the same line
                    # would be examined forever
                    line = next(diff_iter)
                    continue
                elif command == '+':
                    affects_new = True
                    action = Action.ADD
                    stats[0] += 1
                elif command == '-':
                    affects_old = True
                    action = Action.DELETE
                    stats[1] += 1
                else:
                    affects_old = affects_new = True
                    action = Action.UNMODIFIED

                if not self._newline_marker.match(line):
                    old_line += affects_old
                    new_line += affects_new
                    lines.append({
                        'old_lineno': affects_old and old_line or None,
                        'new_lineno': affects_new and new_line or None,
                        'action': action,
                        'line': self._clean_line(line, command)
                    })
                    raw_diff.append(line)

                line = next(diff_iter)

            if self._newline_marker.match(line):
                # we need to append to lines, since this is not
                # counted in the line specs of diff
                if affects_old:
                    action = Action.OLD_NO_NL
                elif affects_new:
                    action = Action.NEW_NO_NL
                else:
                    raise Exception('invalid context for no newline')

                lines.append({
                    'old_lineno': None,
                    'new_lineno': None,
                    'action': action,
                    'line': self._clean_line(line, command)
                })

    except StopIteration:
        # the iterator is used up, everything was parsed
        pass

    return b''.join(raw_diff), chunks, stats
r1030 | ||||
r1 | def _safe_id(self, idstring): | |||
"""Make a string safe for including in an id attribute. | ||||
The HTML spec says that id attributes 'must begin with | ||||
a letter ([A-Za-z]) and may be followed by any number | ||||
of letters, digits ([0-9]), hyphens ("-"), underscores | ||||
("_"), colons (":"), and periods (".")'. These regexps | ||||
are slightly over-zealous, in that they remove colons | ||||
and periods unnecessarily. | ||||
Whitespace is transformed into underscores, and then | ||||
anything which is not a hyphen or a character that | ||||
matches \w (alphanumerics and underscore) is removed. | ||||
""" | ||||
# Transform all whitespace to underscore | ||||
r5083 | idstring = re.sub(r'\s', "_", f'{idstring}') | |||
r1 | # Remove everything that is not a hyphen or a member of \w | |||
idstring = re.sub(r'(?!-)\W', "", idstring).lower() | ||||
return idstring | ||||
r2546 | @classmethod | |||
r5083 | def diff_splitter(cls, diff_string: bytes): | |||
r2546 | """ | |||
Diff split that emulates .splitlines() but works only on \n | ||||
""" | ||||
r5083 | if not diff_string: | |||
r2546 | return | |||
r5083 | elif diff_string == b'\n': | |||
yield b'\n' | ||||
r2546 | else: | |||
r5083 | has_newline = diff_string.endswith(b'\n') | |||
elements = diff_string.split(b'\n') | ||||
r2546 | if has_newline: | |||
# skip last element as it's empty string from newlines | ||||
elements = elements[:-1] | ||||
len_elements = len(elements) | ||||
for cnt, line in enumerate(elements, start=1): | ||||
last_line = cnt == len_elements | ||||
if last_line and not has_newline: | ||||
r5083 | yield line | |||
r2546 | else: | |||
r5083 | yield line + b'\n' | |||
r2546 | ||||
r1 | def prepare(self, inline_diff=True): | |||
""" | ||||
Prepare the passed udiff for HTML rendering. | ||||
:return: A list of dicts with diff information. | ||||
""" | ||||
parsed = self._parser(inline_diff=inline_diff) | ||||
self.parsed = True | ||||
self.parsed_diff = parsed | ||||
return parsed | ||||
def as_raw(self, diff_lines=None): | ||||
""" | ||||
Returns raw diff as a byte string | ||||
""" | ||||
r5083 | return self._diff.raw.tobytes() | |||
r1 | ||||
def stat(self): | ||||
""" | ||||
Returns tuple of added, and removed lines for this instance | ||||
""" | ||||
return self.adds, self.removes | ||||
def get_context_of_line(
        self, path, diff_line: DiffLineNumber = None, context_before: int = 3, context_after: int = 3):
    """
    Returns the context lines for the specified diff line.

    :param path: filename of the file inside the parsed diff
    :param diff_line: line to look up; one of its two numbers has to be
        ``None``
    :param context_before: number of lines taken before the found line
    :param context_after: number of lines taken after the found line
    :return: list of ``(action, line)`` tuples of the added, deleted and
        unmodified lines inside that window
    :raises ValueError: when both numbers of ``diff_line`` are set
    """
    assert self.parsed, "DiffProcessor is not initialized."

    if None not in diff_line:
        raise ValueError(f"Cannot specify both line numbers in diff_line: {diff_line}")

    file_diff = self._get_file_diff(path)
    chunk, idx = self._find_chunk_line_index(file_diff, diff_line)

    window_start = max(idx - context_before, 0)
    window_stop = idx + context_after + 1

    line_contents = []
    for candidate in chunk['lines'][window_start:window_stop]:
        if _is_diff_content(candidate):
            line_contents.append(_context_line(candidate))

    # TODO: johbo: Interim fixup, the diff chunks drop the final newline.
    # Once they are fixed, we can drop this line here.
    if line_contents:
        action, text = line_contents[-1]
        line_contents[-1] = (action, text.rstrip(b'\n') + b'\n')
    return line_contents
def find_context(self, path, context, offset=0):
    """
    Finds the given `context` inside of the diff.

    Use the parameter `offset` to specify which offset the target line has
    inside of the given `context`. This way the correct diff line will be
    returned.

    :param path: filename of the file inside the parsed diff
    :param context: sequence of items which are compared with the
        ``(action, line)`` tuples of the diff lines
    :param offset: Shall be used to specify the offset of the main line
        within the given `context`.
    :return: list with one ``DiffLineNumber`` per match
    :raises ValueError: when `offset` is negative or not smaller than the
        length of `context`
    """
    if offset < 0 or offset >= len(context):
        raise ValueError(
            "Only positive values up to the length of the context "
            "minus one are allowed.")

    matches = []
    file_diff = self._get_file_diff(path)

    for chunk in file_diff['chunks']:
        # only dict chunks have a `lines` entry, anything else is skipped
        if not isinstance(chunk, dict):
            continue
        context_iter = iter(context)
        for line_idx, line in enumerate(chunk['lines']):
            try:
                if _context_line(line) == next(context_iter):
                    continue
            except StopIteration:
                # the context is used up: record a match at the current
                # index and start over with a fresh iterator
                matches.append((line_idx, chunk))
                context_iter = iter(context)

        # Increment position and trigger StopIteration
        # if we had a match at the end
        line_idx += 1
        try:
            next(context_iter)
        except StopIteration:
            matches.append((line_idx, chunk))

    # a recorded index lies one past the last line of the matched context,
    # so stepping back by this amount gives the line at `offset`
    effective_offset = len(context) - offset
    found_at_diff_lines = [
        _line_to_diff_line_number(chunk['lines'][idx - effective_offset])
        for idx, chunk in matches]

    return found_at_diff_lines
def _get_file_diff(self, path): | ||||
for file_diff in self.parsed_diff: | ||||
if file_diff['filename'] == path: | ||||
break | ||||
else: | ||||
r5083 | raise FileNotInDiffException(f"File {path} not in diff") | |||
r1 | return file_diff | |||
def _find_chunk_line_index(self, file_diff, diff_line): | ||||
for chunk in file_diff['chunks']: | ||||
r5083 | if not isinstance(chunk, dict): | |||
continue | ||||
for line_idx, line in enumerate(chunk['lines']): | ||||
if diff_line.old and line['old_lineno'] == diff_line.old: | ||||
return chunk, line_idx | ||||
if diff_line.new and line['new_lineno'] == diff_line.new: | ||||
return chunk, line_idx | ||||
raise LineNotInDiffException(f"The line {diff_line} is not part of the diff.") | ||||
r1 | ||||
def _is_diff_content(line):
    """
    Tell whether the 'action' of `line` is one of `Action.UNMODIFIED`,
    `Action.ADD` or `Action.DELETE`.
    """
    content_actions = (Action.UNMODIFIED, Action.ADD, Action.DELETE)
    return line['action'] in content_actions
def _context_line(line): | ||||
r5083 | return line['action'], line['line'] | |||
r1 | ||||
def _line_to_diff_line_number(line):
    """
    Build a `DiffLineNumber` out of the 'old_lineno' and 'new_lineno' values
    of `line`. Falsy line numbers are normalized to `None`.
    """
    new_value = line['new_lineno']
    old_value = line['old_lineno']
    return DiffLineNumber(
        old=old_value if old_value else None,
        new=new_value if new_value else None)
class FileNotInDiffException(Exception):
    """
    Signals that a requested file is not part of the diff.

    It is raised when a file path is looked up for which the parsed diff
    has no entry.
    """
class LineNotInDiffException(Exception):
    """
    Signals that a requested line is not part of the diff.

    It is raised when a line of a file is looked up and none of the diff
    lines of that file carries the requested line number.
    """
class DiffLimitExceeded(Exception):
    """
    Exception type named after an exceeded diff limit.

    NOTE(review): the places raising it are outside of this part of the
    module, confirm the exact limit semantics there.
    """
Bartłomiej Wołyńczyk
|
r2685 | |||
# NOTE(marcink): if diffs.mako changes, this probably
# needs a bump to the next version
CURRENT_DIFF_VERSION = 'v5'
r3079 | ||||
def _cleanup_cache_file(cached_diff_file): | ||||
# cleanup file to not store it "damaged" | ||||
try: | ||||
os.remove(cached_diff_file) | ||||
except Exception: | ||||
log.exception('Failed to cleanup path %s', cached_diff_file) | ||||
r3854 | def _get_compression_mode(cached_diff_file): | |||
mode = 'bz2' | ||||
if 'mode:plain' in cached_diff_file: | ||||
mode = 'plain' | ||||
elif 'mode:gzip' in cached_diff_file: | ||||
mode = 'gzip' | ||||
return mode | ||||
Bartłomiej Wołyńczyk
|
def cache_diff(cached_diff_file, diff, commits):
    """
    Pickle `diff` and `commits` into the cache file `cached_diff_file`.

    The data is stored together with `CURRENT_DIFF_VERSION`. The file is
    written plain, gzip or bz2 compressed, depending on the result of
    `_get_compression_mode` for the given path.

    Writing is best effort: on any error a warning is logged and the
    possibly damaged file is removed, nothing is raised to the caller.

    :param cached_diff_file: Path of the cache file to write.
    :param diff: Diff data to store under the 'diff' key.
    :param commits: Commit data to store under the 'commits' key.
    """
    compression_mode = _get_compression_mode(cached_diff_file)

    struct = {
        'version': CURRENT_DIFF_VERSION,
        'diff': diff,
        'commits': commits
    }

    start = time.time()
    try:
        if compression_mode == 'plain':
            with open(cached_diff_file, 'wb') as f:
                pickle.dump(struct, f)
        elif compression_mode == 'gzip':
            with gzip.GzipFile(cached_diff_file, 'wb') as f:
                pickle.dump(struct, f)
        else:
            with bz2.BZ2File(cached_diff_file, 'wb') as f:
                pickle.dump(struct, f)
    except Exception:
        log.warning('Failed to save cache', exc_info=True)
        _cleanup_cache_file(cached_diff_file)
    else:
        # report success only if the file really has been written
        log.debug('Saved diff cache under %s in %.4fs', cached_diff_file, time.time() - start)
r3839 | ||||
Bartłomiej Wołyńczyk
|
r2685 | |||
def load_cached_diff(cached_diff_file):
    """
    Load the pickled cache structure from `cached_diff_file`.

    The file is read plain, gzip or bz2 compressed, depending on the result
    of `_get_compression_mode` for the given path.

    :param cached_diff_file: Path of the cache file to read.
    :return: The unpickled dict if it is non-empty and its 'version' equals
        `CURRENT_DIFF_VERSION`. Otherwise a default dict with the current
        version and `None` for 'diff' and 'commits'. The default is used
        when the file does not exist, cannot be read, holds an empty or
        non-dict payload, or holds a different version. In the last case
        the cache file is removed as well.
    """
    compression_mode = _get_compression_mode(cached_diff_file)

    default_struct = {
        'version': CURRENT_DIFF_VERSION,
        'diff': None,
        'commits': None
    }

    if not os.path.isfile(cached_diff_file):
        log.debug('Reading diff cache file failed %s', cached_diff_file)
        return default_struct

    start = time.time()
    try:
        # NOTE(review): unpickling executes code embedded in the payload,
        # the cache storage must only contain files written by this
        # application.
        if compression_mode == 'plain':
            with open(cached_diff_file, 'rb') as f:
                data = pickle.load(f)
        elif compression_mode == 'gzip':
            with gzip.GzipFile(cached_diff_file, 'rb') as f:
                data = pickle.load(f)
        else:
            with bz2.BZ2File(cached_diff_file, 'rb') as f:
                data = pickle.load(f)
    except Exception:
        log.warning('Failed to read diff cache file', exc_info=True)
        return default_struct

    if not data or not isinstance(data, dict):
        # empty payload, or old version of data ?
        return default_struct

    # check version
    if data.get('version') != CURRENT_DIFF_VERSION:
        # purge cache
        _cleanup_cache_file(cached_diff_file)
        return default_struct

    # report success only if usable data is handed out
    log.debug('Loaded diff cache from %s in %.4fs', cached_diff_file, time.time() - start)
    return data
def generate_diff_cache_key(*args):
    """
    Helper to generate a cache key using arguments

    Each argument is converted with `safe_str`, every '/' in it is replaced
    by '_', and the results are joined with '_'. An argument which ends up
    as an empty string is rendered as 'None'.

    :param args: Values which shall be part of the key.
    :return: The cache key as a string, an empty string without arguments.
    """

    def arg_mapper(input_param):
        input_param = safe_str(input_param)
        # we cannot allow '/' in arguments since it would allow
        # subdirectory usage. Strings are immutable, so the result of
        # `replace` has to be assigned back.
        input_param = input_param.replace('/', '_')
        return input_param or None  # prevent empty string arguments

    return '_'.join(f'{arg_mapper(arg)}' for arg in args)
Bartłomiej Wołyńczyk
|
r2685 | |||
def diff_cache_exist(cache_storage, *args):
    """
    Based on all generated arguments check and return a cache path

    The marker 'mode:gzip' is appended to the arguments before the cache
    key is generated with `generate_diff_cache_key`.

    :param cache_storage: Directory which holds the cache files.
    :param args: Values which are used to build the cache key.
    :return: Path of the cache file, i.e. `cache_storage` joined with the
        cache key.
    :raises ValueError: If the resulting path is not located within
        `cache_storage`.
    """
    args = list(args) + ['mode:gzip']

    cache_key = generate_diff_cache_key(*args)
    cache_file_path = os.path.join(cache_storage, cache_key)

    # prevent path traversal attacks using some param that have e.g '../../'
    # Both paths are normalized and compared by path components. A plain
    # string prefix check would accept a sibling directory which only
    # shares the prefix, e.g. '/cache_evil' for '/cache'.
    storage_root = os.path.abspath(cache_storage)
    resolved_path = os.path.abspath(cache_file_path)
    if os.path.commonpath([storage_root, resolved_path]) != storage_root:
        raise ValueError(f'Final path must be within {cache_storage}')

    return cache_file_path