# HG changeset patch
# User Daniel Dourvaris <daniel@rhodecode.com>
# Date 2017-09-01 13:56:50
# Node ID 7939c6bfd1509cb8a818908d64bb05c6d790bbd2
# Parent  d39ea19b888c0fee7ea5b4510c4fc7e65363c1d0

diffs: use whole chunk diff to calculate if it's oversized or not.

- This fixes an issue if a file is added that has very large number of small lines.
In this case the time to detect if the diff should be limited was very very long and CPU intensive.

diff --git a/configs/development.ini b/configs/development.ini
--- a/configs/development.ini
+++ b/configs/development.ini
@@ -166,9 +166,15 @@ startup.import_repos = false
 ## change this to unique ID for security
 app_instance_uuid = rc-production
 
-## cut off limit for large diffs (size in bytes)
-cut_off_limit_diff = 1024000
-cut_off_limit_file = 256000
+## cut off limit for large diffs (size in bytes). If overall diff size on
+## commit, or pull request exceeds this limit this diff will be displayed
+## partially. E.g 512000 == 512Kb
+cut_off_limit_diff = 512000
+
+## cut off limit for large files inside diffs (size in bytes). Each individual
+## file inside diff which exceeds this limit will be displayed partially.
+##  E.g 128000 == 128Kb
+cut_off_limit_file = 128000
 
 ## use cache version of scm repo everywhere
 vcs_full_cache = true
diff --git a/configs/production.ini b/configs/production.ini
--- a/configs/production.ini
+++ b/configs/production.ini
@@ -140,9 +140,15 @@ startup.import_repos = false
 ## change this to unique ID for security
 app_instance_uuid = rc-production
 
-## cut off limit for large diffs (size in bytes)
-cut_off_limit_diff = 1024000
-cut_off_limit_file = 256000
+## cut off limit for large diffs (size in bytes). If overall diff size on
+## commit, or pull request exceeds this limit this diff will be displayed
+## partially. E.g 512000 == 512Kb
+cut_off_limit_diff = 512000
+
+## cut off limit for large files inside diffs (size in bytes). Each individual
+## file inside diff which exceeds this limit will be displayed partially.
+##  E.g 128000 == 128Kb
+cut_off_limit_file = 128000
 
 ## use cache version of scm repo everywhere
 vcs_full_cache = true
diff --git a/rhodecode/lib/diffs.py b/rhodecode/lib/diffs.py
--- a/rhodecode/lib/diffs.py
+++ b/rhodecode/lib/diffs.py
@@ -227,6 +227,7 @@ class DiffProcessor(object):
         self.parsed = False
         self.parsed_diff = []
 
+        log.debug('Initialized DiffProcessor with %s mode', format)
         if format == 'gitdiff':
             self.differ = self._highlight_line_difflib
             self._parser = self._parse_gitdiff
@@ -496,36 +497,26 @@ class DiffProcessor(object):
 
         return diff_container(sorted(_files, key=sorter))
 
-
-    # FIXME: NEWDIFFS: dan: this replaces the old _escaper function
-    def _process_line(self, string):
-        """
-        Process a diff line, checks the diff limit
-
-        :param string:
-        """
-
-        self.cur_diff_size += len(string)
-
+    def _check_large_diff(self):
+        log.debug('Diff exceeds current diff_limit of %s', self.diff_limit)
         if not self.show_full_diff and (self.cur_diff_size > self.diff_limit):
-            raise DiffLimitExceeded('Diff Limit Exceeded')
-
-        return safe_unicode(string)
+            raise DiffLimitExceeded('Diff Limit `%s` Exceeded', self.diff_limit)
 
     # FIXME: NEWDIFFS: dan: this replaces _parse_gitdiff
     def _new_parse_gitdiff(self, inline_diff=True):
         _files = []
+
+        # this can be overriden later to a LimitedDiffContainer type
         diff_container = lambda arg: arg
+
         for chunk in self._diff.chunks():
             head = chunk.header
             log.debug('parsing diff %r' % head)
 
-            diff = imap(self._process_line, chunk.diff.splitlines(1))
             raw_diff = chunk.raw
             limited_diff = False
             exceeds_limit = False
-            # if 'empty_file_to_modify_and_rename' in head['a_path']:
-            #     1/0
+
             op = None
             stats = {
                 'added': 0,
@@ -542,19 +533,22 @@ class DiffProcessor(object):
             if head['b_mode']:
                 stats['new_mode'] = head['b_mode']
 
+            # delete file
             if head['deleted_file_mode']:
                 op = OPS.DEL
                 stats['binary'] = True
                 stats['ops'][DEL_FILENODE] = 'deleted file'
 
+            # new file
             elif head['new_file_mode']:
                 op = OPS.ADD
                 stats['binary'] = True
                 stats['old_mode'] = None
                 stats['new_mode'] = head['new_file_mode']
                 stats['ops'][NEW_FILENODE] = 'new file %s' % head['new_file_mode']
-            else:  # modify operation, can be copy, rename or chmod
 
+            # modify operation, can be copy, rename or chmod
+            else:
                 # CHMOD
                 if head['new_mode'] and head['old_mode']:
                     op = OPS.MOD
@@ -602,7 +596,27 @@ class DiffProcessor(object):
 
             # a real non-binary diff
             if head['a_file'] or head['b_file']:
+                diff = iter(chunk.diff.splitlines(1))
+
+                # append each file to the diff size
+                raw_chunk_size = len(raw_diff)
+
+                exceeds_limit = raw_chunk_size > self.file_limit
+                self.cur_diff_size += raw_chunk_size
+
                 try:
+                    # Check each file instead of the whole diff.
+                    # Diff will hide big files but still show small ones.
+                    # From the tests big files are fairly safe to be parsed
+                    # but the browser is the bottleneck.
+                    if not self.show_full_diff and exceeds_limit:
+                        log.debug('File `%s` exceeds current file_limit of %s',
+                                  safe_unicode(head['b_path']), self.file_limit)
+                        raise DiffLimitExceeded(
+                            'File Limit %s Exceeded', self.file_limit)
+
+                    self._check_large_diff()
+
                     raw_diff, chunks, _stats = self._new_parse_lines(diff)
                     stats['binary'] = False
                     stats['added'] = _stats[0]
@@ -610,22 +624,12 @@ class DiffProcessor(object):
                     # explicit mark that it's a modified file
                     if op == OPS.MOD:
                         stats['ops'][MOD_FILENODE] = 'modified file'
-                    exceeds_limit = len(raw_diff) > self.file_limit
-
-                    # changed from _escaper function so we validate size of
-                    # each file instead of the whole diff
-                    # diff will hide big files but still show small ones
-                    # from my tests, big files are fairly safe to be parsed
-                    # but the browser is the bottleneck
-                    if not self.show_full_diff and exceeds_limit:
-                        raise DiffLimitExceeded('File Limit Exceeded')
 
                 except DiffLimitExceeded:
                     diff_container = lambda _diff: \
                         LimitedDiffContainer(
                             self.diff_limit, self.cur_diff_size, _diff)
 
-                    exceeds_limit = len(raw_diff) > self.file_limit
                     limited_diff = True
                     chunks = []
 
@@ -636,19 +640,20 @@ class DiffProcessor(object):
                     stats['ops'][BIN_FILENODE] = 'binary diff hidden'
                 chunks = []
 
+            # Hide content of deleted node by setting empty chunks
             if chunks and not self.show_full_diff and op == OPS.DEL:
                 # if not full diff mode show deleted file contents
                 # TODO: anderson: if the view is not too big, there is no way
                 # to see the content of the file
                 chunks = []
 
-            chunks.insert(0, [{
-                                  'old_lineno': '',
-                                  'new_lineno': '',
-                                  'action': Action.CONTEXT,
-                                  'line': msg,
-                              } for _op, msg in stats['ops'].iteritems()
-                              if _op not in [MOD_FILENODE]])
+            chunks.insert(
+                0, [{'old_lineno': '',
+                     'new_lineno': '',
+                     'action': Action.CONTEXT,
+                     'line': msg,
+                     } for _op, msg in stats['ops'].iteritems()
+                    if _op not in [MOD_FILENODE]])
 
             original_filename = safe_unicode(head['a_path'])
             _files.append({
@@ -664,7 +669,6 @@ class DiffProcessor(object):
                 'is_limited_diff': limited_diff,
             })
 
-
         sorter = lambda info: {OPS.ADD: 0, OPS.MOD: 1,
                                OPS.DEL: 2}.get(info['operation'])
 
@@ -766,18 +770,19 @@ class DiffProcessor(object):
         return ''.join(raw_diff), chunks, stats
 
     # FIXME: NEWDIFFS: dan: this replaces _parse_lines
-    def _new_parse_lines(self, diff):
+    def _new_parse_lines(self, diff_iter):
         """
         Parse the diff an return data for the template.
         """
 
-        lineiter = iter(diff)
         stats = [0, 0]
         chunks = []
         raw_diff = []
 
+        diff_iter = imap(lambda s: safe_unicode(s), diff_iter)
+
         try:
-            line = lineiter.next()
+            line = diff_iter.next()
 
             while line:
                 raw_diff.append(line)
@@ -808,7 +813,7 @@ class DiffProcessor(object):
                 old_end += old_line
                 new_end += new_line
 
-                line = lineiter.next()
+                line = diff_iter.next()
 
                 while old_line < old_end or new_line < new_end:
                     command = ' '
@@ -843,7 +848,7 @@ class DiffProcessor(object):
                         })
                         raw_diff.append(line)
 
-                    line = lineiter.next()
+                    line = diff_iter.next()
 
                     if self._newline_marker.match(line):
                         # we need to append to lines, since this is not
@@ -864,6 +869,7 @@ class DiffProcessor(object):
 
         except StopIteration:
             pass
+
         return ''.join(raw_diff), chunks, stats
 
     def _safe_id(self, idstring):