##// END OF EJS Templates
bdiff: rearrange the "better longest match" code...
bdiff: rearrange the "better longest match" code This is primarily to make the code more managable and prepare for later changes. More specific assignments might also be slightly faster, even thought it also might generate a bit more code.

File last commit:

r30430:5c4e2636 default
r30430:5c4e2636 default
Show More
bdiff.c
311 lines | 6.6 KiB | text/x-c | CLexer
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 /*
bdiff.c - efficient binary diff extension for Mercurial
Vadim Gelfer
update copyrights.
r2859 Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400
This software may be used and distributed according to the terms of
the GNU General Public License, incorporated herein by reference.
Based roughly on Python difflib
*/
#include <stdlib.h>
#include <string.h>
Matt Mackall
bdiff: use INT_MAX to avoid some inner loop comparisons
r5341 #include <limits.h>
tksoh@users.sourceforge.net
Allow Mercurial to build on HP-UX 11...
r867
Maciej Fijalkowski
bdiff: use ssize_t in favor of Py_ssize_t in cpython-unaware locations...
r29539 #include "compat.h"
Maciej Fijalkowski
internals: move the bitmanipulation routines into its own file...
r29444 #include "bitmanipulation.h"
Maciej Fijalkowski
bdiff: split bdiff into cpy-aware and cpy-agnostic part
r29541 #include "bdiff.h"
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400
Gregory Szorc
bdiff: replace hash algorithm...
r30318 /* Hash implementation from diffutils */
#define ROL(v, n) ((v) << (n) | (v) >> (sizeof(v) * CHAR_BIT - (n)))
#define HASH(h, c) ((c) + ROL(h ,7))
mpm@selenic.com
Minor speed improvements for bdiff...
r474 struct pos {
int pos, len;
};
Maciej Fijalkowski
bdiff: split bdiff into cpy-aware and cpy-agnostic part
r29541 int bdiff_splitlines(const char *a, ssize_t len, struct bdiff_line **lr)
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 {
Markus F.X.J. Oberhumer
bdiff.c: rename all variables which hold a hash value to "hash"
r13732 unsigned hash;
Markus F.X.J. Oberhumer
bdiff.c: use unsigned arithmetic for hash computation...
r13731 int i;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 const char *p, *b = a;
Christoph Spiel
bdiff: simple splitlines optimization
r5340 const char * const plast = a + len - 1;
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 struct bdiff_line *l;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400
/* count the lines */
i = 1; /* extra line for sentinel */
Gregory Szorc
bdiff: don't check border condition in loop...
r30308 for (p = a; p < plast; p++)
if (*p == '\n')
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 i++;
Gregory Szorc
bdiff: don't check border condition in loop...
r30308 if (p == plast)
i++;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 *lr = l = (struct bdiff_line *)malloc(sizeof(struct bdiff_line) * i);
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 if (!l)
return -1;
/* build the line array and calculate hashes */
Markus F.X.J. Oberhumer
bdiff.c: rename all variables which hold a hash value to "hash"
r13732 hash = 0;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 for (p = a; p < a + len; p++) {
Gregory Szorc
bdiff: replace hash algorithm...
r30318 hash = HASH(hash, *p);
Matt Mackall
bdiff: switch to lyhash...
r5342
Christoph Spiel
bdiff: simple splitlines optimization
r5340 if (*p == '\n' || p == plast) {
Markus F.X.J. Oberhumer
bdiff.c: rename all variables which hold a hash value to "hash"
r13732 l->hash = hash;
hash = 0;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 l->len = p - b + 1;
l->l = b;
Matt Mackall
bdiff: use INT_MAX to avoid some inner loop comparisons
r5341 l->n = INT_MAX;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 l++;
b = p + 1;
}
}
/* set up a sentinel */
Markus F.X.J. Oberhumer
bdiff.c: rename all variables which hold a hash value to "hash"
r13732 l->hash = 0;
Markus F.X.J. Oberhumer
bdiff.c: use unsigned arithmetic for hash computation...
r13731 l->len = 0;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 l->l = a + len;
return i - 1;
}
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 static inline int cmp(struct bdiff_line *a, struct bdiff_line *b)
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 {
Markus F.X.J. Oberhumer
bdiff.c: rename all variables which hold a hash value to "hash"
r13732 return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 }
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 static int equatelines(struct bdiff_line *a, int an, struct bdiff_line *b,
int bn)
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 {
Matt Mackall
bdiff: tweaks for large files...
r5452 int i, j, buckets = 1, t, scale;
struct pos *h = NULL;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400
/* build a hash table of the next highest power of 2 */
while (buckets < bn + 1)
buckets *= 2;
Christoph Spiel
I have spotted the biggest bottleneck in "bdiff.c". Actually it was...
r5339 /* try to allocate a large hash table to avoid collisions */
Matt Mackall
bdiff: tweaks for large files...
r5452 for (scale = 4; scale; scale /= 2) {
Christoph Spiel
I have spotted the biggest bottleneck in "bdiff.c". Actually it was...
r5339 h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
Matt Mackall
bdiff: tweaks for large files...
r5452 if (h)
break;
}
Christoph Spiel
I have spotted the biggest bottleneck in "bdiff.c". Actually it was...
r5339
mpm@selenic.com
Minor speed improvements for bdiff...
r474 if (!h)
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 return 0;
Christoph Spiel
I have spotted the biggest bottleneck in "bdiff.c". Actually it was...
r5339 buckets = buckets * scale - 1;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 /* clear the hash table */
mpm@selenic.com
Minor speed improvements for bdiff...
r474 for (i = 0; i <= buckets; i++) {
Matt Mackall
bdiff: deal better with duplicate lines...
r29013 h[i].pos = -1;
mpm@selenic.com
Minor speed improvements for bdiff...
r474 h[i].len = 0;
}
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400
/* add lines to the hash table chains */
Matt Mackall
bdiff: deal better with duplicate lines...
r29013 for (i = 0; i < bn; i++) {
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 /* find the equivalence class */
Matt Mackall
bdiff: deal better with duplicate lines...
r29013 for (j = b[i].hash & buckets; h[j].pos != -1;
mpm@selenic.com
Minor speed improvements for bdiff...
r474 j = (j + 1) & buckets)
if (!cmp(b + i, b + h[j].pos))
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 break;
/* add to the head of the equivalence class */
mpm@selenic.com
Minor speed improvements for bdiff...
r474 b[i].n = h[j].pos;
mpm@selenic.com
Add bdiff.blocks / minor performance tweaks...
r433 b[i].e = j;
mpm@selenic.com
Minor speed improvements for bdiff...
r474 h[j].pos = i;
h[j].len++; /* keep track of popularity */
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 }
/* compute popularity threshold */
Benoit Boissinot
bdiff: gradually enable the popularity hack...
r9534 t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400
/* match items in a to their equivalence class in b */
for (i = 0; i < an; i++) {
/* find the equivalence class */
Matt Mackall
bdiff: deal better with duplicate lines...
r29013 for (j = a[i].hash & buckets; h[j].pos != -1;
mpm@selenic.com
Minor speed improvements for bdiff...
r474 j = (j + 1) & buckets)
if (!cmp(a + i, b + h[j].pos))
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 break;
mpm@selenic.com
Add bdiff.blocks / minor performance tweaks...
r433 a[i].e = j; /* use equivalence class for quick compare */
twaldmann@thinkmo.de
made C src formatting more consistent
r1542 if (h[j].len <= t)
mpm@selenic.com
Minor speed improvements for bdiff...
r474 a[i].n = h[j].pos; /* point to head of match list */
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 else
Matt Mackall
bdiff: deal better with duplicate lines...
r29013 a[i].n = -1; /* too popular */
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 }
/* discard hash tables */
free(h);
return 1;
}
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 static int longest_match(struct bdiff_line *a, struct bdiff_line *b,
struct pos *pos,
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 int a1, int a2, int b1, int b2, int *omi, int *omj)
{
Matt Mackall
bdiff: remove effectively dead code...
r29323 int mi = a1, mj = b1, mk = 0, i, j, k, half;
Matt Mackall
bdiff: further restrain potential quadratic performance...
r29015
/* window our search on large regions to better bound
worst-case performance. by choosing a window at the end, we
reduce skipping overhead on the b chains. */
if (a2 - a1 > 30000)
a1 = a2 - 30000;
Mads Kiilerich
bdiff: adjust criteria for getting optimal longest match in the A side middle...
r30429 half = (a1 + a2 - 1) / 2;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400
for (i = a1; i < a2; i++) {
Matt Mackall
bdiff: deal better with duplicate lines...
r29013 /* skip all lines in b after the current block */
for (j = a[i].n; j >= b2; j = b[j].n)
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 ;
/* loop through all lines match a[i] in b */
Matt Mackall
bdiff: deal better with duplicate lines...
r29013 for (; j >= b1; j = b[j].n) {
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 /* does this extend an earlier match? */
Matt Mackall
bdiff: extend matches across popular lines...
r29322 for (k = 1; j - k >= b1 && i - k >= a1; k++) {
/* reached an earlier match? */
if (pos[j - k].pos == i - k) {
k += pos[j - k].len;
break;
}
/* previous line mismatch? */
if (a[i - k].e != b[j - k].e)
break;
}
mpm@selenic.com
Minor speed improvements for bdiff...
r474 pos[j].pos = i;
pos[j].len = k;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400
Matt Mackall
bdiff: balance recursion to avoid quadratic behavior (issue4704)...
r29014 /* best match so far? we prefer matches closer
to the middle to balance recursion */
Mads Kiilerich
bdiff: rearrange the "better longest match" code...
r30430 if (k > mk) {
/* a longer match */
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 mi = i;
mj = j;
mk = k;
Mads Kiilerich
bdiff: rearrange the "better longest match" code...
r30430 } else if (k == mk) {
if (i > mi && i <= half) {
/* same match but closer to half */
mi = i;
mj = j;
} else if (i == mi) {
/* same i but earlier j */
mj = j;
}
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 }
}
}
if (mk) {
mi = mi - mk + 1;
mj = mj - mk + 1;
}
Matt Mackall
bdiff: remove effectively dead code...
r29323 /* expand match to include subsequent popular lines */
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 while (mi + mk < a2 && mj + mk < b2 &&
mpm@selenic.com
Add bdiff.blocks / minor performance tweaks...
r433 a[mi + mk].e == b[mj + mk].e)
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 mk++;
Matt Mackall
bdiff: remove effectively dead code...
r29323 *omi = mi;
*omj = mj;
Matt Mackall
bdiff: use INT_MAX to avoid some inner loop comparisons
r5341
Matt Mackall
bdiff: remove effectively dead code...
r29323 return mk;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 }
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 static struct bdiff_hunk *recurse(struct bdiff_line *a, struct bdiff_line *b,
struct pos *pos,
int a1, int a2, int b1, int b2, struct bdiff_hunk *l)
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 {
int i, j, k;
Alistair Bell
bdiff: do not use recursion / avoid stackoverflow (issue1940)
r10500 while (1) {
/* find the longest match in this chunk */
k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);
if (!k)
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 return l;
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400
Alistair Bell
bdiff: do not use recursion / avoid stackoverflow (issue1940)
r10500 /* and recurse on the remaining chunks on either side */
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 l = recurse(a, b, pos, a1, i, b1, j, l);
if (!l)
return NULL;
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 l->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk));
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 if (!l->next)
return NULL;
l = l->next;
l->a1 = i;
l->a2 = i + k;
l->b1 = j;
l->b2 = j + k;
l->next = NULL;
/* tail-recursion didn't happen, so do equivalent iteration */
Alistair Bell
bdiff: do not use recursion / avoid stackoverflow (issue1940)
r10500 a1 = i + k;
b1 = j + k;
}
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 }
Maciej Fijalkowski
bdiff: split bdiff into cpy-aware and cpy-agnostic part
r29541 int bdiff_diff(struct bdiff_line *a, int an, struct bdiff_line *b,
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 int bn, struct bdiff_hunk *base)
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400 {
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 struct bdiff_hunk *curr;
mpm@selenic.com
Minor speed improvements for bdiff...
r474 struct pos *pos;
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 int t, count = 0;
mpm@selenic.com
Add bdiff.blocks / minor performance tweaks...
r433
/* allocate and fill arrays */
t = equatelines(a, an, b, bn);
Jim Hague
fix calloc(0, ...) issue
r5571 pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));
Matt Mackall
bdiff: dynamically allocate hunks...
r13089
if (pos && t) {
/* generate the matching block list */
curr = recurse(a, b, pos, 0, an, 0, bn, base);
if (!curr)
return -1;
mpm@selenic.com
Add bdiff.blocks / minor performance tweaks...
r433
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 /* sentinel end hunk */
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 curr->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk));
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 if (!curr->next)
Matt Mackall
bdiff: Fix bogus NULL return
r13090 return -1;
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 curr = curr->next;
curr->a1 = curr->a2 = an;
curr->b1 = curr->b2 = bn;
curr->next = NULL;
mpm@selenic.com
Add bdiff.blocks / minor performance tweaks...
r433 }
mpm@selenic.com
Minor speed improvements for bdiff...
r474 free(pos);
Benoit Boissinot
bdiff: normalize the diff (issue1295)...
r7104
Benoit Boissinot
bdiff: add comment about normalization
r7625 /* normalize the hunk list, try to push each hunk towards the end */
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 for (curr = base->next; curr; curr = curr->next) {
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 struct bdiff_hunk *next = curr->next;
Benoit Boissinot
bdiff: normalize the diff (issue1295)...
r7104
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 if (!next)
Benoit Boissinot
bdiff: normalize the diff (issue1295)...
r7104 break;
Matt Mackall
bdiff: unify duplicate normalize loops...
r29010 if (curr->a2 == next->a1 || curr->b2 == next->b1)
Matt Mackall
bdiff: fold in shift calculation in normalize...
r29011 while (curr->a2 < an && curr->b2 < bn
Matt Mackall
bdiff: fix latent normalization bug...
r29012 && next->a1 < next->a2
&& next->b1 < next->b2
Matt Mackall
bdiff: fold in shift calculation in normalize...
r29011 && !cmp(a + curr->a2, b + curr->b2)) {
curr->a2++;
next->a1++;
curr->b2++;
next->b1++;
}
Benoit Boissinot
bdiff: normalize the diff (issue1295)...
r7104 }
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 for (curr = base->next; curr; curr = curr->next)
count++;
return count;
}
Maciej Fijalkowski
bdiff: split bdiff into cpy-aware and cpy-agnostic part
r29541 void bdiff_freehunks(struct bdiff_hunk *l)
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 {
Maciej Fijalkowski
bdiff: rename functions and structs to be amenable for later exporting
r29540 struct bdiff_hunk *n;
Matt Mackall
bdiff: dynamically allocate hunks...
r13089 for (; l; l = n) {
n = l->next;
free(l);
}
mpm@selenic.com
Add bdiff.blocks / minor performance tweaks...
r433 }
mpm@selenic.com
Add a fast binary diff extension (not yet used)...
r400