bdiff.c
296 lines
| 6.3 KiB
| text/x-c
|
CLexer
/ mercurial / bdiff.c
mpm@selenic.com
|
r400 | /* | ||
bdiff.c - efficient binary diff extension for Mercurial | ||||
Vadim Gelfer
|
r2859 | Copyright 2005, 2006 Matt Mackall <mpm@selenic.com> | ||
mpm@selenic.com
|
r400 | |||
This software may be used and distributed according to the terms of | ||||
the GNU General Public License, incorporated herein by reference. | ||||
Based roughly on Python difflib | ||||
*/ | ||||
#include <stdlib.h> | ||||
#include <string.h> | ||||
Matt Mackall
|
r5341 | #include <limits.h> | ||
tksoh@users.sourceforge.net
|
r867 | |||
Maciej Fijalkowski
|
r29539 | #include "compat.h" | ||
Maciej Fijalkowski
|
r29444 | #include "bitmanipulation.h" | ||
Maciej Fijalkowski
|
r29541 | #include "bdiff.h" | ||
mpm@selenic.com
|
r400 | |||
mpm@selenic.com
|
/*
 * A (position, length) pair used in two different roles in this file:
 *
 *  - equatelines() uses an array of these as its hash table.  There, pos is
 *    the index of the most recently inserted line of b in the slot's
 *    equivalence class (-1 while the slot is empty) and len counts how many
 *    lines of b landed in that class.
 *
 *  - longest_match() uses an array with one entry per line of b.  There,
 *    pos is the line of a that the b line was last matched against and len
 *    is the length of the matching run ending at that pair.
 */
struct pos {
	int pos;
	int len;
};
Maciej Fijalkowski
|
r29541 | int bdiff_splitlines(const char *a, ssize_t len, struct bdiff_line **lr) | ||
mpm@selenic.com
|
r400 | { | ||
Markus F.X.J. Oberhumer
|
r13732 | unsigned hash; | ||
Markus F.X.J. Oberhumer
|
r13731 | int i; | ||
mpm@selenic.com
|
r400 | const char *p, *b = a; | ||
Christoph Spiel
|
r5340 | const char * const plast = a + len - 1; | ||
Maciej Fijalkowski
|
r29540 | struct bdiff_line *l; | ||
mpm@selenic.com
|
r400 | |||
/* count the lines */ | ||||
i = 1; /* extra line for sentinel */ | ||||
for (p = a; p < a + len; p++) | ||||
Christoph Spiel
|
r5340 | if (*p == '\n' || p == plast) | ||
mpm@selenic.com
|
r400 | i++; | ||
Maciej Fijalkowski
|
r29540 | *lr = l = (struct bdiff_line *)malloc(sizeof(struct bdiff_line) * i); | ||
mpm@selenic.com
|
r400 | if (!l) | ||
return -1; | ||||
/* build the line array and calculate hashes */ | ||||
Markus F.X.J. Oberhumer
|
r13732 | hash = 0; | ||
mpm@selenic.com
|
r400 | for (p = a; p < a + len; p++) { | ||
Matt Mackall
|
r5342 | /* Leonid Yuriev's hash */ | ||
Markus F.X.J. Oberhumer
|
r13732 | hash = (hash * 1664525) + (unsigned char)*p + 1013904223; | ||
Matt Mackall
|
r5342 | |||
Christoph Spiel
|
r5340 | if (*p == '\n' || p == plast) { | ||
Markus F.X.J. Oberhumer
|
r13732 | l->hash = hash; | ||
hash = 0; | ||||
mpm@selenic.com
|
r400 | l->len = p - b + 1; | ||
l->l = b; | ||||
Matt Mackall
|
r5341 | l->n = INT_MAX; | ||
mpm@selenic.com
|
r400 | l++; | ||
b = p + 1; | ||||
} | ||||
} | ||||
/* set up a sentinel */ | ||||
Markus F.X.J. Oberhumer
|
r13732 | l->hash = 0; | ||
Markus F.X.J. Oberhumer
|
r13731 | l->len = 0; | ||
mpm@selenic.com
|
r400 | l->l = a + len; | ||
return i - 1; | ||||
} | ||||
Maciej Fijalkowski
|
r29540 | static inline int cmp(struct bdiff_line *a, struct bdiff_line *b) | ||
mpm@selenic.com
|
r400 | { | ||
Markus F.X.J. Oberhumer
|
r13732 | return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len); | ||
mpm@selenic.com
|
r400 | } | ||
Maciej Fijalkowski
|
r29540 | static int equatelines(struct bdiff_line *a, int an, struct bdiff_line *b, | ||
int bn) | ||||
mpm@selenic.com
|
r400 | { | ||
Matt Mackall
|
r5452 | int i, j, buckets = 1, t, scale; | ||
struct pos *h = NULL; | ||||
mpm@selenic.com
|
r400 | |||
/* build a hash table of the next highest power of 2 */ | ||||
while (buckets < bn + 1) | ||||
buckets *= 2; | ||||
Christoph Spiel
|
r5339 | /* try to allocate a large hash table to avoid collisions */ | ||
Matt Mackall
|
r5452 | for (scale = 4; scale; scale /= 2) { | ||
Christoph Spiel
|
r5339 | h = (struct pos *)malloc(scale * buckets * sizeof(struct pos)); | ||
Matt Mackall
|
r5452 | if (h) | ||
break; | ||||
} | ||||
Christoph Spiel
|
r5339 | |||
mpm@selenic.com
|
r474 | if (!h) | ||
mpm@selenic.com
|
r400 | return 0; | ||
Christoph Spiel
|
r5339 | buckets = buckets * scale - 1; | ||
mpm@selenic.com
|
r400 | /* clear the hash table */ | ||
mpm@selenic.com
|
r474 | for (i = 0; i <= buckets; i++) { | ||
Matt Mackall
|
r29013 | h[i].pos = -1; | ||
mpm@selenic.com
|
r474 | h[i].len = 0; | ||
} | ||||
mpm@selenic.com
|
r400 | |||
/* add lines to the hash table chains */ | ||||
Matt Mackall
|
r29013 | for (i = 0; i < bn; i++) { | ||
mpm@selenic.com
|
r400 | /* find the equivalence class */ | ||
Matt Mackall
|
r29013 | for (j = b[i].hash & buckets; h[j].pos != -1; | ||
mpm@selenic.com
|
r474 | j = (j + 1) & buckets) | ||
if (!cmp(b + i, b + h[j].pos)) | ||||
mpm@selenic.com
|
r400 | break; | ||
/* add to the head of the equivalence class */ | ||||
mpm@selenic.com
|
r474 | b[i].n = h[j].pos; | ||
mpm@selenic.com
|
r433 | b[i].e = j; | ||
mpm@selenic.com
|
r474 | h[j].pos = i; | ||
h[j].len++; /* keep track of popularity */ | ||||
mpm@selenic.com
|
r400 | } | ||
/* compute popularity threshold */ | ||||
Benoit Boissinot
|
r9534 | t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1); | ||
mpm@selenic.com
|
r400 | |||
/* match items in a to their equivalence class in b */ | ||||
for (i = 0; i < an; i++) { | ||||
/* find the equivalence class */ | ||||
Matt Mackall
|
r29013 | for (j = a[i].hash & buckets; h[j].pos != -1; | ||
mpm@selenic.com
|
r474 | j = (j + 1) & buckets) | ||
if (!cmp(a + i, b + h[j].pos)) | ||||
mpm@selenic.com
|
r400 | break; | ||
mpm@selenic.com
|
r433 | a[i].e = j; /* use equivalence class for quick compare */ | ||
twaldmann@thinkmo.de
|
r1542 | if (h[j].len <= t) | ||
mpm@selenic.com
|
r474 | a[i].n = h[j].pos; /* point to head of match list */ | ||
mpm@selenic.com
|
r400 | else | ||
Matt Mackall
|
r29013 | a[i].n = -1; /* too popular */ | ||
mpm@selenic.com
|
r400 | } | ||
/* discard hash tables */ | ||||
free(h); | ||||
return 1; | ||||
} | ||||
Maciej Fijalkowski
|
r29540 | static int longest_match(struct bdiff_line *a, struct bdiff_line *b, | ||
struct pos *pos, | ||||
mpm@selenic.com
|
r400 | int a1, int a2, int b1, int b2, int *omi, int *omj) | ||
{ | ||||
Matt Mackall
|
r29323 | int mi = a1, mj = b1, mk = 0, i, j, k, half; | ||
Matt Mackall
|
r29015 | |||
/* window our search on large regions to better bound | ||||
worst-case performance. by choosing a window at the end, we | ||||
reduce skipping overhead on the b chains. */ | ||||
if (a2 - a1 > 30000) | ||||
a1 = a2 - 30000; | ||||
half = (a1 + a2) / 2; | ||||
mpm@selenic.com
|
r400 | |||
for (i = a1; i < a2; i++) { | ||||
Matt Mackall
|
r29013 | /* skip all lines in b after the current block */ | ||
for (j = a[i].n; j >= b2; j = b[j].n) | ||||
mpm@selenic.com
|
r400 | ; | ||
/* loop through all lines match a[i] in b */ | ||||
Matt Mackall
|
r29013 | for (; j >= b1; j = b[j].n) { | ||
mpm@selenic.com
|
r400 | /* does this extend an earlier match? */ | ||
Matt Mackall
|
r29322 | for (k = 1; j - k >= b1 && i - k >= a1; k++) { | ||
/* reached an earlier match? */ | ||||
if (pos[j - k].pos == i - k) { | ||||
k += pos[j - k].len; | ||||
break; | ||||
} | ||||
/* previous line mismatch? */ | ||||
if (a[i - k].e != b[j - k].e) | ||||
break; | ||||
} | ||||
mpm@selenic.com
|
r474 | pos[j].pos = i; | ||
pos[j].len = k; | ||||
mpm@selenic.com
|
r400 | |||
Matt Mackall
|
r29014 | /* best match so far? we prefer matches closer | ||
to the middle to balance recursion */ | ||||
if (k > mk || (k == mk && (i <= mi || i < half))) { | ||||
mpm@selenic.com
|
r400 | mi = i; | ||
mj = j; | ||||
mk = k; | ||||
} | ||||
} | ||||
} | ||||
if (mk) { | ||||
mi = mi - mk + 1; | ||||
mj = mj - mk + 1; | ||||
} | ||||
Matt Mackall
|
r29323 | /* expand match to include subsequent popular lines */ | ||
mpm@selenic.com
|
r400 | while (mi + mk < a2 && mj + mk < b2 && | ||
mpm@selenic.com
|
r433 | a[mi + mk].e == b[mj + mk].e) | ||
mpm@selenic.com
|
r400 | mk++; | ||
Matt Mackall
|
r29323 | *omi = mi; | ||
*omj = mj; | ||||
Matt Mackall
|
r5341 | |||
Matt Mackall
|
r29323 | return mk; | ||
mpm@selenic.com
|
r400 | } | ||
Maciej Fijalkowski
|
r29540 | static struct bdiff_hunk *recurse(struct bdiff_line *a, struct bdiff_line *b, | ||
struct pos *pos, | ||||
int a1, int a2, int b1, int b2, struct bdiff_hunk *l) | ||||
mpm@selenic.com
|
r400 | { | ||
int i, j, k; | ||||
Alistair Bell
|
r10500 | while (1) { | ||
/* find the longest match in this chunk */ | ||||
k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j); | ||||
if (!k) | ||||
Matt Mackall
|
r13089 | return l; | ||
mpm@selenic.com
|
r400 | |||
Alistair Bell
|
r10500 | /* and recurse on the remaining chunks on either side */ | ||
Matt Mackall
|
r13089 | l = recurse(a, b, pos, a1, i, b1, j, l); | ||
if (!l) | ||||
return NULL; | ||||
Maciej Fijalkowski
|
r29540 | l->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk)); | ||
Matt Mackall
|
r13089 | if (!l->next) | ||
return NULL; | ||||
l = l->next; | ||||
l->a1 = i; | ||||
l->a2 = i + k; | ||||
l->b1 = j; | ||||
l->b2 = j + k; | ||||
l->next = NULL; | ||||
/* tail-recursion didn't happen, so do equivalent iteration */ | ||||
Alistair Bell
|
r10500 | a1 = i + k; | ||
b1 = j + k; | ||||
} | ||||
mpm@selenic.com
|
r400 | } | ||
Maciej Fijalkowski
|
r29541 | int bdiff_diff(struct bdiff_line *a, int an, struct bdiff_line *b, | ||
Maciej Fijalkowski
|
r29540 | int bn, struct bdiff_hunk *base) | ||
mpm@selenic.com
|
r400 | { | ||
Maciej Fijalkowski
|
r29540 | struct bdiff_hunk *curr; | ||
mpm@selenic.com
|
r474 | struct pos *pos; | ||
Matt Mackall
|
r13089 | int t, count = 0; | ||
mpm@selenic.com
|
r433 | |||
/* allocate and fill arrays */ | ||||
t = equatelines(a, an, b, bn); | ||||
Jim Hague
|
r5571 | pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos)); | ||
Matt Mackall
|
r13089 | |||
if (pos && t) { | ||||
/* generate the matching block list */ | ||||
curr = recurse(a, b, pos, 0, an, 0, bn, base); | ||||
if (!curr) | ||||
return -1; | ||||
mpm@selenic.com
|
r433 | |||
Matt Mackall
|
r13089 | /* sentinel end hunk */ | ||
Maciej Fijalkowski
|
r29540 | curr->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk)); | ||
Matt Mackall
|
r13089 | if (!curr->next) | ||
Matt Mackall
|
r13090 | return -1; | ||
Matt Mackall
|
r13089 | curr = curr->next; | ||
curr->a1 = curr->a2 = an; | ||||
curr->b1 = curr->b2 = bn; | ||||
curr->next = NULL; | ||||
mpm@selenic.com
|
r433 | } | ||
mpm@selenic.com
|
r474 | free(pos); | ||
Benoit Boissinot
|
r7104 | |||
Benoit Boissinot
|
r7625 | /* normalize the hunk list, try to push each hunk towards the end */ | ||
Matt Mackall
|
r13089 | for (curr = base->next; curr; curr = curr->next) { | ||
Maciej Fijalkowski
|
r29540 | struct bdiff_hunk *next = curr->next; | ||
Benoit Boissinot
|
r7104 | |||
Matt Mackall
|
r13089 | if (!next) | ||
Benoit Boissinot
|
r7104 | break; | ||
Matt Mackall
|
r29010 | if (curr->a2 == next->a1 || curr->b2 == next->b1) | ||
Matt Mackall
|
r29011 | while (curr->a2 < an && curr->b2 < bn | ||
Matt Mackall
|
r29012 | && next->a1 < next->a2 | ||
&& next->b1 < next->b2 | ||||
Matt Mackall
|
r29011 | && !cmp(a + curr->a2, b + curr->b2)) { | ||
curr->a2++; | ||||
next->a1++; | ||||
curr->b2++; | ||||
next->b1++; | ||||
} | ||||
Benoit Boissinot
|
r7104 | } | ||
Matt Mackall
|
r13089 | for (curr = base->next; curr; curr = curr->next) | ||
count++; | ||||
return count; | ||||
} | ||||
Maciej Fijalkowski
|
r29541 | void bdiff_freehunks(struct bdiff_hunk *l) | ||
Matt Mackall
|
r13089 | { | ||
Maciej Fijalkowski
|
r29540 | struct bdiff_hunk *n; | ||
Matt Mackall
|
r13089 | for (; l; l = n) { | ||
n = l->next; | ||||
free(l); | ||||
} | ||||
mpm@selenic.com
|
r433 | } | ||
mpm@selenic.com
|
r400 | |||