bdiff.c
401 lines
| 8.4 KiB
| text/x-c
|
CLexer
/ mercurial / bdiff.c
mpm@selenic.com
|
r400 | /* | ||
bdiff.c - efficient binary diff extension for Mercurial | ||||
Vadim Gelfer
|
r2859 | Copyright 2005, 2006 Matt Mackall <mpm@selenic.com> | ||
mpm@selenic.com
|
r400 | |||
This software may be used and distributed according to the terms of | ||||
the GNU General Public License, incorporated herein by reference. | ||||
Based roughly on Python difflib | ||||
*/ | ||||
#include <Python.h> | ||||
#include <stdlib.h> | ||||
#include <string.h> | ||||
Matt Mackall
|
r5341 | #include <limits.h> | ||
tksoh@users.sourceforge.net
|
r867 | |||
Benoit Boissinot
|
r3561 | #if defined __hpux || defined __SUNPRO_C || defined _AIX | ||
Fabian Otto
|
r1759 | # define inline | ||
Vadim Gelfer
|
r2600 | #endif | ||
Fabian Otto
|
r1759 | |||
Matt Mackall
|
r8858 | #ifdef __linux | ||
# define inline __inline | ||||
#endif | ||||
mpm@selenic.com
|
r411 | #ifdef _WIN32 | ||
mpm@selenic.com
|
r551 | #ifdef _MSC_VER | ||
#define inline __inline | ||||
typedef unsigned long uint32_t; | ||||
#else | ||||
mpm@selenic.com
|
r510 | #include <stdint.h> | ||
mpm@selenic.com
|
r551 | #endif | ||
mpm@selenic.com
|
r411 | static uint32_t htonl(uint32_t x) | ||
{ | ||||
return ((x & 0x000000ffUL) << 24) | | ||||
((x & 0x0000ff00UL) << 8) | | ||||
((x & 0x00ff0000UL) >> 8) | | ||||
((x & 0xff000000UL) >> 24); | ||||
} | ||||
#else | ||||
mpm@selenic.com
|
r510 | #include <sys/types.h> | ||
Scott McCreary
|
r7036 | #if defined __BEOS__ && !defined __HAIKU__ | ||
Andrew Bachmann
|
r4073 | #include <ByteOrder.h> | ||
#else | ||||
mpm@selenic.com
|
r597 | #include <arpa/inet.h> | ||
Andrew Bachmann
|
r4073 | #endif | ||
Thomas Arendsen Hein
|
r2543 | #include <inttypes.h> | ||
mpm@selenic.com
|
r411 | #endif | ||
mpm@selenic.com
|
r400 | |||
struct line { | ||||
mpm@selenic.com
|
r433 | int h, len, n, e; | ||
mpm@selenic.com
|
r400 | const char *l; | ||
}; | ||||
mpm@selenic.com
|
r474 | struct pos { | ||
int pos, len; | ||||
}; | ||||
mpm@selenic.com
|
r400 | struct hunk { | ||
int a1, a2, b1, b2; | ||||
}; | ||||
struct hunklist { | ||||
struct hunk *base, *head; | ||||
}; | ||||
int splitlines(const char *a, int len, struct line **lr) | ||||
{ | ||||
Matt Mackall
|
r5342 | int h, i; | ||
mpm@selenic.com
|
r400 | const char *p, *b = a; | ||
Christoph Spiel
|
r5340 | const char * const plast = a + len - 1; | ||
mpm@selenic.com
|
r400 | struct line *l; | ||
/* count the lines */ | ||||
i = 1; /* extra line for sentinel */ | ||||
for (p = a; p < a + len; p++) | ||||
Christoph Spiel
|
r5340 | if (*p == '\n' || p == plast) | ||
mpm@selenic.com
|
r400 | i++; | ||
TK Soh
|
r1978 | *lr = l = (struct line *)malloc(sizeof(struct line) * i); | ||
mpm@selenic.com
|
r400 | if (!l) | ||
return -1; | ||||
/* build the line array and calculate hashes */ | ||||
h = 0; | ||||
for (p = a; p < a + len; p++) { | ||||
Matt Mackall
|
r5342 | /* Leonid Yuriev's hash */ | ||
Thomas Arendsen Hein
|
r7187 | h = (h * 1664525) + *p + 1013904223; | ||
Matt Mackall
|
r5342 | |||
Christoph Spiel
|
r5340 | if (*p == '\n' || p == plast) { | ||
Matt Mackall
|
r5342 | l->h = h; | ||
h = 0; | ||||
mpm@selenic.com
|
r400 | l->len = p - b + 1; | ||
l->l = b; | ||||
Matt Mackall
|
r5341 | l->n = INT_MAX; | ||
mpm@selenic.com
|
r400 | l++; | ||
b = p + 1; | ||||
} | ||||
} | ||||
/* set up a sentinel */ | ||||
l->h = l->len = 0; | ||||
l->l = a + len; | ||||
return i - 1; | ||||
} | ||||
int inline cmp(struct line *a, struct line *b) | ||||
{ | ||||
mpm@selenic.com
|
r433 | return a->h != b->h || a->len != b->len || memcmp(a->l, b->l, a->len); | ||
mpm@selenic.com
|
r400 | } | ||
static int equatelines(struct line *a, int an, struct line *b, int bn) | ||||
{ | ||||
Matt Mackall
|
r5452 | int i, j, buckets = 1, t, scale; | ||
struct pos *h = NULL; | ||||
mpm@selenic.com
|
r400 | |||
/* build a hash table of the next highest power of 2 */ | ||||
while (buckets < bn + 1) | ||||
buckets *= 2; | ||||
Christoph Spiel
|
r5339 | /* try to allocate a large hash table to avoid collisions */ | ||
Matt Mackall
|
r5452 | for (scale = 4; scale; scale /= 2) { | ||
Christoph Spiel
|
r5339 | h = (struct pos *)malloc(scale * buckets * sizeof(struct pos)); | ||
Matt Mackall
|
r5452 | if (h) | ||
break; | ||||
} | ||||
Christoph Spiel
|
r5339 | |||
mpm@selenic.com
|
r474 | if (!h) | ||
mpm@selenic.com
|
r400 | return 0; | ||
Christoph Spiel
|
r5339 | buckets = buckets * scale - 1; | ||
mpm@selenic.com
|
r400 | /* clear the hash table */ | ||
mpm@selenic.com
|
r474 | for (i = 0; i <= buckets; i++) { | ||
Matt Mackall
|
r5341 | h[i].pos = INT_MAX; | ||
mpm@selenic.com
|
r474 | h[i].len = 0; | ||
} | ||||
mpm@selenic.com
|
r400 | |||
/* add lines to the hash table chains */ | ||||
for (i = bn - 1; i >= 0; i--) { | ||||
/* find the equivalence class */ | ||||
Matt Mackall
|
r5341 | for (j = b[i].h & buckets; h[j].pos != INT_MAX; | ||
mpm@selenic.com
|
r474 | j = (j + 1) & buckets) | ||
if (!cmp(b + i, b + h[j].pos)) | ||||
mpm@selenic.com
|
r400 | break; | ||
/* add to the head of the equivalence class */ | ||||
mpm@selenic.com
|
r474 | b[i].n = h[j].pos; | ||
mpm@selenic.com
|
r433 | b[i].e = j; | ||
mpm@selenic.com
|
r474 | h[j].pos = i; | ||
h[j].len++; /* keep track of popularity */ | ||||
mpm@selenic.com
|
r400 | } | ||
/* compute popularity threshold */ | ||||
Benoit Boissinot
|
r9534 | t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1); | ||
mpm@selenic.com
|
r400 | |||
/* match items in a to their equivalence class in b */ | ||||
for (i = 0; i < an; i++) { | ||||
/* find the equivalence class */ | ||||
Matt Mackall
|
r5341 | for (j = a[i].h & buckets; h[j].pos != INT_MAX; | ||
mpm@selenic.com
|
r474 | j = (j + 1) & buckets) | ||
if (!cmp(a + i, b + h[j].pos)) | ||||
mpm@selenic.com
|
r400 | break; | ||
mpm@selenic.com
|
r433 | a[i].e = j; /* use equivalence class for quick compare */ | ||
twaldmann@thinkmo.de
|
r1542 | if (h[j].len <= t) | ||
mpm@selenic.com
|
r474 | a[i].n = h[j].pos; /* point to head of match list */ | ||
mpm@selenic.com
|
r400 | else | ||
Matt Mackall
|
r5341 | a[i].n = INT_MAX; /* too popular */ | ||
mpm@selenic.com
|
r400 | } | ||
/* discard hash tables */ | ||||
free(h); | ||||
return 1; | ||||
} | ||||
mpm@selenic.com
|
r474 | static int longest_match(struct line *a, struct line *b, struct pos *pos, | ||
mpm@selenic.com
|
r400 | int a1, int a2, int b1, int b2, int *omi, int *omj) | ||
{ | ||||
int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k; | ||||
for (i = a1; i < a2; i++) { | ||||
/* skip things before the current block */ | ||||
Matt Mackall
|
r5341 | for (j = a[i].n; j < b1; j = b[j].n) | ||
mpm@selenic.com
|
r400 | ; | ||
/* loop through all lines match a[i] in b */ | ||||
Matt Mackall
|
r5341 | for (; j < b2; j = b[j].n) { | ||
mpm@selenic.com
|
r400 | /* does this extend an earlier match? */ | ||
mpm@selenic.com
|
r474 | if (i > a1 && j > b1 && pos[j - 1].pos == i - 1) | ||
k = pos[j - 1].len + 1; | ||||
mpm@selenic.com
|
r400 | else | ||
k = 1; | ||||
mpm@selenic.com
|
r474 | pos[j].pos = i; | ||
pos[j].len = k; | ||||
mpm@selenic.com
|
r400 | |||
/* best match so far? */ | ||||
if (k > mk) { | ||||
mi = i; | ||||
mj = j; | ||||
mk = k; | ||||
} | ||||
} | ||||
} | ||||
if (mk) { | ||||
mi = mi - mk + 1; | ||||
mj = mj - mk + 1; | ||||
} | ||||
/* expand match to include neighboring popular lines */ | ||||
while (mi - mb > a1 && mj - mb > b1 && | ||||
mpm@selenic.com
|
r433 | a[mi - mb - 1].e == b[mj - mb - 1].e) | ||
mpm@selenic.com
|
r400 | mb++; | ||
while (mi + mk < a2 && mj + mk < b2 && | ||||
mpm@selenic.com
|
r433 | a[mi + mk].e == b[mj + mk].e) | ||
mpm@selenic.com
|
r400 | mk++; | ||
*omi = mi - mb; | ||||
*omj = mj - mb; | ||||
Matt Mackall
|
r5341 | |||
mpm@selenic.com
|
r400 | return mk + mb; | ||
} | ||||
mpm@selenic.com
|
r474 | static void recurse(struct line *a, struct line *b, struct pos *pos, | ||
mpm@selenic.com
|
r400 | int a1, int a2, int b1, int b2, struct hunklist *l) | ||
{ | ||||
int i, j, k; | ||||
/* find the longest match in this chunk */ | ||||
mpm@selenic.com
|
r474 | k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j); | ||
mpm@selenic.com
|
r400 | if (!k) | ||
return; | ||||
/* and recurse on the remaining chunks on either side */ | ||||
mpm@selenic.com
|
r474 | recurse(a, b, pos, a1, i, b1, j, l); | ||
mpm@selenic.com
|
r400 | l->head->a1 = i; | ||
l->head->a2 = i + k; | ||||
l->head->b1 = j; | ||||
l->head->b2 = j + k; | ||||
l->head++; | ||||
mpm@selenic.com
|
r474 | recurse(a, b, pos, i + k, a2, j + k, b2, l); | ||
mpm@selenic.com
|
r400 | } | ||
mpm@selenic.com
|
r433 | static struct hunklist diff(struct line *a, int an, struct line *b, int bn) | ||
mpm@selenic.com
|
r400 | { | ||
mpm@selenic.com
|
r433 | struct hunklist l; | ||
Benoit Boissinot
|
r7104 | struct hunk *curr; | ||
mpm@selenic.com
|
r474 | struct pos *pos; | ||
int t; | ||||
mpm@selenic.com
|
r433 | |||
/* allocate and fill arrays */ | ||||
t = equatelines(a, an, b, bn); | ||||
Jim Hague
|
r5571 | pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos)); | ||
"Wallace, Eric S"
|
r827 | /* we can't have more matches than lines in the shorter file */ | ||
TK Soh
|
r1978 | l.head = l.base = (struct hunk *)malloc(sizeof(struct hunk) * | ||
((an<bn ? an:bn) + 1)); | ||||
mpm@selenic.com
|
r433 | |||
mpm@selenic.com
|
r474 | if (pos && l.base && t) { | ||
mpm@selenic.com
|
r433 | /* generate the matching block list */ | ||
mpm@selenic.com
|
r474 | recurse(a, b, pos, 0, an, 0, bn, &l); | ||
Erling Ellingsen
|
r4131 | l.head->a1 = l.head->a2 = an; | ||
l.head->b1 = l.head->b2 = bn; | ||||
mpm@selenic.com
|
r433 | l.head++; | ||
} | ||||
mpm@selenic.com
|
r474 | free(pos); | ||
Benoit Boissinot
|
r7104 | |||
Benoit Boissinot
|
r7625 | /* normalize the hunk list, try to push each hunk towards the end */ | ||
Benoit Boissinot
|
r7104 | for (curr = l.base; curr != l.head; curr++) { | ||
struct hunk *next = curr+1; | ||||
int shift = 0; | ||||
if (next == l.head) | ||||
break; | ||||
if (curr->a2 == next->a1) | ||||
while (curr->a2+shift < an && curr->b2+shift < bn | ||||
&& !cmp(a+curr->a2+shift, b+curr->b2+shift)) | ||||
shift++; | ||||
else if (curr->b2 == next->b1) | ||||
while (curr->b2+shift < bn && curr->a2+shift < an | ||||
&& !cmp(b+curr->b2+shift, a+curr->a2+shift)) | ||||
shift++; | ||||
if (!shift) | ||||
continue; | ||||
curr->b2 += shift; | ||||
next->b1 += shift; | ||||
curr->a2 += shift; | ||||
next->a1 += shift; | ||||
} | ||||
mpm@selenic.com
|
r433 | return l; | ||
} | ||||
static PyObject *blocks(PyObject *self, PyObject *args) | ||||
{ | ||||
mpm@selenic.com
|
r435 | PyObject *sa, *sb, *rl = NULL, *m; | ||
mpm@selenic.com
|
r433 | struct line *a, *b; | ||
mpm@selenic.com
|
r970 | struct hunklist l = {NULL, NULL}; | ||
mpm@selenic.com
|
r400 | struct hunk *h; | ||
mpm@selenic.com
|
r433 | int an, bn, pos = 0; | ||
mpm@selenic.com
|
r400 | |||
if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb)) | ||||
return NULL; | ||||
mpm@selenic.com
|
r433 | an = splitlines(PyString_AsString(sa), PyString_Size(sa), &a); | ||
bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &b); | ||||
if (!a || !b) | ||||
goto nomem; | ||||
l = diff(a, an, b, bn); | ||||
rl = PyList_New(l.head - l.base); | ||||
if (!l.head || !rl) | ||||
goto nomem; | ||||
twaldmann@thinkmo.de
|
r1542 | for (h = l.base; h != l.head; h++) { | ||
mpm@selenic.com
|
r433 | m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2); | ||
PyList_SetItem(rl, pos, m); | ||||
pos++; | ||||
} | ||||
nomem: | ||||
free(a); | ||||
free(b); | ||||
free(l.base); | ||||
return rl ? rl : PyErr_NoMemory(); | ||||
} | ||||
static PyObject *bdiff(PyObject *self, PyObject *args) | ||||
{ | ||||
Brendan Cully
|
r3335 | char *sa, *sb; | ||
PyObject *result = NULL; | ||||
mpm@selenic.com
|
r433 | struct line *al, *bl; | ||
mpm@selenic.com
|
r970 | struct hunklist l = {NULL, NULL}; | ||
mpm@selenic.com
|
r433 | struct hunk *h; | ||
char encode[12], *rb; | ||||
Brendan Cully
|
r3335 | int an, bn, len = 0, la, lb; | ||
mpm@selenic.com
|
r433 | |||
Alexis S. L. Carvalho
|
r3369 | if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb)) | ||
mpm@selenic.com
|
r433 | return NULL; | ||
Brendan Cully
|
r3335 | an = splitlines(sa, la, &al); | ||
bn = splitlines(sb, lb, &bl); | ||||
mpm@selenic.com
|
r433 | if (!al || !bl) | ||
mpm@selenic.com
|
r400 | goto nomem; | ||
mpm@selenic.com
|
r433 | l = diff(al, an, bl, bn); | ||
if (!l.head) | ||||
goto nomem; | ||||
mpm@selenic.com
|
r400 | |||
/* calculate length of output */ | ||||
Brendan Cully
|
r3335 | la = lb = 0; | ||
twaldmann@thinkmo.de
|
r1542 | for (h = l.base; h != l.head; h++) { | ||
mpm@selenic.com
|
r400 | if (h->a1 != la || h->b1 != lb) | ||
len += 12 + bl[h->b1].l - bl[lb].l; | ||||
la = h->a2; | ||||
lb = h->b2; | ||||
} | ||||
result = PyString_FromStringAndSize(NULL, len); | ||||
if (!result) | ||||
goto nomem; | ||||
/* build binary patch */ | ||||
rb = PyString_AsString(result); | ||||
la = lb = 0; | ||||
twaldmann@thinkmo.de
|
r1542 | for (h = l.base; h != l.head; h++) { | ||
mpm@selenic.com
|
r400 | if (h->a1 != la || h->b1 != lb) { | ||
len = bl[h->b1].l - bl[lb].l; | ||||
*(uint32_t *)(encode) = htonl(al[la].l - al->l); | ||||
*(uint32_t *)(encode + 4) = htonl(al[h->a1].l - al->l); | ||||
*(uint32_t *)(encode + 8) = htonl(len); | ||||
memcpy(rb, encode, 12); | ||||
memcpy(rb + 12, bl[lb].l, len); | ||||
rb += 12 + len; | ||||
} | ||||
la = h->a2; | ||||
lb = h->b2; | ||||
} | ||||
nomem: | ||||
free(al); | ||||
free(bl); | ||||
free(l.base); | ||||
return result ? result : PyErr_NoMemory(); | ||||
} | ||||
static char mdiff_doc[] = "Efficient binary diff."; | ||||
static PyMethodDef methods[] = { | ||||
{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"}, | ||||
mpm@selenic.com
|
r433 | {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"}, | ||
mpm@selenic.com
|
r400 | {NULL, NULL} | ||
}; | ||||
PyMODINIT_FUNC initbdiff(void) | ||||
{ | ||||
Py_InitModule3("bdiff", methods, mdiff_doc); | ||||
} | ||||
twaldmann@thinkmo.de
|
r1542 | |||