bdiff.c
488 lines
| 10.0 KiB
| text/x-c
|
CLexer
/ mercurial / bdiff.c
mpm@selenic.com
|
/*
 bdiff.c - efficient binary diff extension for Mercurial

 Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>

 This software may be used and distributed according to the terms of
 the GNU General Public License, incorporated herein by reference.

 Based roughly on Python difflib
*/
Adrian Buehlmann
|
r16749 | #define PY_SSIZE_T_CLEAN | ||
mpm@selenic.com
|
r400 | #include <Python.h> | ||
#include <stdlib.h> | ||||
#include <string.h> | ||||
Matt Mackall
|
r5341 | #include <limits.h> | ||
tksoh@users.sourceforge.net
|
r867 | |||
Renato Cunha
|
r11364 | #include "util.h" | ||
Maciej Fijalkowski
|
r29444 | #include "bitmanipulation.h" | ||
Renato Cunha
|
r11364 | |||
mpm@selenic.com
|
r400 | struct line { | ||
Adrian Buehlmann
|
r16749 | int hash, n, e; | ||
Py_ssize_t len; | ||||
mpm@selenic.com
|
r400 | const char *l; | ||
}; | ||||
mpm@selenic.com
|
/*
 * Small (pos, len) pair used for two purposes:
 *  - as a hash-table bucket in equatelines(): pos is the index of the
 *    most recently inserted line of the class (-1 if empty) and len is
 *    the number of lines inserted into the bucket;
 *  - as a per-line match record in longest_match(): pos is the index in
 *    "a" that the line in "b" was last matched against and len is the
 *    length of the match run ending there.
 */
struct pos {
	int pos, len;
};
Matt Mackall
|
struct hunk;

/*
 * A block of matching lines: lines [a1, a2) of the first input equal
 * lines [b1, b2) of the second. Hunks form a singly linked list that is
 * terminated by next == NULL.
 */
struct hunk {
	int a1, a2, b1, b2;
	struct hunk *next;
};
Adrian Buehlmann
|
r16749 | static int splitlines(const char *a, Py_ssize_t len, struct line **lr) | ||
mpm@selenic.com
|
r400 | { | ||
Markus F.X.J. Oberhumer
|
r13732 | unsigned hash; | ||
Markus F.X.J. Oberhumer
|
r13731 | int i; | ||
mpm@selenic.com
|
r400 | const char *p, *b = a; | ||
Christoph Spiel
|
r5340 | const char * const plast = a + len - 1; | ||
mpm@selenic.com
|
r400 | struct line *l; | ||
/* count the lines */ | ||||
i = 1; /* extra line for sentinel */ | ||||
for (p = a; p < a + len; p++) | ||||
Christoph Spiel
|
r5340 | if (*p == '\n' || p == plast) | ||
mpm@selenic.com
|
r400 | i++; | ||
TK Soh
|
r1978 | *lr = l = (struct line *)malloc(sizeof(struct line) * i); | ||
mpm@selenic.com
|
r400 | if (!l) | ||
return -1; | ||||
/* build the line array and calculate hashes */ | ||||
Markus F.X.J. Oberhumer
|
r13732 | hash = 0; | ||
mpm@selenic.com
|
r400 | for (p = a; p < a + len; p++) { | ||
Matt Mackall
|
r5342 | /* Leonid Yuriev's hash */ | ||
Markus F.X.J. Oberhumer
|
r13732 | hash = (hash * 1664525) + (unsigned char)*p + 1013904223; | ||
Matt Mackall
|
r5342 | |||
Christoph Spiel
|
r5340 | if (*p == '\n' || p == plast) { | ||
Markus F.X.J. Oberhumer
|
r13732 | l->hash = hash; | ||
hash = 0; | ||||
mpm@selenic.com
|
r400 | l->len = p - b + 1; | ||
l->l = b; | ||||
Matt Mackall
|
r5341 | l->n = INT_MAX; | ||
mpm@selenic.com
|
r400 | l++; | ||
b = p + 1; | ||||
} | ||||
} | ||||
/* set up a sentinel */ | ||||
Markus F.X.J. Oberhumer
|
r13732 | l->hash = 0; | ||
Markus F.X.J. Oberhumer
|
r13731 | l->len = 0; | ||
mpm@selenic.com
|
r400 | l->l = a + len; | ||
return i - 1; | ||||
} | ||||
Markus F.X.J. Oberhumer
|
r13729 | static inline int cmp(struct line *a, struct line *b) | ||
mpm@selenic.com
|
r400 | { | ||
Markus F.X.J. Oberhumer
|
r13732 | return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len); | ||
mpm@selenic.com
|
r400 | } | ||
static int equatelines(struct line *a, int an, struct line *b, int bn) | ||||
{ | ||||
Matt Mackall
|
r5452 | int i, j, buckets = 1, t, scale; | ||
struct pos *h = NULL; | ||||
mpm@selenic.com
|
r400 | |||
/* build a hash table of the next highest power of 2 */ | ||||
while (buckets < bn + 1) | ||||
buckets *= 2; | ||||
Christoph Spiel
|
r5339 | /* try to allocate a large hash table to avoid collisions */ | ||
Matt Mackall
|
r5452 | for (scale = 4; scale; scale /= 2) { | ||
Christoph Spiel
|
r5339 | h = (struct pos *)malloc(scale * buckets * sizeof(struct pos)); | ||
Matt Mackall
|
r5452 | if (h) | ||
break; | ||||
} | ||||
Christoph Spiel
|
r5339 | |||
mpm@selenic.com
|
r474 | if (!h) | ||
mpm@selenic.com
|
r400 | return 0; | ||
Christoph Spiel
|
r5339 | buckets = buckets * scale - 1; | ||
mpm@selenic.com
|
r400 | /* clear the hash table */ | ||
mpm@selenic.com
|
r474 | for (i = 0; i <= buckets; i++) { | ||
Matt Mackall
|
r29013 | h[i].pos = -1; | ||
mpm@selenic.com
|
r474 | h[i].len = 0; | ||
} | ||||
mpm@selenic.com
|
r400 | |||
/* add lines to the hash table chains */ | ||||
Matt Mackall
|
r29013 | for (i = 0; i < bn; i++) { | ||
mpm@selenic.com
|
r400 | /* find the equivalence class */ | ||
Matt Mackall
|
r29013 | for (j = b[i].hash & buckets; h[j].pos != -1; | ||
mpm@selenic.com
|
r474 | j = (j + 1) & buckets) | ||
if (!cmp(b + i, b + h[j].pos)) | ||||
mpm@selenic.com
|
r400 | break; | ||
/* add to the head of the equivalence class */ | ||||
mpm@selenic.com
|
r474 | b[i].n = h[j].pos; | ||
mpm@selenic.com
|
r433 | b[i].e = j; | ||
mpm@selenic.com
|
r474 | h[j].pos = i; | ||
h[j].len++; /* keep track of popularity */ | ||||
mpm@selenic.com
|
r400 | } | ||
/* compute popularity threshold */ | ||||
Benoit Boissinot
|
r9534 | t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1); | ||
mpm@selenic.com
|
r400 | |||
/* match items in a to their equivalence class in b */ | ||||
for (i = 0; i < an; i++) { | ||||
/* find the equivalence class */ | ||||
Matt Mackall
|
r29013 | for (j = a[i].hash & buckets; h[j].pos != -1; | ||
mpm@selenic.com
|
r474 | j = (j + 1) & buckets) | ||
if (!cmp(a + i, b + h[j].pos)) | ||||
mpm@selenic.com
|
r400 | break; | ||
mpm@selenic.com
|
r433 | a[i].e = j; /* use equivalence class for quick compare */ | ||
twaldmann@thinkmo.de
|
r1542 | if (h[j].len <= t) | ||
mpm@selenic.com
|
r474 | a[i].n = h[j].pos; /* point to head of match list */ | ||
mpm@selenic.com
|
r400 | else | ||
Matt Mackall
|
r29013 | a[i].n = -1; /* too popular */ | ||
mpm@selenic.com
|
r400 | } | ||
/* discard hash tables */ | ||||
free(h); | ||||
return 1; | ||||
} | ||||
mpm@selenic.com
|
r474 | static int longest_match(struct line *a, struct line *b, struct pos *pos, | ||
mpm@selenic.com
|
r400 | int a1, int a2, int b1, int b2, int *omi, int *omj) | ||
{ | ||||
Matt Mackall
|
r29323 | int mi = a1, mj = b1, mk = 0, i, j, k, half; | ||
Matt Mackall
|
r29015 | |||
/* window our search on large regions to better bound | ||||
worst-case performance. by choosing a window at the end, we | ||||
reduce skipping overhead on the b chains. */ | ||||
if (a2 - a1 > 30000) | ||||
a1 = a2 - 30000; | ||||
half = (a1 + a2) / 2; | ||||
mpm@selenic.com
|
r400 | |||
for (i = a1; i < a2; i++) { | ||||
Matt Mackall
|
r29013 | /* skip all lines in b after the current block */ | ||
for (j = a[i].n; j >= b2; j = b[j].n) | ||||
mpm@selenic.com
|
r400 | ; | ||
/* loop through all lines match a[i] in b */ | ||||
Matt Mackall
|
r29013 | for (; j >= b1; j = b[j].n) { | ||
mpm@selenic.com
|
r400 | /* does this extend an earlier match? */ | ||
Matt Mackall
|
r29322 | for (k = 1; j - k >= b1 && i - k >= a1; k++) { | ||
/* reached an earlier match? */ | ||||
if (pos[j - k].pos == i - k) { | ||||
k += pos[j - k].len; | ||||
break; | ||||
} | ||||
/* previous line mismatch? */ | ||||
if (a[i - k].e != b[j - k].e) | ||||
break; | ||||
} | ||||
mpm@selenic.com
|
r474 | pos[j].pos = i; | ||
pos[j].len = k; | ||||
mpm@selenic.com
|
r400 | |||
Matt Mackall
|
r29014 | /* best match so far? we prefer matches closer | ||
to the middle to balance recursion */ | ||||
if (k > mk || (k == mk && (i <= mi || i < half))) { | ||||
mpm@selenic.com
|
r400 | mi = i; | ||
mj = j; | ||||
mk = k; | ||||
} | ||||
} | ||||
} | ||||
if (mk) { | ||||
mi = mi - mk + 1; | ||||
mj = mj - mk + 1; | ||||
} | ||||
Matt Mackall
|
r29323 | /* expand match to include subsequent popular lines */ | ||
mpm@selenic.com
|
r400 | while (mi + mk < a2 && mj + mk < b2 && | ||
mpm@selenic.com
|
r433 | a[mi + mk].e == b[mj + mk].e) | ||
mpm@selenic.com
|
r400 | mk++; | ||
Matt Mackall
|
r29323 | *omi = mi; | ||
*omj = mj; | ||||
Matt Mackall
|
r5341 | |||
Matt Mackall
|
r29323 | return mk; | ||
mpm@selenic.com
|
r400 | } | ||
Matt Mackall
|
r13089 | static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos, | ||
int a1, int a2, int b1, int b2, struct hunk *l) | ||||
mpm@selenic.com
|
r400 | { | ||
int i, j, k; | ||||
Alistair Bell
|
r10500 | while (1) { | ||
/* find the longest match in this chunk */ | ||||
k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j); | ||||
if (!k) | ||||
Matt Mackall
|
r13089 | return l; | ||
mpm@selenic.com
|
r400 | |||
Alistair Bell
|
r10500 | /* and recurse on the remaining chunks on either side */ | ||
Matt Mackall
|
r13089 | l = recurse(a, b, pos, a1, i, b1, j, l); | ||
if (!l) | ||||
return NULL; | ||||
l->next = (struct hunk *)malloc(sizeof(struct hunk)); | ||||
if (!l->next) | ||||
return NULL; | ||||
l = l->next; | ||||
l->a1 = i; | ||||
l->a2 = i + k; | ||||
l->b1 = j; | ||||
l->b2 = j + k; | ||||
l->next = NULL; | ||||
/* tail-recursion didn't happen, so do equivalent iteration */ | ||||
Alistair Bell
|
r10500 | a1 = i + k; | ||
b1 = j + k; | ||||
} | ||||
mpm@selenic.com
|
r400 | } | ||
Matt Mackall
|
r13089 | static int diff(struct line *a, int an, struct line *b, int bn, | ||
struct hunk *base) | ||||
mpm@selenic.com
|
r400 | { | ||
Benoit Boissinot
|
r7104 | struct hunk *curr; | ||
mpm@selenic.com
|
r474 | struct pos *pos; | ||
Matt Mackall
|
r13089 | int t, count = 0; | ||
mpm@selenic.com
|
r433 | |||
/* allocate and fill arrays */ | ||||
t = equatelines(a, an, b, bn); | ||||
Jim Hague
|
r5571 | pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos)); | ||
Matt Mackall
|
r13089 | |||
if (pos && t) { | ||||
/* generate the matching block list */ | ||||
curr = recurse(a, b, pos, 0, an, 0, bn, base); | ||||
if (!curr) | ||||
return -1; | ||||
mpm@selenic.com
|
r433 | |||
Matt Mackall
|
r13089 | /* sentinel end hunk */ | ||
curr->next = (struct hunk *)malloc(sizeof(struct hunk)); | ||||
if (!curr->next) | ||||
Matt Mackall
|
r13090 | return -1; | ||
Matt Mackall
|
r13089 | curr = curr->next; | ||
curr->a1 = curr->a2 = an; | ||||
curr->b1 = curr->b2 = bn; | ||||
curr->next = NULL; | ||||
mpm@selenic.com
|
r433 | } | ||
mpm@selenic.com
|
r474 | free(pos); | ||
Benoit Boissinot
|
r7104 | |||
Benoit Boissinot
|
r7625 | /* normalize the hunk list, try to push each hunk towards the end */ | ||
Matt Mackall
|
r13089 | for (curr = base->next; curr; curr = curr->next) { | ||
struct hunk *next = curr->next; | ||||
Benoit Boissinot
|
r7104 | |||
Matt Mackall
|
r13089 | if (!next) | ||
Benoit Boissinot
|
r7104 | break; | ||
Matt Mackall
|
r29010 | if (curr->a2 == next->a1 || curr->b2 == next->b1) | ||
Matt Mackall
|
r29011 | while (curr->a2 < an && curr->b2 < bn | ||
Matt Mackall
|
r29012 | && next->a1 < next->a2 | ||
&& next->b1 < next->b2 | ||||
Matt Mackall
|
r29011 | && !cmp(a + curr->a2, b + curr->b2)) { | ||
curr->a2++; | ||||
next->a1++; | ||||
curr->b2++; | ||||
next->b1++; | ||||
} | ||||
Benoit Boissinot
|
r7104 | } | ||
Matt Mackall
|
r13089 | for (curr = base->next; curr; curr = curr->next) | ||
count++; | ||||
return count; | ||||
} | ||||
static void freehunks(struct hunk *l) | ||||
{ | ||||
struct hunk *n; | ||||
for (; l; l = n) { | ||||
n = l->next; | ||||
free(l); | ||||
} | ||||
mpm@selenic.com
|
r433 | } | ||
/*
 * Python entry point: blocks(a, b).
 *
 * Takes two string objects and returns a list of (a1, a2, b1, b2)
 * tuples, one per hunk produced by diff(), meaning lines a1..a2 of the
 * first argument match lines b1..b2 of the second. Returns NULL with
 * an exception set if argument parsing fails; any later failure is
 * reported through PyErr_NoMemory().
 */
static PyObject *blocks(PyObject *self, PyObject *args)
{
	PyObject *sa, *sb, *rl = NULL, *m;
	struct line *a = NULL, *b = NULL;
	struct hunk l, *h;
	int an, bn, count, pos = 0;

	l.next = NULL;

	if (!PyArg_ParseTuple(args, "SS:blocks", &sa, &sb))
		return NULL;

	an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
	bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
	if (!a || !b)
		goto nomem;

	count = diff(a, an, b, bn, &l);
	if (count < 0)
		goto nomem;

	rl = PyList_New(count);
	if (!rl)
		goto nomem;

	for (h = l.next; h; h = h->next) {
		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
		if (!m) {
			/* never hand back a list with NULL slots */
			Py_DECREF(rl);
			rl = NULL;
			goto nomem;
		}
		/* PyList_SetItem takes over the reference to m */
		PyList_SetItem(rl, pos, m);
		pos++;
	}

nomem:
	free(a);
	free(b);
	freehunks(l.next);
	return rl ? rl : PyErr_NoMemory();
}
/*
 * Python entry point: bdiff(a, b).
 *
 * Returns a binary patch as a bytes/string object. For every gap
 * between consecutive matching hunks the patch contains three 32-bit
 * fields written with putbe32() -- start offset in a, end offset in a,
 * length of the replacement -- followed by the replacement bytes taken
 * from b. (putbe32 is declared outside this file; presumably a
 * big-endian store.)
 *
 * Inputs longer than UINT_MAX bytes raise ValueError. Any allocation
 * failure is reported through PyErr_NoMemory(). Line splitting and
 * matching run with the interpreter thread state saved (GIL released).
 */
static PyObject *bdiff(PyObject *self, PyObject *args)
{
	char *sa, *sb, *rb;
	PyObject *result = NULL;
	struct line *al = NULL, *bl = NULL;
	struct hunk l, *h;
	int an, bn, count;
	Py_ssize_t len = 0, la, lb;
	PyThreadState *_save;

	l.next = NULL;

	if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
		return NULL;

	if (la > UINT_MAX || lb > UINT_MAX) {
		PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
		return NULL;
	}

	_save = PyEval_SaveThread();

	an = splitlines(sa, la, &al);
	bn = splitlines(sb, lb, &bl);
	if (!al || !bl)
		goto nomem;

	count = diff(al, an, bl, bn, &l);
	if (count < 0)
		goto nomem;

	/* calculate length of output; la/lb now track line indexes */
	la = lb = 0;
	for (h = l.next; h; h = h->next) {
		if (h->a1 != la || h->b1 != lb)
			/* subtract the pointers first so no out-of-bounds
			   pointer is formed by adding 12 to one of them */
			len += 12 + (bl[h->b1].l - bl[lb].l);
		la = h->a2;
		lb = h->b2;
	}

	PyEval_RestoreThread(_save);
	_save = NULL;

	result = PyBytes_FromStringAndSize(NULL, len);
	if (!result)
		goto nomem;

	/* build binary patch */
	rb = PyBytes_AsString(result);
	la = lb = 0;

	for (h = l.next; h; h = h->next) {
		if (h->a1 != la || h->b1 != lb) {
			len = bl[h->b1].l - bl[lb].l;
			putbe32((uint32_t)(al[la].l - al->l), rb);
			putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);
			putbe32((uint32_t)len, rb + 8);
			memcpy(rb + 12, bl[lb].l, len);
			rb += 12 + len;
		}
		la = h->a2;
		lb = h->b2;
	}

nomem:
	/* still non-NULL only if we jumped here before restoring above */
	if (_save)
		PyEval_RestoreThread(_save);
	free(al);
	free(bl);
	freehunks(l.next);
	return result ? result : PyErr_NoMemory();
}
Patrick Mezard
|
r15530 | /* | ||
* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise, | ||||
* reduce whitespace sequences to a single space and trim remaining whitespace | ||||
* from end of lines. | ||||
*/ | ||||
static PyObject *fixws(PyObject *self, PyObject *args) | ||||
{ | ||||
PyObject *s, *result = NULL; | ||||
char allws, c; | ||||
const char *r; | ||||
Adrian Buehlmann
|
r16749 | Py_ssize_t i, rlen, wlen = 0; | ||
Patrick Mezard
|
r15530 | char *w; | ||
if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws)) | ||||
return NULL; | ||||
r = PyBytes_AsString(s); | ||||
rlen = PyBytes_Size(s); | ||||
Jim Hague
|
r16071 | w = (char *)malloc(rlen ? rlen : 1); | ||
Patrick Mezard
|
r15530 | if (!w) | ||
goto nomem; | ||||
for (i = 0; i != rlen; i++) { | ||||
c = r[i]; | ||||
if (c == ' ' || c == '\t' || c == '\r') { | ||||
if (!allws && (wlen == 0 || w[wlen - 1] != ' ')) | ||||
w[wlen++] = ' '; | ||||
} else if (c == '\n' && !allws | ||||
&& wlen > 0 && w[wlen - 1] == ' ') { | ||||
w[wlen - 1] = '\n'; | ||||
} else { | ||||
w[wlen++] = c; | ||||
} | ||||
} | ||||
result = PyBytes_FromStringAndSize(w, wlen); | ||||
nomem: | ||||
free(w); | ||||
return result ? result : PyErr_NoMemory(); | ||||
} | ||||
mpm@selenic.com
|
r400 | static char mdiff_doc[] = "Efficient binary diff."; | ||
static PyMethodDef methods[] = { | ||||
{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"}, | ||||
mpm@selenic.com
|
r433 | {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"}, | ||
Patrick Mezard
|
r15530 | {"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"}, | ||
mpm@selenic.com
|
r400 | {NULL, NULL} | ||
}; | ||||
Renato Cunha
|
r11364 | #ifdef IS_PY3K | ||
static struct PyModuleDef bdiff_module = { | ||||
PyModuleDef_HEAD_INIT, | ||||
"bdiff", | ||||
mdiff_doc, | ||||
-1, | ||||
methods | ||||
}; | ||||
PyMODINIT_FUNC PyInit_bdiff(void) | ||||
{ | ||||
return PyModule_Create(&bdiff_module); | ||||
} | ||||
#else | ||||
mpm@selenic.com
|
r400 | PyMODINIT_FUNC initbdiff(void) | ||
{ | ||||
Py_InitModule3("bdiff", methods, mdiff_doc); | ||||
} | ||||
Renato Cunha
|
r11364 | #endif | ||
twaldmann@thinkmo.de
|
r1542 | |||