bdiff.c
467 lines
| 9.5 KiB
| text/x-c
|
CLexer
/ mercurial / bdiff.c
mpm@selenic.com
|
/*
 bdiff.c - efficient binary diff extension for Mercurial

 Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>

 This software may be used and distributed according to the terms of
 the GNU General Public License, incorporated herein by reference.

 Based roughly on Python difflib
*/
#include <Python.h> | ||||
#include <stdlib.h> | ||||
#include <string.h> | ||||
Matt Mackall
|
r5341 | #include <limits.h> | ||
tksoh@users.sourceforge.net
|
r867 | |||
Renato Cunha
|
r11364 | #include "util.h" | ||
mpm@selenic.com
|
/*
 * One line of an input buffer, as produced by splitlines().
 * The field order is part of the layout and must not change.
 */
struct line {
	int hash;	/* hash of the line's bytes */
	int len;	/* length in bytes, including the trailing newline if any */
	int n;		/* index of the next matching line in b, INT_MAX if none */
	int e;		/* equivalence class: hash table slot chosen by equatelines() */
	const char *l;	/* start of the line inside the caller's buffer (not owned) */
};
mpm@selenic.com
|
/*
 * Small (position, length) pair.  Used both as a hash table bucket in
 * equatelines() and as per-line match bookkeeping in longest_match().
 */
struct pos {
	int pos;
	int len;
};
Matt Mackall
|
struct hunk;

/*
 * A matching block: lines a[a1..a2) are equal to lines b[b1..b2).
 * Hunks form a singly linked list terminated by a NULL next pointer.
 */
struct hunk {
	int a1;
	int a2;
	int b1;
	int b2;
	struct hunk *next;
};
Markus F.X.J. Oberhumer
|
r13729 | static int splitlines(const char *a, int len, struct line **lr) | ||
mpm@selenic.com
|
r400 | { | ||
Markus F.X.J. Oberhumer
|
r13732 | unsigned hash; | ||
Markus F.X.J. Oberhumer
|
r13731 | int i; | ||
mpm@selenic.com
|
r400 | const char *p, *b = a; | ||
Christoph Spiel
|
r5340 | const char * const plast = a + len - 1; | ||
mpm@selenic.com
|
r400 | struct line *l; | ||
/* count the lines */ | ||||
i = 1; /* extra line for sentinel */ | ||||
for (p = a; p < a + len; p++) | ||||
Christoph Spiel
|
r5340 | if (*p == '\n' || p == plast) | ||
mpm@selenic.com
|
r400 | i++; | ||
TK Soh
|
r1978 | *lr = l = (struct line *)malloc(sizeof(struct line) * i); | ||
mpm@selenic.com
|
r400 | if (!l) | ||
return -1; | ||||
/* build the line array and calculate hashes */ | ||||
Markus F.X.J. Oberhumer
|
r13732 | hash = 0; | ||
mpm@selenic.com
|
r400 | for (p = a; p < a + len; p++) { | ||
Matt Mackall
|
r5342 | /* Leonid Yuriev's hash */ | ||
Markus F.X.J. Oberhumer
|
r13732 | hash = (hash * 1664525) + (unsigned char)*p + 1013904223; | ||
Matt Mackall
|
r5342 | |||
Christoph Spiel
|
r5340 | if (*p == '\n' || p == plast) { | ||
Markus F.X.J. Oberhumer
|
r13732 | l->hash = hash; | ||
hash = 0; | ||||
mpm@selenic.com
|
r400 | l->len = p - b + 1; | ||
l->l = b; | ||||
Matt Mackall
|
r5341 | l->n = INT_MAX; | ||
mpm@selenic.com
|
r400 | l++; | ||
b = p + 1; | ||||
} | ||||
} | ||||
/* set up a sentinel */ | ||||
Markus F.X.J. Oberhumer
|
r13732 | l->hash = 0; | ||
Markus F.X.J. Oberhumer
|
r13731 | l->len = 0; | ||
mpm@selenic.com
|
r400 | l->l = a + len; | ||
return i - 1; | ||||
} | ||||
Markus F.X.J. Oberhumer
|
r13729 | static inline int cmp(struct line *a, struct line *b) | ||
mpm@selenic.com
|
r400 | { | ||
Markus F.X.J. Oberhumer
|
r13732 | return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len); | ||
mpm@selenic.com
|
r400 | } | ||
static int equatelines(struct line *a, int an, struct line *b, int bn) | ||||
{ | ||||
Matt Mackall
|
r5452 | int i, j, buckets = 1, t, scale; | ||
struct pos *h = NULL; | ||||
mpm@selenic.com
|
r400 | |||
/* build a hash table of the next highest power of 2 */ | ||||
while (buckets < bn + 1) | ||||
buckets *= 2; | ||||
Christoph Spiel
|
r5339 | /* try to allocate a large hash table to avoid collisions */ | ||
Matt Mackall
|
r5452 | for (scale = 4; scale; scale /= 2) { | ||
Christoph Spiel
|
r5339 | h = (struct pos *)malloc(scale * buckets * sizeof(struct pos)); | ||
Matt Mackall
|
r5452 | if (h) | ||
break; | ||||
} | ||||
Christoph Spiel
|
r5339 | |||
mpm@selenic.com
|
r474 | if (!h) | ||
mpm@selenic.com
|
r400 | return 0; | ||
Christoph Spiel
|
r5339 | buckets = buckets * scale - 1; | ||
mpm@selenic.com
|
r400 | /* clear the hash table */ | ||
mpm@selenic.com
|
r474 | for (i = 0; i <= buckets; i++) { | ||
Matt Mackall
|
r5341 | h[i].pos = INT_MAX; | ||
mpm@selenic.com
|
r474 | h[i].len = 0; | ||
} | ||||
mpm@selenic.com
|
r400 | |||
/* add lines to the hash table chains */ | ||||
for (i = bn - 1; i >= 0; i--) { | ||||
/* find the equivalence class */ | ||||
Markus F.X.J. Oberhumer
|
r13732 | for (j = b[i].hash & buckets; h[j].pos != INT_MAX; | ||
mpm@selenic.com
|
r474 | j = (j + 1) & buckets) | ||
if (!cmp(b + i, b + h[j].pos)) | ||||
mpm@selenic.com
|
r400 | break; | ||
/* add to the head of the equivalence class */ | ||||
mpm@selenic.com
|
r474 | b[i].n = h[j].pos; | ||
mpm@selenic.com
|
r433 | b[i].e = j; | ||
mpm@selenic.com
|
r474 | h[j].pos = i; | ||
h[j].len++; /* keep track of popularity */ | ||||
mpm@selenic.com
|
r400 | } | ||
/* compute popularity threshold */ | ||||
Benoit Boissinot
|
r9534 | t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1); | ||
mpm@selenic.com
|
r400 | |||
/* match items in a to their equivalence class in b */ | ||||
for (i = 0; i < an; i++) { | ||||
/* find the equivalence class */ | ||||
Markus F.X.J. Oberhumer
|
r13732 | for (j = a[i].hash & buckets; h[j].pos != INT_MAX; | ||
mpm@selenic.com
|
r474 | j = (j + 1) & buckets) | ||
if (!cmp(a + i, b + h[j].pos)) | ||||
mpm@selenic.com
|
r400 | break; | ||
mpm@selenic.com
|
r433 | a[i].e = j; /* use equivalence class for quick compare */ | ||
twaldmann@thinkmo.de
|
r1542 | if (h[j].len <= t) | ||
mpm@selenic.com
|
r474 | a[i].n = h[j].pos; /* point to head of match list */ | ||
mpm@selenic.com
|
r400 | else | ||
Matt Mackall
|
r5341 | a[i].n = INT_MAX; /* too popular */ | ||
mpm@selenic.com
|
r400 | } | ||
/* discard hash tables */ | ||||
free(h); | ||||
return 1; | ||||
} | ||||
mpm@selenic.com
|
r474 | static int longest_match(struct line *a, struct line *b, struct pos *pos, | ||
mpm@selenic.com
|
r400 | int a1, int a2, int b1, int b2, int *omi, int *omj) | ||
{ | ||||
int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k; | ||||
for (i = a1; i < a2; i++) { | ||||
/* skip things before the current block */ | ||||
Matt Mackall
|
r5341 | for (j = a[i].n; j < b1; j = b[j].n) | ||
mpm@selenic.com
|
r400 | ; | ||
/* loop through all lines match a[i] in b */ | ||||
Matt Mackall
|
r5341 | for (; j < b2; j = b[j].n) { | ||
mpm@selenic.com
|
r400 | /* does this extend an earlier match? */ | ||
mpm@selenic.com
|
r474 | if (i > a1 && j > b1 && pos[j - 1].pos == i - 1) | ||
k = pos[j - 1].len + 1; | ||||
mpm@selenic.com
|
r400 | else | ||
k = 1; | ||||
mpm@selenic.com
|
r474 | pos[j].pos = i; | ||
pos[j].len = k; | ||||
mpm@selenic.com
|
r400 | |||
/* best match so far? */ | ||||
if (k > mk) { | ||||
mi = i; | ||||
mj = j; | ||||
mk = k; | ||||
} | ||||
} | ||||
} | ||||
if (mk) { | ||||
mi = mi - mk + 1; | ||||
mj = mj - mk + 1; | ||||
} | ||||
/* expand match to include neighboring popular lines */ | ||||
while (mi - mb > a1 && mj - mb > b1 && | ||||
mpm@selenic.com
|
r433 | a[mi - mb - 1].e == b[mj - mb - 1].e) | ||
mpm@selenic.com
|
r400 | mb++; | ||
while (mi + mk < a2 && mj + mk < b2 && | ||||
mpm@selenic.com
|
r433 | a[mi + mk].e == b[mj + mk].e) | ||
mpm@selenic.com
|
r400 | mk++; | ||
*omi = mi - mb; | ||||
*omj = mj - mb; | ||||
Matt Mackall
|
r5341 | |||
mpm@selenic.com
|
r400 | return mk + mb; | ||
} | ||||
Matt Mackall
|
r13089 | static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos, | ||
int a1, int a2, int b1, int b2, struct hunk *l) | ||||
mpm@selenic.com
|
r400 | { | ||
int i, j, k; | ||||
Alistair Bell
|
r10500 | while (1) { | ||
/* find the longest match in this chunk */ | ||||
k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j); | ||||
if (!k) | ||||
Matt Mackall
|
r13089 | return l; | ||
mpm@selenic.com
|
r400 | |||
Alistair Bell
|
r10500 | /* and recurse on the remaining chunks on either side */ | ||
Matt Mackall
|
r13089 | l = recurse(a, b, pos, a1, i, b1, j, l); | ||
if (!l) | ||||
return NULL; | ||||
l->next = (struct hunk *)malloc(sizeof(struct hunk)); | ||||
if (!l->next) | ||||
return NULL; | ||||
l = l->next; | ||||
l->a1 = i; | ||||
l->a2 = i + k; | ||||
l->b1 = j; | ||||
l->b2 = j + k; | ||||
l->next = NULL; | ||||
/* tail-recursion didn't happen, so do equivalent iteration */ | ||||
Alistair Bell
|
r10500 | a1 = i + k; | ||
b1 = j + k; | ||||
} | ||||
mpm@selenic.com
|
r400 | } | ||
Matt Mackall
|
r13089 | static int diff(struct line *a, int an, struct line *b, int bn, | ||
struct hunk *base) | ||||
mpm@selenic.com
|
r400 | { | ||
Benoit Boissinot
|
r7104 | struct hunk *curr; | ||
mpm@selenic.com
|
r474 | struct pos *pos; | ||
Matt Mackall
|
r13089 | int t, count = 0; | ||
mpm@selenic.com
|
r433 | |||
/* allocate and fill arrays */ | ||||
t = equatelines(a, an, b, bn); | ||||
Jim Hague
|
r5571 | pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos)); | ||
Matt Mackall
|
r13089 | |||
if (pos && t) { | ||||
/* generate the matching block list */ | ||||
curr = recurse(a, b, pos, 0, an, 0, bn, base); | ||||
if (!curr) | ||||
return -1; | ||||
mpm@selenic.com
|
r433 | |||
Matt Mackall
|
r13089 | /* sentinel end hunk */ | ||
curr->next = (struct hunk *)malloc(sizeof(struct hunk)); | ||||
if (!curr->next) | ||||
Matt Mackall
|
r13090 | return -1; | ||
Matt Mackall
|
r13089 | curr = curr->next; | ||
curr->a1 = curr->a2 = an; | ||||
curr->b1 = curr->b2 = bn; | ||||
curr->next = NULL; | ||||
mpm@selenic.com
|
r433 | } | ||
mpm@selenic.com
|
r474 | free(pos); | ||
Benoit Boissinot
|
r7104 | |||
Benoit Boissinot
|
r7625 | /* normalize the hunk list, try to push each hunk towards the end */ | ||
Matt Mackall
|
r13089 | for (curr = base->next; curr; curr = curr->next) { | ||
struct hunk *next = curr->next; | ||||
Benoit Boissinot
|
r7104 | int shift = 0; | ||
Matt Mackall
|
r13089 | if (!next) | ||
Benoit Boissinot
|
r7104 | break; | ||
if (curr->a2 == next->a1) | ||||
Matt Mackall
|
r10282 | while (curr->a2 + shift < an && curr->b2 + shift < bn | ||
&& !cmp(a + curr->a2 + shift, | ||||
b + curr->b2 + shift)) | ||||
Benoit Boissinot
|
r7104 | shift++; | ||
else if (curr->b2 == next->b1) | ||||
Matt Mackall
|
r10282 | while (curr->b2 + shift < bn && curr->a2 + shift < an | ||
&& !cmp(b + curr->b2 + shift, | ||||
a + curr->a2 + shift)) | ||||
Benoit Boissinot
|
r7104 | shift++; | ||
if (!shift) | ||||
continue; | ||||
curr->b2 += shift; | ||||
next->b1 += shift; | ||||
curr->a2 += shift; | ||||
next->a1 += shift; | ||||
} | ||||
Matt Mackall
|
r13089 | for (curr = base->next; curr; curr = curr->next) | ||
count++; | ||||
return count; | ||||
} | ||||
static void freehunks(struct hunk *l) | ||||
{ | ||||
struct hunk *n; | ||||
for (; l; l = n) { | ||||
n = l->next; | ||||
free(l); | ||||
} | ||||
mpm@selenic.com
|
r433 | } | ||
/*
 * blocks(a, b) -> list of (a1, a2, b1, b2) tuples
 *
 * Takes two byte strings and returns the matching blocks between them:
 * each tuple states that lines a1..a2 of the first string equal lines
 * b1..b2 of the second (end indices exclusive).  The list ends with the
 * zero-length sentinel block produced by diff().
 *
 * Returns NULL with an exception set on bad arguments or when memory
 * runs out.
 */
static PyObject *blocks(PyObject *self, PyObject *args)
{
	PyObject *sa, *sb, *rl = NULL, *m;
	struct line *a, *b;
	struct hunk l, *h;
	int an, bn, count, pos = 0;

	/*
	 * The cleanup code at nomem frees l.next, so it has to be valid
	 * before the first jump there (a failed splitlines() below).
	 */
	l.next = NULL;

	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
		return NULL;

	an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
	bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
	if (!a || !b)
		goto nomem;

	count = diff(a, an, b, bn, &l);
	if (count < 0)
		goto nomem;

	rl = PyList_New(count);
	if (!rl)
		goto nomem;

	for (h = l.next; h; h = h->next) {
		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
		if (!m) {
			/* never hand back a list with unfilled slots */
			Py_DECREF(rl);
			rl = NULL;
			goto nomem;
		}
		/* PyList_SetItem takes over the reference to m */
		PyList_SetItem(rl, pos, m);
		pos++;
	}

nomem:
	free(a);
	free(b);
	freehunks(l.next);
	return rl ? rl : PyErr_NoMemory();
}
/*
 * bdiff(a, b) -> binary delta
 *
 * Builds a delta that turns string a into string b.  The result is a
 * sequence of fragments, one for every gap between consecutive matching
 * blocks.  Each fragment is a 12 byte header of three 32-bit values
 * passed through htonl() - start offset in a, end offset in a, length of
 * the replacement data - followed by the replacement bytes taken from b.
 *
 * Returns NULL with an exception set on bad arguments or when memory
 * runs out.
 */
static PyObject *bdiff(PyObject *self, PyObject *args)
{
	char *sa, *sb, *rb;
	PyObject *result = NULL;
	struct line *al, *bl;
	struct hunk l, *h;
	uint32_t encode[3];
	int an, bn, len = 0, la, lb, count;

	/*
	 * The cleanup code at nomem frees l.next, so it has to be valid
	 * before the first jump there (a failed splitlines() below).
	 */
	l.next = NULL;

	if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
		return NULL;

	an = splitlines(sa, la, &al);
	bn = splitlines(sb, lb, &bl);
	if (!al || !bl)
		goto nomem;

	count = diff(al, an, bl, bn, &l);
	if (count < 0)
		goto nomem;

	/* calculate length of output */
	la = lb = 0;
	for (h = l.next; h; h = h->next) {
		/* a gap before this block costs a header plus the new bytes */
		if (h->a1 != la || h->b1 != lb)
			len += 12 + bl[h->b1].l - bl[lb].l;
		la = h->a2;
		lb = h->b2;
	}

	result = PyBytes_FromStringAndSize(NULL, len);
	if (!result)
		goto nomem;

	/* build binary patch */
	rb = PyBytes_AsString(result);
	la = lb = 0;
	for (h = l.next; h; h = h->next) {
		if (h->a1 != la || h->b1 != lb) {
			len = bl[h->b1].l - bl[lb].l;
			encode[0] = htonl(al[la].l - al->l);
			encode[1] = htonl(al[h->a1].l - al->l);
			encode[2] = htonl(len);
			memcpy(rb, encode, 12);
			memcpy(rb + 12, bl[lb].l, len);
			rb += 12 + len;
		}
		la = h->a2;
		lb = h->b2;
	}

nomem:
	free(al);
	free(bl);
	freehunks(l.next);
	return result ? result : PyErr_NoMemory();
}
Patrick Mezard
|
r15530 | /* | ||
* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise, | ||||
* reduce whitespace sequences to a single space and trim remaining whitespace | ||||
* from end of lines. | ||||
*/ | ||||
static PyObject *fixws(PyObject *self, PyObject *args) | ||||
{ | ||||
PyObject *s, *result = NULL; | ||||
char allws, c; | ||||
const char *r; | ||||
int i, rlen, wlen = 0; | ||||
char *w; | ||||
if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws)) | ||||
return NULL; | ||||
r = PyBytes_AsString(s); | ||||
rlen = PyBytes_Size(s); | ||||
Jim Hague
|
r16071 | w = (char *)malloc(rlen ? rlen : 1); | ||
Patrick Mezard
|
r15530 | if (!w) | ||
goto nomem; | ||||
for (i = 0; i != rlen; i++) { | ||||
c = r[i]; | ||||
if (c == ' ' || c == '\t' || c == '\r') { | ||||
if (!allws && (wlen == 0 || w[wlen - 1] != ' ')) | ||||
w[wlen++] = ' '; | ||||
} else if (c == '\n' && !allws | ||||
&& wlen > 0 && w[wlen - 1] == ' ') { | ||||
w[wlen - 1] = '\n'; | ||||
} else { | ||||
w[wlen++] = c; | ||||
} | ||||
} | ||||
result = PyBytes_FromStringAndSize(w, wlen); | ||||
nomem: | ||||
free(w); | ||||
return result ? result : PyErr_NoMemory(); | ||||
} | ||||
mpm@selenic.com
|
r400 | static char mdiff_doc[] = "Efficient binary diff."; | ||
static PyMethodDef methods[] = { | ||||
{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"}, | ||||
mpm@selenic.com
|
r433 | {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"}, | ||
Patrick Mezard
|
r15530 | {"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"}, | ||
mpm@selenic.com
|
r400 | {NULL, NULL} | ||
}; | ||||
Renato Cunha
|
r11364 | #ifdef IS_PY3K | ||
static struct PyModuleDef bdiff_module = { | ||||
PyModuleDef_HEAD_INIT, | ||||
"bdiff", | ||||
mdiff_doc, | ||||
-1, | ||||
methods | ||||
}; | ||||
PyMODINIT_FUNC PyInit_bdiff(void) | ||||
{ | ||||
return PyModule_Create(&bdiff_module); | ||||
} | ||||
#else | ||||
mpm@selenic.com
|
r400 | PyMODINIT_FUNC initbdiff(void) | ||
{ | ||||
Py_InitModule3("bdiff", methods, mdiff_doc); | ||||
} | ||||
Renato Cunha
|
r11364 | #endif | ||
twaldmann@thinkmo.de
|
r1542 | |||