mpatch.c
444 lines
| 9.3 KiB
| text/x-c
|
CLexer
/ mercurial / mpatch.c
mpm@selenic.com
|
/*
 mpatch.c - efficient binary patching for Mercurial

 This implements a patch algorithm that's O(m + n log n) where m is the
 size of the output and n is the number of patches.

 Given a list of binary patches, it unpacks each into a hunk list,
 then combines the hunk lists with a treewise recursion to form a
 single hunk list. This hunk list is then applied to the original
 text.

 The text (or binary) fragments are copied directly from their source
 Python objects into a preallocated output string to avoid the
 allocation of intermediate Python objects. Working memory is about 2x
 the total number of hunks.

 Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>

 This software may be used and distributed according to the terms
 of the GNU General Public License, incorporated herein by reference.
*/
#include <Python.h> | ||||
#include <stdlib.h> | ||||
#include <string.h> | ||||
Vadim Gelfer
|
r2468 | |||
Shun-ichi GOTO
|
r5459 | /* Definitions to get compatibility with python 2.4 and earlier which | ||
does not have Py_ssize_t. See also PEP 353. | ||||
Note: msvc (8 or earlier) does not have ssize_t, so we use Py_ssize_t. | ||||
*/ | ||||
#if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN) | ||||
typedef int Py_ssize_t; | ||||
#define PY_SSIZE_T_MAX INT_MAX | ||||
#define PY_SSIZE_T_MIN INT_MIN | ||||
#endif | ||||
mpm@selenic.com
|
r410 | #ifdef _WIN32 | ||
Vadim Gelfer
|
r2468 | # ifdef _MSC_VER | ||
/* msvc 6.0 has problems */ | ||||
# define inline __inline | ||||
mpm@selenic.com
|
r551 | typedef unsigned long uint32_t; | ||
Vadim Gelfer
|
r2468 | # else | ||
# include <stdint.h> | ||||
# endif | ||||
mpm@selenic.com
|
/* Fallback ntohl for Windows builds: unconditionally reverses the
   four bytes of a 32-bit value. */
static uint32_t ntohl(uint32_t x)
{
	/* swap the two bytes inside each 16-bit half ... */
	x = ((x & 0x00ff00ffUL) << 8) | ((x >> 8) & 0x00ff00ffUL);
	/* ... then swap the two halves */
	return (uint32_t)((x << 16) | (x >> 16));
}
#else | ||||
Vadim Gelfer
|
r2468 | /* not windows */ | ||
# include <sys/types.h> | ||||
Scott McCreary
|
r7036 | # if defined __BEOS__ && !defined __HAIKU__ | ||
Andrew Bachmann
|
r4073 | # include <ByteOrder.h> | ||
# else | ||||
# include <arpa/inet.h> | ||||
# endif | ||||
Thomas Arendsen Hein
|
r2543 | # include <inttypes.h> | ||
mpm@selenic.com
|
r410 | #endif | ||
mpm@selenic.com
|
r72 | |||
static char mpatch_doc[] = "Efficient binary patching."; | ||||
Benoit Boissinot
|
r1722 | static PyObject *mpatch_Error; | ||
mpm@selenic.com
|
r72 | |||
/* One hunk of a patch: bytes [start, end) of the source text are
   replaced by the len bytes found at data. */
struct frag {
	int start;
	int end;
	int len;
	const char *data;
};
struct flist { | ||||
struct frag *base, *head, *tail; | ||||
}; | ||||
static struct flist *lalloc(int size) | ||||
{ | ||||
mpm@selenic.com
|
r128 | struct flist *a = NULL; | ||
mpm@selenic.com
|
r72 | |||
Matt Mackall
|
r3138 | if (size < 1) | ||
size = 1; | ||||
TK Soh
|
r1978 | a = (struct flist *)malloc(sizeof(struct flist)); | ||
mpm@selenic.com
|
r128 | if (a) { | ||
TK Soh
|
r1978 | a->base = (struct frag *)malloc(sizeof(struct frag) * size); | ||
Thomas Arendsen Hein
|
r2048 | if (a->base) { | ||
mpm@selenic.com
|
r128 | a->head = a->tail = a->base; | ||
Thomas Arendsen Hein
|
r2048 | return a; | ||
} | ||||
free(a); | ||||
a = NULL; | ||||
mpm@selenic.com
|
r128 | } | ||
Benoit Boissinot
|
r1722 | if (!PyErr_Occurred()) | ||
PyErr_NoMemory(); | ||||
return NULL; | ||||
mpm@selenic.com
|
r72 | } | ||
static void lfree(struct flist *a) | ||||
{ | ||||
mpm@selenic.com
|
r128 | if (a) { | ||
free(a->base); | ||||
free(a); | ||||
} | ||||
mpm@selenic.com
|
r72 | } | ||
static int lsize(struct flist *a) | ||||
{ | ||||
return a->tail - a->head; | ||||
} | ||||
/* move hunks in source that are less cut to dest, compensating | ||||
for changes in offset. the last hunk may be split if necessary. | ||||
*/ | ||||
static int gather(struct flist *dest, struct flist *src, int cut, int offset) | ||||
{ | ||||
struct frag *d = dest->tail, *s = src->head; | ||||
int postend, c, l; | ||||
while (s != src->tail) { | ||||
if (s->start + offset >= cut) | ||||
mpm@selenic.com
|
r82 | break; /* we've gone far enough */ | ||
mpm@selenic.com
|
r72 | |||
postend = offset + s->start + s->len; | ||||
if (postend <= cut) { | ||||
/* save this hunk */ | ||||
offset += s->start + s->len - s->end; | ||||
*d++ = *s++; | ||||
} | ||||
else { | ||||
/* break up this hunk */ | ||||
c = cut - offset; | ||||
if (s->end < c) | ||||
c = s->end; | ||||
l = cut - offset - s->start; | ||||
if (s->len < l) | ||||
l = s->len; | ||||
offset += s->start + l - c; | ||||
d->start = s->start; | ||||
d->end = c; | ||||
d->len = l; | ||||
d->data = s->data; | ||||
d++; | ||||
s->start = c; | ||||
s->len = s->len - l; | ||||
s->data = s->data + l; | ||||
mpm@selenic.com
|
r82 | break; | ||
mpm@selenic.com
|
r72 | } | ||
} | ||||
dest->tail = d; | ||||
src->head = s; | ||||
return offset; | ||||
} | ||||
/* like gather, but with no output list */ | ||||
static int discard(struct flist *src, int cut, int offset) | ||||
{ | ||||
struct frag *s = src->head; | ||||
int postend, c, l; | ||||
while (s != src->tail) { | ||||
if (s->start + offset >= cut) | ||||
mpm@selenic.com
|
r82 | break; | ||
mpm@selenic.com
|
r72 | |||
postend = offset + s->start + s->len; | ||||
if (postend <= cut) { | ||||
offset += s->start + s->len - s->end; | ||||
s++; | ||||
} | ||||
else { | ||||
c = cut - offset; | ||||
if (s->end < c) | ||||
c = s->end; | ||||
l = cut - offset - s->start; | ||||
if (s->len < l) | ||||
l = s->len; | ||||
offset += s->start + l - c; | ||||
s->start = c; | ||||
s->len = s->len - l; | ||||
s->data = s->data + l; | ||||
mpm@selenic.com
|
r82 | break; | ||
mpm@selenic.com
|
r72 | } | ||
} | ||||
src->head = s; | ||||
return offset; | ||||
} | ||||
/* combine hunk lists a and b, while adjusting b for offset changes in a/ | ||||
this deletes a and b and returns the resultant list. */ | ||||
static struct flist *combine(struct flist *a, struct flist *b) | ||||
{ | ||||
mpm@selenic.com
|
r128 | struct flist *c = NULL; | ||
struct frag *bh, *ct; | ||||
mpm@selenic.com
|
r72 | int offset = 0, post; | ||
mpm@selenic.com
|
r128 | if (a && b) | ||
c = lalloc((lsize(a) + lsize(b)) * 2); | ||||
if (c) { | ||||
mpm@selenic.com
|
r72 | |||
mpm@selenic.com
|
r128 | for (bh = b->head; bh != b->tail; bh++) { | ||
/* save old hunks */ | ||||
offset = gather(c, a, bh->start, offset); | ||||
mpm@selenic.com
|
r72 | |||
mpm@selenic.com
|
r128 | /* discard replaced hunks */ | ||
post = discard(a, bh->end, offset); | ||||
mpm@selenic.com
|
r72 | |||
mpm@selenic.com
|
r128 | /* insert new hunk */ | ||
ct = c->tail; | ||||
ct->start = bh->start - offset; | ||||
ct->end = bh->end - post; | ||||
ct->len = bh->len; | ||||
ct->data = bh->data; | ||||
c->tail++; | ||||
offset = post; | ||||
} | ||||
/* hold on to tail from a */ | ||||
memcpy(c->tail, a->head, sizeof(struct frag) * lsize(a)); | ||||
c->tail += lsize(a); | ||||
mpm@selenic.com
|
r72 | } | ||
lfree(a); | ||||
lfree(b); | ||||
return c; | ||||
} | ||||
/* decode a binary patch into a hunk list */ | ||||
Matt Mackall
|
r5444 | static struct flist *decode(const char *bin, int len) | ||
mpm@selenic.com
|
r72 | { | ||
struct flist *l; | ||||
struct frag *lt; | ||||
Matt Mackall
|
r5444 | const char *data = bin + 12, *end = bin + len; | ||
mpm@selenic.com
|
r384 | char decode[12]; /* for dealing with alignment issues */ | ||
mpm@selenic.com
|
r72 | |||
/* assume worst case size, we won't have many of these lists */ | ||||
l = lalloc(len / 12); | ||||
Benoit Boissinot
|
r1722 | if (!l) | ||
return NULL; | ||||
mpm@selenic.com
|
r72 | lt = l->tail; | ||
Thomas Arendsen Hein
|
r4358 | while (data <= end) { | ||
mpm@selenic.com
|
r384 | memcpy(decode, bin, 12); | ||
lt->start = ntohl(*(uint32_t *)decode); | ||||
lt->end = ntohl(*(uint32_t *)(decode + 4)); | ||||
lt->len = ntohl(*(uint32_t *)(decode + 8)); | ||||
Thomas Arendsen Hein
|
r4358 | if (lt->start > lt->end) | ||
break; /* sanity check */ | ||||
bin = data + lt->len; | ||||
if (bin < data) | ||||
break; /* big data + big (bogus) len can wrap around */ | ||||
lt->data = data; | ||||
data = bin + 12; | ||||
mpm@selenic.com
|
r72 | lt++; | ||
} | ||||
Benoit Boissinot
|
r1722 | if (bin != end) { | ||
if (!PyErr_Occurred()) | ||||
PyErr_SetString(mpatch_Error, "patch cannot be decoded"); | ||||
lfree(l); | ||||
return NULL; | ||||
} | ||||
mpm@selenic.com
|
r72 | l->tail = lt; | ||
return l; | ||||
} | ||||
/* calculate the size of resultant text */ | ||||
static int calcsize(int len, struct flist *l) | ||||
{ | ||||
int outlen = 0, last = 0; | ||||
struct frag *f = l->head; | ||||
while (f != l->tail) { | ||||
Benoit Boissinot
|
r1722 | if (f->start < last || f->end > len) { | ||
if (!PyErr_Occurred()) | ||||
PyErr_SetString(mpatch_Error, | ||||
"invalid patch"); | ||||
return -1; | ||||
} | ||||
mpm@selenic.com
|
r72 | outlen += f->start - last; | ||
last = f->end; | ||||
outlen += f->len; | ||||
f++; | ||||
} | ||||
outlen += len - last; | ||||
return outlen; | ||||
} | ||||
Matt Mackall
|
r5444 | static int apply(char *buf, const char *orig, int len, struct flist *l) | ||
mpm@selenic.com
|
r72 | { | ||
struct frag *f = l->head; | ||||
int last = 0; | ||||
char *p = buf; | ||||
while (f != l->tail) { | ||||
Benoit Boissinot
|
r1722 | if (f->start < last || f->end > len) { | ||
if (!PyErr_Occurred()) | ||||
PyErr_SetString(mpatch_Error, | ||||
"invalid patch"); | ||||
return 0; | ||||
} | ||||
mpm@selenic.com
|
r72 | memcpy(p, orig + last, f->start - last); | ||
p += f->start - last; | ||||
memcpy(p, f->data, f->len); | ||||
last = f->end; | ||||
p += f->len; | ||||
f++; | ||||
} | ||||
memcpy(p, orig + last, len - last); | ||||
Benoit Boissinot
|
r1722 | return 1; | ||
mpm@selenic.com
|
r72 | } | ||
/* recursively generate a patch of all bins between start and end */ | ||||
static struct flist *fold(PyObject *bins, int start, int end) | ||||
{ | ||||
int len; | ||||
Shun-ichi GOTO
|
r5459 | Py_ssize_t blen; | ||
Matt Mackall
|
r5444 | const char *buffer; | ||
mpm@selenic.com
|
r72 | |||
if (start + 1 == end) { | ||||
/* trivial case, output a decoded list */ | ||||
PyObject *tmp = PyList_GetItem(bins, start); | ||||
mpm@selenic.com
|
r128 | if (!tmp) | ||
return NULL; | ||||
Matt Mackall
|
r5444 | if (PyObject_AsCharBuffer(tmp, &buffer, &blen)) | ||
return NULL; | ||||
return decode(buffer, blen); | ||||
mpm@selenic.com
|
r72 | } | ||
/* divide and conquer, memory management is elsewhere */ | ||||
len = (end - start) / 2; | ||||
return combine(fold(bins, start, start + len), | ||||
fold(bins, start + len, end)); | ||||
} | ||||
/* patches(text, bins): apply the list of binary patches bins to text.

   Returns text itself when bins is empty, otherwise a new string with
   the patched content.  Returns NULL with an exception set when the
   arguments are unusable or a patch is undecodable or invalid. */
static PyObject *
patches(PyObject *self, PyObject *args)
{
	PyObject *text, *bins, *result = NULL;
	struct flist *patch;
	const char *in;
	char *out;
	int len, outlen;
	Py_ssize_t inlen;

	if (!PyArg_ParseTuple(args, "OO:mpatch", &text, &bins))
		return NULL;

	len = PyList_Size(bins);
	if (len < 0)
		return NULL; /* not a list; PyList_Size set the exception */
	if (!len) {
		/* nothing to do */
		Py_INCREF(text);
		return text;
	}

	if (PyObject_AsCharBuffer(text, &in, &inlen))
		return NULL;

	patch = fold(bins, 0, len);
	if (!patch)
		return NULL;

	outlen = calcsize(inlen, patch);
	if (outlen < 0) {
		/* never return NULL without an exception set */
		if (!PyErr_Occurred())
			PyErr_SetString(mpatch_Error, "invalid patch");
		goto cleanup;
	}

	result = PyString_FromStringAndSize(NULL, outlen);
	if (!result)
		goto cleanup;

	out = PyString_AsString(result);
	if (!apply(out, in, inlen, patch)) {
		Py_DECREF(result);
		result = NULL;
	}

cleanup:
	lfree(patch);
	return result;
}
mason@suse.com
|
r2078 | /* calculate size of a patched file directly */ | ||
static PyObject * | ||||
patchedsize(PyObject *self, PyObject *args) | ||||
{ | ||||
long orig, start, end, len, outlen = 0, last = 0; | ||||
int patchlen; | ||||
Thomas Arendsen Hein
|
r4358 | char *bin, *binend, *data; | ||
mason@suse.com
|
r2078 | char decode[12]; /* for dealing with alignment issues */ | ||
if (!PyArg_ParseTuple(args, "ls#", &orig, &bin, &patchlen)) | ||||
return NULL; | ||||
binend = bin + patchlen; | ||||
Thomas Arendsen Hein
|
r4358 | data = bin + 12; | ||
mason@suse.com
|
r2078 | |||
Thomas Arendsen Hein
|
r4358 | while (data <= binend) { | ||
mason@suse.com
|
r2078 | memcpy(decode, bin, 12); | ||
start = ntohl(*(uint32_t *)decode); | ||||
end = ntohl(*(uint32_t *)(decode + 4)); | ||||
len = ntohl(*(uint32_t *)(decode + 8)); | ||||
Thomas Arendsen Hein
|
r4358 | if (start > end) | ||
break; /* sanity check */ | ||||
bin = data + len; | ||||
if (bin < data) | ||||
break; /* big data + big (bogus) len can wrap around */ | ||||
data = bin + 12; | ||||
mason@suse.com
|
r2078 | outlen += start - last; | ||
last = end; | ||||
outlen += len; | ||||
} | ||||
if (bin != binend) { | ||||
if (!PyErr_Occurred()) | ||||
PyErr_SetString(mpatch_Error, "patch cannot be decoded"); | ||||
return NULL; | ||||
} | ||||
outlen += orig - last; | ||||
return Py_BuildValue("l", outlen); | ||||
} | ||||
mpm@selenic.com
|
r72 | static PyMethodDef methods[] = { | ||
{"patches", patches, METH_VARARGS, "apply a series of patches\n"}, | ||||
mason@suse.com
|
r2078 | {"patchedsize", patchedsize, METH_VARARGS, "calculed patched size\n"}, | ||
mpm@selenic.com
|
r72 | {NULL, NULL} | ||
}; | ||||
PyMODINIT_FUNC | ||||
initmpatch(void) | ||||
{ | ||||
Py_InitModule3("mpatch", methods, mpatch_doc); | ||||
Benoit Boissinot
|
r1722 | mpatch_Error = PyErr_NewException("mpatch.mpatchError", NULL, NULL); | ||
mpm@selenic.com
|
r72 | } | ||