##// END OF EJS Templates
Add 'other OS' bits to bdiff.c / style cleanups...
mpm@selenic.com -
r411:9e9f7ab4 default
parent child Browse files
Show More
@@ -1,279 +1,293
1 1 /*
2 2 bdiff.c - efficient binary diff extension for Mercurial
3 3
4 4 Copyright 2005 Matt Mackall <mpm@selenic.com>
5 5
6 6 This software may be used and distributed according to the terms of
7 7 the GNU General Public License, incorporated herein by reference.
8 8
9 9 Based roughly on Python difflib
10 10 */
11 11
12 12 #include <Python.h>
13 13 #include <stdlib.h>
14 14 #include <string.h>
15 #ifdef _WIN32
16
17 typedef unsigned long uint32_t;
18
19 static uint32_t htonl(uint32_t x)
20 {
21 return ((x & 0x000000ffUL) << 24) |
22 ((x & 0x0000ff00UL) << 8) |
23 ((x & 0x00ff0000UL) >> 8) |
24 ((x & 0xff000000UL) >> 24);
25 }
26
27 #else
15 28 #include <netinet/in.h>
16 29 #include <sys/types.h>
30 #endif
17 31
18 32 struct line {
19 33 int h, len, n;
20 34 const char *l;
21 35 };
22 36
23 37 struct hunk {
24 38 int a1, a2, b1, b2;
25 39 };
26 40
27 41 struct hunklist {
28 42 struct hunk *base, *head;
29 43 };
30 44
31 45 static inline uint32_t rol32(uint32_t word, unsigned int shift)
32 46 {
33 47 return (word << shift) | (word >> (32 - shift));
34 48 }
35 49
36 50 int splitlines(const char *a, int len, struct line **lr)
37 51 {
38 52 int h, i;
39 53 const char *p, *b = a;
40 54 struct line *l;
41 55
42 56 /* count the lines */
43 57 i = 1; /* extra line for sentinel */
44 58 for (p = a; p < a + len; p++)
45 59 if (*p == '\n' || p == a + len - 1)
46 60 i++;
47 61
48 62 *lr = l = malloc(sizeof(struct line) * i);
49 63 if (!l)
50 64 return -1;
51 65
52 66 /* build the line array and calculate hashes */
53 67 h = 0;
54 68 for (p = a; p < a + len; p++) {
55 69 h = *p + rol32(h, 7); /* a simple hash from GNU diff */
56 70 if (*p == '\n' || p == a + len - 1) {
57 71 l->len = p - b + 1;
58 72 l->h = h;
59 73 l->l = b;
60 74 l->n = -1;
61 75 l++;
62 76 b = p + 1;
63 77 h = 0;
64 78 }
65 79 }
66 80
67 81 /* set up a sentinel */
68 82 l->h = l->len = 0;
69 83 l->l = a + len;
70 84 return i - 1;
71 85 }
72 86
73 87 int inline cmp(struct line *a, struct line *b)
74 88 {
75 89 return a->len != b->len || memcmp(a->l, b->l, a->len);
76 90 }
77 91
78 92 static int equatelines(struct line *a, int an, struct line *b, int bn)
79 93 {
80 94 int i, j, buckets = 1, t, *h, *l;
81 95
82 96 /* build a hash table of the next highest power of 2 */
83 97 while (buckets < bn + 1)
84 98 buckets *= 2;
85 99
86 100 h = malloc(buckets * sizeof(int));
87 101 l = calloc(buckets, sizeof(int));
88 102 buckets = buckets - 1;
89 103 if (!h || !l) {
90 104 free(h);
91 105 return 0;
92 106 }
93 107
94 108 /* clear the hash table */
95 109 for (i = 0; i <= buckets; i++)
96 110 h[i] = -1;
97 111
98 112 /* add lines to the hash table chains */
99 113 for (i = bn - 1; i >= 0; i--) {
100 114 /* find the equivalence class */
101 115 for (j = b[i].h & buckets; h[j] != -1; j = (j + 1) & buckets)
102 116 if (!cmp(b + i, b + h[j]))
103 117 break;
104 118
105 119 /* add to the head of the equivalence class */
106 120 b[i].n = h[j];
107 121 b[i].h = j;
108 122 h[j] = i;
109 123 l[j]++; /* keep track of popularity */
110 124 }
111 125
112 126 /* compute popularity threshold */
113 127 t = (bn >= 200) ? bn / 100 : bn + 1;
114 128
115 129 /* match items in a to their equivalence class in b */
116 130 for (i = 0; i < an; i++) {
117 131 /* find the equivalence class */
118 132 for (j = a[i].h & buckets; h[j] != -1; j = (j + 1) & buckets)
119 133 if (!cmp(a + i, b + h[j]))
120 134 break;
121 135
122 136 a[i].h = j; /* use equivalence class for quick compare */
123 137 if(l[j] <= t)
124 138 a[i].n = h[j]; /* point to head of match list */
125 139 else
126 140 a[i].n = -1; /* too popular */
127 141 }
128 142
129 143 /* discard hash tables */
130 144 free(h);
131 145 free(l);
132 146 return 1;
133 147 }
134 148
135 149 static int longest_match(struct line *a, struct line *b, int *jpos, int *jlen,
136 150 int a1, int a2, int b1, int b2, int *omi, int *omj)
137 151 {
138 152 int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;
139 153
140 154 for (i = a1; i < a2; i++) {
141 155 /* skip things before the current block */
142 156 for (j = a[i].n; j != -1 && j < b1; j = b[j].n)
143 157 ;
144 158
145 159 /* loop through all lines match a[i] in b */
146 160 for (; j != -1 && j < b2; j = b[j].n) {
147 161 /* does this extend an earlier match? */
148 162 if (i > a1 && j > b1 && jpos[j - 1] == i)
149 163 k = jlen[j - 1] + 1;
150 164 else
151 165 k = 1;
152 166 jpos[j] = i + 1;
153 167 jlen[j] = k;
154 168
155 169 /* best match so far? */
156 170 if (k > mk) {
157 171 mi = i;
158 172 mj = j;
159 173 mk = k;
160 174 }
161 175 }
162 176 }
163 177
164 178 if (mk) {
165 179 mi = mi - mk + 1;
166 180 mj = mj - mk + 1;
167 181 }
168 182
169 183 /* expand match to include neighboring popular lines */
170 184 while (mi - mb > a1 && mj - mb > b1 &&
171 185 a[mi - mb - 1].h == b[mj - mb - 1].h)
172 186 mb++;
173 187 while (mi + mk < a2 && mj + mk < b2 &&
174 188 a[mi + mk].h == b[mj + mk].h)
175 189 mk++;
176 190
177 191 *omi = mi - mb;
178 192 *omj = mj - mb;
179 193 return mk + mb;
180 194 }
181 195
182 196 static void recurse(struct line *a, struct line *b, int *jpos, int *jlen,
183 197 int a1, int a2, int b1, int b2, struct hunklist *l)
184 198 {
185 199 int i, j, k;
186 200
187 201 /* find the longest match in this chunk */
188 202 k = longest_match(a, b, jpos, jlen, a1, a2, b1, b2, &i, &j);
189 203 if (!k)
190 204 return;
191 205
192 206 /* and recurse on the remaining chunks on either side */
193 207 recurse(a, b, jpos, jlen, a1, i, b1, j, l);
194 208 l->head->a1 = i;
195 209 l->head->a2 = i + k;
196 210 l->head->b1 = j;
197 211 l->head->b2 = j + k;
198 212 l->head++;
199 213 recurse(a, b, jpos, jlen, i + k, a2, j + k, b2, l);
200 214 }
201 215
202 216 static PyObject *bdiff(PyObject *self, PyObject *args)
203 217 {
204 218 PyObject *sa, *sb, *result = NULL;
205 219 struct hunklist l;
206 220 struct hunk *h;
207 221 struct line *al, *bl;
208 222 char encode[12], *rb;
209 223 int an, bn, len = 0, t, la = 0, lb = 0, *jpos, *jlen;
210 224
211 225 if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
212 226 return NULL;
213 227
214 228 /* allocate and fill arrays */
215 229 an = splitlines(PyString_AsString(sa), PyString_Size(sa), &al);
216 230 bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &bl);
217 231 t = equatelines(al, an, bl, bn);
218 232 jpos = calloc(bn, sizeof(int));
219 233 jlen = calloc(bn, sizeof(int));
220 234 l.head = l.base = malloc(sizeof(struct hunk) * ((an + bn) / 4 + 2));
221 235 if (!al || !bl || !jpos || !jlen || !l.base || !t)
222 236 goto nomem;
223 237
224 238 /* generate the matching block list */
225 239 recurse(al, bl, jpos, jlen, 0, an, 0, bn, &l);
226 240 l.head->a1 = an;
227 241 l.head->b1 = bn;
228 242 l.head++;
229 243
230 244 /* calculate length of output */
231 245 for(h = l.base; h != l.head; h++) {
232 246 if (h->a1 != la || h->b1 != lb)
233 247 len += 12 + bl[h->b1].l - bl[lb].l;
234 248 la = h->a2;
235 249 lb = h->b2;
236 250 }
237 251
238 252 result = PyString_FromStringAndSize(NULL, len);
239 253 if (!result)
240 254 goto nomem;
241 255
242 256 /* build binary patch */
243 257 rb = PyString_AsString(result);
244 258 la = lb = 0;
245 259
246 260 for(h = l.base; h != l.head; h++) {
247 261 if (h->a1 != la || h->b1 != lb) {
248 262 len = bl[h->b1].l - bl[lb].l;
249 263 *(uint32_t *)(encode) = htonl(al[la].l - al->l);
250 264 *(uint32_t *)(encode + 4) = htonl(al[h->a1].l - al->l);
251 265 *(uint32_t *)(encode + 8) = htonl(len);
252 266 memcpy(rb, encode, 12);
253 267 memcpy(rb + 12, bl[lb].l, len);
254 268 rb += 12 + len;
255 269 }
256 270 la = h->a2;
257 271 lb = h->b2;
258 272 }
259 273
260 274 nomem:
261 275 free(al);
262 276 free(bl);
263 277 free(jpos);
264 278 free(jlen);
265 279 free(l.base);
266 280 return result ? result : PyErr_NoMemory();
267 281 }
268 282
269 283 static char mdiff_doc[] = "Efficient binary diff.";
270 284
271 285 static PyMethodDef methods[] = {
272 286 {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
273 287 {NULL, NULL}
274 288 };
275 289
276 290 PyMODINIT_FUNC initbdiff(void)
277 291 {
278 292 Py_InitModule3("bdiff", methods, mdiff_doc);
279 293 }
@@ -1,328 +1,329
1 1 /*
2 2 mpatch.c - efficient binary patching for Mercurial
3 3
4 4 This implements a patch algorithm that's O(m + nlog n) where m is the
5 5 size of the output and n is the number of patches.
6 6
7 7 Given a list of binary patches, it unpacks each into a hunk list,
8 8 then combines the hunk lists with a treewise recursion to form a
9 9 single hunk list. This hunk list is then applied to the original
10 10 text.
11 11
12 12 The text (or binary) fragments are copied directly from their source
13 13 Python objects into a preallocated output string to avoid the
14 14 allocation of intermediate Python objects. Working memory is about 2x
15 15 the total number of hunks.
16 16
17 17 Copyright 2005 Matt Mackall <mpm@selenic.com>
18 18
19 19 This software may be used and distributed according to the terms
20 20 of the GNU General Public License, incorporated herein by reference.
21 21 */
22 22
23 23 #include <Python.h>
24 24 #include <stdlib.h>
25 25 #include <string.h>
26 26 #ifdef _WIN32
27 27
28 28 typedef unsigned long uint32_t;
29 29
30 uint32_t ntohl(uint32_t x) {
30 static uint32_t ntohl(uint32_t x)
31 {
31 32 return ((x & 0x000000ffUL) << 24) |
32 33 ((x & 0x0000ff00UL) << 8) |
33 34 ((x & 0x00ff0000UL) >> 8) |
34 35 ((x & 0xff000000UL) >> 24);
35 36 }
36 37
37 38 #else
38 39 #include <netinet/in.h>
39 40 #include <sys/types.h>
40 41 #endif
41 42
42 43 static char mpatch_doc[] = "Efficient binary patching.";
43 44
44 45 struct frag {
45 46 int start, end, len;
46 47 char *data;
47 48 };
48 49
49 50 struct flist {
50 51 struct frag *base, *head, *tail;
51 52 };
52 53
53 54 static struct flist *lalloc(int size)
54 55 {
55 56 struct flist *a = NULL;
56 57
57 58 a = malloc(sizeof(struct flist));
58 59 if (a) {
59 60 a->base = malloc(sizeof(struct frag) * size);
60 61 if (!a->base) {
61 62 free(a);
62 63 a = NULL;
63 64 } else
64 65 a->head = a->tail = a->base;
65 66 }
66 67 return a;
67 68 }
68 69
69 70 static void lfree(struct flist *a)
70 71 {
71 72 if (a) {
72 73 free(a->base);
73 74 free(a);
74 75 }
75 76 }
76 77
77 78 static int lsize(struct flist *a)
78 79 {
79 80 return a->tail - a->head;
80 81 }
81 82
82 83 /* move hunks in source that are less cut to dest, compensating
83 84 for changes in offset. the last hunk may be split if necessary.
84 85 */
85 86 static int gather(struct flist *dest, struct flist *src, int cut, int offset)
86 87 {
87 88 struct frag *d = dest->tail, *s = src->head;
88 89 int postend, c, l;
89 90
90 91 while (s != src->tail) {
91 92 if (s->start + offset >= cut)
92 93 break; /* we've gone far enough */
93 94
94 95 postend = offset + s->start + s->len;
95 96 if (postend <= cut) {
96 97 /* save this hunk */
97 98 offset += s->start + s->len - s->end;
98 99 *d++ = *s++;
99 100 }
100 101 else {
101 102 /* break up this hunk */
102 103 c = cut - offset;
103 104 if (s->end < c)
104 105 c = s->end;
105 106 l = cut - offset - s->start;
106 107 if (s->len < l)
107 108 l = s->len;
108 109
109 110 offset += s->start + l - c;
110 111
111 112 d->start = s->start;
112 113 d->end = c;
113 114 d->len = l;
114 115 d->data = s->data;
115 116 d++;
116 117 s->start = c;
117 118 s->len = s->len - l;
118 119 s->data = s->data + l;
119 120
120 121 break;
121 122 }
122 123 }
123 124
124 125 dest->tail = d;
125 126 src->head = s;
126 127 return offset;
127 128 }
128 129
129 130 /* like gather, but with no output list */
130 131 static int discard(struct flist *src, int cut, int offset)
131 132 {
132 133 struct frag *s = src->head;
133 134 int postend, c, l;
134 135
135 136 while (s != src->tail) {
136 137 if (s->start + offset >= cut)
137 138 break;
138 139
139 140 postend = offset + s->start + s->len;
140 141 if (postend <= cut) {
141 142 offset += s->start + s->len - s->end;
142 143 s++;
143 144 }
144 145 else {
145 146 c = cut - offset;
146 147 if (s->end < c)
147 148 c = s->end;
148 149 l = cut - offset - s->start;
149 150 if (s->len < l)
150 151 l = s->len;
151 152
152 153 offset += s->start + l - c;
153 154 s->start = c;
154 155 s->len = s->len - l;
155 156 s->data = s->data + l;
156 157
157 158 break;
158 159 }
159 160 }
160 161
161 162 src->head = s;
162 163 return offset;
163 164 }
164 165
165 166 /* combine hunk lists a and b, while adjusting b for offset changes in a/
166 167 this deletes a and b and returns the resultant list. */
167 168 static struct flist *combine(struct flist *a, struct flist *b)
168 169 {
169 170 struct flist *c = NULL;
170 171 struct frag *bh, *ct;
171 172 int offset = 0, post;
172 173
173 174 if (a && b)
174 175 c = lalloc((lsize(a) + lsize(b)) * 2);
175 176
176 177 if (c) {
177 178
178 179 for (bh = b->head; bh != b->tail; bh++) {
179 180 /* save old hunks */
180 181 offset = gather(c, a, bh->start, offset);
181 182
182 183 /* discard replaced hunks */
183 184 post = discard(a, bh->end, offset);
184 185
185 186 /* insert new hunk */
186 187 ct = c->tail;
187 188 ct->start = bh->start - offset;
188 189 ct->end = bh->end - post;
189 190 ct->len = bh->len;
190 191 ct->data = bh->data;
191 192 c->tail++;
192 193 offset = post;
193 194 }
194 195
195 196 /* hold on to tail from a */
196 197 memcpy(c->tail, a->head, sizeof(struct frag) * lsize(a));
197 198 c->tail += lsize(a);
198 199 }
199 200
200 201 lfree(a);
201 202 lfree(b);
202 203 return c;
203 204 }
204 205
205 206 /* decode a binary patch into a hunk list */
206 207 static struct flist *decode(char *bin, int len)
207 208 {
208 209 struct flist *l;
209 210 struct frag *lt;
210 211 char *end = bin + len;
211 212 char decode[12]; /* for dealing with alignment issues */
212 213
213 214 /* assume worst case size, we won't have many of these lists */
214 215 l = lalloc(len / 12);
215 216 lt = l->tail;
216 217
217 218 while (bin < end) {
218 219 memcpy(decode, bin, 12);
219 220 lt->start = ntohl(*(uint32_t *)decode);
220 221 lt->end = ntohl(*(uint32_t *)(decode + 4));
221 222 lt->len = ntohl(*(uint32_t *)(decode + 8));
222 223 lt->data = bin + 12;
223 224 bin += 12 + lt->len;
224 225 lt++;
225 226 }
226 227
227 228 l->tail = lt;
228 229 return l;
229 230 }
230 231
231 232 /* calculate the size of resultant text */
232 233 static int calcsize(int len, struct flist *l)
233 234 {
234 235 int outlen = 0, last = 0;
235 236 struct frag *f = l->head;
236 237
237 238 while (f != l->tail) {
238 239 outlen += f->start - last;
239 240 last = f->end;
240 241 outlen += f->len;
241 242 f++;
242 243 }
243 244
244 245 outlen += len - last;
245 246 return outlen;
246 247 }
247 248
248 249 static void apply(char *buf, char *orig, int len, struct flist *l)
249 250 {
250 251 struct frag *f = l->head;
251 252 int last = 0;
252 253 char *p = buf;
253 254
254 255 while (f != l->tail) {
255 256 memcpy(p, orig + last, f->start - last);
256 257 p += f->start - last;
257 258 memcpy(p, f->data, f->len);
258 259 last = f->end;
259 260 p += f->len;
260 261 f++;
261 262 }
262 263 memcpy(p, orig + last, len - last);
263 264 }
264 265
265 266 /* recursively generate a patch of all bins between start and end */
266 267 static struct flist *fold(PyObject *bins, int start, int end)
267 268 {
268 269 int len;
269 270
270 271 if (start + 1 == end) {
271 272 /* trivial case, output a decoded list */
272 273 PyObject *tmp = PyList_GetItem(bins, start);
273 274 if (!tmp)
274 275 return NULL;
275 276 return decode(PyString_AsString(tmp), PyString_Size(tmp));
276 277 }
277 278
278 279 /* divide and conquer, memory management is elsewhere */
279 280 len = (end - start) / 2;
280 281 return combine(fold(bins, start, start + len),
281 282 fold(bins, start + len, end));
282 283 }
283 284
284 285 static PyObject *
285 286 patches(PyObject *self, PyObject *args)
286 287 {
287 288 PyObject *text, *bins, *result;
288 289 struct flist *patch;
289 290 char *in, *out;
290 291 int len, outlen;
291 292
292 293 if (!PyArg_ParseTuple(args, "SO:mpatch", &text, &bins))
293 294 return NULL;
294 295
295 296 len = PyList_Size(bins);
296 297 if (!len) {
297 298 /* nothing to do */
298 299 Py_INCREF(text);
299 300 return text;
300 301 }
301 302
302 303 patch = fold(bins, 0, len);
303 304 if (!patch)
304 305 return PyErr_NoMemory();
305 306
306 307 outlen = calcsize(PyString_Size(text), patch);
307 308 result = PyString_FromStringAndSize(NULL, outlen);
308 309 if (result) {
309 310 in = PyString_AsString(text);
310 311 out = PyString_AsString(result);
311 312 apply(out, in, PyString_Size(text), patch);
312 313 }
313 314
314 315 lfree(patch);
315 316 return result;
316 317 }
317 318
318 319 static PyMethodDef methods[] = {
319 320 {"patches", patches, METH_VARARGS, "apply a series of patches\n"},
320 321 {NULL, NULL}
321 322 };
322 323
323 324 PyMODINIT_FUNC
324 325 initmpatch(void)
325 326 {
326 327 Py_InitModule3("mpatch", methods, mpatch_doc);
327 328 }
328 329
General Comments 0
You need to be logged in to leave comments. Login now