##// END OF EJS Templates
More fiddling with uint32_t includes for extensions...
mpm@selenic.com -
r510:7f3fc8fd default
parent child Browse files
Show More
@@ -1,342 +1,344 b''
1 1 /*
2 2 bdiff.c - efficient binary diff extension for Mercurial
3 3
4 4 Copyright 2005 Matt Mackall <mpm@selenic.com>
5 5
6 6 This software may be used and distributed according to the terms of
7 7 the GNU General Public License, incorporated herein by reference.
8 8
9 9 Based roughly on Python difflib
10 10 */
11 11
12 12 #include <Python.h>
13 13 #include <stdlib.h>
14 14 #include <string.h>
15 15 #ifdef _WIN32
16 #include <stdint.h>
16 17 static uint32_t htonl(uint32_t x)
17 18 {
18 19 return ((x & 0x000000ffUL) << 24) |
19 20 ((x & 0x0000ff00UL) << 8) |
20 21 ((x & 0x00ff0000UL) >> 8) |
21 22 ((x & 0xff000000UL) >> 24);
22 23 }
23 24 #else
25 #include <sys/types.h>
24 26 #include <netinet/in.h>
25 27 #endif
26 28
27 29 struct line {
28 30 int h, len, n, e;
29 31 const char *l;
30 32 };
31 33
32 34 struct pos {
33 35 int pos, len;
34 36 };
35 37
36 38 struct hunk {
37 39 int a1, a2, b1, b2;
38 40 };
39 41
40 42 struct hunklist {
41 43 struct hunk *base, *head;
42 44 };
43 45
44 46 static __inline uint32_t rol32(uint32_t word, unsigned int shift)
45 47 {
46 48 return (word << shift) | (word >> (32 - shift));
47 49 }
48 50
49 51 int splitlines(const char *a, int len, struct line **lr)
50 52 {
51 53 int h, i;
52 54 const char *p, *b = a;
53 55 struct line *l;
54 56
55 57 /* count the lines */
56 58 i = 1; /* extra line for sentinel */
57 59 for (p = a; p < a + len; p++)
58 60 if (*p == '\n' || p == a + len - 1)
59 61 i++;
60 62
61 63 *lr = l = malloc(sizeof(struct line) * i);
62 64 if (!l)
63 65 return -1;
64 66
65 67 /* build the line array and calculate hashes */
66 68 h = 0;
67 69 for (p = a; p < a + len; p++) {
68 70 h = *p + rol32(h, 7); /* a simple hash from GNU diff */
69 71 if (*p == '\n' || p == a + len - 1) {
70 72 l->len = p - b + 1;
71 73 l->h = h * l->len;
72 74 l->l = b;
73 75 l->n = -1;
74 76 l++;
75 77 b = p + 1;
76 78 h = 0;
77 79 }
78 80 }
79 81
80 82 /* set up a sentinel */
81 83 l->h = l->len = 0;
82 84 l->l = a + len;
83 85 return i - 1;
84 86 }
85 87
86 88 int inline cmp(struct line *a, struct line *b)
87 89 {
88 90 return a->h != b->h || a->len != b->len || memcmp(a->l, b->l, a->len);
89 91 }
90 92
91 93 static int equatelines(struct line *a, int an, struct line *b, int bn)
92 94 {
93 95 int i, j, buckets = 1, t;
94 96 struct pos *h;
95 97
96 98 /* build a hash table of the next highest power of 2 */
97 99 while (buckets < bn + 1)
98 100 buckets *= 2;
99 101
100 102 h = malloc(buckets * sizeof(struct pos));
101 103 buckets = buckets - 1;
102 104 if (!h)
103 105 return 0;
104 106
105 107 /* clear the hash table */
106 108 for (i = 0; i <= buckets; i++) {
107 109 h[i].pos = -1;
108 110 h[i].len = 0;
109 111 }
110 112
111 113 /* add lines to the hash table chains */
112 114 for (i = bn - 1; i >= 0; i--) {
113 115 /* find the equivalence class */
114 116 for (j = b[i].h & buckets; h[j].pos != -1;
115 117 j = (j + 1) & buckets)
116 118 if (!cmp(b + i, b + h[j].pos))
117 119 break;
118 120
119 121 /* add to the head of the equivalence class */
120 122 b[i].n = h[j].pos;
121 123 b[i].e = j;
122 124 h[j].pos = i;
123 125 h[j].len++; /* keep track of popularity */
124 126 }
125 127
126 128 /* compute popularity threshold */
127 129 t = (bn >= 200) ? bn / 100 : bn + 1;
128 130
129 131 /* match items in a to their equivalence class in b */
130 132 for (i = 0; i < an; i++) {
131 133 /* find the equivalence class */
132 134 for (j = a[i].h & buckets; h[j].pos != -1;
133 135 j = (j + 1) & buckets)
134 136 if (!cmp(a + i, b + h[j].pos))
135 137 break;
136 138
137 139 a[i].e = j; /* use equivalence class for quick compare */
138 140 if(h[j].len <= t)
139 141 a[i].n = h[j].pos; /* point to head of match list */
140 142 else
141 143 a[i].n = -1; /* too popular */
142 144 }
143 145
144 146 /* discard hash tables */
145 147 free(h);
146 148 return 1;
147 149 }
148 150
149 151 static int longest_match(struct line *a, struct line *b, struct pos *pos,
150 152 int a1, int a2, int b1, int b2, int *omi, int *omj)
151 153 {
152 154 int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;
153 155
154 156 for (i = a1; i < a2; i++) {
155 157 /* skip things before the current block */
156 158 for (j = a[i].n; j != -1 && j < b1; j = b[j].n)
157 159 ;
158 160
159 161 /* loop through all lines match a[i] in b */
160 162 for (; j != -1 && j < b2; j = b[j].n) {
161 163 /* does this extend an earlier match? */
162 164 if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)
163 165 k = pos[j - 1].len + 1;
164 166 else
165 167 k = 1;
166 168 pos[j].pos = i;
167 169 pos[j].len = k;
168 170
169 171 /* best match so far? */
170 172 if (k > mk) {
171 173 mi = i;
172 174 mj = j;
173 175 mk = k;
174 176 }
175 177 }
176 178 }
177 179
178 180 if (mk) {
179 181 mi = mi - mk + 1;
180 182 mj = mj - mk + 1;
181 183 }
182 184
183 185 /* expand match to include neighboring popular lines */
184 186 while (mi - mb > a1 && mj - mb > b1 &&
185 187 a[mi - mb - 1].e == b[mj - mb - 1].e)
186 188 mb++;
187 189 while (mi + mk < a2 && mj + mk < b2 &&
188 190 a[mi + mk].e == b[mj + mk].e)
189 191 mk++;
190 192
191 193 *omi = mi - mb;
192 194 *omj = mj - mb;
193 195 return mk + mb;
194 196 }
195 197
196 198 static void recurse(struct line *a, struct line *b, struct pos *pos,
197 199 int a1, int a2, int b1, int b2, struct hunklist *l)
198 200 {
199 201 int i, j, k;
200 202
201 203 /* find the longest match in this chunk */
202 204 k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);
203 205 if (!k)
204 206 return;
205 207
206 208 /* and recurse on the remaining chunks on either side */
207 209 recurse(a, b, pos, a1, i, b1, j, l);
208 210 l->head->a1 = i;
209 211 l->head->a2 = i + k;
210 212 l->head->b1 = j;
211 213 l->head->b2 = j + k;
212 214 l->head++;
213 215 recurse(a, b, pos, i + k, a2, j + k, b2, l);
214 216 }
215 217
216 218 static struct hunklist diff(struct line *a, int an, struct line *b, int bn)
217 219 {
218 220 struct hunklist l;
219 221 struct pos *pos;
220 222 int t;
221 223
222 224 /* allocate and fill arrays */
223 225 t = equatelines(a, an, b, bn);
224 226 pos = calloc(bn, sizeof(struct pos));
225 227 l.head = l.base = malloc(sizeof(struct hunk) * ((an + bn) / 4 + 2));
226 228
227 229 if (pos && l.base && t) {
228 230 /* generate the matching block list */
229 231 recurse(a, b, pos, 0, an, 0, bn, &l);
230 232 l.head->a1 = an;
231 233 l.head->b1 = bn;
232 234 l.head++;
233 235 }
234 236
235 237 free(pos);
236 238 return l;
237 239 }
238 240
239 241 static PyObject *blocks(PyObject *self, PyObject *args)
240 242 {
241 243 PyObject *sa, *sb, *rl = NULL, *m;
242 244 struct line *a, *b;
243 245 struct hunklist l;
244 246 struct hunk *h;
245 247 int an, bn, pos = 0;
246 248
247 249 if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
248 250 return NULL;
249 251
250 252 an = splitlines(PyString_AsString(sa), PyString_Size(sa), &a);
251 253 bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &b);
252 254 if (!a || !b)
253 255 goto nomem;
254 256
255 257 l = diff(a, an, b, bn);
256 258 rl = PyList_New(l.head - l.base);
257 259 if (!l.head || !rl)
258 260 goto nomem;
259 261
260 262 for(h = l.base; h != l.head; h++) {
261 263 m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
262 264 PyList_SetItem(rl, pos, m);
263 265 pos++;
264 266 }
265 267
266 268 nomem:
267 269 free(a);
268 270 free(b);
269 271 free(l.base);
270 272 return rl ? rl : PyErr_NoMemory();
271 273 }
272 274
273 275 static PyObject *bdiff(PyObject *self, PyObject *args)
274 276 {
275 277 PyObject *sa, *sb, *result = NULL;
276 278 struct line *al, *bl;
277 279 struct hunklist l;
278 280 struct hunk *h;
279 281 char encode[12], *rb;
280 282 int an, bn, len = 0, la = 0, lb = 0;
281 283
282 284 if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
283 285 return NULL;
284 286
285 287 an = splitlines(PyString_AsString(sa), PyString_Size(sa), &al);
286 288 bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &bl);
287 289 if (!al || !bl)
288 290 goto nomem;
289 291
290 292 l = diff(al, an, bl, bn);
291 293 if (!l.head)
292 294 goto nomem;
293 295
294 296 /* calculate length of output */
295 297 for(h = l.base; h != l.head; h++) {
296 298 if (h->a1 != la || h->b1 != lb)
297 299 len += 12 + bl[h->b1].l - bl[lb].l;
298 300 la = h->a2;
299 301 lb = h->b2;
300 302 }
301 303
302 304 result = PyString_FromStringAndSize(NULL, len);
303 305 if (!result)
304 306 goto nomem;
305 307
306 308 /* build binary patch */
307 309 rb = PyString_AsString(result);
308 310 la = lb = 0;
309 311
310 312 for(h = l.base; h != l.head; h++) {
311 313 if (h->a1 != la || h->b1 != lb) {
312 314 len = bl[h->b1].l - bl[lb].l;
313 315 *(uint32_t *)(encode) = htonl(al[la].l - al->l);
314 316 *(uint32_t *)(encode + 4) = htonl(al[h->a1].l - al->l);
315 317 *(uint32_t *)(encode + 8) = htonl(len);
316 318 memcpy(rb, encode, 12);
317 319 memcpy(rb + 12, bl[lb].l, len);
318 320 rb += 12 + len;
319 321 }
320 322 la = h->a2;
321 323 lb = h->b2;
322 324 }
323 325
324 326 nomem:
325 327 free(al);
326 328 free(bl);
327 329 free(l.base);
328 330 return result ? result : PyErr_NoMemory();
329 331 }
330 332
331 333 static char mdiff_doc[] = "Efficient binary diff.";
332 334
333 335 static PyMethodDef methods[] = {
334 336 {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
335 337 {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
336 338 {NULL, NULL}
337 339 };
338 340
339 341 PyMODINIT_FUNC initbdiff(void)
340 342 {
341 343 Py_InitModule3("bdiff", methods, mdiff_doc);
342 344 }
@@ -1,324 +1,326 b''
1 1 /*
2 2 mpatch.c - efficient binary patching for Mercurial
3 3
4 4 This implements a patch algorithm that's O(m + nlog n) where m is the
5 5 size of the output and n is the number of patches.
6 6
7 7 Given a list of binary patches, it unpacks each into a hunk list,
8 8 then combines the hunk lists with a treewise recursion to form a
9 9 single hunk list. This hunk list is then applied to the original
10 10 text.
11 11
12 12 The text (or binary) fragments are copied directly from their source
13 13 Python objects into a preallocated output string to avoid the
14 14 allocation of intermediate Python objects. Working memory is about 2x
15 15 the total number of hunks.
16 16
17 17 Copyright 2005 Matt Mackall <mpm@selenic.com>
18 18
19 19 This software may be used and distributed according to the terms
20 20 of the GNU General Public License, incorporated herein by reference.
21 21 */
22 22
23 23 #include <Python.h>
24 24 #include <stdlib.h>
25 25 #include <string.h>
26 26 #ifdef _WIN32
27 #include <stdint.h>
27 28 static uint32_t ntohl(uint32_t x)
28 29 {
29 30 return ((x & 0x000000ffUL) << 24) |
30 31 ((x & 0x0000ff00UL) << 8) |
31 32 ((x & 0x00ff0000UL) >> 8) |
32 33 ((x & 0xff000000UL) >> 24);
33 34 }
34 35 #else
36 #include <sys/types.h>
35 37 #include <netinet/in.h>
36 38 #endif
37 39
38 40 static char mpatch_doc[] = "Efficient binary patching.";
39 41
40 42 struct frag {
41 43 int start, end, len;
42 44 char *data;
43 45 };
44 46
45 47 struct flist {
46 48 struct frag *base, *head, *tail;
47 49 };
48 50
49 51 static struct flist *lalloc(int size)
50 52 {
51 53 struct flist *a = NULL;
52 54
53 55 a = malloc(sizeof(struct flist));
54 56 if (a) {
55 57 a->base = malloc(sizeof(struct frag) * size);
56 58 if (!a->base) {
57 59 free(a);
58 60 a = NULL;
59 61 } else
60 62 a->head = a->tail = a->base;
61 63 }
62 64 return a;
63 65 }
64 66
65 67 static void lfree(struct flist *a)
66 68 {
67 69 if (a) {
68 70 free(a->base);
69 71 free(a);
70 72 }
71 73 }
72 74
73 75 static int lsize(struct flist *a)
74 76 {
75 77 return a->tail - a->head;
76 78 }
77 79
78 80 /* move hunks in source that are less cut to dest, compensating
79 81 for changes in offset. the last hunk may be split if necessary.
80 82 */
81 83 static int gather(struct flist *dest, struct flist *src, int cut, int offset)
82 84 {
83 85 struct frag *d = dest->tail, *s = src->head;
84 86 int postend, c, l;
85 87
86 88 while (s != src->tail) {
87 89 if (s->start + offset >= cut)
88 90 break; /* we've gone far enough */
89 91
90 92 postend = offset + s->start + s->len;
91 93 if (postend <= cut) {
92 94 /* save this hunk */
93 95 offset += s->start + s->len - s->end;
94 96 *d++ = *s++;
95 97 }
96 98 else {
97 99 /* break up this hunk */
98 100 c = cut - offset;
99 101 if (s->end < c)
100 102 c = s->end;
101 103 l = cut - offset - s->start;
102 104 if (s->len < l)
103 105 l = s->len;
104 106
105 107 offset += s->start + l - c;
106 108
107 109 d->start = s->start;
108 110 d->end = c;
109 111 d->len = l;
110 112 d->data = s->data;
111 113 d++;
112 114 s->start = c;
113 115 s->len = s->len - l;
114 116 s->data = s->data + l;
115 117
116 118 break;
117 119 }
118 120 }
119 121
120 122 dest->tail = d;
121 123 src->head = s;
122 124 return offset;
123 125 }
124 126
125 127 /* like gather, but with no output list */
126 128 static int discard(struct flist *src, int cut, int offset)
127 129 {
128 130 struct frag *s = src->head;
129 131 int postend, c, l;
130 132
131 133 while (s != src->tail) {
132 134 if (s->start + offset >= cut)
133 135 break;
134 136
135 137 postend = offset + s->start + s->len;
136 138 if (postend <= cut) {
137 139 offset += s->start + s->len - s->end;
138 140 s++;
139 141 }
140 142 else {
141 143 c = cut - offset;
142 144 if (s->end < c)
143 145 c = s->end;
144 146 l = cut - offset - s->start;
145 147 if (s->len < l)
146 148 l = s->len;
147 149
148 150 offset += s->start + l - c;
149 151 s->start = c;
150 152 s->len = s->len - l;
151 153 s->data = s->data + l;
152 154
153 155 break;
154 156 }
155 157 }
156 158
157 159 src->head = s;
158 160 return offset;
159 161 }
160 162
161 163 /* combine hunk lists a and b, while adjusting b for offset changes in a/
162 164 this deletes a and b and returns the resultant list. */
163 165 static struct flist *combine(struct flist *a, struct flist *b)
164 166 {
165 167 struct flist *c = NULL;
166 168 struct frag *bh, *ct;
167 169 int offset = 0, post;
168 170
169 171 if (a && b)
170 172 c = lalloc((lsize(a) + lsize(b)) * 2);
171 173
172 174 if (c) {
173 175
174 176 for (bh = b->head; bh != b->tail; bh++) {
175 177 /* save old hunks */
176 178 offset = gather(c, a, bh->start, offset);
177 179
178 180 /* discard replaced hunks */
179 181 post = discard(a, bh->end, offset);
180 182
181 183 /* insert new hunk */
182 184 ct = c->tail;
183 185 ct->start = bh->start - offset;
184 186 ct->end = bh->end - post;
185 187 ct->len = bh->len;
186 188 ct->data = bh->data;
187 189 c->tail++;
188 190 offset = post;
189 191 }
190 192
191 193 /* hold on to tail from a */
192 194 memcpy(c->tail, a->head, sizeof(struct frag) * lsize(a));
193 195 c->tail += lsize(a);
194 196 }
195 197
196 198 lfree(a);
197 199 lfree(b);
198 200 return c;
199 201 }
200 202
201 203 /* decode a binary patch into a hunk list */
202 204 static struct flist *decode(char *bin, int len)
203 205 {
204 206 struct flist *l;
205 207 struct frag *lt;
206 208 char *end = bin + len;
207 209 char decode[12]; /* for dealing with alignment issues */
208 210
209 211 /* assume worst case size, we won't have many of these lists */
210 212 l = lalloc(len / 12);
211 213 lt = l->tail;
212 214
213 215 while (bin < end) {
214 216 memcpy(decode, bin, 12);
215 217 lt->start = ntohl(*(uint32_t *)decode);
216 218 lt->end = ntohl(*(uint32_t *)(decode + 4));
217 219 lt->len = ntohl(*(uint32_t *)(decode + 8));
218 220 lt->data = bin + 12;
219 221 bin += 12 + lt->len;
220 222 lt++;
221 223 }
222 224
223 225 l->tail = lt;
224 226 return l;
225 227 }
226 228
227 229 /* calculate the size of resultant text */
228 230 static int calcsize(int len, struct flist *l)
229 231 {
230 232 int outlen = 0, last = 0;
231 233 struct frag *f = l->head;
232 234
233 235 while (f != l->tail) {
234 236 outlen += f->start - last;
235 237 last = f->end;
236 238 outlen += f->len;
237 239 f++;
238 240 }
239 241
240 242 outlen += len - last;
241 243 return outlen;
242 244 }
243 245
244 246 static void apply(char *buf, char *orig, int len, struct flist *l)
245 247 {
246 248 struct frag *f = l->head;
247 249 int last = 0;
248 250 char *p = buf;
249 251
250 252 while (f != l->tail) {
251 253 memcpy(p, orig + last, f->start - last);
252 254 p += f->start - last;
253 255 memcpy(p, f->data, f->len);
254 256 last = f->end;
255 257 p += f->len;
256 258 f++;
257 259 }
258 260 memcpy(p, orig + last, len - last);
259 261 }
260 262
261 263 /* recursively generate a patch of all bins between start and end */
262 264 static struct flist *fold(PyObject *bins, int start, int end)
263 265 {
264 266 int len;
265 267
266 268 if (start + 1 == end) {
267 269 /* trivial case, output a decoded list */
268 270 PyObject *tmp = PyList_GetItem(bins, start);
269 271 if (!tmp)
270 272 return NULL;
271 273 return decode(PyString_AsString(tmp), PyString_Size(tmp));
272 274 }
273 275
274 276 /* divide and conquer, memory management is elsewhere */
275 277 len = (end - start) / 2;
276 278 return combine(fold(bins, start, start + len),
277 279 fold(bins, start + len, end));
278 280 }
279 281
280 282 static PyObject *
281 283 patches(PyObject *self, PyObject *args)
282 284 {
283 285 PyObject *text, *bins, *result;
284 286 struct flist *patch;
285 287 char *in, *out;
286 288 int len, outlen;
287 289
288 290 if (!PyArg_ParseTuple(args, "SO:mpatch", &text, &bins))
289 291 return NULL;
290 292
291 293 len = PyList_Size(bins);
292 294 if (!len) {
293 295 /* nothing to do */
294 296 Py_INCREF(text);
295 297 return text;
296 298 }
297 299
298 300 patch = fold(bins, 0, len);
299 301 if (!patch)
300 302 return PyErr_NoMemory();
301 303
302 304 outlen = calcsize(PyString_Size(text), patch);
303 305 result = PyString_FromStringAndSize(NULL, outlen);
304 306 if (result) {
305 307 in = PyString_AsString(text);
306 308 out = PyString_AsString(result);
307 309 apply(out, in, PyString_Size(text), patch);
308 310 }
309 311
310 312 lfree(patch);
311 313 return result;
312 314 }
313 315
314 316 static PyMethodDef methods[] = {
315 317 {"patches", patches, METH_VARARGS, "apply a series of patches\n"},
316 318 {NULL, NULL}
317 319 };
318 320
319 321 PyMODINIT_FUNC
320 322 initmpatch(void)
321 323 {
322 324 Py_InitModule3("mpatch", methods, mpatch_doc);
323 325 }
324 326
General Comments 0
You need to be logged in to leave comments. Login now