##// END OF EJS Templates
bdiff: use ssize_t in favor of Py_ssize_t in cpython-unaware locations...
Maciej Fijalkowski -
r29539:666832b9 default
parent child Browse files
Show More
@@ -1,488 +1,489 b''
1 1 /*
2 2 bdiff.c - efficient binary diff extension for Mercurial
3 3
4 4 Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5 5
6 6 This software may be used and distributed according to the terms of
7 7 the GNU General Public License, incorporated herein by reference.
8 8
9 9 Based roughly on Python difflib
10 10 */
11 11
12 12 #define PY_SSIZE_T_CLEAN
13 13 #include <Python.h>
14 14 #include <stdlib.h>
15 15 #include <string.h>
16 16 #include <limits.h>
17 17
18 #include "compat.h"
18 19 #include "util.h"
19 20 #include "bitmanipulation.h"
20 21
21 22 struct line {
22 23 int hash, n, e;
23 Py_ssize_t len;
24 ssize_t len;
24 25 const char *l;
25 26 };
26 27
27 28 struct pos {
28 29 int pos, len;
29 30 };
30 31
31 32 struct hunk;
32 33 struct hunk {
33 34 int a1, a2, b1, b2;
34 35 struct hunk *next;
35 36 };
36 37
37 static int splitlines(const char *a, Py_ssize_t len, struct line **lr)
38 static int splitlines(const char *a, ssize_t len, struct line **lr)
38 39 {
39 40 unsigned hash;
40 41 int i;
41 42 const char *p, *b = a;
42 43 const char * const plast = a + len - 1;
43 44 struct line *l;
44 45
45 46 /* count the lines */
46 47 i = 1; /* extra line for sentinel */
47 48 for (p = a; p < a + len; p++)
48 49 if (*p == '\n' || p == plast)
49 50 i++;
50 51
51 52 *lr = l = (struct line *)malloc(sizeof(struct line) * i);
52 53 if (!l)
53 54 return -1;
54 55
55 56 /* build the line array and calculate hashes */
56 57 hash = 0;
57 58 for (p = a; p < a + len; p++) {
58 59 /* Leonid Yuriev's hash */
59 60 hash = (hash * 1664525) + (unsigned char)*p + 1013904223;
60 61
61 62 if (*p == '\n' || p == plast) {
62 63 l->hash = hash;
63 64 hash = 0;
64 65 l->len = p - b + 1;
65 66 l->l = b;
66 67 l->n = INT_MAX;
67 68 l++;
68 69 b = p + 1;
69 70 }
70 71 }
71 72
72 73 /* set up a sentinel */
73 74 l->hash = 0;
74 75 l->len = 0;
75 76 l->l = a + len;
76 77 return i - 1;
77 78 }
78 79
79 80 static inline int cmp(struct line *a, struct line *b)
80 81 {
81 82 return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);
82 83 }
83 84
84 85 static int equatelines(struct line *a, int an, struct line *b, int bn)
85 86 {
86 87 int i, j, buckets = 1, t, scale;
87 88 struct pos *h = NULL;
88 89
89 90 /* build a hash table of the next highest power of 2 */
90 91 while (buckets < bn + 1)
91 92 buckets *= 2;
92 93
93 94 /* try to allocate a large hash table to avoid collisions */
94 95 for (scale = 4; scale; scale /= 2) {
95 96 h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
96 97 if (h)
97 98 break;
98 99 }
99 100
100 101 if (!h)
101 102 return 0;
102 103
103 104 buckets = buckets * scale - 1;
104 105
105 106 /* clear the hash table */
106 107 for (i = 0; i <= buckets; i++) {
107 108 h[i].pos = -1;
108 109 h[i].len = 0;
109 110 }
110 111
111 112 /* add lines to the hash table chains */
112 113 for (i = 0; i < bn; i++) {
113 114 /* find the equivalence class */
114 115 for (j = b[i].hash & buckets; h[j].pos != -1;
115 116 j = (j + 1) & buckets)
116 117 if (!cmp(b + i, b + h[j].pos))
117 118 break;
118 119
119 120 /* add to the head of the equivalence class */
120 121 b[i].n = h[j].pos;
121 122 b[i].e = j;
122 123 h[j].pos = i;
123 124 h[j].len++; /* keep track of popularity */
124 125 }
125 126
126 127 /* compute popularity threshold */
127 128 t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);
128 129
129 130 /* match items in a to their equivalence class in b */
130 131 for (i = 0; i < an; i++) {
131 132 /* find the equivalence class */
132 133 for (j = a[i].hash & buckets; h[j].pos != -1;
133 134 j = (j + 1) & buckets)
134 135 if (!cmp(a + i, b + h[j].pos))
135 136 break;
136 137
137 138 a[i].e = j; /* use equivalence class for quick compare */
138 139 if (h[j].len <= t)
139 140 a[i].n = h[j].pos; /* point to head of match list */
140 141 else
141 142 a[i].n = -1; /* too popular */
142 143 }
143 144
144 145 /* discard hash tables */
145 146 free(h);
146 147 return 1;
147 148 }
148 149
149 150 static int longest_match(struct line *a, struct line *b, struct pos *pos,
150 151 int a1, int a2, int b1, int b2, int *omi, int *omj)
151 152 {
152 153 int mi = a1, mj = b1, mk = 0, i, j, k, half;
153 154
154 155 /* window our search on large regions to better bound
155 156 worst-case performance. by choosing a window at the end, we
156 157 reduce skipping overhead on the b chains. */
157 158 if (a2 - a1 > 30000)
158 159 a1 = a2 - 30000;
159 160
160 161 half = (a1 + a2) / 2;
161 162
162 163 for (i = a1; i < a2; i++) {
163 164 /* skip all lines in b after the current block */
164 165 for (j = a[i].n; j >= b2; j = b[j].n)
165 166 ;
166 167
167 168 /* loop through all lines match a[i] in b */
168 169 for (; j >= b1; j = b[j].n) {
169 170 /* does this extend an earlier match? */
170 171 for (k = 1; j - k >= b1 && i - k >= a1; k++) {
171 172 /* reached an earlier match? */
172 173 if (pos[j - k].pos == i - k) {
173 174 k += pos[j - k].len;
174 175 break;
175 176 }
176 177 /* previous line mismatch? */
177 178 if (a[i - k].e != b[j - k].e)
178 179 break;
179 180 }
180 181
181 182 pos[j].pos = i;
182 183 pos[j].len = k;
183 184
184 185 /* best match so far? we prefer matches closer
185 186 to the middle to balance recursion */
186 187 if (k > mk || (k == mk && (i <= mi || i < half))) {
187 188 mi = i;
188 189 mj = j;
189 190 mk = k;
190 191 }
191 192 }
192 193 }
193 194
194 195 if (mk) {
195 196 mi = mi - mk + 1;
196 197 mj = mj - mk + 1;
197 198 }
198 199
199 200 /* expand match to include subsequent popular lines */
200 201 while (mi + mk < a2 && mj + mk < b2 &&
201 202 a[mi + mk].e == b[mj + mk].e)
202 203 mk++;
203 204
204 205 *omi = mi;
205 206 *omj = mj;
206 207
207 208 return mk;
208 209 }
209 210
210 211 static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,
211 212 int a1, int a2, int b1, int b2, struct hunk *l)
212 213 {
213 214 int i, j, k;
214 215
215 216 while (1) {
216 217 /* find the longest match in this chunk */
217 218 k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);
218 219 if (!k)
219 220 return l;
220 221
221 222 /* and recurse on the remaining chunks on either side */
222 223 l = recurse(a, b, pos, a1, i, b1, j, l);
223 224 if (!l)
224 225 return NULL;
225 226
226 227 l->next = (struct hunk *)malloc(sizeof(struct hunk));
227 228 if (!l->next)
228 229 return NULL;
229 230
230 231 l = l->next;
231 232 l->a1 = i;
232 233 l->a2 = i + k;
233 234 l->b1 = j;
234 235 l->b2 = j + k;
235 236 l->next = NULL;
236 237
237 238 /* tail-recursion didn't happen, so do equivalent iteration */
238 239 a1 = i + k;
239 240 b1 = j + k;
240 241 }
241 242 }
242 243
243 244 static int diff(struct line *a, int an, struct line *b, int bn,
244 245 struct hunk *base)
245 246 {
246 247 struct hunk *curr;
247 248 struct pos *pos;
248 249 int t, count = 0;
249 250
250 251 /* allocate and fill arrays */
251 252 t = equatelines(a, an, b, bn);
252 253 pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));
253 254
254 255 if (pos && t) {
255 256 /* generate the matching block list */
256 257
257 258 curr = recurse(a, b, pos, 0, an, 0, bn, base);
258 259 if (!curr)
259 260 return -1;
260 261
261 262 /* sentinel end hunk */
262 263 curr->next = (struct hunk *)malloc(sizeof(struct hunk));
263 264 if (!curr->next)
264 265 return -1;
265 266 curr = curr->next;
266 267 curr->a1 = curr->a2 = an;
267 268 curr->b1 = curr->b2 = bn;
268 269 curr->next = NULL;
269 270 }
270 271
271 272 free(pos);
272 273
273 274 /* normalize the hunk list, try to push each hunk towards the end */
274 275 for (curr = base->next; curr; curr = curr->next) {
275 276 struct hunk *next = curr->next;
276 277
277 278 if (!next)
278 279 break;
279 280
280 281 if (curr->a2 == next->a1 || curr->b2 == next->b1)
281 282 while (curr->a2 < an && curr->b2 < bn
282 283 && next->a1 < next->a2
283 284 && next->b1 < next->b2
284 285 && !cmp(a + curr->a2, b + curr->b2)) {
285 286 curr->a2++;
286 287 next->a1++;
287 288 curr->b2++;
288 289 next->b1++;
289 290 }
290 291 }
291 292
292 293 for (curr = base->next; curr; curr = curr->next)
293 294 count++;
294 295 return count;
295 296 }
296 297
297 298 static void freehunks(struct hunk *l)
298 299 {
299 300 struct hunk *n;
300 301 for (; l; l = n) {
301 302 n = l->next;
302 303 free(l);
303 304 }
304 305 }
305 306
306 307 static PyObject *blocks(PyObject *self, PyObject *args)
307 308 {
308 309 PyObject *sa, *sb, *rl = NULL, *m;
309 310 struct line *a, *b;
310 311 struct hunk l, *h;
311 312 int an, bn, count, pos = 0;
312 313
313 314 l.next = NULL;
314 315
315 316 if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
316 317 return NULL;
317 318
318 319 an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
319 320 bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
320 321
321 322 if (!a || !b)
322 323 goto nomem;
323 324
324 325 count = diff(a, an, b, bn, &l);
325 326 if (count < 0)
326 327 goto nomem;
327 328
328 329 rl = PyList_New(count);
329 330 if (!rl)
330 331 goto nomem;
331 332
332 333 for (h = l.next; h; h = h->next) {
333 334 m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
334 335 PyList_SetItem(rl, pos, m);
335 336 pos++;
336 337 }
337 338
338 339 nomem:
339 340 free(a);
340 341 free(b);
341 342 freehunks(l.next);
342 343 return rl ? rl : PyErr_NoMemory();
343 344 }
344 345
345 346 static PyObject *bdiff(PyObject *self, PyObject *args)
346 347 {
347 348 char *sa, *sb, *rb;
348 349 PyObject *result = NULL;
349 350 struct line *al, *bl;
350 351 struct hunk l, *h;
351 352 int an, bn, count;
352 353 Py_ssize_t len = 0, la, lb;
353 354 PyThreadState *_save;
354 355
355 356 l.next = NULL;
356 357
357 358 if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
358 359 return NULL;
359 360
360 361 if (la > UINT_MAX || lb > UINT_MAX) {
361 362 PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
362 363 return NULL;
363 364 }
364 365
365 366 _save = PyEval_SaveThread();
366 367 an = splitlines(sa, la, &al);
367 368 bn = splitlines(sb, lb, &bl);
368 369 if (!al || !bl)
369 370 goto nomem;
370 371
371 372 count = diff(al, an, bl, bn, &l);
372 373 if (count < 0)
373 374 goto nomem;
374 375
375 376 /* calculate length of output */
376 377 la = lb = 0;
377 378 for (h = l.next; h; h = h->next) {
378 379 if (h->a1 != la || h->b1 != lb)
379 380 len += 12 + bl[h->b1].l - bl[lb].l;
380 381 la = h->a2;
381 382 lb = h->b2;
382 383 }
383 384 PyEval_RestoreThread(_save);
384 385 _save = NULL;
385 386
386 387 result = PyBytes_FromStringAndSize(NULL, len);
387 388
388 389 if (!result)
389 390 goto nomem;
390 391
391 392 /* build binary patch */
392 393 rb = PyBytes_AsString(result);
393 394 la = lb = 0;
394 395
395 396 for (h = l.next; h; h = h->next) {
396 397 if (h->a1 != la || h->b1 != lb) {
397 398 len = bl[h->b1].l - bl[lb].l;
398 399 putbe32((uint32_t)(al[la].l - al->l), rb);
399 400 putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);
400 401 putbe32((uint32_t)len, rb + 8);
401 402 memcpy(rb + 12, bl[lb].l, len);
402 403 rb += 12 + len;
403 404 }
404 405 la = h->a2;
405 406 lb = h->b2;
406 407 }
407 408
408 409 nomem:
409 410 if (_save)
410 411 PyEval_RestoreThread(_save);
411 412 free(al);
412 413 free(bl);
413 414 freehunks(l.next);
414 415 return result ? result : PyErr_NoMemory();
415 416 }
416 417
417 418 /*
418 419 * If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,
419 420 * reduce whitespace sequences to a single space and trim remaining whitespace
420 421 * from end of lines.
421 422 */
422 423 static PyObject *fixws(PyObject *self, PyObject *args)
423 424 {
424 425 PyObject *s, *result = NULL;
425 426 char allws, c;
426 427 const char *r;
427 428 Py_ssize_t i, rlen, wlen = 0;
428 429 char *w;
429 430
430 431 if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))
431 432 return NULL;
432 433 r = PyBytes_AsString(s);
433 434 rlen = PyBytes_Size(s);
434 435
435 436 w = (char *)malloc(rlen ? rlen : 1);
436 437 if (!w)
437 438 goto nomem;
438 439
439 440 for (i = 0; i != rlen; i++) {
440 441 c = r[i];
441 442 if (c == ' ' || c == '\t' || c == '\r') {
442 443 if (!allws && (wlen == 0 || w[wlen - 1] != ' '))
443 444 w[wlen++] = ' ';
444 445 } else if (c == '\n' && !allws
445 446 && wlen > 0 && w[wlen - 1] == ' ') {
446 447 w[wlen - 1] = '\n';
447 448 } else {
448 449 w[wlen++] = c;
449 450 }
450 451 }
451 452
452 453 result = PyBytes_FromStringAndSize(w, wlen);
453 454
454 455 nomem:
455 456 free(w);
456 457 return result ? result : PyErr_NoMemory();
457 458 }
458 459
459 460
460 461 static char mdiff_doc[] = "Efficient binary diff.";
461 462
462 463 static PyMethodDef methods[] = {
463 464 {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
464 465 {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
465 466 {"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},
466 467 {NULL, NULL}
467 468 };
468 469
469 470 #ifdef IS_PY3K
470 471 static struct PyModuleDef bdiff_module = {
471 472 PyModuleDef_HEAD_INIT,
472 473 "bdiff",
473 474 mdiff_doc,
474 475 -1,
475 476 methods
476 477 };
477 478
478 479 PyMODINIT_FUNC PyInit_bdiff(void)
479 480 {
480 481 return PyModule_Create(&bdiff_module);
481 482 }
482 483 #else
483 484 PyMODINIT_FUNC initbdiff(void)
484 485 {
485 486 Py_InitModule3("bdiff", methods, mdiff_doc);
486 487 }
487 488 #endif
488 489
General Comments 0
You need to be logged in to leave comments. Login now