##// END OF EJS Templates
mdiff: replace wscleanup() regexps with C loops...
Patrick Mezard -
r15530:eeac5e17 default
parent child Browse files
Show More
@@ -1,455 +1,499 b''
1 1 /*
2 2 bdiff.c - efficient binary diff extension for Mercurial
3 3
4 4 Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5 5
6 6 This software may be used and distributed according to the terms of
7 7 the GNU General Public License, incorporated herein by reference.
8 8
9 9 Based roughly on Python difflib
10 10 */
11 11
12 12 #include <Python.h>
13 13 #include <stdlib.h>
14 14 #include <string.h>
15 15 #include <limits.h>
16 16
17 17 #if defined __hpux || defined __SUNPRO_C || defined _AIX
18 18 #define inline
19 19 #endif
20 20
21 21 #ifdef __linux
22 22 #define inline __inline
23 23 #endif
24 24
25 25 #ifdef _WIN32
26 26 #ifdef _MSC_VER
27 27 #define inline __inline
28 28 typedef unsigned long uint32_t;
29 29 #else
30 30 #include <stdint.h>
31 31 #endif
32 32 static uint32_t htonl(uint32_t x)
33 33 {
34 34 return ((x & 0x000000ffUL) << 24) |
35 35 ((x & 0x0000ff00UL) << 8) |
36 36 ((x & 0x00ff0000UL) >> 8) |
37 37 ((x & 0xff000000UL) >> 24);
38 38 }
39 39 #else
40 40 #include <sys/types.h>
41 41 #if defined __BEOS__ && !defined __HAIKU__
42 42 #include <ByteOrder.h>
43 43 #else
44 44 #include <arpa/inet.h>
45 45 #endif
46 46 #include <inttypes.h>
47 47 #endif
48 48
49 49 #include "util.h"
50 50
51 51 struct line {
52 52 int hash, len, n, e;
53 53 const char *l;
54 54 };
55 55
56 56 struct pos {
57 57 int pos, len;
58 58 };
59 59
60 60 struct hunk;
61 61 struct hunk {
62 62 int a1, a2, b1, b2;
63 63 struct hunk *next;
64 64 };
65 65
66 66 static int splitlines(const char *a, int len, struct line **lr)
67 67 {
68 68 unsigned hash;
69 69 int i;
70 70 const char *p, *b = a;
71 71 const char * const plast = a + len - 1;
72 72 struct line *l;
73 73
74 74 /* count the lines */
75 75 i = 1; /* extra line for sentinel */
76 76 for (p = a; p < a + len; p++)
77 77 if (*p == '\n' || p == plast)
78 78 i++;
79 79
80 80 *lr = l = (struct line *)malloc(sizeof(struct line) * i);
81 81 if (!l)
82 82 return -1;
83 83
84 84 /* build the line array and calculate hashes */
85 85 hash = 0;
86 86 for (p = a; p < a + len; p++) {
87 87 /* Leonid Yuriev's hash */
88 88 hash = (hash * 1664525) + (unsigned char)*p + 1013904223;
89 89
90 90 if (*p == '\n' || p == plast) {
91 91 l->hash = hash;
92 92 hash = 0;
93 93 l->len = p - b + 1;
94 94 l->l = b;
95 95 l->n = INT_MAX;
96 96 l++;
97 97 b = p + 1;
98 98 }
99 99 }
100 100
101 101 /* set up a sentinel */
102 102 l->hash = 0;
103 103 l->len = 0;
104 104 l->l = a + len;
105 105 return i - 1;
106 106 }
107 107
108 108 static inline int cmp(struct line *a, struct line *b)
109 109 {
110 110 return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);
111 111 }
112 112
113 113 static int equatelines(struct line *a, int an, struct line *b, int bn)
114 114 {
115 115 int i, j, buckets = 1, t, scale;
116 116 struct pos *h = NULL;
117 117
118 118 /* build a hash table of the next highest power of 2 */
119 119 while (buckets < bn + 1)
120 120 buckets *= 2;
121 121
122 122 /* try to allocate a large hash table to avoid collisions */
123 123 for (scale = 4; scale; scale /= 2) {
124 124 h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
125 125 if (h)
126 126 break;
127 127 }
128 128
129 129 if (!h)
130 130 return 0;
131 131
132 132 buckets = buckets * scale - 1;
133 133
134 134 /* clear the hash table */
135 135 for (i = 0; i <= buckets; i++) {
136 136 h[i].pos = INT_MAX;
137 137 h[i].len = 0;
138 138 }
139 139
140 140 /* add lines to the hash table chains */
141 141 for (i = bn - 1; i >= 0; i--) {
142 142 /* find the equivalence class */
143 143 for (j = b[i].hash & buckets; h[j].pos != INT_MAX;
144 144 j = (j + 1) & buckets)
145 145 if (!cmp(b + i, b + h[j].pos))
146 146 break;
147 147
148 148 /* add to the head of the equivalence class */
149 149 b[i].n = h[j].pos;
150 150 b[i].e = j;
151 151 h[j].pos = i;
152 152 h[j].len++; /* keep track of popularity */
153 153 }
154 154
155 155 /* compute popularity threshold */
156 156 t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);
157 157
158 158 /* match items in a to their equivalence class in b */
159 159 for (i = 0; i < an; i++) {
160 160 /* find the equivalence class */
161 161 for (j = a[i].hash & buckets; h[j].pos != INT_MAX;
162 162 j = (j + 1) & buckets)
163 163 if (!cmp(a + i, b + h[j].pos))
164 164 break;
165 165
166 166 a[i].e = j; /* use equivalence class for quick compare */
167 167 if (h[j].len <= t)
168 168 a[i].n = h[j].pos; /* point to head of match list */
169 169 else
170 170 a[i].n = INT_MAX; /* too popular */
171 171 }
172 172
173 173 /* discard hash tables */
174 174 free(h);
175 175 return 1;
176 176 }
177 177
178 178 static int longest_match(struct line *a, struct line *b, struct pos *pos,
179 179 int a1, int a2, int b1, int b2, int *omi, int *omj)
180 180 {
181 181 int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;
182 182
183 183 for (i = a1; i < a2; i++) {
184 184 /* skip things before the current block */
185 185 for (j = a[i].n; j < b1; j = b[j].n)
186 186 ;
187 187
188 188 /* loop through all lines match a[i] in b */
189 189 for (; j < b2; j = b[j].n) {
190 190 /* does this extend an earlier match? */
191 191 if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)
192 192 k = pos[j - 1].len + 1;
193 193 else
194 194 k = 1;
195 195 pos[j].pos = i;
196 196 pos[j].len = k;
197 197
198 198 /* best match so far? */
199 199 if (k > mk) {
200 200 mi = i;
201 201 mj = j;
202 202 mk = k;
203 203 }
204 204 }
205 205 }
206 206
207 207 if (mk) {
208 208 mi = mi - mk + 1;
209 209 mj = mj - mk + 1;
210 210 }
211 211
212 212 /* expand match to include neighboring popular lines */
213 213 while (mi - mb > a1 && mj - mb > b1 &&
214 214 a[mi - mb - 1].e == b[mj - mb - 1].e)
215 215 mb++;
216 216 while (mi + mk < a2 && mj + mk < b2 &&
217 217 a[mi + mk].e == b[mj + mk].e)
218 218 mk++;
219 219
220 220 *omi = mi - mb;
221 221 *omj = mj - mb;
222 222
223 223 return mk + mb;
224 224 }
225 225
226 226 static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,
227 227 int a1, int a2, int b1, int b2, struct hunk *l)
228 228 {
229 229 int i, j, k;
230 230
231 231 while (1) {
232 232 /* find the longest match in this chunk */
233 233 k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);
234 234 if (!k)
235 235 return l;
236 236
237 237 /* and recurse on the remaining chunks on either side */
238 238 l = recurse(a, b, pos, a1, i, b1, j, l);
239 239 if (!l)
240 240 return NULL;
241 241
242 242 l->next = (struct hunk *)malloc(sizeof(struct hunk));
243 243 if (!l->next)
244 244 return NULL;
245 245
246 246 l = l->next;
247 247 l->a1 = i;
248 248 l->a2 = i + k;
249 249 l->b1 = j;
250 250 l->b2 = j + k;
251 251 l->next = NULL;
252 252
253 253 /* tail-recursion didn't happen, so do equivalent iteration */
254 254 a1 = i + k;
255 255 b1 = j + k;
256 256 }
257 257 }
258 258
259 259 static int diff(struct line *a, int an, struct line *b, int bn,
260 260 struct hunk *base)
261 261 {
262 262 struct hunk *curr;
263 263 struct pos *pos;
264 264 int t, count = 0;
265 265
266 266 /* allocate and fill arrays */
267 267 t = equatelines(a, an, b, bn);
268 268 pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));
269 269
270 270 if (pos && t) {
271 271 /* generate the matching block list */
272 272
273 273 curr = recurse(a, b, pos, 0, an, 0, bn, base);
274 274 if (!curr)
275 275 return -1;
276 276
277 277 /* sentinel end hunk */
278 278 curr->next = (struct hunk *)malloc(sizeof(struct hunk));
279 279 if (!curr->next)
280 280 return -1;
281 281 curr = curr->next;
282 282 curr->a1 = curr->a2 = an;
283 283 curr->b1 = curr->b2 = bn;
284 284 curr->next = NULL;
285 285 }
286 286
287 287 free(pos);
288 288
289 289 /* normalize the hunk list, try to push each hunk towards the end */
290 290 for (curr = base->next; curr; curr = curr->next) {
291 291 struct hunk *next = curr->next;
292 292 int shift = 0;
293 293
294 294 if (!next)
295 295 break;
296 296
297 297 if (curr->a2 == next->a1)
298 298 while (curr->a2 + shift < an && curr->b2 + shift < bn
299 299 && !cmp(a + curr->a2 + shift,
300 300 b + curr->b2 + shift))
301 301 shift++;
302 302 else if (curr->b2 == next->b1)
303 303 while (curr->b2 + shift < bn && curr->a2 + shift < an
304 304 && !cmp(b + curr->b2 + shift,
305 305 a + curr->a2 + shift))
306 306 shift++;
307 307 if (!shift)
308 308 continue;
309 309 curr->b2 += shift;
310 310 next->b1 += shift;
311 311 curr->a2 += shift;
312 312 next->a1 += shift;
313 313 }
314 314
315 315 for (curr = base->next; curr; curr = curr->next)
316 316 count++;
317 317 return count;
318 318 }
319 319
320 320 static void freehunks(struct hunk *l)
321 321 {
322 322 struct hunk *n;
323 323 for (; l; l = n) {
324 324 n = l->next;
325 325 free(l);
326 326 }
327 327 }
328 328
329 329 static PyObject *blocks(PyObject *self, PyObject *args)
330 330 {
331 331 PyObject *sa, *sb, *rl = NULL, *m;
332 332 struct line *a, *b;
333 333 struct hunk l, *h;
334 334 int an, bn, count, pos = 0;
335 335
336 336 if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
337 337 return NULL;
338 338
339 339 an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
340 340 bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
341 341
342 342 if (!a || !b)
343 343 goto nomem;
344 344
345 345 l.next = NULL;
346 346 count = diff(a, an, b, bn, &l);
347 347 if (count < 0)
348 348 goto nomem;
349 349
350 350 rl = PyList_New(count);
351 351 if (!rl)
352 352 goto nomem;
353 353
354 354 for (h = l.next; h; h = h->next) {
355 355 m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
356 356 PyList_SetItem(rl, pos, m);
357 357 pos++;
358 358 }
359 359
360 360 nomem:
361 361 free(a);
362 362 free(b);
363 363 freehunks(l.next);
364 364 return rl ? rl : PyErr_NoMemory();
365 365 }
366 366
367 367 static PyObject *bdiff(PyObject *self, PyObject *args)
368 368 {
369 369 char *sa, *sb, *rb;
370 370 PyObject *result = NULL;
371 371 struct line *al, *bl;
372 372 struct hunk l, *h;
373 373 uint32_t encode[3];
374 374 int an, bn, len = 0, la, lb, count;
375 375
376 376 if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
377 377 return NULL;
378 378
379 379 an = splitlines(sa, la, &al);
380 380 bn = splitlines(sb, lb, &bl);
381 381 if (!al || !bl)
382 382 goto nomem;
383 383
384 384 l.next = NULL;
385 385 count = diff(al, an, bl, bn, &l);
386 386 if (count < 0)
387 387 goto nomem;
388 388
389 389 /* calculate length of output */
390 390 la = lb = 0;
391 391 for (h = l.next; h; h = h->next) {
392 392 if (h->a1 != la || h->b1 != lb)
393 393 len += 12 + bl[h->b1].l - bl[lb].l;
394 394 la = h->a2;
395 395 lb = h->b2;
396 396 }
397 397
398 398 result = PyBytes_FromStringAndSize(NULL, len);
399 399
400 400 if (!result)
401 401 goto nomem;
402 402
403 403 /* build binary patch */
404 404 rb = PyBytes_AsString(result);
405 405 la = lb = 0;
406 406
407 407 for (h = l.next; h; h = h->next) {
408 408 if (h->a1 != la || h->b1 != lb) {
409 409 len = bl[h->b1].l - bl[lb].l;
410 410 encode[0] = htonl(al[la].l - al->l);
411 411 encode[1] = htonl(al[h->a1].l - al->l);
412 412 encode[2] = htonl(len);
413 413 memcpy(rb, encode, 12);
414 414 memcpy(rb + 12, bl[lb].l, len);
415 415 rb += 12 + len;
416 416 }
417 417 la = h->a2;
418 418 lb = h->b2;
419 419 }
420 420
421 421 nomem:
422 422 free(al);
423 423 free(bl);
424 424 freehunks(l.next);
425 425 return result ? result : PyErr_NoMemory();
426 426 }
427 427
428 /*
429 * If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,
430 * reduce whitespace sequences to a single space and trim remaining whitespace
431 * from end of lines.
432 */
433 static PyObject *fixws(PyObject *self, PyObject *args)
434 {
435 PyObject *s, *result = NULL;
436 char allws, c;
437 const char *r;
438 int i, rlen, wlen = 0;
439 char *w;
440
441 if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))
442 return NULL;
443 r = PyBytes_AsString(s);
444 rlen = PyBytes_Size(s);
445
446 w = (char *)malloc(rlen);
447 if (!w)
448 goto nomem;
449
450 for (i = 0; i != rlen; i++) {
451 c = r[i];
452 if (c == ' ' || c == '\t' || c == '\r') {
453 if (!allws && (wlen == 0 || w[wlen - 1] != ' '))
454 w[wlen++] = ' ';
455 } else if (c == '\n' && !allws
456 && wlen > 0 && w[wlen - 1] == ' ') {
457 w[wlen - 1] = '\n';
458 } else {
459 w[wlen++] = c;
460 }
461 }
462
463 result = PyBytes_FromStringAndSize(w, wlen);
464
465 nomem:
466 free(w);
467 return result ? result : PyErr_NoMemory();
468 }
469
470
428 471 static char mdiff_doc[] = "Efficient binary diff.";
429 472
430 473 static PyMethodDef methods[] = {
431 474 {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
432 475 {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
476 {"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},
433 477 {NULL, NULL}
434 478 };
435 479
436 480 #ifdef IS_PY3K
437 481 static struct PyModuleDef bdiff_module = {
438 482 PyModuleDef_HEAD_INIT,
439 483 "bdiff",
440 484 mdiff_doc,
441 485 -1,
442 486 methods
443 487 };
444 488
445 489 PyMODINIT_FUNC PyInit_bdiff(void)
446 490 {
447 491 return PyModule_Create(&bdiff_module);
448 492 }
449 493 #else
450 494 PyMODINIT_FUNC initbdiff(void)
451 495 {
452 496 Py_InitModule3("bdiff", methods, mdiff_doc);
453 497 }
454 498 #endif
455 499
@@ -1,334 +1,333 b''
1 1 # mdiff.py - diff and patch routines for mercurial
2 2 #
3 3 # Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from i18n import _
9 9 import bdiff, mpatch, util
10 10 import re, struct
11 11
12 12 def splitnewlines(text):
13 13 '''like str.splitlines, but only split on newlines.'''
14 14 lines = [l + '\n' for l in text.split('\n')]
15 15 if lines:
16 16 if lines[-1] == '\n':
17 17 lines.pop()
18 18 else:
19 19 lines[-1] = lines[-1][:-1]
20 20 return lines
21 21
22 22 class diffopts(object):
23 23 '''context is the number of context lines
24 24 text treats all files as text
25 25 showfunc enables diff -p output
26 26 git enables the git extended patch format
27 27 nodates removes dates from diff headers
28 28 ignorews ignores all whitespace changes in the diff
29 29 ignorewsamount ignores changes in the amount of whitespace
30 30 ignoreblanklines ignores changes whose lines are all blank
31 31 upgrade generates git diffs to avoid data loss
32 32 '''
33 33
34 34 defaults = {
35 35 'context': 3,
36 36 'text': False,
37 37 'showfunc': False,
38 38 'git': False,
39 39 'nodates': False,
40 40 'ignorews': False,
41 41 'ignorewsamount': False,
42 42 'ignoreblanklines': False,
43 43 'upgrade': False,
44 44 }
45 45
46 46 __slots__ = defaults.keys()
47 47
48 48 def __init__(self, **opts):
49 49 for k in self.__slots__:
50 50 v = opts.get(k)
51 51 if v is None:
52 52 v = self.defaults[k]
53 53 setattr(self, k, v)
54 54
55 55 try:
56 56 self.context = int(self.context)
57 57 except ValueError:
58 58 raise util.Abort(_('diff context lines count must be '
59 59 'an integer, not %r') % self.context)
60 60
61 61 def copy(self, **kwargs):
62 62 opts = dict((k, getattr(self, k)) for k in self.defaults)
63 63 opts.update(kwargs)
64 64 return diffopts(**opts)
65 65
66 66 defaultopts = diffopts()
67 67
68 68 def wsclean(opts, text, blank=True):
69 69 if opts.ignorews:
70 text = re.sub('[ \t\r]+', '', text)
70 text = bdiff.fixws(text, 1)
71 71 elif opts.ignorewsamount:
72 text = re.sub('[ \t\r]+', ' ', text)
73 text = text.replace(' \n', '\n')
72 text = bdiff.fixws(text, 0)
74 73 if blank and opts.ignoreblanklines:
75 74 text = re.sub('\n+', '\n', text).strip('\n')
76 75 return text
77 76
78 77 def splitblock(base1, lines1, base2, lines2, opts):
79 78 # The input lines matches except for interwoven blank lines. We
80 79 # transform it into a sequence of matching blocks and blank blocks.
81 80 lines1 = [(wsclean(opts, l) and 1 or 0) for l in lines1]
82 81 lines2 = [(wsclean(opts, l) and 1 or 0) for l in lines2]
83 82 s1, e1 = 0, len(lines1)
84 83 s2, e2 = 0, len(lines2)
85 84 while s1 < e1 or s2 < e2:
86 85 i1, i2, btype = s1, s2, '='
87 86 if (i1 >= e1 or lines1[i1] == 0
88 87 or i2 >= e2 or lines2[i2] == 0):
89 88 # Consume the block of blank lines
90 89 btype = '~'
91 90 while i1 < e1 and lines1[i1] == 0:
92 91 i1 += 1
93 92 while i2 < e2 and lines2[i2] == 0:
94 93 i2 += 1
95 94 else:
96 95 # Consume the matching lines
97 96 while i1 < e1 and lines1[i1] == 1 and lines2[i2] == 1:
98 97 i1 += 1
99 98 i2 += 1
100 99 yield [base1 + s1, base1 + i1, base2 + s2, base2 + i2], btype
101 100 s1 = i1
102 101 s2 = i2
103 102
104 103 def allblocks(text1, text2, opts=None, lines1=None, lines2=None, refine=False):
105 104 """Return (block, type) tuples, where block is an mdiff.blocks
106 105 line entry. type is '=' for blocks matching exactly one another
107 106 (bdiff blocks), '!' for non-matching blocks and '~' for blocks
108 107 matching only after having filtered blank lines. If refine is True,
109 108 then '~' blocks are refined and are only made of blank lines.
110 109 line1 and line2 are text1 and text2 split with splitnewlines() if
111 110 they are already available.
112 111 """
113 112 if opts is None:
114 113 opts = defaultopts
115 114 if opts.ignorews or opts.ignorewsamount:
116 115 text1 = wsclean(opts, text1, False)
117 116 text2 = wsclean(opts, text2, False)
118 117 diff = bdiff.blocks(text1, text2)
119 118 for i, s1 in enumerate(diff):
120 119 # The first match is special.
121 120 # we've either found a match starting at line 0 or a match later
122 121 # in the file. If it starts later, old and new below will both be
123 122 # empty and we'll continue to the next match.
124 123 if i > 0:
125 124 s = diff[i - 1]
126 125 else:
127 126 s = [0, 0, 0, 0]
128 127 s = [s[1], s1[0], s[3], s1[2]]
129 128
130 129 # bdiff sometimes gives huge matches past eof, this check eats them,
131 130 # and deals with the special first match case described above
132 131 if s[0] != s[1] or s[2] != s[3]:
133 132 type = '!'
134 133 if opts.ignoreblanklines:
135 134 if lines1 is None:
136 135 lines1 = splitnewlines(text1)
137 136 if lines2 is None:
138 137 lines2 = splitnewlines(text2)
139 138 old = wsclean(opts, "".join(lines1[s[0]:s[1]]))
140 139 new = wsclean(opts, "".join(lines2[s[2]:s[3]]))
141 140 if old == new:
142 141 type = '~'
143 142 yield s, type
144 143 yield s1, '='
145 144
146 145 def diffline(revs, a, b, opts):
147 146 parts = ['diff']
148 147 if opts.git:
149 148 parts.append('--git')
150 149 if revs and not opts.git:
151 150 parts.append(' '.join(["-r %s" % rev for rev in revs]))
152 151 if opts.git:
153 152 parts.append('a/%s' % a)
154 153 parts.append('b/%s' % b)
155 154 else:
156 155 parts.append(a)
157 156 return ' '.join(parts) + '\n'
158 157
159 158 def unidiff(a, ad, b, bd, fn1, fn2, r=None, opts=defaultopts):
160 159 def datetag(date, addtab=True):
161 160 if not opts.git and not opts.nodates:
162 161 return '\t%s\n' % date
163 162 if addtab and ' ' in fn1:
164 163 return '\t\n'
165 164 return '\n'
166 165
167 166 if not a and not b:
168 167 return ""
169 168 epoch = util.datestr((0, 0))
170 169
171 170 fn1 = util.pconvert(fn1)
172 171 fn2 = util.pconvert(fn2)
173 172
174 173 if not opts.text and (util.binary(a) or util.binary(b)):
175 174 if a and b and len(a) == len(b) and a == b:
176 175 return ""
177 176 l = ['Binary file %s has changed\n' % fn1]
178 177 elif not a:
179 178 b = splitnewlines(b)
180 179 if a is None:
181 180 l1 = '--- /dev/null%s' % datetag(epoch, False)
182 181 else:
183 182 l1 = "--- %s%s" % ("a/" + fn1, datetag(ad))
184 183 l2 = "+++ %s%s" % ("b/" + fn2, datetag(bd))
185 184 l3 = "@@ -0,0 +1,%d @@\n" % len(b)
186 185 l = [l1, l2, l3] + ["+" + e for e in b]
187 186 elif not b:
188 187 a = splitnewlines(a)
189 188 l1 = "--- %s%s" % ("a/" + fn1, datetag(ad))
190 189 if b is None:
191 190 l2 = '+++ /dev/null%s' % datetag(epoch, False)
192 191 else:
193 192 l2 = "+++ %s%s" % ("b/" + fn2, datetag(bd))
194 193 l3 = "@@ -1,%d +0,0 @@\n" % len(a)
195 194 l = [l1, l2, l3] + ["-" + e for e in a]
196 195 else:
197 196 al = splitnewlines(a)
198 197 bl = splitnewlines(b)
199 198 l = list(_unidiff(a, b, al, bl, opts=opts))
200 199 if not l:
201 200 return ""
202 201
203 202 l.insert(0, "--- a/%s%s" % (fn1, datetag(ad)))
204 203 l.insert(1, "+++ b/%s%s" % (fn2, datetag(bd)))
205 204
206 205 for ln in xrange(len(l)):
207 206 if l[ln][-1] != '\n':
208 207 l[ln] += "\n\ No newline at end of file\n"
209 208
210 209 if r:
211 210 l.insert(0, diffline(r, fn1, fn2, opts))
212 211
213 212 return "".join(l)
214 213
215 214 # creates a headerless unified diff
216 215 # t1 and t2 are the text to be diffed
217 216 # l1 and l2 are the text broken up into lines
218 217 def _unidiff(t1, t2, l1, l2, opts=defaultopts):
219 218 def contextend(l, len):
220 219 ret = l + opts.context
221 220 if ret > len:
222 221 ret = len
223 222 return ret
224 223
225 224 def contextstart(l):
226 225 ret = l - opts.context
227 226 if ret < 0:
228 227 return 0
229 228 return ret
230 229
231 230 lastfunc = [0, '']
232 231 def yieldhunk(hunk):
233 232 (astart, a2, bstart, b2, delta) = hunk
234 233 aend = contextend(a2, len(l1))
235 234 alen = aend - astart
236 235 blen = b2 - bstart + aend - a2
237 236
238 237 func = ""
239 238 if opts.showfunc:
240 239 lastpos, func = lastfunc
241 240 # walk backwards from the start of the context up to the start of
242 241 # the previous hunk context until we find a line starting with an
243 242 # alphanumeric char.
244 243 for i in xrange(astart - 1, lastpos - 1, -1):
245 244 if l1[i][0].isalnum():
246 245 func = ' ' + l1[i].rstrip()[:40]
247 246 lastfunc[1] = func
248 247 break
249 248 # by recording this hunk's starting point as the next place to
250 249 # start looking for function lines, we avoid reading any line in
251 250 # the file more than once.
252 251 lastfunc[0] = astart
253 252
254 253 # zero-length hunk ranges report their start line as one less
255 254 if alen:
256 255 astart += 1
257 256 if blen:
258 257 bstart += 1
259 258
260 259 yield "@@ -%d,%d +%d,%d @@%s\n" % (astart, alen,
261 260 bstart, blen, func)
262 261 for x in delta:
263 262 yield x
264 263 for x in xrange(a2, aend):
265 264 yield ' ' + l1[x]
266 265
267 266 # bdiff.blocks gives us the matching sequences in the files. The loop
268 267 # below finds the spaces between those matching sequences and translates
269 268 # them into diff output.
270 269 #
271 270 hunk = None
272 271 for s, stype in allblocks(t1, t2, opts, l1, l2):
273 272 if stype != '!':
274 273 continue
275 274 delta = []
276 275 a1, a2, b1, b2 = s
277 276 old = l1[a1:a2]
278 277 new = l2[b1:b2]
279 278
280 279 astart = contextstart(a1)
281 280 bstart = contextstart(b1)
282 281 prev = None
283 282 if hunk:
284 283 # join with the previous hunk if it falls inside the context
285 284 if astart < hunk[1] + opts.context + 1:
286 285 prev = hunk
287 286 astart = hunk[1]
288 287 bstart = hunk[3]
289 288 else:
290 289 for x in yieldhunk(hunk):
291 290 yield x
292 291 if prev:
293 292 # we've joined the previous hunk, record the new ending points.
294 293 hunk[1] = a2
295 294 hunk[3] = b2
296 295 delta = hunk[4]
297 296 else:
298 297 # create a new hunk
299 298 hunk = [astart, a2, bstart, b2, delta]
300 299
301 300 delta[len(delta):] = [' ' + x for x in l1[astart:a1]]
302 301 delta[len(delta):] = ['-' + x for x in old]
303 302 delta[len(delta):] = ['+' + x for x in new]
304 303
305 304 if hunk:
306 305 for x in yieldhunk(hunk):
307 306 yield x
308 307
309 308 def patchtext(bin):
310 309 pos = 0
311 310 t = []
312 311 while pos < len(bin):
313 312 p1, p2, l = struct.unpack(">lll", bin[pos:pos + 12])
314 313 pos += 12
315 314 t.append(bin[pos:pos + l])
316 315 pos += l
317 316 return "".join(t)
318 317
319 318 def patch(a, bin):
320 319 if len(a) == 0:
321 320 # skip over trivial delta header
322 321 return buffer(bin, 12)
323 322 return mpatch.patches(a, [bin])
324 323
325 324 # similar to difflib.SequenceMatcher.get_matching_blocks
326 325 def get_matching_blocks(a, b):
327 326 return [(d[0], d[2], d[1] - d[0]) for d in bdiff.blocks(a, b)]
328 327
329 328 def trivialdiffheader(length):
330 329 return struct.pack(">lll", 0, 0, length)
331 330
332 331 patches = mpatch.patches
333 332 patchedsize = mpatch.patchedsize
334 333 textdiff = bdiff.bdiff
@@ -1,80 +1,87 b''
1 1 # bdiff.py - Python implementation of bdiff.c
2 2 #
3 3 # Copyright 2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 import struct, difflib
8 import struct, difflib, re
9 9
10 10 def splitnewlines(text):
11 11 '''like str.splitlines, but only split on newlines.'''
12 12 lines = [l + '\n' for l in text.split('\n')]
13 13 if lines:
14 14 if lines[-1] == '\n':
15 15 lines.pop()
16 16 else:
17 17 lines[-1] = lines[-1][:-1]
18 18 return lines
19 19
20 20 def _normalizeblocks(a, b, blocks):
21 21 prev = None
22 22 r = []
23 23 for curr in blocks:
24 24 if prev is None:
25 25 prev = curr
26 26 continue
27 27 shift = 0
28 28
29 29 a1, b1, l1 = prev
30 30 a1end = a1 + l1
31 31 b1end = b1 + l1
32 32
33 33 a2, b2, l2 = curr
34 34 a2end = a2 + l2
35 35 b2end = b2 + l2
36 36 if a1end == a2:
37 37 while (a1end + shift < a2end and
38 38 a[a1end + shift] == b[b1end + shift]):
39 39 shift += 1
40 40 elif b1end == b2:
41 41 while (b1end + shift < b2end and
42 42 a[a1end + shift] == b[b1end + shift]):
43 43 shift += 1
44 44 r.append((a1, b1, l1 + shift))
45 45 prev = a2 + shift, b2 + shift, l2 - shift
46 46 r.append(prev)
47 47 return r
48 48
49 49 def bdiff(a, b):
50 50 a = str(a).splitlines(True)
51 51 b = str(b).splitlines(True)
52 52
53 53 if not a:
54 54 s = "".join(b)
55 55 return s and (struct.pack(">lll", 0, 0, len(s)) + s)
56 56
57 57 bin = []
58 58 p = [0]
59 59 for i in a: p.append(p[-1] + len(i))
60 60
61 61 d = difflib.SequenceMatcher(None, a, b).get_matching_blocks()
62 62 d = _normalizeblocks(a, b, d)
63 63 la = 0
64 64 lb = 0
65 65 for am, bm, size in d:
66 66 s = "".join(b[lb:bm])
67 67 if am > la or s:
68 68 bin.append(struct.pack(">lll", p[la], p[am], len(s)) + s)
69 69 la = am + size
70 70 lb = bm + size
71 71
72 72 return "".join(bin)
73 73
74 74 def blocks(a, b):
75 75 an = splitnewlines(a)
76 76 bn = splitnewlines(b)
77 77 d = difflib.SequenceMatcher(None, an, bn).get_matching_blocks()
78 78 d = _normalizeblocks(an, bn, d)
79 79 return [(i, i + n, j, j + n) for (i, j, n) in d]
80 80
81 def fixws(text, allws):
82 if allws:
83 text = re.sub('[ \t\r]+', '', text)
84 else:
85 text = re.sub('[ \t\r]+', ' ', text)
86 text = text.replace(' \n', '\n')
87 return text
@@ -1,52 +1,66 b''
1 1 import struct
2 2 from mercurial import bdiff, mpatch
3 3
4 4 def test1(a, b):
5 5 d = bdiff.bdiff(a, b)
6 6 c = a
7 7 if d:
8 8 c = mpatch.patches(a, [d])
9 9 if c != b:
10 10 print "***", repr(a), repr(b)
11 11 print "bad:"
12 12 print repr(c)[:200]
13 13 print repr(d)
14 14
15 15 def test(a, b):
16 16 print "***", repr(a), repr(b)
17 17 test1(a, b)
18 18 test1(b, a)
19 19
20 20 test("a\nc\n\n\n\n", "a\nb\n\n\n")
21 21 test("a\nb\nc\n", "a\nc\n")
22 22 test("", "")
23 23 test("a\nb\nc", "a\nb\nc")
24 24 test("a\nb\nc\nd\n", "a\nd\n")
25 25 test("a\nb\nc\nd\n", "a\nc\ne\n")
26 26 test("a\nb\nc\n", "a\nc\n")
27 27 test("a\n", "c\na\nb\n")
28 28 test("a\n", "")
29 29 test("a\n", "b\nc\n")
30 30 test("a\n", "c\na\n")
31 31 test("", "adjfkjdjksdhfksj")
32 32 test("", "ab")
33 33 test("", "abc")
34 34 test("a", "a")
35 35 test("ab", "ab")
36 36 test("abc", "abc")
37 37 test("a\n", "a\n")
38 38 test("a\nb", "a\nb")
39 39
40 40 #issue1295
41 41 def showdiff(a, b):
42 42 bin = bdiff.bdiff(a, b)
43 43 pos = 0
44 44 while pos < len(bin):
45 45 p1, p2, l = struct.unpack(">lll", bin[pos:pos + 12])
46 46 pos += 12
47 47 print p1, p2, repr(bin[pos:pos + l])
48 48 pos += l
49 49 showdiff("x\n\nx\n\nx\n\nx\n\nz\n", "x\n\nx\n\ny\n\nx\n\nx\n\nz\n")
50 50 showdiff("x\n\nx\n\nx\n\nx\n\nz\n", "x\n\nx\n\ny\n\nx\n\ny\n\nx\n\nz\n")
51 51
52 52 print "done"
53
54 def testfixws(a, b, allws):
55 c = bdiff.fixws(a, allws)
56 if c != b:
57 print "*** fixws", repr(a), repr(b), allws
58 print "got:"
59 print repr(c)
60
61 testfixws(" \ta\r b\t\n", "ab\n", 1)
62 testfixws(" \ta\r b\t\n", " a b\n", 0)
63 testfixws("", "", 1)
64 testfixws("", "", 0)
65
66 print "done"
@@ -1,23 +1,24 b''
1 1 *** 'a\nc\n\n\n\n' 'a\nb\n\n\n'
2 2 *** 'a\nb\nc\n' 'a\nc\n'
3 3 *** '' ''
4 4 *** 'a\nb\nc' 'a\nb\nc'
5 5 *** 'a\nb\nc\nd\n' 'a\nd\n'
6 6 *** 'a\nb\nc\nd\n' 'a\nc\ne\n'
7 7 *** 'a\nb\nc\n' 'a\nc\n'
8 8 *** 'a\n' 'c\na\nb\n'
9 9 *** 'a\n' ''
10 10 *** 'a\n' 'b\nc\n'
11 11 *** 'a\n' 'c\na\n'
12 12 *** '' 'adjfkjdjksdhfksj'
13 13 *** '' 'ab'
14 14 *** '' 'abc'
15 15 *** 'a' 'a'
16 16 *** 'ab' 'ab'
17 17 *** 'abc' 'abc'
18 18 *** 'a\n' 'a\n'
19 19 *** 'a\nb' 'a\nb'
20 20 6 6 'y\n\n'
21 21 6 6 'y\n\n'
22 22 9 9 'y\n\n'
23 23 done
24 done
General Comments 0
You need to be logged in to leave comments. Login now