##// END OF EJS Templates
bdiff: give slight preference to removing trailing lines...
Mads Kiilerich -
r30433:96f2f50d default
parent child Browse files
Show More
@@ -1,312 +1,312 b''
1 1 /*
2 2 bdiff.c - efficient binary diff extension for Mercurial
3 3
4 4 Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5 5
6 6 This software may be used and distributed according to the terms of
7 7 the GNU General Public License, incorporated herein by reference.
8 8
9 9 Based roughly on Python difflib
10 10 */
11 11
12 12 #include <stdlib.h>
13 13 #include <string.h>
14 14 #include <limits.h>
15 15
16 16 #include "compat.h"
17 17 #include "bitmanipulation.h"
18 18 #include "bdiff.h"
19 19
20 20 /* Hash implementation from diffutils */
21 21 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof(v) * CHAR_BIT - (n)))
22 22 #define HASH(h, c) ((c) + ROL(h ,7))
23 23
24 24 struct pos {
25 25 int pos, len;
26 26 };
27 27
28 28 int bdiff_splitlines(const char *a, ssize_t len, struct bdiff_line **lr)
29 29 {
30 30 unsigned hash;
31 31 int i;
32 32 const char *p, *b = a;
33 33 const char * const plast = a + len - 1;
34 34 struct bdiff_line *l;
35 35
36 36 /* count the lines */
37 37 i = 1; /* extra line for sentinel */
38 38 for (p = a; p < plast; p++)
39 39 if (*p == '\n')
40 40 i++;
41 41 if (p == plast)
42 42 i++;
43 43
44 44 *lr = l = (struct bdiff_line *)malloc(sizeof(struct bdiff_line) * i);
45 45 if (!l)
46 46 return -1;
47 47
48 48 /* build the line array and calculate hashes */
49 49 hash = 0;
50 50 for (p = a; p < a + len; p++) {
51 51 hash = HASH(hash, *p);
52 52
53 53 if (*p == '\n' || p == plast) {
54 54 l->hash = hash;
55 55 hash = 0;
56 56 l->len = p - b + 1;
57 57 l->l = b;
58 58 l->n = INT_MAX;
59 59 l++;
60 60 b = p + 1;
61 61 }
62 62 }
63 63
64 64 /* set up a sentinel */
65 65 l->hash = 0;
66 66 l->len = 0;
67 67 l->l = a + len;
68 68 return i - 1;
69 69 }
70 70
71 71 static inline int cmp(struct bdiff_line *a, struct bdiff_line *b)
72 72 {
73 73 return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);
74 74 }
75 75
76 76 static int equatelines(struct bdiff_line *a, int an, struct bdiff_line *b,
77 77 int bn)
78 78 {
79 79 int i, j, buckets = 1, t, scale;
80 80 struct pos *h = NULL;
81 81
82 82 /* build a hash table of the next highest power of 2 */
83 83 while (buckets < bn + 1)
84 84 buckets *= 2;
85 85
86 86 /* try to allocate a large hash table to avoid collisions */
87 87 for (scale = 4; scale; scale /= 2) {
88 88 h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
89 89 if (h)
90 90 break;
91 91 }
92 92
93 93 if (!h)
94 94 return 0;
95 95
96 96 buckets = buckets * scale - 1;
97 97
98 98 /* clear the hash table */
99 99 for (i = 0; i <= buckets; i++) {
100 100 h[i].pos = -1;
101 101 h[i].len = 0;
102 102 }
103 103
104 104 /* add lines to the hash table chains */
105 105 for (i = 0; i < bn; i++) {
106 106 /* find the equivalence class */
107 107 for (j = b[i].hash & buckets; h[j].pos != -1;
108 108 j = (j + 1) & buckets)
109 109 if (!cmp(b + i, b + h[j].pos))
110 110 break;
111 111
112 112 /* add to the head of the equivalence class */
113 113 b[i].n = h[j].pos;
114 114 b[i].e = j;
115 115 h[j].pos = i;
116 116 h[j].len++; /* keep track of popularity */
117 117 }
118 118
119 119 /* compute popularity threshold */
120 120 t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);
121 121
122 122 /* match items in a to their equivalence class in b */
123 123 for (i = 0; i < an; i++) {
124 124 /* find the equivalence class */
125 125 for (j = a[i].hash & buckets; h[j].pos != -1;
126 126 j = (j + 1) & buckets)
127 127 if (!cmp(a + i, b + h[j].pos))
128 128 break;
129 129
130 130 a[i].e = j; /* use equivalence class for quick compare */
131 131 if (h[j].len <= t)
132 132 a[i].n = h[j].pos; /* point to head of match list */
133 133 else
134 134 a[i].n = -1; /* too popular */
135 135 }
136 136
137 137 /* discard hash tables */
138 138 free(h);
139 139 return 1;
140 140 }
141 141
142 142 static int longest_match(struct bdiff_line *a, struct bdiff_line *b,
143 143 struct pos *pos,
144 144 int a1, int a2, int b1, int b2, int *omi, int *omj)
145 145 {
146 146 int mi = a1, mj = b1, mk = 0, i, j, k, half, bhalf;
147 147
148 148 /* window our search on large regions to better bound
149 149 worst-case performance. by choosing a window at the end, we
150 150 reduce skipping overhead on the b chains. */
151 151 if (a2 - a1 > 30000)
152 152 a1 = a2 - 30000;
153 153
154 154 half = (a1 + a2 - 1) / 2;
155 155 bhalf = (b1 + b2 - 1) / 2;
156 156
157 157 for (i = a1; i < a2; i++) {
158 158 /* skip all lines in b after the current block */
159 159 for (j = a[i].n; j >= b2; j = b[j].n)
160 160 ;
161 161
162 162 /* loop through all lines match a[i] in b */
163 163 for (; j >= b1; j = b[j].n) {
164 164 /* does this extend an earlier match? */
165 165 for (k = 1; j - k >= b1 && i - k >= a1; k++) {
166 166 /* reached an earlier match? */
167 167 if (pos[j - k].pos == i - k) {
168 168 k += pos[j - k].len;
169 169 break;
170 170 }
171 171 /* previous line mismatch? */
172 172 if (a[i - k].e != b[j - k].e)
173 173 break;
174 174 }
175 175
176 176 pos[j].pos = i;
177 177 pos[j].len = k;
178 178
179 179 /* best match so far? we prefer matches closer
180 180 to the middle to balance recursion */
181 181 if (k > mk) {
182 182 /* a longer match */
183 183 mi = i;
184 184 mj = j;
185 185 mk = k;
186 186 } else if (k == mk) {
187 if (i > mi && i <= half) {
187 if (i > mi && i <= half && j > b1) {
188 188 /* same match but closer to half */
189 189 mi = i;
190 190 mj = j;
191 191 } else if (i == mi && (mj > bhalf || i == a1)) {
192 192 /* same i but best earlier j */
193 193 mj = j;
194 194 }
195 195 }
196 196 }
197 197 }
198 198
199 199 if (mk) {
200 200 mi = mi - mk + 1;
201 201 mj = mj - mk + 1;
202 202 }
203 203
204 204 /* expand match to include subsequent popular lines */
205 205 while (mi + mk < a2 && mj + mk < b2 &&
206 206 a[mi + mk].e == b[mj + mk].e)
207 207 mk++;
208 208
209 209 *omi = mi;
210 210 *omj = mj;
211 211
212 212 return mk;
213 213 }
214 214
215 215 static struct bdiff_hunk *recurse(struct bdiff_line *a, struct bdiff_line *b,
216 216 struct pos *pos,
217 217 int a1, int a2, int b1, int b2, struct bdiff_hunk *l)
218 218 {
219 219 int i, j, k;
220 220
221 221 while (1) {
222 222 /* find the longest match in this chunk */
223 223 k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);
224 224 if (!k)
225 225 return l;
226 226
227 227 /* and recurse on the remaining chunks on either side */
228 228 l = recurse(a, b, pos, a1, i, b1, j, l);
229 229 if (!l)
230 230 return NULL;
231 231
232 232 l->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk));
233 233 if (!l->next)
234 234 return NULL;
235 235
236 236 l = l->next;
237 237 l->a1 = i;
238 238 l->a2 = i + k;
239 239 l->b1 = j;
240 240 l->b2 = j + k;
241 241 l->next = NULL;
242 242
243 243 /* tail-recursion didn't happen, so do equivalent iteration */
244 244 a1 = i + k;
245 245 b1 = j + k;
246 246 }
247 247 }
248 248
249 249 int bdiff_diff(struct bdiff_line *a, int an, struct bdiff_line *b,
250 250 int bn, struct bdiff_hunk *base)
251 251 {
252 252 struct bdiff_hunk *curr;
253 253 struct pos *pos;
254 254 int t, count = 0;
255 255
256 256 /* allocate and fill arrays */
257 257 t = equatelines(a, an, b, bn);
258 258 pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));
259 259
260 260 if (pos && t) {
261 261 /* generate the matching block list */
262 262
263 263 curr = recurse(a, b, pos, 0, an, 0, bn, base);
264 264 if (!curr)
265 265 return -1;
266 266
267 267 /* sentinel end hunk */
268 268 curr->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk));
269 269 if (!curr->next)
270 270 return -1;
271 271 curr = curr->next;
272 272 curr->a1 = curr->a2 = an;
273 273 curr->b1 = curr->b2 = bn;
274 274 curr->next = NULL;
275 275 }
276 276
277 277 free(pos);
278 278
279 279 /* normalize the hunk list, try to push each hunk towards the end */
280 280 for (curr = base->next; curr; curr = curr->next) {
281 281 struct bdiff_hunk *next = curr->next;
282 282
283 283 if (!next)
284 284 break;
285 285
286 286 if (curr->a2 == next->a1 || curr->b2 == next->b1)
287 287 while (curr->a2 < an && curr->b2 < bn
288 288 && next->a1 < next->a2
289 289 && next->b1 < next->b2
290 290 && !cmp(a + curr->a2, b + curr->b2)) {
291 291 curr->a2++;
292 292 next->a1++;
293 293 curr->b2++;
294 294 next->b1++;
295 295 }
296 296 }
297 297
298 298 for (curr = base->next; curr; curr = curr->next)
299 299 count++;
300 300 return count;
301 301 }
302 302
303 303 void bdiff_freehunks(struct bdiff_hunk *l)
304 304 {
305 305 struct bdiff_hunk *n;
306 306 for (; l; l = n) {
307 307 n = l->next;
308 308 free(l);
309 309 }
310 310 }
311 311
312 312
@@ -1,94 +1,94 b''
1 1 from __future__ import absolute_import, print_function
2 2 import struct
3 3 from mercurial import (
4 4 bdiff,
5 5 mpatch,
6 6 )
7 7
8 8 def test1(a, b):
9 9 d = bdiff.bdiff(a, b)
10 10 c = a
11 11 if d:
12 12 c = mpatch.patches(a, [d])
13 13 if c != b:
14 14 print("bad diff+patch result from\n %r to\n %r:" % (a, b))
15 15 print("bdiff: %r" % d)
16 16 print("patched: %r" % c[:200])
17 17
18 18 def test(a, b):
19 19 print("test", repr(a), repr(b))
20 20 test1(a, b)
21 21 test1(b, a)
22 22
23 23 test("a\nc\n\n\n\n", "a\nb\n\n\n")
24 24 test("a\nb\nc\n", "a\nc\n")
25 25 test("", "")
26 26 test("a\nb\nc", "a\nb\nc")
27 27 test("a\nb\nc\nd\n", "a\nd\n")
28 28 test("a\nb\nc\nd\n", "a\nc\ne\n")
29 29 test("a\nb\nc\n", "a\nc\n")
30 30 test("a\n", "c\na\nb\n")
31 31 test("a\n", "")
32 32 test("a\n", "b\nc\n")
33 33 test("a\n", "c\na\n")
34 34 test("", "adjfkjdjksdhfksj")
35 35 test("", "ab")
36 36 test("", "abc")
37 37 test("a", "a")
38 38 test("ab", "ab")
39 39 test("abc", "abc")
40 40 test("a\n", "a\n")
41 41 test("a\nb", "a\nb")
42 42
43 43 #issue1295
44 44 def showdiff(a, b):
45 45 print('showdiff(\n %r,\n %r):' % (a, b))
46 46 bin = bdiff.bdiff(a, b)
47 47 pos = 0
48 48 q = 0
49 49 while pos < len(bin):
50 50 p1, p2, l = struct.unpack(">lll", bin[pos:pos + 12])
51 51 pos += 12
52 52 if p1:
53 53 print('', repr(a[q:p1]))
54 54 print('', p1, p2, repr(a[p1:p2]), '->', repr(bin[pos:pos + l]))
55 55 pos += l
56 56 q = p2
57 57 if q < len(a):
58 58 print('', repr(a[q:]))
59 59
60 60 showdiff("x\n\nx\n\nx\n\nx\n\nz\n", "x\n\nx\n\ny\n\nx\n\nx\n\nz\n")
61 61 showdiff("x\n\nx\n\nx\n\nx\n\nz\n", "x\n\nx\n\ny\n\nx\n\ny\n\nx\n\nz\n")
62 62 # we should pick up abbbc. rather than bc.de as the longest match
63 63 showdiff("a\nb\nb\nb\nc\n.\nd\ne\n.\nf\n",
64 64 "a\nb\nb\na\nb\nb\nb\nc\n.\nb\nc\n.\nd\ne\nf\n")
65 65
66 66 print("done")
67 67
68 68 def testfixws(a, b, allws):
69 69 c = bdiff.fixws(a, allws)
70 70 if c != b:
71 71 print("*** fixws", repr(a), repr(b), allws)
72 72 print("got:")
73 73 print(repr(c))
74 74
75 75 testfixws(" \ta\r b\t\n", "ab\n", 1)
76 76 testfixws(" \ta\r b\t\n", " a b\n", 0)
77 77 testfixws("", "", 1)
78 78 testfixws("", "", 0)
79 79
80 80 print("done")
81 81
82 82 print("Nice diff for a trivial change:")
83 83 showdiff(
84 84 ''.join('<%s\n-\n' % i for i in range(5)),
85 85 ''.join('>%s\n-\n' % i for i in range(5)))
86 86
87 87 print("Diff 1 to 3 lines - preference for appending:")
88 88 showdiff('a\n', 'a\n' * 3)
89 89 print("Diff 1 to 5 lines - preference for appending:")
90 90 showdiff('a\n', 'a\n' * 5)
91 print("Diff 3 to 1 lines - preference for balanced recursion:")
91 print("Diff 3 to 1 lines - preference for removing trailing lines:")
92 92 showdiff('a\n' * 3, 'a\n')
93 print("Diff 5 to 1 lines - preference for balanced recursion:")
93 print("Diff 5 to 1 lines - preference for removing trailing lines:")
94 94 showdiff('a\n' * 5, 'a\n')
@@ -1,84 +1,82 b''
1 1 test 'a\nc\n\n\n\n' 'a\nb\n\n\n'
2 2 test 'a\nb\nc\n' 'a\nc\n'
3 3 test '' ''
4 4 test 'a\nb\nc' 'a\nb\nc'
5 5 test 'a\nb\nc\nd\n' 'a\nd\n'
6 6 test 'a\nb\nc\nd\n' 'a\nc\ne\n'
7 7 test 'a\nb\nc\n' 'a\nc\n'
8 8 test 'a\n' 'c\na\nb\n'
9 9 test 'a\n' ''
10 10 test 'a\n' 'b\nc\n'
11 11 test 'a\n' 'c\na\n'
12 12 test '' 'adjfkjdjksdhfksj'
13 13 test '' 'ab'
14 14 test '' 'abc'
15 15 test 'a' 'a'
16 16 test 'ab' 'ab'
17 17 test 'abc' 'abc'
18 18 test 'a\n' 'a\n'
19 19 test 'a\nb' 'a\nb'
20 20 showdiff(
21 21 'x\n\nx\n\nx\n\nx\n\nz\n',
22 22 'x\n\nx\n\ny\n\nx\n\nx\n\nz\n'):
23 23 'x\n\nx\n\n'
24 24 6 6 '' -> 'y\n\n'
25 25 'x\n\nx\n\nz\n'
26 26 showdiff(
27 27 'x\n\nx\n\nx\n\nx\n\nz\n',
28 28 'x\n\nx\n\ny\n\nx\n\ny\n\nx\n\nz\n'):
29 29 'x\n\nx\n\n'
30 30 6 6 '' -> 'y\n\n'
31 31 'x\n\n'
32 32 9 9 '' -> 'y\n\n'
33 33 'x\n\nz\n'
34 34 showdiff(
35 35 'a\nb\nb\nb\nc\n.\nd\ne\n.\nf\n',
36 36 'a\nb\nb\na\nb\nb\nb\nc\n.\nb\nc\n.\nd\ne\nf\n'):
37 37 0 0 '' -> 'a\nb\nb\n'
38 38 'a\nb\nb\nb\nc\n.\n'
39 39 12 12 '' -> 'b\nc\n.\n'
40 40 'd\ne\n'
41 41 16 18 '.\n' -> ''
42 42 'f\n'
43 43 done
44 44 done
45 45 Nice diff for a trivial change:
46 46 showdiff(
47 47 '<0\n-\n<1\n-\n<2\n-\n<3\n-\n<4\n-\n',
48 48 '>0\n-\n>1\n-\n>2\n-\n>3\n-\n>4\n-\n'):
49 49 0 3 '<0\n' -> '>0\n'
50 50 '-\n'
51 51 5 8 '<1\n' -> '>1\n'
52 52 '-\n'
53 53 10 13 '<2\n' -> '>2\n'
54 54 '-\n'
55 55 15 18 '<3\n' -> '>3\n'
56 56 '-\n'
57 57 20 23 '<4\n' -> '>4\n'
58 58 '-\n'
59 59 Diff 1 to 3 lines - preference for appending:
60 60 showdiff(
61 61 'a\n',
62 62 'a\na\na\n'):
63 63 'a\n'
64 64 2 2 '' -> 'a\na\n'
65 65 Diff 1 to 5 lines - preference for appending:
66 66 showdiff(
67 67 'a\n',
68 68 'a\na\na\na\na\n'):
69 69 'a\n'
70 70 2 2 '' -> 'a\na\na\na\n'
71 Diff 3 to 1 lines - preference for balanced recursion:
71 Diff 3 to 1 lines - preference for removing trailing lines:
72 72 showdiff(
73 73 'a\na\na\n',
74 74 'a\n'):
75 0 2 'a\n' -> ''
76 75 'a\n'
77 4 6 'a\n' -> ''
78 Diff 5 to 1 lines - preference for balanced recursion:
76 2 6 'a\na\n' -> ''
77 Diff 5 to 1 lines - preference for removing trailing lines:
79 78 showdiff(
80 79 'a\na\na\na\na\n',
81 80 'a\n'):
82 0 4 'a\na\n' -> ''
83 81 'a\n'
84 6 10 'a\na\n' -> ''
82 2 10 'a\na\na\na\n' -> ''
General Comments 0
You need to be logged in to leave comments. Login now