##// END OF EJS Templates
bdiff: early pruning of common prefix before doing expensive computations...
Mads Kiilerich -
r30561:7c0c722d default
parent child Browse files
Show More
@@ -1,204 +1,213 b''
1 /*
1 /*
2 bdiff.c - efficient binary diff extension for Mercurial
2 bdiff.c - efficient binary diff extension for Mercurial
3
3
4 Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
4 Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5
5
6 This software may be used and distributed according to the terms of
6 This software may be used and distributed according to the terms of
7 the GNU General Public License, incorporated herein by reference.
7 the GNU General Public License, incorporated herein by reference.
8
8
9 Based roughly on Python difflib
9 Based roughly on Python difflib
10 */
10 */
11
11
12 #define PY_SSIZE_T_CLEAN
12 #define PY_SSIZE_T_CLEAN
13 #include <Python.h>
13 #include <Python.h>
14 #include <stdlib.h>
14 #include <stdlib.h>
15 #include <string.h>
15 #include <string.h>
16 #include <limits.h>
16 #include <limits.h>
17
17
18 #include "bdiff.h"
18 #include "bdiff.h"
19 #include "bitmanipulation.h"
19 #include "bitmanipulation.h"
20 #include "util.h"
20 #include "util.h"
21
21
22
22
23 static PyObject *blocks(PyObject *self, PyObject *args)
23 static PyObject *blocks(PyObject *self, PyObject *args)
24 {
24 {
25 PyObject *sa, *sb, *rl = NULL, *m;
25 PyObject *sa, *sb, *rl = NULL, *m;
26 struct bdiff_line *a, *b;
26 struct bdiff_line *a, *b;
27 struct bdiff_hunk l, *h;
27 struct bdiff_hunk l, *h;
28 int an, bn, count, pos = 0;
28 int an, bn, count, pos = 0;
29
29
30 l.next = NULL;
30 l.next = NULL;
31
31
32 if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
32 if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
33 return NULL;
33 return NULL;
34
34
35 an = bdiff_splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
35 an = bdiff_splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
36 bn = bdiff_splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
36 bn = bdiff_splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
37
37
38 if (!a || !b)
38 if (!a || !b)
39 goto nomem;
39 goto nomem;
40
40
41 count = bdiff_diff(a, an, b, bn, &l);
41 count = bdiff_diff(a, an, b, bn, &l);
42 if (count < 0)
42 if (count < 0)
43 goto nomem;
43 goto nomem;
44
44
45 rl = PyList_New(count);
45 rl = PyList_New(count);
46 if (!rl)
46 if (!rl)
47 goto nomem;
47 goto nomem;
48
48
49 for (h = l.next; h; h = h->next) {
49 for (h = l.next; h; h = h->next) {
50 m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
50 m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
51 PyList_SetItem(rl, pos, m);
51 PyList_SetItem(rl, pos, m);
52 pos++;
52 pos++;
53 }
53 }
54
54
55 nomem:
55 nomem:
56 free(a);
56 free(a);
57 free(b);
57 free(b);
58 bdiff_freehunks(l.next);
58 bdiff_freehunks(l.next);
59 return rl ? rl : PyErr_NoMemory();
59 return rl ? rl : PyErr_NoMemory();
60 }
60 }
61
61
62 static PyObject *bdiff(PyObject *self, PyObject *args)
62 static PyObject *bdiff(PyObject *self, PyObject *args)
63 {
63 {
64 char *sa, *sb, *rb;
64 char *sa, *sb, *rb, *ia, *ib;
65 PyObject *result = NULL;
65 PyObject *result = NULL;
66 struct bdiff_line *al, *bl;
66 struct bdiff_line *al, *bl;
67 struct bdiff_hunk l, *h;
67 struct bdiff_hunk l, *h;
68 int an, bn, count;
68 int an, bn, count;
69 Py_ssize_t len = 0, la, lb;
69 Py_ssize_t len = 0, la, lb, li = 0, lcommon = 0, lmax;
70 PyThreadState *_save;
70 PyThreadState *_save;
71
71
72 l.next = NULL;
72 l.next = NULL;
73
73
74 if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
74 if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
75 return NULL;
75 return NULL;
76
76
77 if (la > UINT_MAX || lb > UINT_MAX) {
77 if (la > UINT_MAX || lb > UINT_MAX) {
78 PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
78 PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
79 return NULL;
79 return NULL;
80 }
80 }
81
81
82 _save = PyEval_SaveThread();
82 _save = PyEval_SaveThread();
83 an = bdiff_splitlines(sa, la, &al);
83
84 bn = bdiff_splitlines(sb, lb, &bl);
84 lmax = la > lb ? lb : la;
85 for (ia = sa, ib = sb;
86 li < lmax && *ia == *ib;
87 ++li, ++ia, ++ib)
88 if (*ia == '\n')
89 lcommon = li + 1;
90 /* we can almost add: if (li == lmax) lcommon = li; */
91
92 an = bdiff_splitlines(sa + lcommon, la - lcommon, &al);
93 bn = bdiff_splitlines(sb + lcommon, lb - lcommon, &bl);
85 if (!al || !bl)
94 if (!al || !bl)
86 goto nomem;
95 goto nomem;
87
96
88 count = bdiff_diff(al, an, bl, bn, &l);
97 count = bdiff_diff(al, an, bl, bn, &l);
89 if (count < 0)
98 if (count < 0)
90 goto nomem;
99 goto nomem;
91
100
92 /* calculate length of output */
101 /* calculate length of output */
93 la = lb = 0;
102 la = lb = 0;
94 for (h = l.next; h; h = h->next) {
103 for (h = l.next; h; h = h->next) {
95 if (h->a1 != la || h->b1 != lb)
104 if (h->a1 != la || h->b1 != lb)
96 len += 12 + bl[h->b1].l - bl[lb].l;
105 len += 12 + bl[h->b1].l - bl[lb].l;
97 la = h->a2;
106 la = h->a2;
98 lb = h->b2;
107 lb = h->b2;
99 }
108 }
100 PyEval_RestoreThread(_save);
109 PyEval_RestoreThread(_save);
101 _save = NULL;
110 _save = NULL;
102
111
103 result = PyBytes_FromStringAndSize(NULL, len);
112 result = PyBytes_FromStringAndSize(NULL, len);
104
113
105 if (!result)
114 if (!result)
106 goto nomem;
115 goto nomem;
107
116
108 /* build binary patch */
117 /* build binary patch */
109 rb = PyBytes_AsString(result);
118 rb = PyBytes_AsString(result);
110 la = lb = 0;
119 la = lb = 0;
111
120
112 for (h = l.next; h; h = h->next) {
121 for (h = l.next; h; h = h->next) {
113 if (h->a1 != la || h->b1 != lb) {
122 if (h->a1 != la || h->b1 != lb) {
114 len = bl[h->b1].l - bl[lb].l;
123 len = bl[h->b1].l - bl[lb].l;
115 putbe32((uint32_t)(al[la].l - al->l), rb);
124 putbe32((uint32_t)(al[la].l + lcommon - al->l), rb);
116 putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);
125 putbe32((uint32_t)(al[h->a1].l + lcommon - al->l), rb + 4);
117 putbe32((uint32_t)len, rb + 8);
126 putbe32((uint32_t)len, rb + 8);
118 memcpy(rb + 12, bl[lb].l, len);
127 memcpy(rb + 12, bl[lb].l, len);
119 rb += 12 + len;
128 rb += 12 + len;
120 }
129 }
121 la = h->a2;
130 la = h->a2;
122 lb = h->b2;
131 lb = h->b2;
123 }
132 }
124
133
125 nomem:
134 nomem:
126 if (_save)
135 if (_save)
127 PyEval_RestoreThread(_save);
136 PyEval_RestoreThread(_save);
128 free(al);
137 free(al);
129 free(bl);
138 free(bl);
130 bdiff_freehunks(l.next);
139 bdiff_freehunks(l.next);
131 return result ? result : PyErr_NoMemory();
140 return result ? result : PyErr_NoMemory();
132 }
141 }
133
142
134 /*
143 /*
135 * If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,
144 * If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,
136 * reduce whitespace sequences to a single space and trim remaining whitespace
145 * reduce whitespace sequences to a single space and trim remaining whitespace
137 * from end of lines.
146 * from end of lines.
138 */
147 */
139 static PyObject *fixws(PyObject *self, PyObject *args)
148 static PyObject *fixws(PyObject *self, PyObject *args)
140 {
149 {
141 PyObject *s, *result = NULL;
150 PyObject *s, *result = NULL;
142 char allws, c;
151 char allws, c;
143 const char *r;
152 const char *r;
144 Py_ssize_t i, rlen, wlen = 0;
153 Py_ssize_t i, rlen, wlen = 0;
145 char *w;
154 char *w;
146
155
147 if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))
156 if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))
148 return NULL;
157 return NULL;
149 r = PyBytes_AsString(s);
158 r = PyBytes_AsString(s);
150 rlen = PyBytes_Size(s);
159 rlen = PyBytes_Size(s);
151
160
152 w = (char *)malloc(rlen ? rlen : 1);
161 w = (char *)malloc(rlen ? rlen : 1);
153 if (!w)
162 if (!w)
154 goto nomem;
163 goto nomem;
155
164
156 for (i = 0; i != rlen; i++) {
165 for (i = 0; i != rlen; i++) {
157 c = r[i];
166 c = r[i];
158 if (c == ' ' || c == '\t' || c == '\r') {
167 if (c == ' ' || c == '\t' || c == '\r') {
159 if (!allws && (wlen == 0 || w[wlen - 1] != ' '))
168 if (!allws && (wlen == 0 || w[wlen - 1] != ' '))
160 w[wlen++] = ' ';
169 w[wlen++] = ' ';
161 } else if (c == '\n' && !allws
170 } else if (c == '\n' && !allws
162 && wlen > 0 && w[wlen - 1] == ' ') {
171 && wlen > 0 && w[wlen - 1] == ' ') {
163 w[wlen - 1] = '\n';
172 w[wlen - 1] = '\n';
164 } else {
173 } else {
165 w[wlen++] = c;
174 w[wlen++] = c;
166 }
175 }
167 }
176 }
168
177
169 result = PyBytes_FromStringAndSize(w, wlen);
178 result = PyBytes_FromStringAndSize(w, wlen);
170
179
171 nomem:
180 nomem:
172 free(w);
181 free(w);
173 return result ? result : PyErr_NoMemory();
182 return result ? result : PyErr_NoMemory();
174 }
183 }
175
184
176
185
177 static char mdiff_doc[] = "Efficient binary diff.";
186 static char mdiff_doc[] = "Efficient binary diff.";
178
187
179 static PyMethodDef methods[] = {
188 static PyMethodDef methods[] = {
180 {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
189 {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
181 {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
190 {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
182 {"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},
191 {"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},
183 {NULL, NULL}
192 {NULL, NULL}
184 };
193 };
185
194
186 #ifdef IS_PY3K
195 #ifdef IS_PY3K
187 static struct PyModuleDef bdiff_module = {
196 static struct PyModuleDef bdiff_module = {
188 PyModuleDef_HEAD_INIT,
197 PyModuleDef_HEAD_INIT,
189 "bdiff",
198 "bdiff",
190 mdiff_doc,
199 mdiff_doc,
191 -1,
200 -1,
192 methods
201 methods
193 };
202 };
194
203
195 PyMODINIT_FUNC PyInit_bdiff(void)
204 PyMODINIT_FUNC PyInit_bdiff(void)
196 {
205 {
197 return PyModule_Create(&bdiff_module);
206 return PyModule_Create(&bdiff_module);
198 }
207 }
199 #else
208 #else
200 PyMODINIT_FUNC initbdiff(void)
209 PyMODINIT_FUNC initbdiff(void)
201 {
210 {
202 Py_InitModule3("bdiff", methods, mdiff_doc);
211 Py_InitModule3("bdiff", methods, mdiff_doc);
203 }
212 }
204 #endif
213 #endif
@@ -1,82 +1,81 b''
1 test 'a\nc\n\n\n\n' 'a\nb\n\n\n'
1 test 'a\nc\n\n\n\n' 'a\nb\n\n\n'
2 test 'a\nb\nc\n' 'a\nc\n'
2 test 'a\nb\nc\n' 'a\nc\n'
3 test '' ''
3 test '' ''
4 test 'a\nb\nc' 'a\nb\nc'
4 test 'a\nb\nc' 'a\nb\nc'
5 test 'a\nb\nc\nd\n' 'a\nd\n'
5 test 'a\nb\nc\nd\n' 'a\nd\n'
6 test 'a\nb\nc\nd\n' 'a\nc\ne\n'
6 test 'a\nb\nc\nd\n' 'a\nc\ne\n'
7 test 'a\nb\nc\n' 'a\nc\n'
7 test 'a\nb\nc\n' 'a\nc\n'
8 test 'a\n' 'c\na\nb\n'
8 test 'a\n' 'c\na\nb\n'
9 test 'a\n' ''
9 test 'a\n' ''
10 test 'a\n' 'b\nc\n'
10 test 'a\n' 'b\nc\n'
11 test 'a\n' 'c\na\n'
11 test 'a\n' 'c\na\n'
12 test '' 'adjfkjdjksdhfksj'
12 test '' 'adjfkjdjksdhfksj'
13 test '' 'ab'
13 test '' 'ab'
14 test '' 'abc'
14 test '' 'abc'
15 test 'a' 'a'
15 test 'a' 'a'
16 test 'ab' 'ab'
16 test 'ab' 'ab'
17 test 'abc' 'abc'
17 test 'abc' 'abc'
18 test 'a\n' 'a\n'
18 test 'a\n' 'a\n'
19 test 'a\nb' 'a\nb'
19 test 'a\nb' 'a\nb'
20 showdiff(
20 showdiff(
21 'x\n\nx\n\nx\n\nx\n\nz\n',
21 'x\n\nx\n\nx\n\nx\n\nz\n',
22 'x\n\nx\n\ny\n\nx\n\nx\n\nz\n'):
22 'x\n\nx\n\ny\n\nx\n\nx\n\nz\n'):
23 'x\n\nx\n\n'
23 'x\n\nx\n\n'
24 6 6 '' -> 'y\n\n'
24 6 6 '' -> 'y\n\n'
25 'x\n\nx\n\nz\n'
25 'x\n\nx\n\nz\n'
26 showdiff(
26 showdiff(
27 'x\n\nx\n\nx\n\nx\n\nz\n',
27 'x\n\nx\n\nx\n\nx\n\nz\n',
28 'x\n\nx\n\ny\n\nx\n\ny\n\nx\n\nz\n'):
28 'x\n\nx\n\ny\n\nx\n\ny\n\nx\n\nz\n'):
29 'x\n\nx\n\n'
29 'x\n\nx\n\n'
30 6 6 '' -> 'y\n\n'
30 6 6 '' -> 'y\n\n'
31 'x\n\n'
31 'x\n\n'
32 9 9 '' -> 'y\n\n'
32 9 9 '' -> 'y\n\n'
33 'x\n\nz\n'
33 'x\n\nz\n'
34 showdiff(
34 showdiff(
35 'a\nb\nb\nb\nc\n.\nd\ne\n.\nf\n',
35 'a\nb\nb\nb\nc\n.\nd\ne\n.\nf\n',
36 'a\nb\nb\na\nb\nb\nb\nc\n.\nb\nc\n.\nd\ne\nf\n'):
36 'a\nb\nb\na\nb\nb\nb\nc\n.\nb\nc\n.\nd\ne\nf\n'):
37 0 0 '' -> 'a\nb\nb\n'
37 'a\nb\nb\n'
38 'a\nb\nb\nb\nc\n.\n'
38 6 6 '' -> 'a\nb\nb\nb\nc\n.\n'
39 12 12 '' -> 'b\nc\n.\n'
39 'b\nc\n.\nd\ne\n'
40 'd\ne\n'
41 16 18 '.\n' -> ''
40 16 18 '.\n' -> ''
42 'f\n'
41 'f\n'
43 done
42 done
44 done
43 done
45 Nice diff for a trivial change:
44 Nice diff for a trivial change:
46 showdiff(
45 showdiff(
47 '<0\n-\n<1\n-\n<2\n-\n<3\n-\n<4\n-\n',
46 '<0\n-\n<1\n-\n<2\n-\n<3\n-\n<4\n-\n',
48 '>0\n-\n>1\n-\n>2\n-\n>3\n-\n>4\n-\n'):
47 '>0\n-\n>1\n-\n>2\n-\n>3\n-\n>4\n-\n'):
49 0 3 '<0\n' -> '>0\n'
48 0 3 '<0\n' -> '>0\n'
50 '-\n'
49 '-\n'
51 5 8 '<1\n' -> '>1\n'
50 5 8 '<1\n' -> '>1\n'
52 '-\n'
51 '-\n'
53 10 13 '<2\n' -> '>2\n'
52 10 13 '<2\n' -> '>2\n'
54 '-\n'
53 '-\n'
55 15 18 '<3\n' -> '>3\n'
54 15 18 '<3\n' -> '>3\n'
56 '-\n'
55 '-\n'
57 20 23 '<4\n' -> '>4\n'
56 20 23 '<4\n' -> '>4\n'
58 '-\n'
57 '-\n'
59 Diff 1 to 3 lines - preference for appending:
58 Diff 1 to 3 lines - preference for appending:
60 showdiff(
59 showdiff(
61 'a\n',
60 'a\n',
62 'a\na\na\n'):
61 'a\na\na\n'):
63 'a\n'
62 'a\n'
64 2 2 '' -> 'a\na\n'
63 2 2 '' -> 'a\na\n'
65 Diff 1 to 5 lines - preference for appending:
64 Diff 1 to 5 lines - preference for appending:
66 showdiff(
65 showdiff(
67 'a\n',
66 'a\n',
68 'a\na\na\na\na\n'):
67 'a\na\na\na\na\n'):
69 'a\n'
68 'a\n'
70 2 2 '' -> 'a\na\na\na\n'
69 2 2 '' -> 'a\na\na\na\n'
71 Diff 3 to 1 lines - preference for removing trailing lines:
70 Diff 3 to 1 lines - preference for removing trailing lines:
72 showdiff(
71 showdiff(
73 'a\na\na\n',
72 'a\na\na\n',
74 'a\n'):
73 'a\n'):
75 'a\n'
74 'a\n'
76 2 6 'a\na\n' -> ''
75 2 6 'a\na\n' -> ''
77 Diff 5 to 1 lines - preference for removing trailing lines:
76 Diff 5 to 1 lines - preference for removing trailing lines:
78 showdiff(
77 showdiff(
79 'a\na\na\na\na\n',
78 'a\na\na\na\na\n',
80 'a\n'):
79 'a\n'):
81 'a\n'
80 'a\n'
82 2 10 'a\na\na\na\n' -> ''
81 2 10 'a\na\na\na\n' -> ''
General Comments 0
You need to be logged in to leave comments. Login now