##// END OF EJS Templates
bdiff: early pruning of common prefix before doing expensive computations...
Mads Kiilerich -
r30561:7c0c722d default
parent child Browse files
Show More
@@ -1,204 +1,213 b''
1 1 /*
2 2 bdiff.c - efficient binary diff extension for Mercurial
3 3
4 4 Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
5 5
6 6 This software may be used and distributed according to the terms of
7 7 the GNU General Public License, incorporated herein by reference.
8 8
9 9 Based roughly on Python difflib
10 10 */
11 11
12 12 #define PY_SSIZE_T_CLEAN
13 13 #include <Python.h>
14 14 #include <stdlib.h>
15 15 #include <string.h>
16 16 #include <limits.h>
17 17
18 18 #include "bdiff.h"
19 19 #include "bitmanipulation.h"
20 20 #include "util.h"
21 21
22 22
23 23 static PyObject *blocks(PyObject *self, PyObject *args)
24 24 {
25 25 PyObject *sa, *sb, *rl = NULL, *m;
26 26 struct bdiff_line *a, *b;
27 27 struct bdiff_hunk l, *h;
28 28 int an, bn, count, pos = 0;
29 29
30 30 l.next = NULL;
31 31
32 32 if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
33 33 return NULL;
34 34
35 35 an = bdiff_splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
36 36 bn = bdiff_splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
37 37
38 38 if (!a || !b)
39 39 goto nomem;
40 40
41 41 count = bdiff_diff(a, an, b, bn, &l);
42 42 if (count < 0)
43 43 goto nomem;
44 44
45 45 rl = PyList_New(count);
46 46 if (!rl)
47 47 goto nomem;
48 48
49 49 for (h = l.next; h; h = h->next) {
50 50 m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
51 51 PyList_SetItem(rl, pos, m);
52 52 pos++;
53 53 }
54 54
55 55 nomem:
56 56 free(a);
57 57 free(b);
58 58 bdiff_freehunks(l.next);
59 59 return rl ? rl : PyErr_NoMemory();
60 60 }
61 61
62 62 static PyObject *bdiff(PyObject *self, PyObject *args)
63 63 {
64 char *sa, *sb, *rb;
64 char *sa, *sb, *rb, *ia, *ib;
65 65 PyObject *result = NULL;
66 66 struct bdiff_line *al, *bl;
67 67 struct bdiff_hunk l, *h;
68 68 int an, bn, count;
69 Py_ssize_t len = 0, la, lb;
69 Py_ssize_t len = 0, la, lb, li = 0, lcommon = 0, lmax;
70 70 PyThreadState *_save;
71 71
72 72 l.next = NULL;
73 73
74 74 if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
75 75 return NULL;
76 76
77 77 if (la > UINT_MAX || lb > UINT_MAX) {
78 78 PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
79 79 return NULL;
80 80 }
81 81
82 82 _save = PyEval_SaveThread();
83 an = bdiff_splitlines(sa, la, &al);
84 bn = bdiff_splitlines(sb, lb, &bl);
83
84 lmax = la > lb ? lb : la;
85 for (ia = sa, ib = sb;
86 li < lmax && *ia == *ib;
87 ++li, ++ia, ++ib)
88 if (*ia == '\n')
89 lcommon = li + 1;
90 /* we can almost add: if (li == lmax) lcommon = li; */
91
92 an = bdiff_splitlines(sa + lcommon, la - lcommon, &al);
93 bn = bdiff_splitlines(sb + lcommon, lb - lcommon, &bl);
85 94 if (!al || !bl)
86 95 goto nomem;
87 96
88 97 count = bdiff_diff(al, an, bl, bn, &l);
89 98 if (count < 0)
90 99 goto nomem;
91 100
92 101 /* calculate length of output */
93 102 la = lb = 0;
94 103 for (h = l.next; h; h = h->next) {
95 104 if (h->a1 != la || h->b1 != lb)
96 105 len += 12 + bl[h->b1].l - bl[lb].l;
97 106 la = h->a2;
98 107 lb = h->b2;
99 108 }
100 109 PyEval_RestoreThread(_save);
101 110 _save = NULL;
102 111
103 112 result = PyBytes_FromStringAndSize(NULL, len);
104 113
105 114 if (!result)
106 115 goto nomem;
107 116
108 117 /* build binary patch */
109 118 rb = PyBytes_AsString(result);
110 119 la = lb = 0;
111 120
112 121 for (h = l.next; h; h = h->next) {
113 122 if (h->a1 != la || h->b1 != lb) {
114 123 len = bl[h->b1].l - bl[lb].l;
115 putbe32((uint32_t)(al[la].l - al->l), rb);
116 putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);
124 putbe32((uint32_t)(al[la].l + lcommon - al->l), rb);
125 putbe32((uint32_t)(al[h->a1].l + lcommon - al->l), rb + 4);
117 126 putbe32((uint32_t)len, rb + 8);
118 127 memcpy(rb + 12, bl[lb].l, len);
119 128 rb += 12 + len;
120 129 }
121 130 la = h->a2;
122 131 lb = h->b2;
123 132 }
124 133
125 134 nomem:
126 135 if (_save)
127 136 PyEval_RestoreThread(_save);
128 137 free(al);
129 138 free(bl);
130 139 bdiff_freehunks(l.next);
131 140 return result ? result : PyErr_NoMemory();
132 141 }
133 142
134 143 /*
135 144 * If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,
136 145 * reduce whitespace sequences to a single space and trim remaining whitespace
137 146 * from end of lines.
138 147 */
139 148 static PyObject *fixws(PyObject *self, PyObject *args)
140 149 {
141 150 PyObject *s, *result = NULL;
142 151 char allws, c;
143 152 const char *r;
144 153 Py_ssize_t i, rlen, wlen = 0;
145 154 char *w;
146 155
147 156 if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))
148 157 return NULL;
149 158 r = PyBytes_AsString(s);
150 159 rlen = PyBytes_Size(s);
151 160
152 161 w = (char *)malloc(rlen ? rlen : 1);
153 162 if (!w)
154 163 goto nomem;
155 164
156 165 for (i = 0; i != rlen; i++) {
157 166 c = r[i];
158 167 if (c == ' ' || c == '\t' || c == '\r') {
159 168 if (!allws && (wlen == 0 || w[wlen - 1] != ' '))
160 169 w[wlen++] = ' ';
161 170 } else if (c == '\n' && !allws
162 171 && wlen > 0 && w[wlen - 1] == ' ') {
163 172 w[wlen - 1] = '\n';
164 173 } else {
165 174 w[wlen++] = c;
166 175 }
167 176 }
168 177
169 178 result = PyBytes_FromStringAndSize(w, wlen);
170 179
171 180 nomem:
172 181 free(w);
173 182 return result ? result : PyErr_NoMemory();
174 183 }
175 184
176 185
177 186 static char mdiff_doc[] = "Efficient binary diff.";
178 187
179 188 static PyMethodDef methods[] = {
180 189 {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
181 190 {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
182 191 {"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},
183 192 {NULL, NULL}
184 193 };
185 194
186 195 #ifdef IS_PY3K
187 196 static struct PyModuleDef bdiff_module = {
188 197 PyModuleDef_HEAD_INIT,
189 198 "bdiff",
190 199 mdiff_doc,
191 200 -1,
192 201 methods
193 202 };
194 203
195 204 PyMODINIT_FUNC PyInit_bdiff(void)
196 205 {
197 206 return PyModule_Create(&bdiff_module);
198 207 }
199 208 #else
200 209 PyMODINIT_FUNC initbdiff(void)
201 210 {
202 211 Py_InitModule3("bdiff", methods, mdiff_doc);
203 212 }
204 213 #endif
@@ -1,82 +1,81 b''
1 1 test 'a\nc\n\n\n\n' 'a\nb\n\n\n'
2 2 test 'a\nb\nc\n' 'a\nc\n'
3 3 test '' ''
4 4 test 'a\nb\nc' 'a\nb\nc'
5 5 test 'a\nb\nc\nd\n' 'a\nd\n'
6 6 test 'a\nb\nc\nd\n' 'a\nc\ne\n'
7 7 test 'a\nb\nc\n' 'a\nc\n'
8 8 test 'a\n' 'c\na\nb\n'
9 9 test 'a\n' ''
10 10 test 'a\n' 'b\nc\n'
11 11 test 'a\n' 'c\na\n'
12 12 test '' 'adjfkjdjksdhfksj'
13 13 test '' 'ab'
14 14 test '' 'abc'
15 15 test 'a' 'a'
16 16 test 'ab' 'ab'
17 17 test 'abc' 'abc'
18 18 test 'a\n' 'a\n'
19 19 test 'a\nb' 'a\nb'
20 20 showdiff(
21 21 'x\n\nx\n\nx\n\nx\n\nz\n',
22 22 'x\n\nx\n\ny\n\nx\n\nx\n\nz\n'):
23 23 'x\n\nx\n\n'
24 24 6 6 '' -> 'y\n\n'
25 25 'x\n\nx\n\nz\n'
26 26 showdiff(
27 27 'x\n\nx\n\nx\n\nx\n\nz\n',
28 28 'x\n\nx\n\ny\n\nx\n\ny\n\nx\n\nz\n'):
29 29 'x\n\nx\n\n'
30 30 6 6 '' -> 'y\n\n'
31 31 'x\n\n'
32 32 9 9 '' -> 'y\n\n'
33 33 'x\n\nz\n'
34 34 showdiff(
35 35 'a\nb\nb\nb\nc\n.\nd\ne\n.\nf\n',
36 36 'a\nb\nb\na\nb\nb\nb\nc\n.\nb\nc\n.\nd\ne\nf\n'):
37 0 0 '' -> 'a\nb\nb\n'
38 'a\nb\nb\nb\nc\n.\n'
39 12 12 '' -> 'b\nc\n.\n'
40 'd\ne\n'
37 'a\nb\nb\n'
38 6 6 '' -> 'a\nb\nb\nb\nc\n.\n'
39 'b\nc\n.\nd\ne\n'
41 40 16 18 '.\n' -> ''
42 41 'f\n'
43 42 done
44 43 done
45 44 Nice diff for a trivial change:
46 45 showdiff(
47 46 '<0\n-\n<1\n-\n<2\n-\n<3\n-\n<4\n-\n',
48 47 '>0\n-\n>1\n-\n>2\n-\n>3\n-\n>4\n-\n'):
49 48 0 3 '<0\n' -> '>0\n'
50 49 '-\n'
51 50 5 8 '<1\n' -> '>1\n'
52 51 '-\n'
53 52 10 13 '<2\n' -> '>2\n'
54 53 '-\n'
55 54 15 18 '<3\n' -> '>3\n'
56 55 '-\n'
57 56 20 23 '<4\n' -> '>4\n'
58 57 '-\n'
59 58 Diff 1 to 3 lines - preference for appending:
60 59 showdiff(
61 60 'a\n',
62 61 'a\na\na\n'):
63 62 'a\n'
64 63 2 2 '' -> 'a\na\n'
65 64 Diff 1 to 5 lines - preference for appending:
66 65 showdiff(
67 66 'a\n',
68 67 'a\na\na\na\na\n'):
69 68 'a\n'
70 69 2 2 '' -> 'a\na\na\na\n'
71 70 Diff 3 to 1 lines - preference for removing trailing lines:
72 71 showdiff(
73 72 'a\na\na\n',
74 73 'a\n'):
75 74 'a\n'
76 75 2 6 'a\na\n' -> ''
77 76 Diff 5 to 1 lines - preference for removing trailing lines:
78 77 showdiff(
79 78 'a\na\na\na\na\n',
80 79 'a\n'):
81 80 'a\n'
82 81 2 10 'a\na\na\na\n' -> ''
General Comments 0
You need to be logged in to leave comments. Login now