upstream/mercurial-mirror Commit - r10500:e96597c8

1

/*

1

/*

2

bdiff.c - efficient binary diff extension for Mercurial

2

bdiff.c - efficient binary diff extension for Mercurial

3

4

5

6

This software may be used and distributed according to the terms of

6

This software may be used and distributed according to the terms of

7

the GNU General Public License, incorporated herein by reference.

7

the GNU General Public License, incorporated herein by reference.

8

9

Based roughly on Python difflib

9

Based roughly on Python difflib

10

*/

10

*/

11

12

#include <Python.h>

12

#include <Python.h>

13

#include <stdlib.h>

13

#include <stdlib.h>

14

#include <string.h>

14

#include <string.h>

15

#include <limits.h>

15

#include <limits.h>

16

17

#if defined __hpux || defined __SUNPRO_C || defined _AIX

17

#if defined __hpux || defined __SUNPRO_C || defined _AIX

18

#define inline

18

#define inline

19

#endif

19

#endif

20

21

#ifdef __linux

21

#ifdef __linux

22

#define inline __inline

22

#define inline __inline

23

#endif

23

#endif

24

25

#ifdef _WIN32

25

#ifdef _WIN32

26

#ifdef _MSC_VER

26

#ifdef _MSC_VER

27

#define inline __inline

27

#define inline __inline

28

typedef unsigned long uint32_t;

28

typedef unsigned long uint32_t;

29

#else

29

#else

30

#include <stdint.h>

30

#include <stdint.h>

31

#endif

31

#endif

32

static uint32_t htonl(uint32_t x)

32

static uint32_t htonl(uint32_t x)

33

{

33

{

34

return ((x & 0x000000ffUL) << 24) |

34

return ((x & 0x000000ffUL) << 24) |

35

((x & 0x0000ff00UL) << 8) |

35

((x & 0x0000ff00UL) << 8) |

36

((x & 0x00ff0000UL) >> 8) |

36

((x & 0x00ff0000UL) >> 8) |

37

((x & 0xff000000UL) >> 24);

37

((x & 0xff000000UL) >> 24);

38

}

38

}

39

#else

39

#else

40

#include <sys/types.h>

40

#include <sys/types.h>

41

#if defined __BEOS__ && !defined __HAIKU__

41

#if defined __BEOS__ && !defined __HAIKU__

42

#include <ByteOrder.h>

42

#include <ByteOrder.h>

43

#else

43

#else

44

#include <arpa/inet.h>

44

#include <arpa/inet.h>

45

#endif

45

#endif

46

#include <inttypes.h>

46

#include <inttypes.h>

47

#endif

47

#endif

48

49

struct line {

49

struct line {

50

int h, len, n, e;

50

int h, len, n, e;

51

const char *l;

51

const char *l;

52

};

52

};

53

54

struct pos {

54

struct pos {

55

int pos, len;

55

int pos, len;

56

};

56

};

57

58

struct hunk {

58

struct hunk {

59

int a1, a2, b1, b2;

59

int a1, a2, b1, b2;

60

};

60

};

61

62

struct hunklist {

62

struct hunklist {

63

struct hunk *base, *head;

63

struct hunk *base, *head;

64

};

64

};

65

66

int splitlines(const char *a, int len, struct line **lr)

66

int splitlines(const char *a, int len, struct line **lr)

67

{

67

{

68

int h, i;

68

int h, i;

69

const char *p, *b = a;

69

const char *p, *b = a;

70

const char * const plast = a + len - 1;

70

const char * const plast = a + len - 1;

71

struct line *l;

71

struct line *l;

72

73

/* count the lines */

73

/* count the lines */

74

i = 1; /* extra line for sentinel */

74

i = 1; /* extra line for sentinel */

75

for (p = a; p < a + len; p++)

75

for (p = a; p < a + len; p++)

76

if (*p == '\n' || p == plast)

76

if (*p == '\n' || p == plast)

77

i++;

77

i++;

78

79

*lr = l = (struct line *)malloc(sizeof(struct line) * i);

79

*lr = l = (struct line *)malloc(sizeof(struct line) * i);

80

if (!l)

80

if (!l)

81

return -1;

81

return -1;

82

83

/* build the line array and calculate hashes */

83

/* build the line array and calculate hashes */

84

h = 0;

84

h = 0;

85

for (p = a; p < a + len; p++) {

85

for (p = a; p < a + len; p++) {

86

/* Leonid Yuriev's hash */

86

/* Leonid Yuriev's hash */

87

h = (h * 1664525) + *p + 1013904223;

87

h = (h * 1664525) + *p + 1013904223;

88

89

if (*p == '\n' || p == plast) {

89

if (*p == '\n' || p == plast) {

90

l->h = h;

90

l->h = h;

91

h = 0;

91

h = 0;

92

l->len = p - b + 1;

92

l->len = p - b + 1;

93

l->l = b;

93

l->l = b;

94

l->n = INT_MAX;

94

l->n = INT_MAX;

95

l++;

95

l++;

96

b = p + 1;

96

b = p + 1;

97

}

97

}

98

}

98

}

99

100

/* set up a sentinel */

100

/* set up a sentinel */

101

l->h = l->len = 0;

101

l->h = l->len = 0;

102

l->l = a + len;

102

l->l = a + len;

103

return i - 1;

103

return i - 1;

104

}

104

}

105

106

int inline cmp(struct line *a, struct line *b)

106

int inline cmp(struct line *a, struct line *b)

107

{

107

{

108

return a->h != b->h || a->len != b->len || memcmp(a->l, b->l, a->len);

108

return a->h != b->h || a->len != b->len || memcmp(a->l, b->l, a->len);

109

}

109

}

110

111

static int equatelines(struct line *a, int an, struct line *b, int bn)

111

static int equatelines(struct line *a, int an, struct line *b, int bn)

112

{

112

{

113

int i, j, buckets = 1, t, scale;

113

int i, j, buckets = 1, t, scale;

114

struct pos *h = NULL;

114

struct pos *h = NULL;

115

116

/* build a hash table of the next highest power of 2 */

116

/* build a hash table of the next highest power of 2 */

117

while (buckets < bn + 1)

117

while (buckets < bn + 1)

118

buckets *= 2;

118

buckets *= 2;

119

120

/* try to allocate a large hash table to avoid collisions */

120

/* try to allocate a large hash table to avoid collisions */

121

for (scale = 4; scale; scale /= 2) {

121

for (scale = 4; scale; scale /= 2) {

122

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

122

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

123

if (h)

123

if (h)

124

break;

124

break;

125

}

125

}

126

127

if (!h)

127

if (!h)

128

return 0;

128

return 0;

129

130

buckets = buckets * scale - 1;

130

buckets = buckets * scale - 1;

131

132

/* clear the hash table */

132

/* clear the hash table */

133

for (i = 0; i <= buckets; i++) {

133

for (i = 0; i <= buckets; i++) {

134

h[i].pos = INT_MAX;

134

h[i].pos = INT_MAX;

135

h[i].len = 0;

135

h[i].len = 0;

136

}

136

}

137

138

/* add lines to the hash table chains */

138

/* add lines to the hash table chains */

139

for (i = bn - 1; i >= 0; i--) {

139

for (i = bn - 1; i >= 0; i--) {

140

/* find the equivalence class */

140

/* find the equivalence class */

141

for (j = b[i].h & buckets; h[j].pos != INT_MAX;

141

for (j = b[i].h & buckets; h[j].pos != INT_MAX;

142

j = (j + 1) & buckets)

142

j = (j + 1) & buckets)

143

if (!cmp(b + i, b + h[j].pos))

143

if (!cmp(b + i, b + h[j].pos))

144

break;

144

break;

145

146

/* add to the head of the equivalence class */

146

/* add to the head of the equivalence class */

147

b[i].n = h[j].pos;

147

b[i].n = h[j].pos;

148

b[i].e = j;

148

b[i].e = j;

149

h[j].pos = i;

149

h[j].pos = i;

150

h[j].len++; /* keep track of popularity */

150

h[j].len++; /* keep track of popularity */

151

}

151

}

152

153

/* compute popularity threshold */

153

/* compute popularity threshold */

154

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

154

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

155

156

/* match items in a to their equivalence class in b */

156

/* match items in a to their equivalence class in b */

157

for (i = 0; i < an; i++) {

157

for (i = 0; i < an; i++) {

158

/* find the equivalence class */

158

/* find the equivalence class */

159

for (j = a[i].h & buckets; h[j].pos != INT_MAX;

159

for (j = a[i].h & buckets; h[j].pos != INT_MAX;

160

j = (j + 1) & buckets)

160

j = (j + 1) & buckets)

161

if (!cmp(a + i, b + h[j].pos))

161

if (!cmp(a + i, b + h[j].pos))

162

break;

162

break;

163

164

a[i].e = j; /* use equivalence class for quick compare */

164

a[i].e = j; /* use equivalence class for quick compare */

165

if (h[j].len <= t)

165

if (h[j].len <= t)

166

a[i].n = h[j].pos; /* point to head of match list */

166

a[i].n = h[j].pos; /* point to head of match list */

167

else

167

else

168

a[i].n = INT_MAX; /* too popular */

168

a[i].n = INT_MAX; /* too popular */

169

}

169

}

170

171

/* discard hash tables */

171

/* discard hash tables */

172

free(h);

172

free(h);

173

return 1;

173

return 1;

174

}

174

}

175

176

static int longest_match(struct line *a, struct line *b, struct pos *pos,

176

static int longest_match(struct line *a, struct line *b, struct pos *pos,

177

int a1, int a2, int b1, int b2, int *omi, int *omj)

177

int a1, int a2, int b1, int b2, int *omi, int *omj)

178

{

178

{

179

int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;

179

int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;

180

181

for (i = a1; i < a2; i++) {

181

for (i = a1; i < a2; i++) {

182

/* skip things before the current block */

182

/* skip things before the current block */

183

for (j = a[i].n; j < b1; j = b[j].n)

183

for (j = a[i].n; j < b1; j = b[j].n)

184

;

184

;

185

186

/* loop through all lines match a[i] in b */

186

/* loop through all lines match a[i] in b */

187

for (; j < b2; j = b[j].n) {

187

for (; j < b2; j = b[j].n) {

188

/* does this extend an earlier match? */

188

/* does this extend an earlier match? */

189

if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)

189

if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)

190

k = pos[j - 1].len + 1;

190

k = pos[j - 1].len + 1;

191

else

191

else

192

k = 1;

192

k = 1;

193

pos[j].pos = i;

193

pos[j].pos = i;

194

pos[j].len = k;

194

pos[j].len = k;

195

196

/* best match so far? */

196

/* best match so far? */

197

if (k > mk) {

197

if (k > mk) {

198

mi = i;

198

mi = i;

199

mj = j;

199

mj = j;

200

mk = k;

200

mk = k;

201

}

201

}

202

}

202

}

203

}

203

}

204

205

if (mk) {

205

if (mk) {

206

mi = mi - mk + 1;

206

mi = mi - mk + 1;

207

mj = mj - mk + 1;

207

mj = mj - mk + 1;

208

}

208

}

209

210

/* expand match to include neighboring popular lines */

210

/* expand match to include neighboring popular lines */

211

while (mi - mb > a1 && mj - mb > b1 &&

211

while (mi - mb > a1 && mj - mb > b1 &&

212

a[mi - mb - 1].e == b[mj - mb - 1].e)

212

a[mi - mb - 1].e == b[mj - mb - 1].e)

213

mb++;

213

mb++;

214

while (mi + mk < a2 && mj + mk < b2 &&

214

while (mi + mk < a2 && mj + mk < b2 &&

215

a[mi + mk].e == b[mj + mk].e)

215

a[mi + mk].e == b[mj + mk].e)

216

mk++;

216

mk++;

217

218

*omi = mi - mb;

218

*omi = mi - mb;

219

*omj = mj - mb;

219

*omj = mj - mb;

220

221

return mk + mb;

221

return mk + mb;

222

}

222

}

223

224

static void recurse(struct line *a, struct line *b, struct pos *pos,

224

static void recurse(struct line *a, struct line *b, struct pos *pos,

225

int a1, int a2, int b1, int b2, struct hunklist *l)

225

int a1, int a2, int b1, int b2, struct hunklist *l)

226

{

226

{

227

int i, j, k;

227

int i, j, k;

228

229

while (1) {

229

/* find the longest match in this chunk */

230

/* find the longest match in this chunk */

230

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

231

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

231

if (!k)

232

if (!k)

232

return;

233

return;

233

234

/* and recurse on the remaining chunks on either side */

235

/* and recurse on the remaining chunks on either side */

235

recurse(a, b, pos, a1, i, b1, j, l);

236

recurse(a, b, pos, a1, i, b1, j, l);

236

l->head->a1 = i;

237

l->head->a1 = i;

237

l->head->a2 = i + k;

238

l->head->a2 = i + k;

238

l->head->b1 = j;

239

l->head->b1 = j;

239

l->head->b2 = j + k;

240

l->head->b2 = j + k;

240

l->head++;

241

l->head++;

241

recurse(a, b, pos, i + k, a2, j + k, b2, l);

242

/* tail-recursion didn't happen, so doing equivalent iteration */

243

a1 = i + k;

244

b1 = j + k;

245

}

242

}

246

}

243

247

244

static struct hunklist diff(struct line *a, int an, struct line *b, int bn)

248

static struct hunklist diff(struct line *a, int an, struct line *b, int bn)

245

{

249

{

246

struct hunklist l;

250

struct hunklist l;

247

struct hunk *curr;

251

struct hunk *curr;

248

struct pos *pos;

252

struct pos *pos;

249

int t;

253

int t;

250

254

251

/* allocate and fill arrays */

255

/* allocate and fill arrays */

252

t = equatelines(a, an, b, bn);

256

t = equatelines(a, an, b, bn);

253

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

257

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

254

/* we can't have more matches than lines in the shorter file */

258

/* we can't have more matches than lines in the shorter file */

255

l.head = l.base = (struct hunk *)malloc(sizeof(struct hunk) *

259

l.head = l.base = (struct hunk *)malloc(sizeof(struct hunk) *

256

((an<bn ? an:bn) + 1));

260

((an<bn ? an:bn) + 1));

257

261

258

if (pos && l.base && t) {

262

if (pos && l.base && t) {

259

/* generate the matching block list */

263

/* generate the matching block list */

260

recurse(a, b, pos, 0, an, 0, bn, &l);

264

recurse(a, b, pos, 0, an, 0, bn, &l);

261

l.head->a1 = l.head->a2 = an;

265

l.head->a1 = l.head->a2 = an;

262

l.head->b1 = l.head->b2 = bn;

266

l.head->b1 = l.head->b2 = bn;

263

l.head++;

267

l.head++;

264

}

268

}

265

269

266

free(pos);

270

free(pos);

267

271

268

/* normalize the hunk list, try to push each hunk towards the end */

272

/* normalize the hunk list, try to push each hunk towards the end */

269

for (curr = l.base; curr != l.head; curr++) {

273

for (curr = l.base; curr != l.head; curr++) {

270

struct hunk *next = curr + 1;

274

struct hunk *next = curr + 1;

271

int shift = 0;

275

int shift = 0;

272

276

273

if (next == l.head)

277

if (next == l.head)

274

break;

278

break;

275

279

276

if (curr->a2 == next->a1)

280

if (curr->a2 == next->a1)

277

while (curr->a2 + shift < an && curr->b2 + shift < bn

281

while (curr->a2 + shift < an && curr->b2 + shift < bn

278

&& !cmp(a + curr->a2 + shift,

282

&& !cmp(a + curr->a2 + shift,

279

b + curr->b2 + shift))

283

b + curr->b2 + shift))

280

shift++;

284

shift++;

281

else if (curr->b2 == next->b1)

285

else if (curr->b2 == next->b1)

282

while (curr->b2 + shift < bn && curr->a2 + shift < an

286

while (curr->b2 + shift < bn && curr->a2 + shift < an

283

&& !cmp(b + curr->b2 + shift,

287

&& !cmp(b + curr->b2 + shift,

284

a + curr->a2 + shift))

288

a + curr->a2 + shift))

285

shift++;

289

shift++;

286

if (!shift)

290

if (!shift)

287

continue;

291

continue;

288

curr->b2 += shift;

292

curr->b2 += shift;

289

next->b1 += shift;

293

next->b1 += shift;

290

curr->a2 += shift;

294

curr->a2 += shift;

291

next->a1 += shift;

295

next->a1 += shift;

292

}

296

}

293

297

294

return l;

298

return l;

295

}

299

}

296

300

297

static PyObject *blocks(PyObject *self, PyObject *args)

301

static PyObject *blocks(PyObject *self, PyObject *args)

298

{

302

{

299

PyObject *sa, *sb, *rl = NULL, *m;

303

PyObject *sa, *sb, *rl = NULL, *m;

300

struct line *a, *b;

304

struct line *a, *b;

301

struct hunklist l = {NULL, NULL};

305

struct hunklist l = {NULL, NULL};

302

struct hunk *h;

306

struct hunk *h;

303

int an, bn, pos = 0;

307

int an, bn, pos = 0;

304

308

305

if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))

309

if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))

306

return NULL;

310

return NULL;

307

311

308

an = splitlines(PyString_AsString(sa), PyString_Size(sa), &a);

312

an = splitlines(PyString_AsString(sa), PyString_Size(sa), &a);

309

bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &b);

313

bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &b);

310

if (!a || !b)

314

if (!a || !b)

311

goto nomem;

315

goto nomem;

312

316

313

l = diff(a, an, b, bn);

317

l = diff(a, an, b, bn);

314

rl = PyList_New(l.head - l.base);

318

rl = PyList_New(l.head - l.base);

315

if (!l.head || !rl)

319

if (!l.head || !rl)

316

goto nomem;

320

goto nomem;

317

321

318

for (h = l.base; h != l.head; h++) {

322

for (h = l.base; h != l.head; h++) {

319

m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);

323

m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);

320

PyList_SetItem(rl, pos, m);

324

PyList_SetItem(rl, pos, m);

321

pos++;

325

pos++;

322

}

326

}

323

327

324

nomem:

328

nomem:

325

free(a);

329

free(a);

326

free(b);

330

free(b);

327

free(l.base);

331

free(l.base);

328

return rl ? rl : PyErr_NoMemory();

332

return rl ? rl : PyErr_NoMemory();

329

}

333

}

330

334

331

static PyObject *bdiff(PyObject *self, PyObject *args)

335

static PyObject *bdiff(PyObject *self, PyObject *args)

332

{

336

{

333

char *sa, *sb;

337

char *sa, *sb;

334

PyObject *result = NULL;

338

PyObject *result = NULL;

335

struct line *al, *bl;

339

struct line *al, *bl;

336

struct hunklist l = {NULL, NULL};

340

struct hunklist l = {NULL, NULL};

337

struct hunk *h;

341

struct hunk *h;

338

char encode[12], *rb;

342

char encode[12], *rb;

339

int an, bn, len = 0, la, lb;

343

int an, bn, len = 0, la, lb;

340

344

341

if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))

345

if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))

342

return NULL;

346

return NULL;

343

347

344

an = splitlines(sa, la, &al);

348

an = splitlines(sa, la, &al);

345

bn = splitlines(sb, lb, &bl);

349

bn = splitlines(sb, lb, &bl);

346

if (!al || !bl)

350

if (!al || !bl)

347

goto nomem;

351

goto nomem;

348

352

349

l = diff(al, an, bl, bn);

353

l = diff(al, an, bl, bn);

350

if (!l.head)

354

if (!l.head)

351

goto nomem;

355

goto nomem;

352

356

353

/* calculate length of output */

357

/* calculate length of output */

354

la = lb = 0;

358

la = lb = 0;

355

for (h = l.base; h != l.head; h++) {

359

for (h = l.base; h != l.head; h++) {

356

if (h->a1 != la || h->b1 != lb)

360

if (h->a1 != la || h->b1 != lb)

357

len += 12 + bl[h->b1].l - bl[lb].l;

361

len += 12 + bl[h->b1].l - bl[lb].l;

358

la = h->a2;

362

la = h->a2;

359

lb = h->b2;

363

lb = h->b2;

360

}

364

}

361

365

362

result = PyString_FromStringAndSize(NULL, len);

366

result = PyString_FromStringAndSize(NULL, len);

363

if (!result)

367

if (!result)

364

goto nomem;

368

goto nomem;

365

369

366

/* build binary patch */

370

/* build binary patch */

367

rb = PyString_AsString(result);

371

rb = PyString_AsString(result);

368

la = lb = 0;

372

la = lb = 0;

369

373

370

for (h = l.base; h != l.head; h++) {

374

for (h = l.base; h != l.head; h++) {

371

if (h->a1 != la || h->b1 != lb) {

375

if (h->a1 != la || h->b1 != lb) {

372

len = bl[h->b1].l - bl[lb].l;

376

len = bl[h->b1].l - bl[lb].l;

373

*(uint32_t *)(encode) = htonl(al[la].l - al->l);

377

*(uint32_t *)(encode) = htonl(al[la].l - al->l);

374

*(uint32_t *)(encode + 4) = htonl(al[h->a1].l - al->l);

378

*(uint32_t *)(encode + 4) = htonl(al[h->a1].l - al->l);

375

*(uint32_t *)(encode + 8) = htonl(len);

379

*(uint32_t *)(encode + 8) = htonl(len);

376

memcpy(rb, encode, 12);

380

memcpy(rb, encode, 12);

377

memcpy(rb + 12, bl[lb].l, len);

381

memcpy(rb + 12, bl[lb].l, len);

378

rb += 12 + len;

382

rb += 12 + len;

379

}

383

}

380

la = h->a2;

384

la = h->a2;

381

lb = h->b2;

385

lb = h->b2;

382

}

386

}

383

387

384

nomem:

388

nomem:

385

free(al);

389

free(al);

386

free(bl);

390

free(bl);

387

free(l.base);

391

free(l.base);

388

return result ? result : PyErr_NoMemory();

392

return result ? result : PyErr_NoMemory();

389

}

393

}

390

394

391

static char mdiff_doc[] = "Efficient binary diff.";

395

static char mdiff_doc[] = "Efficient binary diff.";

392

396

393

static PyMethodDef methods[] = {

397

static PyMethodDef methods[] = {

394

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

398

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

395

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

399

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

396

{NULL, NULL}

400

{NULL, NULL}

397

};

401

};

398

402

399

PyMODINIT_FUNC initbdiff(void)

403

PyMODINIT_FUNC initbdiff(void)

400

{

404

{

401

Py_InitModule3("bdiff", methods, mdiff_doc);

405

Py_InitModule3("bdiff", methods, mdiff_doc);

402

}

406

}

403

407

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             /*
              bdiff.c - efficient binary diff extension for Mercurial
              Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
              This software may be used and distributed according to the terms of
              the GNU General Public License, incorporated herein by reference.
              Based roughly on Python difflib
             */
             #include <Python.h>
             #include <stdlib.h>
             #include <string.h>
             #include <limits.h>
             #if defined __hpux || defined __SUNPRO_C || defined _AIX
             #define inline
             #endif
             #ifdef __linux
             #define inline __inline
             #endif
             #ifdef _WIN32
             #ifdef _MSC_VER
             #define inline __inline
             typedef unsigned long uint32_t;
             #else
             #include <stdint.h>
             #endif
             /*
              * Reverse the byte order of a 32-bit value.  This local stand-in for
              * htonl() is only compiled in the _WIN32 branch, where <arpa/inet.h> is
              * not included.  NOTE(review): it swaps unconditionally, so it presumably
              * assumes a little-endian host -- confirm.
              */
             static uint32_t htonl(uint32_t x)
             {
             	/* pull out the four bytes, least significant first */
             	uint32_t b0 = x & 0xffUL;
             	uint32_t b1 = (x >> 8) & 0xffUL;
             	uint32_t b2 = (x >> 16) & 0xffUL;
             	uint32_t b3 = (x >> 24) & 0xffUL;
             	/* and put them back in the opposite order */
             	return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
             }
             #else
             #include <sys/types.h>
             #if defined __BEOS__ && !defined __HAIKU__
             #include <ByteOrder.h>
             #else
             #include <arpa/inet.h>
             #endif
             #include <inttypes.h>
             #endif
             /* one line of input text plus the bookkeeping used to match it */
             struct line {
             	int h;		/* hash of the line's bytes */
             	int len;	/* length in bytes, counting a trailing newline */
             	int n;		/* index of the next equal line in b, or INT_MAX */
             	int e;		/* equivalence class (hash table slot) */
             	const char *l;	/* first byte of the line in the input buffer */
             };
             /*
              * A (position, length) pair.  equatelines uses it as a hash table slot
              * (head line index in b, number of lines in the class); longest_match
              * uses one per line of b (matching index in a, length of the run so far).
              */
             struct pos {
             	int pos;
             	int len;
             };
             /* a matching block: lines a1..a2-1 of a equal lines b1..b2-1 of b */
             struct hunk {
             	int a1;
             	int a2;
             	int b1;
             	int b2;
             };
             /* a growing array of hunks */
             struct hunklist {
             	struct hunk *base;	/* start of the allocated array */
             	struct hunk *head;	/* one past the last hunk filled in */
             };
             /*
              * Split the len bytes starting at a into lines and hash each one.
              *
              * A line ends at '\n' (which is counted in its length) or at the last
              * byte of the buffer.  On success *lr points to a malloc'd array with
              * one struct line per input line followed by a zero-length sentinel
              * whose l member points at a + len; the number of real lines is
              * returned and the caller frees *lr.  If the allocation fails, *lr is
              * set to NULL and -1 is returned.
              */
             int splitlines(const char *a, int len, struct line **lr)
             {
             	unsigned int h;
             	int i;
             	const char *p, *b = a;
             	/* one past the last byte; unlike a + len - 1 this is valid for len == 0 */
             	const char * const end = a + len;
             	struct line *l;
             	/* count the lines */
             	i = 1; /* extra line for sentinel */
             	for (p = a; p < end; p++)
             		if (*p == '\n' || p + 1 == end)
             			i++;
             	*lr = l = (struct line *)malloc(sizeof(struct line) * i);
             	if (!l)
             		return -1;
             	/* build the line array and calculate hashes */
             	h = 0;
             	for (p = a; p < end; p++) {
             		/*
             		 * Leonid Yuriev's hash.  Computed in unsigned arithmetic so
             		 * wrap-around is well defined (signed overflow is not); the
             		 * resulting bit pattern is the one the int version produced.
             		 */
             		h = (h * 1664525u) + (unsigned int)*p + 1013904223u;
             		if (*p == '\n' || p + 1 == end) {
             			l->h = (int)h;
             			h = 0;
             			l->len = (int)(p - b + 1);
             			l->l = b;
             			l->n = INT_MAX;
             			l++;
             			b = p + 1;
             		}
             	}
             	/* set up a sentinel */
             	l->h = l->len = 0;
             	l->l = end;
             	return i - 1;
             }
             /*
              * Compare two lines.  Returns 0 when they have the same hash, the same
              * length and the same bytes, and non-zero otherwise.
              *
              * Declared static: a plain "inline" definition provides no external
              * definition under C99 rules, so a call the compiler chooses not to
              * inline would fail to link.
              */
             static inline int cmp(struct line *a, struct line *b)
             {
             	/* cheap checks first; memcmp only runs when hash and length agree */
             	return a->h != b->h || a->len != b->len || memcmp(a->l, b->l, a->len);
             }
             /*
              * Group equal lines into equivalence classes.
              *
              * Every line of b is entered into an open-addressing hash table; lines
              * with the same content share a slot and are chained through their n
              * members in increasing index order.  Each line of a is then looked up:
              * its e member receives the slot, and its n member the first matching
              * line of b, or INT_MAX when there is none or the class is too popular.
              *
              * Returns 1 on success and 0 if the table could not be allocated.
              */
             static int equatelines(struct line *a, int an, struct line *b, int bn)
             {
             	struct pos *table = NULL;
             	int idx, slot, mask, limit, scale;
             	int buckets = 1;
             	/* smallest power of two that is at least bn + 1 */
             	while (buckets < bn + 1)
             		buckets *= 2;
             	/* ask for four times that to cut collisions, then settle for less */
             	scale = 4;
             	while (scale) {
             		table = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
             		if (table)
             			break;
             		scale /= 2;
             	}
             	if (!table)
             		return 0;
             	/* the table size is a power of two, so size - 1 works as a mask */
             	mask = buckets * scale - 1;
             	/* mark every slot empty */
             	for (idx = 0; idx <= mask; idx++) {
             		table[idx].pos = INT_MAX;
             		table[idx].len = 0;
             	}
             	/* enter b's lines last to first, so each chain ends up ascending */
             	for (idx = bn - 1; idx >= 0; idx--) {
             		/* probe for this line's class, or the first empty slot */
             		slot = b[idx].h & mask;
             		while (table[slot].pos != INT_MAX &&
             		       cmp(b + idx, b + table[slot].pos))
             			slot = (slot + 1) & mask;
             		/* link the line in as the new head of its class */
             		b[idx].n = table[slot].pos;
             		b[idx].e = slot;
             		table[slot].pos = idx;
             		table[slot].len++; /* popularity of the class */
             	}
             	/* classes with more members than this are not chained from a */
             	if (bn >= 31000)
             		limit = bn / 1000;
             	else
             		limit = 1000000 / (bn + 1);
             	/* look up each line of a in the table built from b */
             	for (idx = 0; idx < an; idx++) {
             		slot = a[idx].h & mask;
             		while (table[slot].pos != INT_MAX &&
             		       cmp(a + idx, b + table[slot].pos))
             			slot = (slot + 1) & mask;
             		/* the slot doubles as a quick equality test later on */
             		a[idx].e = slot;
             		/* head of the match list, unless the class is too popular */
             		a[idx].n = (table[slot].len <= limit) ? table[slot].pos : INT_MAX;
             	}
             	/* the table is only needed while building the chains */
             	free(table);
             	return 1;
             }
             /*
              * Find the longest run of consecutive matching lines between
              * a[a1..a2) and b[b1..b2).
              *
              * pos is scratch space indexed by line number in b: pos[j] records the
              * line of a most recently matched against b[j] and the length of the
              * run of matches ending there.
              *
              * The start of the chosen run is stored in *omi (index in a) and *omj
              * (index in b) and its length is returned; 0 means nothing matched.
              */
             static int longest_match(struct line *a, struct line *b, struct pos *pos,
             			 int a1, int a2, int b1, int b2, int *omi, int *omj)
             {
             	/* mi/mj: last line of the best run so far, mk: its length,
             	   mb: number of lines later added in front of it */
             	int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;
             	for (i = a1; i < a2; i++) {
             		/* skip things before the current block */
             		for (j = a[i].n; j < b1; j = b[j].n)
             			;
             		/* loop through all lines in b that match a[i] */
             		for (; j < b2; j = b[j].n) {
             			/* does this extend an earlier match? */
             			if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)
             				k = pos[j - 1].len + 1;
             			else
             				k = 1;
             			pos[j].pos = i;
             			pos[j].len = k;
             			/* best match so far? (strict, so the first longest run wins) */
             			if (k > mk) {
             				mi = i;
             				mj = j;
             				mk = k;
             			}
             		}
             	}
             	/* mi/mj point at the end of the run; move them to its start */
             	if (mk) {
             		mi = mi - mk + 1;
             		mj = mj - mk + 1;
             	}
             	/* expand match to include neighboring popular lines */
             	/* (compared by equivalence class, since they may have no chain) */
             	while (mi - mb > a1 && mj - mb > b1 &&
             	       a[mi - mb - 1].e == b[mj - mb - 1].e)
             		mb++;
             	while (mi + mk < a2 && mj + mk < b2 &&
             	       a[mi + mk].e == b[mj + mk].e)
             		mk++;
             	*omi = mi - mb;
             	*omj = mj - mb;
             	return mk + mb;
             }
             static void recurse(struct line *a, struct line *b, struct pos *pos,
             		    int a1, int a2, int b1, int b2, struct hunklist *l)
             {
             	int i, j, k;
+            	while (1) {
             		/* find the longest match in this chunk */
             		k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);
             		if (!k)
             			return;
             		/* and recurse on the remaining chunks on either side */
             		recurse(a, b, pos, a1, i, b1, j, l);
             		l->head->a1 = i;
             		l->head->a2 = i + k;
             		l->head->b1 = j;
             		l->head->b2 = j + k;
             		l->head++;
-            	recurse(a, b, pos, i + k, a2, j + k, b2, l);
+            		/* tail-recursion didn't happen, so doing equivalent iteration */
+            		a1 = i + k;
+            		b1 = j + k;
+            	}
             }
             /*
              * Compute the list of matching blocks between a (an lines) and b (bn
              * lines).
              *
              * On success the returned list holds the matches in order, followed by
              * a zero-length terminating hunk at (an, bn); the caller frees
              * l.base.  If any allocation fails, both base and head are NULL, so a
              * caller testing head can tell failure apart from "nothing matched".
              */
             static struct hunklist diff(struct line *a, int an, struct line *b, int bn)
             {
             	struct hunklist l;
             	struct hunk *curr;
             	struct pos *pos;
             	int t;
             	/* allocate and fill arrays */
             	t = equatelines(a, an, b, bn);
             	pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));
             	/* we can't have more matches than lines in the shorter file */
             	l.head = l.base = (struct hunk *)malloc(sizeof(struct hunk) *
             	                                        ((an<bn ? an:bn) + 1));
             	if (!pos || !l.base || !t) {
             		/* never hand back an empty but non-NULL list on failure */
             		free(pos);
             		free(l.base);
             		l.head = l.base = NULL;
             		return l;
             	}
             	/* generate the matching block list */
             	recurse(a, b, pos, 0, an, 0, bn, &l);
             	l.head->a1 = l.head->a2 = an;
             	l.head->b1 = l.head->b2 = bn;
             	l.head++;
             	free(pos);
             	/* normalize the hunk list, try to push each hunk towards the end */
             	for (curr = l.base; curr != l.head; curr++) {
             		struct hunk *next = curr + 1;
             		int shift = 0;
             		if (next == l.head)
             			break;
             		/* only hunks that touch their successor on one side can slide */
             		if (curr->a2 == next->a1 || curr->b2 == next->b1)
             			while (curr->a2 + shift < an && curr->b2 + shift < bn
             			       && !cmp(a + curr->a2 + shift,
             				       b + curr->b2 + shift))
             				shift++;
             		if (!shift)
             			continue;
             		/* grow this hunk and shrink the next by the same amount */
             		curr->b2 += shift;
             		next->b1 += shift;
             		curr->a2 += shift;
             		next->a1 += shift;
             	}
             	return l;
             }
             /*
              * blocks(a, b): return the matching blocks of two strings as a list of
              * (a1, a2, b1, b2) tuples of line numbers.
              *
              * Returns a new list, or NULL with an exception set if the arguments
              * are not two strings or memory runs out.
              */
             static PyObject *blocks(PyObject *self, PyObject *args)
             {
             	PyObject *sa, *sb, *rl = NULL, *m;
             	struct line *a, *b;
             	struct hunklist l = {NULL, NULL};
             	struct hunk *h;
             	int an, bn, pos = 0;
             	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
             		return NULL;
             	an = splitlines(PyString_AsString(sa), PyString_Size(sa), &a);
             	bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &b);
             	if (!a || !b)
             		goto nomem;
             	l = diff(a, an, b, bn);
             	/* check before building the list, or a failed diff yields [] */
             	if (!l.head)
             		goto nomem;
             	rl = PyList_New(l.head - l.base);
             	if (!rl)
             		goto nomem;
             	for (h = l.base; h != l.head; h++) {
             		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
             		if (!m) {
             			/* don't return a list with NULL entries */
             			Py_DECREF(rl);
             			rl = NULL;
             			goto nomem;
             		}
             		/* PyList_SetItem takes over the reference to m */
             		PyList_SetItem(rl, pos, m);
             		pos++;
             	}
             nomem:
             	free(a);
             	free(b);
             	free(l.base);
             	return rl ? rl : PyErr_NoMemory();
             }
             /*
              * bdiff(a, b): return a binary patch that turns string a into string b.
              *
              * The patch is a sequence of fragments, one for each gap between
              * matching blocks.  A fragment is a 12-byte header of three 32-bit
              * integers passed through htonl() -- start offset in a, end offset in a,
              * length of the replacement -- followed by the replacement bytes from b.
              *
              * Returns a new string, or NULL with an exception set if the arguments
              * cannot be parsed or memory runs out.
              */
             static PyObject *bdiff(PyObject *self, PyObject *args)
             {
             	char *sa, *sb;
             	PyObject *result = NULL;
             	struct line *al, *bl;
             	struct hunklist l = {NULL, NULL};
             	struct hunk *h;
             	uint32_t header[3];
             	char *rb;
             	int an, bn, len = 0, la, lb;
             	if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
             		return NULL;
             	an = splitlines(sa, la, &al);
             	bn = splitlines(sb, lb, &bl);
             	if (!al || !bl)
             		goto nomem;
             	l = diff(al, an, bl, bn);
             	if (!l.head)
             		goto nomem;
             	/* calculate length of output */
             	/* from here on la and lb are the line numbers where the previous
             	   match ended, not the input lengths */
             	la = lb = 0;
             	for (h = l.base; h != l.head; h++) {
             		if (h->a1 != la || h->b1 != lb)
             			len += 12 + bl[h->b1].l - bl[lb].l;
             		la = h->a2;
             		lb = h->b2;
             	}
             	result = PyString_FromStringAndSize(NULL, len);
             	if (!result)
             		goto nomem;
             	/* build binary patch */
             	rb = PyString_AsString(result);
             	la = lb = 0;
             	for (h = l.base; h != l.head; h++) {
             		if (h->a1 != la || h->b1 != lb) {
             			len = bl[h->b1].l - bl[lb].l;
             			/*
             			 * Fill a properly typed array and copy it out, rather
             			 * than storing uint32_t values through a cast char
             			 * pointer, which may be misaligned.
             			 */
             			header[0] = htonl((uint32_t)(al[la].l - al->l));
             			header[1] = htonl((uint32_t)(al[h->a1].l - al->l));
             			header[2] = htonl((uint32_t)len);
             			memcpy(rb, header, 12);
             			memcpy(rb + 12, bl[lb].l, len);
             			rb += 12 + len;
             		}
             		la = h->a2;
             		lb = h->b2;
             	}
             nomem:
             	free(al);
             	free(bl);
             	free(l.base);
             	return result ? result : PyErr_NoMemory();
             }
             /* docstring for the module as a whole */
             static char mdiff_doc[] = "Efficient binary diff.";
             /* functions exported to Python; the all-NULL entry ends the table */
             static PyMethodDef methods[] = {
             	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
             	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
             	{NULL, NULL, 0, NULL}
             };
             /* module entry point: registers the "bdiff" module and its methods */
             PyMODINIT_FUNC initbdiff(void)
             {
             	(void)Py_InitModule3("bdiff", methods, mdiff_doc);
             }