upstream/mercurial-mirror Commit - r8858:16f6c137

1

/*

1

/*

2

bdiff.c - efficient binary diff extension for Mercurial

2

bdiff.c - efficient binary diff extension for Mercurial

3

4

5

6

This software may be used and distributed according to the terms of

6

This software may be used and distributed according to the terms of

7

the GNU General Public License, incorporated herein by reference.

7

the GNU General Public License, incorporated herein by reference.

8

9

Based roughly on Python difflib

9

Based roughly on Python difflib

10

*/

10

*/

11

12

#include <Python.h>

12

#include <Python.h>

13

#include <stdlib.h>

13

#include <stdlib.h>

14

#include <string.h>

14

#include <string.h>

15

#include <limits.h>

15

#include <limits.h>

16

17

#if defined __hpux || defined __SUNPRO_C || defined _AIX

17

#if defined __hpux || defined __SUNPRO_C || defined _AIX

18

# define inline

18

# define inline

19

#endif

19

#endif

20

21

#ifdef __linux

22

# define inline __inline

23

#endif

24

21

#ifdef _WIN32

25

#ifdef _WIN32

22

#ifdef _MSC_VER

26

#ifdef _MSC_VER

23

#define inline __inline

27

#define inline __inline

24

typedef unsigned long uint32_t;

28

typedef unsigned long uint32_t;

25

#else

29

#else

26

#include <stdint.h>

30

#include <stdint.h>

27

#endif

31

#endif

28

static uint32_t htonl(uint32_t x)

32

static uint32_t htonl(uint32_t x)

29

{

33

{

30

return ((x & 0x000000ffUL) << 24) |

34

return ((x & 0x000000ffUL) << 24) |

31

((x & 0x0000ff00UL) << 8) |

35

((x & 0x0000ff00UL) << 8) |

32

((x & 0x00ff0000UL) >> 8) |

36

((x & 0x00ff0000UL) >> 8) |

33

((x & 0xff000000UL) >> 24);

37

((x & 0xff000000UL) >> 24);

34

}

38

}

35

#else

39

#else

36

#include <sys/types.h>

40

#include <sys/types.h>

37

#if defined __BEOS__ && !defined __HAIKU__

41

#if defined __BEOS__ && !defined __HAIKU__

38

#include <ByteOrder.h>

42

#include <ByteOrder.h>

39

#else

43

#else

40

#include <arpa/inet.h>

44

#include <arpa/inet.h>

41

#endif

45

#endif

42

#include <inttypes.h>

46

#include <inttypes.h>

43

#endif

47

#endif

44

48

45

struct line {

49

struct line {

46

int h, len, n, e;

50

int h, len, n, e;

47

const char *l;

51

const char *l;

48

};

52

};

49

53

50

struct pos {

54

struct pos {

51

int pos, len;

55

int pos, len;

52

};

56

};

53

57

54

struct hunk {

58

struct hunk {

55

int a1, a2, b1, b2;

59

int a1, a2, b1, b2;

56

};

60

};

57

61

58

struct hunklist {

62

struct hunklist {

59

struct hunk *base, *head;

63

struct hunk *base, *head;

60

};

64

};

61

65

62

int splitlines(const char *a, int len, struct line **lr)

66

int splitlines(const char *a, int len, struct line **lr)

63

{

67

{

64

int h, i;

68

int h, i;

65

const char *p, *b = a;

69

const char *p, *b = a;

66

const char * const plast = a + len - 1;

70

const char * const plast = a + len - 1;

67

struct line *l;

71

struct line *l;

68

72

69

/* count the lines */

73

/* count the lines */

70

i = 1; /* extra line for sentinel */

74

i = 1; /* extra line for sentinel */

71

for (p = a; p < a + len; p++)

75

for (p = a; p < a + len; p++)

72

if (*p == '\n' || p == plast)

76

if (*p == '\n' || p == plast)

73

i++;

77

i++;

74

78

75

*lr = l = (struct line *)malloc(sizeof(struct line) * i);

79

*lr = l = (struct line *)malloc(sizeof(struct line) * i);

76

if (!l)

80

if (!l)

77

return -1;

81

return -1;

78

82

79

/* build the line array and calculate hashes */

83

/* build the line array and calculate hashes */

80

h = 0;

84

h = 0;

81

for (p = a; p < a + len; p++) {

85

for (p = a; p < a + len; p++) {

82

/* Leonid Yuriev's hash */

86

/* Leonid Yuriev's hash */

83

h = (h * 1664525) + *p + 1013904223;

87

h = (h * 1664525) + *p + 1013904223;

84

88

85

if (*p == '\n' || p == plast) {

89

if (*p == '\n' || p == plast) {

86

l->h = h;

90

l->h = h;

87

h = 0;

91

h = 0;

88

l->len = p - b + 1;

92

l->len = p - b + 1;

89

l->l = b;

93

l->l = b;

90

l->n = INT_MAX;

94

l->n = INT_MAX;

91

l++;

95

l++;

92

b = p + 1;

96

b = p + 1;

93

}

97

}

94

}

98

}

95

99

96

/* set up a sentinel */

100

/* set up a sentinel */

97

l->h = l->len = 0;

101

l->h = l->len = 0;

98

l->l = a + len;

102

l->l = a + len;

99

return i - 1;

103

return i - 1;

100

}

104

}

101

105

102

int inline cmp(struct line *a, struct line *b)

106

int inline cmp(struct line *a, struct line *b)

103

{

107

{

104

return a->h != b->h || a->len != b->len || memcmp(a->l, b->l, a->len);

108

return a->h != b->h || a->len != b->len || memcmp(a->l, b->l, a->len);

105

}

109

}

106

110

107

static int equatelines(struct line *a, int an, struct line *b, int bn)

111

static int equatelines(struct line *a, int an, struct line *b, int bn)

108

{

112

{

109

int i, j, buckets = 1, t, scale;

113

int i, j, buckets = 1, t, scale;

110

struct pos *h = NULL;

114

struct pos *h = NULL;

111

115

112

/* build a hash table of the next highest power of 2 */

116

/* build a hash table of the next highest power of 2 */

113

while (buckets < bn + 1)

117

while (buckets < bn + 1)

114

buckets *= 2;

118

buckets *= 2;

115

119

116

/* try to allocate a large hash table to avoid collisions */

120

/* try to allocate a large hash table to avoid collisions */

117

for (scale = 4; scale; scale /= 2) {

121

for (scale = 4; scale; scale /= 2) {

118

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

122

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

119

if (h)

123

if (h)

120

break;

124

break;

121

}

125

}

122

126

123

if (!h)

127

if (!h)

124

return 0;

128

return 0;

125

129

126

buckets = buckets * scale - 1;

130

buckets = buckets * scale - 1;

127

131

128

/* clear the hash table */

132

/* clear the hash table */

129

for (i = 0; i <= buckets; i++) {

133

for (i = 0; i <= buckets; i++) {

130

h[i].pos = INT_MAX;

134

h[i].pos = INT_MAX;

131

h[i].len = 0;

135

h[i].len = 0;

132

}

136

}

133

137

134

/* add lines to the hash table chains */

138

/* add lines to the hash table chains */

135

for (i = bn - 1; i >= 0; i--) {

139

for (i = bn - 1; i >= 0; i--) {

136

/* find the equivalence class */

140

/* find the equivalence class */

137

for (j = b[i].h & buckets; h[j].pos != INT_MAX;

141

for (j = b[i].h & buckets; h[j].pos != INT_MAX;

138

j = (j + 1) & buckets)

142

j = (j + 1) & buckets)

139

if (!cmp(b + i, b + h[j].pos))

143

if (!cmp(b + i, b + h[j].pos))

140

break;

144

break;

141

145

142

/* add to the head of the equivalence class */

146

/* add to the head of the equivalence class */

143

b[i].n = h[j].pos;

147

b[i].n = h[j].pos;

144

b[i].e = j;

148

b[i].e = j;

145

h[j].pos = i;

149

h[j].pos = i;

146

h[j].len++; /* keep track of popularity */

150

h[j].len++; /* keep track of popularity */

147

}

151

}

148

152

149

/* compute popularity threshold */

153

/* compute popularity threshold */

150

t = (bn >= 4000) ? bn / 1000 : bn + 1;

154

t = (bn >= 4000) ? bn / 1000 : bn + 1;

151

155

152

/* match items in a to their equivalence class in b */

156

/* match items in a to their equivalence class in b */

153

for (i = 0; i < an; i++) {

157

for (i = 0; i < an; i++) {

154

/* find the equivalence class */

158

/* find the equivalence class */

155

for (j = a[i].h & buckets; h[j].pos != INT_MAX;

159

for (j = a[i].h & buckets; h[j].pos != INT_MAX;

156

j = (j + 1) & buckets)

160

j = (j + 1) & buckets)

157

if (!cmp(a + i, b + h[j].pos))

161

if (!cmp(a + i, b + h[j].pos))

158

break;

162

break;

159

163

160

a[i].e = j; /* use equivalence class for quick compare */

164

a[i].e = j; /* use equivalence class for quick compare */

161

if (h[j].len <= t)

165

if (h[j].len <= t)

162

a[i].n = h[j].pos; /* point to head of match list */

166

a[i].n = h[j].pos; /* point to head of match list */

163

else

167

else

164

a[i].n = INT_MAX; /* too popular */

168

a[i].n = INT_MAX; /* too popular */

165

}

169

}

166

170

167

/* discard hash tables */

171

/* discard hash tables */

168

free(h);

172

free(h);

169

return 1;

173

return 1;

170

}

174

}

171

175

172

static int longest_match(struct line *a, struct line *b, struct pos *pos,

176

static int longest_match(struct line *a, struct line *b, struct pos *pos,

173

int a1, int a2, int b1, int b2, int *omi, int *omj)

177

int a1, int a2, int b1, int b2, int *omi, int *omj)

174

{

178

{

175

int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;

179

int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;

176

180

177

for (i = a1; i < a2; i++) {

181

for (i = a1; i < a2; i++) {

178

/* skip things before the current block */

182

/* skip things before the current block */

179

for (j = a[i].n; j < b1; j = b[j].n)

183

for (j = a[i].n; j < b1; j = b[j].n)

180

;

184

;

181

185

182

/* loop through all lines match a[i] in b */

186

/* loop through all lines match a[i] in b */

183

for (; j < b2; j = b[j].n) {

187

for (; j < b2; j = b[j].n) {

184

/* does this extend an earlier match? */

188

/* does this extend an earlier match? */

185

if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)

189

if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)

186

k = pos[j - 1].len + 1;

190

k = pos[j - 1].len + 1;

187

else

191

else

188

k = 1;

192

k = 1;

189

pos[j].pos = i;

193

pos[j].pos = i;

190

pos[j].len = k;

194

pos[j].len = k;

191

195

192

/* best match so far? */

196

/* best match so far? */

193

if (k > mk) {

197

if (k > mk) {

194

mi = i;

198

mi = i;

195

mj = j;

199

mj = j;

196

mk = k;

200

mk = k;

197

}

201

}

198

}

202

}

199

}

203

}

200

204

201

if (mk) {

205

if (mk) {

202

mi = mi - mk + 1;

206

mi = mi - mk + 1;

203

mj = mj - mk + 1;

207

mj = mj - mk + 1;

204

}

208

}

205

209

206

/* expand match to include neighboring popular lines */

210

/* expand match to include neighboring popular lines */

207

while (mi - mb > a1 && mj - mb > b1 &&

211

while (mi - mb > a1 && mj - mb > b1 &&

208

a[mi - mb - 1].e == b[mj - mb - 1].e)

212

a[mi - mb - 1].e == b[mj - mb - 1].e)

209

mb++;

213

mb++;

210

while (mi + mk < a2 && mj + mk < b2 &&

214

while (mi + mk < a2 && mj + mk < b2 &&

211

a[mi + mk].e == b[mj + mk].e)

215

a[mi + mk].e == b[mj + mk].e)

212

mk++;

216

mk++;

213

217

214

*omi = mi - mb;

218

*omi = mi - mb;

215

*omj = mj - mb;

219

*omj = mj - mb;

216

220

217

return mk + mb;

221

return mk + mb;

218

}

222

}

219

223

220

static void recurse(struct line *a, struct line *b, struct pos *pos,

224

static void recurse(struct line *a, struct line *b, struct pos *pos,

221

int a1, int a2, int b1, int b2, struct hunklist *l)

225

int a1, int a2, int b1, int b2, struct hunklist *l)

222

{

226

{

223

int i, j, k;

227

int i, j, k;

224

228

225

/* find the longest match in this chunk */

229

/* find the longest match in this chunk */

226

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

230

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

227

if (!k)

231

if (!k)

228

return;

232

return;

229

233

230

/* and recurse on the remaining chunks on either side */

234

/* and recurse on the remaining chunks on either side */

231

recurse(a, b, pos, a1, i, b1, j, l);

235

recurse(a, b, pos, a1, i, b1, j, l);

232

l->head->a1 = i;

236

l->head->a1 = i;

233

l->head->a2 = i + k;

237

l->head->a2 = i + k;

234

l->head->b1 = j;

238

l->head->b1 = j;

235

l->head->b2 = j + k;

239

l->head->b2 = j + k;

236

l->head++;

240

l->head++;

237

recurse(a, b, pos, i + k, a2, j + k, b2, l);

241

recurse(a, b, pos, i + k, a2, j + k, b2, l);

238

}

242

}

239

243

240

static struct hunklist diff(struct line *a, int an, struct line *b, int bn)

244

static struct hunklist diff(struct line *a, int an, struct line *b, int bn)

241

{

245

{

242

struct hunklist l;

246

struct hunklist l;

243

struct hunk *curr;

247

struct hunk *curr;

244

struct pos *pos;

248

struct pos *pos;

245

int t;

249

int t;

246

250

247

/* allocate and fill arrays */

251

/* allocate and fill arrays */

248

t = equatelines(a, an, b, bn);

252

t = equatelines(a, an, b, bn);

249

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

253

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

250

/* we can't have more matches than lines in the shorter file */

254

/* we can't have more matches than lines in the shorter file */

251

l.head = l.base = (struct hunk *)malloc(sizeof(struct hunk) *

255

l.head = l.base = (struct hunk *)malloc(sizeof(struct hunk) *

252

((an<bn ? an:bn) + 1));

256

((an<bn ? an:bn) + 1));

253

257

254

if (pos && l.base && t) {

258

if (pos && l.base && t) {

255

/* generate the matching block list */

259

/* generate the matching block list */

256

recurse(a, b, pos, 0, an, 0, bn, &l);

260

recurse(a, b, pos, 0, an, 0, bn, &l);

257

l.head->a1 = l.head->a2 = an;

261

l.head->a1 = l.head->a2 = an;

258

l.head->b1 = l.head->b2 = bn;

262

l.head->b1 = l.head->b2 = bn;

259

l.head++;

263

l.head++;

260

}

264

}

261

265

262

free(pos);

266

free(pos);

263

267

264

/* normalize the hunk list, try to push each hunk towards the end */

268

/* normalize the hunk list, try to push each hunk towards the end */

265

for (curr = l.base; curr != l.head; curr++) {

269

for (curr = l.base; curr != l.head; curr++) {

266

struct hunk *next = curr+1;

270

struct hunk *next = curr+1;

267

int shift = 0;

271

int shift = 0;

268

272

269

if (next == l.head)

273

if (next == l.head)

270

break;

274

break;

271

275

272

if (curr->a2 == next->a1)

276

if (curr->a2 == next->a1)

273

while (curr->a2+shift < an && curr->b2+shift < bn

277

while (curr->a2+shift < an && curr->b2+shift < bn

274

&& !cmp(a+curr->a2+shift, b+curr->b2+shift))

278

&& !cmp(a+curr->a2+shift, b+curr->b2+shift))

275

shift++;

279

shift++;

276

else if (curr->b2 == next->b1)

280

else if (curr->b2 == next->b1)

277

while (curr->b2+shift < bn && curr->a2+shift < an

281

while (curr->b2+shift < bn && curr->a2+shift < an

278

&& !cmp(b+curr->b2+shift, a+curr->a2+shift))

282

&& !cmp(b+curr->b2+shift, a+curr->a2+shift))

279

shift++;

283

shift++;

280

if (!shift)

284

if (!shift)

281

continue;

285

continue;

282

curr->b2 += shift;

286

curr->b2 += shift;

283

next->b1 += shift;

287

next->b1 += shift;

284

curr->a2 += shift;

288

curr->a2 += shift;

285

next->a1 += shift;

289

next->a1 += shift;

286

}

290

}

287

291

288

return l;

292

return l;

289

}

293

}

290

294

291

static PyObject *blocks(PyObject *self, PyObject *args)

295

static PyObject *blocks(PyObject *self, PyObject *args)

292

{

296

{

293

PyObject *sa, *sb, *rl = NULL, *m;

297

PyObject *sa, *sb, *rl = NULL, *m;

294

struct line *a, *b;

298

struct line *a, *b;

295

struct hunklist l = {NULL, NULL};

299

struct hunklist l = {NULL, NULL};

296

struct hunk *h;

300

struct hunk *h;

297

int an, bn, pos = 0;

301

int an, bn, pos = 0;

298

302

299

if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))

303

if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))

300

return NULL;

304

return NULL;

301

305

302

an = splitlines(PyString_AsString(sa), PyString_Size(sa), &a);

306

an = splitlines(PyString_AsString(sa), PyString_Size(sa), &a);

303

bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &b);

307

bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &b);

304

if (!a || !b)

308

if (!a || !b)

305

goto nomem;

309

goto nomem;

306

310

307

l = diff(a, an, b, bn);

311

l = diff(a, an, b, bn);

308

rl = PyList_New(l.head - l.base);

312

rl = PyList_New(l.head - l.base);

309

if (!l.head || !rl)

313

if (!l.head || !rl)

310

goto nomem;

314

goto nomem;

311

315

312

for (h = l.base; h != l.head; h++) {

316

for (h = l.base; h != l.head; h++) {

313

m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);

317

m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);

314

PyList_SetItem(rl, pos, m);

318

PyList_SetItem(rl, pos, m);

315

pos++;

319

pos++;

316

}

320

}

317

321

318

nomem:

322

nomem:

319

free(a);

323

free(a);

320

free(b);

324

free(b);

321

free(l.base);

325

free(l.base);

322

return rl ? rl : PyErr_NoMemory();

326

return rl ? rl : PyErr_NoMemory();

323

}

327

}

324

328

325

static PyObject *bdiff(PyObject *self, PyObject *args)

329

static PyObject *bdiff(PyObject *self, PyObject *args)

326

{

330

{

327

char *sa, *sb;

331

char *sa, *sb;

328

PyObject *result = NULL;

332

PyObject *result = NULL;

329

struct line *al, *bl;

333

struct line *al, *bl;

330

struct hunklist l = {NULL, NULL};

334

struct hunklist l = {NULL, NULL};

331

struct hunk *h;

335

struct hunk *h;

332

char encode[12], *rb;

336

char encode[12], *rb;

333

int an, bn, len = 0, la, lb;

337

int an, bn, len = 0, la, lb;

334

338

335

if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))

339

if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))

336

return NULL;

340

return NULL;

337

341

338

an = splitlines(sa, la, &al);

342

an = splitlines(sa, la, &al);

339

bn = splitlines(sb, lb, &bl);

343

bn = splitlines(sb, lb, &bl);

340

if (!al || !bl)

344

if (!al || !bl)

341

goto nomem;

345

goto nomem;

342

346

343

l = diff(al, an, bl, bn);

347

l = diff(al, an, bl, bn);

344

if (!l.head)

348

if (!l.head)

345

goto nomem;

349

goto nomem;

346

350

347

/* calculate length of output */

351

/* calculate length of output */

348

la = lb = 0;

352

la = lb = 0;

349

for (h = l.base; h != l.head; h++) {

353

for (h = l.base; h != l.head; h++) {

350

if (h->a1 != la || h->b1 != lb)

354

if (h->a1 != la || h->b1 != lb)

351

len += 12 + bl[h->b1].l - bl[lb].l;

355

len += 12 + bl[h->b1].l - bl[lb].l;

352

la = h->a2;

356

la = h->a2;

353

lb = h->b2;

357

lb = h->b2;

354

}

358

}

355

359

356

result = PyString_FromStringAndSize(NULL, len);

360

result = PyString_FromStringAndSize(NULL, len);

357

if (!result)

361

if (!result)

358

goto nomem;

362

goto nomem;

359

363

360

/* build binary patch */

364

/* build binary patch */

361

rb = PyString_AsString(result);

365

rb = PyString_AsString(result);

362

la = lb = 0;

366

la = lb = 0;

363

367

364

for (h = l.base; h != l.head; h++) {

368

for (h = l.base; h != l.head; h++) {

365

if (h->a1 != la || h->b1 != lb) {

369

if (h->a1 != la || h->b1 != lb) {

366

len = bl[h->b1].l - bl[lb].l;

370

len = bl[h->b1].l - bl[lb].l;

367

*(uint32_t *)(encode) = htonl(al[la].l - al->l);

371

*(uint32_t *)(encode) = htonl(al[la].l - al->l);

368

*(uint32_t *)(encode + 4) = htonl(al[h->a1].l - al->l);

372

*(uint32_t *)(encode + 4) = htonl(al[h->a1].l - al->l);

369

*(uint32_t *)(encode + 8) = htonl(len);

373

*(uint32_t *)(encode + 8) = htonl(len);

370

memcpy(rb, encode, 12);

374

memcpy(rb, encode, 12);

371

memcpy(rb + 12, bl[lb].l, len);

375

memcpy(rb + 12, bl[lb].l, len);

372

rb += 12 + len;

376

rb += 12 + len;

373

}

377

}

374

la = h->a2;

378

la = h->a2;

375

lb = h->b2;

379

lb = h->b2;

376

}

380

}

377

381

378

nomem:

382

nomem:

379

free(al);

383

free(al);

380

free(bl);

384

free(bl);

381

free(l.base);

385

free(l.base);

382

return result ? result : PyErr_NoMemory();

386

return result ? result : PyErr_NoMemory();

383

}

387

}

384

388

385

static char mdiff_doc[] = "Efficient binary diff.";

389

static char mdiff_doc[] = "Efficient binary diff.";

386

390

387

static PyMethodDef methods[] = {

391

static PyMethodDef methods[] = {

388

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

392

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

389

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

393

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

390

{NULL, NULL}

394

{NULL, NULL}

391

};

395

};

392

396

393

PyMODINIT_FUNC initbdiff(void)

397

PyMODINIT_FUNC initbdiff(void)

394

{

398

{

395

Py_InitModule3("bdiff", methods, mdiff_doc);

399

Py_InitModule3("bdiff", methods, mdiff_doc);

396

}

400

}

397

401

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             /*
              bdiff.c - efficient binary diff extension for Mercurial
              Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
              This software may be used and distributed according to the terms of
              the GNU General Public License, incorporated herein by reference.
              Based roughly on Python difflib
             */
             #include <Python.h>
             #include <stdlib.h>
             #include <string.h>
             #include <limits.h>
             #if defined __hpux || defined __SUNPRO_C || defined _AIX
             # define inline
             #endif
+            #ifdef __linux
+            # define inline __inline
+            #endif
             #ifdef _WIN32
             #ifdef _MSC_VER
             #define inline __inline
             typedef unsigned long uint32_t;
             #else
             #include <stdint.h>
             #endif
             /*
              * Local htonl() used on the _WIN32 branch of this file.
              * Unconditionally reverses the order of the four low bytes of x.
              */
             static uint32_t htonl(uint32_t x)
             {
             	uint32_t from_byte0 = (x & 0x000000ffUL) << 24;
             	uint32_t from_byte1 = (x & 0x0000ff00UL) <<  8;
             	uint32_t from_byte2 = (x & 0x00ff0000UL) >>  8;
             	uint32_t from_byte3 = (x & 0xff000000UL) >> 24;

             	return from_byte0 | from_byte1 | from_byte2 | from_byte3;
             }
             #else
             #include <sys/types.h>
             #if defined __BEOS__ && !defined __HAIKU__
             #include <ByteOrder.h>
             #else
             #include <arpa/inet.h>
             #endif
             #include <inttypes.h>
             #endif
             /*
              * One line of an input buffer, as filled in by splitlines() and later
              * annotated by equatelines().
              */
             struct line {
             	/*
             	 * h:   hash of the line's bytes
             	 * len: length of the line in bytes (the terminating '\n', if any,
             	 *      is counted)
             	 * n:   index in b of the next line of the same equivalence class,
             	 *      or INT_MAX when there is none
             	 * e:   hash-table slot chosen by equatelines(); equal lines share
             	 *      the same slot, so it serves as an equivalence-class id
             	 */
             	int h, len, n, e;
             	/* first byte of the line inside the caller's buffer (not a copy) */
             	const char *l;
             };
             /*
              * Pair of ints used in two roles:
              *  - hash-table slot in equatelines(): pos is the index in b of the head
              *    of the class chain (INT_MAX for an empty slot), len is the number
              *    of lines of b in that class;
              *  - per-line scratch in longest_match(): pos is the index in a last
              *    matched against this line of b, len is the length of the run of
              *    consecutive matches ending there.
              */
             struct pos {
             	int pos, len;
             };
             /*
              * A matching block: lines a1 .. a2-1 of the first input are equal to
              * lines b1 .. b2-1 of the second input (indices into the line arrays).
              */
             struct hunk {
             	int a1, a2, b1, b2;
             };
             /*
              * Growable-by-pointer list of hunks: base is the start of the allocated
              * array, head points one past the last hunk written so far.
              */
             struct hunklist {
             	struct hunk *base, *head;
             };
             /*
              * Split the len bytes starting at a into lines.
              *
              * A line ends at each '\n' and at the last byte of the buffer.  A newly
              * malloc'ed array of struct line is stored in *lr; it holds one entry per
              * line followed by a sentinel entry (h == 0, len == 0, l == a + len).
              * The entries point into the caller's buffer; nothing is copied.
              *
              * Returns the number of lines (sentinel excluded), or -1 with *lr set to
              * NULL when the allocation fails.  The caller frees *lr.
              */
             int splitlines(const char *a, int len, struct line **lr)
             {
             	const char *end = a + len;
             	const char *p, *start;
             	struct line *l;
             	/* unsigned so that the multiply/add wraps instead of overflowing */
             	unsigned int hash;
             	int count;

             	/* count the lines */
             	count = 1; /* extra line for sentinel */
             	for (p = a; p < end; p++)
             		/* "p + 1 == end" marks the last byte without ever forming
             		 * a pointer in front of the buffer when len is 0 */
             		if (*p == '\n' || p + 1 == end)
             			count++;

             	*lr = l = malloc(sizeof(*l) * (size_t)count);
             	if (!l)
             		return -1;

             	/* build the line array and calculate hashes */
             	hash = 0;
             	start = a;
             	for (p = a; p < end; p++) {
             		/* Leonid Yuriev's hash */
             		hash = hash * 1664525U + (unsigned int)*p + 1013904223U;

             		if (*p == '\n' || p + 1 == end) {
             			l->h = (int)hash;
             			hash = 0;
             			l->len = (int)(p - start + 1);
             			l->l = start;
             			l->n = INT_MAX;
             			l++;
             			start = p + 1;
             		}
             	}

             	/* set up a sentinel */
             	l->h = l->len = 0;
             	l->l = end;
             	return count - 1;
             }
             /*
              * Compare two lines.  Returns 0 when they have the same hash, the same
              * length and the same bytes, and non-zero otherwise.
              *
              * Declared static: a plain "inline" definition provides no external
              * definition in C99 and later, so a call that the compiler chooses not
              * to inline would otherwise fail to link.
              */
             static inline int cmp(struct line *a, struct line *b)
             {
             	/* the cheap hash and length tests short-circuit the memcmp */
             	return a->h != b->h || a->len != b->len || memcmp(a->l, b->l, a->len);
             }
             /*
              * Group equal lines into equivalence classes using a temporary
              * open-addressing hash table built over b.
              *
              * On return every line of b has n linking to the next line of b in the
              * same class (INT_MAX at the end) and e set to the class's table slot;
              * every line of a has e set to the slot it probed to and n set to the
              * head of the matching chain in b, or INT_MAX when there is no match or
              * the class is too popular.
              *
              * Returns 1 on success, 0 when the hash table could not be allocated.
              */
             static int equatelines(struct line *a, int an, struct line *b, int bn)
             {
             	int i, j, buckets = 1, t, scale;
             	struct pos *h = NULL;
             	/* build a hash table of the next highest power of 2 */
             	/* at least bn + 1 slots, so an empty slot always exists and the
             	 * probe loops below terminate */
             	while (buckets < bn + 1)
             		buckets *= 2;
             	/* try to allocate a large hash table to avoid collisions */
             	/* tries 4x, 2x, then 1x the base size; scale reaches 0 only when
             	 * every attempt failed */
             	for (scale = 4; scale; scale /= 2) {
             		h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
             		if (h)
             			break;
             	}
             	if (!h)
             		return 0;
             	/* from here on, buckets is the index mask (table size - 1) */
             	buckets = buckets * scale - 1;
             	/* clear the hash table */
             	for (i = 0; i <= buckets; i++) {
             		h[i].pos = INT_MAX;
             		h[i].len = 0;
             	}
             	/* add lines to the hash table chains */
             	/* walking b backwards leaves the lowest index at the head of each
             	 * chain, with n links in increasing index order */
             	for (i = bn - 1; i >= 0; i--) {
             		/* find the equivalence class */
             		/* linear probing: stop at an empty slot or at a slot whose
             		 * head line equals b[i] */
             		for (j = b[i].h & buckets; h[j].pos != INT_MAX;
             		     j = (j + 1) & buckets)
             			if (!cmp(b + i, b + h[j].pos))
             				break;
             		/* add to the head of the equivalence class */
             		b[i].n = h[j].pos;
             		b[i].e = j;
             		h[j].pos = i;
             		h[j].len++; /* keep track of popularity */
             	}
             	/* compute popularity threshold */
             	/* below 4000 lines the threshold is bn + 1, which no class can
             	 * exceed, so nothing is treated as too popular */
             	t = (bn >= 4000) ? bn / 1000 : bn + 1;
             	/* match items in a to their equivalence class in b */
             	for (i = 0; i < an; i++) {
             		/* find the equivalence class */
             		for (j = a[i].h & buckets; h[j].pos != INT_MAX;
             		     j = (j + 1) & buckets)
             			if (!cmp(a + i, b + h[j].pos))
             				break;
             		a[i].e = j; /* use equivalence class for quick compare */
             		/* an empty slot has pos == INT_MAX and len == 0, so a line
             		 * with no match in b also ends up with n == INT_MAX */
             		if (h[j].len <= t)
             			a[i].n = h[j].pos; /* point to head of match list */
             		else
             			a[i].n = INT_MAX; /* too popular */
             	}
             	/* discard hash tables */
             	free(h);
             	return 1;
             }
             /*
              * Find the longest run of consecutive matching lines between
              * a[a1 .. a2-1] and b[b1 .. b2-1], following the n chains set up by
              * equatelines().  pos is scratch space indexed by line number in b.
              *
              * The start of the run is stored in *omi (index in a) and *omj (index in
              * b); the return value is its length, 0 when nothing matched.
              */
             static int longest_match(struct line *a, struct line *b, struct pos *pos,
             			 int a1, int a2, int b1, int b2, int *omi, int *omj)
             {
             	/* mi/mj: end (later start) of the best run, mk: its length,
             	 * mb: number of lines added in front of it by the expansion */
             	int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;
             	for (i = a1; i < a2; i++) {
             		/* skip things before the current block */
             		for (j = a[i].n; j < b1; j = b[j].n)
             			;
             		/* loop through all lines that match a[i] in b */
             		/* the INT_MAX chain terminator is never below b2, so it
             		 * ends this loop as well */
             		for (; j < b2; j = b[j].n) {
             			/* does this extend an earlier match? */
             			if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)
             				k = pos[j - 1].len + 1;
             			else
             				k = 1;
             			pos[j].pos = i;
             			pos[j].len = k;
             			/* best match so far? */
             			/* strict comparison: the first run of a given length
             			 * found is the one kept */
             			if (k > mk) {
             				mi = i;
             				mj = j;
             				mk = k;
             			}
             		}
             	}
             	/* mi/mj currently name the last line of the run; move them to
             	 * its first line */
             	if (mk) {
             		mi = mi - mk + 1;
             		mj = mj - mk + 1;
             	}
             	/* expand match to include neighboring popular lines */
             	/* lines of a with n == INT_MAX were not followed above; compare
             	 * class ids (e) to pick them up on either side of the run */
             	while (mi - mb > a1 && mj - mb > b1 &&
             	       a[mi - mb - 1].e == b[mj - mb - 1].e)
             		mb++;
             	while (mi + mk < a2 && mj + mk < b2 &&
             	       a[mi + mk].e == b[mj + mk].e)
             		mk++;
             	*omi = mi - mb;
             	*omj = mj - mb;
             	return mk + mb;
             }
             static void recurse(struct line *a, struct line *b, struct pos *pos,
             		    int a1, int a2, int b1, int b2, struct hunklist *l)
             {
             	int i, j, k;
             	/* find the longest match in this chunk */
             	k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);
             	if (!k)
             		return;
             	/* and recurse on the remaining chunks on either side */
             	recurse(a, b, pos, a1, i, b1, j, l);
             	l->head->a1 = i;
             	l->head->a2 = i + k;
             	l->head->b1 = j;
             	l->head->b2 = j + k;
             	l->head++;
             	recurse(a, b, pos, i + k, a2, j + k, b2, l);
             }
             /*
              * Compute the list of matching blocks between the line arrays a (an
              * lines) and b (bn lines).
              *
              * On success the returned list holds the matching hunks in order,
              * followed by a terminating empty hunk at (an, an, bn, bn); the caller
              * frees l.base.  On any allocation failure the list is returned with
              * base and head both NULL, so callers that test "!l.head" see the
              * failure instead of mistaking it for an empty result.
              */
             static struct hunklist diff(struct line *a, int an, struct line *b, int bn)
             {
             	struct hunklist l;
             	struct hunk *curr;
             	struct pos *pos;
             	int t;

             	/* allocate and fill arrays */
             	t = equatelines(a, an, b, bn);
             	pos = calloc(bn ? bn : 1, sizeof(*pos));
             	/* we can't have more matches than lines in the shorter file;
             	 * the extra slot is for the terminating hunk */
             	l.head = l.base = malloc(sizeof(*l.base) *
             	                         ((an < bn ? an : bn) + 1));

             	if (!t || !pos || !l.base) {
             		/* out of memory somewhere above: release whatever was
             		 * obtained and report failure through a NULL list */
             		free(pos);
             		free(l.base);
             		l.head = l.base = NULL;
             		return l;
             	}

             	/* generate the matching block list */
             	recurse(a, b, pos, 0, an, 0, bn, &l);
             	l.head->a1 = l.head->a2 = an;
             	l.head->b1 = l.head->b2 = bn;
             	l.head++;

             	free(pos);

             	/* normalize the hunk list, try to push each hunk towards the end */
             	for (curr = l.base; curr != l.head; curr++) {
             		struct hunk *next = curr+1;
             		int shift = 0;

             		if (next == l.head)
             			break;

             		/* when the two hunks touch on one side, count how many lines
             		 * right after curr still match and can be absorbed into it */
             		if (curr->a2 == next->a1)
             			while (curr->a2+shift < an && curr->b2+shift < bn
             			       && !cmp(a+curr->a2+shift, b+curr->b2+shift))
             				shift++;
             		else if (curr->b2 == next->b1)
             			while (curr->b2+shift < bn && curr->a2+shift < an
             			       && !cmp(b+curr->b2+shift, a+curr->a2+shift))
             				shift++;
             		if (!shift)
             			continue;
             		curr->b2 += shift;
             		next->b1 += shift;
             		curr->a2 += shift;
             		next->a1 += shift;
             	}

             	return l;
             }
             /*
              * Python entry point "blocks": takes two string objects and returns a
              * list of (a1, a2, b1, b2) tuples, one per hunk produced by diff().
              *
              * Returns NULL with an exception set when argument parsing fails or
              * memory runs out.
              */
             static PyObject *blocks(PyObject *self, PyObject *args)
             {
             	PyObject *sa, *sb, *rl = NULL, *m;
             	struct line *a, *b;
             	struct hunklist l = {NULL, NULL};
             	struct hunk *h;
             	int an, bn, pos = 0;

             	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
             		return NULL;

             	an = splitlines(PyString_AsString(sa), PyString_Size(sa), &a);
             	bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &b);
             	if (!a || !b)
             		goto nomem;

             	l = diff(a, an, b, bn);
             	/* check the diff before building the list: otherwise a failed
             	 * diff would be reported as a successful empty list */
             	if (!l.head)
             		goto nomem;

             	rl = PyList_New(l.head - l.base);
             	if (!rl)
             		goto nomem;

             	for (h = l.base; h != l.head; h++) {
             		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
             		if (!m) {
             			/* never hand back a list with unfilled slots */
             			Py_DECREF(rl);
             			rl = NULL;
             			goto nomem;
             		}
             		/* PyList_SetItem takes over the reference to m */
             		PyList_SetItem(rl, pos, m);
             		pos++;
             	}

             nomem:
             	free(a);
             	free(b);
             	free(l.base);
             	return rl ? rl : PyErr_NoMemory();
             }
             /*
              * Python entry point "bdiff": takes two byte strings and returns a
              * binary patch, as a string, that turns the first into the second.
              *
              * The patch is a sequence of fragments, one per gap between matching
              * hunks.  Each fragment is a 12-byte header of three 32-bit values
              * passed through htonl() -- start byte offset in the first string, end
              * byte offset in the first string, length of the replacement data --
              * followed by the replacement bytes taken from the second string.
              *
              * Returns NULL with an exception set when argument parsing fails or
              * memory runs out.
              */
             static PyObject *bdiff(PyObject *self, PyObject *args)
             {
             	char *sa, *sb;
             	PyObject *result = NULL;
             	struct line *al, *bl;
             	struct hunklist l = {NULL, NULL};
             	struct hunk *h;
             	char *rb;
             	uint32_t word;
             	int an, bn, len = 0, la, lb;

             	if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
             		return NULL;

             	an = splitlines(sa, la, &al);
             	bn = splitlines(sb, lb, &bl);
             	if (!al || !bl)
             		goto nomem;

             	l = diff(al, an, bl, bn);
             	if (!l.head)
             		goto nomem;

             	/* calculate length of output */
             	/* la/lb are reused from here on as "end of the previous hunk" */
             	la = lb = 0;
             	for (h = l.base; h != l.head; h++) {
             		if (h->a1 != la || h->b1 != lb)
             			len += 12 + bl[h->b1].l - bl[lb].l;
             		la = h->a2;
             		lb = h->b2;
             	}

             	result = PyString_FromStringAndSize(NULL, len);
             	if (!result)
             		goto nomem;

             	/* build binary patch */
             	rb = PyString_AsString(result);
             	la = lb = 0;

             	for (h = l.base; h != l.head; h++) {
             		if (h->a1 != la || h->b1 != lb) {
             			len = bl[h->b1].l - bl[lb].l;
             			/* copy each header word with memcpy: the output
             			 * position has no alignment guarantee, so storing
             			 * through a uint32_t pointer would not be safe */
             			word = htonl(al[la].l - al->l);
             			memcpy(rb, &word, sizeof(word));
             			word = htonl(al[h->a1].l - al->l);
             			memcpy(rb + 4, &word, sizeof(word));
             			word = htonl(len);
             			memcpy(rb + 8, &word, sizeof(word));
             			memcpy(rb + 12, bl[lb].l, len);
             			rb += 12 + len;
             		}
             		la = h->a2;
             		lb = h->b2;
             	}

             nomem:
             	free(al);
             	free(bl);
             	free(l.base);
             	return result ? result : PyErr_NoMemory();
             }
             /* module docstring passed to Py_InitModule3() */
             static char mdiff_doc[] = "Efficient binary diff.";
             /*
              * Method table of the module: Python-visible name, C implementation,
              * calling convention and docstring for each function.  The {NULL, NULL}
              * entry terminates the table.
              */
             static PyMethodDef methods[] = {
             	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
             	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
             	{NULL, NULL}
             };
             /*
              * Module initialisation function: registers the method table and the
              * module docstring under the module name "bdiff".
              */
             PyMODINIT_FUNC initbdiff(void)
             {
             	Py_InitModule3("bdiff", methods, mdiff_doc);
             }