upstream/mercurial-mirror Commit - r29322:66dbdd3c

1

/*

1

/*

2

bdiff.c - efficient binary diff extension for Mercurial

2

bdiff.c - efficient binary diff extension for Mercurial

3

4

5

6

This software may be used and distributed according to the terms of

6

This software may be used and distributed according to the terms of

7

the GNU General Public License, incorporated herein by reference.

7

the GNU General Public License, incorporated herein by reference.

8

9

Based roughly on Python difflib

9

Based roughly on Python difflib

10

*/

10

*/

11

12

#define PY_SSIZE_T_CLEAN

12

#define PY_SSIZE_T_CLEAN

13

#include <Python.h>

13

#include <Python.h>

14

#include <stdlib.h>

14

#include <stdlib.h>

15

#include <string.h>

15

#include <string.h>

16

#include <limits.h>

16

#include <limits.h>

17

18

#include "util.h"

18

#include "util.h"

19

20

struct line {

20

struct line {

21

int hash, n, e;

21

int hash, n, e;

22

Py_ssize_t len;

22

Py_ssize_t len;

23

const char *l;

23

const char *l;

24

};

24

};

25

26

/*
 * A (pos, len) pair.  equatelines() uses it as a hash-table slot and
 * longest_match() uses one per line of b to remember the match ending there.
 */
struct pos {
	int pos;
	int len;
};

29

30

/* A matching block: lines [a1, a2) of a correspond to lines [b1, b2) of b. */
struct hunk;
struct hunk {
	int a1;
	int a2;
	int b1;
	int b2;
	struct hunk *next; /* following block, NULL at the end of the list */
};

35

36

/*
 * Split the buffer a[0..len) into lines and hash each of them.
 *
 * A line ends at a '\n' (which is part of the line) or at the last byte of
 * the buffer.  On success *lr receives a malloc()ed array with one entry per
 * line followed by a zero-length sentinel entry pointing at a + len, and the
 * number of lines (sentinel excluded) is returned; the caller releases the
 * array with free().  On failure (out of memory, or more lines than an int
 * can index) *lr is set to NULL and -1 is returned.
 *
 * The entries point into a; no line data is copied.
 */
static int splitlines(const char *a, Py_ssize_t len, struct line **lr)
{
	unsigned hash;
	int i;
	const char *p, *b = a;
	/* one past the last byte; unlike a + len - 1 this is valid for len == 0 */
	const char * const end = a + len;
	struct line *l;

	/* count the lines */
	i = 1; /* extra line for sentinel */
	for (p = a; p < end; p++) {
		if (*p == '\n' || p + 1 == end) {
			/* line indexes are ints throughout; refuse to wrap */
			if (i == INT_MAX) {
				*lr = NULL;
				return -1;
			}
			i++;
		}
	}

	/* make sure the byte count of the array fits in a size_t */
	if ((size_t)i > (size_t)-1 / sizeof(struct line)) {
		*lr = NULL;
		return -1;
	}

	*lr = l = (struct line *)malloc(sizeof(struct line) * (size_t)i);
	if (!l)
		return -1;

	/* build the line array and calculate hashes */
	hash = 0;
	for (p = a; p < end; p++) {
		/* Leonid Yuriev's hash */
		hash = (hash * 1664525) + (unsigned char)*p + 1013904223;

		if (*p == '\n' || p + 1 == end) {
			l->hash = hash;
			hash = 0;
			l->len = p - b + 1;
			l->l = b;
			l->n = INT_MAX;
			l++;
			b = p + 1;
		}
	}

	/* set up a sentinel */
	l->hash = 0;
	l->len = 0;
	l->l = end;
	return i - 1;
}

77

78

static inline int cmp(struct line *a, struct line *b)

78

static inline int cmp(struct line *a, struct line *b)

79

{

79

{

80

return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);

80

return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);

81

}

81

}

82

83

static int equatelines(struct line *a, int an, struct line *b, int bn)

83

static int equatelines(struct line *a, int an, struct line *b, int bn)

84

{

84

{

85

int i, j, buckets = 1, t, scale;

85

int i, j, buckets = 1, t, scale;

86

struct pos *h = NULL;

86

struct pos *h = NULL;

87

88

/* build a hash table of the next highest power of 2 */

88

/* build a hash table of the next highest power of 2 */

89

while (buckets < bn + 1)

89

while (buckets < bn + 1)

90

buckets *= 2;

90

buckets *= 2;

91

92

/* try to allocate a large hash table to avoid collisions */

92

/* try to allocate a large hash table to avoid collisions */

93

for (scale = 4; scale; scale /= 2) {

93

for (scale = 4; scale; scale /= 2) {

94

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

94

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

95

if (h)

95

if (h)

96

break;

96

break;

97

}

97

}

98

99

if (!h)

99

if (!h)

100

return 0;

100

return 0;

101

102

buckets = buckets * scale - 1;

102

buckets = buckets * scale - 1;

103

104

/* clear the hash table */

104

/* clear the hash table */

105

for (i = 0; i <= buckets; i++) {

105

for (i = 0; i <= buckets; i++) {

106

h[i].pos = -1;

106

h[i].pos = -1;

107

h[i].len = 0;

107

h[i].len = 0;

108

}

108

}

109

110

/* add lines to the hash table chains */

110

/* add lines to the hash table chains */

111

for (i = 0; i < bn; i++) {

111

for (i = 0; i < bn; i++) {

112

/* find the equivalence class */

112

/* find the equivalence class */

113

for (j = b[i].hash & buckets; h[j].pos != -1;

113

for (j = b[i].hash & buckets; h[j].pos != -1;

114

j = (j + 1) & buckets)

114

j = (j + 1) & buckets)

115

if (!cmp(b + i, b + h[j].pos))

115

if (!cmp(b + i, b + h[j].pos))

116

break;

116

break;

117

118

/* add to the head of the equivalence class */

118

/* add to the head of the equivalence class */

119

b[i].n = h[j].pos;

119

b[i].n = h[j].pos;

120

b[i].e = j;

120

b[i].e = j;

121

h[j].pos = i;

121

h[j].pos = i;

122

h[j].len++; /* keep track of popularity */

122

h[j].len++; /* keep track of popularity */

123

}

123

}

124

125

/* compute popularity threshold */

125

/* compute popularity threshold */

126

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

126

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

127

128

/* match items in a to their equivalence class in b */

128

/* match items in a to their equivalence class in b */

129

for (i = 0; i < an; i++) {

129

for (i = 0; i < an; i++) {

130

/* find the equivalence class */

130

/* find the equivalence class */

131

for (j = a[i].hash & buckets; h[j].pos != -1;

131

for (j = a[i].hash & buckets; h[j].pos != -1;

132

j = (j + 1) & buckets)

132

j = (j + 1) & buckets)

133

if (!cmp(a + i, b + h[j].pos))

133

if (!cmp(a + i, b + h[j].pos))

134

break;

134

break;

135

136

a[i].e = j; /* use equivalence class for quick compare */

136

a[i].e = j; /* use equivalence class for quick compare */

137

if (h[j].len <= t)

137

if (h[j].len <= t)

138

a[i].n = h[j].pos; /* point to head of match list */

138

a[i].n = h[j].pos; /* point to head of match list */

139

else

139

else

140

a[i].n = -1; /* too popular */

140

a[i].n = -1; /* too popular */

141

}

141

}

142

143

/* discard hash tables */

143

/* discard hash tables */

144

free(h);

144

free(h);

145

return 1;

145

return 1;

146

}

146

}

147

148

static int longest_match(struct line *a, struct line *b, struct pos *pos,

148

static int longest_match(struct line *a, struct line *b, struct pos *pos,

149

int a1, int a2, int b1, int b2, int *omi, int *omj)

149

int a1, int a2, int b1, int b2, int *omi, int *omj)

150

{

150

{

151

int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k, half;

151

int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k, half;

152

153

/* window our search on large regions to better bound

153

/* window our search on large regions to better bound

154

worst-case performance. by choosing a window at the end, we

154

worst-case performance. by choosing a window at the end, we

155

reduce skipping overhead on the b chains. */

155

reduce skipping overhead on the b chains. */

156

if (a2 - a1 > 30000)

156

if (a2 - a1 > 30000)

157

a1 = a2 - 30000;

157

a1 = a2 - 30000;

158

159

half = (a1 + a2) / 2;

159

half = (a1 + a2) / 2;

160

161

for (i = a1; i < a2; i++) {

161

for (i = a1; i < a2; i++) {

162

/* skip all lines in b after the current block */

162

/* skip all lines in b after the current block */

163

for (j = a[i].n; j >= b2; j = b[j].n)

163

for (j = a[i].n; j >= b2; j = b[j].n)

164

;

164

;

165

166

/* loop through all lines match a[i] in b */

166

/* loop through all lines match a[i] in b */

167

for (; j >= b1; j = b[j].n) {

167

for (; j >= b1; j = b[j].n) {

168

/* does this extend an earlier match? */

168

/* does this extend an earlier match? */

169

if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)

169

for (k = 1; j - k >= b1 && i - k >= a1; k++) {

170

k = pos[j - 1].len + 1;

170

/* reached an earlier match? */

171

else

171

if (pos[j - k].pos == i - k) {

172

k = 1;

172

k += pos[j - k].len;

173

break;

174

}

175

/* previous line mismatch? */

176

if (a[i - k].e != b[j - k].e)

177

break;

178

}

179

173

pos[j].pos = i;

180

pos[j].pos = i;

174

pos[j].len = k;

181

pos[j].len = k;

175

182

176

/* best match so far? we prefer matches closer

183

/* best match so far? we prefer matches closer

177

to the middle to balance recursion */

184

to the middle to balance recursion */

178

if (k > mk || (k == mk && (i <= mi || i < half))) {

185

if (k > mk || (k == mk && (i <= mi || i < half))) {

179

mi = i;

186

mi = i;

180

mj = j;

187

mj = j;

181

mk = k;

188

mk = k;

182

}

189

}

183

}

190

}

184

}

191

}

185

192

186

if (mk) {

193

if (mk) {

187

mi = mi - mk + 1;

194

mi = mi - mk + 1;

188

mj = mj - mk + 1;

195

mj = mj - mk + 1;

189

}

196

}

190

197

191

/* expand match to include neighboring popular lines */

198

/* expand match to include neighboring popular lines */

192

while (mi - mb > a1 && mj - mb > b1 &&

199

while (mi - mb > a1 && mj - mb > b1 &&

193

a[mi - mb - 1].e == b[mj - mb - 1].e)

200

a[mi - mb - 1].e == b[mj - mb - 1].e)

194

mb++;

201

mb++;

195

while (mi + mk < a2 && mj + mk < b2 &&

202

while (mi + mk < a2 && mj + mk < b2 &&

196

a[mi + mk].e == b[mj + mk].e)

203

a[mi + mk].e == b[mj + mk].e)

197

mk++;

204

mk++;

198

205

199

*omi = mi - mb;

206

*omi = mi - mb;

200

*omj = mj - mb;

207

*omj = mj - mb;

201

208

202

return mk + mb;

209

return mk + mb;

203

}

210

}

204

211

205

static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,

212

static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,

206

int a1, int a2, int b1, int b2, struct hunk *l)

213

int a1, int a2, int b1, int b2, struct hunk *l)

207

{

214

{

208

int i, j, k;

215

int i, j, k;

209

216

210

while (1) {

217

while (1) {

211

/* find the longest match in this chunk */

218

/* find the longest match in this chunk */

212

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

219

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

213

if (!k)

220

if (!k)

214

return l;

221

return l;

215

222

216

/* and recurse on the remaining chunks on either side */

223

/* and recurse on the remaining chunks on either side */

217

l = recurse(a, b, pos, a1, i, b1, j, l);

224

l = recurse(a, b, pos, a1, i, b1, j, l);

218

if (!l)

225

if (!l)

219

return NULL;

226

return NULL;

220

227

221

l->next = (struct hunk *)malloc(sizeof(struct hunk));

228

l->next = (struct hunk *)malloc(sizeof(struct hunk));

222

if (!l->next)

229

if (!l->next)

223

return NULL;

230

return NULL;

224

231

225

l = l->next;

232

l = l->next;

226

l->a1 = i;

233

l->a1 = i;

227

l->a2 = i + k;

234

l->a2 = i + k;

228

l->b1 = j;

235

l->b1 = j;

229

l->b2 = j + k;

236

l->b2 = j + k;

230

l->next = NULL;

237

l->next = NULL;

231

238

232

/* tail-recursion didn't happen, so do equivalent iteration */

239

/* tail-recursion didn't happen, so do equivalent iteration */

233

a1 = i + k;

240

a1 = i + k;

234

b1 = j + k;

241

b1 = j + k;

235

}

242

}

236

}

243

}

237

244

238

static int diff(struct line *a, int an, struct line *b, int bn,

245

static int diff(struct line *a, int an, struct line *b, int bn,

239

struct hunk *base)

246

struct hunk *base)

240

{

247

{

241

struct hunk *curr;

248

struct hunk *curr;

242

struct pos *pos;

249

struct pos *pos;

243

int t, count = 0;

250

int t, count = 0;

244

251

245

/* allocate and fill arrays */

252

/* allocate and fill arrays */

246

t = equatelines(a, an, b, bn);

253

t = equatelines(a, an, b, bn);

247

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

254

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

248

255

249

if (pos && t) {

256

if (pos && t) {

250

/* generate the matching block list */

257

/* generate the matching block list */

251

258

252

curr = recurse(a, b, pos, 0, an, 0, bn, base);

259

curr = recurse(a, b, pos, 0, an, 0, bn, base);

253

if (!curr)

260

if (!curr)

254

return -1;

261

return -1;

255

262

256

/* sentinel end hunk */

263

/* sentinel end hunk */

257

curr->next = (struct hunk *)malloc(sizeof(struct hunk));

264

curr->next = (struct hunk *)malloc(sizeof(struct hunk));

258

if (!curr->next)

265

if (!curr->next)

259

return -1;

266

return -1;

260

curr = curr->next;

267

curr = curr->next;

261

curr->a1 = curr->a2 = an;

268

curr->a1 = curr->a2 = an;

262

curr->b1 = curr->b2 = bn;

269

curr->b1 = curr->b2 = bn;

263

curr->next = NULL;

270

curr->next = NULL;

264

}

271

}

265

272

266

free(pos);

273

free(pos);

267

274

268

/* normalize the hunk list, try to push each hunk towards the end */

275

/* normalize the hunk list, try to push each hunk towards the end */

269

for (curr = base->next; curr; curr = curr->next) {

276

for (curr = base->next; curr; curr = curr->next) {

270

struct hunk *next = curr->next;

277

struct hunk *next = curr->next;

271

278

272

if (!next)

279

if (!next)

273

break;

280

break;

274

281

275

if (curr->a2 == next->a1 || curr->b2 == next->b1)

282

if (curr->a2 == next->a1 || curr->b2 == next->b1)

276

while (curr->a2 < an && curr->b2 < bn

283

while (curr->a2 < an && curr->b2 < bn

277

&& next->a1 < next->a2

284

&& next->a1 < next->a2

278

&& next->b1 < next->b2

285

&& next->b1 < next->b2

279

&& !cmp(a + curr->a2, b + curr->b2)) {

286

&& !cmp(a + curr->a2, b + curr->b2)) {

280

curr->a2++;

287

curr->a2++;

281

next->a1++;

288

next->a1++;

282

curr->b2++;

289

curr->b2++;

283

next->b1++;

290

next->b1++;

284

}

291

}

285

}

292

}

286

293

287

for (curr = base->next; curr; curr = curr->next)

294

for (curr = base->next; curr; curr = curr->next)

288

count++;

295

count++;

289

return count;

296

return count;

290

}

297

}

291

298

292

static void freehunks(struct hunk *l)

299

static void freehunks(struct hunk *l)

293

{

300

{

294

struct hunk *n;

301

struct hunk *n;

295

for (; l; l = n) {

302

for (; l; l = n) {

296

n = l->next;

303

n = l->next;

297

free(l);

304

free(l);

298

}

305

}

299

}

306

}

300

307

301

/*
 * blocks(a, b) -> list of (a1, a2, b1, b2) tuples
 *
 * Splits both byte strings into lines, diffs them and returns one tuple per
 * hunk produced by diff(), including the trailing sentinel hunk.  Raises
 * MemoryError when an allocation fails.
 */
static PyObject *blocks(PyObject *self, PyObject *args)
{
	PyObject *sa, *sb, *rl = NULL, *m;
	struct line *a, *b;
	struct hunk l, *h;
	int an, bn, count, pos = 0;

	l.next = NULL;

	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
		return NULL;

	an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
	bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);

	if (!a || !b)
		goto nomem;

	count = diff(a, an, b, bn, &l);
	if (count < 0)
		goto nomem;

	rl = PyList_New(count);
	if (!rl)
		goto nomem;

	for (h = l.next; h; h = h->next) {
		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
		if (!m) {
			/* never hand back a list that contains NULL slots */
			Py_DECREF(rl);
			rl = NULL;
			goto nomem;
		}
		/* PyList_SetItem takes over our reference to m */
		PyList_SetItem(rl, pos, m);
		pos++;
	}

nomem:
	free(a);
	free(b);
	freehunks(l.next);
	return rl ? rl : PyErr_NoMemory();
}

339

346

340

/*
 * bdiff(a, b) -> bytes
 *
 * Builds a binary patch from a to b.  For every gap between consecutive
 * matching hunks the patch holds a 12-byte header -- three big-endian
 * 32-bit values: start and end byte offset of the replaced region in a, and
 * the length of the replacement -- followed by the replacement bytes taken
 * from b.
 *
 * Line splitting and diffing run with the thread state saved via
 * PyEval_SaveThread(); it is restored before any Python object is created.
 */
static PyObject *bdiff(PyObject *self, PyObject *args)
{
	PyThreadState *tstate;
	PyObject *result = NULL;
	struct line *al, *bl;
	struct hunk l, *h;
	char *sa, *sb, *out;
	Py_ssize_t la, lb, ai, bi, total = 0, chunk;
	int an, bn, count;

	l.next = NULL;

	if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
		return NULL;

	/* offsets are written as 32-bit values below */
	if (la > UINT_MAX || lb > UINT_MAX) {
		PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
		return NULL;
	}

	tstate = PyEval_SaveThread();

	an = splitlines(sa, la, &al);
	bn = splitlines(sb, lb, &bl);
	if (!al || !bl)
		goto cleanup;

	count = diff(al, an, bl, bn, &l);
	if (count < 0)
		goto cleanup;

	/* first pass: work out how large the patch will be */
	ai = 0;
	bi = 0;
	for (h = l.next; h; h = h->next) {
		if (h->a1 != ai || h->b1 != bi)
			total += 12 + bl[h->b1].l - bl[bi].l;
		ai = h->a2;
		bi = h->b2;
	}

	PyEval_RestoreThread(tstate);
	tstate = NULL;

	result = PyBytes_FromStringAndSize(NULL, total);
	if (!result)
		goto cleanup;

	/* second pass: emit header + replacement data for every gap */
	out = PyBytes_AsString(result);
	ai = 0;
	bi = 0;
	for (h = l.next; h; h = h->next) {
		if (h->a1 != ai || h->b1 != bi) {
			chunk = bl[h->b1].l - bl[bi].l;
			putbe32((uint32_t)(al[ai].l - al->l), out);
			putbe32((uint32_t)(al[h->a1].l - al->l), out + 4);
			putbe32((uint32_t)chunk, out + 8);
			memcpy(out + 12, bl[bi].l, chunk);
			out += 12 + chunk;
		}
		ai = h->a2;
		bi = h->b2;
	}

cleanup:
	/* still set only if we bailed out before restoring the thread state */
	if (tstate)
		PyEval_RestoreThread(tstate);
	free(al);
	free(bl);
	freehunks(l.next);
	return result ? result : PyErr_NoMemory();
}

411

418

412

/*

419

/*

413

* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,

420

* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,

414

* reduce whitespace sequences to a single space and trim remaining whitespace

421

* reduce whitespace sequences to a single space and trim remaining whitespace

415

* from end of lines.

422

* from end of lines.

416

*/

423

*/

417

static PyObject *fixws(PyObject *self, PyObject *args)

424

static PyObject *fixws(PyObject *self, PyObject *args)

418

{

425

{

419

PyObject *s, *result = NULL;

426

PyObject *s, *result = NULL;

420

char allws, c;

427

char allws, c;

421

const char *r;

428

const char *r;

422

Py_ssize_t i, rlen, wlen = 0;

429

Py_ssize_t i, rlen, wlen = 0;

423

char *w;

430

char *w;

424

431

425

if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))

432

if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))

426

return NULL;

433

return NULL;

427

r = PyBytes_AsString(s);

434

r = PyBytes_AsString(s);

428

rlen = PyBytes_Size(s);

435

rlen = PyBytes_Size(s);

429

436

430

w = (char *)malloc(rlen ? rlen : 1);

437

w = (char *)malloc(rlen ? rlen : 1);

431

if (!w)

438

if (!w)

432

goto nomem;

439

goto nomem;

433

440

434

for (i = 0; i != rlen; i++) {

441

for (i = 0; i != rlen; i++) {

435

c = r[i];

442

c = r[i];

436

if (c == ' ' || c == '\t' || c == '\r') {

443

if (c == ' ' || c == '\t' || c == '\r') {

437

if (!allws && (wlen == 0 || w[wlen - 1] != ' '))

444

if (!allws && (wlen == 0 || w[wlen - 1] != ' '))

438

w[wlen++] = ' ';

445

w[wlen++] = ' ';

439

} else if (c == '\n' && !allws

446

} else if (c == '\n' && !allws

440

&& wlen > 0 && w[wlen - 1] == ' ') {

447

&& wlen > 0 && w[wlen - 1] == ' ') {

441

w[wlen - 1] = '\n';

448

w[wlen - 1] = '\n';

442

} else {

449

} else {

443

w[wlen++] = c;

450

w[wlen++] = c;

444

}

451

}

445

}

452

}

446

453

447

result = PyBytes_FromStringAndSize(w, wlen);

454

result = PyBytes_FromStringAndSize(w, wlen);

448

455

449

nomem:

456

nomem:

450

free(w);

457

free(w);

451

return result ? result : PyErr_NoMemory();

458

return result ? result : PyErr_NoMemory();

452

}

459

}

453

460

454

461

455

static char mdiff_doc[] = "Efficient binary diff.";

462

static char mdiff_doc[] = "Efficient binary diff.";

456

463

457

static PyMethodDef methods[] = {

464

static PyMethodDef methods[] = {

458

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

465

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

459

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

466

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

460

{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},

467

{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},

461

{NULL, NULL}

468

{NULL, NULL}

462

};

469

};

463

470

464

#ifdef IS_PY3K

471

#ifdef IS_PY3K

465

static struct PyModuleDef bdiff_module = {

472

static struct PyModuleDef bdiff_module = {

466

PyModuleDef_HEAD_INIT,

473

PyModuleDef_HEAD_INIT,

467

"bdiff",

474

"bdiff",

468

mdiff_doc,

475

mdiff_doc,

469

-1,

476

-1,

470

methods

477

methods

471

};

478

};

472

479

473

PyMODINIT_FUNC PyInit_bdiff(void)

480

PyMODINIT_FUNC PyInit_bdiff(void)

474

{

481

{

475

return PyModule_Create(&bdiff_module);

482

return PyModule_Create(&bdiff_module);

476

}

483

}

477

#else

484

#else

478

PyMODINIT_FUNC initbdiff(void)

485

PyMODINIT_FUNC initbdiff(void)

479

{

486

{

480

Py_InitModule3("bdiff", methods, mdiff_doc);

487

Py_InitModule3("bdiff", methods, mdiff_doc);

481

}

488

}

482

#endif

489

#endif

483

490

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             /*
              bdiff.c - efficient binary diff extension for Mercurial
              Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
              This software may be used and distributed according to the terms of
              the GNU General Public License, incorporated herein by reference.
              Based roughly on Python difflib
             */
             #define PY_SSIZE_T_CLEAN
             #include <Python.h>
             #include <stdlib.h>
             #include <string.h>
             #include <limits.h>
             #include "util.h"
             /* One input line plus the bookkeeping used to match it against the other side. */
             struct line {
             	int hash;       /* hash of the line's bytes, computed by splitlines() */
             	int n;          /* chain link into b: next candidate line index, -1 for none;
             	                   INT_MAX until equatelines() has run */
             	int e;          /* hash-table slot that identifies the equivalence class */
             	Py_ssize_t len; /* length in bytes, counting the trailing newline if any */
             	const char *l;  /* first byte of the line inside the caller's buffer */
             };
             /*
              * Generic (index, count) pair. equatelines() uses it as a hash bucket
              * (head line index, class population); longest_match() uses it to record,
              * per line of b, which line of a last matched there and how long that
              * run of matches is.
              */
             struct pos {
             	int pos;
             	int len;
             };
             struct hunk;
             /* A matching block: lines [a1, a2) of a equal lines [b1, b2) of b. */
             struct hunk {
             	int a1;
             	int a2;
             	int b1;
             	int b2;
             	struct hunk *next; /* following block in the singly linked list, or NULL */
             };
             /*
              * Split the len bytes starting at a into lines and hash each one.
              *
              * A line ends at a '\n' or at the last byte of the buffer. On success *lr
              * receives a malloc'ed array holding one entry per line followed by a
              * zero-length sentinel entry, and the number of lines (sentinel excluded)
              * is returned. On allocation failure *lr is set to NULL and -1 is returned.
              * The entries point into a; nothing is copied.
              */
             static int splitlines(const char *a, Py_ssize_t len, struct line **lr)
             {
             	unsigned hash;
             	int i;
             	const char *p, *b = a;
             	/* one past the last byte; unlike a + len - 1 this pointer is always
             	   valid to form, even when len is 0 */
             	const char * const end = a + len;
             	struct line *l;
             	/* count the lines */
             	i = 1; /* extra line for sentinel */
             	for (p = a; p < end; p++)
             		if (*p == '\n' || p + 1 == end)
             			i++;
             	*lr = l = (struct line *)malloc(sizeof(struct line) * i);
             	if (!l)
             		return -1;
             	/* build the line array and calculate hashes */
             	hash = 0;
             	for (p = a; p < end; p++) {
             		/* Leonid Yuriev's hash */
             		hash = (hash * 1664525) + (unsigned char)*p + 1013904223;
             		if (*p == '\n' || p + 1 == end) {
             			l->hash = hash;
             			hash = 0;
             			l->len = p - b + 1;
             			l->l = b;
             			l->n = INT_MAX; /* placeholder until equatelines() links it */
             			l++;
             			b = p + 1;
             		}
             	}
             	/* set up a sentinel */
             	l->hash = 0;
             	l->len = 0;
             	l->l = end;
             	return i - 1;
             }
             /*
              * Compare two lines. Returns 0 when hash, length and bytes all agree and
              * 1 otherwise; the cheap fields are checked before the byte comparison.
              */
             static inline int cmp(struct line *a, struct line *b)
             {
             	if (a->hash != b->hash)
             		return 1;
             	if (a->len != b->len)
             		return 1;
             	return memcmp(a->l, b->l, a->len) != 0;
             }
             /*
              * Group identical lines into equivalence classes using a temporary
              * open-addressing hash table built over the lines of b.
              *
              * On return every b[i].e and a[i].e holds the table slot of the line's
              * class, b[i].n links to the previously inserted line of b in the same
              * class (-1 at the end of the chain), and a[i].n is the head of the
              * matching chain in b, or -1 when there is no match or the class has more
              * members than the popularity threshold.
              *
              * Returns 1 on success and 0 if the hash table could not be allocated.
              */
             static int equatelines(struct line *a, int an, struct line *b, int bn)
             {
             	int i, j, buckets = 1, t, scale;
             	struct pos *h = NULL;
             	/* build a hash table of the next highest power of 2 */
             	while (buckets < bn + 1)
             		buckets *= 2;
             	/* try to allocate a large hash table to avoid collisions */
             	/* falls back from 4x to 2x to 1x the base size when malloc fails */
             	for (scale = 4; scale; scale /= 2) {
             		h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
             		if (h)
             			break;
             	}
             	if (!h)
             		return 0;
             	/* from here on buckets is the index mask (table size - 1) */
             	buckets = buckets * scale - 1;
             	/* clear the hash table */
             	for (i = 0; i <= buckets; i++) {
             		h[i].pos = -1;
             		h[i].len = 0;
             	}
             	/* add lines to the hash table chains */
             	for (i = 0; i < bn; i++) {
             		/* find the equivalence class */
             		/* linear probing: stop at an empty slot or at a slot whose
             		   head line is identical to b[i] */
             		for (j = b[i].hash & buckets; h[j].pos != -1;
             		     j = (j + 1) & buckets)
             			if (!cmp(b + i, b + h[j].pos))
             				break;
             		/* add to the head of the equivalence class */
             		b[i].n = h[j].pos;
             		b[i].e = j;
             		h[j].pos = i;
             		h[j].len++; /* keep track of popularity */
             	}
             	/* compute popularity threshold */
             	t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);
             	/* match items in a to their equivalence class in b */
             	for (i = 0; i < an; i++) {
             		/* find the equivalence class */
             		for (j = a[i].hash & buckets; h[j].pos != -1;
             		     j = (j + 1) & buckets)
             			if (!cmp(a + i, b + h[j].pos))
             				break;
             		a[i].e = j; /* use equivalence class for quick compare */
             		if (h[j].len <= t)
             			a[i].n = h[j].pos; /* point to head of match list */
             		else
             			a[i].n = -1; /* too popular */
             	}
             	/* discard hash tables */
             	free(h);
             	return 1;
             }
             /*
              * Find the longest run of matching lines between a[a1..a2) and b[b1..b2).
              *
              * a and b must already have been processed by equatelines(). pos is
              * scratch space with one entry per line of b: pos[j].pos is the line of
              * a most recently matched at b[j] and pos[j].len is the length of the
              * match ending there.
              *
              * The start of the match is stored in *omi (index into a) and *omj
              * (index into b). Returns the number of matched lines, which may be 0.
              */
             static int longest_match(struct line *a, struct line *b, struct pos *pos,
             			 int a1, int a2, int b1, int b2, int *omi, int *omj)
             {
             	int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k, half;
             	/* window our search on large regions to better bound
             	   worst-case performance. by choosing a window at the end, we
             	   reduce skipping overhead on the b chains. */
             	if (a2 - a1 > 30000)
             		a1 = a2 - 30000;
             	half = (a1 + a2) / 2;
             	for (i = a1; i < a2; i++) {
             		/* skip all lines in b after the current block */
             		for (j = a[i].n; j >= b2; j = b[j].n)
             			;
             		/* loop through all lines match a[i] in b */
             		for (; j >= b1; j = b[j].n) {
             			/* does this extend an earlier match? walk backwards
             			   while the preceding lines are in the same
             			   equivalence class, so a match can be extended
             			   across popular lines that have no chain */
             			for (k = 1; j - k >= b1 && i - k >= a1; k++) {
             				/* reached an earlier match? */
             				if (pos[j - k].pos == i - k) {
             					k += pos[j - k].len;
             					break;
             				}
             				/* previous line mismatch? */
             				if (a[i - k].e != b[j - k].e)
             					break;
             			}
             			pos[j].pos = i;
             			pos[j].len = k;
             			/* best match so far? we prefer matches closer
             			   to the middle to balance recursion */
             			if (k > mk || (k == mk && (i <= mi || i < half))) {
             				mi = i;
             				mj = j;
             				mk = k;
             			}
             		}
             	}
             	/* mi/mj currently name the last line of the match; move them
             	   to its first line */
             	if (mk) {
             		mi = mi - mk + 1;
             		mj = mj - mk + 1;
             	}
             	/* expand match to include neighboring popular lines */
             	while (mi - mb > a1 && mj - mb > b1 &&
             	       a[mi - mb - 1].e == b[mj - mb - 1].e)
             		mb++;
             	while (mi + mk < a2 && mj + mk < b2 &&
             	       a[mi + mk].e == b[mj + mk].e)
             		mk++;
             	*omi = mi - mb;
             	*omj = mj - mb;
             	return mk + mb;
             }
             /*
              * Append to the list ending at l the matching blocks found between
              * a[a1..a2) and b[b1..b2), in increasing order.
              *
              * The longest match is located first; the region before it is handled by
              * a recursive call and the region after it by the next loop iteration.
              * Returns the new tail of the list, or NULL if a hunk could not be
              * allocated (hunks appended so far remain linked from the list head).
              */
             static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,
             			    int a1, int a2, int b1, int b2, struct hunk *l)
             {
             	struct hunk *node;
             	int mi, mj, mlen;
             	for (;;) {
             		/* longest match inside the current window */
             		mlen = longest_match(a, b, pos, a1, a2, b1, b2, &mi, &mj);
             		if (mlen == 0)
             			return l;
             		/* everything to the left of the match goes first */
             		l = recurse(a, b, pos, a1, mi, b1, mj, l);
             		if (l == NULL)
             			return NULL;
             		node = (struct hunk *)malloc(sizeof(struct hunk));
             		l->next = node;
             		if (node == NULL)
             			return NULL;
             		node->a1 = mi;
             		node->a2 = mi + mlen;
             		node->b1 = mj;
             		node->b2 = mj + mlen;
             		node->next = NULL;
             		l = node;
             		/* continue with the part to the right of the match */
             		a1 = mi + mlen;
             		b1 = mj + mlen;
             	}
             }
             /*
              * Compute the list of matching blocks between the an lines of a and the
              * bn lines of b.
              *
              * Hunks are appended after base (base itself is only used for its next
              * pointer) and the list always ends with a zero-length sentinel hunk at
              * (an, bn). Returns the number of hunks appended, or -1 if memory could
              * not be allocated; in that case any hunks already linked from base are
              * left for the caller to free.
              */
             static int diff(struct line *a, int an, struct line *b, int bn,
             		 struct hunk *base)
             {
             	struct hunk *curr;
             	struct pos *pos;
             	int t, count = 0;
             	/* allocate and fill arrays */
             	t = equatelines(a, an, b, bn);
             	pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));
             	if (!pos || !t) {
             		/* report the failed allocation instead of returning an
             		   empty hunk list, which callers would read as
             		   "the inputs are identical" */
             		free(pos);
             		return -1;
             	}
             	/* generate the matching block list */
             	curr = recurse(a, b, pos, 0, an, 0, bn, base);
             	/* pos is only scratch space for recurse(); release it before
             	   any error return so the failure paths do not leak it */
             	free(pos);
             	if (!curr)
             		return -1;
             	/* sentinel end hunk */
             	curr->next = (struct hunk *)malloc(sizeof(struct hunk));
             	if (!curr->next)
             		return -1;
             	curr = curr->next;
             	curr->a1 = curr->a2 = an;
             	curr->b1 = curr->b2 = bn;
             	curr->next = NULL;
             	/* normalize the hunk list, try to push each hunk towards the end */
             	for (curr = base->next; curr; curr = curr->next) {
             		struct hunk *next = curr->next;
             		if (!next)
             			break;
             		if (curr->a2 == next->a1 || curr->b2 == next->b1)
             			while (curr->a2 < an && curr->b2 < bn
             			       && next->a1 < next->a2
             			       && next->b1 < next->b2
             			       && !cmp(a + curr->a2, b + curr->b2)) {
             				curr->a2++;
             				next->a1++;
             				curr->b2++;
             				next->b1++;
             			}
             	}
             	for (curr = base->next; curr; curr = curr->next)
             		count++;
             	return count;
             }
             /* Free every hunk of the list starting at l; l may be NULL. */
             static void freehunks(struct hunk *l)
             {
             	while (l != NULL) {
             		/* read the link before the node is released */
             		struct hunk *following = l->next;
             		free(l);
             		l = following;
             	}
             }
             /*
              * Python entry point blocks(a, b): return the matching blocks of two
              * bytes objects as a list of (a1, a2, b1, b2) tuples of line indexes.
              *
              * Returns a new list, or NULL with an exception set on a bad argument or
              * when memory runs out.
              */
             static PyObject *blocks(PyObject *self, PyObject *args)
             {
             	PyObject *sa, *sb, *rl = NULL, *m;
             	struct line *a, *b;
             	struct hunk l, *h;
             	int an, bn, count, pos = 0;
             	l.next = NULL;
             	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
             		return NULL;
             	an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
             	bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
             	if (!a || !b)
             		goto nomem;
             	count = diff(a, an, b, bn, &l);
             	if (count < 0)
             		goto nomem;
             	rl = PyList_New(count);
             	if (!rl)
             		goto nomem;
             	for (h = l.next; h; h = h->next) {
             		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
             		if (!m) {
             			/* never hand back a list with a NULL slot:
             			   drop the partial list and report the failure */
             			Py_DECREF(rl);
             			rl = NULL;
             			goto nomem;
             		}
             		/* PyList_SetItem takes over the reference to m */
             		PyList_SetItem(rl, pos, m);
             		pos++;
             	}
             nomem:
             	free(a);
             	free(b);
             	freehunks(l.next);
             	return rl ? rl : PyErr_NoMemory();
             }
             /*
              * Python entry point bdiff(a, b): return a binary patch, as a bytes
              * object, describing how to turn a into b.
              *
              * For every region that is not part of a matching block the patch holds
              * three 32-bit fields written with putbe32() (start offset in a, end
              * offset in a, length of the replacement) followed by the replacement
              * bytes taken from b.
              * NOTE(review): putbe32 comes from util.h, presumably a big-endian
              * store -- confirm.
              *
              * Returns NULL with an exception set on a bad argument, on oversized
              * input, or when memory runs out.
              */
             static PyObject *bdiff(PyObject *self, PyObject *args)
             {
             	char *sa, *sb, *rb;
             	PyObject *result = NULL;
             	struct line *al, *bl;
             	struct hunk l, *h;
             	int an, bn, count;
             	Py_ssize_t len = 0, la, lb;
             	PyThreadState *_save;
             	l.next = NULL;
             	if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
             		return NULL;
             	/* offsets are emitted as 32-bit values below */
             	if (la > UINT_MAX || lb > UINT_MAX) {
             		PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
             		return NULL;
             	}
             	/* the line splitting and diffing below do not touch Python
             	   objects, so run them with the thread state saved */
             	_save = PyEval_SaveThread();
             	an = splitlines(sa, la, &al);
             	bn = splitlines(sb, lb, &bl);
             	if (!al || !bl)
             		goto nomem;
             	count = diff(al, an, bl, bn, &l);
             	if (count < 0)
             		goto nomem;
             	/* calculate length of output */
             	/* la and lb are reused from here on as line indexes: the end
             	   of the previous matching block in a and in b */
             	la = lb = 0;
             	for (h = l.next; h; h = h->next) {
             		if (h->a1 != la || h->b1 != lb)
             			len += 12 + bl[h->b1].l - bl[lb].l;
             		la = h->a2;
             		lb = h->b2;
             	}
             	/* restore the thread state before creating Python objects;
             	   _save == NULL tells the cleanup code this has been done */
             	PyEval_RestoreThread(_save);
             	_save = NULL;
             	result = PyBytes_FromStringAndSize(NULL, len);
             	if (!result)
             		goto nomem;
             	/* build binary patch */
             	rb = PyBytes_AsString(result);
             	la = lb = 0;
             	for (h = l.next; h; h = h->next) {
             		if (h->a1 != la || h->b1 != lb) {
             			/* bytes of b between the previous block and this one */
             			len = bl[h->b1].l - bl[lb].l;
             			putbe32((uint32_t)(al[la].l - al->l), rb);
             			putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);
             			putbe32((uint32_t)len, rb + 8);
             			memcpy(rb + 12, bl[lb].l, len);
             			rb += 12 + len;
             		}
             		la = h->a2;
             		lb = h->b2;
             	}
             nomem:
             	/* reached on success as well; only restore the thread state
             	   if an early failure jumped here while it was still saved */
             	if (_save)
             		PyEval_RestoreThread(_save);
             	free(al);
             	free(bl);
             	freehunks(l.next);
             	return result ? result : PyErr_NoMemory();
             }
             /*
              * Python entry point fixws(s, allws): return a whitespace-normalized
              * copy of the bytes object s.
              *
              * With allws != 0 every ' ', \t and \r is dropped. Otherwise each run of
              * those characters collapses to one space, and a space directly in front
              * of a newline is removed.
              */
             static PyObject *fixws(PyObject *self, PyObject *args)
             {
             	PyObject *s, *result = NULL;
             	char allws, c;
             	const char *r;
             	Py_ssize_t i, rlen, wlen = 0;
             	char *w;
             	if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))
             		return NULL;
             	r = PyBytes_AsString(s);
             	rlen = PyBytes_Size(s);
             	/* the output is never longer than the input */
             	w = (char *)malloc(rlen ? rlen : 1);
             	if (!w)
             		goto nomem;
             	for (i = 0; i != rlen; i++) {
             		c = r[i];
             		if (c == ' ' || c == '\t' || c == '\r') {
             			/* dropped entirely, or folded into a preceding space */
             			if (allws)
             				continue;
             			if (wlen > 0 && w[wlen - 1] == ' ')
             				continue;
             			w[wlen++] = ' ';
             			continue;
             		}
             		if (c == '\n' && !allws && wlen > 0 && w[wlen - 1] == ' ')
             			/* overwrite the trailing space with the newline */
             			w[wlen - 1] = '\n';
             		else
             			w[wlen++] = c;
             	}
             	result = PyBytes_FromStringAndSize(w, wlen);
             nomem:
             	free(w);
             	return result ? result : PyErr_NoMemory();
             }
             /* module docstring, passed to the module initialisation code */
             static char mdiff_doc[] = "Efficient binary diff.";
             /* functions exported to Python; the all-NULL entry terminates the table */
             static PyMethodDef methods[] = {
             	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
             	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
             	{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},
             	{NULL, NULL}
             };
             /* module initialisation; IS_PY3K selects the Python 3 variant */
             #ifdef IS_PY3K
             /* module definition: name, docstring, -1 for the per-module state
                size, and the method table */
             static struct PyModuleDef bdiff_module = {
             	PyModuleDef_HEAD_INIT,
             	"bdiff",
             	mdiff_doc,
             	-1,
             	methods
             };
             /* Python 3 entry point: create and return the module object */
             PyMODINIT_FUNC PyInit_bdiff(void)
             {
             	return PyModule_Create(&bdiff_module);
             }
             #else
             /* Python 2 entry point: register the module with its methods and docstring */
             PyMODINIT_FUNC initbdiff(void)
             {
             	Py_InitModule3("bdiff", methods, mdiff_doc);
             }
             #endif