upstream/mercurial-mirror Commit - r29323:d29cb5e7

1

/*

1

/*

2

bdiff.c - efficient binary diff extension for Mercurial

2

bdiff.c - efficient binary diff extension for Mercurial

3

4

5

6

This software may be used and distributed according to the terms of

6

This software may be used and distributed according to the terms of

7

the GNU General Public License, incorporated herein by reference.

7

the GNU General Public License, incorporated herein by reference.

8

9

Based roughly on Python difflib

9

Based roughly on Python difflib

10

*/

10

*/

11

12

#define PY_SSIZE_T_CLEAN

12

#define PY_SSIZE_T_CLEAN

13

#include <Python.h>

13

#include <Python.h>

14

#include <stdlib.h>

14

#include <stdlib.h>

15

#include <string.h>

15

#include <string.h>

16

#include <limits.h>

16

#include <limits.h>

17

18

#include "util.h"

18

#include "util.h"

19

20

struct line {

20

struct line {

21

int hash, n, e;

21

int hash, n, e;

22

Py_ssize_t len;

22

Py_ssize_t len;

23

const char *l;

23

const char *l;

24

};

24

};

25

26

struct pos {

26

struct pos {

27

int pos, len;

27

int pos, len;

28

};

28

};

29

30

struct hunk;

30

struct hunk;

31

struct hunk {

31

struct hunk {

32

int a1, a2, b1, b2;

32

int a1, a2, b1, b2;

33

struct hunk *next;

33

struct hunk *next;

34

};

34

};

35

36

static int splitlines(const char *a, Py_ssize_t len, struct line **lr)

36

static int splitlines(const char *a, Py_ssize_t len, struct line **lr)

37

{

37

{

38

unsigned hash;

38

unsigned hash;

39

int i;

39

int i;

40

const char *p, *b = a;

40

const char *p, *b = a;

41

const char * const plast = a + len - 1;

41

const char * const plast = a + len - 1;

42

struct line *l;

42

struct line *l;

43

44

/* count the lines */

44

/* count the lines */

45

i = 1; /* extra line for sentinel */

45

i = 1; /* extra line for sentinel */

46

for (p = a; p < a + len; p++)

46

for (p = a; p < a + len; p++)

47

if (*p == '\n' || p == plast)

47

if (*p == '\n' || p == plast)

48

i++;

48

i++;

49

50

*lr = l = (struct line *)malloc(sizeof(struct line) * i);

50

*lr = l = (struct line *)malloc(sizeof(struct line) * i);

51

if (!l)

51

if (!l)

52

return -1;

52

return -1;

53

54

/* build the line array and calculate hashes */

54

/* build the line array and calculate hashes */

55

hash = 0;

55

hash = 0;

56

for (p = a; p < a + len; p++) {

56

for (p = a; p < a + len; p++) {

57

/* Leonid Yuriev's hash */

57

/* Leonid Yuriev's hash */

58

hash = (hash * 1664525) + (unsigned char)*p + 1013904223;

58

hash = (hash * 1664525) + (unsigned char)*p + 1013904223;

59

60

if (*p == '\n' || p == plast) {

60

if (*p == '\n' || p == plast) {

61

l->hash = hash;

61

l->hash = hash;

62

hash = 0;

62

hash = 0;

63

l->len = p - b + 1;

63

l->len = p - b + 1;

64

l->l = b;

64

l->l = b;

65

l->n = INT_MAX;

65

l->n = INT_MAX;

66

l++;

66

l++;

67

b = p + 1;

67

b = p + 1;

68

}

68

}

69

}

69

}

70

71

/* set up a sentinel */

71

/* set up a sentinel */

72

l->hash = 0;

72

l->hash = 0;

73

l->len = 0;

73

l->len = 0;

74

l->l = a + len;

74

l->l = a + len;

75

return i - 1;

75

return i - 1;

76

}

76

}

77

78

static inline int cmp(struct line *a, struct line *b)

78

static inline int cmp(struct line *a, struct line *b)

79

{

79

{

80

return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);

80

return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);

81

}

81

}

82

83

static int equatelines(struct line *a, int an, struct line *b, int bn)

83

static int equatelines(struct line *a, int an, struct line *b, int bn)

84

{

84

{

85

int i, j, buckets = 1, t, scale;

85

int i, j, buckets = 1, t, scale;

86

struct pos *h = NULL;

86

struct pos *h = NULL;

87

88

/* build a hash table of the next highest power of 2 */

88

/* build a hash table of the next highest power of 2 */

89

while (buckets < bn + 1)

89

while (buckets < bn + 1)

90

buckets *= 2;

90

buckets *= 2;

91

92

/* try to allocate a large hash table to avoid collisions */

92

/* try to allocate a large hash table to avoid collisions */

93

for (scale = 4; scale; scale /= 2) {

93

for (scale = 4; scale; scale /= 2) {

94

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

94

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

95

if (h)

95

if (h)

96

break;

96

break;

97

}

97

}

98

99

if (!h)

99

if (!h)

100

return 0;

100

return 0;

101

102

buckets = buckets * scale - 1;

102

buckets = buckets * scale - 1;

103

104

/* clear the hash table */

104

/* clear the hash table */

105

for (i = 0; i <= buckets; i++) {

105

for (i = 0; i <= buckets; i++) {

106

h[i].pos = -1;

106

h[i].pos = -1;

107

h[i].len = 0;

107

h[i].len = 0;

108

}

108

}

109

110

/* add lines to the hash table chains */

110

/* add lines to the hash table chains */

111

for (i = 0; i < bn; i++) {

111

for (i = 0; i < bn; i++) {

112

/* find the equivalence class */

112

/* find the equivalence class */

113

for (j = b[i].hash & buckets; h[j].pos != -1;

113

for (j = b[i].hash & buckets; h[j].pos != -1;

114

j = (j + 1) & buckets)

114

j = (j + 1) & buckets)

115

if (!cmp(b + i, b + h[j].pos))

115

if (!cmp(b + i, b + h[j].pos))

116

break;

116

break;

117

118

/* add to the head of the equivalence class */

118

/* add to the head of the equivalence class */

119

b[i].n = h[j].pos;

119

b[i].n = h[j].pos;

120

b[i].e = j;

120

b[i].e = j;

121

h[j].pos = i;

121

h[j].pos = i;

122

h[j].len++; /* keep track of popularity */

122

h[j].len++; /* keep track of popularity */

123

}

123

}

124

125

/* compute popularity threshold */

125

/* compute popularity threshold */

126

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

126

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

127

128

/* match items in a to their equivalence class in b */

128

/* match items in a to their equivalence class in b */

129

for (i = 0; i < an; i++) {

129

for (i = 0; i < an; i++) {

130

/* find the equivalence class */

130

/* find the equivalence class */

131

for (j = a[i].hash & buckets; h[j].pos != -1;

131

for (j = a[i].hash & buckets; h[j].pos != -1;

132

j = (j + 1) & buckets)

132

j = (j + 1) & buckets)

133

if (!cmp(a + i, b + h[j].pos))

133

if (!cmp(a + i, b + h[j].pos))

134

break;

134

break;

135

136

a[i].e = j; /* use equivalence class for quick compare */

136

a[i].e = j; /* use equivalence class for quick compare */

137

if (h[j].len <= t)

137

if (h[j].len <= t)

138

a[i].n = h[j].pos; /* point to head of match list */

138

a[i].n = h[j].pos; /* point to head of match list */

139

else

139

else

140

a[i].n = -1; /* too popular */

140

a[i].n = -1; /* too popular */

141

}

141

}

142

143

/* discard hash tables */

143

/* discard hash tables */

144

free(h);

144

free(h);

145

return 1;

145

return 1;

146

}

146

}

147

148

static int longest_match(struct line *a, struct line *b, struct pos *pos,

148

static int longest_match(struct line *a, struct line *b, struct pos *pos,

149

int a1, int a2, int b1, int b2, int *omi, int *omj)

149

int a1, int a2, int b1, int b2, int *omi, int *omj)

150

{

150

{

151

int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k, half;

151

int mi = a1, mj = b1, mk = 0, i, j, k, half;

152

153

/* window our search on large regions to better bound

153

/* window our search on large regions to better bound

154

worst-case performance. by choosing a window at the end, we

154

worst-case performance. by choosing a window at the end, we

155

reduce skipping overhead on the b chains. */

155

reduce skipping overhead on the b chains. */

156

if (a2 - a1 > 30000)

156

if (a2 - a1 > 30000)

157

a1 = a2 - 30000;

157

a1 = a2 - 30000;

158

159

half = (a1 + a2) / 2;

159

half = (a1 + a2) / 2;

160

161

for (i = a1; i < a2; i++) {

161

for (i = a1; i < a2; i++) {

162

/* skip all lines in b after the current block */

162

/* skip all lines in b after the current block */

163

for (j = a[i].n; j >= b2; j = b[j].n)

163

for (j = a[i].n; j >= b2; j = b[j].n)

164

;

164

;

165

166

/* loop through all lines match a[i] in b */

166

/* loop through all lines match a[i] in b */

167

for (; j >= b1; j = b[j].n) {

167

for (; j >= b1; j = b[j].n) {

168

/* does this extend an earlier match? */

168

/* does this extend an earlier match? */

169

for (k = 1; j - k >= b1 && i - k >= a1; k++) {

169

for (k = 1; j - k >= b1 && i - k >= a1; k++) {

170

/* reached an earlier match? */

170

/* reached an earlier match? */

171

if (pos[j - k].pos == i - k) {

171

if (pos[j - k].pos == i - k) {

172

k += pos[j - k].len;

172

k += pos[j - k].len;

173

break;

173

break;

174

}

174

}

175

/* previous line mismatch? */

175

/* previous line mismatch? */

176

if (a[i - k].e != b[j - k].e)

176

if (a[i - k].e != b[j - k].e)

177

break;

177

break;

178

}

178

}

179

180

pos[j].pos = i;

180

pos[j].pos = i;

181

pos[j].len = k;

181

pos[j].len = k;

182

183

/* best match so far? we prefer matches closer

183

/* best match so far? we prefer matches closer

184

to the middle to balance recursion */

184

to the middle to balance recursion */

185

if (k > mk || (k == mk && (i <= mi || i < half))) {

185

if (k > mk || (k == mk && (i <= mi || i < half))) {

186

mi = i;

186

mi = i;

187

mj = j;

187

mj = j;

188

mk = k;

188

mk = k;

189

}

189

}

190

}

190

}

191

}

191

}

192

193

if (mk) {

193

if (mk) {

194

mi = mi - mk + 1;

194

mi = mi - mk + 1;

195

mj = mj - mk + 1;

195

mj = mj - mk + 1;

196

}

196

}

197

198

/* expand match to include ~~neighboring~~ popular lines */

198

/* expand match to include subsequent popular lines */

199

while (mi - mb > a1 && mj - mb > b1 &&

200

a[mi - mb - 1].e == b[mj - mb - 1].e)

201

mb++;

202

while (mi + mk < a2 && mj + mk < b2 &&

199

while (mi + mk < a2 && mj + mk < b2 &&

203

a[mi + mk].e == b[mj + mk].e)

200

a[mi + mk].e == b[mj + mk].e)

204

mk++;

201

mk++;

205

202

206

*omi = mi - mb;

203

*omi = mi;

207

*omj = mj - mb;

204

*omj = mj;

208

205

209

return mk + mb;

206

return mk;

210

}

207

}

211

208

212

static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,

209

static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,

213

int a1, int a2, int b1, int b2, struct hunk *l)

210

int a1, int a2, int b1, int b2, struct hunk *l)

214

{

211

{

215

int i, j, k;

212

int i, j, k;

216

213

217

while (1) {

214

while (1) {

218

/* find the longest match in this chunk */

215

/* find the longest match in this chunk */

219

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

216

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

220

if (!k)

217

if (!k)

221

return l;

218

return l;

222

219

223

/* and recurse on the remaining chunks on either side */

220

/* and recurse on the remaining chunks on either side */

224

l = recurse(a, b, pos, a1, i, b1, j, l);

221

l = recurse(a, b, pos, a1, i, b1, j, l);

225

if (!l)

222

if (!l)

226

return NULL;

223

return NULL;

227

224

228

l->next = (struct hunk *)malloc(sizeof(struct hunk));

225

l->next = (struct hunk *)malloc(sizeof(struct hunk));

229

if (!l->next)

226

if (!l->next)

230

return NULL;

227

return NULL;

231

228

232

l = l->next;

229

l = l->next;

233

l->a1 = i;

230

l->a1 = i;

234

l->a2 = i + k;

231

l->a2 = i + k;

235

l->b1 = j;

232

l->b1 = j;

236

l->b2 = j + k;

233

l->b2 = j + k;

237

l->next = NULL;

234

l->next = NULL;

238

235

239

/* tail-recursion didn't happen, so do equivalent iteration */

236

/* tail-recursion didn't happen, so do equivalent iteration */

240

a1 = i + k;

237

a1 = i + k;

241

b1 = j + k;

238

b1 = j + k;

242

}

239

}

243

}

240

}

244

241

245

static int diff(struct line *a, int an, struct line *b, int bn,

242

static int diff(struct line *a, int an, struct line *b, int bn,

246

struct hunk *base)

243

struct hunk *base)

247

{

244

{

248

struct hunk *curr;

245

struct hunk *curr;

249

struct pos *pos;

246

struct pos *pos;

250

int t, count = 0;

247

int t, count = 0;

251

248

252

/* allocate and fill arrays */

249

/* allocate and fill arrays */

253

t = equatelines(a, an, b, bn);

250

t = equatelines(a, an, b, bn);

254

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

251

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

255

252

256

if (pos && t) {

253

if (pos && t) {

257

/* generate the matching block list */

254

/* generate the matching block list */

258

255

259

curr = recurse(a, b, pos, 0, an, 0, bn, base);

256

curr = recurse(a, b, pos, 0, an, 0, bn, base);

260

if (!curr)

257

if (!curr)

261

return -1;

258

return -1;

262

259

263

/* sentinel end hunk */

260

/* sentinel end hunk */

264

curr->next = (struct hunk *)malloc(sizeof(struct hunk));

261

curr->next = (struct hunk *)malloc(sizeof(struct hunk));

265

if (!curr->next)

262

if (!curr->next)

266

return -1;

263

return -1;

267

curr = curr->next;

264

curr = curr->next;

268

curr->a1 = curr->a2 = an;

265

curr->a1 = curr->a2 = an;

269

curr->b1 = curr->b2 = bn;

266

curr->b1 = curr->b2 = bn;

270

curr->next = NULL;

267

curr->next = NULL;

271

}

268

}

272

269

273

free(pos);

270

free(pos);

274

271

275

/* normalize the hunk list, try to push each hunk towards the end */

272

/* normalize the hunk list, try to push each hunk towards the end */

276

for (curr = base->next; curr; curr = curr->next) {

273

for (curr = base->next; curr; curr = curr->next) {

277

struct hunk *next = curr->next;

274

struct hunk *next = curr->next;

278

275

279

if (!next)

276

if (!next)

280

break;

277

break;

281

278

282

if (curr->a2 == next->a1 || curr->b2 == next->b1)

279

if (curr->a2 == next->a1 || curr->b2 == next->b1)

283

while (curr->a2 < an && curr->b2 < bn

280

while (curr->a2 < an && curr->b2 < bn

284

&& next->a1 < next->a2

281

&& next->a1 < next->a2

285

&& next->b1 < next->b2

282

&& next->b1 < next->b2

286

&& !cmp(a + curr->a2, b + curr->b2)) {

283

&& !cmp(a + curr->a2, b + curr->b2)) {

287

curr->a2++;

284

curr->a2++;

288

next->a1++;

285

next->a1++;

289

curr->b2++;

286

curr->b2++;

290

next->b1++;

287

next->b1++;

291

}

288

}

292

}

289

}

293

290

294

for (curr = base->next; curr; curr = curr->next)

291

for (curr = base->next; curr; curr = curr->next)

295

count++;

292

count++;

296

return count;

293

return count;

297

}

294

}

298

295

299

static void freehunks(struct hunk *l)

296

static void freehunks(struct hunk *l)

300

{

297

{

301

struct hunk *n;

298

struct hunk *n;

302

for (; l; l = n) {

299

for (; l; l = n) {

303

n = l->next;

300

n = l->next;

304

free(l);

301

free(l);

305

}

302

}

306

}

303

}

307

304

308

static PyObject *blocks(PyObject *self, PyObject *args)

305

static PyObject *blocks(PyObject *self, PyObject *args)

309

{

306

{

310

PyObject *sa, *sb, *rl = NULL, *m;

307

PyObject *sa, *sb, *rl = NULL, *m;

311

struct line *a, *b;

308

struct line *a, *b;

312

struct hunk l, *h;

309

struct hunk l, *h;

313

int an, bn, count, pos = 0;

310

int an, bn, count, pos = 0;

314

311

315

l.next = NULL;

312

l.next = NULL;

316

313

317

if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))

314

if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))

318

return NULL;

315

return NULL;

319

316

320

an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);

317

an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);

321

bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);

318

bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);

322

319

323

if (!a || !b)

320

if (!a || !b)

324

goto nomem;

321

goto nomem;

325

322

326

count = diff(a, an, b, bn, &l);

323

count = diff(a, an, b, bn, &l);

327

if (count < 0)

324

if (count < 0)

328

goto nomem;

325

goto nomem;

329

326

330

rl = PyList_New(count);

327

rl = PyList_New(count);

331

if (!rl)

328

if (!rl)

332

goto nomem;

329

goto nomem;

333

330

334

for (h = l.next; h; h = h->next) {

331

for (h = l.next; h; h = h->next) {

335

m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);

332

m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);

336

PyList_SetItem(rl, pos, m);

333

PyList_SetItem(rl, pos, m);

337

pos++;

334

pos++;

338

}

335

}

339

336

340

nomem:

337

nomem:

341

free(a);

338

free(a);

342

free(b);

339

free(b);

343

freehunks(l.next);

340

freehunks(l.next);

344

return rl ? rl : PyErr_NoMemory();

341

return rl ? rl : PyErr_NoMemory();

345

}

342

}

346

343

347

static PyObject *bdiff(PyObject *self, PyObject *args)

344

static PyObject *bdiff(PyObject *self, PyObject *args)

348

{

345

{

349

char *sa, *sb, *rb;

346

char *sa, *sb, *rb;

350

PyObject *result = NULL;

347

PyObject *result = NULL;

351

struct line *al, *bl;

348

struct line *al, *bl;

352

struct hunk l, *h;

349

struct hunk l, *h;

353

int an, bn, count;

350

int an, bn, count;

354

Py_ssize_t len = 0, la, lb;

351

Py_ssize_t len = 0, la, lb;

355

PyThreadState *_save;

352

PyThreadState *_save;

356

353

357

l.next = NULL;

354

l.next = NULL;

358

355

359

if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))

356

if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))

360

return NULL;

357

return NULL;

361

358

362

if (la > UINT_MAX || lb > UINT_MAX) {

359

if (la > UINT_MAX || lb > UINT_MAX) {

363

PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");

360

PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");

364

return NULL;

361

return NULL;

365

}

362

}

366

363

367

_save = PyEval_SaveThread();

364

_save = PyEval_SaveThread();

368

an = splitlines(sa, la, &al);

365

an = splitlines(sa, la, &al);

369

bn = splitlines(sb, lb, &bl);

366

bn = splitlines(sb, lb, &bl);

370

if (!al || !bl)

367

if (!al || !bl)

371

goto nomem;

368

goto nomem;

372

369

373

count = diff(al, an, bl, bn, &l);

370

count = diff(al, an, bl, bn, &l);

374

if (count < 0)

371

if (count < 0)

375

goto nomem;

372

goto nomem;

376

373

377

/* calculate length of output */

374

/* calculate length of output */

378

la = lb = 0;

375

la = lb = 0;

379

for (h = l.next; h; h = h->next) {

376

for (h = l.next; h; h = h->next) {

380

if (h->a1 != la || h->b1 != lb)

377

if (h->a1 != la || h->b1 != lb)

381

len += 12 + bl[h->b1].l - bl[lb].l;

378

len += 12 + bl[h->b1].l - bl[lb].l;

382

la = h->a2;

379

la = h->a2;

383

lb = h->b2;

380

lb = h->b2;

384

}

381

}

385

PyEval_RestoreThread(_save);

382

PyEval_RestoreThread(_save);

386

_save = NULL;

383

_save = NULL;

387

384

388

result = PyBytes_FromStringAndSize(NULL, len);

385

result = PyBytes_FromStringAndSize(NULL, len);

389

386

390

if (!result)

387

if (!result)

391

goto nomem;

388

goto nomem;

392

389

393

/* build binary patch */

390

/* build binary patch */

394

rb = PyBytes_AsString(result);

391

rb = PyBytes_AsString(result);

395

la = lb = 0;

392

la = lb = 0;

396

393

397

for (h = l.next; h; h = h->next) {

394

for (h = l.next; h; h = h->next) {

398

if (h->a1 != la || h->b1 != lb) {

395

if (h->a1 != la || h->b1 != lb) {

399

len = bl[h->b1].l - bl[lb].l;

396

len = bl[h->b1].l - bl[lb].l;

400

putbe32((uint32_t)(al[la].l - al->l), rb);

397

putbe32((uint32_t)(al[la].l - al->l), rb);

401

putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);

398

putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);

402

putbe32((uint32_t)len, rb + 8);

399

putbe32((uint32_t)len, rb + 8);

403

memcpy(rb + 12, bl[lb].l, len);

400

memcpy(rb + 12, bl[lb].l, len);

404

rb += 12 + len;

401

rb += 12 + len;

405

}

402

}

406

la = h->a2;

403

la = h->a2;

407

lb = h->b2;

404

lb = h->b2;

408

}

405

}

409

406

410

nomem:

407

nomem:

411

if (_save)

408

if (_save)

412

PyEval_RestoreThread(_save);

409

PyEval_RestoreThread(_save);

413

free(al);

410

free(al);

414

free(bl);

411

free(bl);

415

freehunks(l.next);

412

freehunks(l.next);

416

return result ? result : PyErr_NoMemory();

413

return result ? result : PyErr_NoMemory();

417

}

414

}

418

415

419

/*

416

/*

420

* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,

417

* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,

421

* reduce whitespace sequences to a single space and trim remaining whitespace

418

* reduce whitespace sequences to a single space and trim remaining whitespace

422

* from end of lines.

419

* from end of lines.

423

*/

420

*/

424

static PyObject *fixws(PyObject *self, PyObject *args)

421

static PyObject *fixws(PyObject *self, PyObject *args)

425

{

422

{

426

PyObject *s, *result = NULL;

423

PyObject *s, *result = NULL;

427

char allws, c;

424

char allws, c;

428

const char *r;

425

const char *r;

429

Py_ssize_t i, rlen, wlen = 0;

426

Py_ssize_t i, rlen, wlen = 0;

430

char *w;

427

char *w;

431

428

432

if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))

429

if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))

433

return NULL;

430

return NULL;

434

r = PyBytes_AsString(s);

431

r = PyBytes_AsString(s);

435

rlen = PyBytes_Size(s);

432

rlen = PyBytes_Size(s);

436

433

437

w = (char *)malloc(rlen ? rlen : 1);

434

w = (char *)malloc(rlen ? rlen : 1);

438

if (!w)

435

if (!w)

439

goto nomem;

436

goto nomem;

440

437

441

for (i = 0; i != rlen; i++) {

438

for (i = 0; i != rlen; i++) {

442

c = r[i];

439

c = r[i];

443

if (c == ' ' || c == '\t' || c == '\r') {

440

if (c == ' ' || c == '\t' || c == '\r') {

444

if (!allws && (wlen == 0 || w[wlen - 1] != ' '))

441

if (!allws && (wlen == 0 || w[wlen - 1] != ' '))

445

w[wlen++] = ' ';

442

w[wlen++] = ' ';

446

} else if (c == '\n' && !allws

443

} else if (c == '\n' && !allws

447

&& wlen > 0 && w[wlen - 1] == ' ') {

444

&& wlen > 0 && w[wlen - 1] == ' ') {

448

w[wlen - 1] = '\n';

445

w[wlen - 1] = '\n';

449

} else {

446

} else {

450

w[wlen++] = c;

447

w[wlen++] = c;

451

}

448

}

452

}

449

}

453

450

454

result = PyBytes_FromStringAndSize(w, wlen);

451

result = PyBytes_FromStringAndSize(w, wlen);

455

452

456

nomem:

453

nomem:

457

free(w);

454

free(w);

458

return result ? result : PyErr_NoMemory();

455

return result ? result : PyErr_NoMemory();

459

}

456

}

460

457

461

458

462

static char mdiff_doc[] = "Efficient binary diff.";

459

static char mdiff_doc[] = "Efficient binary diff.";

463

460

464

static PyMethodDef methods[] = {

461

static PyMethodDef methods[] = {

465

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

462

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

466

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

463

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

467

{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},

464

{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},

468

{NULL, NULL}

465

{NULL, NULL}

469

};

466

};

470

467

471

#ifdef IS_PY3K

468

#ifdef IS_PY3K

472

static struct PyModuleDef bdiff_module = {

469

static struct PyModuleDef bdiff_module = {

473

PyModuleDef_HEAD_INIT,

470

PyModuleDef_HEAD_INIT,

474

"bdiff",

471

"bdiff",

475

mdiff_doc,

472

mdiff_doc,

476

-1,

473

-1,

477

methods

474

methods

478

};

475

};

479

476

480

PyMODINIT_FUNC PyInit_bdiff(void)

477

PyMODINIT_FUNC PyInit_bdiff(void)

481

{

478

{

482

return PyModule_Create(&bdiff_module);

479

return PyModule_Create(&bdiff_module);

483

}

480

}

484

#else

481

#else

485

PyMODINIT_FUNC initbdiff(void)

482

PyMODINIT_FUNC initbdiff(void)

486

{

483

{

487

Py_InitModule3("bdiff", methods, mdiff_doc);

484

Py_InitModule3("bdiff", methods, mdiff_doc);

488

}

485

}

489

#endif

486

#endif

490

487

	Site-wide shortcuts
/	Use quick search box
g h	Go to home page
g g	Go to my private gists page
g G	Go to my public gists page
g 0-9	Go to bookmarked items 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Go to summary page
g c	Go to changelog page
g f	Go to files page
g F	Go to files page with file search activated
g p	Go to pull requests page
g o	Go to repository settings
g O	Go to repository access permissions settings
t s	Toggle sidebar on some pages

             /*
              bdiff.c - efficient binary diff extension for Mercurial
              Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
              This software may be used and distributed according to the terms of
              the GNU General Public License, incorporated herein by reference.
              Based roughly on Python difflib
             */
             #define PY_SSIZE_T_CLEAN
             #include <Python.h>
             #include <stdlib.h>
             #include <string.h>
             #include <limits.h>
             #include "util.h"
             /* One line of input text, as filled in by splitlines() and equatelines(). */
             struct line {
             	int hash; /* hash of the bytes of the line */
             	int n; /* index of another line of b in the same class; -1 ends the chain */
             	int e; /* hash-table slot that identifies the equivalence class */
             	Py_ssize_t len; /* length in bytes, counting a trailing '\n' if present */
             	const char *l; /* first byte of the line inside the source buffer */
             };
             /*
              * Pair of ints used in two ways in this file: as a hash-table slot in
              * equatelines() (pos = newest line index of the class, len = number of
              * lines in it) and as a per-line record in longest_match() (pos = index
              * in a that matched, len = length of the run ending there).
              */
             struct pos {
             	int pos;
             	int len;
             };
             /*
              * A block of matching lines: a1..a2 (end exclusive) in a corresponds to
              * b1..b2 (end exclusive) in b.  Hunks are chained through next.
              */
             struct hunk {
             	int a1;
             	int a2;
             	int b1;
             	int b2;
             	struct hunk *next;
             };
             /*
              * Split the buffer a (len bytes) into lines and hash each one.
              *
              * A line ends at '\n' or at the final byte of the buffer.  The array is
              * malloc'ed and stored in *lr (NULL on failure); it holds one record per
              * line followed by a zero-length sentinel pointing at a + len.
              *
              * Returns the number of lines (sentinel excluded), or -1 if malloc failed.
              */
             static int splitlines(const char *a, Py_ssize_t len, struct line **lr)
             {
             	const char * const end = a + len;
             	const char * const plast = end - 1;
             	const char *cur, *start;
             	struct line *rec;
             	unsigned acc = 0;
             	int slots = 1; /* one slot is reserved for the sentinel */
             	/* first pass: count line terminators */
             	for (cur = a; cur < end; cur++) {
             		if (*cur == '\n' || cur == plast)
             			slots++;
             	}
             	rec = (struct line *)malloc(sizeof(struct line) * slots);
             	*lr = rec;
             	if (!rec)
             		return -1;
             	/* second pass: fill one record per line while hashing its bytes */
             	start = a;
             	for (cur = a; cur < end; cur++) {
             		/* Leonid Yuriev's hash */
             		acc = (acc * 1664525) + (unsigned char)*cur + 1013904223;
             		if (*cur != '\n' && cur != plast)
             			continue;
             		rec->hash = acc;
             		rec->len = cur - start + 1;
             		rec->l = start;
             		rec->n = INT_MAX;
             		rec++;
             		start = cur + 1;
             		acc = 0;
             	}
             	/* sentinel record marks the end of the buffer */
             	rec->hash = 0;
             	rec->len = 0;
             	rec->l = end;
             	return slots - 1;
             }
             /*
              * Compare two lines.  Returns 0 when hash, length and bytes all agree,
              * 1 otherwise; the cheap fields are checked before the bytes.
              */
             static inline int cmp(struct line *a, struct line *b)
             {
             	if (a->hash != b->hash)
             		return 1;
             	if (a->len != b->len)
             		return 1;
             	return memcmp(a->l, b->l, a->len) != 0;
             }
             /*
              * Group identical lines of b into equivalence classes with an
              * open-addressed hash table, then tie every line of a to its class.
              *
              * On success each b[i].n holds the index of the previous line of b in
              * the same class (-1 if there is none), and a[i].e / b[i].e hold the
              * table slot of the class.  a[i].n is the last line of b in the class,
              * or -1 when the class is empty or has more than t members.
              *
              * Returns 1 on success, 0 if the hash table could not be allocated.
              */
             static int equatelines(struct line *a, int an, struct line *b, int bn)
             {
             	int i, j, buckets = 1, t, scale;
             	struct pos *h = NULL;
             	/* build a hash table of the next highest power of 2 */
             	while (buckets < bn + 1)
             		buckets *= 2;
             	/* try to allocate a large hash table to avoid collisions */
             	/* tries 4x, then 2x, then 1x the bucket count until malloc succeeds */
             	for (scale = 4; scale; scale /= 2) {
             		h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
             		if (h)
             			break;
             	}
             	if (!h)
             		return 0;
             	/* from here on buckets is the index mask (table size - 1) */
             	buckets = buckets * scale - 1;
             	/* clear the hash table */
             	for (i = 0; i <= buckets; i++) {
             		h[i].pos = -1;
             		h[i].len = 0;
             	}
             	/* add lines to the hash table chains */
             	for (i = 0; i < bn; i++) {
             		/* find the equivalence class */
             		/* linear probe: stop at an empty slot or at a slot holding an equal line */
             		for (j = b[i].hash & buckets; h[j].pos != -1;
             		     j = (j + 1) & buckets)
             			if (!cmp(b + i, b + h[j].pos))
             				break;
             		/* add to the head of the equivalence class */
             		b[i].n = h[j].pos;
             		b[i].e = j;
             		h[j].pos = i;
             		h[j].len++; /* keep track of popularity */
             	}
             	/* compute popularity threshold */
             	t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);
             	/* match items in a to their equivalence class in b */
             	for (i = 0; i < an; i++) {
             		/* find the equivalence class */
             		for (j = a[i].hash & buckets; h[j].pos != -1;
             		     j = (j + 1) & buckets)
             			if (!cmp(a + i, b + h[j].pos))
             				break;
             		a[i].e = j; /* use equivalence class for quick compare */
             		if (h[j].len <= t)
             			a[i].n = h[j].pos; /* point to head of match list */
             		else
             			a[i].n = -1; /* too popular */
             	}
             	/* discard hash tables */
             	free(h);
             	return 1;
             }
             /*
              * Find the longest run of matching lines between a[a1..a2) and
              * b[b1..b2).  pos is scratch space indexed by line of b; for each line
              * visited it records the index in a that matched and the length of the
              * run ending there.  The start of the chosen run is stored in *omi
              * (index into a) and *omj (index into b).
              *
              * Returns the length of the run, 0 if no run was found.
              */
             static int longest_match(struct line *a, struct line *b, struct pos *pos,
             			 int a1, int a2, int b1, int b2, int *omi, int *omj)
             {
-            	int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k, half;
+            	int mi = a1, mj = b1, mk = 0, i, j, k, half;
             	/* window our search on large regions to better bound
             	   worst-case performance. by choosing a window at the end, we
             	   reduce skipping overhead on the b chains. */
             	if (a2 - a1 > 30000)
             		a1 = a2 - 30000;
             	half = (a1 + a2) / 2;
             	for (i = a1; i < a2; i++) {
             		/* skip all lines in b after the current block */
             		for (j = a[i].n; j >= b2; j = b[j].n)
             			;
             		/* loop through all lines match a[i] in b */
             		for (; j >= b1; j = b[j].n) {
             			/* does this extend an earlier match? */
             			for (k = 1; j - k >= b1 && i - k >= a1; k++) {
             				/* reached an earlier match? */
             				if (pos[j - k].pos == i - k) {
             					k += pos[j - k].len;
             					break;
             				}
             				/* previous line mismatch? */
             				if (a[i - k].e != b[j - k].e)
             					break;
             			}
             			pos[j].pos = i;
             			pos[j].len = k;
             			/* best match so far? we prefer matches closer
             			   to the middle to balance recursion */
             			if (k > mk || (k == mk && (i <= mi || i < half))) {
             				mi = i;
             				mj = j;
             				mk = k;
             			}
             		}
             	}
             	/* mi/mj were tracked at the last line of the run; move them to its first line */
             	if (mk) {
             		mi = mi - mk + 1;
             		mj = mj - mk + 1;
             	}
-            	/* expand match to include neighboring popular lines */
+            	/* expand match to include subsequent popular lines */
-            	while (mi - mb > a1 && mj - mb > b1 &&
-            	       a[mi - mb - 1].e == b[mj - mb - 1].e)
-            		mb++;
             	while (mi + mk < a2 && mj + mk < b2 &&
             	       a[mi + mk].e == b[mj + mk].e)
             		mk++;
-            	*omi = mi - mb;
+            	*omi = mi;
-            	*omj = mj - mb;
+            	*omj = mj;
-            	return mk + mb;
+            	return mk;
             }
             /*
              * Append the matching blocks of a[a1..a2) against b[b1..b2) to the hunk
              * list whose current tail is l, in increasing order.
              *
              * The region before each longest match is handled by a recursive call;
              * the region after it is handled by looping.  Returns the new tail of the
              * list, or NULL if a hunk could not be allocated.
              */
             static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,
             			    int a1, int a2, int b1, int b2, struct hunk *l)
             {
             	int mi, mj, mlen;
             	struct hunk *node;
             	for (;;) {
             		/* locate the longest match inside the current window */
             		mlen = longest_match(a, b, pos, a1, a2, b1, b2, &mi, &mj);
             		if (mlen == 0)
             			break;
             		/* everything before the match is processed first */
             		l = recurse(a, b, pos, a1, mi, b1, mj, l);
             		if (l == NULL)
             			break;
             		node = (struct hunk *)malloc(sizeof(struct hunk));
             		l->next = node;
             		if (node == NULL)
             			return NULL;
             		node->a1 = mi;
             		node->a2 = mi + mlen;
             		node->b1 = mj;
             		node->b2 = mj + mlen;
             		node->next = NULL;
             		l = node;
             		/* continue with the region after the match instead of recursing */
             		a1 = node->a2;
             		b1 = node->b2;
             	}
             	return l;
             }
             /*
              * Compute the list of matching blocks between a (an lines) and b (bn
              * lines).
              *
              * Hunks are appended after base; the callers in this file pass it with
              * base->next == NULL.  The list ends with an empty sentinel hunk at
              * (an, an, bn, bn).  Afterwards each hunk that touches its successor is
              * slid towards the end while the lines on both sides still compare equal.
              *
              * Returns the number of hunks including the sentinel, or -1 if any
              * allocation failed.  Hunks already linked after base are left in place
              * for the caller to release.
              */
             static int diff(struct line *a, int an, struct line *b, int bn,
             		 struct hunk *base)
             {
             	struct hunk *curr;
             	struct pos *pos;
             	int t, count = 0, complete = 0;
             	/* allocate and fill arrays */
             	t = equatelines(a, an, b, bn);
             	pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));
             	if (pos && t) {
             		/* generate the matching block list */
             		curr = recurse(a, b, pos, 0, an, 0, bn, base);
             		if (curr) {
             			/* sentinel end hunk */
             			curr->next = (struct hunk *)malloc(sizeof(struct hunk));
             			if (curr->next) {
             				curr = curr->next;
             				curr->a1 = curr->a2 = an;
             				curr->b1 = curr->b2 = bn;
             				curr->next = NULL;
             				complete = 1;
             			}
             		}
             	}
             	/* pos is scratch space only: release it on every path */
             	free(pos);
             	/* report an allocation failure rather than return a truncated list */
             	if (!complete)
             		return -1;
             	/* normalize the hunk list, try to push each hunk towards the end */
             	for (curr = base->next; curr; curr = curr->next) {
             		struct hunk *next = curr->next;
             		if (!next)
             			break;
             		if (curr->a2 == next->a1 || curr->b2 == next->b1)
             			while (curr->a2 < an && curr->b2 < bn
             			       && next->a1 < next->a2
             			       && next->b1 < next->b2
             			       && !cmp(a + curr->a2, b + curr->b2)) {
             				curr->a2++;
             				next->a1++;
             				curr->b2++;
             				next->b1++;
             			}
             	}
             	for (curr = base->next; curr; curr = curr->next)
             		count++;
             	return count;
             }
             /* Release every hunk of the list starting at l; l may be NULL. */
             static void freehunks(struct hunk *l)
             {
             	while (l) {
             		/* read the link before the node is released */
             		struct hunk *following = l->next;
             		free(l);
             		l = following;
             	}
             }
             /*
              * blocks(a, b) -> list of (a1, a2, b1, b2) tuples
              *
              * Python entry point.  Splits both bytes arguments into lines, computes
              * the matching blocks with diff() and returns one 4-tuple of line indices
              * per hunk, including the final empty sentinel hunk.
              *
              * Returns NULL with an exception set if argument parsing fails; any
              * failure after that is reported through PyErr_NoMemory().
              */
             static PyObject *blocks(PyObject *self, PyObject *args)
             {
             	PyObject *sa, *sb, *rl = NULL, *m;
             	struct line *a, *b;
             	struct hunk l, *h;
             	int an, bn, count, pos = 0;
             	l.next = NULL;
             	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
             		return NULL;
             	an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
             	bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
             	if (!a || !b)
             		goto nomem;
             	count = diff(a, an, b, bn, &l);
             	if (count < 0)
             		goto nomem;
             	rl = PyList_New(count);
             	if (!rl)
             		goto nomem;
             	for (h = l.next; h; h = h->next) {
             		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
             		if (!m) {
             			/* never hand back a list with empty slots: drop it and fail */
             			Py_DECREF(rl);
             			rl = NULL;
             			goto nomem;
             		}
             		/* PyList_SetItem takes over the reference to m */
             		PyList_SetItem(rl, pos, m);
             		pos++;
             	}
             nomem:
             	free(a);
             	free(b);
             	freehunks(l.next);
             	return rl ? rl : PyErr_NoMemory();
             }
             /*
              * bdiff(a, b) -> bytes
              *
              * Python entry point.  Computes the matching blocks between the two
              * buffers and encodes every region between them as a patch fragment:
              * three 32-bit fields written with putbe32() (start offset in a, end
              * offset in a, number of replacement bytes) followed by the replacement
              * bytes copied from b.
              *
              * Raises ValueError when either input is longer than UINT_MAX bytes;
              * failures after argument parsing are reported via PyErr_NoMemory().
              */
             static PyObject *bdiff(PyObject *self, PyObject *args)
             {
             	char *sa, *sb, *rb;
             	PyObject *result = NULL;
             	struct line *al, *bl;
             	struct hunk l, *h;
             	int an, bn, count;
             	Py_ssize_t len = 0, la, lb;
             	PyThreadState *_save;
             	l.next = NULL;
             	if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
             		return NULL;
             	/* offsets are emitted as uint32_t below, so larger inputs are rejected */
             	if (la > UINT_MAX || lb > UINT_MAX) {
             		PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
             		return NULL;
             	}
             	/* the splitting and diffing below use no Python API; run them with the
             	   thread state saved */
             	_save = PyEval_SaveThread();
             	an = splitlines(sa, la, &al);
             	bn = splitlines(sb, lb, &bl);
             	if (!al || !bl)
             		goto nomem;
             	count = diff(al, an, bl, bn, &l);
             	if (count < 0)
             		goto nomem;
             	/* calculate length of output */
             	/* la/lb are reused from here on as line indices: the end of the
             	   previous hunk in a and b */
             	la = lb = 0;
             	for (h = l.next; h; h = h->next) {
             		if (h->a1 != la || h->b1 != lb)
             			len += 12 + bl[h->b1].l - bl[lb].l;
             		la = h->a2;
             		lb = h->b2;
             	}
             	/* the thread state must be restored before the result is allocated;
             	   _save is cleared so the cleanup path does not restore it twice */
             	PyEval_RestoreThread(_save);
             	_save = NULL;
             	result = PyBytes_FromStringAndSize(NULL, len);
             	if (!result)
             		goto nomem;
             	/* build binary patch */
             	rb = PyBytes_AsString(result);
             	la = lb = 0;
             	for (h = l.next; h; h = h->next) {
             		if (h->a1 != la || h->b1 != lb) {
             			/* 12-byte header, then the bytes of b between the two hunks */
             			len = bl[h->b1].l - bl[lb].l;
             			putbe32((uint32_t)(al[la].l - al->l), rb);
             			putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);
             			putbe32((uint32_t)len, rb + 8);
             			memcpy(rb + 12, bl[lb].l, len);
             			rb += 12 + len;
             		}
             		la = h->a2;
             		lb = h->b2;
             	}
             nomem:
             	/* still non-NULL only when we jumped here before the restore above */
             	if (_save)
             		PyEval_RestoreThread(_save);
             	free(al);
             	free(bl);
             	freehunks(l.next);
             	return result ? result : PyErr_NoMemory();
             }
             /*
              * fixws(text, allws) -> bytes
              *
              * Normalize whitespace in text.  When allws is non-zero every ' ', '\t'
              * and '\r' is dropped.  Otherwise each run of those characters collapses
              * to a single ' ', and a ' ' left directly before a '\n' is overwritten
              * by that newline, trimming whitespace from the end of lines.
              */
             static PyObject *fixws(PyObject *self, PyObject *args)
             {
             	PyObject *s, *result = NULL;
             	char allws;
             	const char *src, *stop;
             	char *out;
             	Py_ssize_t srclen, outlen = 0;
             	if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))
             		return NULL;
             	src = PyBytes_AsString(s);
             	srclen = PyBytes_Size(s);
             	/* the output never exceeds the input; ask for 1 byte rather than 0 */
             	out = (char *)malloc(srclen ? srclen : 1);
             	if (!out)
             		goto nomem;
             	for (stop = src + srclen; src != stop; src++) {
             		const char ch = *src;
             		/* true when the last byte written is a collapsed space */
             		const int pending = outlen > 0 && out[outlen - 1] == ' ';
             		if (ch == ' ' || ch == '\t' || ch == '\r') {
             			if (allws || pending)
             				continue;
             			out[outlen++] = ' ';
             		} else if (ch == '\n' && !allws && pending) {
             			/* the newline takes the place of the trailing space */
             			out[outlen - 1] = '\n';
             		} else {
             			out[outlen++] = ch;
             		}
             	}
             	result = PyBytes_FromStringAndSize(out, outlen);
             nomem:
             	free(out);
             	return result ? result : PyErr_NoMemory();
             }
             /* Docstring passed to the module initialisation code below. */
             static char mdiff_doc[] = "Efficient binary diff.";
             /*
              * Functions exported to Python: name, C implementation, calling
              * convention and docstring.  The all-NULL entry ends the table.
              */
             static PyMethodDef methods[] = {
             	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
             	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
             	{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},
             	{NULL, NULL}
             };
             /*
              * Module initialisation.  IS_PY3K is not defined in this file (presumably
              * it comes from util.h - confirm); it selects between the PyModuleDef /
              * PyInit_bdiff form and the initbdiff / Py_InitModule3 form.
              */
             #ifdef IS_PY3K
             static struct PyModuleDef bdiff_module = {
             	PyModuleDef_HEAD_INIT,
             	"bdiff", /* module name */
             	mdiff_doc, /* module docstring */
             	-1, /* m_size */
             	methods /* method table */
             };
             /* Create the module object from the static definition and return it. */
             PyMODINIT_FUNC PyInit_bdiff(void)
             {
             	return PyModule_Create(&bdiff_module);
             }
             #else
             /* Register the module under the name "bdiff" with its methods and docstring. */
             PyMODINIT_FUNC initbdiff(void)
             {
             	Py_InitModule3("bdiff", methods, mdiff_doc);
             }
             #endif