upstream/mercurial-mirror Commit - r29010:e868d8ee

1

/*

1

/*

2

bdiff.c - efficient binary diff extension for Mercurial

2

bdiff.c - efficient binary diff extension for Mercurial

3

4

5

6

This software may be used and distributed according to the terms of

6

This software may be used and distributed according to the terms of

7

the GNU General Public License, incorporated herein by reference.

7

the GNU General Public License, incorporated herein by reference.

8

9

Based roughly on Python difflib

9

Based roughly on Python difflib

10

*/

10

*/

11

12

#define PY_SSIZE_T_CLEAN

12

#define PY_SSIZE_T_CLEAN

13

#include <Python.h>

13

#include <Python.h>

14

#include <stdlib.h>

14

#include <stdlib.h>

15

#include <string.h>

15

#include <string.h>

16

#include <limits.h>

16

#include <limits.h>

17

18

#include "util.h"

18

#include "util.h"

19

20

struct line {

20

struct line {

21

int hash, n, e;

21

int hash, n, e;

22

Py_ssize_t len;

22

Py_ssize_t len;

23

const char *l;

23

const char *l;

24

};

24

};

25

26

struct pos {

26

struct pos {

27

int pos, len;

27

int pos, len;

28

};

28

};

29

30

struct hunk;

30

struct hunk;

31

struct hunk {

31

struct hunk {

32

int a1, a2, b1, b2;

32

int a1, a2, b1, b2;

33

struct hunk *next;

33

struct hunk *next;

34

};

34

};

35

36

static int splitlines(const char *a, Py_ssize_t len, struct line **lr)

36

static int splitlines(const char *a, Py_ssize_t len, struct line **lr)

37

{

37

{

38

unsigned hash;

38

unsigned hash;

39

int i;

39

int i;

40

const char *p, *b = a;

40

const char *p, *b = a;

41

const char * const plast = a + len - 1;

41

const char * const plast = a + len - 1;

42

struct line *l;

42

struct line *l;

43

44

/* count the lines */

44

/* count the lines */

45

i = 1; /* extra line for sentinel */

45

i = 1; /* extra line for sentinel */

46

for (p = a; p < a + len; p++)

46

for (p = a; p < a + len; p++)

47

if (*p == '\n' || p == plast)

47

if (*p == '\n' || p == plast)

48

i++;

48

i++;

49

50

*lr = l = (struct line *)malloc(sizeof(struct line) * i);

50

*lr = l = (struct line *)malloc(sizeof(struct line) * i);

51

if (!l)

51

if (!l)

52

return -1;

52

return -1;

53

54

/* build the line array and calculate hashes */

54

/* build the line array and calculate hashes */

55

hash = 0;

55

hash = 0;

56

for (p = a; p < a + len; p++) {

56

for (p = a; p < a + len; p++) {

57

/* Leonid Yuriev's hash */

57

/* Leonid Yuriev's hash */

58

hash = (hash * 1664525) + (unsigned char)*p + 1013904223;

58

hash = (hash * 1664525) + (unsigned char)*p + 1013904223;

59

60

if (*p == '\n' || p == plast) {

60

if (*p == '\n' || p == plast) {

61

l->hash = hash;

61

l->hash = hash;

62

hash = 0;

62

hash = 0;

63

l->len = p - b + 1;

63

l->len = p - b + 1;

64

l->l = b;

64

l->l = b;

65

l->n = INT_MAX;

65

l->n = INT_MAX;

66

l++;

66

l++;

67

b = p + 1;

67

b = p + 1;

68

}

68

}

69

}

69

}

70

71

/* set up a sentinel */

71

/* set up a sentinel */

72

l->hash = 0;

72

l->hash = 0;

73

l->len = 0;

73

l->len = 0;

74

l->l = a + len;

74

l->l = a + len;

75

return i - 1;

75

return i - 1;

76

}

76

}

77

78

static inline int cmp(struct line *a, struct line *b)

78

static inline int cmp(struct line *a, struct line *b)

79

{

79

{

80

return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);

80

return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);

81

}

81

}

82

83

static int equatelines(struct line *a, int an, struct line *b, int bn)

83

static int equatelines(struct line *a, int an, struct line *b, int bn)

84

{

84

{

85

int i, j, buckets = 1, t, scale;

85

int i, j, buckets = 1, t, scale;

86

struct pos *h = NULL;

86

struct pos *h = NULL;

87

88

/* build a hash table of the next highest power of 2 */

88

/* build a hash table of the next highest power of 2 */

89

while (buckets < bn + 1)

89

while (buckets < bn + 1)

90

buckets *= 2;

90

buckets *= 2;

91

92

/* try to allocate a large hash table to avoid collisions */

92

/* try to allocate a large hash table to avoid collisions */

93

for (scale = 4; scale; scale /= 2) {

93

for (scale = 4; scale; scale /= 2) {

94

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

94

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

95

if (h)

95

if (h)

96

break;

96

break;

97

}

97

}

98

99

if (!h)

99

if (!h)

100

return 0;

100

return 0;

101

102

buckets = buckets * scale - 1;

102

buckets = buckets * scale - 1;

103

104

/* clear the hash table */

104

/* clear the hash table */

105

for (i = 0; i <= buckets; i++) {

105

for (i = 0; i <= buckets; i++) {

106

h[i].pos = INT_MAX;

106

h[i].pos = INT_MAX;

107

h[i].len = 0;

107

h[i].len = 0;

108

}

108

}

109

110

/* add lines to the hash table chains */

110

/* add lines to the hash table chains */

111

for (i = bn - 1; i >= 0; i--) {

111

for (i = bn - 1; i >= 0; i--) {

112

/* find the equivalence class */

112

/* find the equivalence class */

113

for (j = b[i].hash & buckets; h[j].pos != INT_MAX;

113

for (j = b[i].hash & buckets; h[j].pos != INT_MAX;

114

j = (j + 1) & buckets)

114

j = (j + 1) & buckets)

115

if (!cmp(b + i, b + h[j].pos))

115

if (!cmp(b + i, b + h[j].pos))

116

break;

116

break;

117

118

/* add to the head of the equivalence class */

118

/* add to the head of the equivalence class */

119

b[i].n = h[j].pos;

119

b[i].n = h[j].pos;

120

b[i].e = j;

120

b[i].e = j;

121

h[j].pos = i;

121

h[j].pos = i;

122

h[j].len++; /* keep track of popularity */

122

h[j].len++; /* keep track of popularity */

123

}

123

}

124

125

/* compute popularity threshold */

125

/* compute popularity threshold */

126

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

126

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

127

128

/* match items in a to their equivalence class in b */

128

/* match items in a to their equivalence class in b */

129

for (i = 0; i < an; i++) {

129

for (i = 0; i < an; i++) {

130

/* find the equivalence class */

130

/* find the equivalence class */

131

for (j = a[i].hash & buckets; h[j].pos != INT_MAX;

131

for (j = a[i].hash & buckets; h[j].pos != INT_MAX;

132

j = (j + 1) & buckets)

132

j = (j + 1) & buckets)

133

if (!cmp(a + i, b + h[j].pos))

133

if (!cmp(a + i, b + h[j].pos))

134

break;

134

break;

135

136

a[i].e = j; /* use equivalence class for quick compare */

136

a[i].e = j; /* use equivalence class for quick compare */

137

if (h[j].len <= t)

137

if (h[j].len <= t)

138

a[i].n = h[j].pos; /* point to head of match list */

138

a[i].n = h[j].pos; /* point to head of match list */

139

else

139

else

140

a[i].n = INT_MAX; /* too popular */

140

a[i].n = INT_MAX; /* too popular */

141

}

141

}

142

143

/* discard hash tables */

143

/* discard hash tables */

144

free(h);

144

free(h);

145

return 1;

145

return 1;

146

}

146

}

147

148

static int longest_match(struct line *a, struct line *b, struct pos *pos,

148

static int longest_match(struct line *a, struct line *b, struct pos *pos,

149

int a1, int a2, int b1, int b2, int *omi, int *omj)

149

int a1, int a2, int b1, int b2, int *omi, int *omj)

150

{

150

{

151

int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;

151

int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;

152

153

for (i = a1; i < a2; i++) {

153

for (i = a1; i < a2; i++) {

154

/* skip things before the current block */

154

/* skip things before the current block */

155

for (j = a[i].n; j < b1; j = b[j].n)

155

for (j = a[i].n; j < b1; j = b[j].n)

156

;

156

;

157

158

/* loop through all lines match a[i] in b */

158

/* loop through all lines match a[i] in b */

159

for (; j < b2; j = b[j].n) {

159

for (; j < b2; j = b[j].n) {

160

/* does this extend an earlier match? */

160

/* does this extend an earlier match? */

161

if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)

161

if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)

162

k = pos[j - 1].len + 1;

162

k = pos[j - 1].len + 1;

163

else

163

else

164

k = 1;

164

k = 1;

165

pos[j].pos = i;

165

pos[j].pos = i;

166

pos[j].len = k;

166

pos[j].len = k;

167

168

/* best match so far? */

168

/* best match so far? */

169

if (k > mk) {

169

if (k > mk) {

170

mi = i;

170

mi = i;

171

mj = j;

171

mj = j;

172

mk = k;

172

mk = k;

173

}

173

}

174

}

174

}

175

}

175

}

176

177

if (mk) {

177

if (mk) {

178

mi = mi - mk + 1;

178

mi = mi - mk + 1;

179

mj = mj - mk + 1;

179

mj = mj - mk + 1;

180

}

180

}

181

182

/* expand match to include neighboring popular lines */

182

/* expand match to include neighboring popular lines */

183

while (mi - mb > a1 && mj - mb > b1 &&

183

while (mi - mb > a1 && mj - mb > b1 &&

184

a[mi - mb - 1].e == b[mj - mb - 1].e)

184

a[mi - mb - 1].e == b[mj - mb - 1].e)

185

mb++;

185

mb++;

186

while (mi + mk < a2 && mj + mk < b2 &&

186

while (mi + mk < a2 && mj + mk < b2 &&

187

a[mi + mk].e == b[mj + mk].e)

187

a[mi + mk].e == b[mj + mk].e)

188

mk++;

188

mk++;

189

190

*omi = mi - mb;

190

*omi = mi - mb;

191

*omj = mj - mb;

191

*omj = mj - mb;

192

193

return mk + mb;

193

return mk + mb;

194

}

194

}

195

196

static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,

196

static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,

197

int a1, int a2, int b1, int b2, struct hunk *l)

197

int a1, int a2, int b1, int b2, struct hunk *l)

198

{

198

{

199

int i, j, k;

199

int i, j, k;

200

201

while (1) {

201

while (1) {

202

/* find the longest match in this chunk */

202

/* find the longest match in this chunk */

203

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

203

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

204

if (!k)

204

if (!k)

205

return l;

205

return l;

206

207

/* and recurse on the remaining chunks on either side */

207

/* and recurse on the remaining chunks on either side */

208

l = recurse(a, b, pos, a1, i, b1, j, l);

208

l = recurse(a, b, pos, a1, i, b1, j, l);

209

if (!l)

209

if (!l)

210

return NULL;

210

return NULL;

211

212

l->next = (struct hunk *)malloc(sizeof(struct hunk));

212

l->next = (struct hunk *)malloc(sizeof(struct hunk));

213

if (!l->next)

213

if (!l->next)

214

return NULL;

214

return NULL;

215

216

l = l->next;

216

l = l->next;

217

l->a1 = i;

217

l->a1 = i;

218

l->a2 = i + k;

218

l->a2 = i + k;

219

l->b1 = j;

219

l->b1 = j;

220

l->b2 = j + k;

220

l->b2 = j + k;

221

l->next = NULL;

221

l->next = NULL;

222

223

/* tail-recursion didn't happen, so do equivalent iteration */

223

/* tail-recursion didn't happen, so do equivalent iteration */

224

a1 = i + k;

224

a1 = i + k;

225

b1 = j + k;

225

b1 = j + k;

226

}

226

}

227

}

227

}

228

229

static int diff(struct line *a, int an, struct line *b, int bn,

229

static int diff(struct line *a, int an, struct line *b, int bn,

230

struct hunk *base)

230

struct hunk *base)

231

{

231

{

232

struct hunk *curr;

232

struct hunk *curr;

233

struct pos *pos;

233

struct pos *pos;

234

int t, count = 0;

234

int t, count = 0;

235

236

/* allocate and fill arrays */

236

/* allocate and fill arrays */

237

t = equatelines(a, an, b, bn);

237

t = equatelines(a, an, b, bn);

238

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

238

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

239

240

if (pos && t) {

240

if (pos && t) {

241

/* generate the matching block list */

241

/* generate the matching block list */

242

243

curr = recurse(a, b, pos, 0, an, 0, bn, base);

243

curr = recurse(a, b, pos, 0, an, 0, bn, base);

244

if (!curr)

244

if (!curr)

245

return -1;

245

return -1;

246

247

/* sentinel end hunk */

247

/* sentinel end hunk */

248

curr->next = (struct hunk *)malloc(sizeof(struct hunk));

248

curr->next = (struct hunk *)malloc(sizeof(struct hunk));

249

if (!curr->next)

249

if (!curr->next)

250

return -1;

250

return -1;

251

curr = curr->next;

251

curr = curr->next;

252

curr->a1 = curr->a2 = an;

252

curr->a1 = curr->a2 = an;

253

curr->b1 = curr->b2 = bn;

253

curr->b1 = curr->b2 = bn;

254

curr->next = NULL;

254

curr->next = NULL;

255

}

255

}

256

257

free(pos);

257

free(pos);

258

259

/* normalize the hunk list, try to push each hunk towards the end */

259

/* normalize the hunk list, try to push each hunk towards the end */

260

for (curr = base->next; curr; curr = curr->next) {

260

for (curr = base->next; curr; curr = curr->next) {

261

struct hunk *next = curr->next;

261

struct hunk *next = curr->next;

262

int shift = 0;

262

int shift = 0;

263

264

if (!next)

264

if (!next)

265

break;

265

break;

266

267

if (curr->a2 == next->a1)

267

if (curr->a2 == next->a1 || curr->b2 == next->b1)

268

while (curr->a2 + shift < an && curr->b2 + shift < bn

268

while (curr->a2 + shift < an && curr->b2 + shift < bn

269

&& !cmp(a + curr->a2 + shift,

269

&& !cmp(a + curr->a2 + shift,

270

b + curr->b2 + shift))

270

b + curr->b2 + shift))

271

shift++;

271

shift++;

272

else if (curr->b2 == next->b1)

273

while (curr->b2 + shift < bn && curr->a2 + shift < an

274

&& !cmp(b + curr->b2 + shift,

275

a + curr->a2 + shift))

276

shift++;

277

if (!shift)

272

if (!shift)

278

continue;

273

continue;

279

curr->b2 += shift;

274

curr->b2 += shift;

280

next->b1 += shift;

275

next->b1 += shift;

281

curr->a2 += shift;

276

curr->a2 += shift;

282

next->a1 += shift;

277

next->a1 += shift;

283

}

278

}

284

279

285

for (curr = base->next; curr; curr = curr->next)

280

for (curr = base->next; curr; curr = curr->next)

286

count++;

281

count++;

287

return count;

282

return count;

288

}

283

}

289

284

290

static void freehunks(struct hunk *l)

285

static void freehunks(struct hunk *l)

291

{

286

{

292

struct hunk *n;

287

struct hunk *n;

293

for (; l; l = n) {

288

for (; l; l = n) {

294

n = l->next;

289

n = l->next;

295

free(l);

290

free(l);

296

}

291

}

297

}

292

}

298

293

299

static PyObject *blocks(PyObject *self, PyObject *args)

294

static PyObject *blocks(PyObject *self, PyObject *args)

300

{

295

{

301

PyObject *sa, *sb, *rl = NULL, *m;

296

PyObject *sa, *sb, *rl = NULL, *m;

302

struct line *a, *b;

297

struct line *a, *b;

303

struct hunk l, *h;

298

struct hunk l, *h;

304

int an, bn, count, pos = 0;

299

int an, bn, count, pos = 0;

305

300

306

l.next = NULL;

301

l.next = NULL;

307

302

308

if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))

303

if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))

309

return NULL;

304

return NULL;

310

305

311

an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);

306

an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);

312

bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);

307

bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);

313

308

314

if (!a || !b)

309

if (!a || !b)

315

goto nomem;

310

goto nomem;

316

311

317

count = diff(a, an, b, bn, &l);

312

count = diff(a, an, b, bn, &l);

318

if (count < 0)

313

if (count < 0)

319

goto nomem;

314

goto nomem;

320

315

321

rl = PyList_New(count);

316

rl = PyList_New(count);

322

if (!rl)

317

if (!rl)

323

goto nomem;

318

goto nomem;

324

319

325

for (h = l.next; h; h = h->next) {

320

for (h = l.next; h; h = h->next) {

326

m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);

321

m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);

327

PyList_SetItem(rl, pos, m);

322

PyList_SetItem(rl, pos, m);

328

pos++;

323

pos++;

329

}

324

}

330

325

331

nomem:

326

nomem:

332

free(a);

327

free(a);

333

free(b);

328

free(b);

334

freehunks(l.next);

329

freehunks(l.next);

335

return rl ? rl : PyErr_NoMemory();

330

return rl ? rl : PyErr_NoMemory();

336

}

331

}

337

332

338

static PyObject *bdiff(PyObject *self, PyObject *args)

333

static PyObject *bdiff(PyObject *self, PyObject *args)

339

{

334

{

340

char *sa, *sb, *rb;

335

char *sa, *sb, *rb;

341

PyObject *result = NULL;

336

PyObject *result = NULL;

342

struct line *al, *bl;

337

struct line *al, *bl;

343

struct hunk l, *h;

338

struct hunk l, *h;

344

int an, bn, count;

339

int an, bn, count;

345

Py_ssize_t len = 0, la, lb;

340

Py_ssize_t len = 0, la, lb;

346

PyThreadState *_save;

341

PyThreadState *_save;

347

342

348

l.next = NULL;

343

l.next = NULL;

349

344

350

if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))

345

if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))

351

return NULL;

346

return NULL;

352

347

353

if (la > UINT_MAX || lb > UINT_MAX) {

348

if (la > UINT_MAX || lb > UINT_MAX) {

354

PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");

349

PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");

355

return NULL;

350

return NULL;

356

}

351

}

357

352

358

_save = PyEval_SaveThread();

353

_save = PyEval_SaveThread();

359

an = splitlines(sa, la, &al);

354

an = splitlines(sa, la, &al);

360

bn = splitlines(sb, lb, &bl);

355

bn = splitlines(sb, lb, &bl);

361

if (!al || !bl)

356

if (!al || !bl)

362

goto nomem;

357

goto nomem;

363

358

364

count = diff(al, an, bl, bn, &l);

359

count = diff(al, an, bl, bn, &l);

365

if (count < 0)

360

if (count < 0)

366

goto nomem;

361

goto nomem;

367

362

368

/* calculate length of output */

363

/* calculate length of output */

369

la = lb = 0;

364

la = lb = 0;

370

for (h = l.next; h; h = h->next) {

365

for (h = l.next; h; h = h->next) {

371

if (h->a1 != la || h->b1 != lb)

366

if (h->a1 != la || h->b1 != lb)

372

len += 12 + bl[h->b1].l - bl[lb].l;

367

len += 12 + bl[h->b1].l - bl[lb].l;

373

la = h->a2;

368

la = h->a2;

374

lb = h->b2;

369

lb = h->b2;

375

}

370

}

376

PyEval_RestoreThread(_save);

371

PyEval_RestoreThread(_save);

377

_save = NULL;

372

_save = NULL;

378

373

379

result = PyBytes_FromStringAndSize(NULL, len);

374

result = PyBytes_FromStringAndSize(NULL, len);

380

375

381

if (!result)

376

if (!result)

382

goto nomem;

377

goto nomem;

383

378

384

/* build binary patch */

379

/* build binary patch */

385

rb = PyBytes_AsString(result);

380

rb = PyBytes_AsString(result);

386

la = lb = 0;

381

la = lb = 0;

387

382

388

for (h = l.next; h; h = h->next) {

383

for (h = l.next; h; h = h->next) {

389

if (h->a1 != la || h->b1 != lb) {

384

if (h->a1 != la || h->b1 != lb) {

390

len = bl[h->b1].l - bl[lb].l;

385

len = bl[h->b1].l - bl[lb].l;

391

putbe32((uint32_t)(al[la].l - al->l), rb);

386

putbe32((uint32_t)(al[la].l - al->l), rb);

392

putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);

387

putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);

393

putbe32((uint32_t)len, rb + 8);

388

putbe32((uint32_t)len, rb + 8);

394

memcpy(rb + 12, bl[lb].l, len);

389

memcpy(rb + 12, bl[lb].l, len);

395

rb += 12 + len;

390

rb += 12 + len;

396

}

391

}

397

la = h->a2;

392

la = h->a2;

398

lb = h->b2;

393

lb = h->b2;

399

}

394

}

400

395

401

nomem:

396

nomem:

402

if (_save)

397

if (_save)

403

PyEval_RestoreThread(_save);

398

PyEval_RestoreThread(_save);

404

free(al);

399

free(al);

405

free(bl);

400

free(bl);

406

freehunks(l.next);

401

freehunks(l.next);

407

return result ? result : PyErr_NoMemory();

402

return result ? result : PyErr_NoMemory();

408

}

403

}

409

404

410

/*

405

/*

411

* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,

406

* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,

412

* reduce whitespace sequences to a single space and trim remaining whitespace

407

* reduce whitespace sequences to a single space and trim remaining whitespace

413

* from end of lines.

408

* from end of lines.

414

*/

409

*/

415

static PyObject *fixws(PyObject *self, PyObject *args)

410

static PyObject *fixws(PyObject *self, PyObject *args)

416

{

411

{

417

PyObject *s, *result = NULL;

412

PyObject *s, *result = NULL;

418

char allws, c;

413

char allws, c;

419

const char *r;

414

const char *r;

420

Py_ssize_t i, rlen, wlen = 0;

415

Py_ssize_t i, rlen, wlen = 0;

421

char *w;

416

char *w;

422

417

423

if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))

418

if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))

424

return NULL;

419

return NULL;

425

r = PyBytes_AsString(s);

420

r = PyBytes_AsString(s);

426

rlen = PyBytes_Size(s);

421

rlen = PyBytes_Size(s);

427

422

428

w = (char *)malloc(rlen ? rlen : 1);

423

w = (char *)malloc(rlen ? rlen : 1);

429

if (!w)

424

if (!w)

430

goto nomem;

425

goto nomem;

431

426

432

for (i = 0; i != rlen; i++) {

427

for (i = 0; i != rlen; i++) {

433

c = r[i];

428

c = r[i];

434

if (c == ' ' || c == '\t' || c == '\r') {

429

if (c == ' ' || c == '\t' || c == '\r') {

435

if (!allws && (wlen == 0 || w[wlen - 1] != ' '))

430

if (!allws && (wlen == 0 || w[wlen - 1] != ' '))

436

w[wlen++] = ' ';

431

w[wlen++] = ' ';

437

} else if (c == '\n' && !allws

432

} else if (c == '\n' && !allws

438

&& wlen > 0 && w[wlen - 1] == ' ') {

433

&& wlen > 0 && w[wlen - 1] == ' ') {

439

w[wlen - 1] = '\n';

434

w[wlen - 1] = '\n';

440

} else {

435

} else {

441

w[wlen++] = c;

436

w[wlen++] = c;

442

}

437

}

443

}

438

}

444

439

445

result = PyBytes_FromStringAndSize(w, wlen);

440

result = PyBytes_FromStringAndSize(w, wlen);

446

441

447

nomem:

442

nomem:

448

free(w);

443

free(w);

449

return result ? result : PyErr_NoMemory();

444

return result ? result : PyErr_NoMemory();

450

}

445

}

451

446

452

447

453

static char mdiff_doc[] = "Efficient binary diff.";

448

static char mdiff_doc[] = "Efficient binary diff.";

454

449

455

static PyMethodDef methods[] = {

450

static PyMethodDef methods[] = {

456

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

451

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

457

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

452

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

458

{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},

453

{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},

459

{NULL, NULL}

454

{NULL, NULL}

460

};

455

};

461

456

462

#ifdef IS_PY3K

457

#ifdef IS_PY3K

463

static struct PyModuleDef bdiff_module = {

458

static struct PyModuleDef bdiff_module = {

464

PyModuleDef_HEAD_INIT,

459

PyModuleDef_HEAD_INIT,

465

"bdiff",

460

"bdiff",

466

mdiff_doc,

461

mdiff_doc,

467

-1,

462

-1,

468

methods

463

methods

469

};

464

};

470

465

471

PyMODINIT_FUNC PyInit_bdiff(void)

466

PyMODINIT_FUNC PyInit_bdiff(void)

472

{

467

{

473

return PyModule_Create(&bdiff_module);

468

return PyModule_Create(&bdiff_module);

474

}

469

}

475

#else

470

#else

476

PyMODINIT_FUNC initbdiff(void)

471

PyMODINIT_FUNC initbdiff(void)

477

{

472

{

478

Py_InitModule3("bdiff", methods, mdiff_doc);

473

Py_InitModule3("bdiff", methods, mdiff_doc);

479

}

474

}

480

#endif

475

#endif

481

476

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             /*
              bdiff.c - efficient binary diff extension for Mercurial
              Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
              This software may be used and distributed according to the terms of
              the GNU General Public License, incorporated herein by reference.
              Based roughly on Python difflib
             */
             #define PY_SSIZE_T_CLEAN
             #include <Python.h>
             #include <stdlib.h>
             #include <string.h>
             #include <limits.h>
             #include "util.h"
             /* One line of an input buffer plus the bookkeeping used to match it. */
             struct line {
             	int hash;       /* hash of the line's bytes (see splitlines) */
             	int n;          /* link into the chain of equal lines in b;
             	                   INT_MAX terminates the chain */
             	int e;          /* equivalence class: hash table slot of the line */
             	Py_ssize_t len; /* length in bytes, trailing '\n' included */
             	const char *l;  /* first byte of the line inside the caller's buffer */
             };
             /*
              * Small (index, count) pair with two uses in this file:
              *  - as a hash bucket in equatelines(): pos is the first line of b in
              *    the bucket's equivalence class, len is how many lines share it;
              *  - as a scratch entry in longest_match(): pos is the line of a last
              *    matched against this line of b, len is the length of that run.
              */
             struct pos {
             	int pos;
             	int len;
             };
             struct hunk;
             /*
              * One matching block: lines a[a1..a2) are equal to lines b[b1..b2).
              * Hunks form a singly linked list through next.
              */
             struct hunk {
             	int a1;
             	int a2;
             	int b1;
             	int b2;
             	struct hunk *next;
             };
             /*
              * Split the len bytes starting at a into lines and hash each of them.
              *
              * A line ends at every '\n' (the newline stays part of the line) and at
              * the last byte of the buffer. On success *lr receives a heap-allocated
              * array with one entry per line followed by a zero-length sentinel
              * entry, and the number of lines (sentinel excluded) is returned; the
              * caller owns the array and must free() it.
              *
              * On failure -- the array cannot be allocated or the number of lines
              * does not fit in an int -- *lr is set to NULL and -1 is returned.
              */
             static int splitlines(const char *a, Py_ssize_t len, struct line **lr)
             {
             	/* one past the last byte; unlike "a + len - 1" this is a valid
             	   pointer even for an empty buffer */
             	const char * const end = a + len;
             	const char *p, *start = a;
             	unsigned hash = 0;
             	int count = 1; /* extra line for sentinel */
             	struct line *l;

             	*lr = NULL;

             	/* count the lines */
             	for (p = a; p < end; p++) {
             		if (*p == '\n' || p + 1 == end) {
             			if (count == INT_MAX)
             				return -1; /* line count would overflow */
             			count++;
             		}
             	}

             	/* calloc checks count * size for overflow and zeroes the fields
             	   that are not assigned below (e, and n of the sentinel) */
             	*lr = l = (struct line *)calloc((size_t)count, sizeof(struct line));
             	if (!l)
             		return -1;

             	/* build the line array and calculate hashes */
             	for (p = a; p < end; p++) {
             		/* Leonid Yuriev's hash */
             		hash = (hash * 1664525) + (unsigned char)*p + 1013904223;

             		if (*p == '\n' || p + 1 == end) {
             			l->hash = hash;
             			hash = 0;
             			l->len = p - start + 1;
             			l->l = start;
             			l->n = INT_MAX; /* not linked into any chain yet */
             			l++;
             			start = p + 1;
             		}
             	}

             	/* set up a sentinel */
             	l->hash = 0;
             	l->len = 0;
             	l->l = end;

             	return count - 1;
             }
             /*
              * Compare two lines. Returns 0 when hash, length and bytes all agree,
              * 1 otherwise. The cheap hash and length checks run first so that
              * memcmp is only reached for likely matches.
              */
             static inline int cmp(struct line *a, struct line *b)
             {
             	if (a->hash != b->hash)
             		return 1;
             	if (a->len != b->len)
             		return 1;
             	return memcmp(a->l, b->l, a->len) != 0;
             }
             /*
              * Group the lines of b into equivalence classes and link the lines of a
              * to them.
              *
              * After a successful call, for every line of b: e is its hash table slot
              * and n is the next line of b in the same class (INT_MAX at the end of
              * the chain). For every line of a: e is the slot of the matching class
              * and n is the first line of b in that class, or INT_MAX when there is
              * no match or the class is too popular to be worth following.
              *
              * Returns 1 on success, 0 when no hash table could be allocated (or the
              * table size cannot be represented in an int).
              */
             static int equatelines(struct line *a, int an, struct line *b, int bn)
             {
             	int i, j, buckets = 1, t, scale;
             	struct pos *h = NULL;

             	/* build a hash table of the next highest power of 2 */
             	while (buckets <= bn) {
             		if (buckets > INT_MAX / 2)
             			return 0; /* doubling would overflow an int */
             		buckets *= 2;
             	}

             	/* try to allocate a large hash table to avoid collisions */
             	for (scale = 4; scale; scale /= 2) {
             		if (buckets > INT_MAX / scale)
             			continue; /* slot mask would not fit in an int */
             		/* calloc checks the size multiplication for overflow */
             		h = (struct pos *)calloc((size_t)buckets * scale,
             					 sizeof(struct pos));
             		if (h)
             			break;
             	}

             	if (!h)
             		return 0;

             	/* from here on buckets is used as a bit mask for slot indexes */
             	buckets = buckets * scale - 1;

             	/* clear the hash table */
             	for (i = 0; i <= buckets; i++) {
             		h[i].pos = INT_MAX;
             		h[i].len = 0;
             	}

             	/* add lines to the hash table chains; walking b backwards leaves
             	   each chain sorted by increasing line number */
             	for (i = bn - 1; i >= 0; i--) {
             		/* find the equivalence class */
             		for (j = b[i].hash & buckets; h[j].pos != INT_MAX;
             		     j = (j + 1) & buckets)
             			if (!cmp(b + i, b + h[j].pos))
             				break;

             		/* add to the head of the equivalence class */
             		b[i].n = h[j].pos;
             		b[i].e = j;
             		h[j].pos = i;
             		h[j].len++; /* keep track of popularity */
             	}

             	/* compute popularity threshold */
             	t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

             	/* match items in a to their equivalence class in b */
             	for (i = 0; i < an; i++) {
             		/* find the equivalence class */
             		for (j = a[i].hash & buckets; h[j].pos != INT_MAX;
             		     j = (j + 1) & buckets)
             			if (!cmp(a + i, b + h[j].pos))
             				break;

             		a[i].e = j; /* use equivalence class for quick compare */
             		if (h[j].len <= t)
             			a[i].n = h[j].pos; /* point to head of match list */
             		else
             			a[i].n = INT_MAX; /* too popular */
             	}

             	/* discard hash tables */
             	free(h);
             	return 1;
             }
             /*
              * Find the longest run of matching lines between a[a1..a2) and
              * b[b1..b2).
              *
              * pos is scratch space indexed by line of b: pos[j] remembers the line
              * of a most recently matched against b[j] and the length of the run
              * ending there. The start of the match is stored in *omi (index in a)
              * and *omj (index in b); the return value is its length, 0 when the two
              * ranges have nothing in common.
              */
             static int longest_match(struct line *a, struct line *b, struct pos *pos,
             			 int a1, int a2, int b1, int b2, int *omi, int *omj)
             {
             	int best_i = a1, best_j = b1, best_len = 0;
             	int ai, bj, run;
             	int lo_i, lo_j, hi_i, hi_j;

             	for (ai = a1; ai < a2; ai++) {
             		/* skip candidates that lie before the current block */
             		bj = a[ai].n;
             		while (bj < b1)
             			bj = b[bj].n;

             		/* visit every line of b inside the block that equals a[ai] */
             		while (bj < b2) {
             			/* a run continues when the previous pair matched too */
             			run = 1;
             			if (ai > a1 && bj > b1 && pos[bj - 1].pos == ai - 1)
             				run += pos[bj - 1].len;

             			pos[bj].pos = ai;
             			pos[bj].len = run;

             			/* remember the longest run; ties keep the earliest */
             			if (run > best_len) {
             				best_i = ai;
             				best_j = bj;
             				best_len = run;
             			}

             			bj = b[bj].n;
             		}
             	}

             	/* best_i/best_j mark the last pair of the run; move to its start */
             	if (best_len) {
             		best_i -= best_len - 1;
             		best_j -= best_len - 1;
             	}

             	/* grow the match over neighboring lines in the same equivalence
             	   class; this picks up popular lines skipped during the search */
             	lo_i = best_i;
             	lo_j = best_j;
             	while (lo_i > a1 && lo_j > b1 && a[lo_i - 1].e == b[lo_j - 1].e) {
             		lo_i--;
             		lo_j--;
             	}

             	hi_i = best_i + best_len;
             	hi_j = best_j + best_len;
             	while (hi_i < a2 && hi_j < b2 && a[hi_i].e == b[hi_j].e) {
             		hi_i++;
             		hi_j++;
             	}

             	*omi = lo_i;
             	*omj = lo_j;

             	return hi_i - lo_i;
             }
             /*
              * Append the matching blocks of a[a1..a2) versus b[b1..b2) to the hunk
              * list whose current tail is l.
              *
              * The longest match splits the range in two: the part before it is
              * handled by a recursive call, the part after it by the next loop
              * iteration. Returns the new tail of the list, or NULL when a hunk could
              * not be allocated (or when l was NULL on entry to the failing step).
              */
             static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,
             			    int a1, int a2, int b1, int b2, struct hunk *l)
             {
             	int mi, mj, mlen;
             	struct hunk *node;

             	for (;;) {
             		/* find the longest match in this chunk */
             		mlen = longest_match(a, b, pos, a1, a2, b1, b2, &mi, &mj);
             		if (mlen == 0)
             			break;

             		/* everything left of the match goes first */
             		l = recurse(a, b, pos, a1, mi, b1, mj, l);
             		if (l == NULL)
             			break;

             		/* then the match itself */
             		node = (struct hunk *)malloc(sizeof(struct hunk));
             		l->next = node;
             		if (node == NULL)
             			return NULL;

             		node->a1 = mi;
             		node->a2 = mi + mlen;
             		node->b1 = mj;
             		node->b2 = mj + mlen;
             		node->next = NULL;
             		l = node;

             		/* and the right-hand side is handled by looping instead of
             		   a second recursive call */
             		a1 = mi + mlen;
             		b1 = mj + mlen;
             	}

             	return l;
             }
             /*
              * Compute the list of matching blocks between a (an lines) and b (bn
              * lines).
              *
              * Hunks are linked after base. When the setup allocations succeed the
              * list ends with a sentinel hunk whose ranges are the empty a[an..an)
              * and b[bn..bn). Returns the number of hunks linked after base, or -1
              * when a hunk could not be allocated.
              *
              * NOTE(review): when equatelines() reports failure or pos cannot be
              * allocated, no hunks are generated and 0 is returned instead of -1;
              * confirm that callers really want an empty result in that case.
              * NOTE(review): both "return -1" paths below leave pos allocated.
              */
             static int diff(struct line *a, int an, struct line *b, int bn,
             		 struct hunk *base)
             {
             	struct hunk *curr;
             	struct pos *pos;
             	int t, count = 0;
             	/* allocate and fill arrays */
             	t = equatelines(a, an, b, bn);
             	/* one scratch entry per line of b (at least one so calloc(0) is avoided) */
             	pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));
             	if (pos && t) {
             		/* generate the matching block list */
             		curr = recurse(a, b, pos, 0, an, 0, bn, base);
             		if (!curr)
             			return -1;
             		/* sentinel end hunk */
             		curr->next = (struct hunk *)malloc(sizeof(struct hunk));
             		if (!curr->next)
             			return -1;
             		curr = curr->next;
             		curr->a1 = curr->a2 = an;
             		curr->b1 = curr->b2 = bn;
             		curr->next = NULL;
             	}
             	free(pos);
             	/* normalize the hunk list, try to push each hunk towards the end */
             	for (curr = base->next; curr; curr = curr->next) {
             		struct hunk *next = curr->next;
             		/* number of lines directly after curr that are equal in a and b */
             		int shift = 0;
             		if (!next)
             			break;
-            		if (curr->a2 == next->a1)
+            		if (curr->a2 == next->a1 || curr->b2 == next->b1)
             			while (curr->a2 + shift < an && curr->b2 + shift < bn
             			       && !cmp(a + curr->a2 + shift,
             				       b + curr->b2 + shift))
             				shift++;
-            		else if (curr->b2 == next->b1)
-            			while (curr->b2 + shift < bn && curr->a2 + shift < an
-            			       && !cmp(b + curr->b2 + shift,
-            				       a + curr->a2 + shift))
-            				shift++;
             		if (!shift)
             			continue;
             		/* move the boundary between curr and next by shift lines */
             		curr->b2 += shift;
             		next->b1 += shift;
             		curr->a2 += shift;
             		next->a1 += shift;
             	}
             	/* count the hunks, sentinel included */
             	for (curr = base->next; curr; curr = curr->next)
             		count++;
             	return count;
             }
             /* Release every hunk of the list starting at l; l may be NULL. */
             static void freehunks(struct hunk *l)
             {
             	while (l) {
             		struct hunk *dead = l;

             		/* read the link before the node goes away */
             		l = l->next;
             		free(dead);
             	}
             }
             /*
              * blocks(a, b) -> list of (a1, a2, b1, b2) tuples
              *
              * Python entry point: split both byte strings into lines, compute the
              * matching blocks and return them as a list with one 4-tuple per hunk
              * produced by diff(). Returns NULL with an exception set on failure.
              */
             static PyObject *blocks(PyObject *self, PyObject *args)
             {
             	PyObject *sa, *sb, *rl = NULL, *m;
             	struct line *a, *b;
             	struct hunk l, *h;
             	int an, bn, count, pos = 0;

             	l.next = NULL;

             	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
             		return NULL;

             	/* both calls always run, so a and b are both assigned (possibly
             	   NULL) before the check below */
             	an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
             	bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);

             	if (!a || !b)
             		goto nomem;

             	count = diff(a, an, b, bn, &l);
             	if (count < 0)
             		goto nomem;

             	rl = PyList_New(count);
             	if (!rl)
             		goto nomem;

             	for (h = l.next; h; h = h->next) {
             		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
             		if (!m) {
             			/* never hand back a list with an empty slot: drop
             			   the partial list and report the failure */
             			Py_DECREF(rl);
             			rl = NULL;
             			goto nomem;
             		}
             		/* PyList_SetItem takes over the reference to m */
             		PyList_SetItem(rl, pos, m);
             		pos++;
             	}

             nomem:
             	free(a);
             	free(b);
             	freehunks(l.next);
             	return rl ? rl : PyErr_NoMemory();
             }
             /*
              * bdiff(a, b) -> bytes
              *
              * Python entry point: build a binary patch that turns a into b. The
              * patch is a sequence of fragments, one per gap between matching
              * blocks; each fragment is three big-endian 32-bit values (start and
              * end byte offset of the replaced range in a, length of the new data)
              * followed by the new data taken from b.
              *
              * The interpreter lock is released while the lines are split and
              * matched and taken again before any Python object is created.
              */
             static PyObject *bdiff(PyObject *self, PyObject *args)
             {
             	char *sa, *sb, *out;
             	PyObject *result = NULL;
             	struct line *alines, *blines;
             	struct hunk head, *cur;
             	int an, bn, count;
             	int ia, ib; /* first line of a / b not yet covered by a hunk */
             	Py_ssize_t total = 0, lena, lenb, chunk;
             	PyThreadState *tstate;

             	head.next = NULL;

             	if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &lena, &sb, &lenb))
             		return NULL;

             	if (lena > UINT_MAX || lenb > UINT_MAX) {
             		PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
             		return NULL;
             	}

             	tstate = PyEval_SaveThread();

             	an = splitlines(sa, lena, &alines);
             	bn = splitlines(sb, lenb, &blines);
             	if (!alines || !blines)
             		goto nomem;

             	count = diff(alines, an, blines, bn, &head);
             	if (count < 0)
             		goto nomem;

             	/* first pass: add up the size of all fragments */
             	ia = 0;
             	ib = 0;
             	for (cur = head.next; cur; cur = cur->next) {
             		if (cur->a1 != ia || cur->b1 != ib)
             			total += 12 + (blines[cur->b1].l - blines[ib].l);
             		ia = cur->a2;
             		ib = cur->b2;
             	}

             	PyEval_RestoreThread(tstate);
             	tstate = NULL; /* tells the cleanup code the lock is held */

             	result = PyBytes_FromStringAndSize(NULL, total);
             	if (!result)
             		goto nomem;

             	/* second pass: write header and data of every fragment */
             	out = PyBytes_AsString(result);
             	ia = 0;
             	ib = 0;
             	for (cur = head.next; cur; cur = cur->next) {
             		if (cur->a1 != ia || cur->b1 != ib) {
             			chunk = blines[cur->b1].l - blines[ib].l;
             			putbe32((uint32_t)(alines[ia].l - alines->l), out);
             			putbe32((uint32_t)(alines[cur->a1].l - alines->l),
             				out + 4);
             			putbe32((uint32_t)chunk, out + 8);
             			memcpy(out + 12, blines[ib].l, chunk);
             			out += 12 + chunk;
             		}
             		ia = cur->a2;
             		ib = cur->b2;
             	}

             nomem:
             	if (tstate)
             		PyEval_RestoreThread(tstate);
             	free(alines);
             	free(blines);
             	freehunks(head.next);
             	if (result)
             		return result;
             	return PyErr_NoMemory();
             }
             /*
              * fixws(text, allws) -> bytes
              *
              * Normalize the whitespace (' ', \t and \r) of text. With allws != 0
              * every whitespace character is dropped. Otherwise each run of
              * whitespace becomes a single space, and a space that would end up
              * directly in front of a newline is replaced by that newline.
              */
             static PyObject *fixws(PyObject *self, PyObject *args)
             {
             	PyObject *s, *result = NULL;
             	char allws, c;
             	const char *r;
             	Py_ssize_t i, rlen, wlen = 0;
             	char *w;

             	if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))
             		return NULL;

             	r = PyBytes_AsString(s);
             	rlen = PyBytes_Size(s);

             	/* the output is never longer than the input */
             	w = (char *)malloc(rlen ? rlen : 1);
             	if (!w)
             		goto nomem;

             	for (i = 0; i != rlen; i++) {
             		c = r[i];
             		switch (c) {
             		case ' ':
             		case '\t':
             		case '\r':
             			/* emit one space per run, nothing when stripping */
             			if (!allws && (wlen == 0 || w[wlen - 1] != ' '))
             				w[wlen++] = ' ';
             			break;
             		case '\n':
             			if (!allws && wlen > 0 && w[wlen - 1] == ' ') {
             				/* overwrite the trailing space */
             				w[wlen - 1] = '\n';
             				break;
             			}
             			/* fall through: plain newline is copied as is */
             		default:
             			w[wlen++] = c;
             			break;
             		}
             	}

             	result = PyBytes_FromStringAndSize(w, wlen);

             nomem:
             	free(w);
             	if (result)
             		return result;
             	return PyErr_NoMemory();
             }
             /* docstring of the extension module */
             static char mdiff_doc[] = "Efficient binary diff.";

             /* functions exported to Python; the all-zero entry ends the table */
             static PyMethodDef methods[] = {
             	{
             		"bdiff", bdiff, METH_VARARGS,
             		"calculate a binary diff\n"
             	},
             	{
             		"blocks", blocks, METH_VARARGS,
             		"find a list of matching lines\n"
             	},
             	{
             		"fixws", fixws, METH_VARARGS,
             		"normalize diff whitespaces\n"
             	},
             	{NULL, NULL, 0, NULL}
             };
             /*
              * Module initialization. Exactly one of the two entry points below is
              * compiled, selected by IS_PY3K: PyInit_bdiff creates the module from a
              * PyModuleDef, initbdiff registers it through Py_InitModule3. Both
              * expose the same methods table and docstring under the name "bdiff".
              */
             #ifdef IS_PY3K
             static struct PyModuleDef bdiff_module = {
             	PyModuleDef_HEAD_INIT,
             	"bdiff",	/* module name */
             	mdiff_doc,	/* module docstring */
             	-1,		/* module state size; see the PyModuleDef docs for -1 */
             	methods		/* method table */
             };
             PyMODINIT_FUNC PyInit_bdiff(void)
             {
             	return PyModule_Create(&bdiff_module);
             }
             #else
             PyMODINIT_FUNC initbdiff(void)
             {
             	Py_InitModule3("bdiff", methods, mdiff_doc);
             }
             #endif