upstream/mercurial-mirror Commit - r29539:666832b9

1

/*

1

/*

2

bdiff.c - efficient binary diff extension for Mercurial

2

bdiff.c - efficient binary diff extension for Mercurial

3

4

5

6

This software may be used and distributed according to the terms of

6

This software may be used and distributed according to the terms of

7

the GNU General Public License, incorporated herein by reference.

7

the GNU General Public License, incorporated herein by reference.

8

9

Based roughly on Python difflib

9

Based roughly on Python difflib

10

*/

10

*/

11

12

#define PY_SSIZE_T_CLEAN

12

#define PY_SSIZE_T_CLEAN

13

#include <Python.h>

13

#include <Python.h>

14

#include <stdlib.h>

14

#include <stdlib.h>

15

#include <string.h>

15

#include <string.h>

16

#include <limits.h>

16

#include <limits.h>

17

18

#include "compat.h"

18

#include "util.h"

19

#include "util.h"

19

#include "bitmanipulation.h"

20

#include "bitmanipulation.h"

20

21

/* One line of an input buffer, as produced by splitlines(). */
struct line {
	int hash;      /* hash of the line's bytes (computed in splitlines) */
	int n;         /* link to a matching line in b, -1 for none; set by equatelines */
	int e;         /* equivalence class: hash table slot chosen by equatelines */
	ssize_t len;   /* length in bytes, including a trailing '\n' if present */
	const char *l; /* start of the line inside the caller's buffer */
};

26

27

/*
 * Small (index, count) pair with two uses in this file:
 *  - as a hash table slot in equatelines(): pos is the index of the most
 *    recently inserted line of b in that slot (-1 when empty) and len is
 *    the number of lines inserted into it;
 *  - as per-line scratch in longest_match(): pos is the index in a of the
 *    last match ending at this line of b and len is that match's length.
 */
struct pos {
	int pos, len;
};

30

31

struct hunk;

/*
 * One matching block in a singly linked list: lines a1..a2-1 of a
 * correspond to lines b1..b2-1 of b (a2 and b2 are exclusive ends).
 */
struct hunk {
	int a1, a2, b1, b2;
	struct hunk *next; /* following block, NULL at the end of the list */
};

36

37

static int splitlines(const char *a, ~~Py_~~ssize_t len, struct line **lr)

38

static int splitlines(const char *a, ssize_t len, struct line **lr)

38

{

39

{

39

unsigned hash;

40

unsigned hash;

40

int i;

41

int i;

41

const char *p, *b = a;

42

const char *p, *b = a;

42

const char * const plast = a + len - 1;

43

const char * const plast = a + len - 1;

43

struct line *l;

44

struct line *l;

44

45

/* count the lines */

46

/* count the lines */

46

i = 1; /* extra line for sentinel */

47

i = 1; /* extra line for sentinel */

47

for (p = a; p < a + len; p++)

48

for (p = a; p < a + len; p++)

48

if (*p == '\n' || p == plast)

49

if (*p == '\n' || p == plast)

49

i++;

50

i++;

50

51

*lr = l = (struct line *)malloc(sizeof(struct line) * i);

52

*lr = l = (struct line *)malloc(sizeof(struct line) * i);

52

if (!l)

53

if (!l)

53

return -1;

54

return -1;

54

55

/* build the line array and calculate hashes */

56

/* build the line array and calculate hashes */

56

hash = 0;

57

hash = 0;

57

for (p = a; p < a + len; p++) {

58

for (p = a; p < a + len; p++) {

58

/* Leonid Yuriev's hash */

59

/* Leonid Yuriev's hash */

59

hash = (hash * 1664525) + (unsigned char)*p + 1013904223;

60

hash = (hash * 1664525) + (unsigned char)*p + 1013904223;

60

61

if (*p == '\n' || p == plast) {

62

if (*p == '\n' || p == plast) {

62

l->hash = hash;

63

l->hash = hash;

63

hash = 0;

64

hash = 0;

64

l->len = p - b + 1;

65

l->len = p - b + 1;

65

l->l = b;

66

l->l = b;

66

l->n = INT_MAX;

67

l->n = INT_MAX;

67

l++;

68

l++;

68

b = p + 1;

69

b = p + 1;

69

}

70

}

70

}

71

}

71

72

/* set up a sentinel */

73

/* set up a sentinel */

73

l->hash = 0;

74

l->hash = 0;

74

l->len = 0;

75

l->len = 0;

75

l->l = a + len;

76

l->l = a + len;

76

return i - 1;

77

return i - 1;

77

}

78

}

78

79

static inline int cmp(struct line *a, struct line *b)

80

static inline int cmp(struct line *a, struct line *b)

80

{

81

{

81

return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);

82

return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);

82

}

83

}

83

84

static int equatelines(struct line *a, int an, struct line *b, int bn)

85

static int equatelines(struct line *a, int an, struct line *b, int bn)

85

{

86

{

86

int i, j, buckets = 1, t, scale;

87

int i, j, buckets = 1, t, scale;

87

struct pos *h = NULL;

88

struct pos *h = NULL;

88

89

/* build a hash table of the next highest power of 2 */

90

/* build a hash table of the next highest power of 2 */

90

while (buckets < bn + 1)

91

while (buckets < bn + 1)

91

buckets *= 2;

92

buckets *= 2;

92

93

/* try to allocate a large hash table to avoid collisions */

94

/* try to allocate a large hash table to avoid collisions */

94

for (scale = 4; scale; scale /= 2) {

95

for (scale = 4; scale; scale /= 2) {

95

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

96

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

96

if (h)

97

if (h)

97

break;

98

break;

98

}

99

}

99

100

if (!h)

101

if (!h)

101

return 0;

102

return 0;

102

103

buckets = buckets * scale - 1;

104

buckets = buckets * scale - 1;

104

105

/* clear the hash table */

106

/* clear the hash table */

106

for (i = 0; i <= buckets; i++) {

107

for (i = 0; i <= buckets; i++) {

107

h[i].pos = -1;

108

h[i].pos = -1;

108

h[i].len = 0;

109

h[i].len = 0;

109

}

110

}

110

111

/* add lines to the hash table chains */

112

/* add lines to the hash table chains */

112

for (i = 0; i < bn; i++) {

113

for (i = 0; i < bn; i++) {

113

/* find the equivalence class */

114

/* find the equivalence class */

114

for (j = b[i].hash & buckets; h[j].pos != -1;

115

for (j = b[i].hash & buckets; h[j].pos != -1;

115

j = (j + 1) & buckets)

116

j = (j + 1) & buckets)

116

if (!cmp(b + i, b + h[j].pos))

117

if (!cmp(b + i, b + h[j].pos))

117

break;

118

break;

118

119

/* add to the head of the equivalence class */

120

/* add to the head of the equivalence class */

120

b[i].n = h[j].pos;

121

b[i].n = h[j].pos;

121

b[i].e = j;

122

b[i].e = j;

122

h[j].pos = i;

123

h[j].pos = i;

123

h[j].len++; /* keep track of popularity */

124

h[j].len++; /* keep track of popularity */

124

}

125

}

125

126

/* compute popularity threshold */

127

/* compute popularity threshold */

127

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

128

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

128

129

/* match items in a to their equivalence class in b */

130

/* match items in a to their equivalence class in b */

130

for (i = 0; i < an; i++) {

131

for (i = 0; i < an; i++) {

131

/* find the equivalence class */

132

/* find the equivalence class */

132

for (j = a[i].hash & buckets; h[j].pos != -1;

133

for (j = a[i].hash & buckets; h[j].pos != -1;

133

j = (j + 1) & buckets)

134

j = (j + 1) & buckets)

134

if (!cmp(a + i, b + h[j].pos))

135

if (!cmp(a + i, b + h[j].pos))

135

break;

136

break;

136

137

a[i].e = j; /* use equivalence class for quick compare */

138

a[i].e = j; /* use equivalence class for quick compare */

138

if (h[j].len <= t)

139

if (h[j].len <= t)

139

a[i].n = h[j].pos; /* point to head of match list */

140

a[i].n = h[j].pos; /* point to head of match list */

140

else

141

else

141

a[i].n = -1; /* too popular */

142

a[i].n = -1; /* too popular */

142

}

143

}

143

144

/* discard hash tables */

145

/* discard hash tables */

145

free(h);

146

free(h);

146

return 1;

147

return 1;

147

}

148

}

148

149

static int longest_match(struct line *a, struct line *b, struct pos *pos,

150

static int longest_match(struct line *a, struct line *b, struct pos *pos,

150

int a1, int a2, int b1, int b2, int *omi, int *omj)

151

int a1, int a2, int b1, int b2, int *omi, int *omj)

151

{

152

{

152

int mi = a1, mj = b1, mk = 0, i, j, k, half;

153

int mi = a1, mj = b1, mk = 0, i, j, k, half;

153

154

/* window our search on large regions to better bound

155

/* window our search on large regions to better bound

155

worst-case performance. by choosing a window at the end, we

156

worst-case performance. by choosing a window at the end, we

156

reduce skipping overhead on the b chains. */

157

reduce skipping overhead on the b chains. */

157

if (a2 - a1 > 30000)

158

if (a2 - a1 > 30000)

158

a1 = a2 - 30000;

159

a1 = a2 - 30000;

159

160

half = (a1 + a2) / 2;

161

half = (a1 + a2) / 2;

161

162

for (i = a1; i < a2; i++) {

163

for (i = a1; i < a2; i++) {

163

/* skip all lines in b after the current block */

164

/* skip all lines in b after the current block */

164

for (j = a[i].n; j >= b2; j = b[j].n)

165

for (j = a[i].n; j >= b2; j = b[j].n)

165

;

166

;

166

167

/* loop through all lines match a[i] in b */

168

/* loop through all lines match a[i] in b */

168

for (; j >= b1; j = b[j].n) {

169

for (; j >= b1; j = b[j].n) {

169

/* does this extend an earlier match? */

170

/* does this extend an earlier match? */

170

for (k = 1; j - k >= b1 && i - k >= a1; k++) {

171

for (k = 1; j - k >= b1 && i - k >= a1; k++) {

171

/* reached an earlier match? */

172

/* reached an earlier match? */

172

if (pos[j - k].pos == i - k) {

173

if (pos[j - k].pos == i - k) {

173

k += pos[j - k].len;

174

k += pos[j - k].len;

174

break;

175

break;

175

}

176

}

176

/* previous line mismatch? */

177

/* previous line mismatch? */

177

if (a[i - k].e != b[j - k].e)

178

if (a[i - k].e != b[j - k].e)

178

break;

179

break;

179

}

180

}

180

181

pos[j].pos = i;

182

pos[j].pos = i;

182

pos[j].len = k;

183

pos[j].len = k;

183

184

/* best match so far? we prefer matches closer

185

/* best match so far? we prefer matches closer

185

to the middle to balance recursion */

186

to the middle to balance recursion */

186

if (k > mk || (k == mk && (i <= mi || i < half))) {

187

if (k > mk || (k == mk && (i <= mi || i < half))) {

187

mi = i;

188

mi = i;

188

mj = j;

189

mj = j;

189

mk = k;

190

mk = k;

190

}

191

}

191

}

192

}

192

}

193

}

193

194

if (mk) {

195

if (mk) {

195

mi = mi - mk + 1;

196

mi = mi - mk + 1;

196

mj = mj - mk + 1;

197

mj = mj - mk + 1;

197

}

198

}

198

199

/* expand match to include subsequent popular lines */

200

/* expand match to include subsequent popular lines */

200

while (mi + mk < a2 && mj + mk < b2 &&

201

while (mi + mk < a2 && mj + mk < b2 &&

201

a[mi + mk].e == b[mj + mk].e)

202

a[mi + mk].e == b[mj + mk].e)

202

mk++;

203

mk++;

203

204

*omi = mi;

205

*omi = mi;

205

*omj = mj;

206

*omj = mj;

206

207

return mk;

208

return mk;

208

}

209

}

209

210

static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,

211

static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,

211

int a1, int a2, int b1, int b2, struct hunk *l)

212

int a1, int a2, int b1, int b2, struct hunk *l)

212

{

213

{

213

int i, j, k;

214

int i, j, k;

214

215

while (1) {

216

while (1) {

216

/* find the longest match in this chunk */

217

/* find the longest match in this chunk */

217

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

218

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

218

if (!k)

219

if (!k)

219

return l;

220

return l;

220

221

/* and recurse on the remaining chunks on either side */

222

/* and recurse on the remaining chunks on either side */

222

l = recurse(a, b, pos, a1, i, b1, j, l);

223

l = recurse(a, b, pos, a1, i, b1, j, l);

223

if (!l)

224

if (!l)

224

return NULL;

225

return NULL;

225

226

l->next = (struct hunk *)malloc(sizeof(struct hunk));

227

l->next = (struct hunk *)malloc(sizeof(struct hunk));

227

if (!l->next)

228

if (!l->next)

228

return NULL;

229

return NULL;

229

230

l = l->next;

231

l = l->next;

231

l->a1 = i;

232

l->a1 = i;

232

l->a2 = i + k;

233

l->a2 = i + k;

233

l->b1 = j;

234

l->b1 = j;

234

l->b2 = j + k;

235

l->b2 = j + k;

235

l->next = NULL;

236

l->next = NULL;

236

237

/* tail-recursion didn't happen, so do equivalent iteration */

238

/* tail-recursion didn't happen, so do equivalent iteration */

238

a1 = i + k;

239

a1 = i + k;

239

b1 = j + k;

240

b1 = j + k;

240

}

241

}

241

}

242

}

242

243

static int diff(struct line *a, int an, struct line *b, int bn,

244

static int diff(struct line *a, int an, struct line *b, int bn,

244

struct hunk *base)

245

struct hunk *base)

245

{

246

{

246

struct hunk *curr;

247

struct hunk *curr;

247

struct pos *pos;

248

struct pos *pos;

248

int t, count = 0;

249

int t, count = 0;

249

250

/* allocate and fill arrays */

251

/* allocate and fill arrays */

251

t = equatelines(a, an, b, bn);

252

t = equatelines(a, an, b, bn);

252

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

253

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

253

254

if (pos && t) {

255

if (pos && t) {

255

/* generate the matching block list */

256

/* generate the matching block list */

256

257

curr = recurse(a, b, pos, 0, an, 0, bn, base);

258

curr = recurse(a, b, pos, 0, an, 0, bn, base);

258

if (!curr)

259

if (!curr)

259

return -1;

260

return -1;

260

261

/* sentinel end hunk */

262

/* sentinel end hunk */

262

curr->next = (struct hunk *)malloc(sizeof(struct hunk));

263

curr->next = (struct hunk *)malloc(sizeof(struct hunk));

263

if (!curr->next)

264

if (!curr->next)

264

return -1;

265

return -1;

265

curr = curr->next;

266

curr = curr->next;

266

curr->a1 = curr->a2 = an;

267

curr->a1 = curr->a2 = an;

267

curr->b1 = curr->b2 = bn;

268

curr->b1 = curr->b2 = bn;

268

curr->next = NULL;

269

curr->next = NULL;

269

}

270

}

270

271

free(pos);

272

free(pos);

272

273

/* normalize the hunk list, try to push each hunk towards the end */

274

/* normalize the hunk list, try to push each hunk towards the end */

274

for (curr = base->next; curr; curr = curr->next) {

275

for (curr = base->next; curr; curr = curr->next) {

275

struct hunk *next = curr->next;

276

struct hunk *next = curr->next;

276

277

if (!next)

278

if (!next)

278

break;

279

break;

279

280

if (curr->a2 == next->a1 || curr->b2 == next->b1)

281

if (curr->a2 == next->a1 || curr->b2 == next->b1)

281

while (curr->a2 < an && curr->b2 < bn

282

while (curr->a2 < an && curr->b2 < bn

282

&& next->a1 < next->a2

283

&& next->a1 < next->a2

283

&& next->b1 < next->b2

284

&& next->b1 < next->b2

284

&& !cmp(a + curr->a2, b + curr->b2)) {

285

&& !cmp(a + curr->a2, b + curr->b2)) {

285

curr->a2++;

286

curr->a2++;

286

next->a1++;

287

next->a1++;

287

curr->b2++;

288

curr->b2++;

288

next->b1++;

289

next->b1++;

289

}

290

}

290

}

291

}

291

292

for (curr = base->next; curr; curr = curr->next)

293

for (curr = base->next; curr; curr = curr->next)

293

count++;

294

count++;

294

return count;

295

return count;

295

}

296

}

296

297

static void freehunks(struct hunk *l)

298

static void freehunks(struct hunk *l)

298

{

299

{

299

struct hunk *n;

300

struct hunk *n;

300

for (; l; l = n) {

301

for (; l; l = n) {

301

n = l->next;

302

n = l->next;

302

free(l);

303

free(l);

303

}

304

}

304

}

305

}

305

306

/*
 * Python entry point: blocks(a, b).
 *
 * Takes two string objects ("S" format) and returns a list of
 * (a1, a2, b1, b2) tuples, one per hunk produced by diff(), including the
 * final zero-length sentinel hunk. Raises MemoryError if an allocation
 * fails.
 */
static PyObject *blocks(PyObject *self, PyObject *args)
{
	PyObject *sa, *sb, *rl = NULL, *m;
	struct line *a, *b;
	struct hunk l, *h;
	int an, bn, count, pos = 0;

	l.next = NULL;

	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
		return NULL;

	an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
	bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);

	if (!a || !b)
		goto nomem;

	count = diff(a, an, b, bn, &l);
	if (count < 0)
		goto nomem;

	rl = PyList_New(count);
	if (!rl)
		goto nomem;

	for (h = l.next; h; h = h->next) {
		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
		if (!m) {
			/* do not hand back a list with an unfilled slot */
			Py_DECREF(rl);
			rl = NULL;
			goto nomem;
		}
		PyList_SetItem(rl, pos, m); /* takes ownership of m */
		pos++;
	}

nomem:
	free(a);
	free(b);
	freehunks(l.next);
	return rl ? rl : PyErr_NoMemory();
}

344

345

/*
 * Python entry point: bdiff(a, b).
 *
 * Returns a binary patch as a bytes object. For every gap between
 * consecutive matching blocks the patch holds a 12-byte header of three
 * 32-bit fields written with putbe32() -- the start and end byte offsets
 * of the replaced region in a and the length of the replacement -- and
 * then the replacement bytes taken from b.
 * NOTE(review): putbe32 is presumably a big-endian store from
 * bitmanipulation.h -- confirm.
 *
 * Inputs longer than UINT_MAX bytes are rejected with ValueError because
 * offsets are stored in 32 bits. Raises MemoryError if an allocation
 * fails. The interpreter lock is released while the diff is computed.
 */
static PyObject *bdiff(PyObject *self, PyObject *args)
{
	char *sa, *sb, *rb;
	PyObject *result = NULL;
	struct line *al, *bl;
	struct hunk l, *h;
	int an, bn, count;
	Py_ssize_t len = 0, la, lb;
	PyThreadState *_save;

	l.next = NULL;

	if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
		return NULL;

	if (la > UINT_MAX || lb > UINT_MAX) {
		PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
		return NULL;
	}

	_save = PyEval_SaveThread();
	an = splitlines(sa, la, &al);
	bn = splitlines(sb, lb, &bl);
	if (!al || !bl)
		goto nomem;

	count = diff(al, an, bl, bn, &l);
	if (count < 0)
		goto nomem;

	/* calculate length of output */
	la = lb = 0; /* reused as line indices: end of the previous hunk */
	for (h = l.next; h; h = h->next) {
		/* subtract the pointers first: adding 12 to the line pointer
		   could step past the end of the input buffer */
		if (h->a1 != la || h->b1 != lb)
			len += 12 + (bl[h->b1].l - bl[lb].l);
		la = h->a2;
		lb = h->b2;
	}
	PyEval_RestoreThread(_save);
	_save = NULL; /* marks the thread state as already restored */

	result = PyBytes_FromStringAndSize(NULL, len);

	if (!result)
		goto nomem;

	/* build binary patch */
	rb = PyBytes_AsString(result);
	la = lb = 0;

	for (h = l.next; h; h = h->next) {
		if (h->a1 != la || h->b1 != lb) {
			len = bl[h->b1].l - bl[lb].l;
			putbe32((uint32_t)(al[la].l - al->l), rb);
			putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);
			putbe32((uint32_t)len, rb + 8);
			memcpy(rb + 12, bl[lb].l, len);
			rb += 12 + len;
		}
		la = h->a2;
		lb = h->b2;
	}

nomem:
	if (_save)
		PyEval_RestoreThread(_save);
	free(al);
	free(bl);
	freehunks(l.next);
	return result ? result : PyErr_NoMemory();
}

416

417

/*

418

/*

418

* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,

419

* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,

419

* reduce whitespace sequences to a single space and trim remaining whitespace

420

* reduce whitespace sequences to a single space and trim remaining whitespace

420

* from end of lines.

421

* from end of lines.

421

*/

422

*/

422

static PyObject *fixws(PyObject *self, PyObject *args)

423

static PyObject *fixws(PyObject *self, PyObject *args)

423

{

424

{

424

PyObject *s, *result = NULL;

425

PyObject *s, *result = NULL;

425

char allws, c;

426

char allws, c;

426

const char *r;

427

const char *r;

427

Py_ssize_t i, rlen, wlen = 0;

428

Py_ssize_t i, rlen, wlen = 0;

428

char *w;

429

char *w;

429

430

if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))

431

if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))

431

return NULL;

432

return NULL;

432

r = PyBytes_AsString(s);

433

r = PyBytes_AsString(s);

433

rlen = PyBytes_Size(s);

434

rlen = PyBytes_Size(s);

434

435

w = (char *)malloc(rlen ? rlen : 1);

436

w = (char *)malloc(rlen ? rlen : 1);

436

if (!w)

437

if (!w)

437

goto nomem;

438

goto nomem;

438

439

for (i = 0; i != rlen; i++) {

440

for (i = 0; i != rlen; i++) {

440

c = r[i];

441

c = r[i];

441

if (c == ' ' || c == '\t' || c == '\r') {

442

if (c == ' ' || c == '\t' || c == '\r') {

442

if (!allws && (wlen == 0 || w[wlen - 1] != ' '))

443

if (!allws && (wlen == 0 || w[wlen - 1] != ' '))

443

w[wlen++] = ' ';

444

w[wlen++] = ' ';

444

} else if (c == '\n' && !allws

445

} else if (c == '\n' && !allws

445

&& wlen > 0 && w[wlen - 1] == ' ') {

446

&& wlen > 0 && w[wlen - 1] == ' ') {

446

w[wlen - 1] = '\n';

447

w[wlen - 1] = '\n';

447

} else {

448

} else {

448

w[wlen++] = c;

449

w[wlen++] = c;

449

}

450

}

450

}

451

}

451

452

result = PyBytes_FromStringAndSize(w, wlen);

453

result = PyBytes_FromStringAndSize(w, wlen);

453

454

nomem:

455

nomem:

455

free(w);

456

free(w);

456

return result ? result : PyErr_NoMemory();

457

return result ? result : PyErr_NoMemory();

457

}

458

}

458

459

460

static char mdiff_doc[] = "Efficient binary diff.";

461

static char mdiff_doc[] = "Efficient binary diff.";

461

462

static PyMethodDef methods[] = {

463

static PyMethodDef methods[] = {

463

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

464

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

464

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

465

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

465

{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},

466

{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},

466

{NULL, NULL}

467

{NULL, NULL}

467

};

468

};

468

469

#ifdef IS_PY3K

470

#ifdef IS_PY3K

470

static struct PyModuleDef bdiff_module = {

471

static struct PyModuleDef bdiff_module = {

471

PyModuleDef_HEAD_INIT,

472

PyModuleDef_HEAD_INIT,

472

"bdiff",

473

"bdiff",

473

mdiff_doc,

474

mdiff_doc,

474

-1,

475

-1,

475

methods

476

methods

476

};

477

};

477

478

PyMODINIT_FUNC PyInit_bdiff(void)

479

PyMODINIT_FUNC PyInit_bdiff(void)

479

{

480

{

480

return PyModule_Create(&bdiff_module);

481

return PyModule_Create(&bdiff_module);

481

}

482

}

482

#else

483

#else

483

PyMODINIT_FUNC initbdiff(void)

484

PyMODINIT_FUNC initbdiff(void)

484

{

485

{

485

Py_InitModule3("bdiff", methods, mdiff_doc);

486

Py_InitModule3("bdiff", methods, mdiff_doc);

486

}

487

}

487

#endif

488

#endif

488

489

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             /*
              bdiff.c - efficient binary diff extension for Mercurial
              Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
              This software may be used and distributed according to the terms of
              the GNU General Public License, incorporated herein by reference.
              Based roughly on Python difflib
             */
             #define PY_SSIZE_T_CLEAN
             #include <Python.h>
             #include <stdlib.h>
             #include <string.h>
             #include <limits.h>
+            #include "compat.h"
             #include "util.h"
             #include "bitmanipulation.h"
             /* One line of input, filled in by splitlines() and annotated by equatelines(). */
             struct line {
             	/* hash: hash of the line's bytes (0 for the sentinel entry).
             	   n: index of the next candidate line in b; splitlines() sets it
             	      to INT_MAX, equatelines() to a chain link or -1.
             	   e: hash-table slot of the line's equivalence class, set by
             	      equatelines() and used for quick equality checks. */
             	int hash, n, e;
             	/* len: length of the line in bytes, counting its trailing '\n' if any */
-            	Py_ssize_t len;
+            	ssize_t len;
             	/* start of the line inside the caller's buffer (not a copy) */
             	const char *l;
             };
             /* Pair of ints with two uses in this file: a hash-table slot in
                equatelines() (pos = head line index in b or -1, len = number of
                lines in the class) and a per-line match record in longest_match()
                (pos = matching index in a, len = length of the match ending there). */
             struct pos {
             	int pos, len;
             };
             struct hunk;
             /* A block of matching lines: a[a1..a2) corresponds to b[b1..b2).
                Hunks are chained into a singly linked list through next. */
             struct hunk {
             	int a1, a2, b1, b2;
             	struct hunk *next;
             };
-            static int splitlines(const char *a, Py_ssize_t len, struct line **lr)
+            static int splitlines(const char *a, ssize_t len, struct line **lr)
             {
             	/*
             	 * Split the len bytes starting at a into lines and store a newly
             	 * malloc'd array describing them in *lr (the caller frees it).
             	 * A line ends at '\n' or at the last byte of the buffer, so an
             	 * unterminated final line still counts. The array has one extra
             	 * sentinel entry after the last line.
             	 *
             	 * Returns the number of lines (sentinel not counted), or -1 if the
             	 * allocation fails, in which case *lr is NULL.
             	 */
             	unsigned hash;
             	int i;
             	const char *p, *b = a;
             	/* address of the final byte; used to close an unterminated last line */
             	const char * const plast = a + len - 1;
             	struct line *l;
             	/* count the lines */
             	i = 1; /* extra line for sentinel */
             	for (p = a; p < a + len; p++)
             		if (*p == '\n' || p == plast)
             			i++;
             	*lr = l = (struct line *)malloc(sizeof(struct line) * i);
             	if (!l)
             		return -1;
             	/* build the line array and calculate hashes */
             	hash = 0;
             	for (p = a; p < a + len; p++) {
             		/* Leonid Yuriev's hash */
             		hash = (hash * 1664525) + (unsigned char)*p + 1013904223;
             		if (*p == '\n' || p == plast) {
             			/* end of a line: record it and restart the hash */
             			l->hash = hash;
             			hash = 0;
             			l->len = p - b + 1;
             			l->l = b;
             			l->n = INT_MAX;
             			l++;
             			b = p + 1;
             		}
             	}
             	/* set up a sentinel: zero length, pointing one past the end of the
             	   buffer so that pointer differences against it give byte offsets */
             	l->hash = 0;
             	l->len = 0;
             	l->l = a + len;
             	return i - 1;
             }
             /*
              * Compare two lines for equality. Returns 0 when they are identical and
              * non-zero otherwise. The cheap hash and length checks run first so the
              * byte comparison only happens for likely matches.
              */
             static inline int cmp(struct line *a, struct line *b)
             {
             	if (a->hash != b->hash)
             		return 1;
             	if (a->len != b->len)
             		return 1;
             	return memcmp(a->l, b->l, a->len) != 0;
             }
             /*
              * Group identical lines into equivalence classes.
              *
              * A temporary hash table with linear probing is built over the bn lines
              * of b. Every line of b is chained (through its n field) to the earlier
              * lines of b with the same content, and every line of a (an of them) is
              * pointed at the head of the matching chain in b. The e field of each
              * line is set to the table slot of its class, so later code can compare
              * lines by comparing e.
              *
              * Returns 1 on success, 0 if the hash table could not be allocated.
              */
             static int equatelines(struct line *a, int an, struct line *b, int bn)
             {
             	int i, j, buckets = 1, t, scale;
             	struct pos *h = NULL;
             	/* build a hash table of the next highest power of 2 */
             	while (buckets < bn + 1)
             		buckets *= 2;
             	/* try to allocate a large hash table to avoid collisions */
             	for (scale = 4; scale; scale /= 2) {
             		h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
             		if (h)
             			break;
             	}
             	if (!h)
             		return 0;
             	/* from here on buckets is the index mask (table size - 1) */
             	buckets = buckets * scale - 1;
             	/* clear the hash table */
             	for (i = 0; i <= buckets; i++) {
             		h[i].pos = -1;
             		h[i].len = 0;
             	}
             	/* add lines to the hash table chains */
             	for (i = 0; i < bn; i++) {
             		/* find the equivalence class */
             		for (j = b[i].hash & buckets; h[j].pos != -1;
             		     j = (j + 1) & buckets)
             			if (!cmp(b + i, b + h[j].pos))
             				break;
             		/* add to the head of the equivalence class */
             		b[i].n = h[j].pos;
             		b[i].e = j;
             		h[j].pos = i;
             		h[j].len++; /* keep track of popularity */
             	}
             	/* compute popularity threshold */
             	t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);
             	/* match items in a to their equivalence class in b */
             	for (i = 0; i < an; i++) {
             		/* find the equivalence class */
             		for (j = a[i].hash & buckets; h[j].pos != -1;
             		     j = (j + 1) & buckets)
             			if (!cmp(a + i, b + h[j].pos))
             				break;
             		/* if no line of b matched, j is an empty slot: pos is -1 and
             		   len is 0, so a[i].n becomes -1 below */
             		a[i].e = j; /* use equivalence class for quick compare */
             		if (h[j].len <= t)
             			a[i].n = h[j].pos; /* point to head of match list */
             		else
             			a[i].n = -1; /* too popular */
             	}
             	/* discard hash tables */
             	free(h);
             	return 1;
             }
             /*
              * Find the longest run of matching lines between a[a1..a2) and
              * b[b1..b2), using the chains and equivalence classes prepared by
              * equatelines().
              *
              * pos is scratch space indexed by line number in b; it records, for each
              * line of b visited, the line of a it was matched with and the length of
              * the match ending there.
              *
              * On return *omi and *omj hold the first line of the match in a and b.
              * The return value is the match length in lines, 0 if nothing matches.
              */
             static int longest_match(struct line *a, struct line *b, struct pos *pos,
             			 int a1, int a2, int b1, int b2, int *omi, int *omj)
             {
             	int mi = a1, mj = b1, mk = 0, i, j, k, half;
             	/* window our search on large regions to better bound
             	   worst-case performance. by choosing a window at the end, we
             	   reduce skipping overhead on the b chains. */
             	if (a2 - a1 > 30000)
             		a1 = a2 - 30000;
             	half = (a1 + a2) / 2;
             	for (i = a1; i < a2; i++) {
             		/* skip all lines in b after the current block */
             		for (j = a[i].n; j >= b2; j = b[j].n)
             			;
             		/* loop through all lines match a[i] in b */
             		for (; j >= b1; j = b[j].n) {
             			/* does this extend an earlier match? */
             			for (k = 1; j - k >= b1 && i - k >= a1; k++) {
             				/* reached an earlier match? */
             				if (pos[j - k].pos == i - k) {
             					k += pos[j - k].len;
             					break;
             				}
             				/* previous line mismatch? */
             				if (a[i - k].e != b[j - k].e)
             					break;
             			}
             			/* remember the match ending at (i, j) for later lines */
             			pos[j].pos = i;
             			pos[j].len = k;
             			/* best match so far? we prefer matches closer
             			   to the middle to balance recursion */
             			if (k > mk || (k == mk && (i <= mi || i < half))) {
             				mi = i;
             				mj = j;
             				mk = k;
             			}
             		}
             	}
             	/* mi/mj currently name the last line of the best match;
             	   convert them to its first line */
             	if (mk) {
             		mi = mi - mk + 1;
             		mj = mj - mk + 1;
             	}
             	/* expand match to include subsequent popular lines */
             	while (mi + mk < a2 && mj + mk < b2 &&
             	       a[mi + mk].e == b[mj + mk].e)
             		mk++;
             	*omi = mi;
             	*omj = mj;
             	return mk;
             }
             /*
              * Build the list of matching hunks for a[a1..a2) against b[b1..b2).
              *
              * The longest match in the region is found first; the region before it
              * is handled by a recursive call, the match itself is appended after l
              * as a new hunk, and the region after it is handled by looping.
              *
              * l is the current tail of the hunk list. Returns the new tail, or NULL
              * if a hunk could not be allocated (hunks linked so far stay attached
              * to the list the caller passed in).
              */
             static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,
             			    int a1, int a2, int b1, int b2, struct hunk *l)
             {
             	int i, j, k;
             	while (1) {
             		/* find the longest match in this chunk */
             		k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);
             		if (!k)
             			return l;
             		/* and recurse on the remaining chunks on either side */
             		l = recurse(a, b, pos, a1, i, b1, j, l);
             		if (!l)
             			return NULL;
             		/* append the match itself as a new hunk */
             		l->next = (struct hunk *)malloc(sizeof(struct hunk));
             		if (!l->next)
             			return NULL;
             		l = l->next;
             		l->a1 = i;
             		l->a2 = i + k;
             		l->b1 = j;
             		l->b2 = j + k;
             		l->next = NULL;
             		/* tail-recursion didn't happen, so do equivalent iteration */
             		a1 = i + k;
             		b1 = j + k;
             	}
             }
             /*
              * Compute the matching blocks between a (an lines) and b (bn lines).
              *
              * base is a dummy list head owned by the caller; base->next is expected
              * to be NULL on entry (both callers in this file set it). The hunks
              * allocated here are linked after base and end with a zero-length
              * sentinel hunk at (an, bn). The caller releases them with
              * freehunks(base->next), also after a failure.
              *
              * Returns the number of hunks in the list, or -1 if a hunk could not be
              * allocated.
              *
              * NOTE(review): if equatelines() fails or the pos array cannot be
              * allocated, no hunks are generated and 0 is returned rather than -1,
              * so the caller cannot tell this apart from an empty result. Left as in
              * the original; confirm whether that fallback is intended.
              */
             static int diff(struct line *a, int an, struct line *b, int bn,
             		 struct hunk *base)
             {
             	struct hunk *curr;
             	struct pos *pos;
             	int t, count = 0;
             	/* allocate and fill arrays */
             	t = equatelines(a, an, b, bn);
             	pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));
             	if (pos && t) {
             		/* generate the matching block list */
             		curr = recurse(a, b, pos, 0, an, 0, bn, base);
             		/* pos is only scratch space for recurse(); release it right
             		   away so the error returns below cannot leak it */
             		free(pos);
             		pos = NULL;
             		if (!curr)
             			return -1;
             		/* sentinel end hunk */
             		curr->next = (struct hunk *)malloc(sizeof(struct hunk));
             		if (!curr->next)
             			return -1;
             		curr = curr->next;
             		curr->a1 = curr->a2 = an;
             		curr->b1 = curr->b2 = bn;
             		curr->next = NULL;
             	}
             	/* no-op when pos was already released above */
             	free(pos);
             	/* normalize the hunk list, try to push each hunk towards the end */
             	for (curr = base->next; curr; curr = curr->next) {
             		struct hunk *next = curr->next;
             		if (!next)
             			break;
             		/* adjacent on at least one side: slide the boundary forward
             		   while the line after curr still matches and next is
             		   non-empty on both sides */
             		if (curr->a2 == next->a1 || curr->b2 == next->b1)
             			while (curr->a2 < an && curr->b2 < bn
             			       && next->a1 < next->a2
             			       && next->b1 < next->b2
             			       && !cmp(a + curr->a2, b + curr->b2)) {
             				curr->a2++;
             				next->a1++;
             				curr->b2++;
             				next->b1++;
             			}
             	}
             	for (curr = base->next; curr; curr = curr->next)
             		count++;
             	return count;
             }
             /* Release every hunk in the list starting at l; a NULL list is fine. */
             static void freehunks(struct hunk *l)
             {
             	while (l) {
             		/* fetch the successor before the node is released */
             		struct hunk *following = l->next;
             		free(l);
             		l = following;
             	}
             }
             /*
              * Python entry point blocks(a, b): takes two bytes objects and returns a
              * list of (a1, a2, b1, b2) tuples, one per hunk produced by diff()
              * (including the final zero-length sentinel hunk).
              *
              * Returns NULL with an exception set if argument parsing fails, and
              * raises MemoryError if any allocation fails.
              *
              * NOTE(review): the PyArg_ParseTuple format names the function "bdiff",
              * so argument errors are reported under that name; left unchanged here.
              */
             static PyObject *blocks(PyObject *self, PyObject *args)
             {
             	PyObject *sa, *sb, *rl = NULL, *m;
             	struct line *a, *b;
             	struct hunk l, *h;
             	int an, bn, count, pos = 0;
             	l.next = NULL;
             	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
             		return NULL;
             	an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
             	bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
             	/* splitlines() leaves NULL in the array pointer on failure */
             	if (!a || !b)
             		goto nomem;
             	count = diff(a, an, b, bn, &l);
             	if (count < 0)
             		goto nomem;
             	rl = PyList_New(count);
             	if (!rl)
             		goto nomem;
             	for (h = l.next; h; h = h->next) {
             		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
             		if (!m) {
             			/* never store NULL in the list; drop the partial
             			   result and report the failure instead */
             			Py_DECREF(rl);
             			rl = NULL;
             			goto nomem;
             		}
             		/* PyList_SetItem takes over the reference to m */
             		PyList_SetItem(rl, pos, m);
             		pos++;
             	}
             nomem:
             	free(a);
             	free(b);
             	freehunks(l.next);
             	return rl ? rl : PyErr_NoMemory();
             }
             /*
              * Python entry point bdiff(a, b): returns a bytes object holding a
              * binary patch that turns a into b.
              *
              * For every gap between consecutive matching hunks the patch contains a
              * 12-byte header written with putbe32() -- start offset in a, end
              * offset in a, length of the replacement data -- followed by the
              * replacement bytes taken from b.
              *
              * Raises ValueError if either input is longer than UINT_MAX bytes and
              * MemoryError if an allocation fails.
              */
             static PyObject *bdiff(PyObject *self, PyObject *args)
             {
             	char *sa, *sb, *rb;
             	PyObject *result = NULL;
             	struct line *al, *bl;
             	struct hunk l, *h;
             	int an, bn, count;
             	Py_ssize_t len = 0, la, lb;
             	PyThreadState *_save;
             	l.next = NULL;
             	if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
             		return NULL;
             	/* offsets are emitted as 32-bit values below */
             	if (la > UINT_MAX || lb > UINT_MAX) {
             		PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
             		return NULL;
             	}
             	/* release the GIL while the diff is computed in plain C */
             	_save = PyEval_SaveThread();
             	an = splitlines(sa, la, &al);
             	bn = splitlines(sb, lb, &bl);
             	if (!al || !bl)
             		goto nomem;
             	count = diff(al, an, bl, bn, &l);
             	if (count < 0)
             		goto nomem;
             	/* calculate length of output */
             	/* la/lb are reused from here on as line indices: the end of the
             	   previous hunk in a and in b */
             	la = lb = 0;
             	for (h = l.next; h; h = h->next) {
             		/* a gap before this hunk costs a 12-byte header plus the
             		   bytes of b that lie in the gap */
             		if (h->a1 != la || h->b1 != lb)
             			len += 12 + bl[h->b1].l - bl[lb].l;
             		la = h->a2;
             		lb = h->b2;
             	}
             	/* take the GIL back before touching Python objects; clearing _save
             	   tells the cleanup code below that this was already done */
             	PyEval_RestoreThread(_save);
             	_save = NULL;
             	result = PyBytes_FromStringAndSize(NULL, len);
             	if (!result)
             		goto nomem;
             	/* build binary patch */
             	rb = PyBytes_AsString(result);
             	la = lb = 0;
             	for (h = l.next; h; h = h->next) {
             		if (h->a1 != la || h->b1 != lb) {
             			len = bl[h->b1].l - bl[lb].l;
             			/* header: start in a, end in a, replacement length */
             			putbe32((uint32_t)(al[la].l - al->l), rb);
             			putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);
             			putbe32((uint32_t)len, rb + 8);
             			memcpy(rb + 12, bl[lb].l, len);
             			rb += 12 + len;
             		}
             		la = h->a2;
             		lb = h->b2;
             	}
             nomem:
             	/* reached on success too; _save is non-NULL only if we jumped here
             	   while the GIL was still released */
             	if (_save)
             		PyEval_RestoreThread(_save);
             	free(al);
             	free(bl);
             	freehunks(l.next);
             	return result ? result : PyErr_NoMemory();
             }
             /*
              * Python entry point fixws(s, allws): return a copy of the bytes object s
              * with its whitespace normalized.
              *
              * When allws is non-zero every ' ', \t and \r is dropped. When it is
              * zero, each run of those characters collapses to one space, and a
              * space directly in front of a newline is replaced by that newline.
              */
             static PyObject *fixws(PyObject *self, PyObject *args)
             {
             	PyObject *s, *result = NULL;
             	char allws;
             	const char *src, *end;
             	char *out;
             	Py_ssize_t srclen, outlen = 0;
             	if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))
             		return NULL;
             	src = PyBytes_AsString(s);
             	srclen = PyBytes_Size(s);
             	/* the output never grows past the input; ask for at least 1 byte */
             	out = (char *)malloc(srclen ? srclen : 1);
             	if (!out)
             		return PyErr_NoMemory();
             	for (end = src + srclen; src != end; src++) {
             		switch (*src) {
             		case ' ':
             		case '\t':
             		case '\r':
             			/* emit one space per run, nothing when stripping all */
             			if (!allws && (outlen == 0 || out[outlen - 1] != ' '))
             				out[outlen++] = ' ';
             			break;
             		case '\n':
             			/* a pending space before the newline is overwritten */
             			if (!allws && outlen > 0 && out[outlen - 1] == ' ') {
             				out[outlen - 1] = '\n';
             				break;
             			}
             			/* fall through */
             		default:
             			out[outlen++] = *src;
             			break;
             		}
             	}
             	result = PyBytes_FromStringAndSize(out, outlen);
             	free(out);
             	return result ? result : PyErr_NoMemory();
             }
             /* module docstring, used by both the Python 2 and Python 3 init paths */
             static char mdiff_doc[] = "Efficient binary diff.";
             /* functions exported to Python; the all-NULL entry ends the table */
             static PyMethodDef methods[] = {
             	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
             	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
             	{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},
             	{NULL, NULL}
             };
             #ifdef IS_PY3K
             /* Python 3 module definition; the positional fields after the head
                initializer are name, docstring, per-module state size and methods */
             static struct PyModuleDef bdiff_module = {
             	PyModuleDef_HEAD_INIT,
             	"bdiff",
             	mdiff_doc,
             	-1,
             	methods
             };
             /* Python 3 entry point: creates and returns the module object */
             PyMODINIT_FUNC PyInit_bdiff(void)
             {
             	return PyModule_Create(&bdiff_module);
             }
             #else
             /* Python 2 entry point: registers the module under the name "bdiff" */
             PyMODINIT_FUNC initbdiff(void)
             {
             	Py_InitModule3("bdiff", methods, mdiff_doc);
             }
             #endif