upstream/mercurial-mirror Commit - r29540:4ce1fc91

1

/*

1

/*

2

bdiff.c - efficient binary diff extension for Mercurial

2

bdiff.c - efficient binary diff extension for Mercurial

3

4

5

6

This software may be used and distributed according to the terms of

6

This software may be used and distributed according to the terms of

7

the GNU General Public License, incorporated herein by reference.

7

the GNU General Public License, incorporated herein by reference.

8

9

Based roughly on Python difflib

9

Based roughly on Python difflib

10

*/

10

*/

11

12

#define PY_SSIZE_T_CLEAN

12

#define PY_SSIZE_T_CLEAN

13

#include <Python.h>

13

#include <Python.h>

14

#include <stdlib.h>

14

#include <stdlib.h>

15

#include <string.h>

15

#include <string.h>

16

#include <limits.h>

16

#include <limits.h>

17

18

#include "compat.h"

18

#include "compat.h"

19

#include "util.h"

19

#include "util.h"

20

#include "bitmanipulation.h"

20

#include "bitmanipulation.h"

21

22

struct line {

22

struct bdiff_line {

23

int hash, n, e;

23

int hash, n, e;

24

ssize_t len;

24

ssize_t len;

25

const char *l;

25

const char *l;

26

};

26

};

27

28

struct pos {

28

struct pos {

29

int pos, len;

29

int pos, len;

30

};

30

};

31

32

struct hunk;

32

struct bdiff_hunk;

33

struct hunk {

33

struct bdiff_hunk {

34

int a1, a2, b1, b2;

34

int a1, a2, b1, b2;

35

struct hunk *next;

35

struct bdiff_hunk *next;

36

};

36

};

37

38

static int splitlines(const char *a, ssize_t len, struct line **lr)

38

static int bdiff_splitlines(const char *a, ssize_t len, struct bdiff_line **lr)

39

{

39

{

40

unsigned hash;

40

unsigned hash;

41

int i;

41

int i;

42

const char *p, *b = a;

42

const char *p, *b = a;

43

const char * const plast = a + len - 1;

43

const char * const plast = a + len - 1;

44

struct line *l;

44

struct bdiff_line *l;

45

46

/* count the lines */

46

/* count the lines */

47

i = 1; /* extra line for sentinel */

47

i = 1; /* extra line for sentinel */

48

for (p = a; p < a + len; p++)

48

for (p = a; p < a + len; p++)

49

if (*p == '\n' || p == plast)

49

if (*p == '\n' || p == plast)

50

i++;

50

i++;

51

52

*lr = l = (struct line *)malloc(sizeof(struct line) * i);

52

*lr = l = (struct bdiff_line *)malloc(sizeof(struct bdiff_line) * i);

53

if (!l)

53

if (!l)

54

return -1;

54

return -1;

55

56

/* build the line array and calculate hashes */

56

/* build the line array and calculate hashes */

57

hash = 0;

57

hash = 0;

58

for (p = a; p < a + len; p++) {

58

for (p = a; p < a + len; p++) {

59

/* Leonid Yuriev's hash */

59

/* Leonid Yuriev's hash */

60

hash = (hash * 1664525) + (unsigned char)*p + 1013904223;

60

hash = (hash * 1664525) + (unsigned char)*p + 1013904223;

61

62

if (*p == '\n' || p == plast) {

62

if (*p == '\n' || p == plast) {

63

l->hash = hash;

63

l->hash = hash;

64

hash = 0;

64

hash = 0;

65

l->len = p - b + 1;

65

l->len = p - b + 1;

66

l->l = b;

66

l->l = b;

67

l->n = INT_MAX;

67

l->n = INT_MAX;

68

l++;

68

l++;

69

b = p + 1;

69

b = p + 1;

70

}

70

}

71

}

71

}

72

73

/* set up a sentinel */

73

/* set up a sentinel */

74

l->hash = 0;

74

l->hash = 0;

75

l->len = 0;

75

l->len = 0;

76

l->l = a + len;

76

l->l = a + len;

77

return i - 1;

77

return i - 1;

78

}

78

}

79

80

static inline int cmp(struct line *a, struct line *b)

80

static inline int cmp(struct bdiff_line *a, struct bdiff_line *b)

81

{

81

{

82

return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);

82

return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);

83

}

83

}

84

85

static int equatelines(struct line *a, int an, struct line *b, int bn)

85

static int equatelines(struct bdiff_line *a, int an, struct bdiff_line *b,

86

int bn)

86

{

87

{

87

int i, j, buckets = 1, t, scale;

88

int i, j, buckets = 1, t, scale;

88

struct pos *h = NULL;

89

struct pos *h = NULL;

89

90

/* build a hash table of the next highest power of 2 */

91

/* build a hash table of the next highest power of 2 */

91

while (buckets < bn + 1)

92

while (buckets < bn + 1)

92

buckets *= 2;

93

buckets *= 2;

93

94

/* try to allocate a large hash table to avoid collisions */

95

/* try to allocate a large hash table to avoid collisions */

95

for (scale = 4; scale; scale /= 2) {

96

for (scale = 4; scale; scale /= 2) {

96

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

97

h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));

97

if (h)

98

if (h)

98

break;

99

break;

99

}

100

}

100

101

if (!h)

102

if (!h)

102

return 0;

103

return 0;

103

104

buckets = buckets * scale - 1;

105

buckets = buckets * scale - 1;

105

106

/* clear the hash table */

107

/* clear the hash table */

107

for (i = 0; i <= buckets; i++) {

108

for (i = 0; i <= buckets; i++) {

108

h[i].pos = -1;

109

h[i].pos = -1;

109

h[i].len = 0;

110

h[i].len = 0;

110

}

111

}

111

112

/* add lines to the hash table chains */

113

/* add lines to the hash table chains */

113

for (i = 0; i < bn; i++) {

114

for (i = 0; i < bn; i++) {

114

/* find the equivalence class */

115

/* find the equivalence class */

115

for (j = b[i].hash & buckets; h[j].pos != -1;

116

for (j = b[i].hash & buckets; h[j].pos != -1;

116

j = (j + 1) & buckets)

117

j = (j + 1) & buckets)

117

if (!cmp(b + i, b + h[j].pos))

118

if (!cmp(b + i, b + h[j].pos))

118

break;

119

break;

119

120

/* add to the head of the equivalence class */

121

/* add to the head of the equivalence class */

121

b[i].n = h[j].pos;

122

b[i].n = h[j].pos;

122

b[i].e = j;

123

b[i].e = j;

123

h[j].pos = i;

124

h[j].pos = i;

124

h[j].len++; /* keep track of popularity */

125

h[j].len++; /* keep track of popularity */

125

}

126

}

126

127

/* compute popularity threshold */

128

/* compute popularity threshold */

128

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

129

t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

129

130

/* match items in a to their equivalence class in b */

131

/* match items in a to their equivalence class in b */

131

for (i = 0; i < an; i++) {

132

for (i = 0; i < an; i++) {

132

/* find the equivalence class */

133

/* find the equivalence class */

133

for (j = a[i].hash & buckets; h[j].pos != -1;

134

for (j = a[i].hash & buckets; h[j].pos != -1;

134

j = (j + 1) & buckets)

135

j = (j + 1) & buckets)

135

if (!cmp(a + i, b + h[j].pos))

136

if (!cmp(a + i, b + h[j].pos))

136

break;

137

break;

137

138

a[i].e = j; /* use equivalence class for quick compare */

139

a[i].e = j; /* use equivalence class for quick compare */

139

if (h[j].len <= t)

140

if (h[j].len <= t)

140

a[i].n = h[j].pos; /* point to head of match list */

141

a[i].n = h[j].pos; /* point to head of match list */

141

else

142

else

142

a[i].n = -1; /* too popular */

143

a[i].n = -1; /* too popular */

143

}

144

}

144

145

/* discard hash tables */

146

/* discard hash tables */

146

free(h);

147

free(h);

147

return 1;

148

return 1;

148

}

149

}

149

150

static int longest_match(struct line *a, struct line *b, struct pos *pos,

151

static int longest_match(struct bdiff_line *a, struct bdiff_line *b,

152

struct pos *pos,

151

int a1, int a2, int b1, int b2, int *omi, int *omj)

153

int a1, int a2, int b1, int b2, int *omi, int *omj)

152

{

154

{

153

int mi = a1, mj = b1, mk = 0, i, j, k, half;

155

int mi = a1, mj = b1, mk = 0, i, j, k, half;

154

156

155

/* window our search on large regions to better bound

157

/* window our search on large regions to better bound

156

worst-case performance. by choosing a window at the end, we

158

worst-case performance. by choosing a window at the end, we

157

reduce skipping overhead on the b chains. */

159

reduce skipping overhead on the b chains. */

158

if (a2 - a1 > 30000)

160

if (a2 - a1 > 30000)

159

a1 = a2 - 30000;

161

a1 = a2 - 30000;

160

162

161

half = (a1 + a2) / 2;

163

half = (a1 + a2) / 2;

162

164

163

for (i = a1; i < a2; i++) {

165

for (i = a1; i < a2; i++) {

164

/* skip all lines in b after the current block */

166

/* skip all lines in b after the current block */

165

for (j = a[i].n; j >= b2; j = b[j].n)

167

for (j = a[i].n; j >= b2; j = b[j].n)

166

;

168

;

167

169

168

/* loop through all lines match a[i] in b */

170

/* loop through all lines match a[i] in b */

169

for (; j >= b1; j = b[j].n) {

171

for (; j >= b1; j = b[j].n) {

170

/* does this extend an earlier match? */

172

/* does this extend an earlier match? */

171

for (k = 1; j - k >= b1 && i - k >= a1; k++) {

173

for (k = 1; j - k >= b1 && i - k >= a1; k++) {

172

/* reached an earlier match? */

174

/* reached an earlier match? */

173

if (pos[j - k].pos == i - k) {

175

if (pos[j - k].pos == i - k) {

174

k += pos[j - k].len;

176

k += pos[j - k].len;

175

break;

177

break;

176

}

178

}

177

/* previous line mismatch? */

179

/* previous line mismatch? */

178

if (a[i - k].e != b[j - k].e)

180

if (a[i - k].e != b[j - k].e)

179

break;

181

break;

180

}

182

}

181

183

182

pos[j].pos = i;

184

pos[j].pos = i;

183

pos[j].len = k;

185

pos[j].len = k;

184

186

185

/* best match so far? we prefer matches closer

187

/* best match so far? we prefer matches closer

186

to the middle to balance recursion */

188

to the middle to balance recursion */

187

if (k > mk || (k == mk && (i <= mi || i < half))) {

189

if (k > mk || (k == mk && (i <= mi || i < half))) {

188

mi = i;

190

mi = i;

189

mj = j;

191

mj = j;

190

mk = k;

192

mk = k;

191

}

193

}

192

}

194

}

193

}

195

}

194

196

195

if (mk) {

197

if (mk) {

196

mi = mi - mk + 1;

198

mi = mi - mk + 1;

197

mj = mj - mk + 1;

199

mj = mj - mk + 1;

198

}

200

}

199

201

200

/* expand match to include subsequent popular lines */

202

/* expand match to include subsequent popular lines */

201

while (mi + mk < a2 && mj + mk < b2 &&

203

while (mi + mk < a2 && mj + mk < b2 &&

202

a[mi + mk].e == b[mj + mk].e)

204

a[mi + mk].e == b[mj + mk].e)

203

mk++;

205

mk++;

204

206

205

*omi = mi;

207

*omi = mi;

206

*omj = mj;

208

*omj = mj;

207

209

208

return mk;

210

return mk;

209

}

211

}

210

212

211

static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,

213

static struct bdiff_hunk *recurse(struct bdiff_line *a, struct bdiff_line *b,

212

int a1, int a2, int b1, int b2, struct hunk *l)

214

struct pos *pos,

215

int a1, int a2, int b1, int b2, struct bdiff_hunk *l)

213

{

216

{

214

int i, j, k;

217

int i, j, k;

215

218

216

while (1) {

219

while (1) {

217

/* find the longest match in this chunk */

220

/* find the longest match in this chunk */

218

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

221

k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);

219

if (!k)

222

if (!k)

220

return l;

223

return l;

221

224

222

/* and recurse on the remaining chunks on either side */

225

/* and recurse on the remaining chunks on either side */

223

l = recurse(a, b, pos, a1, i, b1, j, l);

226

l = recurse(a, b, pos, a1, i, b1, j, l);

224

if (!l)

227

if (!l)

225

return NULL;

228

return NULL;

226

229

227

l->next = (struct hunk *)malloc(sizeof(struct hunk));

230

l->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk));

228

if (!l->next)

231

if (!l->next)

229

return NULL;

232

return NULL;

230

233

231

l = l->next;

234

l = l->next;

232

l->a1 = i;

235

l->a1 = i;

233

l->a2 = i + k;

236

l->a2 = i + k;

234

l->b1 = j;

237

l->b1 = j;

235

l->b2 = j + k;

238

l->b2 = j + k;

236

l->next = NULL;

239

l->next = NULL;

237

240

238

/* tail-recursion didn't happen, so do equivalent iteration */

241

/* tail-recursion didn't happen, so do equivalent iteration */

239

a1 = i + k;

242

a1 = i + k;

240

b1 = j + k;

243

b1 = j + k;

241

}

244

}

242

}

245

}

243

246

244

static int diff(struct line *a, int an, struct line *b, int bn,

247

static int bdiff_diff(struct bdiff_line *a, int an, struct bdiff_line *b,

245

struct hunk *base)

248

int bn, struct bdiff_hunk *base)

246

{

249

{

247

struct hunk *curr;

250

struct bdiff_hunk *curr;

248

struct pos *pos;

251

struct pos *pos;

249

int t, count = 0;

252

int t, count = 0;

250

253

251

/* allocate and fill arrays */

254

/* allocate and fill arrays */

252

t = equatelines(a, an, b, bn);

255

t = equatelines(a, an, b, bn);

253

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

256

pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

254

257

255

if (pos && t) {

258

if (pos && t) {

256

/* generate the matching block list */

259

/* generate the matching block list */

257

260

258

curr = recurse(a, b, pos, 0, an, 0, bn, base);

261

curr = recurse(a, b, pos, 0, an, 0, bn, base);

259

if (!curr)

262

if (!curr)

260

return -1;

263

return -1;

261

264

262

/* sentinel end hunk */

265

/* sentinel end hunk */

263

curr->next = (struct hunk *)malloc(sizeof(struct hunk));

266

curr->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk));

264

if (!curr->next)

267

if (!curr->next)

265

return -1;

268

return -1;

266

curr = curr->next;

269

curr = curr->next;

267

curr->a1 = curr->a2 = an;

270

curr->a1 = curr->a2 = an;

268

curr->b1 = curr->b2 = bn;

271

curr->b1 = curr->b2 = bn;

269

curr->next = NULL;

272

curr->next = NULL;

270

}

273

}

271

274

272

free(pos);

275

free(pos);

273

276

274

/* normalize the hunk list, try to push each hunk towards the end */

277

/* normalize the hunk list, try to push each hunk towards the end */

275

for (curr = base->next; curr; curr = curr->next) {

278

for (curr = base->next; curr; curr = curr->next) {

276

struct hunk *next = curr->next;

279

struct bdiff_hunk *next = curr->next;

277

280

278

if (!next)

281

if (!next)

279

break;

282

break;

280

283

281

if (curr->a2 == next->a1 || curr->b2 == next->b1)

284

if (curr->a2 == next->a1 || curr->b2 == next->b1)

282

while (curr->a2 < an && curr->b2 < bn

285

while (curr->a2 < an && curr->b2 < bn

283

&& next->a1 < next->a2

286

&& next->a1 < next->a2

284

&& next->b1 < next->b2

287

&& next->b1 < next->b2

285

&& !cmp(a + curr->a2, b + curr->b2)) {

288

&& !cmp(a + curr->a2, b + curr->b2)) {

286

curr->a2++;

289

curr->a2++;

287

next->a1++;

290

next->a1++;

288

curr->b2++;

291

curr->b2++;

289

next->b1++;

292

next->b1++;

290

}

293

}

291

}

294

}

292

295

293

for (curr = base->next; curr; curr = curr->next)

296

for (curr = base->next; curr; curr = curr->next)

294

count++;

297

count++;

295

return count;

298

return count;

296

}

299

}

297

300

298

static void freehunks(struct hunk *l)

301

static void bdiff_freehunks(struct bdiff_hunk *l)

299

{

302

{

300

struct hunk *n;

303

struct bdiff_hunk *n;

301

for (; l; l = n) {

304

for (; l; l = n) {

302

n = l->next;

305

n = l->next;

303

free(l);

306

free(l);

304

}

307

}

305

}

308

}

306

309

307

static PyObject *blocks(PyObject *self, PyObject *args)

310

static PyObject *blocks(PyObject *self, PyObject *args)

308

{

311

{

309

PyObject *sa, *sb, *rl = NULL, *m;

312

PyObject *sa, *sb, *rl = NULL, *m;

310

struct line *a, *b;

313

struct bdiff_line *a, *b;

311

struct hunk l, *h;

314

struct bdiff_hunk l, *h;

312

int an, bn, count, pos = 0;

315

int an, bn, count, pos = 0;

313

316

314

l.next = NULL;

317

l.next = NULL;

315

318

316

if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))

319

if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))

317

return NULL;

320

return NULL;

318

321

319

an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);

322

an = bdiff_splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);

320

bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);

323

bn = bdiff_splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);

321

324

322

if (!a || !b)

325

if (!a || !b)

323

goto nomem;

326

goto nomem;

324

327

325

count = diff(a, an, b, bn, &l);

328

count = bdiff_diff(a, an, b, bn, &l);

326

if (count < 0)

329

if (count < 0)

327

goto nomem;

330

goto nomem;

328

331

329

rl = PyList_New(count);

332

rl = PyList_New(count);

330

if (!rl)

333

if (!rl)

331

goto nomem;

334

goto nomem;

332

335

333

for (h = l.next; h; h = h->next) {

336

for (h = l.next; h; h = h->next) {

334

m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);

337

m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);

335

PyList_SetItem(rl, pos, m);

338

PyList_SetItem(rl, pos, m);

336

pos++;

339

pos++;

337

}

340

}

338

341

339

nomem:

342

nomem:

340

free(a);

343

free(a);

341

free(b);

344

free(b);

342

freehunks(l.next);

345

bdiff_freehunks(l.next);

343

return rl ? rl : PyErr_NoMemory();

346

return rl ? rl : PyErr_NoMemory();

344

}

347

}

345

348

346

static PyObject *bdiff(PyObject *self, PyObject *args)

349

static PyObject *bdiff(PyObject *self, PyObject *args)

347

{

350

{

348

char *sa, *sb, *rb;

351

char *sa, *sb, *rb;

349

PyObject *result = NULL;

352

PyObject *result = NULL;

350

struct line *al, *bl;

353

struct bdiff_line *al, *bl;

351

struct hunk l, *h;

354

struct bdiff_hunk l, *h;

352

int an, bn, count;

355

int an, bn, count;

353

Py_ssize_t len = 0, la, lb;

356

Py_ssize_t len = 0, la, lb;

354

PyThreadState *_save;

357

PyThreadState *_save;

355

358

356

l.next = NULL;

359

l.next = NULL;

357

360

358

if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))

361

if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))

359

return NULL;

362

return NULL;

360

363

361

if (la > UINT_MAX || lb > UINT_MAX) {

364

if (la > UINT_MAX || lb > UINT_MAX) {

362

PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");

365

PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");

363

return NULL;

366

return NULL;

364

}

367

}

365

368

366

_save = PyEval_SaveThread();

369

_save = PyEval_SaveThread();

367

an = splitlines(sa, la, &al);

370

an = bdiff_splitlines(sa, la, &al);

368

bn = splitlines(sb, lb, &bl);

371

bn = bdiff_splitlines(sb, lb, &bl);

369

if (!al || !bl)

372

if (!al || !bl)

370

goto nomem;

373

goto nomem;

371

374

372

count = diff(al, an, bl, bn, &l);

375

count = bdiff_diff(al, an, bl, bn, &l);

373

if (count < 0)

376

if (count < 0)

374

goto nomem;

377

goto nomem;

375

378

376

/* calculate length of output */

379

/* calculate length of output */

377

la = lb = 0;

380

la = lb = 0;

378

for (h = l.next; h; h = h->next) {

381

for (h = l.next; h; h = h->next) {

379

if (h->a1 != la || h->b1 != lb)

382

if (h->a1 != la || h->b1 != lb)

380

len += 12 + bl[h->b1].l - bl[lb].l;

383

len += 12 + bl[h->b1].l - bl[lb].l;

381

la = h->a2;

384

la = h->a2;

382

lb = h->b2;

385

lb = h->b2;

383

}

386

}

384

PyEval_RestoreThread(_save);

387

PyEval_RestoreThread(_save);

385

_save = NULL;

388

_save = NULL;

386

389

387

result = PyBytes_FromStringAndSize(NULL, len);

390

result = PyBytes_FromStringAndSize(NULL, len);

388

391

389

if (!result)

392

if (!result)

390

goto nomem;

393

goto nomem;

391

394

392

/* build binary patch */

395

/* build binary patch */

393

rb = PyBytes_AsString(result);

396

rb = PyBytes_AsString(result);

394

la = lb = 0;

397

la = lb = 0;

395

398

396

for (h = l.next; h; h = h->next) {

399

for (h = l.next; h; h = h->next) {

397

if (h->a1 != la || h->b1 != lb) {

400

if (h->a1 != la || h->b1 != lb) {

398

len = bl[h->b1].l - bl[lb].l;

401

len = bl[h->b1].l - bl[lb].l;

399

putbe32((uint32_t)(al[la].l - al->l), rb);

402

putbe32((uint32_t)(al[la].l - al->l), rb);

400

putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);

403

putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);

401

putbe32((uint32_t)len, rb + 8);

404

putbe32((uint32_t)len, rb + 8);

402

memcpy(rb + 12, bl[lb].l, len);

405

memcpy(rb + 12, bl[lb].l, len);

403

rb += 12 + len;

406

rb += 12 + len;

404

}

407

}

405

la = h->a2;

408

la = h->a2;

406

lb = h->b2;

409

lb = h->b2;

407

}

410

}

408

411

409

nomem:

412

nomem:

410

if (_save)

413

if (_save)

411

PyEval_RestoreThread(_save);

414

PyEval_RestoreThread(_save);

412

free(al);

415

free(al);

413

free(bl);

416

free(bl);

414

freehunks(l.next);

417

bdiff_freehunks(l.next);

415

return result ? result : PyErr_NoMemory();

418

return result ? result : PyErr_NoMemory();

416

}

419

}

417

420

418

/*

421

/*

419

* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,

422

* If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,

420

* reduce whitespace sequences to a single space and trim remaining whitespace

423

* reduce whitespace sequences to a single space and trim remaining whitespace

421

* from end of lines.

424

* from end of lines.

422

*/

425

*/

423

static PyObject *fixws(PyObject *self, PyObject *args)

426

static PyObject *fixws(PyObject *self, PyObject *args)

424

{

427

{

425

PyObject *s, *result = NULL;

428

PyObject *s, *result = NULL;

426

char allws, c;

429

char allws, c;

427

const char *r;

430

const char *r;

428

Py_ssize_t i, rlen, wlen = 0;

431

Py_ssize_t i, rlen, wlen = 0;

429

char *w;

432

char *w;

430

433

431

if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))

434

if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))

432

return NULL;

435

return NULL;

433

r = PyBytes_AsString(s);

436

r = PyBytes_AsString(s);

434

rlen = PyBytes_Size(s);

437

rlen = PyBytes_Size(s);

435

438

436

w = (char *)malloc(rlen ? rlen : 1);

439

w = (char *)malloc(rlen ? rlen : 1);

437

if (!w)

440

if (!w)

438

goto nomem;

441

goto nomem;

439

442

440

for (i = 0; i != rlen; i++) {

443

for (i = 0; i != rlen; i++) {

441

c = r[i];

444

c = r[i];

442

if (c == ' ' || c == '\t' || c == '\r') {

445

if (c == ' ' || c == '\t' || c == '\r') {

443

if (!allws && (wlen == 0 || w[wlen - 1] != ' '))

446

if (!allws && (wlen == 0 || w[wlen - 1] != ' '))

444

w[wlen++] = ' ';

447

w[wlen++] = ' ';

445

} else if (c == '\n' && !allws

448

} else if (c == '\n' && !allws

446

&& wlen > 0 && w[wlen - 1] == ' ') {

449

&& wlen > 0 && w[wlen - 1] == ' ') {

447

w[wlen - 1] = '\n';

450

w[wlen - 1] = '\n';

448

} else {

451

} else {

449

w[wlen++] = c;

452

w[wlen++] = c;

450

}

453

}

451

}

454

}

452

455

453

result = PyBytes_FromStringAndSize(w, wlen);

456

result = PyBytes_FromStringAndSize(w, wlen);

454

457

455

nomem:

458

nomem:

456

free(w);

459

free(w);

457

return result ? result : PyErr_NoMemory();

460

return result ? result : PyErr_NoMemory();

458

}

461

}

459

462

460

463

461

static char mdiff_doc[] = "Efficient binary diff.";

464

static char mdiff_doc[] = "Efficient binary diff.";

462

465

463

static PyMethodDef methods[] = {

466

static PyMethodDef methods[] = {

464

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

467

{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},

465

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

468

{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},

466

{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},

469

{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},

467

{NULL, NULL}

470

{NULL, NULL}

468

};

471

};

469

472

470

#ifdef IS_PY3K

473

#ifdef IS_PY3K

471

static struct PyModuleDef bdiff_module = {

474

static struct PyModuleDef bdiff_module = {

472

PyModuleDef_HEAD_INIT,

475

PyModuleDef_HEAD_INIT,

473

"bdiff",

476

"bdiff",

474

mdiff_doc,

477

mdiff_doc,

475

-1,

478

-1,

476

methods

479

methods

477

};

480

};

478

481

479

PyMODINIT_FUNC PyInit_bdiff(void)

482

PyMODINIT_FUNC PyInit_bdiff(void)

480

{

483

{

481

return PyModule_Create(&bdiff_module);

484

return PyModule_Create(&bdiff_module);

482

}

485

}

483

#else

486

#else

484

PyMODINIT_FUNC initbdiff(void)

487

PyMODINIT_FUNC initbdiff(void)

485

{

488

{

486

Py_InitModule3("bdiff", methods, mdiff_doc);

489

Py_InitModule3("bdiff", methods, mdiff_doc);

487

}

490

}

488

#endif

491

#endif

489

492

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             /*
              bdiff.c - efficient binary diff extension for Mercurial
              Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
              This software may be used and distributed according to the terms of
              the GNU General Public License, incorporated herein by reference.
              Based roughly on Python difflib
             */
             #define PY_SSIZE_T_CLEAN
             #include <Python.h>
             #include <stdlib.h>
             #include <string.h>
             #include <limits.h>
             #include "compat.h"
             #include "util.h"
             #include "bitmanipulation.h"
-            struct line {
+            struct bdiff_line {
             	int hash, n, e;
             	ssize_t len;
             	const char *l;
             };
             /*
              * General-purpose (index, count) pair.
              * equatelines() uses an array of these as hash-table slots;
              * longest_match() uses one entry per line of b to remember matches.
              */
             struct pos {
             	int pos; /* line index; equatelines() stores -1 in an empty slot */
             	int len; /* equatelines(): chain popularity; longest_match(): match length */
             };
-            struct hunk;
+            struct bdiff_hunk;
-            struct hunk {
+            struct bdiff_hunk {
             	int a1, a2, b1, b2;
-            	struct hunk *next;
+            	struct bdiff_hunk *next;
             };
-            static int splitlines(const char *a, ssize_t len, struct line **lr)
+            static int bdiff_splitlines(const char *a, ssize_t len, struct bdiff_line **lr)
             {
             	unsigned hash;
             	int i;
             	const char *p, *b = a;
             	const char * const plast = a + len - 1;
-            	struct line *l;
+            	struct bdiff_line *l;
             	/* count the lines */
             	i = 1; /* extra line for sentinel */
             	for (p = a; p < a + len; p++)
             		if (*p == '\n' || p == plast)
             			i++;
-            	*lr = l = (struct line *)malloc(sizeof(struct line) * i);
+            	*lr = l = (struct bdiff_line *)malloc(sizeof(struct bdiff_line) * i);
             	if (!l)
             		return -1;
             	/* build the line array and calculate hashes */
             	hash = 0;
             	for (p = a; p < a + len; p++) {
             		/* Leonid Yuriev's hash */
             		hash = (hash * 1664525) + (unsigned char)*p + 1013904223;
             		if (*p == '\n' || p == plast) {
             			l->hash = hash;
             			hash = 0;
             			l->len = p - b + 1;
             			l->l = b;
             			l->n = INT_MAX;
             			l++;
             			b = p + 1;
             		}
             	}
             	/* set up a sentinel */
             	l->hash = 0;
             	l->len = 0;
             	l->l = a + len;
             	return i - 1;
             }
-            static inline int cmp(struct line *a, struct line *b)
+            static inline int cmp(struct bdiff_line *a, struct bdiff_line *b)
             {
             	return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len);
             }
-            static int equatelines(struct line *a, int an, struct line *b, int bn)
+            static int equatelines(struct bdiff_line *a, int an, struct bdiff_line *b,
+            	int bn)
             {
             	int i, j, buckets = 1, t, scale;
             	struct pos *h = NULL;
             	/* build a hash table of the next highest power of 2 */
             	while (buckets < bn + 1)
             		buckets *= 2;
             	/* try to allocate a large hash table to avoid collisions */
             	for (scale = 4; scale; scale /= 2) {
             		h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
             		if (h)
             			break;
             	}
             	if (!h)
             		return 0;
             	buckets = buckets * scale - 1;
             	/* clear the hash table */
             	for (i = 0; i <= buckets; i++) {
             		h[i].pos = -1;
             		h[i].len = 0;
             	}
             	/* add lines to the hash table chains */
             	for (i = 0; i < bn; i++) {
             		/* find the equivalence class */
             		for (j = b[i].hash & buckets; h[j].pos != -1;
             		     j = (j + 1) & buckets)
             			if (!cmp(b + i, b + h[j].pos))
             				break;
             		/* add to the head of the equivalence class */
             		b[i].n = h[j].pos;
             		b[i].e = j;
             		h[j].pos = i;
             		h[j].len++; /* keep track of popularity */
             	}
             	/* compute popularity threshold */
             	t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);
             	/* match items in a to their equivalence class in b */
             	for (i = 0; i < an; i++) {
             		/* find the equivalence class */
             		for (j = a[i].hash & buckets; h[j].pos != -1;
             		     j = (j + 1) & buckets)
             			if (!cmp(a + i, b + h[j].pos))
             				break;
             		a[i].e = j; /* use equivalence class for quick compare */
             		if (h[j].len <= t)
             			a[i].n = h[j].pos; /* point to head of match list */
             		else
             			a[i].n = -1; /* too popular */
             	}
             	/* discard hash tables */
             	free(h);
             	return 1;
             }
-            static int longest_match(struct line *a, struct line *b, struct pos *pos,
+            static int longest_match(struct bdiff_line *a, struct bdiff_line *b,
+            			struct pos *pos,
             			 int a1, int a2, int b1, int b2, int *omi, int *omj)
             {
             	int mi = a1, mj = b1, mk = 0, i, j, k, half;
             	/* window our search on large regions to better bound
             	   worst-case performance. by choosing a window at the end, we
             	   reduce skipping overhead on the b chains. */
             	if (a2 - a1 > 30000)
             		a1 = a2 - 30000;
             	half = (a1 + a2) / 2;
             	for (i = a1; i < a2; i++) {
             		/* skip all lines in b after the current block */
             		for (j = a[i].n; j >= b2; j = b[j].n)
             			;
             		/* loop through all lines match a[i] in b */
             		for (; j >= b1; j = b[j].n) {
             			/* does this extend an earlier match? */
             			for (k = 1; j - k >= b1 && i - k >= a1; k++) {
             				/* reached an earlier match? */
             				if (pos[j - k].pos == i - k) {
             					k += pos[j - k].len;
             					break;
             				}
             				/* previous line mismatch? */
             				if (a[i - k].e != b[j - k].e)
             					break;
             			}
             			pos[j].pos = i;
             			pos[j].len = k;
             			/* best match so far? we prefer matches closer
             			   to the middle to balance recursion */
             			if (k > mk || (k == mk && (i <= mi || i < half))) {
             				mi = i;
             				mj = j;
             				mk = k;
             			}
             		}
             	}
             	if (mk) {
             		mi = mi - mk + 1;
             		mj = mj - mk + 1;
             	}
             	/* expand match to include subsequent popular lines */
             	while (mi + mk < a2 && mj + mk < b2 &&
             	       a[mi + mk].e == b[mj + mk].e)
             		mk++;
             	*omi = mi;
             	*omj = mj;
             	return mk;
             }
-            static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos,
+            static struct bdiff_hunk *recurse(struct bdiff_line *a, struct bdiff_line *b,
-            			    int a1, int a2, int b1, int b2, struct hunk *l)
+            				struct pos *pos,
+            			    int a1, int a2, int b1, int b2, struct bdiff_hunk *l)
             {
             	int i, j, k;
             	while (1) {
             		/* find the longest match in this chunk */
             		k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);
             		if (!k)
             			return l;
             		/* and recurse on the remaining chunks on either side */
             		l = recurse(a, b, pos, a1, i, b1, j, l);
             		if (!l)
             			return NULL;
-            		l->next = (struct hunk *)malloc(sizeof(struct hunk));
+            		l->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk));
             		if (!l->next)
             			return NULL;
             		l = l->next;
             		l->a1 = i;
             		l->a2 = i + k;
             		l->b1 = j;
             		l->b2 = j + k;
             		l->next = NULL;
             		/* tail-recursion didn't happen, so do equivalent iteration */
             		a1 = i + k;
             		b1 = j + k;
             	}
             }
-            static int diff(struct line *a, int an, struct line *b, int bn,
+            static int bdiff_diff(struct bdiff_line *a, int an, struct bdiff_line *b,
-            		 struct hunk *base)
+            		int bn, struct bdiff_hunk *base)
             {
-            	struct hunk *curr;
+            	struct bdiff_hunk *curr;
             	struct pos *pos;
             	int t, count = 0;
             	/* allocate and fill arrays */
             	t = equatelines(a, an, b, bn);
             	pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));
             	if (pos && t) {
             		/* generate the matching block list */
             		curr = recurse(a, b, pos, 0, an, 0, bn, base);
             		if (!curr)
             			return -1;
             		/* sentinel end hunk */
-            		curr->next = (struct hunk *)malloc(sizeof(struct hunk));
+            		curr->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk));
             		if (!curr->next)
             			return -1;
             		curr = curr->next;
             		curr->a1 = curr->a2 = an;
             		curr->b1 = curr->b2 = bn;
             		curr->next = NULL;
             	}
             	free(pos);
             	/* normalize the hunk list, try to push each hunk towards the end */
             	for (curr = base->next; curr; curr = curr->next) {
-            		struct hunk *next = curr->next;
+            		struct bdiff_hunk *next = curr->next;
             		if (!next)
             			break;
             		if (curr->a2 == next->a1 || curr->b2 == next->b1)
             			while (curr->a2 < an && curr->b2 < bn
             			       && next->a1 < next->a2
             			       && next->b1 < next->b2
             			       && !cmp(a + curr->a2, b + curr->b2)) {
             				curr->a2++;
             				next->a1++;
             				curr->b2++;
             				next->b1++;
             			}
             	}
             	for (curr = base->next; curr; curr = curr->next)
             		count++;
             	return count;
             }
-            static void freehunks(struct hunk *l)
+            static void bdiff_freehunks(struct bdiff_hunk *l)
             {
-            	struct hunk *n;
+            	struct bdiff_hunk *n;
             	for (; l; l = n) {
             		n = l->next;
             		free(l);
             	}
             }
             static PyObject *blocks(PyObject *self, PyObject *args)
             {
             	PyObject *sa, *sb, *rl = NULL, *m;
-            	struct line *a, *b;
+            	struct bdiff_line *a, *b;
-            	struct hunk l, *h;
+            	struct bdiff_hunk l, *h;
             	int an, bn, count, pos = 0;
             	l.next = NULL;
             	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
             		return NULL;
-            	an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
+            	an = bdiff_splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a);
-            	bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
+            	bn = bdiff_splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b);
             	if (!a || !b)
             		goto nomem;
-            	count = diff(a, an, b, bn, &l);
+            	count = bdiff_diff(a, an, b, bn, &l);
             	if (count < 0)
             		goto nomem;
             	rl = PyList_New(count);
             	if (!rl)
             		goto nomem;
             	for (h = l.next; h; h = h->next) {
             		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
             		PyList_SetItem(rl, pos, m);
             		pos++;
             	}
             nomem:
             	free(a);
             	free(b);
-            	freehunks(l.next);
+            	bdiff_freehunks(l.next);
             	return rl ? rl : PyErr_NoMemory();
             }
             static PyObject *bdiff(PyObject *self, PyObject *args)
             {
             	char *sa, *sb, *rb;
             	PyObject *result = NULL;
-            	struct line *al, *bl;
+            	struct bdiff_line *al, *bl;
-            	struct hunk l, *h;
+            	struct bdiff_hunk l, *h;
             	int an, bn, count;
             	Py_ssize_t len = 0, la, lb;
             	PyThreadState *_save;
             	l.next = NULL;
             	if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb))
             		return NULL;
             	if (la > UINT_MAX || lb > UINT_MAX) {
             		PyErr_SetString(PyExc_ValueError, "bdiff inputs too large");
             		return NULL;
             	}
             	_save = PyEval_SaveThread();
-            	an = splitlines(sa, la, &al);
+            	an = bdiff_splitlines(sa, la, &al);
-            	bn = splitlines(sb, lb, &bl);
+            	bn = bdiff_splitlines(sb, lb, &bl);
             	if (!al || !bl)
             		goto nomem;
-            	count = diff(al, an, bl, bn, &l);
+            	count = bdiff_diff(al, an, bl, bn, &l);
             	if (count < 0)
             		goto nomem;
             	/* calculate length of output */
             	la = lb = 0;
             	for (h = l.next; h; h = h->next) {
             		if (h->a1 != la || h->b1 != lb)
             			len += 12 + bl[h->b1].l - bl[lb].l;
             		la = h->a2;
             		lb = h->b2;
             	}
             	PyEval_RestoreThread(_save);
             	_save = NULL;
             	result = PyBytes_FromStringAndSize(NULL, len);
             	if (!result)
             		goto nomem;
             	/* build binary patch */
             	rb = PyBytes_AsString(result);
             	la = lb = 0;
             	for (h = l.next; h; h = h->next) {
             		if (h->a1 != la || h->b1 != lb) {
             			len = bl[h->b1].l - bl[lb].l;
             			putbe32((uint32_t)(al[la].l - al->l), rb);
             			putbe32((uint32_t)(al[h->a1].l - al->l), rb + 4);
             			putbe32((uint32_t)len, rb + 8);
             			memcpy(rb + 12, bl[lb].l, len);
             			rb += 12 + len;
             		}
             		la = h->a2;
             		lb = h->b2;
             	}
             nomem:
             	if (_save)
             		PyEval_RestoreThread(_save);
             	free(al);
             	free(bl);
-            	freehunks(l.next);
+            	bdiff_freehunks(l.next);
             	return result ? result : PyErr_NoMemory();
             }
             /*
              * Normalize whitespace in a bytes object and return a new bytes object.
              *
              * The blank characters are ' ', '\t' and '\r'.
              *  - allws != 0: every blank is dropped.
              *  - allws == 0: each run of blanks collapses to a single ' ', and a ' '
              *    sitting directly before a '\n' is overwritten by that '\n'.
              *
              * Returns NULL when argument parsing fails, and the result of
              * PyErr_NoMemory() when the scratch buffer or the result cannot be
              * allocated.
              */
             static PyObject *fixws(PyObject *self, PyObject *args)
             {
             	PyObject *src, *out = NULL;
             	char allws;
             	const char *in;
             	char *buf;
             	Py_ssize_t inlen, k, n = 0;
             	if (!PyArg_ParseTuple(args, "Sb:fixws", &src, &allws))
             		return NULL;
             	in = PyBytes_AsString(src);
             	inlen = PyBytes_Size(src);
             	/* each input byte yields at most one output byte, so inlen suffices */
             	buf = (char *)malloc(inlen ? inlen : 1);
             	if (buf) {
             		for (k = 0; k != inlen; k++) {
             			const char ch = in[k];
             			const int blank = ch == ' ' || ch == '\t' || ch == '\r';
             			const int after_space = n > 0 && buf[n - 1] == ' ';
             			if (blank) {
             				/* keep one ' ' per run, unless stripping everything */
             				if (!allws && !after_space)
             					buf[n++] = ' ';
             			} else if (ch == '\n' && !allws && after_space) {
             				/* the newline replaces the blank that preceded it */
             				buf[n - 1] = '\n';
             			} else {
             				buf[n++] = ch;
             			}
             		}
             		out = PyBytes_FromStringAndSize(buf, n);
             	}
             	/* free(NULL) is a no-op, so this also covers the failed malloc */
             	free(buf);
             	return out ? out : PyErr_NoMemory();
             }
             /* module docstring, passed to both module-init variants below */
             static char mdiff_doc[] = "Efficient binary diff.";
             /* Functions exported by the module; the all-NULL entry ends the table. */
             static PyMethodDef methods[] = {
             	{"bdiff", bdiff, METH_VARARGS,
             	 "calculate a binary diff\n"},
             	{"blocks", blocks, METH_VARARGS,
             	 "find a list of matching lines\n"},
             	{"fixws", fixws, METH_VARARGS,
             	 "normalize diff whitespaces\n"},
             	{NULL, NULL, 0, NULL}
             };
             #ifdef IS_PY3K
             /* Python 3 module definition; PyInit_bdiff() passes it to PyModule_Create(). */
             static struct PyModuleDef bdiff_module = {
             	PyModuleDef_HEAD_INIT,
             	"bdiff", /* module name */
             	mdiff_doc, /* module docstring */
             	-1, /* m_size; NOTE(review): -1 should mean no per-module state block -- confirm in the PyModuleDef docs */
             	methods /* exported function table */
             };
             /*
              * Module entry point compiled when IS_PY3K is defined: creates the module
              * from bdiff_module and returns whatever PyModule_Create() returns.
              */
             PyMODINIT_FUNC PyInit_bdiff(void)
             {
             	return PyModule_Create(&bdiff_module);
             }
             #else
             /*
              * Module entry point compiled when IS_PY3K is not defined: registers the
              * method table and docstring under the module name "bdiff".
              */
             PyMODINIT_FUNC initbdiff(void)
             {
             	Py_InitModule3("bdiff", methods, mdiff_doc);
             }
             #endif