rhodecode-enterprise-ce Commit - r4954:f53817a4

1

2

3

4

"""Diff Match and Patch

1

"""Diff Match and Patch

5

2

6

3

7

http://code.google.com/p/google-diff-match-patch/

4

http://code.google.com/p/google-diff-match-patch/

8

5

9

Licensed under the Apache License, Version 2.0 (the "License");

6

Licensed under the Apache License, Version 2.0 (the "License");

10

you may not use this file except in compliance with the License.

7

you may not use this file except in compliance with the License.

11

You may obtain a copy of the License at

8

You may obtain a copy of the License at

12

9

13

http://www.apache.org/licenses/LICENSE-2.0

10

http://www.apache.org/licenses/LICENSE-2.0

14

11

15

Unless required by applicable law or agreed to in writing, software

12

Unless required by applicable law or agreed to in writing, software

16

distributed under the License is distributed on an "AS IS" BASIS,

13

distributed under the License is distributed on an "AS IS" BASIS,

17

WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

18

See the License for the specific language governing permissions and

15

See the License for the specific language governing permissions and

19

limitations under the License.

16

limitations under the License.

20

"""

17

"""

21

18

22

"""Functions for diff, match and patch.

19

"""Functions for diff, match and patch.

23

20

24

Computes the difference between two texts to create a patch.

21

Computes the difference between two texts to create a patch.

25

Applies the patch onto another text, allowing for errors.

22

Applies the patch onto another text, allowing for errors.

26

"""

23

"""

27

24

28

__author__ = 'fraser@google.com (Neil Fraser)'

25

__author__ = "fraser@google.com (Neil Fraser)"

29

26

30

import math

27

import math

31

import re

28

import re

32

import sys

29

import sys

33

import time

30

import time

34

import urllib.request, urllib.parse, urllib.error

31

import urllib.request, urllib.parse, urllib.error

35

32

36

33

37

class diff_match_patch:

34

class diff_match_patch:

38

"""Class containing the diff, match and patch methods.

35

"""Class containing the diff, match and patch methods.

39

40

Also contains the behaviour settings.

41

"""

42

43

def __init__(self):

44

"""Inits a diff_match_patch object with default settings.

45

Redefine these in your program to override the defaults.

46

"""

47

48

# Number of seconds to map a diff before giving up (0 for infinity).

49

self.Diff_Timeout = 1.0

50

# Cost of an empty edit operation in terms of edit characters.

51

self.Diff_EditCost = 4

52

# At what point is no match declared (0.0 = perfection, 1.0 = very loose).

53

self.Match_Threshold = 0.5

54

# How far to search for a match (0 = exact location, 1000+ = broad match).

55

# A match this many characters away from the expected location will add

56

# 1.0 to the score (0.0 is a perfect match).

57

self.Match_Distance = 1000

58

# When deleting a large block of text (over ~64 characters), how close do

59

# the contents have to be to match the expected contents. (0.0 = perfection,

60

# 1.0 = very loose). Note that Match_Threshold controls how closely the

61

# end points of a delete need to match.

62

self.Patch_DeleteThreshold = 0.5

63

# Chunk size for context length.

64

self.Patch_Margin = 4

65

66

# The number of bits in an int.

67

# Python has no maximum, thus to disable patch splitting set to 0.

68

# However to avoid long patches in certain pathological cases, use 32.

69

# Multiple short patches (using native ints) are much faster than long ones.

70

self.Match_MaxBits = 32

71

72

# DIFF FUNCTIONS

73

74

# The data structure representing a diff is an array of tuples:

75

# [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]

76

# which means: delete "Hello", add "Goodbye" and keep " world."

77

DIFF_DELETE = -1

78

DIFF_INSERT = 1

79

DIFF_EQUAL = 0

80

81

def diff_main(self, text1, text2, checklines=True, deadline=None):

82

"""Find the differences between two texts. Simplifies the problem by

83

stripping any common prefix or suffix off the texts before diffing.

84

85

Args:

86

text1: Old string to be diffed.

87

text2: New string to be diffed.

88

checklines: Optional speedup flag. If present and false, then don't run

89

a line-level diff first to identify the changed areas.

90

Defaults to true, which does a faster, slightly less optimal diff.

91

deadline: Optional time when the diff should be complete by. Used

92

internally for recursive calls. Users should set DiffTimeout instead.

93

94

Returns:

95

Array of changes.

96

"""

97

# Set a deadline by which time the diff must be complete.

98

if deadline is None:

99

# Unlike in most languages, Python counts time in seconds.

100

if self.Diff_Timeout <= 0:

101

deadline = sys.maxint

102

else:

103

deadline = time.time() + self.Diff_Timeout

104

105

# Check for null inputs.

106

if text1 is None or text2 is None:

107

raise ValueError("Null inputs. (diff_main)")

108

109

# Check for equality (speedup).

110

if text1 == text2:

111

if text1:

112

return [(self.DIFF_EQUAL, text1)]

113

return []

114

115

# Trim off common prefix (speedup).

116

commonlength = self.diff_commonPrefix(text1, text2)

117

commonprefix = text1[:commonlength]

118

text1 = text1[commonlength:]

119

text2 = text2[commonlength:]

120

121

# Trim off common suffix (speedup).

122

commonlength = self.diff_commonSuffix(text1, text2)

123

if commonlength == 0:

124

commonsuffix = ''

125

else:

126

commonsuffix = text1[-commonlength:]

127

text1 = text1[:-commonlength]

128

text2 = text2[:-commonlength]

129

130

# Compute the diff on the middle block.

131

diffs = self.diff_compute(text1, text2, checklines, deadline)

132

133

# Restore the prefix and suffix.

134

if commonprefix:

135

diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]

136

if commonsuffix:

137

diffs.append((self.DIFF_EQUAL, commonsuffix))

138

self.diff_cleanupMerge(diffs)

139

return diffs

140

141

def diff_compute(self, text1, text2, checklines, deadline):

142

"""Find the differences between two texts. Assumes that the texts do not

143

have any common prefix or suffix.

144

145

Args:

146

text1: Old string to be diffed.

147

text2: New string to be diffed.

148

checklines: Speedup flag. If false, then don't run a line-level diff

149

first to identify the changed areas.

150

If true, then run a faster, slightly less optimal diff.

151

deadline: Time when the diff should be complete by.

152

153

Returns:

154

Array of changes.

155

"""

156

if not text1:

157

# Just add some text (speedup).

158

return [(self.DIFF_INSERT, text2)]

159

160

if not text2:

161

# Just delete some text (speedup).

162

return [(self.DIFF_DELETE, text1)]

163

164

if len(text1) > len(text2):

165

(longtext, shorttext) = (text1, text2)

166

else:

167

(shorttext, longtext) = (text1, text2)

168

i = longtext.find(shorttext)

169

if i != -1:

170

# Shorter text is inside the longer text (speedup).

171

diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),

172

(self.DIFF_INSERT, longtext[i + len(shorttext):])]

173

# Swap insertions for deletions if diff is reversed.

174

if len(text1) > len(text2):

175

diffs[0] = (self.DIFF_DELETE, diffs[0][1])

176

diffs[2] = (self.DIFF_DELETE, diffs[2][1])

177

return diffs

178

179

if len(shorttext) == 1:

180

# Single character string.

181

# After the previous speedup, the character can't be an equality.

182

return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]

183

184

# Check to see if the problem can be split in two.

185

hm = self.diff_halfMatch(text1, text2)

186

if hm:

187

# A half-match was found, sort out the return data.

188

(text1_a, text1_b, text2_a, text2_b, mid_common) = hm

189

# Send both pairs off for separate processing.

190

diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)

191

diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)

192

# Merge the results.

193

return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b

194

195

if checklines and len(text1) > 100 and len(text2) > 100:

196

return self.diff_lineMode(text1, text2, deadline)

197

198

return self.diff_bisect(text1, text2, deadline)

199

200

def diff_lineMode(self, text1, text2, deadline):

201

"""Do a quick line-level diff on both strings, then rediff the parts for

202

greater accuracy.

203

This speedup can produce non-minimal diffs.

204

205

Args:

206

text1: Old string to be diffed.

207

text2: New string to be diffed.

208

deadline: Time when the diff should be complete by.

209

210

Returns:

211

Array of changes.

212

"""

213

214

# Scan the text on a line-by-line basis first.

215

(text1, text2, linearray) = self.diff_linesToChars(text1, text2)

216

217

diffs = self.diff_main(text1, text2, False, deadline)

218

219

# Convert the diff back to original text.

220

self.diff_charsToLines(diffs, linearray)

221

# Eliminate freak matches (e.g. blank lines)

222

self.diff_cleanupSemantic(diffs)

223

224

# Rediff any replacement blocks, this time character-by-character.

225

# Add a dummy entry at the end.

226

diffs.append((self.DIFF_EQUAL, ''))

227

pointer = 0

228

count_delete = 0

229

count_insert = 0

230

text_delete = ''

231

text_insert = ''

232

while pointer < len(diffs):

233

if diffs[pointer][0] == self.DIFF_INSERT:

234

count_insert += 1

235

text_insert += diffs[pointer][1]

236

elif diffs[pointer][0] == self.DIFF_DELETE:

237

count_delete += 1

238

text_delete += diffs[pointer][1]

239

elif diffs[pointer][0] == self.DIFF_EQUAL:

240

# Upon reaching an equality, check for prior redundancies.

241

if count_delete >= 1 and count_insert >= 1:

242

# Delete the offending records and add the merged ones.

243

a = self.diff_main(text_delete, text_insert, False, deadline)

244

diffs[pointer - count_delete - count_insert : pointer] = a

245

pointer = pointer - count_delete - count_insert + len(a)

246

count_insert = 0

247

count_delete = 0

248

text_delete = ''

249

text_insert = ''

250

251

pointer += 1

252

253

diffs.pop() # Remove the dummy entry at the end.

254

255

return diffs

256

257

def diff_bisect(self, text1, text2, deadline):

258

"""Find the 'middle snake' of a diff, split the problem in two

259

and return the recursively constructed diff.

260

See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.

261

262

Args:

263

text1: Old string to be diffed.

264

text2: New string to be diffed.

265

deadline: Time at which to bail if not yet complete.

266

267

Returns:

268

Array of diff tuples.

269

"""

270

271

# Cache the text lengths to prevent multiple calls.

272

text1_length = len(text1)

273

text2_length = len(text2)

274

max_d = (text1_length + text2_length + 1) // 2

275

v_offset = max_d

276

v_length = 2 * max_d

277

v1 = [-1] * v_length

278

v1[v_offset + 1] = 0

279

v2 = v1[:]

280

delta = text1_length - text2_length

281

# If the total number of characters is odd, then the front path will

282

# collide with the reverse path.

283

front = (delta % 2 != 0)

284

# Offsets for start and end of k loop.

285

# Prevents mapping of space beyond the grid.

286

k1start = 0

287

k1end = 0

288

k2start = 0

289

k2end = 0

290

for d in range(max_d):

291

# Bail out if deadline is reached.

292

if time.time() > deadline:

293

break

294

295

# Walk the front path one step.

296

for k1 in range(-d + k1start, d + 1 - k1end, 2):

297

k1_offset = v_offset + k1

298

if k1 == -d or (k1 != d and

299

v1[k1_offset - 1] < v1[k1_offset + 1]):

300

x1 = v1[k1_offset + 1]

301

else:

302

x1 = v1[k1_offset - 1] + 1

303

y1 = x1 - k1

304

while (x1 < text1_length and y1 < text2_length and

305

text1[x1] == text2[y1]):

306

x1 += 1

307

y1 += 1

308

v1[k1_offset] = x1

309

if x1 > text1_length:

310

# Ran off the right of the graph.

311

k1end += 2

312

elif y1 > text2_length:

313

# Ran off the bottom of the graph.

314

k1start += 2

315

elif front:

316

k2_offset = v_offset + delta - k1

317

if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:

318

# Mirror x2 onto top-left coordinate system.

319

x2 = text1_length - v2[k2_offset]

320

if x1 >= x2:

321

# Overlap detected.

322

return self.diff_bisectSplit(text1, text2, x1, y1, deadline)

323

324

# Walk the reverse path one step.

325

for k2 in range(-d + k2start, d + 1 - k2end, 2):

326

k2_offset = v_offset + k2

327

if k2 == -d or (k2 != d and

328

v2[k2_offset - 1] < v2[k2_offset + 1]):

329

x2 = v2[k2_offset + 1]

330

else:

331

x2 = v2[k2_offset - 1] + 1

332

y2 = x2 - k2

333

while (x2 < text1_length and y2 < text2_length and

334

text1[-x2 - 1] == text2[-y2 - 1]):

335

x2 += 1

336

y2 += 1

337

v2[k2_offset] = x2

338

if x2 > text1_length:

339

# Ran off the left of the graph.

340

k2end += 2

341

elif y2 > text2_length:

342

# Ran off the top of the graph.

343

k2start += 2

344

elif not front:

345

k1_offset = v_offset + delta - k2

346

if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:

347

x1 = v1[k1_offset]

348

y1 = v_offset + x1 - k1_offset

349

# Mirror x2 onto top-left coordinate system.

350

x2 = text1_length - x2

351

if x1 >= x2:

352

# Overlap detected.

353

return self.diff_bisectSplit(text1, text2, x1, y1, deadline)

354

355

# Diff took too long and hit the deadline or

356

# number of diffs equals number of characters, no commonality at all.

357

return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]

358

359

def diff_bisectSplit(self, text1, text2, x, y, deadline):

360

"""Given the location of the 'middle snake', split the diff in two parts

361

and recurse.

362

363

Args:

364

text1: Old string to be diffed.

365

text2: New string to be diffed.

366

x: Index of split point in text1.

367

y: Index of split point in text2.

368

deadline: Time at which to bail if not yet complete.

369

370

Returns:

371

Array of diff tuples.

372

"""

373

text1a = text1[:x]

374

text2a = text2[:y]

375

text1b = text1[x:]

376

text2b = text2[y:]

377

378

# Compute both diffs serially.

379

diffs = self.diff_main(text1a, text2a, False, deadline)

380

diffsb = self.diff_main(text1b, text2b, False, deadline)

381

382

return diffs + diffsb

383

384

def diff_linesToChars(self, text1, text2):

385

"""Split two texts into an array of strings. Reduce the texts to a string

386

of hashes where each Unicode character represents one line.

387

36

388

Args:

37

Also contains the behaviour settings.

389

text1: First string.

390

text2: Second string.

391

392

Returns:

393

Three element tuple, containing the encoded text1, the encoded text2 and

394

the array of unique strings. The zeroth element of the array of unique

395

strings is intentionally blank.

396

"""

397

lineArray = [] # e.g. lineArray[4] == "Hello\n"

398

lineHash = {} # e.g. lineHash["Hello\n"] == 4

399

400

# "\x00" is a valid character, but various debuggers don't like it.

401

# So we'll insert a junk entry to avoid generating a null character.

402

lineArray.append('')

403

404

def diff_linesToCharsMunge(text):

405

"""Split a text into an array of strings. Reduce the texts to a string

406

of hashes where each Unicode character represents one line.

407

Modifies linearray and linehash through being a closure.

408

409

Args:

410

text: String to encode.

411

412

Returns:

413

Encoded string.

414

"""

415

chars = []

416

# Walk the text, pulling out a substring for each line.

417

# text.split('\n') would would temporarily double our memory footprint.

418

# Modifying text would create many large strings to garbage collect.

419

lineStart = 0

420

lineEnd = -1

421

while lineEnd < len(text) - 1:

422

lineEnd = text.find('\n', lineStart)

423

if lineEnd == -1:

424

lineEnd = len(text) - 1

425

line = text[lineStart:lineEnd + 1]

426

lineStart = lineEnd + 1

427

428

if line in lineHash:

429

chars.append(unichr(lineHash[line]))

430

else:

431

lineArray.append(line)

432

lineHash[line] = len(lineArray) - 1

433

chars.append(unichr(len(lineArray) - 1))

434

return "".join(chars)

435

436

chars1 = diff_linesToCharsMunge(text1)

437

chars2 = diff_linesToCharsMunge(text2)

438

return (chars1, chars2, lineArray)

439

440

def diff_charsToLines(self, diffs, lineArray):

441

"""Rehydrate the text in a diff from a string of line hashes to real lines

442

of text.

443

444

Args:

445

diffs: Array of diff tuples.

446

lineArray: Array of unique strings.

447

"""

448

for x in range(len(diffs)):

449

text = []

450

for char in diffs[x][1]:

451

text.append(lineArray[ord(char)])

452

diffs[x] = (diffs[x][0], "".join(text))

453

454

def diff_commonPrefix(self, text1, text2):

455

"""Determine the common prefix of two strings.

456

457

Args:

458

text1: First string.

459

text2: Second string.

460

461

Returns:

462

The number of characters common to the start of each string.

463

"""

464

# Quick check for common null cases.

465

if not text1 or not text2 or text1[0] != text2[0]:

466

return 0

467

# Binary search.

468

# Performance analysis: http://neil.fraser.name/news/2007/10/09/

469

pointermin = 0

470

pointermax = min(len(text1), len(text2))

471

pointermid = pointermax

472

pointerstart = 0

473

while pointermin < pointermid:

474

if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:

475

pointermin = pointermid

476

pointerstart = pointermin

477

else:

478

pointermax = pointermid

479

pointermid = (pointermax - pointermin) // 2 + pointermin

480

return pointermid

481

482

def diff_commonSuffix(self, text1, text2):

483

"""Determine the common suffix of two strings.

484

485

Args:

486

text1: First string.

487

text2: Second string.

488

489

Returns:

490

The number of characters common to the end of each string.

491

"""

492

# Quick check for common null cases.

493

if not text1 or not text2 or text1[-1] != text2[-1]:

494

return 0

495

# Binary search.

496

# Performance analysis: http://neil.fraser.name/news/2007/10/09/

497

pointermin = 0

498

pointermax = min(len(text1), len(text2))

499

pointermid = pointermax

500

pointerend = 0

501

while pointermin < pointermid:

502

if (text1[-pointermid:len(text1) - pointerend] ==

503

text2[-pointermid:len(text2) - pointerend]):

504

pointermin = pointermid

505

pointerend = pointermin

506

else:

507

pointermax = pointermid

508

pointermid = (pointermax - pointermin) // 2 + pointermin

509

return pointermid

510

511

def diff_commonOverlap(self, text1, text2):

512

"""Determine if the suffix of one string is the prefix of another.

513

514

Args:

515

text1 First string.

516

text2 Second string.

517

518

Returns:

519

The number of characters common to the end of the first

520

string and the start of the second string.

521

"""

522

# Cache the text lengths to prevent multiple calls.

523

text1_length = len(text1)

524

text2_length = len(text2)

525

# Eliminate the null case.

526

if text1_length == 0 or text2_length == 0:

527

return 0

528

# Truncate the longer string.

529

if text1_length > text2_length:

530

text1 = text1[-text2_length:]

531

elif text1_length < text2_length:

532

text2 = text2[:text1_length]

533

text_length = min(text1_length, text2_length)

534

# Quick check for the worst case.

535

if text1 == text2:

536

return text_length

537

538

# Start by looking for a single character match

539

# and increase length until no match is found.

540

# Performance analysis: http://neil.fraser.name/news/2010/11/04/

541

best = 0

542

length = 1

543

while True:

544

pattern = text1[-length:]

545

found = text2.find(pattern)

546

if found == -1:

547

return best

548

length += found

549

if found == 0 or text1[-length:] == text2[:length]:

550

best = length

551

length += 1

552

553

def diff_halfMatch(self, text1, text2):

554

"""Do the two texts share a substring which is at least half the length of

555

the longer text?

556

This speedup can produce non-minimal diffs.

557

558

Args:

559

text1: First string.

560

text2: Second string.

561

562

Returns:

563

Five element Array, containing the prefix of text1, the suffix of text1,

564

the prefix of text2, the suffix of text2 and the common middle. Or None

565

if there was no match.

566

"""

567

if self.Diff_Timeout <= 0:

568

# Don't risk returning a non-optimal diff if we have unlimited time.

569

return None

570

if len(text1) > len(text2):

571

(longtext, shorttext) = (text1, text2)

572

else:

573

(shorttext, longtext) = (text1, text2)

574

if len(longtext) < 4 or len(shorttext) * 2 < len(longtext):

575

return None # Pointless.

576

577

def diff_halfMatchI(longtext, shorttext, i):

578

"""Does a substring of shorttext exist within longtext such that the

579

substring is at least half the length of longtext?

580

Closure, but does not reference any external variables.

581

582

Args:

583

longtext: Longer string.

584

shorttext: Shorter string.

585

i: Start index of quarter length substring within longtext.

586

587

Returns:

588

Five element Array, containing the prefix of longtext, the suffix of

589

longtext, the prefix of shorttext, the suffix of shorttext and the

590

common middle. Or None if there was no match.

591

"""

592

seed = longtext[i:i + len(longtext) // 4]

593

best_common = ''

594

j = shorttext.find(seed)

595

while j != -1:

596

prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])

597

suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])

598

if len(best_common) < suffixLength + prefixLength:

599

best_common = (shorttext[j - suffixLength:j] +

600

shorttext[j:j + prefixLength])

601

best_longtext_a = longtext[:i - suffixLength]

602

best_longtext_b = longtext[i + prefixLength:]

603

best_shorttext_a = shorttext[:j - suffixLength]

604

best_shorttext_b = shorttext[j + prefixLength:]

605

j = shorttext.find(seed, j + 1)

606

607

if len(best_common) * 2 >= len(longtext):

608

return (best_longtext_a, best_longtext_b,

609

best_shorttext_a, best_shorttext_b, best_common)

610

else:

611

return None

612

613

# First check if the second quarter is the seed for a half-match.

614

hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4)

615

# Check again based on the third quarter.

616

hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2)

617

if not hm1 and not hm2:

618

return None

619

elif not hm2:

620

hm = hm1

621

elif not hm1:

622

hm = hm2

623

else:

624

# Both matched. Select the longest.

625

if len(hm1[4]) > len(hm2[4]):

626

hm = hm1

627

else:

628

hm = hm2

629

630

# A half-match was found, sort out the return data.

631

if len(text1) > len(text2):

632

(text1_a, text1_b, text2_a, text2_b, mid_common) = hm

633

else:

634

(text2_a, text2_b, text1_a, text1_b, mid_common) = hm

635

return (text1_a, text1_b, text2_a, text2_b, mid_common)

636

637

def diff_cleanupSemantic(self, diffs):

638

"""Reduce the number of edits by eliminating semantically trivial

639

equalities.

640

641

Args:

642

diffs: Array of diff tuples.

643

"""

644

changes = False

645

equalities = [] # Stack of indices where equalities are found.

646

lastequality = None # Always equal to diffs[equalities[-1]][1]

647

pointer = 0 # Index of current position.

648

# Number of chars that changed prior to the equality.

649

length_insertions1, length_deletions1 = 0, 0

650

# Number of chars that changed after the equality.

651

length_insertions2, length_deletions2 = 0, 0

652

while pointer < len(diffs):

653

if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.

654

equalities.append(pointer)

655

length_insertions1, length_insertions2 = length_insertions2, 0

656

length_deletions1, length_deletions2 = length_deletions2, 0

657

lastequality = diffs[pointer][1]

658

else: # An insertion or deletion.

659

if diffs[pointer][0] == self.DIFF_INSERT:

660

length_insertions2 += len(diffs[pointer][1])

661

else:

662

length_deletions2 += len(diffs[pointer][1])

663

# Eliminate an equality that is smaller or equal to the edits on both

664

# sides of it.

665

if (lastequality and (len(lastequality) <=

666

max(length_insertions1, length_deletions1)) and

667

(len(lastequality) <= max(length_insertions2, length_deletions2))):

668

# Duplicate record.

669

diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))

670

# Change second copy to insert.

671

diffs[equalities[-1] + 1] = (self.DIFF_INSERT,

672

diffs[equalities[-1] + 1][1])

673

# Throw away the equality we just deleted.

674

equalities.pop()

675

# Throw away the previous equality (it needs to be reevaluated).

676

if len(equalities):

677

equalities.pop()

678

if len(equalities):

679

pointer = equalities[-1]

680

else:

681

pointer = -1

682

# Reset the counters.

683

length_insertions1, length_deletions1 = 0, 0

684

length_insertions2, length_deletions2 = 0, 0

685

lastequality = None

686

changes = True

687

pointer += 1

688

689

# Normalize the diff.

690

if changes:

691

self.diff_cleanupMerge(diffs)

692

self.diff_cleanupSemanticLossless(diffs)

693

694

# Find any overlaps between deletions and insertions.

695

# e.g: <del>abcxxx</del><ins>xxxdef</ins>

696

# -> <del>abc</del>xxx<ins>def</ins>

697

# e.g: <del>xxxabc</del><ins>defxxx</ins>

698

# -> <ins>def</ins>xxx<del>abc</del>

699

# Only extract an overlap if it is as big as the edit ahead or behind it.

700

pointer = 1

701

while pointer < len(diffs):

702

if (diffs[pointer - 1][0] == self.DIFF_DELETE and

703

diffs[pointer][0] == self.DIFF_INSERT):

704

deletion = diffs[pointer - 1][1]

705

insertion = diffs[pointer][1]

706

overlap_length1 = self.diff_commonOverlap(deletion, insertion)

707

overlap_length2 = self.diff_commonOverlap(insertion, deletion)

708

if overlap_length1 >= overlap_length2:

709

if (overlap_length1 >= len(deletion) / 2.0 or

710

overlap_length1 >= len(insertion) / 2.0):

711

# Overlap found. Insert an equality and trim the surrounding edits.

712

diffs.insert(pointer, (self.DIFF_EQUAL,

713

insertion[:overlap_length1]))

714

diffs[pointer - 1] = (self.DIFF_DELETE,

715

deletion[:len(deletion) - overlap_length1])

716

diffs[pointer + 1] = (self.DIFF_INSERT,

717

insertion[overlap_length1:])

718

pointer += 1

719

else:

720

if (overlap_length2 >= len(deletion) / 2.0 or

721

overlap_length2 >= len(insertion) / 2.0):

722

# Reverse overlap found.

723

# Insert an equality and swap and trim the surrounding edits.

724

diffs.insert(pointer, (self.DIFF_EQUAL, deletion[:overlap_length2]))

725

diffs[pointer - 1] = (self.DIFF_INSERT,

726

insertion[:len(insertion) - overlap_length2])

727

diffs[pointer + 1] = (self.DIFF_DELETE, deletion[overlap_length2:])

728

pointer += 1

729

pointer += 1

730

pointer += 1

731

732

def diff_cleanupSemanticLossless(self, diffs):

733

"""Look for single edits surrounded on both sides by equalities

734

which can be shifted sideways to align the edit to a word boundary.

735

e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.

736

737

Args:

738

diffs: Array of diff tuples.

739

"""

38

"""

740

39

741

def diff_cleanupSemanticScore(one, two):

40

def __init__(self):

742

"""Given two strings, compute a score representing whether the

41

"""Inits a diff_match_patch object with default settings.

743

internal boundary falls on logical boundaries.

42

Redefine these in your program to override the defaults.

744

Scores range from 6 (best) to 0 (worst).

43

"""

745

Closure, but does not reference any external variables.

44

45

# Number of seconds to map a diff before giving up (0 for infinity).

46

self.Diff_Timeout = 1.0

47

# Cost of an empty edit operation in terms of edit characters.

48

self.Diff_EditCost = 4

49

# At what point is no match declared (0.0 = perfection, 1.0 = very loose).

50

self.Match_Threshold = 0.5

51

# How far to search for a match (0 = exact location, 1000+ = broad match).

52

# A match this many characters away from the expected location will add

53

# 1.0 to the score (0.0 is a perfect match).

54

self.Match_Distance = 1000

55

# When deleting a large block of text (over ~64 characters), how close do

56

# the contents have to be to match the expected contents. (0.0 = perfection,

57

# 1.0 = very loose). Note that Match_Threshold controls how closely the

58

# end points of a delete need to match.

59

self.Patch_DeleteThreshold = 0.5

60

# Chunk size for context length.

61

self.Patch_Margin = 4

62

63

# The number of bits in an int.

64

# Python has no maximum, thus to disable patch splitting set to 0.

65

# However to avoid long patches in certain pathological cases, use 32.

66

# Multiple short patches (using native ints) are much faster than long ones.

67

self.Match_MaxBits = 32

68

69

# DIFF FUNCTIONS

746

70

747

Args:

71

# The data structure representing a diff is an array of tuples:

748

one: First string.

72

# [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]

749

two: Second string.

73

# which means: delete "Hello", add "Goodbye" and keep " world."

74

DIFF_DELETE = -1

75

DIFF_INSERT = 1

76

DIFF_EQUAL = 0

77

78

def diff_main(self, text1, text2, checklines=True, deadline=None):

79

"""Find the differences between two texts. Simplifies the problem by

80

stripping any common prefix or suffix off the texts before diffing.

81

82

Args:

83

text1: Old string to be diffed.

84

text2: New string to be diffed.

85

checklines: Optional speedup flag. If present and false, then don't run

86

a line-level diff first to identify the changed areas.

87

Defaults to true, which does a faster, slightly less optimal diff.

88

deadline: Optional time when the diff should be complete by. Used

89

internally for recursive calls. Users should set DiffTimeout instead.

90

91

Returns:

92

Array of changes.

93

"""

94

# Set a deadline by which time the diff must be complete.

95

if deadline is None:

96

# Unlike in most languages, Python counts time in seconds.

97

if self.Diff_Timeout <= 0:

98

deadline = sys.maxsize

99

else:

100

deadline = time.time() + self.Diff_Timeout

101

102

# Check for null inputs.

103

if text1 is None or text2 is None:

104

raise ValueError("Null inputs. (diff_main)")

750

105

751

Returns:

106

# Check for equality (speedup).

752

The score.

107

if text1 == text2:

753

"""

108

if text1:

754

if not one or not two:

109

return [(self.DIFF_EQUAL, text1)]

755

# Edges are the best.

110

return []

756

return 6

111

112

# Trim off common prefix (speedup).

113

commonlength = self.diff_commonPrefix(text1, text2)

114

commonprefix = text1[:commonlength]

115

text1 = text1[commonlength:]

116

text2 = text2[commonlength:]

117

118

# Trim off common suffix (speedup).

119

commonlength = self.diff_commonSuffix(text1, text2)

120

if commonlength == 0:

121

commonsuffix = ""

122

else:

123

commonsuffix = text1[-commonlength:]

124

text1 = text1[:-commonlength]

125

text2 = text2[:-commonlength]

126

127

# Compute the diff on the middle block.

128

diffs = self.diff_compute(text1, text2, checklines, deadline)

129

130

# Restore the prefix and suffix.

131

if commonprefix:

132

diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]

133

if commonsuffix:

134

diffs.append((self.DIFF_EQUAL, commonsuffix))

135

self.diff_cleanupMerge(diffs)

136

return diffs

137

138

def diff_compute(self, text1, text2, checklines, deadline):

139

"""Find the differences between two texts. Assumes that the texts do not

140

have any common prefix or suffix.

757

141

758

# Each port of this function behaves slightly differently due to

142

Args:

759

# subtle differences in each language's definition of things like

143

text1: Old string to be diffed.

760

# 'whitespace'. Since this function's purpose is largely cosmetic,

144

text2: New string to be diffed.

761

# the choice has been made to use each language's native features

145

checklines: Speedup flag. If false, then don't run a line-level diff

762

# rather than force total conformity.

146

first to identify the changed areas.

763

char1 = one[-1]

147

If true, then run a faster, slightly less optimal diff.

764

char2 = two[0]

148

deadline: Time when the diff should be complete by.

765

nonAlphaNumeric1 = not char1.isalnum()

149

766

nonAlphaNumeric2 = not char2.isalnum()

150

Returns:

767

whitespace1 = nonAlphaNumeric1 and char1.isspace()

151

Array of changes.

768

whitespace2 = nonAlphaNumeric2 and char2.isspace()

152

"""

769

lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n")

153

if not text1:

770

lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n")

154

# Just add some text (speedup).

771

blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one)

155

return [(self.DIFF_INSERT, text2)]

772

blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two)

156

157

if not text2:

158

# Just delete some text (speedup).

159

return [(self.DIFF_DELETE, text1)]

160

161

if len(text1) > len(text2):

162

(longtext, shorttext) = (text1, text2)

163

else:

164

(shorttext, longtext) = (text1, text2)

165

i = longtext.find(shorttext)

166

if i != -1:

167

# Shorter text is inside the longer text (speedup).

168

diffs = [

169

(self.DIFF_INSERT, longtext[:i]),

170

(self.DIFF_EQUAL, shorttext),

171

(self.DIFF_INSERT, longtext[i + len(shorttext) :]),

172

]

173

# Swap insertions for deletions if diff is reversed.

174

if len(text1) > len(text2):

175

diffs[0] = (self.DIFF_DELETE, diffs[0][1])

176

diffs[2] = (self.DIFF_DELETE, diffs[2][1])

177

return diffs

773

178

774

if blankLine1 or blankLine2:

179

if len(shorttext) == 1:

775

# Five points for blank lines.

180

# Single character string.

776

return 5

181

# After the previous speedup, the character can't be an equality.

777

elif lineBreak1 or lineBreak2:

182

return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]

778

# Four points for line breaks.

183

779

return 4

184

# Check to see if the problem can be split in two.

780

elif nonAlphaNumeric1 and not whitespace1 and whitespace2:

185

hm = self.diff_halfMatch(text1, text2)

781

# Three points for end of sentences.

186

if hm:

782

return 3

187

# A half-match was found, sort out the return data.

783

elif whitespace1 or whitespace2:

188

(text1_a, text1_b, text2_a, text2_b, mid_common) = hm

784

# Two points for whitespace.

189

# Send both pairs off for separate processing.

785

return 2

190

diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)

786

elif nonAlphaNumeric1 or nonAlphaNumeric2:

191

diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)

787

# One point for non-alphanumeric.

192

# Merge the results.

788

return 1

193

return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b

789

return 0

194

195

if checklines and len(text1) > 100 and len(text2) > 100:

196

return self.diff_lineMode(text1, text2, deadline)

197

198

return self.diff_bisect(text1, text2, deadline)

199

200

def diff_lineMode(self, text1, text2, deadline):

201

"""Do a quick line-level diff on both strings, then rediff the parts for

202

greater accuracy.

203

This speedup can produce non-minimal diffs.

204

205

Args:

206

text1: Old string to be diffed.

207

text2: New string to be diffed.

208

deadline: Time when the diff should be complete by.

209

210

Returns:

211

Array of changes.

212

"""

790

213

791

pointer = 1

214

# Scan the text on a line-by-line basis first.

792

# Intentionally ignore the first and last element (don't need checking).

215

(text1, text2, linearray) = self.diff_linesToChars(text1, text2)

793

while pointer < len(diffs) - 1:

216

794

if (diffs[pointer - 1][0] == self.DIFF_EQUAL and

217

diffs = self.diff_main(text1, text2, False, deadline)

795

diffs[pointer + 1][0] == self.DIFF_EQUAL):

218

796

# This is a single edit surrounded by equalities.

219

# Convert the diff back to original text.

797

equality1 = diffs[pointer - 1][1]

220

self.diff_charsToLines(diffs, linearray)

798

edit = diffs[pointer][1]

221

# Eliminate freak matches (e.g. blank lines)

799

equality2 = diffs[pointer + 1][1]

222

self.diff_cleanupSemantic(diffs)

223

224

# Rediff any replacement blocks, this time character-by-character.

225

# Add a dummy entry at the end.

226

diffs.append((self.DIFF_EQUAL, ""))

227

pointer = 0

228

count_delete = 0

229

count_insert = 0

230

text_delete = ""

231

text_insert = ""

232

while pointer < len(diffs):

233

if diffs[pointer][0] == self.DIFF_INSERT:

234

count_insert += 1

235

text_insert += diffs[pointer][1]

236

elif diffs[pointer][0] == self.DIFF_DELETE:

237

count_delete += 1

238

text_delete += diffs[pointer][1]

239

elif diffs[pointer][0] == self.DIFF_EQUAL:

240

# Upon reaching an equality, check for prior redundancies.

241

if count_delete >= 1 and count_insert >= 1:

242

# Delete the offending records and add the merged ones.

243

a = self.diff_main(text_delete, text_insert, False, deadline)

244

diffs[pointer - count_delete - count_insert : pointer] = a

245

pointer = pointer - count_delete - count_insert + len(a)

246

count_insert = 0

247

count_delete = 0

248

text_delete = ""

249

text_insert = ""

800

250

801

# First, shift the edit as far left as possible.

251

pointer += 1

802

commonOffset = self.diff_commonSuffix(equality1, edit)

252

803

if commonOffset:

253

diffs.pop() # Remove the dummy entry at the end.

804

commonString = edit[-commonOffset:]

254

805

equality1 = equality1[:-commonOffset]

255

return diffs

806

edit = commonString + edit[:-commonOffset]

256

807

equality2 = commonString + equality2

257

def diff_bisect(self, text1, text2, deadline):

258

"""Find the 'middle snake' of a diff, split the problem in two

259

and return the recursively constructed diff.

260

See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.

261

262

Args:

263

text1: Old string to be diffed.

264

text2: New string to be diffed.

265

deadline: Time at which to bail if not yet complete.

266

267

Returns:

268

Array of diff tuples.

269

"""

808

270

809

# Second, step character by character right, looking for the best fit.

271

# Cache the text lengths to prevent multiple calls.

810

bestEquality1 = equality1

272

text1_length = len(text1)

811

bestEdit = edit

273

text2_length = len(text2)

812

bestEquality2 = equality2

274

max_d = (text1_length + text2_length + 1) // 2

813

bestScore = (diff_cleanupSemanticScore(equality1, edit) +

275

v_offset = max_d

814

diff_cleanupSemanticScore(edit, equality2))

276

v_length = 2 * max_d

815

while edit and equality2 and edit[0] == equality2[0]:

277

v1 = [-1] * v_length

816

equality1 += edit[0]

278

v1[v_offset + 1] = 0

817

edit = edit[1:] + equality2[0]

279

v2 = v1[:]

818

equality2 = equality2[1:]

280

delta = text1_length - text2_length

819

score = (diff_cleanupSemanticScore(equality1, edit) +

281

# If the total number of characters is odd, then the front path will

820

diff_cleanupSemanticScore(edit, equality2))

282

# collide with the reverse path.

821

# The >= encourages trailing rather than leading whitespace on edits.

283

front = delta % 2 != 0

822

if score >= bestScore:

284

# Offsets for start and end of k loop.

823

bestScore = score

285

# Prevents mapping of space beyond the grid.

824

bestEquality1 = equality1

286

k1start = 0

825

bestEdit = edit

287

k1end = 0

826

bestEquality2 = equality2

288

k2start = 0

289

k2end = 0

290

for d in range(max_d):

291

# Bail out if deadline is reached.

292

if time.time() > deadline:

293

break

294

295

# Walk the front path one step.

296

for k1 in range(-d + k1start, d + 1 - k1end, 2):

297

k1_offset = v_offset + k1

298

if k1 == -d or (k1 != d and v1[k1_offset - 1] < v1[k1_offset + 1]):

299

x1 = v1[k1_offset + 1]

300

else:

301

x1 = v1[k1_offset - 1] + 1

302

y1 = x1 - k1

303

while (

304

x1 < text1_length and y1 < text2_length and text1[x1] == text2[y1]

305

):

306

x1 += 1

307

y1 += 1

308

v1[k1_offset] = x1

309

if x1 > text1_length:

310

# Ran off the right of the graph.

311

k1end += 2

312

elif y1 > text2_length:

313

# Ran off the bottom of the graph.

314

k1start += 2

315

elif front:

316

k2_offset = v_offset + delta - k1

317

if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:

318

# Mirror x2 onto top-left coordinate system.

319

x2 = text1_length - v2[k2_offset]

320

if x1 >= x2:

321

# Overlap detected.

322

return self.diff_bisectSplit(text1, text2, x1, y1, deadline)

827

323

828

if diffs[pointer - 1][1] != bestEquality1:

324

# Walk the reverse path one step.

829

# We have an improvement, save it back to the diff.

325

for k2 in range(-d + k2start, d + 1 - k2end, 2):

830

if bestEquality1:

326

k2_offset = v_offset + k2

831

diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)

327

if k2 == -d or (k2 != d and v2[k2_offset - 1] < v2[k2_offset + 1]):

832

else:

328

x2 = v2[k2_offset + 1]

833

del diffs[pointer - 1]

329

else:

834

pointer -= 1

330

x2 = v2[k2_offset - 1] + 1

835

diffs[pointer] = (diffs[pointer][0], bestEdit)

331

y2 = x2 - k2

836

if bestEquality2:

332

while (

837

diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)

333

x2 < text1_length

838

else:

334

and y2 < text2_length

839

del diffs[pointer + 1]

335

and text1[-x2 - 1] == text2[-y2 - 1]

840

~~pointer~~ -= 1

336

):

841

pointer += 1

337

x2 += 1

338

y2 += 1

339

v2[k2_offset] = x2

340

if x2 > text1_length:

341

# Ran off the left of the graph.

342

k2end += 2

343

elif y2 > text2_length:

344

# Ran off the top of the graph.

345

k2start += 2

346

elif not front:

347

k1_offset = v_offset + delta - k2

348

if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:

349

x1 = v1[k1_offset]

350

y1 = v_offset + x1 - k1_offset

351

# Mirror x2 onto top-left coordinate system.

352

x2 = text1_length - x2

353

if x1 >= x2:

354

# Overlap detected.

355

return self.diff_bisectSplit(text1, text2, x1, y1, deadline)

356

357

# Diff took too long and hit the deadline or

358

# number of diffs equals number of characters, no commonality at all.

359

return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]

360

361

def diff_bisectSplit(self, text1, text2, x, y, deadline):

362

"""Given the location of the 'middle snake', split the diff in two parts

363

and recurse.

364

365

Args:

366

text1: Old string to be diffed.

367

text2: New string to be diffed.

368

x: Index of split point in text1.

369

y: Index of split point in text2.

370

deadline: Time at which to bail if not yet complete.

371

372

Returns:

373

Array of diff tuples.

374

"""

375

text1a = text1[:x]

376

text2a = text2[:y]

377

text1b = text1[x:]

378

text2b = text2[y:]

379

380

# Compute both diffs serially.

381

diffs = self.diff_main(text1a, text2a, False, deadline)

382

diffsb = self.diff_main(text1b, text2b, False, deadline)

842

383

843

# Define some regex patterns for matching boundaries.

384

return diffs + diffsb

844

BLANKLINEEND = re.compile(r"\n\r?\n$");

385

845

BLANKLINESTART = re.compile(r"^\r?\n\r?\n");

386

def diff_linesToChars(self, text1, text2):

387

"""Split two texts into an array of strings. Reduce the texts to a string

388

of hashes where each Unicode character represents one line.

389

390

Args:

391

text1: First string.

392

text2: Second string.

393

394

Returns:

395

Three element tuple, containing the encoded text1, the encoded text2 and

396

the array of unique strings. The zeroth element of the array of unique

397

strings is intentionally blank.

398

"""

399

lineArray = [] # e.g. lineArray[4] == "Hello\n"

400

lineHash = {} # e.g. lineHash["Hello\n"] == 4

401

402

# "\x00" is a valid character, but various debuggers don't like it.

403

# So we'll insert a junk entry to avoid generating a null character.

404

lineArray.append("")

405

406

def diff_linesToCharsMunge(text):

407

"""Split a text into an array of strings. Reduce the texts to a string

408

of hashes where each Unicode character represents one line.

409

Modifies linearray and linehash through being a closure.

410

411

Args:

412

text: String to encode.

413

414

Returns:

415

Encoded string.

416

"""

417

chars = []

418

# Walk the text, pulling out a substring for each line.

419

# text.split('\n') would would temporarily double our memory footprint.

420

# Modifying text would create many large strings to garbage collect.

421

lineStart = 0

422

lineEnd = -1

423

while lineEnd < len(text) - 1:

424

lineEnd = text.find("\n", lineStart)

425

if lineEnd == -1:

426

lineEnd = len(text) - 1

427

line = text[lineStart : lineEnd + 1]

428

lineStart = lineEnd + 1

429

430

if line in lineHash:

431

chars.append(chr(lineHash[line]))

432

else:

433

lineArray.append(line)

434

lineHash[line] = len(lineArray) - 1

435

chars.append(chr(len(lineArray) - 1))

436

return "".join(chars)

437

438

chars1 = diff_linesToCharsMunge(text1)

439

chars2 = diff_linesToCharsMunge(text2)

440

return (chars1, chars2, lineArray)

441

442

def diff_charsToLines(self, diffs, lineArray):

443

"""Rehydrate the text in a diff from a string of line hashes to real lines

444

of text.

846

445

847

def diff_cleanupEfficiency(self, diffs):

446

Args:

848

"""Reduce the number of edits by eliminating operationally trivial

447

diffs: Array of diff tuples.

849

equalities.

448

lineArray: Array of unique strings.

449

"""

450

for x in range(len(diffs)):

451

text = []

452

for char in diffs[x][1]:

453

text.append(lineArray[ord(char)])

454

diffs[x] = (diffs[x][0], "".join(text))

455

456

def diff_commonPrefix(self, text1, text2):

457

"""Determine the common prefix of two strings.

458

459

Args:

460

text1: First string.

461

text2: Second string.

462

463

Returns:

464

The number of characters common to the start of each string.

465

"""

466

# Quick check for common null cases.

467

if not text1 or not text2 or text1[0] != text2[0]:

468

return 0

469

# Binary search.

470

# Performance analysis: http://neil.fraser.name/news/2007/10/09/

471

pointermin = 0

472

pointermax = min(len(text1), len(text2))

473

pointermid = pointermax

474

pointerstart = 0

475

while pointermin < pointermid:

476

if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:

477

pointermin = pointermid

478

pointerstart = pointermin

479

else:

480

pointermax = pointermid

481

pointermid = (pointermax - pointermin) // 2 + pointermin

482

return pointermid

483

484

def diff_commonSuffix(self, text1, text2):

485

"""Determine the common suffix of two strings.

486

487

Args:

488

text1: First string.

489

text2: Second string.

850

490

851

Args:

491

Returns:

852

diffs: Array of diff tuples.

492

The number of characters common to the end of each string.

853

"""

493

"""

854

changes = False

494

# Quick check for common null cases.

855

equalities = [] # Stack of indices where equalities are found.

495

if not text1 or not text2 or text1[-1] != text2[-1]:

856

lastequality = None # Always equal to diffs[equalities[-1]][1]

496

return 0

857

pointer = 0 # Index of current position.

497

# Binary search.

858

pre_ins = False # Is there an insertion operation before the last equality.

498

# Performance analysis: http://neil.fraser.name/news/2007/10/09/

859

pre_del = False # Is there a deletion operation before the last equality.

499

pointermin = 0

860

post_ins = False # Is there an insertion operation after the last equality.

500

pointermax = min(len(text1), len(text2))

861

post_del = False # Is there a deletion operation after the last equality.

501

pointermid = pointermax

862

while pointer < len(diffs):

502

pointerend = 0

863

if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.

503

while pointermin < pointermid:

864

if (len(diffs[pointer][1]) < self.Diff_EditCost and

504

if (

865

(post_ins or post_del)):

505

text1[-pointermid : len(text1) - pointerend]

866

# Candidate found.

506

== text2[-pointermid : len(text2) - pointerend]

867

equalities.append(pointer)

507

):

868

pre_ins = post_ins

508

pointermin = pointermid

869

~~pre_del~~ = po~~st_del~~

509

pointerend = pointermin

870

lastequality = diffs[pointer][1]

510

else:

511

pointermax = pointermid

512

pointermid = (pointermax - pointermin) // 2 + pointermin

513

return pointermid

514

515

def diff_commonOverlap(self, text1, text2):

516

"""Determine if the suffix of one string is the prefix of another.

517

518

Args:

519

text1 First string.

520

text2 Second string.

521

522

Returns:

523

The number of characters common to the end of the first

524

string and the start of the second string.

525

"""

526

# Cache the text lengths to prevent multiple calls.

527

text1_length = len(text1)

528

text2_length = len(text2)

529

# Eliminate the null case.

530

if text1_length == 0 or text2_length == 0:

531

return 0

532

# Truncate the longer string.

533

if text1_length > text2_length:

534

text1 = text1[-text2_length:]

535

elif text1_length < text2_length:

536

text2 = text2[:text1_length]

537

text_length = min(text1_length, text2_length)

538

# Quick check for the worst case.

539

if text1 == text2:

540

return text_length

541

542

# Start by looking for a single character match

543

# and increase length until no match is found.

544

# Performance analysis: http://neil.fraser.name/news/2010/11/04/

545

best = 0

546

length = 1

547

while True:

548

pattern = text1[-length:]

549

found = text2.find(pattern)

550

if found == -1:

551

return best

552

length += found

553

if found == 0 or text1[-length:] == text2[:length]:

554

best = length

555

length += 1

556

557

def diff_halfMatch(self, text1, text2):

558

"""Do the two texts share a substring which is at least half the length of

559

the longer text?

560

This speedup can produce non-minimal diffs.

561

562

Args:

563

text1: First string.

564

text2: Second string.

565

566

Returns:

567

Five element Array, containing the prefix of text1, the suffix of text1,

568

the prefix of text2, the suffix of text2 and the common middle. Or None

569

if there was no match.

570

"""

571

if self.Diff_Timeout <= 0:

572

# Don't risk returning a non-optimal diff if we have unlimited time.

573

return None

574

if len(text1) > len(text2):

575

(longtext, shorttext) = (text1, text2)

871

else:

576

else:

872

# Not a candidate, and can never become one.

577

(shorttext, longtext) = (text1, text2)

873

equalities = []

578

if len(longtext) < 4 or len(shorttext) * 2 < len(longtext):

874

lastequality = None

579

return None # Pointless.

875

580

876

post_ins = post_del = False

581

def diff_halfMatchI(longtext, shorttext, i):

877

else: # An insertion or deletion.

582

"""Does a substring of shorttext exist within longtext such that the

878

if diffs[pointer][0] == self.DIFF_DELETE:

583

substring is at least half the length of longtext?

879

post_del = True

584

Closure, but does not reference any external variables.

880

else:

881

post_ins = True

882

585

883

# Five types to be split:

586

Args:

884

# <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>

587

longtext: Longer string.

885

# <ins>A</ins>X<ins>C</ins><del>D</del>

588

shorttext: Shorter string.

886

# <ins>A</ins><del>B</del>X<ins>C</ins>

589

i: Start index of quarter length substring within longtext.

887

# <ins>A</del>X<ins>C</ins><del>D</del>

888

# <ins>A</ins><del>B</del>X<del>C</del>

889

590

890

if lastequality and ((pre_ins and pre_del and post_ins and post_del) or

591

Returns:

891

((len(lastequality) < self.Diff_EditCost / 2) and

592

Five element Array, containing the prefix of longtext, the suffix of

892

(pre_ins + pre_del + post_ins + post_del) == 3)):

593

longtext, the prefix of shorttext, the suffix of shorttext and the

893

# Duplicate record.

594

common middle. Or None if there was no match.

894

diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))

595

"""

895

# Change second copy to insert.

596

seed = longtext[i : i + len(longtext) // 4]

896

diffs[equalities[-1] + 1] = (self.DIFF_INSERT,

597

best_common = ""

897

diffs[equalities[-1] + 1][1])

598

j = shorttext.find(seed)

898

equalities.pop() # Throw away the equality we just deleted.

599

while j != -1:

899

lastequality = None

600

prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])

900

if pre_ins and pre_del:

601

suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])

901

# No changes made which could affect previous entry, keep going.

602

if len(best_common) < suffixLength + prefixLength:

902

post_ins = post_del = True

603

best_common = (

903

equalities = []

604

shorttext[j - suffixLength : j]

904

else:

605

+ shorttext[j : j + prefixLength]

905

if len(equalities):

606

)

906

equalities.pop() # Throw away the previous equality.

607

best_longtext_a = longtext[: i - suffixLength]

907

if len(equalities):

608

best_longtext_b = longtext[i + prefixLength :]

908

pointer = equalities[-1]

609

best_shorttext_a = shorttext[: j - suffixLength]

909

else:

610

best_shorttext_b = shorttext[j + prefixLength :]

910

pointer = -1

611

j = shorttext.find(seed, j + 1)

911

post_ins = post_del = False

912

changes = True

913

pointer += 1

914

915

if changes:

916

self.diff_cleanupMerge(diffs)

917

918

def diff_cleanupMerge(self, diffs):

919

"""Reorder and merge like edit sections. Merge equalities.

920

Any edit section can move as long as it doesn't cross an equality.

921

612

922

Args:

613

if len(best_common) * 2 >= len(longtext):

923

diffs: Array of diff tuples.

614

return (

924

"""

615

best_longtext_a,

925

diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end.

616

best_longtext_b,

926

pointer = 0

617

best_shorttext_a,

927

count_delete = 0

618

best_shorttext_b,

928

count_insert = 0

619

best_common,

929

text_delete = ''

620

)

930

text_insert = ''

621

else:

931

while pointer < len(diffs):

622

return None

932

if diffs[pointer][0] == self.DIFF_INSERT:

623

933

count_insert += 1

624

# First check if the second quarter is the seed for a half-match.

934

text_insert += diffs[pointer][1]

625

hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4)

935

pointer += 1

626

# Check again based on the third quarter.

936

elif diffs[pointer][0] == self.DIFF_DELETE:

627

hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2)

937

count_delete += 1

628

if not hm1 and not hm2:

938

text_delete += diffs[pointer][1]

629

return None

939

pointer += 1

630

elif not hm2:

940

elif diffs[pointer][0] == self.DIFF_EQUAL:

631

hm = hm1

941

# Upon reaching an equality, check for prior redundancies.

632

elif not hm1:

942

if count_delete + count_insert > 1:

633

hm = hm2

943

if count_delete != 0 and count_insert != 0:

634

else:

944

# Factor out any common prefixies.

635

# Both matched. Select the longest.

945

commonlength = self.diff_commonPrefix(text_insert, text_delete)

636

if len(hm1[4]) > len(hm2[4]):

946

if commonlength != 0:

637

hm = hm1

947

x = pointer - count_delete - count_insert - 1

638

else:

948

if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:

639

hm = hm2

949

diffs[x] = (diffs[x][0], diffs[x][1] +

640

950

text_insert[:commonlength])

641

# A half-match was found, sort out the return data.

951

else:

642

if len(text1) > len(text2):

952

diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))

643

(text1_a, text1_b, text2_a, text2_b, mid_common) = hm

644

else:

645

(text2_a, text2_b, text1_a, text1_b, mid_common) = hm

646

return (text1_a, text1_b, text2_a, text2_b, mid_common)

647

648

def diff_cleanupSemantic(self, diffs):

649

"""Reduce the number of edits by eliminating semantically trivial

650

equalities.

651

652

Args:

653

diffs: Array of diff tuples.

654

"""

655

changes = False

656

equalities = [] # Stack of indices where equalities are found.

657

lastequality = None # Always equal to diffs[equalities[-1]][1]

658

pointer = 0 # Index of current position.

659

# Number of chars that changed prior to the equality.

660

length_insertions1, length_deletions1 = 0, 0

661

# Number of chars that changed after the equality.

662

length_insertions2, length_deletions2 = 0, 0

663

while pointer < len(diffs):

664

if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.

665

equalities.append(pointer)

666

length_insertions1, length_insertions2 = length_insertions2, 0

667

length_deletions1, length_deletions2 = length_deletions2, 0

668

lastequality = diffs[pointer][1]

669

else: # An insertion or deletion.

670

if diffs[pointer][0] == self.DIFF_INSERT:

671

length_insertions2 += len(diffs[pointer][1])

672

else:

673

length_deletions2 += len(diffs[pointer][1])

674

# Eliminate an equality that is smaller or equal to the edits on both

675

# sides of it.

676

if (

677

lastequality

678

and (

679

len(lastequality) <= max(length_insertions1, length_deletions1)

680

)

681

and (

682

len(lastequality) <= max(length_insertions2, length_deletions2)

683

)

684

):

685

# Duplicate record.

686

diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))

687

# Change second copy to insert.

688

diffs[equalities[-1] + 1] = (

689

self.DIFF_INSERT,

690

diffs[equalities[-1] + 1][1],

691

)

692

# Throw away the equality we just deleted.

693

equalities.pop()

694

# Throw away the previous equality (it needs to be reevaluated).

695

if len(equalities):

696

equalities.pop()

697

if len(equalities):

698

pointer = equalities[-1]

699

else:

700

pointer = -1

701

# Reset the counters.

702

length_insertions1, length_deletions1 = 0, 0

703

length_insertions2, length_deletions2 = 0, 0

704

lastequality = None

705

changes = True

706

pointer += 1

707

708

# Normalize the diff.

709

if changes:

710

self.diff_cleanupMerge(diffs)

711

self.diff_cleanupSemanticLossless(diffs)

712

713

# Find any overlaps between deletions and insertions.

714

# e.g: <del>abcxxx</del><ins>xxxdef</ins>

715

# -> <del>abc</del>xxx<ins>def</ins>

716

# e.g: <del>xxxabc</del><ins>defxxx</ins>

717

# -> <ins>def</ins>xxx<del>abc</del>

718

# Only extract an overlap if it is as big as the edit ahead or behind it.

719

pointer = 1

720

while pointer < len(diffs):

721

if (

722

diffs[pointer - 1][0] == self.DIFF_DELETE

723

and diffs[pointer][0] == self.DIFF_INSERT

724

):

725

deletion = diffs[pointer - 1][1]

726

insertion = diffs[pointer][1]

727

overlap_length1 = self.diff_commonOverlap(deletion, insertion)

728

overlap_length2 = self.diff_commonOverlap(insertion, deletion)

729

if overlap_length1 >= overlap_length2:

730

if (

731

overlap_length1 >= len(deletion) / 2.0

732

or overlap_length1 >= len(insertion) / 2.0

733

):

734

# Overlap found. Insert an equality and trim the surrounding edits.

735

diffs.insert(

736

pointer, (self.DIFF_EQUAL, insertion[:overlap_length1])

737

)

738

diffs[pointer - 1] = (

739

self.DIFF_DELETE,

740

deletion[: len(deletion) - overlap_length1],

741

)

742

diffs[pointer + 1] = (

743

self.DIFF_INSERT,

744

insertion[overlap_length1:],

745

)

746

pointer += 1

747

else:

748

if (

749

overlap_length2 >= len(deletion) / 2.0

750

or overlap_length2 >= len(insertion) / 2.0

751

):

752

# Reverse overlap found.

753

# Insert an equality and swap and trim the surrounding edits.

754

diffs.insert(

755

pointer, (self.DIFF_EQUAL, deletion[:overlap_length2])

756

)

757

diffs[pointer - 1] = (

758

self.DIFF_INSERT,

759

insertion[: len(insertion) - overlap_length2],

760

)

761

diffs[pointer + 1] = (

762

self.DIFF_DELETE,

763

deletion[overlap_length2:],

764

)

765

pointer += 1

953

pointer += 1

766

pointer += 1

954

text_insert = text_insert[commonlength:]

955

text_delete = text_delete[commonlength:]

956

# Factor out any common suffixies.

957

commonlength = self.diff_commonSuffix(text_insert, text_delete)

958

if commonlength != 0:

959

diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +

960

diffs[pointer][1])

961

text_insert = text_insert[:-commonlength]

962

text_delete = text_delete[:-commonlength]

963

# Delete the offending records and add the merged ones.

964

if count_delete == 0:

965

diffs[pointer - count_insert : pointer] = [

966

(self.DIFF_INSERT, text_insert)]

967

elif count_insert == 0:

968

diffs[pointer - count_delete : pointer] = [

969

(self.DIFF_DELETE, text_delete)]

970

else:

971

diffs[pointer - count_delete - count_insert : pointer] = [

972

(self.DIFF_DELETE, text_delete),

973

(self.DIFF_INSERT, text_insert)]

974

pointer = pointer - count_delete - count_insert + 1

975

if count_delete != 0:

976

pointer += 1

767

pointer += 1

977

if count_insert != 0:

768

978

pointer += 1

769

def diff_cleanupSemanticLossless(self, diffs):

979

elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:

770

"""Look for single edits surrounded on both sides by equalities

980

# Merge this equality with the previous one.

771

which can be shifted sideways to align the edit to a word boundary.

981

diffs[pointer - 1] = (diffs[pointer - 1][0],

772

e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.

982

diffs[pointer - 1][1] + diffs[pointer][1])

773

983

del diffs[pointer]

774

Args:

984

else:

775

diffs: Array of diff tuples.

985

pointer += 1

776

"""

777

778

def diff_cleanupSemanticScore(one, two):

779

"""Given two strings, compute a score representing whether the

780

internal boundary falls on logical boundaries.

781

Scores range from 6 (best) to 0 (worst).

782

Closure, but does not reference any external variables.

783

784

Args:

785

one: First string.

786

two: Second string.

787

788

Returns:

789

The score.

790

"""

791

if not one or not two:

792

# Edges are the best.

793

return 6

794

795

# Each port of this function behaves slightly differently due to

796

# subtle differences in each language's definition of things like

797

# 'whitespace'. Since this function's purpose is largely cosmetic,

798

# the choice has been made to use each language's native features

799

# rather than force total conformity.

800

char1 = one[-1]

801

char2 = two[0]

802

nonAlphaNumeric1 = not char1.isalnum()

803

nonAlphaNumeric2 = not char2.isalnum()

804

whitespace1 = nonAlphaNumeric1 and char1.isspace()

805

whitespace2 = nonAlphaNumeric2 and char2.isspace()

806

lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n")

807

lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n")

808

blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one)

809

blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two)

810

811

if blankLine1 or blankLine2:

812

# Five points for blank lines.

813

return 5

814

elif lineBreak1 or lineBreak2:

815

# Four points for line breaks.

816

return 4

817

elif nonAlphaNumeric1 and not whitespace1 and whitespace2:

818

# Three points for end of sentences.

819

return 3

820

elif whitespace1 or whitespace2:

821

# Two points for whitespace.

822

return 2

823

elif nonAlphaNumeric1 or nonAlphaNumeric2:

824

# One point for non-alphanumeric.

825

return 1

826

return 0

986

827

987

~~count_insert~~ = 0

828

pointer = 1

988

count_delete = 0

829

# Intentionally ignore the first and last element (don't need checking).

989

text_delete = ''

830

while pointer < len(diffs) - 1:

990

text_insert = ''

831

if (

832

diffs[pointer - 1][0] == self.DIFF_EQUAL

833

and diffs[pointer + 1][0] == self.DIFF_EQUAL

834

):

835

# This is a single edit surrounded by equalities.

836

equality1 = diffs[pointer - 1][1]

837

edit = diffs[pointer][1]

838

equality2 = diffs[pointer + 1][1]

991

839

992

if diffs[-1][1] == '':

840

# First, shift the edit as far left as possible.

993

diffs.pop() # Remove the dummy entry at the end.

841

commonOffset = self.diff_commonSuffix(equality1, edit)

842

if commonOffset:

843

commonString = edit[-commonOffset:]

844

equality1 = equality1[:-commonOffset]

845

edit = commonString + edit[:-commonOffset]

846

equality2 = commonString + equality2

994

847

995

# Second pass: look for single edits surrounded on both sides by equalities

848

# Second, step character by character right, looking for the best fit.

996

# which can be shifted sideways to eliminate an equality.

849

bestEquality1 = equality1

997

# e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC

850

bestEdit = edit

998

changes = False

851

bestEquality2 = equality2

999

pointer = 1

852

bestScore = diff_cleanupSemanticScore(

1000

# Intentionally ignore the first and last element (don't need checking).

853

equality1, edit

1001

while pointer < len(diffs) - 1:

854

) + diff_cleanupSemanticScore(edit, equality2)

1002

if (diffs[pointer - 1][0] == self.DIFF_EQUAL and

855

while edit and equality2 and edit[0] == equality2[0]:

1003

diffs[pointer + 1][0] == self.DIFF_EQUAL):

856

equality1 += edit[0]

1004

# This is a single edit surrounded by equalities.

857

edit = edit[1:] + equality2[0]

1005

if diffs[pointer][1].endswith(diffs[pointer - 1][1]):

858

equality2 = equality2[1:]

1006

# Shift the edit over the previous equality.

859

score = diff_cleanupSemanticScore(

1007

diffs[pointer] = (diffs[pointer][0],

860

equality1, edit

1008

diffs[pointer - 1][1] +

861

) + diff_cleanupSemanticScore(edit, equality2)

1009

diffs[pointer][1][:-len(diffs[pointer - 1][1])])

862

# The >= encourages trailing rather than leading whitespace on edits.

1010

diffs[pointer + 1] = (diffs[pointer + 1][0],

863

if score >= bestScore:

1011

diffs[pointer - 1][1] + diffs[pointer + 1][1])

864

bestScore = score

1012

del diffs[pointer - 1]

865

bestEquality1 = equality1

1013

changes = True

866

bestEdit = edit

1014

elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):

867

bestEquality2 = equality2

1015

# Shift the edit over the next equality.

868

1016

~~diffs~~[~~pointer~~ - 1] = (diffs[pointer - 1][0],

869

if diffs[pointer - 1][1] != bestEquality1:

1017

diffs[pointer - 1][1] + diffs[pointer + 1][1])

870

# We have an improvement, save it back to the diff.

1018

diffs[pointer] = (diffs[pointer][0],

871

if bestEquality1:

1019

diffs[pointer][1][~~len~~(diffs[pointer + 1][1]):] +

872

diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)

1020

diffs[pointer + 1][1])

873

else:

1021

del diffs[pointer + 1]

874

del diffs[pointer - 1]

1022

changes = True

875

pointer -= 1

1023

pointer += 1

876

diffs[pointer] = (diffs[pointer][0], bestEdit)

877

if bestEquality2:

878

diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)

879

else:

880

del diffs[pointer + 1]

881

pointer -= 1

882

pointer += 1

883

884

# Define some regex patterns for matching boundaries.

885

BLANKLINEEND = re.compile(r"\n\r?\n$")

886

BLANKLINESTART = re.compile(r"^\r?\n\r?\n")

887

888

def diff_cleanupEfficiency(self, diffs):

889

"""Reduce the number of edits by eliminating operationally trivial

890

equalities.

1024

891

1025

# If shifts were made, the diff needs reordering and another shift sweep.

892

Args:

1026

if changes:

893

diffs: Array of diff tuples.

1027

self.diff_cleanupMerge(diffs)

894

"""

895

changes = False

896

equalities = [] # Stack of indices where equalities are found.

897

lastequality = None # Always equal to diffs[equalities[-1]][1]

898

pointer = 0 # Index of current position.

899

pre_ins = False # Is there an insertion operation before the last equality.

900

pre_del = False # Is there a deletion operation before the last equality.

901

post_ins = False # Is there an insertion operation after the last equality.

902

post_del = False # Is there a deletion operation after the last equality.

903

while pointer < len(diffs):

904

if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.

905

if len(diffs[pointer][1]) < self.Diff_EditCost and (

906

post_ins or post_del

907

):

908

# Candidate found.

909

equalities.append(pointer)

910

pre_ins = post_ins

911

pre_del = post_del

912

lastequality = diffs[pointer][1]

913

else:

914

# Not a candidate, and can never become one.

915

equalities = []

916

lastequality = None

1028

917

1029

def diff_xIndex(self, diffs, loc):

918

post_ins = post_del = False

1030

"""loc is a location in text1, compute and return the equivalent location

919

else: # An insertion or deletion.

1031

in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8

920

if diffs[pointer][0] == self.DIFF_DELETE:

1032

921

post_del = True

1033

Args:

922

else:

1034

diffs: Array of diff tuples.

923

post_ins = True

1035

loc: Location within text1.

1036

924

1037

Returns:

925

# Five types to be split:

1038

Location within text2.

926

# <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>

1039

"""

927

# <ins>A</ins>X<ins>C</ins><del>D</del>

1040

chars1 = 0

928

# <ins>A</ins><del>B</del>X<ins>C</ins>

1041

chars2 = 0

929

# <ins>A</del>X<ins>C</ins><del>D</del>

1042

last_chars1 = 0

930

# <ins>A</ins><del>B</del>X<del>C</del>

1043

last_chars2 = 0

1044

for x in range(len(diffs)):

1045

(op, text) = diffs[x]

1046

if op != self.DIFF_INSERT: # Equality or deletion.

1047

chars1 += len(text)

1048

if op != self.DIFF_DELETE: # Equality or insertion.

1049

chars2 += len(text)

1050

if chars1 > loc: # Overshot the location.

1051

break

1052

last_chars1 = chars1

1053

last_chars2 = chars2

1054

931

1055

if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:

932

if lastequality and (

1056

# The location was deleted.

933

(pre_ins and pre_del and post_ins and post_del)

1057

return last_chars2

934

or (

1058

# Add the remaining len(character).

935

(len(lastequality) < self.Diff_EditCost / 2)

1059

return last_chars2 + (loc - last_chars1)

936

and (pre_ins + pre_del + post_ins + post_del) == 3

937

)

938

):

939

# Duplicate record.

940

diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))

941

# Change second copy to insert.

942

diffs[equalities[-1] + 1] = (

943

self.DIFF_INSERT,

944

diffs[equalities[-1] + 1][1],

945

)

946

equalities.pop() # Throw away the equality we just deleted.

947

lastequality = None

948

if pre_ins and pre_del:

949

# No changes made which could affect previous entry, keep going.

950

post_ins = post_del = True

951

equalities = []

952

else:

953

if len(equalities):

954

equalities.pop() # Throw away the previous equality.

955

if len(equalities):

956

pointer = equalities[-1]

957

else:

958

pointer = -1

959

post_ins = post_del = False

960

changes = True

961

pointer += 1

1060

962

1061

def diff_prettyHtml(self, diffs):

963

if changes:

1062

"""Convert a diff array into a pretty HTML report.

964

self.diff_cleanupMerge(diffs)

1063

965

1064

Args:

966

def diff_cleanupMerge(self, diffs):

1065

diffs: Array of diff tuples.

967

"""Reorder and merge like edit sections. Merge equalities.

968

Any edit section can move as long as it doesn't cross an equality.

1066

969

1067

Returns:

970

Args:

1068

HTML representation.

971

diffs: Array of diff tuples.

1069

"""

972

"""

1070

html = []

973

diffs.append((self.DIFF_EQUAL, "")) # Add a dummy entry at the end.

1071

for (op, data) in diffs:

974

pointer = 0

1072

text = (data.replace("&", "&").replace("<", "<")

975

count_delete = 0

1073

.replace(">", ">").replace("\n", "¶ "))

976

count_insert = 0

1074

if op == self.DIFF_INSERT:

977

text_delete = ""

1075

html.append("<ins style=\"background:#e6ffe6;\">%s</ins>" % text)

978

text_insert = ""

1076

elif op == self.DIFF_DELETE:

979

while pointer < len(diffs):

1077

html.append("<del style=\"background:#ffe6e6;\">%s</del>" % text)

980

if diffs[pointer][0] == self.DIFF_INSERT:

1078

elif op == self.DIFF_EQUAL:

981

count_insert += 1

1079

html.append("%s" % text)

982

text_insert += diffs[pointer][1]

1080

return "".join(html)

983

pointer += 1

984

elif diffs[pointer][0] == self.DIFF_DELETE:

985

count_delete += 1

986

text_delete += diffs[pointer][1]

987

pointer += 1

988

elif diffs[pointer][0] == self.DIFF_EQUAL:

989

# Upon reaching an equality, check for prior redundancies.

990

if count_delete + count_insert > 1:

991

if count_delete != 0 and count_insert != 0:

992

# Factor out any common prefixies.

993

commonlength = self.diff_commonPrefix(text_insert, text_delete)

994

if commonlength != 0:

995

x = pointer - count_delete - count_insert - 1

996

if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:

997

diffs[x] = (

998

diffs[x][0],

999

diffs[x][1] + text_insert[:commonlength],

1000

)

1001

else:

1002

diffs.insert(

1003

0, (self.DIFF_EQUAL, text_insert[:commonlength])

1004

)

1005

pointer += 1

1006

text_insert = text_insert[commonlength:]

1007

text_delete = text_delete[commonlength:]

1008

# Factor out any common suffixies.

1009

commonlength = self.diff_commonSuffix(text_insert, text_delete)

1010

if commonlength != 0:

1011

diffs[pointer] = (

1012

diffs[pointer][0],

1013

text_insert[-commonlength:] + diffs[pointer][1],

1014

)

1015

text_insert = text_insert[:-commonlength]

1016

text_delete = text_delete[:-commonlength]

1017

# Delete the offending records and add the merged ones.

1018

if count_delete == 0:

1019

diffs[pointer - count_insert : pointer] = [

1020

(self.DIFF_INSERT, text_insert)

1021

]

1022

elif count_insert == 0:

1023

diffs[pointer - count_delete : pointer] = [

1024

(self.DIFF_DELETE, text_delete)

1025

]

1026

else:

1027

diffs[pointer - count_delete - count_insert : pointer] = [

1028

(self.DIFF_DELETE, text_delete),

1029

(self.DIFF_INSERT, text_insert),

1030

]

1031

pointer = pointer - count_delete - count_insert + 1

1032

if count_delete != 0:

1033

pointer += 1

1034

if count_insert != 0:

1035

pointer += 1

1036

elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:

1037

# Merge this equality with the previous one.

1038

diffs[pointer - 1] = (

1039

diffs[pointer - 1][0],

1040

diffs[pointer - 1][1] + diffs[pointer][1],

1041

)

1042

del diffs[pointer]

1043

else:

1044

pointer += 1

1081

1045

1082

def diff_text1(self, diffs):

1046

count_insert = 0

1083

"""Compute and return the source text (all equalities and deletions).

1047

count_delete = 0

1084

1048

text_delete = ""

1085

Args:

1049

text_insert = ""

1086

diffs: Array of diff tuples.

1087

1050

1088

Returns:

1051

if diffs[-1][1] == "":

1089

Source text.

1052

diffs.pop() # Remove the dummy entry at the end.

1090

"""

1091

text = []

1092

for (op, data) in diffs:

1093

if op != self.DIFF_INSERT:

1094

text.append(data)

1095

return "".join(text)

1096

1097

def diff_text2(self, diffs):

1098

"""Compute and return the destination text (all equalities and insertions).

1099

1053

1100

Args:

1054

# Second pass: look for single edits surrounded on both sides by equalities

1101

diffs: Array of diff tuples.

1055

# which can be shifted sideways to eliminate an equality.

1056

# e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC

1057

changes = False

1058

pointer = 1

1059

# Intentionally ignore the first and last element (don't need checking).

1060

while pointer < len(diffs) - 1:

1061

if (

1062

diffs[pointer - 1][0] == self.DIFF_EQUAL

1063

and diffs[pointer + 1][0] == self.DIFF_EQUAL

1064

):

1065

# This is a single edit surrounded by equalities.

1066

if diffs[pointer][1].endswith(diffs[pointer - 1][1]):

1067

# Shift the edit over the previous equality.

1068

diffs[pointer] = (

1069

diffs[pointer][0],

1070

diffs[pointer - 1][1]

1071

+ diffs[pointer][1][: -len(diffs[pointer - 1][1])],

1072

)

1073

diffs[pointer + 1] = (

1074

diffs[pointer + 1][0],

1075

diffs[pointer - 1][1] + diffs[pointer + 1][1],

1076

)

1077

del diffs[pointer - 1]

1078

changes = True

1079

elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):

1080

# Shift the edit over the next equality.

1081

diffs[pointer - 1] = (

1082

diffs[pointer - 1][0],

1083

diffs[pointer - 1][1] + diffs[pointer + 1][1],

1084

)

1085

diffs[pointer] = (

1086

diffs[pointer][0],

1087

diffs[pointer][1][len(diffs[pointer + 1][1]) :]

1088

+ diffs[pointer + 1][1],

1089

)

1090

del diffs[pointer + 1]

1091

changes = True

1092

pointer += 1

1102

1093

1103

Returns:

1094

# If shifts were made, the diff needs reordering and another shift sweep.

1104

Destination text.

1095

if changes:

1105

"""

1096

self.diff_cleanupMerge(diffs)

1106

text = []

1097

1107

for (op, data) in diffs:

1098

def diff_xIndex(self, diffs, loc):

1108

if op != self.DIFF_DELETE:

1099

"""loc is a location in text1, compute and return the equivalent location

1109

text.append(data)

1100

in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8

1110

return "".join(text)

1101

1102

Args:

1103

diffs: Array of diff tuples.

1104

loc: Location within text1.

1111

1105

1112

def diff_levenshtein(self, diffs):

1106

Returns:

1113

"""Compute the Levenshtein distance; the number of inserted, deleted or

1107

Location within text2.

1114

substituted characters.

1108

"""

1109

chars1 = 0

1110

chars2 = 0

1111

last_chars1 = 0

1112

last_chars2 = 0

1113

for x in range(len(diffs)):

1114

(op, text) = diffs[x]

1115

if op != self.DIFF_INSERT: # Equality or deletion.

1116

chars1 += len(text)

1117

if op != self.DIFF_DELETE: # Equality or insertion.

1118

chars2 += len(text)

1119

if chars1 > loc: # Overshot the location.

1120

break

1121

last_chars1 = chars1

1122

last_chars2 = chars2

1115

1123

1116

Args:

1124

if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:

1117

diffs: Array of diff tuples.

1125

# The location was deleted.

1126

return last_chars2

1127

# Add the remaining len(character).

1128

return last_chars2 + (loc - last_chars1)

1129

1130

def diff_prettyHtml(self, diffs):

1131

"""Convert a diff array into a pretty HTML report.

1132

1133

Args:

1134

diffs: Array of diff tuples.

1135

1136

Returns:

1137

HTML representation.

1138

"""

1139

html = []

1140

for op, data in diffs:

1141

text = (

1142

data.replace("&", "&")

1143

.replace("<", "<")

1144

.replace(">", ">")

1145

.replace("\n", "¶ ")

1146

)

1147

if op == self.DIFF_INSERT:

1148

html.append('<ins style="background:#e6ffe6;">%s</ins>' % text)

1149

elif op == self.DIFF_DELETE:

1150

html.append('<del style="background:#ffe6e6;">%s</del>' % text)

1151

elif op == self.DIFF_EQUAL:

1152

html.append("%s" % text)

1153

return "".join(html)

1154

1155

def diff_text1(self, diffs):

1156

"""Compute and return the source text (all equalities and deletions).

1118

1157

1119

Returns:

1158

Args:

1120

Number of changes.

1159

diffs: Array of diff tuples.

1121

"""

1160

1122

levenshtein = 0

1161

Returns:

1123

insertions = 0

1162

Source text.

1124

deletions = 0

1163

"""

1125

for (op, data) in diffs:

1164

text = []

1126

if op == self.DIFF_INSERT:

1165

for op, data in diffs:

1127

insertions += len(data)

1166

if op != self.DIFF_INSERT:

1128

elif op == self.DIFF_DELETE:

1167

text.append(data)

1129

deletions += len(data)

1168

return "".join(text)

1130

elif op == self.DIFF_EQUAL:

1169

1131

# A deletion and an insertion is one substitution.

1170

def diff_text2(self, diffs):

1132

levenshtein += max(insertions, deletions)

1171

"""Compute and return the destination text (all equalities and insertions).

1172

1173

Args:

1174

diffs: Array of diff tuples.

1175

1176

Returns:

1177

Destination text.

1178

"""

1179

text = []

1180

for op, data in diffs:

1181

if op != self.DIFF_DELETE:

1182

text.append(data)

1183

return "".join(text)

1184

1185

def diff_levenshtein(self, diffs):

1186

"""Compute the Levenshtein distance; the number of inserted, deleted or

1187

substituted characters.

1188

1189

Args:

1190

diffs: Array of diff tuples.

1191

1192

Returns:

1193

Number of changes.

1194

"""

1195

levenshtein = 0

1133

insertions = 0

1196

insertions = 0

1134

deletions = 0

1197

deletions = 0

1135

levenshtein += max(insertions, deletions)

1198

for op, data in diffs:

1136

return levenshtein

1199

if op == self.DIFF_INSERT:

1200

insertions += len(data)

1201

elif op == self.DIFF_DELETE:

1202

deletions += len(data)

1203

elif op == self.DIFF_EQUAL:

1204

# A deletion and an insertion is one substitution.

1205

levenshtein += max(insertions, deletions)

1206

insertions = 0

1207

deletions = 0

1208

levenshtein += max(insertions, deletions)

1209

return levenshtein

1137

1210

1138

def diff_toDelta(self, diffs):

1211

def diff_toDelta(self, diffs):

1139

"""Crush the diff into an encoded string which describes the operations

1212

"""Crush the diff into an encoded string which describes the operations

1140

required to transform text1 into text2.

1213

required to transform text1 into text2.

1141

E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.

1214

E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.

1142

Operations are tab-separated. Inserted text is escaped using %xx notation.

1215

Operations are tab-separated. Inserted text is escaped using %xx notation.

1143

1216

1144

Args:

1217

Args:

1145

diffs: Array of diff tuples.

1218

diffs: Array of diff tuples.

1146

1219

1147

Returns:

1220

Returns:

1148

Delta text.

1221

Delta text.

1149

"""

1222

"""

1150

text = []

1223

text = []

1151

for (op, data) in diffs:

1224

for op, data in diffs:

1152

if op == self.DIFF_INSERT:

1225

if op == self.DIFF_INSERT:

1153

# High ascii will raise UnicodeDecodeError. Use Unicode instead.

1226

# High ascii will raise UnicodeDecodeError. Use Unicode instead.

1154

data = data.encode("utf-8")

1227

data = data.encode("utf-8")

1155

text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))

1228

text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))

1156

elif op == self.DIFF_DELETE:

1229

elif op == self.DIFF_DELETE:

1157

text.append("-%d" % len(data))

1230

text.append("-%d" % len(data))

1158

elif op == self.DIFF_EQUAL:

1231

elif op == self.DIFF_EQUAL:

1159

text.append("=%d" % len(data))

1232

text.append("=%d" % len(data))

1160

return "\t".join(text)

1233

return "\t".join(text)

1161

1234

1162

def diff_fromDelta(self, text1, delta):

1235

def diff_fromDelta(self, text1, delta):

1163

"""Given the original text1, and an encoded string which describes the

1236

"""Given the original text1, and an encoded string which describes the

1164

operations required to transform text1 into text2, compute the full diff.

1237

operations required to transform text1 into text2, compute the full diff.

1165

1238

1166

Args:

1239

Args:

1167

text1: Source string for the diff.

1240

text1: Source string for the diff.

1168

delta: Delta text.

1241

delta: Delta text.

1169

1170

Returns:

1171

Array of diff tuples.

1172

1242

1173

Raises:

1243

Returns:

1174

ValueError: If invalid input.

1244

Array of diff tuples.

1175

"""

1176

if type(delta) == unicode:

1177

# Deltas should be composed of a subset of ascii chars, Unicode not

1178

# required. If this encode raises UnicodeEncodeError, delta is invalid.

1179

delta = delta.encode("ascii")

1180

diffs = []

1181

pointer = 0 # Cursor in text1

1182

tokens = delta.split("\t")

1183

for token in tokens:

1184

if token == "":

1185

# Blank tokens are ok (from a trailing \t).

1186

continue

1187

# Each token begins with a one character parameter which specifies the

1188

# operation of this token (delete, insert, equality).

1189

param = token[1:]

1190

if token[0] == "+":

1191

param = urllib.parse.unquote(param).decode("utf-8")

1192

diffs.append((self.DIFF_INSERT, param))

1193

elif token[0] == "-" or token[0] == "=":

1194

try:

1195

n = int(param)

1196

except ValueError:

1197

raise ValueError("Invalid number in diff_fromDelta: " + param)

1198

if n < 0:

1199

raise ValueError("Negative number in diff_fromDelta: " + param)

1200

text = text1[pointer : pointer + n]

1201

pointer += n

1202

if token[0] == "=":

1203

diffs.append((self.DIFF_EQUAL, text))

1204

else:

1205

diffs.append((self.DIFF_DELETE, text))

1206

else:

1207

# Anything else is an error.

1208

raise ValueError("Invalid diff operation in diff_fromDelta: " +

1209

token[0])

1210

if pointer != len(text1):

1211

raise ValueError(

1212

"Delta length (%d) does not equal source text length (%d)." %

1213

(pointer, len(text1)))

1214

return diffs

1215

1245

1216

# MATCH FUNCTIONS

1246

Raises:

1247

ValueError: If invalid input.

1248

"""

1249

if type(delta) == str:

1250

# Deltas should be composed of a subset of ascii chars, Unicode not

1251

# required. If this encode raises UnicodeEncodeError, delta is invalid.

1252

delta = delta.encode("ascii")

1253

diffs = []

1254

pointer = 0 # Cursor in text1

1255

tokens = delta.split("\t")

1256

for token in tokens:

1257

if token == "":

1258

# Blank tokens are ok (from a trailing \t).

1259

continue

1260

# Each token begins with a one character parameter which specifies the

1261

# operation of this token (delete, insert, equality).

1262

param = token[1:]

1263

if token[0] == "+":

1264

param = urllib.parse.unquote(param)

1265

diffs.append((self.DIFF_INSERT, param))

1266

elif token[0] == "-" or token[0] == "=":

1267

try:

1268

n = int(param)

1269

except ValueError:

1270

raise ValueError("Invalid number in diff_fromDelta: " + param)

1271

if n < 0:

1272

raise ValueError("Negative number in diff_fromDelta: " + param)

1273

text = text1[pointer : pointer + n]

1274

pointer += n

1275

if token[0] == "=":

1276

diffs.append((self.DIFF_EQUAL, text))

1277

else:

1278

diffs.append((self.DIFF_DELETE, text))

1279

else:

1280

# Anything else is an error.

1281

raise ValueError(

1282

"Invalid diff operation in diff_fromDelta: " + token[0]

1283

)

1284

if pointer != len(text1):

1285

raise ValueError(

1286

"Delta length (%d) does not equal source text length (%d)."

1287

% (pointer, len(text1))

1288

)

1289

return diffs

1217

1290

1218

def match_main(self, text, pattern, loc):

1291

# MATCH FUNCTIONS

1219

"""Locate the best instance of 'pattern' in 'text' near 'loc'.

1220

1292

1221

Args:

1293

def match_main(self, text, pattern, loc):

1222

text: The text to search.

1294

"""Locate the best instance of 'pattern' in 'text' near 'loc'.

1223

pattern: The pattern to search for.

1224

loc: The location to search around.

1225

1295

1226

Returns:

1296

Args:

1227

Best match index or -1.

1297

text: The text to search.

1228

"""

1298

pattern: The pattern to search for.

1229

# Check for null inputs.

1299

loc: The location to search around.

1230

if text is None or pattern is None:

1300

1231

raise ValueError("Null inputs. (match_main)")

1301

Returns:

1302

Best match index or -1.

1303

"""

1304

# Check for null inputs.

1305

if text is None or pattern is None:

1306

raise ValueError("Null inputs. (match_main)")

1232

1307

1233

loc = max(0, min(loc, len(text)))

1308

loc = max(0, min(loc, len(text)))

1234

if text == pattern:

1309

if text == pattern:

1235

# Shortcut (potentially not guaranteed by the algorithm)

1310

# Shortcut (potentially not guaranteed by the algorithm)

1236

return 0

1311

return 0

1237

elif not text:

1312

elif not text:

1238

# Nothing to match.

1313

# Nothing to match.

1239

return -1

1314

return -1

1240

elif text[loc:loc + len(pattern)] == pattern:

1315

elif text[loc : loc + len(pattern)] == pattern:

1241

# Perfect match at the perfect spot! (Includes case of null pattern)

1316

# Perfect match at the perfect spot! (Includes case of null pattern)

1242

return loc

1317

return loc

1243

else:

1318

else:

1244

# Do a fuzzy compare.

1319

# Do a fuzzy compare.

1245

match = self.match_bitap(text, pattern, loc)

1320

match = self.match_bitap(text, pattern, loc)

1246

return match

1321

return match

1247

1322

1248

def match_bitap(self, text, pattern, loc):

1323

def match_bitap(self, text, pattern, loc):

1249

"""Locate the best instance of 'pattern' in 'text' near 'loc' using the

1324

"""Locate the best instance of 'pattern' in 'text' near 'loc' using the

1250

Bitap algorithm.

1325

Bitap algorithm.

1251

1326

1252

Args:

1327

Args:

1253

text: The text to search.

1328

text: The text to search.

1254

pattern: The pattern to search for.

1329

pattern: The pattern to search for.

1255

loc: The location to search around.

1330

loc: The location to search around.

1256

1331

1257

Returns:

1332

Returns:

1258

Best match index or -1.

1333

Best match index or -1.

1259

"""

1334

"""

1260

# Python doesn't have a maxint limit, so ignore this check.

1335

# Python doesn't have a maxint limit, so ignore this check.

1261

#if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:

1336

# if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:

1262

# raise ValueError("Pattern too long for this application.")

1337

# raise ValueError("Pattern too long for this application.")

1263

1338

1264

# Initialise the alphabet.

1339

# Initialise the alphabet.

1265

s = self.match_alphabet(pattern)

1340

s = self.match_alphabet(pattern)

1266

1341

1267

def match_bitapScore(e, x):

1342

def match_bitapScore(e, x):

1268

"""Compute and return the score for a match with e errors and x location.

1343

"""Compute and return the score for a match with e errors and x location.

1269

Accesses loc and pattern through being a closure.

1344

Accesses loc and pattern through being a closure.

1270

1345

1271

Args:

1346

Args:

1272

e: Number of errors in match.

1347

e: Number of errors in match.

1273

x: Location of match.

1348

x: Location of match.

1274

1349

1275

Returns:

1350

Returns:

1276

Overall score for match (0.0 = good, 1.0 = bad).

1351

Overall score for match (0.0 = good, 1.0 = bad).

1277

"""

1352

"""

1278

accuracy = float(e) / len(pattern)

1353

accuracy = float(e) / len(pattern)

1279

proximity = abs(loc - x)

1354

proximity = abs(loc - x)

1280

if not self.Match_Distance:

1355

if not self.Match_Distance:

1281

# Dodge divide by zero error.

1356

# Dodge divide by zero error.

1282

return proximity and 1.0 or accuracy

1357

return proximity and 1.0 or accuracy

1283

return accuracy + (proximity / float(self.Match_Distance))

1358

return accuracy + (proximity / float(self.Match_Distance))

1284

1359

1285

# Highest score beyond which we give up.

1360

# Highest score beyond which we give up.

1286

score_threshold = self.Match_Threshold

1361

score_threshold = self.Match_Threshold

1287

# Is there a nearby exact match? (speedup)

1362

# Is there a nearby exact match? (speedup)

1288

best_loc = text.find(pattern, loc)

1363

best_loc = text.find(pattern, loc)

1289

if best_loc != -1:

1364

if best_loc != -1:

1290

score_threshold = min(match_bitapScore(0, best_loc), score_threshold)

1365

score_threshold = min(match_bitapScore(0, best_loc), score_threshold)

1291

# What about in the other direction? (speedup)

1366

# What about in the other direction? (speedup)

1292

best_loc = text.rfind(pattern, loc + len(pattern))

1367

best_loc = text.rfind(pattern, loc + len(pattern))

1293

if best_loc != -1:

1368

if best_loc != -1:

1294

score_threshold = min(match_bitapScore(0, best_loc), score_threshold)

1369

score_threshold = min(match_bitapScore(0, best_loc), score_threshold)

1295

1370

1296

# Initialise the bit arrays.

1371

# Initialise the bit arrays.

1297

matchmask = 1 << (len(pattern) - 1)

1372

matchmask = 1 << (len(pattern) - 1)

1298

best_loc = -1

1373

best_loc = -1

1299

1374

1300

bin_max = len(pattern) + len(text)

1375

bin_max = len(pattern) + len(text)

1301

# Empty initialization added to appease pychecker.

1376

# Empty initialization added to appease pychecker.

1302

last_rd = None

1377

last_rd = None

1303

for d in range(len(pattern)):

1378

for d in range(len(pattern)):

1304

# Scan for the best match each iteration allows for one more error.

1379

# Scan for the best match each iteration allows for one more error.

1305

# Run a binary search to determine how far from 'loc' we can stray at

1380

# Run a binary search to determine how far from 'loc' we can stray at

1306

# this error level.

1381

# this error level.

1307

bin_min = 0

1382

bin_min = 0

1308

bin_mid = bin_max

1383

bin_mid = bin_max

1309

while bin_min < bin_mid:

1384

while bin_min < bin_mid:

1310

if match_bitapScore(d, loc + bin_mid) <= score_threshold:

1385

if match_bitapScore(d, loc + bin_mid) <= score_threshold:

1311

bin_min = bin_mid

1386

bin_min = bin_mid

1312

else:

1387

else:

1313

bin_max = bin_mid

1388

bin_max = bin_mid

1314

bin_mid = (bin_max - bin_min) // 2 + bin_min

1389

bin_mid = (bin_max - bin_min) // 2 + bin_min

1315

1390

1316

# Use the result from this iteration as the maximum for the next.

1391

# Use the result from this iteration as the maximum for the next.

1317

bin_max = bin_mid

1392

bin_max = bin_mid

1318

start = max(1, loc - bin_mid + 1)

1393

start = max(1, loc - bin_mid + 1)

1319

finish = min(loc + bin_mid, len(text)) + len(pattern)

1394

finish = min(loc + bin_mid, len(text)) + len(pattern)

1320

1395

1321

rd = [0] * (finish + 2)

1396

rd = [0] * (finish + 2)

1322

rd[finish + 1] = (1 << d) - 1

1397

rd[finish + 1] = (1 << d) - 1

1323

for j in range(finish, start - 1, -1):

1398

for j in range(finish, start - 1, -1):

1324

if len(text) <= j - 1:

1399

if len(text) <= j - 1:

1325

# Out of range.

1400

# Out of range.

1326

charMatch = 0

1401

charMatch = 0

1327

else:

1402

else:

1328

charMatch = s.get(text[j - 1], 0)

1403

charMatch = s.get(text[j - 1], 0)

1329

if d == 0: # First pass: exact match.

1404

if d == 0: # First pass: exact match.

1330

rd[j] = ((rd[j + 1] << 1) | 1) & charMatch

1405

rd[j] = ((rd[j + 1] << 1) | 1) & charMatch

1331

else: # Subsequent passes: fuzzy match.

1406

else: # Subsequent passes: fuzzy match.

1332

rd[j] = (((rd[j + 1] << 1) | 1) & charMatch) | (

1407

rd[j] = (

1333

((~~last_~~rd[j + 1] | ~~last_rd~~[j]) << 1) | 1) | ~~last_rd~~[j + 1]

1408

(((rd[j + 1] << 1) | 1) & charMatch)

1334

if rd[j] & matchmask:

1409

| (((last_rd[j + 1] | last_rd[j]) << 1) | 1)

1335

score = match_bitapScore(d, j - 1)

1410

| last_rd[j + 1]

1336

# This match will almost certainly be better than any existing match.

1411

)

1337

# But check anyway.

1412

if rd[j] & matchmask:

1338

if score <= score_threshold:

1413

score = match_bitapScore(d, j - 1)

1339

# Told you so.

1414

# This match will almost certainly be better than any existing match.

1340

score_threshold = score

1415

# But check anyway.

1341

best_loc = j - 1

1416

if score <= score_threshold:

1342

if best_loc > loc:

1417

# Told you so.

1343

# When passing loc, don't exceed our current distance from loc.

1418

score_threshold = score

1344

~~start~~ = ~~max~~(1, 2 * ~~loc~~ - ~~best_loc~~)

1419

best_loc = j - 1

1345

else:

1420

if best_loc > loc:

1346

# Already passed loc, downhill from here on in.

1421

# When passing loc, don't exceed our current distance from loc.

1347

break

1422

start = max(1, 2 * loc - best_loc)

1348

# No hope for a (better) match at greater error levels.

1423

else:

1349

if match_bitapScore(d + 1, loc) > score_threshold:

1424

# Already passed loc, downhill from here on in.

1350

break

1425

break

1351

last_rd = rd

1426

# No hope for a (better) match at greater error levels.

1352

return best_loc

1427

if match_bitapScore(d + 1, loc) > score_threshold:

1428

break

1429

last_rd = rd

1430

return best_loc

1353

1431

1354

def match_alphabet(self, pattern):

1432

def match_alphabet(self, pattern):

1355

"""Initialise the alphabet for the Bitap algorithm.

1433

"""Initialise the alphabet for the Bitap algorithm.

1356

1434

1357

Args:

1435

Args:

1358

pattern: The text to encode.

1436

pattern: The text to encode.

1359

1437

1360

Returns:

1438

Returns:

1361

Hash of character locations.

1439

Hash of character locations.

1362

"""

1440

"""

1363

s = {}

1441

s = {}

1364

for char in pattern:

1442

for char in pattern:

1365

s[char] = 0

1443

s[char] = 0

1366

for i in range(len(pattern)):

1444

for i in range(len(pattern)):

1367

s[pattern[i]] |= 1 << (len(pattern) - i - 1)

1445

s[pattern[i]] |= 1 << (len(pattern) - i - 1)

1368

return s

1446

return s

1369

1447

1370

# PATCH FUNCTIONS

1448

# PATCH FUNCTIONS

1371

1449

1372

def patch_addContext(self, patch, text):

1450

def patch_addContext(self, patch, text):

1373

"""Increase the context until it is unique,

1451

"""Increase the context until it is unique,

1374

but don't let the pattern expand beyond Match_MaxBits.

1452

but don't let the pattern expand beyond Match_MaxBits.

1375

1453

1376

Args:

1454

Args:

1377

patch: The patch to grow.

1455

patch: The patch to grow.

1378

text: Source text.

1456

text: Source text.

1379

"""

1457

"""

1380

if len(text) == 0:

1458

if len(text) == 0:

1381

return

1459

return

1382

pattern = text[patch.start2 : patch.start2 + patch.length1]

1460

pattern = text[patch.start2 : patch.start2 + patch.length1]

1383

padding = 0

1461

padding = 0

1384

1462

1385

# Look for the first and last matches of pattern in text. If two different

1463

# Look for the first and last matches of pattern in text. If two different

1386

# matches are found, increase the pattern length.

1464

# matches are found, increase the pattern length.

1387

while (text.find(pattern) != text.rfind(pattern) and (~~self~~.~~Match_MaxBits~~ ==

1465

while text.find(pattern) != text.rfind(pattern) and (

1388

0 or ~~len~~(~~pattern~~) < self.Match_MaxBits - ~~self~~.~~Patch_Margin~~ -

1466

self.Match_MaxBits == 0

1389

self.Patch_Margin)):

1467

or len(pattern) < self.Match_MaxBits - self.Patch_Margin - self.Patch_Margin

1390

padding += self.Patch_Margin

1468

):

1391

pattern = text[max(0, patch.start2 - padding) :

1469

padding += self.Patch_Margin

1392

patch.start2 + patch.length1 + padding]

1470

pattern = text[

1393

# Add one chunk for good luck.

1471

max(0, patch.start2 - padding) : patch.start2 + patch.length1 + padding

1394

padding += self.Patch_Margin

1472

]

1473

# Add one chunk for good luck.

1474

padding += self.Patch_Margin

1395

1475

1396

# Add the prefix.

1476

# Add the prefix.

1397

prefix = text[max(0, patch.start2 - padding) : patch.start2]

1477

prefix = text[max(0, patch.start2 - padding) : patch.start2]

1398

if prefix:

1478

if prefix:

1399

patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]

1479

patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]

1400

# Add the suffix.

1480

# Add the suffix.

1401

suffix = text[patch.start2 + patch.length1 :

1481

suffix = text[

1402

patch.start2 + patch.length1 + padding]

1482

patch.start2 + patch.length1 : patch.start2 + patch.length1 + padding

1403

if suffix:

1483

]

1404

patch.diffs.append((self.DIFF_EQUAL, suffix))

1484

if suffix:

1485

patch.diffs.append((self.DIFF_EQUAL, suffix))

1405

1486

1406

# Roll back the start points.

1487

# Roll back the start points.

1407

patch.start1 -= len(prefix)

1488

patch.start1 -= len(prefix)

1408

patch.start2 -= len(prefix)

1489

patch.start2 -= len(prefix)

1409

# Extend lengths.

1490

# Extend lengths.

1410

patch.length1 += len(prefix) + len(suffix)

1491

patch.length1 += len(prefix) + len(suffix)

1411

patch.length2 += len(prefix) + len(suffix)

1492

patch.length2 += len(prefix) + len(suffix)

1412

1493

1413

def patch_make(self, a, b=None, c=None):

1494

def patch_make(self, a, b=None, c=None):

1414

"""Compute a list of patches to turn text1 into text2.

1495

"""Compute a list of patches to turn text1 into text2.

1415

Use diffs if provided, otherwise compute it ourselves.

1496

Use diffs if provided, otherwise compute it ourselves.

1416

There are four ways to call this function, depending on what data is

1497

There are four ways to call this function, depending on what data is

1417

available to the caller:

1498

available to the caller:

1418

Method 1:

1499

Method 1:

1419

a = text1, b = text2

1500

a = text1, b = text2

1420

Method 2:

1501

Method 2:

1421

a = diffs

1502

a = diffs

1422

Method 3 (optimal):

1503

Method 3 (optimal):

1423

a = text1, b = diffs

1504

a = text1, b = diffs

1424

Method 4 (deprecated, use method 3):

1505

Method 4 (deprecated, use method 3):

1425

a = text1, b = text2, c = diffs

1506

a = text1, b = text2, c = diffs

1426

1507

1427

Args:

1508

Args:

1428

a: text1 (methods 1,3,4) or Array of diff tuples for text1 to

1509

a: text1 (methods 1,3,4) or Array of diff tuples for text1 to

1429

text2 (method 2).

1510

text2 (method 2).

1430

b: text2 (methods 1,4) or Array of diff tuples for text1 to

1511

b: text2 (methods 1,4) or Array of diff tuples for text1 to

1431

text2 (method 3) or undefined (method 2).

1512

text2 (method 3) or undefined (method 2).

1432

c: Array of diff tuples for text1 to text2 (method 4) or

1513

c: Array of diff tuples for text1 to text2 (method 4) or

1433

undefined (methods 1,2,3).

1514

undefined (methods 1,2,3).

1434

1515

1435

Returns:

1516

Returns:

1436

Array of Patch objects.

1517

Array of Patch objects.

1437

"""

1518

"""

1438

text1 = None

1519

text1 = None

1439

diffs = None

1520

diffs = None

1440

# Note that texts may arrive as 'str' or 'unicode'.

1521

# Note that texts may arrive as 'str' or 'unicode'.

1441

if isinstance(a, str) and isinstance(b, str) and c is None:

1522

if isinstance(a, str) and isinstance(b, str) and c is None:

1442

# Method 1: text1, text2

1523

# Method 1: text1, text2

1443

# Compute diffs from text1 and text2.

1524

# Compute diffs from text1 and text2.

1444

text1 = a

1525

text1 = a

1445

diffs = self.diff_main(text1, b, True)

1526

diffs = self.diff_main(text1, b, True)

1446

if len(diffs) > 2:

1527

if len(diffs) > 2:

1447

self.diff_cleanupSemantic(diffs)

1528

self.diff_cleanupSemantic(diffs)

1448

self.diff_cleanupEfficiency(diffs)

1529

self.diff_cleanupEfficiency(diffs)

1449

elif isinstance(a, list) and b is None and c is None:

1530

elif isinstance(a, list) and b is None and c is None:

1450

# Method 2: diffs

1531

# Method 2: diffs

1451

# Compute text1 from diffs.

1532

# Compute text1 from diffs.

1452

diffs = a

1533

diffs = a

1453

text1 = self.diff_text1(diffs)

1534

text1 = self.diff_text1(diffs)

1454

elif isinstance(a, str) and isinstance(b, list) and c is None:

1535

elif isinstance(a, str) and isinstance(b, list) and c is None:

1455

# Method 3: text1, diffs

1536

# Method 3: text1, diffs

1456

text1 = a

1537

text1 = a

1457

diffs = b

1538

diffs = b

1458

elif (isinstance(a, str) and isinstance(b, str) and

1539

elif isinstance(a, str) and isinstance(b, str) and isinstance(c, list):

1459

isinstance(c, list)):

1540

# Method 4: text1, text2, diffs

1460

# Method 4: text1, text2, diffs

1541

# text2 is not used.

1461

# text2 is not used.

1542

text1 = a

1462

text1 = a

1543

diffs = c

1463

diffs = c

1544

else:

1464

else:

1545

raise ValueError("Unknown call format to patch_make.")

1465

raise ValueError("Unknown call format to patch_make.")

1466

1546

1467

if not diffs:

1547

if not diffs:

1468

return [] # Get rid of the None case.

1548

return [] # Get rid of the None case.

1469

patches = []

1549

patches = []

1470

patch = patch_obj()

1550

patch = patch_obj()

1471

char_count1 = 0 # Number of characters into the text1 string.

1551

char_count1 = 0 # Number of characters into the text1 string.

1472

char_count2 = 0 # Number of characters into the text2 string.

1552

char_count2 = 0 # Number of characters into the text2 string.

1473

prepatch_text = text1 # Recreate the patches to determine context info.

1553

prepatch_text = text1 # Recreate the patches to determine context info.

1474

postpatch_text = text1

1554

postpatch_text = text1

1475

for x in range(len(diffs)):

1555

for x in range(len(diffs)):

1476

(diff_type, diff_text) = diffs[x]

1556

(diff_type, diff_text) = diffs[x]

1477

if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:

1557

if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:

1478

# A new patch starts here.

1558

# A new patch starts here.

1479

patch.start1 = char_count1

1559

patch.start1 = char_count1

1480

patch.start2 = char_count2

1560

patch.start2 = char_count2

1481

if diff_type == self.DIFF_INSERT:

1561

if diff_type == self.DIFF_INSERT:

1482

# Insertion

1562

# Insertion

1483

patch.diffs.append(diffs[x])

1563

patch.diffs.append(diffs[x])

1484

patch.length2 += len(diff_text)

1564

patch.length2 += len(diff_text)

1485

postpatch_text = (postpatch_text[:char_count2] + diff_text +

1565

postpatch_text = (

1486

postpatch_text[char_count2:])

1566

postpatch_text[:char_count2]

1487

elif diff_type == self.DIFF_DELETE:

1567

+ diff_text

1488

# Deletion.

1568

+ postpatch_text[char_count2:]

1489

patch.length1 += len(diff_text)

1569

)

1490

patch.diffs.append(diffs[x])

1570

elif diff_type == self.DIFF_DELETE:

1491

postpatch_text = (postpatch_text[:char_count2] +

1571

# Deletion.

1492

~~postpatch_text~~[~~char_count2~~ + len(diff_text)~~:])~~

1572

patch.length1 += len(diff_text)

1493

elif (diff_type == self.DIFF_EQUAL and

1573

patch.diffs.append(diffs[x])

1494

len(diff_text) <= 2 * self.Patch_Margin and

1574

postpatch_text = (

1495

len(patch.diffs) != 0 and len(diffs) != x + 1):

1575

postpatch_text[:char_count2]

1496

# Small equality inside a patch.

1576

+ postpatch_text[char_count2 + len(diff_text) :]

1497

patch.diffs.append(diffs[x])

1577

)

1498

patch.length1 += len(diff_text)

1578

elif (

1499

patch.length2 += len(diff_text)

1579

diff_type == self.DIFF_EQUAL

1580

and len(diff_text) <= 2 * self.Patch_Margin

1581

and len(patch.diffs) != 0

1582

and len(diffs) != x + 1

1583

):

1584

# Small equality inside a patch.

1585

patch.diffs.append(diffs[x])

1586

patch.length1 += len(diff_text)

1587

patch.length2 += len(diff_text)

1500

1588

1501

if (diff_type == self.DIFF_EQUAL and

1589

if diff_type == self.DIFF_EQUAL and len(diff_text) >= 2 * self.Patch_Margin:

1502

len(diff_text) >= 2 * self.Patch_Margin):

1590

# Time for a new patch.

1503

# Time for a new patch.

1591

if len(patch.diffs) != 0:

1504

if len(patch.diffs) != 0:

1592

self.patch_addContext(patch, prepatch_text)

1505

self.patch_addContext(patch, prepatch_text)

1593

patches.append(patch)

1506

patches.~~append~~(~~patch~~)

1594

patch = patch_obj()

1507

patch = patch_obj()

1595

# Unlike Unidiff, our patch lists have a rolling context.

1508

# Unlike Unidiff, our patch lists have a rolling context.

1596

# http://code.google.com/p/google-diff-match-patch/wiki/Unidiff

1509

# http://code.google.com/p/google-diff-match-patch/wiki/Unidiff

1597

# Update prepatch text & pos to reflect the application of the

1510

# Update prepatch text & pos to reflect the application of the

1598

# just completed patch.

1511

# just completed patch.

1599

prepatch_text = postpatch_text

1512

prepatch_text = postpatch_text

1600

char_count1 = char_count2

1513

char_count1 = char_count2

1514

1601

1515

# Update the current character count.

1602

# Update the current character count.

1516

if diff_type != self.DIFF_INSERT:

1603

if diff_type != self.DIFF_INSERT:

1517

char_count1 += len(diff_text)

1604

char_count1 += len(diff_text)

1518

if diff_type != self.DIFF_DELETE:

1605

if diff_type != self.DIFF_DELETE:

1519

char_count2 += len(diff_text)

1606

char_count2 += len(diff_text)

1520

1607

1521

# Pick up the leftover patch if not empty.

1608

# Pick up the leftover patch if not empty.

1522

if len(patch.diffs) != 0:

1609

if len(patch.diffs) != 0:

1523

self.patch_addContext(patch, prepatch_text)

1610

self.patch_addContext(patch, prepatch_text)

1524

patches.append(patch)

1611

patches.append(patch)

1525

return patches

1612

return patches

1526

1613

1527

def patch_deepCopy(self, patches):

1614

def patch_deepCopy(self, patches):

1528

"""Given an array of patches, return another array that is identical.

1615

"""Given an array of patches, return another array that is identical.

1529

1616

1530

Args:

1617

Args:

1531

patches: Array of Patch objects.

1618

patches: Array of Patch objects.

1532

1619

1533

Returns:

1620

Returns:

1534

Array of Patch objects.

1621

Array of Patch objects.

1535

"""

1622

"""

1536

patchesCopy = []

1623

patchesCopy = []

1537

for patch in patches:

1624

for patch in patches:

1538

patchCopy = patch_obj()

1625

patchCopy = patch_obj()

1539

# No need to deep copy the tuples since they are immutable.

1626

# No need to deep copy the tuples since they are immutable.

1540

patchCopy.diffs = patch.diffs[:]

1627

patchCopy.diffs = patch.diffs[:]

1541

patchCopy.start1 = patch.start1

1628

patchCopy.start1 = patch.start1

1542

patchCopy.start2 = patch.start2

1629

patchCopy.start2 = patch.start2

1543

patchCopy.length1 = patch.length1

1630

patchCopy.length1 = patch.length1

1544

patchCopy.length2 = patch.length2

1631

patchCopy.length2 = patch.length2

1545

patchesCopy.append(patchCopy)

1632

patchesCopy.append(patchCopy)

1546

return patchesCopy

1633

return patchesCopy

1547

1634

1548

def patch_apply(self, patches, text):

1635

def patch_apply(self, patches, text):

1549

"""Merge a set of patches onto the text. Return a patched text, as well

1636

"""Merge a set of patches onto the text. Return a patched text, as well

1550

as a list of true/false values indicating which patches were applied.

1637

as a list of true/false values indicating which patches were applied.

1551

1638

1552

Args:

1639

Args:

1553

patches: Array of Patch objects.

1640

patches: Array of Patch objects.

1554

text: Old text.

1641

text: Old text.

1555

1642

1556

Returns:

1643

Returns:

1557

Two element Array, containing the new text and an array of boolean values.

1644

Two element Array, containing the new text and an array of boolean values.

1558

"""

1645

"""

1559

if not patches:

1646

if not patches:

1560

return (text, [])

1647

return (text, [])

1561

1648

1562

# Deep copy the patches so that no changes are made to originals.

1649

# Deep copy the patches so that no changes are made to originals.

1563

patches = self.patch_deepCopy(patches)

1650

patches = self.patch_deepCopy(patches)

1564

1651

1565

nullPadding = self.patch_addPadding(patches)

1652

nullPadding = self.patch_addPadding(patches)

1566

text = nullPadding + text + nullPadding

1653

text = nullPadding + text + nullPadding

1567

self.patch_splitMax(patches)

1654

self.patch_splitMax(patches)

1568

1655

1569

# delta keeps track of the offset between the expected and actual location

1656

# delta keeps track of the offset between the expected and actual location

1570

# of the previous patch. If there are patches expected at positions 10 and

1657

# of the previous patch. If there are patches expected at positions 10 and

1571

# 20, but the first patch was found at 12, delta is 2 and the second patch

1658

# 20, but the first patch was found at 12, delta is 2 and the second patch

1572

# has an effective expected position of 22.

1659

# has an effective expected position of 22.

1573

delta = 0

1660

delta = 0

1574

results = []

1661

results = []

1575

for patch in patches:

1662

for patch in patches:

1576

expected_loc = patch.start2 + delta

1663

expected_loc = patch.start2 + delta

1577

text1 = self.diff_text1(patch.diffs)

1664

text1 = self.diff_text1(patch.diffs)

1578

end_loc = -1

1665

end_loc = -1

1579

if len(text1) > self.Match_MaxBits:

1666

if len(text1) > self.Match_MaxBits:

1580

# patch_splitMax will only provide an oversized pattern in the case of

1667

# patch_splitMax will only provide an oversized pattern in the case of

1581

# a monster delete.

1668

# a monster delete.

1582

start_loc = self.match_main(~~text~~, ~~text1~~[:~~self~~.~~Match_MaxBits~~],

1669

start_loc = self.match_main(

1583

expected_loc)

1670

text, text1[: self.Match_MaxBits], expected_loc

1584

if start_loc != -1:

1671

)

1585

end_loc = self.match_main(text, text1[-self.Match_MaxBits:],

1672

if start_loc != -1:

1586

~~expecte~~d_loc + ~~len~~(~~text1~~) - self.~~Match_MaxBits~~)

1673

end_loc = self.match_main(

1587

if end_loc == -1 or start_loc >= end_loc:

1674

text,

1588

# Can't find valid trailing context. Drop this patch.

1675

text1[-self.Match_MaxBits :],

1589

start_loc = -1

1676

expected_loc + len(text1) - self.Match_MaxBits,

1590

else:

1677

)

1591

start_loc = self.match_main(text, text1, expected_loc)

1678

if end_loc == -1 or start_loc >= end_loc:

1592

if start_loc == -1:

1679

# Can't find valid trailing context. Drop this patch.

1593

# No match found. :(

1680

start_loc = -1

1594

results.append(False)

1681

else:

1595

# Subtract the delta for this failed patch from subsequent patches.

1682

start_loc = self.match_main(text, text1, expected_loc)

1596

delta -= patch.length2 - patch.length1

1683

if start_loc == -1:

1597

else:

1684

# No match found. :(

1598

# Found a match. :)

1685

results.append(False)

1599

results.append(True)

1686

# Subtract the delta for this failed patch from subsequent patches.

1600

delta = start_loc - expected_loc

1687

delta -= patch.length2 - patch.length1

1601

if end_loc == -1:

1688

else:

1602

text2 = text[start_loc : start_loc + len(text1)]

1689

# Found a match. :)

1603

else:

1690

results.append(True)

1604

text2 = text[start_loc : end_loc + self.Match_MaxBits]

1691

delta = start_loc - expected_loc

1605

if text1 == text2:

1692

if end_loc == -1:

1606

# Perfect match, just shove the replacement text in.

1693

text2 = text[start_loc : start_loc + len(text1)]

1607

text = (text[:start_loc] + self.diff_text2(patch.diffs) +

1694

else:

1608

text[start_loc + ~~len~~(~~text1~~):])

1695

text2 = text[start_loc : end_loc + self.Match_MaxBits]

1609

else:

1696

if text1 == text2:

1610

# Imperfect match.

1697

# Perfect match, just shove the replacement text in.

1611

# Run a diff to get a framework of equivalent indices.

1698

text = (

1612

diffs = self.diff_main(text1, text2, False)

1699

text[:start_loc]

1613

if (len(text1) > self.Match_MaxBits and

1700

+ self.diff_text2(patch.diffs)

1614

self.diff_levenshtein(diffs) / float(len(text1)) >

1701

+ text[start_loc + len(text1) :]

1615

self.Patch_DeleteThreshold):

1702

)

1616

# The end points match, but the content is unacceptably bad.

1703

else:

1617

results[-1] = False

1704

# Imperfect match.

1618

else:

1705

# Run a diff to get a framework of equivalent indices.

1619

self.diff_cleanupSemanticLossless(diffs)

1706

diffs = self.diff_main(text1, text2, False)

1620

~~index1~~ = 0

1707

if (

1621

for (op, data) in patch.diffs:

1708

len(text1) > self.Match_MaxBits

1622

if op != self.DIFF_EQUAL:

1709

and self.diff_levenshtein(diffs) / float(len(text1))

1623

index2 = self.diff_xIndex(diffs, index1)

1710

> self.Patch_DeleteThreshold

1624

if op == self.DIFF_INSERT: # Insertion

1711

):

1625

text = text[:start_loc + index2] + data + text[start_loc +

1712

# The end points match, but the content is unacceptably bad.

1626

index2:]

1713

results[-1] = False

1627

elif op == self.DIFF_DELETE: # Deletion

1714

else:

1628

text = text[:start_loc + index2] + text[start_loc +

1715

self.diff_cleanupSemanticLossless(diffs)

1629

self.diff_xIndex(diffs, index1 + len(data)):]

1716

index1 = 0

1630

if op != self.DIFF_DELETE:

1717

for op, data in patch.diffs:

1631

index1 += len(data)

1718

if op != self.DIFF_EQUAL:

1632

# Strip the padding off.

1719

index2 = self.diff_xIndex(diffs, index1)

1633

text = text[len(nullPadding):-len(nullPadding)]

1720

if op == self.DIFF_INSERT: # Insertion

1634

return (text, results)

1721

text = (

1722

text[: start_loc + index2]

1723

+ data

1724

+ text[start_loc + index2 :]

1725

)

1726

elif op == self.DIFF_DELETE: # Deletion

1727

text = (

1728

text[: start_loc + index2]

1729

+ text[

1730

start_loc

1731

+ self.diff_xIndex(diffs, index1 + len(data)) :

1732

]

1733

)

1734

if op != self.DIFF_DELETE:

1735

index1 += len(data)

1736

# Strip the padding off.

1737

text = text[len(nullPadding) : -len(nullPadding)]

1738

return (text, results)

1635

1739

1636

def patch_addPadding(self, patches):

1740

def patch_addPadding(self, patches):

1637

"""Add some padding on text start and end so that edges can match

1741

"""Add some padding on text start and end so that edges can match

1638

something. Intended to be called only from within patch_apply.

1742

something. Intended to be called only from within patch_apply.

1639

1743

1640

Args:

1744

Args:

1641

patches: Array of Patch objects.

1745

patches: Array of Patch objects.

1642

1746

1643

Returns:

1747

Returns:

1644

The padding string added to each side.

1748

The padding string added to each side.

1645

"""

1749

"""

1646

paddingLength = self.Patch_Margin

1750

paddingLength = self.Patch_Margin

1647

nullPadding = ""

1751

nullPadding = ""

1648

for x in range(1, paddingLength + 1):

1752

for x in range(1, paddingLength + 1):

1649

nullPadding += chr(x)

1753

nullPadding += chr(x)

1650

1754

1651

# Bump all the patches forward.

1755

# Bump all the patches forward.

1652

for patch in patches:

1756

for patch in patches:

1653

patch.start1 += paddingLength

1757

patch.start1 += paddingLength

1654

patch.start2 += paddingLength

1758

patch.start2 += paddingLength

1655

1759

1656

# Add some padding on start of first diff.

1760

# Add some padding on start of first diff.

1657

patch = patches[0]

1761

patch = patches[0]

1658

diffs = patch.diffs

1762

diffs = patch.diffs

1659

if not diffs or diffs[0][0] != self.DIFF_EQUAL:

1763

if not diffs or diffs[0][0] != self.DIFF_EQUAL:

1660

# Add nullPadding equality.

1764

# Add nullPadding equality.

1661

diffs.insert(0, (self.DIFF_EQUAL, nullPadding))

1765

diffs.insert(0, (self.DIFF_EQUAL, nullPadding))

1662

patch.start1 -= paddingLength # Should be 0.

1766

patch.start1 -= paddingLength # Should be 0.

1663

patch.start2 -= paddingLength # Should be 0.

1767

patch.start2 -= paddingLength # Should be 0.

1664

patch.length1 += paddingLength

1768

patch.length1 += paddingLength

1665

patch.length2 += paddingLength

1769

patch.length2 += paddingLength

1666

elif paddingLength > len(diffs[0][1]):

1770

elif paddingLength > len(diffs[0][1]):

1667

# Grow first equality.

1771

# Grow first equality.

1668

extraLength = paddingLength - len(diffs[0][1])

1772

extraLength = paddingLength - len(diffs[0][1])

1669

newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]

1773

newText = nullPadding[len(diffs[0][1]) :] + diffs[0][1]

1670

diffs[0] = (diffs[0][0], newText)

1774

diffs[0] = (diffs[0][0], newText)

1671

patch.start1 -= extraLength

1775

patch.start1 -= extraLength

1672

patch.start2 -= extraLength

1776

patch.start2 -= extraLength

1673

patch.length1 += extraLength

1777

patch.length1 += extraLength

1674

patch.length2 += extraLength

1778

patch.length2 += extraLength

1675

1779

1676

# Add some padding on end of last diff.

1780

# Add some padding on end of last diff.

1677

patch = patches[-1]

1781

patch = patches[-1]

1678

diffs = patch.diffs

1782

diffs = patch.diffs

1679

if not diffs or diffs[-1][0] != self.DIFF_EQUAL:

1783

if not diffs or diffs[-1][0] != self.DIFF_EQUAL:

1680

# Add nullPadding equality.

1784

# Add nullPadding equality.

1681

diffs.append((self.DIFF_EQUAL, nullPadding))

1785

diffs.append((self.DIFF_EQUAL, nullPadding))

1682

patch.length1 += paddingLength

1786

patch.length1 += paddingLength

1683

patch.length2 += paddingLength

1787

patch.length2 += paddingLength

1684

elif paddingLength > len(diffs[-1][1]):

1788

elif paddingLength > len(diffs[-1][1]):

1685

# Grow last equality.

1789

# Grow last equality.

1686

extraLength = paddingLength - len(diffs[-1][1])

1790

extraLength = paddingLength - len(diffs[-1][1])

1687

newText = diffs[-1][1] + nullPadding[:extraLength]

1791

newText = diffs[-1][1] + nullPadding[:extraLength]

1688

diffs[-1] = (diffs[-1][0], newText)

1792

diffs[-1] = (diffs[-1][0], newText)

1689

patch.length1 += extraLength

1793

patch.length1 += extraLength

1690

patch.length2 += extraLength

1794

patch.length2 += extraLength

1691

1795

1692

return nullPadding

1796

return nullPadding

1693

1797

1694

def patch_splitMax(self, patches):

1798

def patch_splitMax(self, patches):

1695

"""Look through the patches and break up any which are longer than the

1799

"""Look through the patches and break up any which are longer than the

1696

maximum limit of the match algorithm.

1800

maximum limit of the match algorithm.

1697

Intended to be called only from within patch_apply.

1801

Intended to be called only from within patch_apply.

1698

1802

1699

Args:

1803

Args:

1700

patches: Array of Patch objects.

1804

patches: Array of Patch objects.

1701

"""

1805

"""

1702

patch_size = self.Match_MaxBits

1806

patch_size = self.Match_MaxBits

1703

if patch_size == 0:

1807

if patch_size == 0:

1704

# Python has the option of not splitting strings due to its ability

1808

# Python has the option of not splitting strings due to its ability

1705

# to handle integers of arbitrary precision.

1809

# to handle integers of arbitrary precision.

1706

return

1810

return

1707

for x in range(len(patches)):

1811

for x in range(len(patches)):

1708

if patches[x].length1 <= patch_size:

1812

if patches[x].length1 <= patch_size:

1709

continue

1813

continue

1710

bigpatch = patches[x]

1814

bigpatch = patches[x]

1711

# Remove the big old patch.

1815

# Remove the big old patch.

1712

del patches[x]

1816

del patches[x]

1713

x -= 1

1817

x -= 1

1714

start1 = bigpatch.start1

1818

start1 = bigpatch.start1

1715

start2 = bigpatch.start2

1819

start2 = bigpatch.start2

1716

precontext = ''

1820

precontext = ""

1717

while len(bigpatch.diffs) != 0:

1821

while len(bigpatch.diffs) != 0:

1718

# Create one of several smaller patches.

1822

# Create one of several smaller patches.

1719

patch = patch_obj()

1823

patch = patch_obj()

1720

empty = True

1824

empty = True

1721

patch.start1 = start1 - len(precontext)

1825

patch.start1 = start1 - len(precontext)

1722

patch.start2 = start2 - len(precontext)

1826

patch.start2 = start2 - len(precontext)

1723

if precontext:

1827

if precontext:

1724

patch.length1 = patch.length2 = len(precontext)

1828

patch.length1 = patch.length2 = len(precontext)

1725

patch.diffs.append((self.DIFF_EQUAL, precontext))

1829

patch.diffs.append((self.DIFF_EQUAL, precontext))

1726

1830

1727

while (len(bigpatch.diffs) != 0 and

1831

while (

1728

patch.length1 < patch_size - self.Patch_Margin):

1832

len(bigpatch.diffs) != 0

1729

(diff_type, diff_text) = bigpatch.diffs[0]

1833

and patch.length1 < patch_size - self.Patch_Margin

1730

if diff_type == self.DIFF_INSERT:

1834

):

1731

# Insertions are harmless.

1835

(diff_type, diff_text) = bigpatch.diffs[0]

1732

patch.length2 += len(diff_text)

1836

if diff_type == self.DIFF_INSERT:

1733

start2 += len(diff_text)

1837

# Insertions are harmless.

1734

patch.diffs.append(bigpatch.diffs.pop(0))

1838

patch.length2 += len(diff_text)

1735

empty = False

1839

start2 += len(diff_text)

1736

elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and

1840

patch.diffs.append(bigpatch.diffs.pop(0))

1737

patch.diffs[0][0] == self.DIFF_EQUAL and

1841

empty = False

1738

len(diff_text) > 2 * patch_size):

1842

elif (

1739

# This is a large deletion. Let it pass in one chunk.

1843

diff_type == self.DIFF_DELETE

1740

patch.length1 += len(diff_text)

1844

and len(patch.diffs) == 1

1741

start1 += len(diff_text)

1845

and patch.diffs[0][0] == self.DIFF_EQUAL

1742

empty = False

1846

and len(diff_text) > 2 * patch_size

1743

patch.diffs.append((diff_type, diff_text))

1847

):

1744

del bigpatch.diffs[0]

1848

# This is a large deletion. Let it pass in one chunk.

1745

else:

1849

patch.length1 += len(diff_text)

1746

# Deletion or equality. Only take as much as we can stomach.

1850

start1 += len(diff_text)

1747

diff_text = diff_text[:patch_size - patch.length1 -

1851

empty = False

1748

self.Patch_Margin]

1852

patch.diffs.append((diff_type, diff_text))

1749

patch.length1 += len(diff_text)

1853

del bigpatch.diffs[0]

1750

start1 += len(diff_text)

1854

else:

1751

if diff_type == self.DIFF_EQUAL:

1855

# Deletion or equality. Only take as much as we can stomach.

1752

patch.length2 += len(diff_text)

1856

diff_text = diff_text[

1753

start2 += len(diff_text)

1857

: patch_size - patch.length1 - self.Patch_Margin

1754

else:

1858

]

1755

empty = False

1859

patch.length1 += len(diff_text)

1860

start1 += len(diff_text)

1861

if diff_type == self.DIFF_EQUAL:

1862

patch.length2 += len(diff_text)

1863

start2 += len(diff_text)

1864

else:

1865

empty = False

1756

1866

1757

patch.diffs.append((diff_type, diff_text))

1867

patch.diffs.append((diff_type, diff_text))

1758

if diff_text == bigpatch.diffs[0][1]:

1868

if diff_text == bigpatch.diffs[0][1]:

1759

del bigpatch.diffs[0]

1869

del bigpatch.diffs[0]

1760

else:

1870

else:

1761

bigpatch.diffs[0] = (~~bigpatch~~.~~diffs~~[0][0],

1871

bigpatch.diffs[0] = (

1762

bigpatch.diffs[0][1][~~len~~(~~diff_text):])~~

1872

bigpatch.diffs[0][0],

1873

bigpatch.diffs[0][1][len(diff_text) :],

1874

)

1763

1875

1764

# Compute the head context for the next patch.

1876

# Compute the head context for the next patch.

1765

precontext = self.diff_text2(patch.diffs)

1877

precontext = self.diff_text2(patch.diffs)

1766

precontext = precontext[-self.Patch_Margin:]

1878

precontext = precontext[-self.Patch_Margin :]

1767

# Append the end context for this patch.

1879

# Append the end context for this patch.

1768

postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin]

1880

postcontext = self.diff_text1(bigpatch.diffs)[: self.Patch_Margin]

1769

if postcontext:

1881

if postcontext:

1770

patch.length1 += len(postcontext)

1882

patch.length1 += len(postcontext)

1771

patch.length2 += len(postcontext)

1883

patch.length2 += len(postcontext)

1772

if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:

1884

if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:

1773

patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +

1885

patch.diffs[-1] = (

1774

~~postcontext~~)

1886

self.DIFF_EQUAL,

1775

else:

1887

patch.diffs[-1][1] + postcontext,

1776

patch.diffs.append((self.DIFF_EQUAL, postcontext))

1888

)

1889

else:

1890

patch.diffs.append((self.DIFF_EQUAL, postcontext))

1777

1891

1778

if not empty:

1892

if not empty:

1779

x += 1

1893

x += 1

1780

patches.insert(x, patch)

1894

patches.insert(x, patch)

1781

1782

def patch_toText(self, patches):

1783

"""Take a list of patches and return a textual representation.

1784

1895

1785

Args:

1896

def patch_toText(self, patches):

1786

patches: Array of Patch objects.

1897

"""Take a list of patches and return a textual representation.

1898

1899

Args:

1900

patches: Array of Patch objects.

1787

1901

1788

Returns:

1902

Returns:

1789

Text representation of patches.

1903

Text representation of patches.

1790

"""

1904

"""

1791

text = []

1905

text = []

1792

for patch in patches:

1906

for patch in patches:

1793

text.append(str(patch))

1907

text.append(str(patch))

1794

return "".join(text)

1908

return "".join(text)

1795

1909

1796

def patch_fromText(self, textline):

1910

def patch_fromText(self, textline):

1797

"""Parse a textual representation of patches and return a list of patch

1911

"""Parse a textual representation of patches and return a list of patch

1798

objects.

1912

objects.

1799

1913

1800

Args:

1914

Args:

1801

textline: Text representation of patches.

1915

textline: Text representation of patches.

1802

1916

1803

Returns:

1917

Returns:

1804

Array of Patch objects.

1918

Array of Patch objects.

1805

1919

1806

Raises:

1920

Raises:

1807

ValueError: If invalid input.

1921

ValueError: If invalid input.

1808

"""

1922

"""

1809

if type(textline) == unicode:

1923

if type(textline) == unicode:

1810

# Patches should be composed of a subset of ascii chars, Unicode not

1924

# Patches should be composed of a subset of ascii chars, Unicode not

1811

# required. If this encode raises UnicodeEncodeError, patch is invalid.

1925

# required. If this encode raises UnicodeEncodeError, patch is invalid.

1812

textline = textline.encode("ascii")

1926

textline = textline.encode("ascii")

1813

patches = []

1927

patches = []

1814

if not textline:

1928

if not textline:

1815

return patches

1929

return patches

1816

text = textline.split('\n')

1930

text = textline.split("\n")

1817

while len(text) != 0:

1931

while len(text) != 0:

1818

m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])

1932

m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])

1819

if not m:

1933

if not m:

1820

raise ValueError("Invalid patch string: " + text[0])

1934

raise ValueError("Invalid patch string: " + text[0])

1821

patch = patch_obj()

1935

patch = patch_obj()

1822

patches.append(patch)

1936

patches.append(patch)

1823

patch.start1 = int(m.group(1))

1937

patch.start1 = int(m.group(1))

1824

if m.group(2) == '':

1938

if m.group(2) == "":

1825

patch.start1 -= 1

1939

patch.start1 -= 1

1826

patch.length1 = 1

1940

patch.length1 = 1

1827

elif m.group(2) == '0':

1941

elif m.group(2) == "0":

1828

patch.length1 = 0

1942

patch.length1 = 0

1829

else:

1943

else:

1830

patch.start1 -= 1

1944

patch.start1 -= 1

1831

patch.length1 = int(m.group(2))

1945

patch.length1 = int(m.group(2))

1832

1946

1833

patch.start2 = int(m.group(3))

1947

patch.start2 = int(m.group(3))

1834

if m.group(4) == '':

1948

if m.group(4) == "":

1835

patch.start2 -= 1

1949

patch.start2 -= 1

1836

patch.length2 = 1

1950

patch.length2 = 1

1837

elif m.group(4) == '0':

1951

elif m.group(4) == "0":

1838

patch.length2 = 0

1952

patch.length2 = 0

1839

else:

1953

else:

1840

patch.start2 -= 1

1954

patch.start2 -= 1

1841

patch.length2 = int(m.group(4))

1955

patch.length2 = int(m.group(4))

1842

1956

1843

del text[0]

1957

del text[0]

1844

1958

1845

while len(text) != 0:

1959

while len(text) != 0:

1846

if text[0]:

1960

if text[0]:

1847

sign = text[0][0]

1961

sign = text[0][0]

1848

else:

1962

else:

1849

sign = ''

1963

sign = ""

1850

line = urllib.parse.unquote(text[0][1:])

1964

line = urllib.parse.unquote(text[0][1:])

1851

line = line.decode("utf-8")

1965

line = line.decode("utf-8")

1852

if sign == '+':

1966

if sign == "+":

1853

# Insertion.

1967

# Insertion.

1854

patch.diffs.append((self.DIFF_INSERT, line))

1968

patch.diffs.append((self.DIFF_INSERT, line))

1855

elif sign == '-':

1969

elif sign == "-":

1856

# Deletion.

1970

# Deletion.

1857

patch.diffs.append((self.DIFF_DELETE, line))

1971

patch.diffs.append((self.DIFF_DELETE, line))

1858

elif sign == ' ':

1972

elif sign == " ":

1859

# Minor equality.

1973

# Minor equality.

1860

patch.diffs.append((self.DIFF_EQUAL, line))

1974

patch.diffs.append((self.DIFF_EQUAL, line))

1861

elif sign == '@':

1975

elif sign == "@":

1862

# Start of next patch.

1976

# Start of next patch.

1863

break

1977

break

1864

elif sign == '':

1978

elif sign == "":

1865

# Blank line? Whatever.

1979

# Blank line? Whatever.

1866

pass

1980

pass

1867

else:

1981

else:

1868

# WTF?

1982

# WTF?

1869

raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))

1983

raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))

1870

del text[0]

1984

del text[0]

1871

return patches

1985

return patches

1872

1986

1873

1987

1874

class patch_obj:

1988

class patch_obj:

1875

"""Class representing one patch operation.

1989

"""Class representing one patch operation."""

1876

"""

1877

1990

1878

def __init__(self):

1991

def __init__(self):

1879

"""Initializes with an empty list of diffs.

1992

"""Initializes with an empty list of diffs."""

1880

"""

1993

self.diffs = []

1881

self.diffs = []

1994

self.start1 = None

1882

self.start1 = None

1995

self.start2 = None

1883

self.start2 = None

1996

self.length1 = 0

1884

self.length1 = 0

1997

self.length2 = 0

1885

self.length2 = 0

1886

1998

1887

def __str__(self):

1999

def __str__(self):

1888

"""Emmulate GNU diff's format.

2000

"""Emmulate GNU diff's format.

1889

Header: @@ -382,8 +481,9 @@

2001

Header: @@ -382,8 +481,9 @@

1890

Indicies are printed as 1-based, not 0-based.

2002

Indicies are printed as 1-based, not 0-based.

1891

2003

1892

Returns:

2004

Returns:

1893

The GNU diff string.

2005

The GNU diff string.

1894

"""

2006

"""

1895

if self.length1 == 0:

2007

if self.length1 == 0:

1896

coords1 = str(self.start1) + ",0"

2008

coords1 = str(self.start1) + ",0"

1897

elif self.length1 == 1:

2009

elif self.length1 == 1:

1898

coords1 = str(self.start1 + 1)

2010

coords1 = str(self.start1 + 1)

1899

else:

2011

else:

1900

coords1 = str(self.start1 + 1) + "," + str(self.length1)

2012

coords1 = str(self.start1 + 1) + "," + str(self.length1)

1901

if self.length2 == 0:

2013

if self.length2 == 0:

1902

coords2 = str(self.start2) + ",0"

2014

coords2 = str(self.start2) + ",0"

1903

elif self.length2 == 1:

2015

elif self.length2 == 1:

1904

coords2 = str(self.start2 + 1)

2016

coords2 = str(self.start2 + 1)

1905

else:

2017

else:

1906

coords2 = str(self.start2 + 1) + "," + str(self.length2)

2018

coords2 = str(self.start2 + 1) + "," + str(self.length2)

1907

text = ["@@ -", coords1, " +", coords2, " @@\n"]

2019

text = ["@@ -", coords1, " +", coords2, " @@\n"]

1908

# Escape the body of the patch with %xx notation.

2020

# Escape the body of the patch with %xx notation.

1909

for (op, data) in self.diffs:

2021

for op, data in self.diffs:

1910

if op == diff_match_patch.DIFF_INSERT:

2022

if op == diff_match_patch.DIFF_INSERT:

1911

text.append("+")

2023

text.append("+")

1912

elif op == diff_match_patch.DIFF_DELETE:

2024

elif op == diff_match_patch.DIFF_DELETE:

1913

text.append("-")

2025

text.append("-")

1914

elif op == diff_match_patch.DIFF_EQUAL:

2026

elif op == diff_match_patch.DIFF_EQUAL:

1915

text.append(" ")

2027

text.append(" ")

1916

# High ascii will raise UnicodeDecodeError. Use Unicode instead.

2028

# High ascii will raise UnicodeDecodeError. Use Unicode instead.

1917

data = data.encode("utf-8")

2029

data = data.encode("utf-8")

1918

text.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n")

2030

text.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n")

1919

return "".join(text) No newline at end of file

2031

return "".join(text)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages