upstream/mercurial-mirror Commit - r31210:e1d03590

1

# similar.py - mechanisms for finding similar files

1

# similar.py - mechanisms for finding similar files

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import hashlib

10

import hashlib

11

12

from .i18n import _

12

from .i18n import _

13

from . import (

13

from . import (

14

bdiff,

14

bdiff,

15

mdiff,

15

mdiff,

16

)

16

)

17

18

def _findexactmatches(repo, added, removed):

18

def _findexactmatches(repo, added, removed):

19

'''find renamed files that have no changes

19

'''find renamed files that have no changes

20

21

Takes a list of new filectxs and a list of removed filectxs, and yields

21

Takes a list of new filectxs and a list of removed filectxs, and yields

22

(before, after) tuples of exact matches.

22

(before, after) tuples of exact matches.

23

'''

23

'''

24

numfiles = len(added) + len(removed)

24

numfiles = len(added) + len(removed)

25

26

# Get hashes of removed files.

26

# Get hashes of removed files.

27

hashes = {}

27

hashes = {}

28

for i, fctx in enumerate(removed):

28

for i, fctx in enumerate(removed):

29

repo.ui.progress(_('searching for exact renames'), i, total=numfiles,

29

repo.ui.progress(_('searching for exact renames'), i, total=numfiles,

30

unit=_('files'))

30

unit=_('files'))

31

h = hashlib.sha1(fctx.data()).digest()

31

h = hashlib.sha1(fctx.data()).digest()

32

hashes[h] = fctx

32

hashes[h] = fctx

33

34

# For each added file, see if it corresponds to a removed file.

34

# For each added file, see if it corresponds to a removed file.

35

for i, fctx in enumerate(added):

35

for i, fctx in enumerate(added):

36

repo.ui.progress(_('searching for exact renames'), i + len(removed),

36

repo.ui.progress(_('searching for exact renames'), i + len(removed),

37

total=numfiles, unit=_('files'))

37

total=numfiles, unit=_('files'))

38

h = ~~hashlib~~.~~sha1~~(fctx.data()).~~digest~~()

38

adata = fctx.data()

39

h = hashlib.sha1(adata).digest()

39

if h in hashes:

40

if h in hashes:

40

~~yield~~ (hashes[h], ~~fctx~~)

41

rfctx = hashes[h]

42

# compare between actual file contents for exact identity

43

if adata == rfctx.data():

44

yield (rfctx, fctx)

41

45

42

# Done

46

# Done

43

repo.ui.progress(_('searching for exact renames'), None)

47

repo.ui.progress(_('searching for exact renames'), None)

44

48

45

def _ctxdata(fctx):

49

def _ctxdata(fctx):

46

# lazily load text

50

# lazily load text

47

orig = fctx.data()

51

orig = fctx.data()

48

return orig, mdiff.splitnewlines(orig)

52

return orig, mdiff.splitnewlines(orig)

49

53

50

def _score(fctx, otherdata):

54

def _score(fctx, otherdata):

51

orig, lines = otherdata

55

orig, lines = otherdata

52

text = fctx.data()

56

text = fctx.data()

53

# bdiff.blocks() returns blocks of matching lines

57

# bdiff.blocks() returns blocks of matching lines

54

# count the number of bytes in each

58

# count the number of bytes in each

55

equal = 0

59

equal = 0

56

matches = bdiff.blocks(text, orig)

60

matches = bdiff.blocks(text, orig)

57

for x1, x2, y1, y2 in matches:

61

for x1, x2, y1, y2 in matches:

58

for line in lines[y1:y2]:

62

for line in lines[y1:y2]:

59

equal += len(line)

63

equal += len(line)

60

64

61

lengths = len(text) + len(orig)

65

lengths = len(text) + len(orig)

62

return equal * 2.0 / lengths

66

return equal * 2.0 / lengths

63

67

64

def score(fctx1, fctx2):

68

def score(fctx1, fctx2):

65

return _score(fctx1, _ctxdata(fctx2))

69

return _score(fctx1, _ctxdata(fctx2))

66

70

67

def _findsimilarmatches(repo, added, removed, threshold):

71

def _findsimilarmatches(repo, added, removed, threshold):

68

'''find potentially renamed files based on similar file content

72

'''find potentially renamed files based on similar file content

69

73

70

Takes a list of new filectxs and a list of removed filectxs, and yields

74

Takes a list of new filectxs and a list of removed filectxs, and yields

71

(before, after, score) tuples of partial matches.

75

(before, after, score) tuples of partial matches.

72

'''

76

'''

73

copies = {}

77

copies = {}

74

for i, r in enumerate(removed):

78

for i, r in enumerate(removed):

75

repo.ui.progress(_('searching for similar files'), i,

79

repo.ui.progress(_('searching for similar files'), i,

76

total=len(removed), unit=_('files'))

80

total=len(removed), unit=_('files'))

77

81

78

data = None

82

data = None

79

for a in added:

83

for a in added:

80

bestscore = copies.get(a, (None, threshold))[1]

84

bestscore = copies.get(a, (None, threshold))[1]

81

if data is None:

85

if data is None:

82

data = _ctxdata(r)

86

data = _ctxdata(r)

83

myscore = _score(a, data)

87

myscore = _score(a, data)

84

if myscore >= bestscore:

88

if myscore >= bestscore:

85

copies[a] = (r, myscore)

89

copies[a] = (r, myscore)

86

repo.ui.progress(_('searching'), None)

90

repo.ui.progress(_('searching'), None)

87

91

88

for dest, v in copies.iteritems():

92

for dest, v in copies.iteritems():

89

source, bscore = v

93

source, bscore = v

90

yield source, dest, bscore

94

yield source, dest, bscore

91

95

92

def findrenames(repo, added, removed, threshold):

96

def findrenames(repo, added, removed, threshold):

93

'''find renamed files -- yields (before, after, score) tuples'''

97

'''find renamed files -- yields (before, after, score) tuples'''

94

parentctx = repo['.']

98

parentctx = repo['.']

95

workingctx = repo[None]

99

workingctx = repo[None]

96

100

97

# Zero length files will be frequently unrelated to each other, and

101

# Zero length files will be frequently unrelated to each other, and

98

# tracking the deletion/addition of such a file will probably cause more

102

# tracking the deletion/addition of such a file will probably cause more

99

# harm than good. We strip them out here to avoid matching them later on.

103

# harm than good. We strip them out here to avoid matching them later on.

100

addedfiles = set([workingctx[fp] for fp in added

104

addedfiles = set([workingctx[fp] for fp in added

101

if workingctx[fp].size() > 0])

105

if workingctx[fp].size() > 0])

102

removedfiles = set([parentctx[fp] for fp in removed

106

removedfiles = set([parentctx[fp] for fp in removed

103

if fp in parentctx and parentctx[fp].size() > 0])

107

if fp in parentctx and parentctx[fp].size() > 0])

104

108

105

# Find exact matches.

109

# Find exact matches.

106

for (a, b) in _findexactmatches(repo,

110

for (a, b) in _findexactmatches(repo,

107

sorted(addedfiles), sorted(removedfiles)):

111

sorted(addedfiles), sorted(removedfiles)):

108

addedfiles.remove(b)

112

addedfiles.remove(b)

109

yield (a.path(), b.path(), 1.0)

113

yield (a.path(), b.path(), 1.0)

110

114

111

# If the user requested similar files to be matched, search for them also.

115

# If the user requested similar files to be matched, search for them also.

112

if threshold < 1.0:

116

if threshold < 1.0:

113

for (a, b, score) in _findsimilarmatches(repo,

117

for (a, b, score) in _findsimilarmatches(repo,

114

sorted(addedfiles), sorted(removedfiles), threshold):

118

sorted(addedfiles), sorted(removedfiles), threshold):

115

yield (a.path(), b.path(), score)

119

yield (a.path(), b.path(), score)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # similar.py - mechanisms for finding similar files
             #
             # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import hashlib
             from .i18n import _
             from . import (
                 bdiff,
                 mdiff,
             )
             def _findexactmatches(repo, added, removed):
                 '''find renamed files that have no changes
                 Takes a list of new filectxs and a list of removed filectxs, and yields
                 (before, after) tuples of exact matches.
                 '''
                 numfiles = len(added) + len(removed)
                 # Get hashes of removed files.
                 hashes = {}
                 for i, fctx in enumerate(removed):
                     repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
                                      unit=_('files'))
                     h = hashlib.sha1(fctx.data()).digest()
                     hashes[h] = fctx
                 # For each added file, see if it corresponds to a removed file.
                 for i, fctx in enumerate(added):
                     repo.ui.progress(_('searching for exact renames'), i + len(removed),
                             total=numfiles, unit=_('files'))
-                    h = hashlib.sha1(fctx.data()).digest()
+                    adata = fctx.data()
+                    h = hashlib.sha1(adata).digest()
                     if h in hashes:
-                        yield (hashes[h], fctx)
+                        rfctx = hashes[h]
+                        # compare between actual file contents for exact identity
+                        if adata == rfctx.data():
+                            yield (rfctx, fctx)
                 # Done
                 repo.ui.progress(_('searching for exact renames'), None)
             def _ctxdata(fctx):
                 '''return the (data, lines) pair that _score() expects for fctx'''
                 # fetch the content once; the line list is derived from that
                 # same text via mdiff.splitnewlines()
                 content = fctx.data()
                 return content, mdiff.splitnewlines(content)
             def _score(fctx, otherdata):
                 '''return how similar fctx is to the file described by otherdata

                 otherdata is an (orig, lines) pair as built by _ctxdata(). The
                 result is twice the number of matched bytes divided by the
                 combined length of both texts.
                 '''
                 orig, lines = otherdata
                 text = fctx.data()
                 # bdiff.blocks() returns blocks of matching lines; add up the
                 # byte length of every matched line on the "orig" side
                 matched = 0
                 for x1, x2, y1, y2 in bdiff.blocks(text, orig):
                     matched += sum(len(line) for line in lines[y1:y2])
                 return matched * 2.0 / (len(text) + len(orig))
             def score(fctx1, fctx2):
                 '''return the similarity of fctx1 measured against fctx2'''
                 otherdata = _ctxdata(fctx2)
                 return _score(fctx1, otherdata)
             def _findsimilarmatches(repo, added, removed, threshold):
                 '''find potentially renamed files based on similar file content

                 Takes a list of new filectxs and a list of removed filectxs, and yields
                 (before, after, score) tuples of partial matches.

                 For each added file only the best scoring removed file is kept, and
                 only if its score is at least threshold. When two removed files
                 score the same, the one visited later replaces the earlier one.
                 '''
                 # one topic is used both to report and to close the progress, so
                 # that the topic opened in the loop is the one that gets finished
                 topic = _('searching for similar files')
                 copies = {}
                 for i, r in enumerate(removed):
                     repo.ui.progress(topic, i, total=len(removed), unit=_('files'))
                     # the removed file is read on first use only, so nothing is
                     # loaded when there are no added files to compare against
                     data = None
                     for a in added:
                         # best score recorded for this added file so far, or the
                         # threshold if it has no candidate yet
                         bestscore = copies.get(a, (None, threshold))[1]
                         if data is None:
                             data = _ctxdata(r)
                         myscore = _score(a, data)
                         if myscore >= bestscore:
                             copies[a] = (r, myscore)
                 # a position of None ends the topic; this used to pass a different
                 # topic ('searching'), which left the one above open
                 repo.ui.progress(topic, None)
                 for dest, v in copies.iteritems():
                     source, bscore = v
                     yield source, dest, bscore
             def findrenames(repo, added, removed, threshold):
                 '''find renamed files -- yields (before, after, score) tuples

                 before and after are file paths. Exact matches are reported first
                 with a score of 1.0. If threshold is below 1.0, the added files
                 without an exact match are then compared by content similarity.
                 '''
                 parentctx = repo['.']
                 workingctx = repo[None]
                 # Empty files are usually unrelated to one another, and recording
                 # such a file as renamed is likely to do more harm than good, so
                 # they are dropped up front and never take part in matching.
                 addedfiles = set()
                 for fp in added:
                     if workingctx[fp].size() > 0:
                         addedfiles.add(workingctx[fp])
                 removedfiles = set()
                 for fp in removed:
                     if fp not in parentctx:
                         continue
                     if parentctx[fp].size() > 0:
                         removedfiles.add(parentctx[fp])
                 # Exact matches come first; a matched added file is taken out so
                 # the similarity search below does not consider it again.
                 exact = _findexactmatches(repo, sorted(addedfiles),
                                           sorted(removedfiles))
                 for before, after in exact:
                     addedfiles.remove(after)
                     yield (before.path(), after.path(), 1.0)
                 # Similar files are searched for only when the user asked for it.
                 if threshold < 1.0:
                     similar = _findsimilarmatches(repo, sorted(addedfiles),
                                                   sorted(removedfiles), threshold)
                     for before, after, ratio in similar:
                         yield (before.path(), after.path(), ratio)