upstream/mercurial-mirror Commit - r29337:f72d0c21

1

# similar.py - mechanisms for finding similar files

1

# similar.py - mechanisms for finding similar files

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

from .i18n import _

10

from .i18n import _

11

from . import (

11

from . import (

12

bdiff,

12

bdiff,

13

mdiff,

13

mdiff,

14

util,

14

util,

15

)

15

)

16

17

def _findexactmatches(repo, added, removed):

17

def _findexactmatches(repo, added, removed):

18

'''find renamed files that have no changes

18

'''find renamed files that have no changes

19

20

Takes a list of new filectxs and a list of removed filectxs, and yields

20

Takes a list of new filectxs and a list of removed filectxs, and yields

21

(before, after) tuples of exact matches.

21

(before, after) tuples of exact matches.

22

'''

22

'''

23

numfiles = len(added) + len(removed)

23

numfiles = len(added) + len(removed)

24

25

# Get hashes of removed files.

25

# Get hashes of removed files.

26

hashes = {}

26

hashes = {}

27

for i, fctx in enumerate(removed):

27

for i, fctx in enumerate(removed):

28

repo.ui.progress(_('searching for exact renames'), i, total=numfiles,

28

repo.ui.progress(_('searching for exact renames'), i, total=numfiles,

29

unit=_('files'))

29

unit=_('files'))

30

h = util.sha1(fctx.data()).digest()

30

h = util.sha1(fctx.data()).digest()

31

hashes[h] = fctx

31

hashes[h] = fctx

32

33

# For each added file, see if it corresponds to a removed file.

33

# For each added file, see if it corresponds to a removed file.

34

for i, fctx in enumerate(added):

34

for i, fctx in enumerate(added):

35

repo.ui.progress(_('searching for exact renames'), i + len(removed),

35

repo.ui.progress(_('searching for exact renames'), i + len(removed),

36

total=numfiles, unit=_('files'))

36

total=numfiles, unit=_('files'))

37

h = util.sha1(fctx.data()).digest()

37

h = util.sha1(fctx.data()).digest()

38

if h in hashes:

38

if h in hashes:

39

yield (hashes[h], fctx)

39

yield (hashes[h], fctx)

40

41

# Done

41

# Done

42

repo.ui.progress(_('searching for exact renames'), None)

42

repo.ui.progress(_('searching for exact renames'), None)

43

44

def _findsimilarmatches(repo, added, removed, threshold):

44

def _findsimilarmatches(repo, added, removed, threshold):

45

'''find potentially renamed files based on similar file content

45

'''find potentially renamed files based on similar file content

46

47

Takes a list of new filectxs and a list of removed filectxs, and yields

47

Takes a list of new filectxs and a list of removed filectxs, and yields

48

(before, after, score) tuples of partial matches.

48

(before, after, score) tuples of partial matches.

49

'''

49

'''

50

copies = {}

50

copies = {}

51

for i, r in enumerate(removed):

51

for i, r in enumerate(removed):

52

repo.ui.progress(_('searching for similar files'), i,

52

repo.ui.progress(_('searching for similar files'), i,

53

total=len(removed), unit=_('files'))

53

total=len(removed), unit=_('files'))

54

55

# lazily load text

55

# lazily load text

56

@util.cachefunc

56

@util.cachefunc

57

def data():

57

def data():

58

orig = r.data()

58

orig = r.data()

59

return orig, mdiff.splitnewlines(orig)

59

return orig, mdiff.splitnewlines(orig)

60

61

def score(text):

61

def score(text):

62

orig, lines = data()

62

orig, lines = data()

63

# bdiff.blocks() returns blocks of matching lines

63

# bdiff.blocks() returns blocks of matching lines

64

# count the number of bytes in each

64

# count the number of bytes in each

65

equal = 0

65

equal = 0

66

matches = bdiff.blocks(text, orig)

66

matches = bdiff.blocks(text, orig)

67

for x1, x2, y1, y2 in matches:

67

for x1, x2, y1, y2 in matches:

68

for line in lines[y1:y2]:

68

for line in lines[y1:y2]:

69

equal += len(line)

69

equal += len(line)

70

71

lengths = len(text) + len(orig)

71

lengths = len(text) + len(orig)

72

return equal * 2.0 / lengths

72

return equal * 2.0 / lengths

73

74

for a in added:

74

for a in added:

75

bestscore = copies.get(a, (None, threshold))[1]

75

bestscore = copies.get(a, (None, threshold))[1]

76

myscore = score(a.data())

76

myscore = score(a.data())

77

if myscore >= bestscore:

77

if myscore >= bestscore:

78

copies[a] = (r, myscore)

78

copies[a] = (r, myscore)

79

repo.ui.progress(_('searching'), None)

79

repo.ui.progress(_('searching'), None)

80

81

for dest, v in copies.iteritems():

81

for dest, v in copies.iteritems():

82

source, score = v

82

source, score = v

83

yield source, dest, score

83

yield source, dest, score

84

85

def findrenames(repo, added, removed, threshold):

85

def findrenames(repo, added, removed, threshold):

86

'''find renamed files -- yields (before, after, score) tuples'''

86

'''find renamed files -- yields (before, after, score) tuples'''

87

parentctx = repo['.']

87

parentctx = repo['.']

88

workingctx = repo[None]

88

workingctx = repo[None]

89

90

# Zero length files will be frequently unrelated to each other, and

90

# Zero length files will be frequently unrelated to each other, and

91

# tracking the deletion/addition of such a file will probably cause more

91

# tracking the deletion/addition of such a file will probably cause more

92

# harm than good. We strip them out here to avoid matching them later on.

92

# harm than good. We strip them out here to avoid matching them later on.

93

addedfiles = set([workingctx[fp] for fp in added

93

addedfiles = set([workingctx[fp] for fp in added

94

if workingctx[fp].size() > 0])

94

if workingctx[fp].size() > 0])

95

removedfiles = set([parentctx[fp] for fp in removed

95

removedfiles = set([parentctx[fp] for fp in removed

96

if fp in parentctx and parentctx[fp].size() > 0])

96

if fp in parentctx and parentctx[fp].size() > 0])

97

98

# Find exact matches.

98

# Find exact matches.

99

for (a, b) in _findexactmatches(repo,

99

for (a, b) in _findexactmatches(repo,

100

sorted(addedfiles), sorted(removedfiles)):

100

sorted(addedfiles), sorted(removedfiles)):

101

addedfiles.remove(b)

101

addedfiles.remove(b)

102

yield (a.path(), b.path(), 1.0)

102

yield (a.path(), b.path(), 1.0)

103

104

# If the user requested similar files to be matched, search for them also.

104

# If the user requested similar files to be matched, search for them also.

105

if threshold < 1.0:

105

if threshold < 1.0:

106

for (a, b, score) in _findsimilarmatches(repo,

106

for (a, b, score) in _findsimilarmatches(repo,

107

sorted(addedfiles), sorted(removedfiles), threshold):

107

sorted(addedfiles), sorted(removedfiles), threshold):

108

yield (a.path(), b.path(), score)

108

yield (a.path(), b.path(), score)

109

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # similar.py - mechanisms for finding similar files
             #
             # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             from .i18n import _
             from . import (
                 bdiff,
                 mdiff,
                 util,
             )
             def _findexactmatches(repo, added, removed):
                 '''find renamed files that have no changes

                 Takes a list of new filectxs and a list of removed filectxs, and yields
                 (before, after) tuples of exact matches.

                 Two files count as an exact match when the SHA-1 digests of their
                 data are equal. Progress is reported through repo.ui.progress()
                 over the combined number of removed and added files.
                 '''
                 topic = _('searching for exact renames')
                 unit = _('files')
                 removedcount = len(removed)
                 total = len(added) + removedcount

                 # Index the removed files by content digest. When several removed
                 # files share the same content, the last one seen is kept.
                 removedbydigest = {}
                 for pos, oldfctx in enumerate(removed):
                     repo.ui.progress(topic, pos, total=total, unit=unit)
                     removedbydigest[util.sha1(oldfctx.data()).digest()] = oldfctx

                 # Look every added file up in that index; progress continues
                 # counting from where the removed files left off.
                 for pos, newfctx in enumerate(added):
                     repo.ui.progress(topic, pos + removedcount, total=total,
                                      unit=unit)
                     digest = util.sha1(newfctx.data()).digest()
                     if digest not in removedbydigest:
                         continue
                     yield (removedbydigest[digest], newfctx)

                 # Close the progress topic once every file has been examined.
                 repo.ui.progress(topic, None)
             def _findsimilarmatches(repo, added, removed, threshold):
                 '''find potentially renamed files based on similar file content

                 Takes a list of new filectxs and a list of removed filectxs, and yields
                 (before, after, score) tuples of partial matches.

                 Every removed file is scored against every added file. For each
                 added file, only the best-scoring removed file whose score is at
                 least ``threshold`` is kept; on equal scores the removed file seen
                 later replaces the earlier one.
                 '''
                 # Use a single topic for both updating and closing the progress
                 # output, so the topic opened in the loop is the one that gets closed.
                 topic = _('searching for similar files')
                 copies = {}
                 for i, r in enumerate(removed):
                     repo.ui.progress(topic, i, total=len(removed), unit=_('files'))

                     # lazily load text
                     @util.cachefunc
                     def data():
                         orig = r.data()
                         return orig, mdiff.splitnewlines(orig)

                     def score(text):
                         '''return 2 * matching bytes / (len(text) + len(orig))'''
                         orig, lines = data()
                         # bdiff.blocks() returns blocks of matching lines
                         # count the number of bytes in each
                         equal = 0
                         matches = bdiff.blocks(text, orig)
                         for x1, x2, y1, y2 in matches:
                             equal += sum(len(line) for line in lines[y1:y2])

                         # NOTE(review): this divides by zero if both texts are
                         # empty; findrenames() filters out zero-length files before
                         # calling here -- confirm for any other caller.
                         lengths = len(text) + len(orig)
                         return equal * 2.0 / lengths

                     for a in added:
                         # Start from the threshold so that weaker matches are
                         # never recorded for this added file.
                         bestscore = copies.get(a, (None, threshold))[1]
                         myscore = score(a.data())
                         if myscore >= bestscore:
                             copies[a] = (r, myscore)
                 repo.ui.progress(topic, None)

                 # The local is deliberately not called "score", to avoid shadowing
                 # the nested score() helper defined above.
                 for dest, (source, simscore) in copies.iteritems():
                     yield source, dest, simscore
             def findrenames(repo, added, removed, threshold):
                 '''find renamed files -- yields (before, after, score) tuples

                 ``added`` and ``removed`` are file paths. Added paths are looked up
                 in the working context (repo[None]) and removed paths in its parent
                 (repo['.']). Exact matches are yielded first with a score of 1.0;
                 similarity matches are searched only when ``threshold`` is below 1.0.
                 '''
                 parentctx = repo['.']
                 workingctx = repo[None]

                 # Empty files are usually unrelated to one another, so pairing a
                 # deleted empty file with a new empty one would probably do more
                 # harm than good. Leave them out of both candidate sets up front.
                 candidates = set()
                 for fp in added:
                     if workingctx[fp].size() > 0:
                         candidates.add(workingctx[fp])

                 origins = set()
                 for fp in removed:
                     if fp not in parentctx:
                         continue
                     if parentctx[fp].size() > 0:
                         origins.add(parentctx[fp])

                 # Exact matches first. Each matched added file is dropped from the
                 # candidate set so it is not considered again below.
                 exact = _findexactmatches(repo, sorted(candidates), sorted(origins))
                 for before, after in exact:
                     candidates.remove(after)
                     yield (before.path(), after.path(), 1.0)

                 # Only look for partial matches when the caller asked for them.
                 if not threshold < 1.0:
                     return

                 similar = _findsimilarmatches(repo, sorted(candidates),
                                               sorted(origins), threshold)
                 for before, after, score in similar:
                     yield (before.path(), after.path(), score)