upstream/mercurial-mirror Commit - r30805:0ae287eb

1

# similar.py - mechanisms for finding similar files

1

# similar.py - mechanisms for finding similar files

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import hashlib

10

import hashlib

11

12

from .i18n import _

12

from .i18n import _

13

from . import (

13

from . import (

14

bdiff,

14

bdiff,

15

mdiff,

15

mdiff,

16

util,

16

util,

17

)

17

)

18

19

def _findexactmatches(repo, added, removed):

19

def _findexactmatches(repo, added, removed):

20

'''find renamed files that have no changes

20

'''find renamed files that have no changes

21

22

Takes a list of new filectxs and a list of removed filectxs, and yields

22

Takes a list of new filectxs and a list of removed filectxs, and yields

23

(before, after) tuples of exact matches.

23

(before, after) tuples of exact matches.

24

'''

24

'''

25

numfiles = len(added) + len(removed)

25

numfiles = len(added) + len(removed)

26

27

# Get hashes of removed files.

27

# Get hashes of removed files.

28

hashes = {}

28

hashes = {}

29

for i, fctx in enumerate(removed):

29

for i, fctx in enumerate(removed):

30

repo.ui.progress(_('searching for exact renames'), i, total=numfiles,

30

repo.ui.progress(_('searching for exact renames'), i, total=numfiles,

31

unit=_('files'))

31

unit=_('files'))

32

h = hashlib.sha1(fctx.data()).digest()

32

h = hashlib.sha1(fctx.data()).digest()

33

hashes[h] = fctx

33

hashes[h] = fctx

34

35

# For each added file, see if it corresponds to a removed file.

35

# For each added file, see if it corresponds to a removed file.

36

for i, fctx in enumerate(added):

36

for i, fctx in enumerate(added):

37

repo.ui.progress(_('searching for exact renames'), i + len(removed),

37

repo.ui.progress(_('searching for exact renames'), i + len(removed),

38

total=numfiles, unit=_('files'))

38

total=numfiles, unit=_('files'))

39

h = hashlib.sha1(fctx.data()).digest()

39

h = hashlib.sha1(fctx.data()).digest()

40

if h in hashes:

40

if h in hashes:

41

yield (hashes[h], fctx)

41

yield (hashes[h], fctx)

42

43

# Done

43

# Done

44

repo.ui.progress(_('searching for exact renames'), None)

44

repo.ui.progress(_('searching for exact renames'), None)

45

46

@util.cachefunc

47

def _ctxdata(fctx):

48

# lazily load text

49

orig = fctx.data()

50

return orig, mdiff.splitnewlines(orig)

51

52

@util.cachefunc

53

def score(fctx1, fctx2):

54

text = fctx1.data()

55

orig, lines = _ctxdata(fctx2)

56

# bdiff.blocks() returns blocks of matching lines

57

# count the number of bytes in each

58

equal = 0

59

matches = bdiff.blocks(text, orig)

60

for x1, x2, y1, y2 in matches:

61

for line in lines[y1:y2]:

62

equal += len(line)

63

64

lengths = len(text) + len(orig)

65

return equal * 2.0 / lengths

66

46

def _findsimilarmatches(repo, added, removed, threshold):

67

def _findsimilarmatches(repo, added, removed, threshold):

47

'''find potentially renamed files based on similar file content

68

'''find potentially renamed files based on similar file content

48

69

49

Takes a list of new filectxs and a list of removed filectxs, and yields

70

Takes a list of new filectxs and a list of removed filectxs, and yields

50

(before, after, score) tuples of partial matches.

71

(before, after, score) tuples of partial matches.

51

'''

72

'''

52

copies = {}

73

copies = {}

53

for i, r in enumerate(removed):

74

for i, r in enumerate(removed):

54

repo.ui.progress(_('searching for similar files'), i,

75

repo.ui.progress(_('searching for similar files'), i,

55

total=len(removed), unit=_('files'))

76

total=len(removed), unit=_('files'))

56

77

57

# lazily load text

58

@util.cachefunc

59

def data():

60

orig = r.data()

61

return orig, mdiff.splitnewlines(orig)

62

63

def score(text):

64

orig, lines = data()

65

# bdiff.blocks() returns blocks of matching lines

66

# count the number of bytes in each

67

equal = 0

68

matches = bdiff.blocks(text, orig)

69

for x1, x2, y1, y2 in matches:

70

for line in lines[y1:y2]:

71

equal += len(line)

72

73

lengths = len(text) + len(orig)

74

return equal * 2.0 / lengths

75

76

for a in added:

78

for a in added:

77

bestscore = copies.get(a, (None, threshold))[1]

79

bestscore = copies.get(a, (None, threshold))[1]

78

myscore = score(a.~~data~~())

80

myscore = score(a, r)

79

if myscore >= bestscore:

81

if myscore >= bestscore:

80

copies[a] = (r, myscore)

82

copies[a] = (r, myscore)

81

repo.ui.progress(_('searching'), None)

83

repo.ui.progress(_('searching'), None)

82

84

83

for dest, v in copies.iteritems():

85

for dest, v in copies.iteritems():

84

source, bscore = v

86

source, bscore = v

85

yield source, dest, bscore

87

yield source, dest, bscore

86

88

87

def findrenames(repo, added, removed, threshold):

89

def findrenames(repo, added, removed, threshold):

88

'''find renamed files -- yields (before, after, score) tuples'''

90

'''find renamed files -- yields (before, after, score) tuples'''

89

parentctx = repo['.']

91

parentctx = repo['.']

90

workingctx = repo[None]

92

workingctx = repo[None]

91

93

92

# Zero length files will be frequently unrelated to each other, and

94

# Zero length files will be frequently unrelated to each other, and

93

# tracking the deletion/addition of such a file will probably cause more

95

# tracking the deletion/addition of such a file will probably cause more

94

# harm than good. We strip them out here to avoid matching them later on.

96

# harm than good. We strip them out here to avoid matching them later on.

95

addedfiles = set([workingctx[fp] for fp in added

97

addedfiles = set([workingctx[fp] for fp in added

96

if workingctx[fp].size() > 0])

98

if workingctx[fp].size() > 0])

97

removedfiles = set([parentctx[fp] for fp in removed

99

removedfiles = set([parentctx[fp] for fp in removed

98

if fp in parentctx and parentctx[fp].size() > 0])

100

if fp in parentctx and parentctx[fp].size() > 0])

99

101

100

# Find exact matches.

102

# Find exact matches.

101

for (a, b) in _findexactmatches(repo,

103

for (a, b) in _findexactmatches(repo,

102

sorted(addedfiles), sorted(removedfiles)):

104

sorted(addedfiles), sorted(removedfiles)):

103

addedfiles.remove(b)

105

addedfiles.remove(b)

104

yield (a.path(), b.path(), 1.0)

106

yield (a.path(), b.path(), 1.0)

105

107

106

# If the user requested similar files to be matched, search for them also.

108

# If the user requested similar files to be matched, search for them also.

107

if threshold < 1.0:

109

if threshold < 1.0:

108

for (a, b, score) in _findsimilarmatches(repo,

110

for (a, b, score) in _findsimilarmatches(repo,

109

sorted(addedfiles), sorted(removedfiles), threshold):

111

sorted(addedfiles), sorted(removedfiles), threshold):

110

yield (a.path(), b.path(), score)

112

yield (a.path(), b.path(), score)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # similar.py - mechanisms for finding similar files
             #
             # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import hashlib
             from .i18n import _
             from . import (
                 bdiff,
                 mdiff,
                 util,
             )
             def _findexactmatches(repo, added, removed):
                 '''find renamed files that have no changes
                 Takes a list of new filectxs and a list of removed filectxs, and yields
                 (before, after) tuples of exact matches.
                 '''
                 numfiles = len(added) + len(removed)
                 # Get hashes of removed files.
                 hashes = {}
                 for i, fctx in enumerate(removed):
                     repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
                                      unit=_('files'))
                     h = hashlib.sha1(fctx.data()).digest()
                     hashes[h] = fctx
                 # For each added file, see if it corresponds to a removed file.
                 for i, fctx in enumerate(added):
                     repo.ui.progress(_('searching for exact renames'), i + len(removed),
                             total=numfiles, unit=_('files'))
                     h = hashlib.sha1(fctx.data()).digest()
                     if h in hashes:
                         yield (hashes[h], fctx)
                 # Done
                 repo.ui.progress(_('searching for exact renames'), None)
+            @util.cachefunc
+            def _ctxdata(fctx):
+                # lazily load text
+                orig = fctx.data()
+                return orig, mdiff.splitnewlines(orig)
+            @util.cachefunc
+            def score(fctx1, fctx2):
+                text = fctx1.data()
+                orig, lines = _ctxdata(fctx2)
+                # bdiff.blocks() returns blocks of matching lines
+                # count the number of bytes in each
+                equal = 0
+                matches = bdiff.blocks(text, orig)
+                for x1, x2, y1, y2 in matches:
+                    for line in lines[y1:y2]:
+                        equal += len(line)
+                lengths = len(text) + len(orig)
+                return equal * 2.0 / lengths
             def _findsimilarmatches(repo, added, removed, threshold):
                 '''find potentially renamed files based on similar file content
                 Takes a list of new filectxs and a list of removed filectxs, and yields
                 (before, after, score) tuples of partial matches.
                 '''
                 copies = {}
                 for i, r in enumerate(removed):
                     repo.ui.progress(_('searching for similar files'), i,
                                      total=len(removed), unit=_('files'))
-                    # lazily load text
-                    @util.cachefunc
-                    def data():
-                        orig = r.data()
-                        return orig, mdiff.splitnewlines(orig)
-                    def score(text):
-                        orig, lines = data()
-                        # bdiff.blocks() returns blocks of matching lines
-                        # count the number of bytes in each
-                        equal = 0
-                        matches = bdiff.blocks(text, orig)
-                        for x1, x2, y1, y2 in matches:
-                            for line in lines[y1:y2]:
-                                equal += len(line)
-                        lengths = len(text) + len(orig)
-                        return equal * 2.0 / lengths
                     for a in added:
                         bestscore = copies.get(a, (None, threshold))[1]
-                        myscore = score(a.data())
+                        myscore = score(a, r)
                         if myscore >= bestscore:
                             copies[a] = (r, myscore)
                 repo.ui.progress(_('searching'), None)
                 for dest, v in copies.iteritems():
                     source, bscore = v
                     yield source, dest, bscore
             def findrenames(repo, added, removed, threshold):
                 '''find renamed files -- yields (before, after, score) tuples'''
                 parentctx = repo['.']
                 workingctx = repo[None]
                 # Zero length files will be frequently unrelated to each other, and
                 # tracking the deletion/addition of such a file will probably cause more
                 # harm than good. We strip them out here to avoid matching them later on.
                 addedfiles = set([workingctx[fp] for fp in added
                         if workingctx[fp].size() > 0])
                 removedfiles = set([parentctx[fp] for fp in removed
                         if fp in parentctx and parentctx[fp].size() > 0])
                 # Find exact matches.
                 for (a, b) in _findexactmatches(repo,
                         sorted(addedfiles), sorted(removedfiles)):
                     addedfiles.remove(b)
                     yield (a.path(), b.path(), 1.0)
                 # If the user requested similar files to be matched, search for them also.
                 if threshold < 1.0:
                     for (a, b, score) in _findsimilarmatches(repo,
                             sorted(addedfiles), sorted(removedfiles), threshold):
                         yield (a.path(), b.path(), score)