upstream/mercurial-mirror Commit - r38414:59c9d3cc

1

# similar.py - mechanisms for finding similar files

1

# similar.py - mechanisms for finding similar files

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

from .i18n import _

10

from .i18n import _

11

from . import (

11

from . import (

12

mdiff,

12

mdiff,

13

)

13

)

14

15

def _findexactmatches(repo, added, removed):

15

def _findexactmatches(repo, added, removed):

16

'''find renamed files that have no changes

16

'''find renamed files that have no changes

17

18

Takes a list of new filectxs and a list of removed filectxs, and yields

18

Takes a list of new filectxs and a list of removed filectxs, and yields

19

(before, after) tuples of exact matches.

19

(before, after) tuples of exact matches.

20

'''

20

'''

21

# Build table of removed files: {hash(fctx.data()): [fctx, ...]}.

21

# Build table of removed files: {hash(fctx.data()): [fctx, ...]}.

22

# We use hash() to discard fctx.data() from memory.

22

# We use hash() to discard fctx.data() from memory.

23

hashes = {}

23

hashes = {}

24

progress = repo.ui.makeprogress(_('searching for exact renames'),

24

progress = repo.ui.makeprogress(_('searching for exact renames'),

25

total=(len(added) + len(removed)),

25

total=(len(added) + len(removed)),

26

unit=_('files'))

26

unit=_('files'))

27

for fctx in removed:

27

for fctx in removed:

28

progress.increment()

28

progress.increment()

29

h = hash(fctx.data())

29

h = hash(fctx.data())

30

if h not in hashes:

30

if h not in hashes:

31

hashes[h] = [fctx]

31

hashes[h] = [fctx]

32

else:

32

else:

33

hashes[h].append(fctx)

33

hashes[h].append(fctx)

34

35

# For each added file, see if it corresponds to a removed file.

35

# For each added file, see if it corresponds to a removed file.

36

for fctx in added:

36

for fctx in added:

37

progress.increment()

37

progress.increment()

38

adata = fctx.data()

38

adata = fctx.data()

39

h = hash(adata)

39

h = hash(adata)

40

for rfctx in hashes.get(h, []):

40

for rfctx in hashes.get(h, []):

41

# compare between actual file contents for exact identity

41

# compare between actual file contents for exact identity

42

if adata == rfctx.data():

42

if adata == rfctx.data():

43

yield (rfctx, fctx)

43

yield (rfctx, fctx)

44

break

44

break

45

46

# Done

46

# Done

47

progress.complete()

47

progress.complete()

48

49

def _ctxdata(fctx):

49

def _ctxdata(fctx):

50

# lazily load text

50

# lazily load text

51

orig = fctx.data()

51

orig = fctx.data()

52

return orig, mdiff.splitnewlines(orig)

52

return orig, mdiff.splitnewlines(orig)

53

54

def _score(fctx, otherdata):

54

def _score(fctx, otherdata):

55

orig, lines = otherdata

55

orig, lines = otherdata

56

text = fctx.data()

56

text = fctx.data()

57

# mdiff.blocks() returns blocks of matching lines

57

# mdiff.blocks() returns blocks of matching lines

58

# count the number of bytes in each

58

# count the number of bytes in each

59

equal = 0

59

equal = 0

60

matches = mdiff.blocks(text, orig)

60

matches = mdiff.blocks(text, orig)

61

for x1, x2, y1, y2 in matches:

61

for x1, x2, y1, y2 in matches:

62

for line in lines[y1:y2]:

62

for line in lines[y1:y2]:

63

equal += len(line)

63

equal += len(line)

64

65

lengths = len(text) + len(orig)

65

lengths = len(text) + len(orig)

66

return equal * 2.0 / lengths

66

return equal * 2.0 / lengths

67

68

def score(fctx1, fctx2):

68

def score(fctx1, fctx2):

69

return _score(fctx1, _ctxdata(fctx2))

69

return _score(fctx1, _ctxdata(fctx2))

70

71

def _findsimilarmatches(repo, added, removed, threshold):

71

def _findsimilarmatches(repo, added, removed, threshold):

72

'''find potentially renamed files based on similar file content

72

'''find potentially renamed files based on similar file content

73

74

Takes a list of new filectxs and a list of removed filectxs, and yields

74

Takes a list of new filectxs and a list of removed filectxs, and yields

75

(before, after, score) tuples of partial matches.

75

(before, after, score) tuples of partial matches.

76

'''

76

'''

77

copies = {}

77

copies = {}

78

for i, r in enumerate(removed):

78

progress = repo.ui.makeprogress(_('searching for similar files'),

79

repo.ui.progress(_('searching for similar files'), i,

79

unit=_('files'), total=len(removed))

80

total=len(removed), unit=_('files'))

80

for r in removed:

81

progress.increment()

82

data = None

82

data = None

83

for a in added:

83

for a in added:

84

bestscore = copies.get(a, (None, threshold))[1]

84

bestscore = copies.get(a, (None, threshold))[1]

85

if data is None:

85

if data is None:

86

data = _ctxdata(r)

86

data = _ctxdata(r)

87

myscore = _score(a, data)

87

myscore = _score(a, data)

88

if myscore > bestscore:

88

if myscore > bestscore:

89

copies[a] = (r, myscore)

89

copies[a] = (r, myscore)

90

repo.ui.progress(_('searching'), None)

90

progress.complete()

91

92

for dest, v in copies.iteritems():

92

for dest, v in copies.iteritems():

93

source, bscore = v

93

source, bscore = v

94

yield source, dest, bscore

94

yield source, dest, bscore

95

96

def _dropempty(fctxs):

96

def _dropempty(fctxs):

97

return [x for x in fctxs if x.size() > 0]

97

return [x for x in fctxs if x.size() > 0]

98

99

def findrenames(repo, added, removed, threshold):

99

def findrenames(repo, added, removed, threshold):

100

'''find renamed files -- yields (before, after, score) tuples'''

100

'''find renamed files -- yields (before, after, score) tuples'''

101

wctx = repo[None]

101

wctx = repo[None]

102

pctx = wctx.p1()

102

pctx = wctx.p1()

103

104

# Zero length files will be frequently unrelated to each other, and

104

# Zero length files will be frequently unrelated to each other, and

105

# tracking the deletion/addition of such a file will probably cause more

105

# tracking the deletion/addition of such a file will probably cause more

106

# harm than good. We strip them out here to avoid matching them later on.

106

# harm than good. We strip them out here to avoid matching them later on.

107

addedfiles = _dropempty(wctx[fp] for fp in sorted(added))

107

addedfiles = _dropempty(wctx[fp] for fp in sorted(added))

108

removedfiles = _dropempty(pctx[fp] for fp in sorted(removed) if fp in pctx)

108

removedfiles = _dropempty(pctx[fp] for fp in sorted(removed) if fp in pctx)

109

110

# Find exact matches.

110

# Find exact matches.

111

matchedfiles = set()

111

matchedfiles = set()

112

for (a, b) in _findexactmatches(repo, addedfiles, removedfiles):

112

for (a, b) in _findexactmatches(repo, addedfiles, removedfiles):

113

matchedfiles.add(b)

113

matchedfiles.add(b)

114

yield (a.path(), b.path(), 1.0)

114

yield (a.path(), b.path(), 1.0)

115

116

# If the user requested similar files to be matched, search for them also.

116

# If the user requested similar files to be matched, search for them also.

117

if threshold < 1.0:

117

if threshold < 1.0:

118

addedfiles = [x for x in addedfiles if x not in matchedfiles]

118

addedfiles = [x for x in addedfiles if x not in matchedfiles]

119

for (a, b, score) in _findsimilarmatches(repo, addedfiles,

119

for (a, b, score) in _findsimilarmatches(repo, addedfiles,

120

removedfiles, threshold):

120

removedfiles, threshold):

121

yield (a.path(), b.path(), score)

121

yield (a.path(), b.path(), score)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # similar.py - mechanisms for finding similar files
             #
             # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             from .i18n import _
             from . import (
                 mdiff,
             )
             def _findexactmatches(repo, added, removed):
                 '''find renamed files whose content did not change

                 added and removed are lists of filectxs for the new and the removed
                 files. Yields a (before, after) tuple for every added file whose data
                 is byte-for-byte equal to that of a removed file. An added file is
                 yielded at most once (with the first matching removed file), while
                 the same removed file may be paired with several added files.

                 Progress is reported through repo.ui; it is only marked complete
                 once the generator has been fully consumed.
                 '''
                 progress = repo.ui.makeprogress(_('searching for exact renames'),
                                                 total=(len(added) + len(removed)),
                                                 unit=_('files'))
                 # Group the removed files by hash(data). Keying on the hash means
                 # the table itself does not keep the file contents alive.
                 byhash = {}
                 for old in removed:
                     progress.increment()
                     byhash.setdefault(hash(old.data()), []).append(old)
                 # Look every added file up in that table.
                 for new in added:
                     progress.increment()
                     content = new.data()
                     for old in byhash.get(hash(content), []):
                         # equal hashes are only a hint; compare the real data
                         if content == old.data():
                             yield (old, new)
                             break
                 progress.complete()
             def _ctxdata(fctx):
                 '''return (data, lines) for fctx

                 data is whatever fctx.data() returns and lines is that same data
                 passed through mdiff.splitnewlines(). This is the "otherdata" pair
                 that _score() unpacks.
                 '''
                 content = fctx.data()
                 lines = mdiff.splitnewlines(content)
                 return content, lines
             def _score(fctx, otherdata):
                 '''return how similar fctx's data is to otherdata, as a float

                 otherdata is an (orig, lines) pair as produced by _ctxdata(): the
                 content to compare against and that content split into lines.

                 The score is twice the number of bytes found in matching lines,
                 divided by the combined length of both contents, so it reaches 1.0
                 when every byte of both sides sits in a matching line.

                 When both contents are empty there is nothing to compare and 0.0 is
                 returned instead of dividing by zero. This agrees with findrenames(),
                 which refuses to pair up zero length files.
                 '''
                 orig, lines = otherdata
                 text = fctx.data()
                 lengths = len(text) + len(orig)
                 if not lengths:
                     # both sides empty: the division below would raise
                     # ZeroDivisionError, and no diff is needed to know the answer
                     return 0.0
                 # mdiff.blocks() returns blocks of matching lines; the y1:y2 range
                 # is used to index the lines of orig, so count the bytes in each
                 equal = 0
                 for x1, x2, y1, y2 in mdiff.blocks(text, orig):
                     equal += sum(len(line) for line in lines[y1:y2])
                 return equal * 2.0 / lengths
             def score(fctx1, fctx2):
                 '''return the similarity score of two filectxs

                 Loads fctx2 through _ctxdata() and hands the result, together with
                 fctx1, to _score().
                 '''
                 otherdata = _ctxdata(fctx2)
                 return _score(fctx1, otherdata)
             def _findsimilarmatches(repo, added, removed, threshold):
                 '''find potentially renamed files based on similar file content

                 Takes a list of new filectxs and a list of removed filectxs, and yields
                 (before, after, score) tuples of partial matches.

                 Only pairs scoring strictly above threshold are kept, and every added
                 file is yielded at most once, paired with the removed file that scored
                 highest against it (the earliest one wins a tie).
                 '''
                 copies = {}
                 progress = repo.ui.makeprogress(_('searching for similar files'),
                                                 unit=_('files'), total=len(removed))
                 for r in removed:
                     progress.increment()
                     # r is only loaded once an added file needs it, so nothing is
                     # read when added is empty
                     data = None
                     for a in added:
                         # best score recorded for a so far, else the threshold
                         bestscore = copies.get(a, (None, threshold))[1]
                         if data is None:
                             data = _ctxdata(r)
                         myscore = _score(a, data)
                         if myscore > bestscore:
                             copies[a] = (r, myscore)
                 progress.complete()
                 # items() rather than iteritems() so this also runs on Python 3
                 for dest, (source, bscore) in copies.items():
                     yield source, dest, bscore
             def _dropempty(fctxs):
                 '''return a list of the given filectxs whose size() is above zero'''
                 nonempty = []
                 for fctx in fctxs:
                     if fctx.size() > 0:
                         nonempty.append(fctx)
                 return nonempty
             def findrenames(repo, added, removed, threshold):
                 '''find renamed files -- yields (before, after, score) tuples

                 added and removed are iterables of file names; added names are looked
                 up in the working context (repo[None]) and removed names in its first
                 parent. before and after are yielded as paths. Exact matches are
                 yielded first with a score of 1.0; similarity matches follow only
                 when threshold is below 1.0.
                 '''
                 working = repo[None]
                 parent = working.p1()
                 # Zero length files will be frequently unrelated to each other, and
                 # pairing the deletion of one with the addition of another will
                 # probably cause more harm than good, so they are dropped up front.
                 newfiles = _dropempty(working[name] for name in sorted(added))
                 oldfiles = _dropempty(parent[name] for name in sorted(removed)
                                       if name in parent)
                 # Exact matches come first; remember which added files they used.
                 exact = set()
                 for old, new in _findexactmatches(repo, newfiles, oldfiles):
                     exact.add(new)
                     yield (old.path(), new.path(), 1.0)
                 # Similarity matching is only done when the user asked for it.
                 if not threshold < 1.0:
                     return
                 remaining = [fctx for fctx in newfiles if fctx not in exact]
                 for old, new, similarity in _findsimilarmatches(repo, remaining,
                                                                 oldfiles, threshold):
                     yield (old.path(), new.path(), similarity)