upstream/mercurial-mirror Files · mercurial/metadata.py

metadata: move computation related to files touched in a dedicated module...

metadata: move computation related to files touched in a dedicated module This was suggested by Yuya Nishihara a while back. Since I am about to add more metadata related computation, lets create a new repositories. Differential Revision: https://phab.mercurial-scm.org/D8587

marmoute - - Load All Authors

File last commit:

r45466:4c1d3921 default


                r45466:4c1d3921

default

Download file

             metadata.py
        
                    268 lines
            
             | 8.3 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / mercurial / metadata.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      # metadata.py -- code related to various metadata computation and access.

      #

      # Copyright 2019 Google, Inc <martinvonz@google.com>

      # Copyright 2020 Pierre-Yves David <pierre-yves.david@octobus.net>

      #

      # This software may be used and distributed according to the terms of the

      # GNU General Public License version 2 or any later version.

      from __future__ import absolute_import, print_function

      import multiprocessing

      from . import (

          error,

          pycompat,

          util,

      )

      from .revlogutils import (

          flagutil as sidedataflag,

          sidedata as sidedatamod,

      )

      def computechangesetfilesadded(ctx):

          """return the list of files added in a changeset

          """

          added = []

          for f in ctx.files():

              if not any(f in p for p in ctx.parents()):

                  added.append(f)

          return added

      def computechangesetfilesremoved(ctx):

          """return the list of files removed in a changeset

          """

          removed = []

          for f in ctx.files():

              if f not in ctx:

                  removed.append(f)

          return removed

      def computechangesetcopies(ctx):

          """return the copies data for a changeset

          The copies data are returned as a pair of dictionnary (p1copies, p2copies).

          Each dictionnary are in the form: `{newname: oldname}`

          """

          p1copies = {}

          p2copies = {}

          p1 = ctx.p1()

          p2 = ctx.p2()

          narrowmatch = ctx._repo.narrowmatch()

          for dst in ctx.files():

              if not narrowmatch(dst) or dst not in ctx:

                  continue

              copied = ctx[dst].renamed()

              if not copied:

                  continue

              src, srcnode = copied

              if src in p1 and p1[src].filenode() == srcnode:

                  p1copies[dst] = src

              elif src in p2 and p2[src].filenode() == srcnode:

                  p2copies[dst] = src

          return p1copies, p2copies

      def encodecopies(files, copies):

          items = []

          for i, dst in enumerate(files):

              if dst in copies:

                  items.append(b'%d\0%s' % (i, copies[dst]))

          if len(items) != len(copies):

              raise error.ProgrammingError(

                  b'some copy targets missing from file list'

              )

          return b"\n".join(items)

      def decodecopies(files, data):

          try:

              copies = {}

              if not data:

                  return copies

              for l in data.split(b'\n'):

                  strindex, src = l.split(b'\0')

                  i = int(strindex)

                  dst = files[i]

                  copies[dst] = src

              return copies

          except (ValueError, IndexError):

              # Perhaps someone had chosen the same key name (e.g. "p1copies") and

              # used different syntax for the value.

              return None

      def encodefileindices(files, subset):

          subset = set(subset)

          indices = []

          for i, f in enumerate(files):

              if f in subset:

                  indices.append(b'%d' % i)

          return b'\n'.join(indices)

      def decodefileindices(files, data):

          try:

              subset = []

              if not data:

                  return subset

              for strindex in data.split(b'\n'):

                  i = int(strindex)

                  if i < 0 or i >= len(files):

                      return None

                  subset.append(files[i])

              return subset

          except (ValueError, IndexError):

              # Perhaps someone had chosen the same key name (e.g. "added") and

              # used different syntax for the value.

              return None

      def _getsidedata(srcrepo, rev):

          ctx = srcrepo[rev]

          filescopies = computechangesetcopies(ctx)

          filesadded = computechangesetfilesadded(ctx)

          filesremoved = computechangesetfilesremoved(ctx)

          sidedata = {}

          if any([filescopies, filesadded, filesremoved]):

              sortedfiles = sorted(ctx.files())

              p1copies, p2copies = filescopies

              p1copies = encodecopies(sortedfiles, p1copies)

              p2copies = encodecopies(sortedfiles, p2copies)

              filesadded = encodefileindices(sortedfiles, filesadded)

              filesremoved = encodefileindices(sortedfiles, filesremoved)

              if p1copies:

                  sidedata[sidedatamod.SD_P1COPIES] = p1copies

              if p2copies:

                  sidedata[sidedatamod.SD_P2COPIES] = p2copies

              if filesadded:

                  sidedata[sidedatamod.SD_FILESADDED] = filesadded

              if filesremoved:

                  sidedata[sidedatamod.SD_FILESREMOVED] = filesremoved

          return sidedata

      def getsidedataadder(srcrepo, destrepo):

          use_w = srcrepo.ui.configbool(b'experimental', b'worker.repository-upgrade')

          if pycompat.iswindows or not use_w:

              return _get_simple_sidedata_adder(srcrepo, destrepo)

          else:

              return _get_worker_sidedata_adder(srcrepo, destrepo)

      def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):

          """The function used by worker precomputing sidedata

          It read an input queue containing revision numbers

          It write in an output queue containing (rev, <sidedata-map>)

          The `None` input value is used as a stop signal.

          The `tokens` semaphore is user to avoid having too many unprocessed

          entries. The workers needs to acquire one token before fetching a task.

          They will be released by the consumer of the produced data.

          """

          tokens.acquire()

          rev = revs_queue.get()

          while rev is not None:

              data = _getsidedata(srcrepo, rev)

              sidedata_queue.put((rev, data))

              tokens.acquire()

              rev = revs_queue.get()

          # processing of `None` is completed, release the token.

          tokens.release()

      BUFF_PER_WORKER = 50

      def _get_worker_sidedata_adder(srcrepo, destrepo):

          """The parallel version of the sidedata computation

          This code spawn a pool of worker that precompute a buffer of sidedata

          before we actually need them"""

          # avoid circular import copies -> scmutil -> worker -> copies

          from . import worker

          nbworkers = worker._numworkers(srcrepo.ui)

          tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)

          revsq = multiprocessing.Queue()

          sidedataq = multiprocessing.Queue()

          assert srcrepo.filtername is None

          # queue all tasks beforehand, revision numbers are small and it make

          # synchronisation simpler

          #

          # Since the computation for each node can be quite expensive, the overhead

          # of using a single queue is not revelant. In practice, most computation

          # are fast but some are very expensive and dominate all the other smaller

          # cost.

          for r in srcrepo.changelog.revs():

              revsq.put(r)

          # queue the "no more tasks" markers

          for i in range(nbworkers):

              revsq.put(None)

          allworkers = []

          for i in range(nbworkers):

              args = (srcrepo, revsq, sidedataq, tokens)

              w = multiprocessing.Process(target=_sidedata_worker, args=args)

              allworkers.append(w)

              w.start()

          # dictionnary to store results for revision higher than we one we are

          # looking for. For example, if we need the sidedatamap for 42, and 43 is

          # received, when shelve 43 for later use.

          staging = {}

          def sidedata_companion(revlog, rev):

              sidedata = {}

              if util.safehasattr(revlog, b'filteredrevs'):  # this is a changelog

                  # Is the data previously shelved ?

                  sidedata = staging.pop(rev, None)

                  if sidedata is None:

                      # look at the queued result until we find the one we are lookig

                      # for (shelve the other ones)

                      r, sidedata = sidedataq.get()

                      while r != rev:

                          staging[r] = sidedata

                          r, sidedata = sidedataq.get()

                  tokens.release()

              return False, (), sidedata

          return sidedata_companion

      def _get_simple_sidedata_adder(srcrepo, destrepo):

          """The simple version of the sidedata computation

          It just compute it in the same thread on request"""

          def sidedatacompanion(revlog, rev):

              sidedata = {}

              if util.safehasattr(revlog, 'filteredrevs'):  # this is a changelog

                  sidedata = _getsidedata(srcrepo, rev)

              return False, (), sidedata

          return sidedatacompanion

      def getsidedataremover(srcrepo, destrepo):

          def sidedatacompanion(revlog, rev):

              f = ()

              if util.safehasattr(revlog, 'filteredrevs'):  # this is a changelog

                  if revlog.flags(rev) & sidedataflag.REVIDX_SIDEDATA:

                      f = (

                          sidedatamod.SD_P1COPIES,

                          sidedatamod.SD_P2COPIES,

                          sidedatamod.SD_FILESADDED,

                          sidedatamod.SD_FILESREMOVED,

                      )

              return False, f, {}

          return sidedatacompanion

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				# metadata.py -- code related to various metadata computation and access.
				#
				# Copyright 2019 Google, Inc <martinvonz@google.com>
				# Copyright 2020 Pierre-Yves David <pierre-yves.david@octobus.net>
				#
				# This software may be used and distributed according to the terms of the
				# GNU General Public License version 2 or any later version.
				from __future__ import absolute_import, print_function

				import multiprocessing

				from . import (
				error,
				pycompat,
				util,
				)

				from .revlogutils import (
				flagutil as sidedataflag,
				sidedata as sidedatamod,
				)


				def computechangesetfilesadded(ctx):
				"""return the list of files added in a changeset
				"""
				added = []
				for f in ctx.files():
				if not any(f in p for p in ctx.parents()):
				added.append(f)
				return added


				def computechangesetfilesremoved(ctx):
				"""return the list of files removed in a changeset
				"""
				removed = []
				for f in ctx.files():
				if f not in ctx:
				removed.append(f)
				return removed


				def computechangesetcopies(ctx):
				"""return the copies data for a changeset

				The copies data are returned as a pair of dictionnary (p1copies, p2copies).

				Each dictionnary are in the form: `{newname: oldname}`
				"""
				p1copies = {}
				p2copies = {}
				p1 = ctx.p1()
				p2 = ctx.p2()
				narrowmatch = ctx._repo.narrowmatch()
				for dst in ctx.files():
				if not narrowmatch(dst) or dst not in ctx:
				continue
				copied = ctx[dst].renamed()
				if not copied:
				continue
				src, srcnode = copied
				if src in p1 and p1[src].filenode() == srcnode:
				p1copies[dst] = src
				elif src in p2 and p2[src].filenode() == srcnode:
				p2copies[dst] = src
				return p1copies, p2copies


				def encodecopies(files, copies):
				items = []
				for i, dst in enumerate(files):
				if dst in copies:
				items.append(b'%d\0%s' % (i, copies[dst]))
				if len(items) != len(copies):
				raise error.ProgrammingError(
				b'some copy targets missing from file list'
				)
				return b"\n".join(items)


				def decodecopies(files, data):
				try:
				copies = {}
				if not data:
				return copies
				for l in data.split(b'\n'):
				strindex, src = l.split(b'\0')
				i = int(strindex)
				dst = files[i]
				copies[dst] = src
				return copies
				except (ValueError, IndexError):
				# Perhaps someone had chosen the same key name (e.g. "p1copies") and
				# used different syntax for the value.
				return None


				def encodefileindices(files, subset):
				subset = set(subset)
				indices = []
				for i, f in enumerate(files):
				if f in subset:
				indices.append(b'%d' % i)
				return b'\n'.join(indices)


				def decodefileindices(files, data):
				try:
				subset = []
				if not data:
				return subset
				for strindex in data.split(b'\n'):
				i = int(strindex)
				if i < 0 or i >= len(files):
				return None
				subset.append(files[i])
				return subset
				except (ValueError, IndexError):
				# Perhaps someone had chosen the same key name (e.g. "added") and
				# used different syntax for the value.
				return None


				def _getsidedata(srcrepo, rev):
				ctx = srcrepo[rev]
				filescopies = computechangesetcopies(ctx)
				filesadded = computechangesetfilesadded(ctx)
				filesremoved = computechangesetfilesremoved(ctx)
				sidedata = {}
				if any([filescopies, filesadded, filesremoved]):
				sortedfiles = sorted(ctx.files())
				p1copies, p2copies = filescopies
				p1copies = encodecopies(sortedfiles, p1copies)
				p2copies = encodecopies(sortedfiles, p2copies)
				filesadded = encodefileindices(sortedfiles, filesadded)
				filesremoved = encodefileindices(sortedfiles, filesremoved)
				if p1copies:
				sidedata[sidedatamod.SD_P1COPIES] = p1copies
				if p2copies:
				sidedata[sidedatamod.SD_P2COPIES] = p2copies
				if filesadded:
				sidedata[sidedatamod.SD_FILESADDED] = filesadded
				if filesremoved:
				sidedata[sidedatamod.SD_FILESREMOVED] = filesremoved
				return sidedata


				def getsidedataadder(srcrepo, destrepo):
				use_w = srcrepo.ui.configbool(b'experimental', b'worker.repository-upgrade')
				if pycompat.iswindows or not use_w:
				return _get_simple_sidedata_adder(srcrepo, destrepo)
				else:
				return _get_worker_sidedata_adder(srcrepo, destrepo)


				def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
				"""The function used by worker precomputing sidedata

				It read an input queue containing revision numbers
				It write in an output queue containing (rev, <sidedata-map>)

				The `None` input value is used as a stop signal.

				The `tokens` semaphore is user to avoid having too many unprocessed
				entries. The workers needs to acquire one token before fetching a task.
				They will be released by the consumer of the produced data.
				"""
				tokens.acquire()
				rev = revs_queue.get()
				while rev is not None:
				data = _getsidedata(srcrepo, rev)
				sidedata_queue.put((rev, data))
				tokens.acquire()
				rev = revs_queue.get()
				# processing of `None` is completed, release the token.
				tokens.release()


				BUFF_PER_WORKER = 50


				def _get_worker_sidedata_adder(srcrepo, destrepo):
				"""The parallel version of the sidedata computation

				This code spawn a pool of worker that precompute a buffer of sidedata
				before we actually need them"""
				# avoid circular import copies -> scmutil -> worker -> copies
				from . import worker

				nbworkers = worker._numworkers(srcrepo.ui)

				tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)
				revsq = multiprocessing.Queue()
				sidedataq = multiprocessing.Queue()

				assert srcrepo.filtername is None
				# queue all tasks beforehand, revision numbers are small and it make
				# synchronisation simpler
				#
				# Since the computation for each node can be quite expensive, the overhead
				# of using a single queue is not revelant. In practice, most computation
				# are fast but some are very expensive and dominate all the other smaller
				# cost.
				for r in srcrepo.changelog.revs():
				revsq.put(r)
				# queue the "no more tasks" markers
				for i in range(nbworkers):
				revsq.put(None)

				allworkers = []
				for i in range(nbworkers):
				args = (srcrepo, revsq, sidedataq, tokens)
				w = multiprocessing.Process(target=_sidedata_worker, args=args)
				allworkers.append(w)
				w.start()

				# dictionnary to store results for revision higher than we one we are
				# looking for. For example, if we need the sidedatamap for 42, and 43 is
				# received, when shelve 43 for later use.
				staging = {}

				def sidedata_companion(revlog, rev):
				sidedata = {}
				if util.safehasattr(revlog, b'filteredrevs'): # this is a changelog
				# Is the data previously shelved ?
				sidedata = staging.pop(rev, None)
				if sidedata is None:
				# look at the queued result until we find the one we are lookig
				# for (shelve the other ones)
				r, sidedata = sidedataq.get()
				while r != rev:
				staging[r] = sidedata
				r, sidedata = sidedataq.get()
				tokens.release()
				return False, (), sidedata

				return sidedata_companion


				def _get_simple_sidedata_adder(srcrepo, destrepo):
				"""The simple version of the sidedata computation

				It just compute it in the same thread on request"""

				def sidedatacompanion(revlog, rev):
				sidedata = {}
				if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
				sidedata = _getsidedata(srcrepo, rev)
				return False, (), sidedata

				return sidedatacompanion


				def getsidedataremover(srcrepo, destrepo):
				def sidedatacompanion(revlog, rev):
				f = ()
				if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
				if revlog.flags(rev) & sidedataflag.REVIDX_SIDEDATA:
				f = (
				sidedatamod.SD_P1COPIES,
				sidedatamod.SD_P2COPIES,
				sidedatamod.SD_FILESADDED,
				sidedatamod.SD_FILESREMOVED,
				)
				return False, f, {}

				return sidedatacompanion