upstream/mercurial-mirror Files · hgext/git/index.py

git: decode node IDs back into Python strings (issue6349)...

git: decode node IDs back into Python strings (issue6349) db.text_factory = bytes, so the database contains only strings. The object IDs we get from pygit2 are Python strings. b'foo' != 'foo' This change allows the "don't reindex" optimization to work by allowing the "cur_cache_heads == cache_heads" comparison a few lines down to succeed. Differential Revision: https://phab.mercurial-scm.org/D8622

Hollis Blanchard - - Load All Authors

File last commit:

r45472:fb2936c5 default


                r45472:fb2936c5

default

Download file

             index.py
        
                    353 lines
            
             | 11.2 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / hgext / git / index.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      from __future__ import absolute_import

      import collections

      import os

      import sqlite3

      from mercurial.i18n import _

      from mercurial import (

          encoding,

          error,

          node as nodemod,

          pycompat,

      )

      from . import gitutil

      pygit2 = gitutil.get_pygit2()

      _CURRENT_SCHEMA_VERSION = 1

      _SCHEMA = (

          """

      CREATE TABLE refs (

        -- node and name are unique together. There may be more than one name for

        -- a given node, and there may be no name at all for a given node (in the

        -- case of an anonymous hg head).

        node TEXT NOT NULL,

        name TEXT

      );

      -- The "possible heads" of the repository, which we use to figure out

      -- if we need to re-walk the changelog.

      CREATE TABLE possible_heads (

        node TEXT NOT NULL

      );

      -- The topological heads of the changelog, which hg depends on.

      CREATE TABLE heads (

        node TEXT NOT NULL

      );

      -- A total ordering of the changelog

      CREATE TABLE changelog (

        rev INTEGER NOT NULL PRIMARY KEY,

        node TEXT NOT NULL,

        p1 TEXT,

        p2 TEXT

      );

      CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);

      CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);

      -- Changed files for each commit, which lets us dynamically build

      -- filelogs.

      CREATE TABLE changedfiles (

        node TEXT NOT NULL,

        filename TEXT NOT NULL,

        -- 40 zeroes for deletions

        filenode TEXT NOT NULL,

      -- to handle filelog parentage:

        p1node TEXT,

        p1filenode TEXT,

        p2node TEXT,

        p2filenode TEXT

      );

      CREATE INDEX changedfiles_nodes_idx

        ON changedfiles(node);

      PRAGMA user_version=%d

      """

          % _CURRENT_SCHEMA_VERSION

      )

      def _createdb(path):

          # print('open db', path)

          # import traceback

          # traceback.print_stack()

          db = sqlite3.connect(encoding.strfromlocal(path))

          db.text_factory = bytes

          res = db.execute('PRAGMA user_version').fetchone()[0]

          # New database.

          if res == 0:

              for statement in _SCHEMA.split(';'):

                  db.execute(statement.strip())

              db.commit()

          elif res == _CURRENT_SCHEMA_VERSION:

              pass

          else:

              raise error.Abort(_(b'sqlite database has unrecognized version'))

          db.execute('PRAGMA journal_mode=WAL')

          return db

      _OUR_ORDER = ()

      if pygit2:

          _OUR_ORDER = (

              pygit2.GIT_SORT_TOPOLOGICAL

              | pygit2.GIT_SORT_TIME

              | pygit2.GIT_SORT_REVERSE

          )

      _DIFF_FLAGS = 1 << 21  # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2

      def _find_nearest_ancestor_introducing_node(

          db, gitrepo, file_path, walk_start, filenode

      ):

          """Find the nearest ancestor that introduces a file node.

          Args:

            db: a handle to our sqlite database.

            gitrepo: A pygit2.Repository instance.

            file_path: the path of a file in the repo

            walk_start: a pygit2.Oid that is a commit where we should start walking

                        for our nearest ancestor.

          Returns:

            A hexlified SHA that is the commit ID of the next-nearest parent.

          """

          assert isinstance(file_path, str), 'file_path must be str, got %r' % type(

              file_path

          )

          assert isinstance(filenode, str), 'filenode must be str, got %r' % type(

              filenode

          )

          parent_options = {

              row[0].decode('ascii')

              for row in db.execute(

                  'SELECT node FROM changedfiles '

                  'WHERE filename = ? AND filenode = ?',

                  (file_path, filenode),

              )

          }

          inner_walker = gitrepo.walk(walk_start, _OUR_ORDER)

          for w in inner_walker:

              if w.id.hex in parent_options:

                  return w.id.hex

          raise error.ProgrammingError(

              'Unable to find introducing commit for %s node %s from %s',

              (file_path, filenode, walk_start),

          )

      def fill_in_filelog(gitrepo, db, startcommit, path, startfilenode):

          """Given a starting commit and path, fill in a filelog's parent pointers.

          Args:

            gitrepo: a pygit2.Repository

            db: a handle to our sqlite database

            startcommit: a hexlified node id for the commit to start at

            path: the path of the file whose parent pointers we should fill in.

            filenode: the hexlified node id of the file at startcommit

          TODO: make filenode optional

          """

          assert isinstance(

              startcommit, str

          ), 'startcommit must be str, got %r' % type(startcommit)

          assert isinstance(

              startfilenode, str

          ), 'startfilenode must be str, got %r' % type(startfilenode)

          visit = collections.deque([(startcommit, startfilenode)])

          while visit:

              cnode, filenode = visit.popleft()

              commit = gitrepo[cnode]

              parents = []

              for parent in commit.parents:

                  t = parent.tree

                  for comp in path.split('/'):

                      try:

                          t = gitrepo[t[comp].id]

                      except KeyError:

                          break

                  else:

                      introducer = _find_nearest_ancestor_introducing_node(

                          db, gitrepo, path, parent.id, t.id.hex

                      )

                      parents.append((introducer, t.id.hex))

              p1node = p1fnode = p2node = p2fnode = gitutil.nullgit

              for par, parfnode in parents:

                  found = int(

                      db.execute(

                          'SELECT COUNT(*) FROM changedfiles WHERE '

                          'node = ? AND filename = ? AND filenode = ? AND '

                          'p1node NOT NULL',

                          (par, path, parfnode),

                      ).fetchone()[0]

                  )

                  if found == 0:

                      assert par is not None

                      visit.append((par, parfnode))

              if parents:

                  p1node, p1fnode = parents[0]

              if len(parents) == 2:

                  p2node, p2fnode = parents[1]

              if len(parents) > 2:

                  raise error.ProgrammingError(

                      b"git support can't handle octopus merges"

                  )

              db.execute(

                  'UPDATE changedfiles SET '

                  'p1node = ?, p1filenode = ?, p2node = ?, p2filenode = ? '

                  'WHERE node = ? AND filename = ? AND filenode = ?',

                  (p1node, p1fnode, p2node, p2fnode, commit.id.hex, path, filenode),

              )

          db.commit()

      def _index_repo(gitrepo, db, progress_factory=lambda *args, **kwargs: None):

          # Identify all references so we can tell the walker to visit all of them.

          all_refs = gitrepo.listall_references()

          possible_heads = set()

          prog = progress_factory(b'refs')

          for pos, ref in enumerate(all_refs):

              if prog is not None:

                  prog.update(pos)

              if not (

                  ref.startswith('refs/heads/')  # local branch

                  or ref.startswith('refs/tags/')  # tag

                  or ref.startswith('refs/remotes/')  # remote branch

                  or ref.startswith('refs/hg/')  # from this extension

              ):

                  continue

              try:

                  start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)

              except ValueError:

                  # No commit to be found, so we don't care for hg's purposes.

                  continue

              possible_heads.add(start.id)

          # Optimization: if the list of heads hasn't changed, don't

          # reindex, the changelog. This doesn't matter on small

          # repositories, but on even moderately deep histories (eg cpython)

          # this is a very important performance win.

          #

          # TODO: we should figure out how to incrementally index history

          # (preferably by detecting rewinds!) so that we don't have to do a

          # full changelog walk every time a new commit is created.

          cache_heads = {

              pycompat.sysstr(x[0])

              for x in db.execute('SELECT node FROM possible_heads')

          }

          walker = None

          cur_cache_heads = {h.hex for h in possible_heads}

          if cur_cache_heads == cache_heads:

              return

          for start in possible_heads:

              if walker is None:

                  walker = gitrepo.walk(start, _OUR_ORDER)

              else:

                  walker.push(start)

          # Empty out the existing changelog. Even for large-ish histories

          # we can do the top-level "walk all the commits" dance very

          # quickly as long as we don't need to figure out the changed files

          # list.

          db.execute('DELETE FROM changelog')

          if prog is not None:

              prog.complete()

          prog = progress_factory(b'commits')

          # This walker is sure to visit all the revisions in history, but

          # only once.

          for pos, commit in enumerate(walker):

              if prog is not None:

                  prog.update(pos)

              p1 = p2 = nodemod.nullhex

              if len(commit.parents) > 2:

                  raise error.ProgrammingError(

                      (

                          b"git support can't handle octopus merges, "

                          b"found a commit with %d parents :("

                      )

                      % len(commit.parents)

                  )

              if commit.parents:

                  p1 = commit.parents[0].id.hex

              if len(commit.parents) == 2:

                  p2 = commit.parents[1].id.hex

              db.execute(

                  'INSERT INTO changelog (rev, node, p1, p2) VALUES(?, ?, ?, ?)',

                  (pos, commit.id.hex, p1, p2),

              )

              num_changedfiles = db.execute(

                  "SELECT COUNT(*) from changedfiles WHERE node = ?",

                  (commit.id.hex,),

              ).fetchone()[0]

              if not num_changedfiles:

                  files = {}

                  # I *think* we only need to check p1 for changed files

                  # (and therefore linkrevs), because any node that would

                  # actually have this commit as a linkrev would be

                  # completely new in this rev.

                  p1 = commit.parents[0].id.hex if commit.parents else None

                  if p1 is not None:

                      patchgen = gitrepo.diff(p1, commit.id.hex, flags=_DIFF_FLAGS)

                  else:

                      patchgen = commit.tree.diff_to_tree(

                          swap=True, flags=_DIFF_FLAGS

                      )

                  new_files = (p.delta.new_file for p in patchgen)

                  files = {

                      nf.path: nf.id.hex

                      for nf in new_files

                      if nf.id.raw != nodemod.nullid

                  }

                  for p, n in files.items():

                      # We intentionally set NULLs for any file parentage

                      # information so it'll get demand-computed later. We

                      # used to do it right here, and it was _very_ slow.

                      db.execute(

                          'INSERT INTO changedfiles ('

                          'node, filename, filenode, p1node, p1filenode, p2node, '

                          'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',

                          (commit.id.hex, p, n, None, None, None, None),

                      )

          db.execute('DELETE FROM heads')

          db.execute('DELETE FROM possible_heads')

          for hid in possible_heads:

              h = hid.hex

              db.execute('INSERT INTO possible_heads (node) VALUES(?)', (h,))

              haschild = db.execute(

                  'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?', (h, h)

              ).fetchone()[0]

              if not haschild:

                  db.execute('INSERT INTO heads (node) VALUES(?)', (h,))

          db.commit()

          if prog is not None:

              prog.complete()

      def get_index(gitrepo, progress_factory=lambda *args, **kwargs: None):

          cachepath = os.path.join(

              pycompat.fsencode(gitrepo.path), b'..', b'.hg', b'cache'

          )

          if not os.path.exists(cachepath):

              os.makedirs(cachepath)

          dbpath = os.path.join(cachepath, b'git-commits.sqlite')

          db = _createdb(dbpath)

          # TODO check against gitrepo heads before doing a full index

          # TODO thread a ui.progress call into this layer

          _index_repo(gitrepo, db, progress_factory)

          return db

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				from __future__ import absolute_import

				import collections
				import os
				import sqlite3

				from mercurial.i18n import _

				from mercurial import (
				encoding,
				error,
				node as nodemod,
				pycompat,
				)

				from . import gitutil


				pygit2 = gitutil.get_pygit2()

				_CURRENT_SCHEMA_VERSION = 1
				_SCHEMA = (
				"""
				CREATE TABLE refs (
				-- node and name are unique together. There may be more than one name for
				-- a given node, and there may be no name at all for a given node (in the
				-- case of an anonymous hg head).
				node TEXT NOT NULL,
				name TEXT
				);

				-- The "possible heads" of the repository, which we use to figure out
				-- if we need to re-walk the changelog.
				CREATE TABLE possible_heads (
				node TEXT NOT NULL
				);

				-- The topological heads of the changelog, which hg depends on.
				CREATE TABLE heads (
				node TEXT NOT NULL
				);

				-- A total ordering of the changelog
				CREATE TABLE changelog (
				rev INTEGER NOT NULL PRIMARY KEY,
				node TEXT NOT NULL,
				p1 TEXT,
				p2 TEXT
				);

				CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);
				CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);

				-- Changed files for each commit, which lets us dynamically build
				-- filelogs.
				CREATE TABLE changedfiles (
				node TEXT NOT NULL,
				filename TEXT NOT NULL,
				-- 40 zeroes for deletions
				filenode TEXT NOT NULL,
				-- to handle filelog parentage:
				p1node TEXT,
				p1filenode TEXT,
				p2node TEXT,
				p2filenode TEXT
				);

				CREATE INDEX changedfiles_nodes_idx
				ON changedfiles(node);

				PRAGMA user_version=%d
				"""
				% _CURRENT_SCHEMA_VERSION
				)


				def _createdb(path):
				# print('open db', path)
				# import traceback
				# traceback.print_stack()
				db = sqlite3.connect(encoding.strfromlocal(path))
				db.text_factory = bytes

				res = db.execute('PRAGMA user_version').fetchone()[0]

				# New database.
				if res == 0:
				for statement in _SCHEMA.split(';'):
				db.execute(statement.strip())

				db.commit()

				elif res == _CURRENT_SCHEMA_VERSION:
				pass

				else:
				raise error.Abort(_(b'sqlite database has unrecognized version'))

				db.execute('PRAGMA journal_mode=WAL')

				return db


				_OUR_ORDER = ()
				if pygit2:
				_OUR_ORDER = (
				pygit2.GIT_SORT_TOPOLOGICAL
				\| pygit2.GIT_SORT_TIME
				\| pygit2.GIT_SORT_REVERSE
				)

				_DIFF_FLAGS = 1 << 21 # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2


				def _find_nearest_ancestor_introducing_node(
				db, gitrepo, file_path, walk_start, filenode
				):
				"""Find the nearest ancestor that introduces a file node.

				Args:
				db: a handle to our sqlite database.
				gitrepo: A pygit2.Repository instance.
				file_path: the path of a file in the repo
				walk_start: a pygit2.Oid that is a commit where we should start walking
				for our nearest ancestor.

				Returns:
				A hexlified SHA that is the commit ID of the next-nearest parent.
				"""
				assert isinstance(file_path, str), 'file_path must be str, got %r' % type(
				file_path
				)
				assert isinstance(filenode, str), 'filenode must be str, got %r' % type(
				filenode
				)
				parent_options = {
				row[0].decode('ascii')
				for row in db.execute(
				'SELECT node FROM changedfiles '
				'WHERE filename = ? AND filenode = ?',
				(file_path, filenode),
				)
				}
				inner_walker = gitrepo.walk(walk_start, _OUR_ORDER)
				for w in inner_walker:
				if w.id.hex in parent_options:
				return w.id.hex
				raise error.ProgrammingError(
				'Unable to find introducing commit for %s node %s from %s',
				(file_path, filenode, walk_start),
				)


				def fill_in_filelog(gitrepo, db, startcommit, path, startfilenode):
				"""Given a starting commit and path, fill in a filelog's parent pointers.

				Args:
				gitrepo: a pygit2.Repository
				db: a handle to our sqlite database
				startcommit: a hexlified node id for the commit to start at
				path: the path of the file whose parent pointers we should fill in.
				filenode: the hexlified node id of the file at startcommit

				TODO: make filenode optional
				"""
				assert isinstance(
				startcommit, str
				), 'startcommit must be str, got %r' % type(startcommit)
				assert isinstance(
				startfilenode, str
				), 'startfilenode must be str, got %r' % type(startfilenode)
				visit = collections.deque([(startcommit, startfilenode)])
				while visit:
				cnode, filenode = visit.popleft()
				commit = gitrepo[cnode]
				parents = []
				for parent in commit.parents:
				t = parent.tree
				for comp in path.split('/'):
				try:
				t = gitrepo[t[comp].id]
				except KeyError:
				break
				else:
				introducer = _find_nearest_ancestor_introducing_node(
				db, gitrepo, path, parent.id, t.id.hex
				)
				parents.append((introducer, t.id.hex))
				p1node = p1fnode = p2node = p2fnode = gitutil.nullgit
				for par, parfnode in parents:
				found = int(
				db.execute(
				'SELECT COUNT(*) FROM changedfiles WHERE '
				'node = ? AND filename = ? AND filenode = ? AND '
				'p1node NOT NULL',
				(par, path, parfnode),
				).fetchone()[0]
				)
				if found == 0:
				assert par is not None
				visit.append((par, parfnode))
				if parents:
				p1node, p1fnode = parents[0]
				if len(parents) == 2:
				p2node, p2fnode = parents[1]
				if len(parents) > 2:
				raise error.ProgrammingError(
				b"git support can't handle octopus merges"
				)
				db.execute(
				'UPDATE changedfiles SET '
				'p1node = ?, p1filenode = ?, p2node = ?, p2filenode = ? '
				'WHERE node = ? AND filename = ? AND filenode = ?',
				(p1node, p1fnode, p2node, p2fnode, commit.id.hex, path, filenode),
				)
				db.commit()


				def _index_repo(gitrepo, db, progress_factory=lambda args, *kwargs: None):
				# Identify all references so we can tell the walker to visit all of them.
				all_refs = gitrepo.listall_references()
				possible_heads = set()
				prog = progress_factory(b'refs')
				for pos, ref in enumerate(all_refs):
				if prog is not None:
				prog.update(pos)
				if not (
				ref.startswith('refs/heads/') # local branch
				or ref.startswith('refs/tags/') # tag
				or ref.startswith('refs/remotes/') # remote branch
				or ref.startswith('refs/hg/') # from this extension
				):
				continue
				try:
				start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)
				except ValueError:
				# No commit to be found, so we don't care for hg's purposes.
				continue
				possible_heads.add(start.id)
				# Optimization: if the list of heads hasn't changed, don't
				# reindex, the changelog. This doesn't matter on small
				# repositories, but on even moderately deep histories (eg cpython)
				# this is a very important performance win.
				#
				# TODO: we should figure out how to incrementally index history
				# (preferably by detecting rewinds!) so that we don't have to do a
				# full changelog walk every time a new commit is created.
				cache_heads = {
				pycompat.sysstr(x[0])
				for x in db.execute('SELECT node FROM possible_heads')
				}
				walker = None
				cur_cache_heads = {h.hex for h in possible_heads}
				if cur_cache_heads == cache_heads:
				return
				for start in possible_heads:
				if walker is None:
				walker = gitrepo.walk(start, _OUR_ORDER)
				else:
				walker.push(start)

				# Empty out the existing changelog. Even for large-ish histories
				# we can do the top-level "walk all the commits" dance very
				# quickly as long as we don't need to figure out the changed files
				# list.
				db.execute('DELETE FROM changelog')
				if prog is not None:
				prog.complete()
				prog = progress_factory(b'commits')
				# This walker is sure to visit all the revisions in history, but
				# only once.
				for pos, commit in enumerate(walker):
				if prog is not None:
				prog.update(pos)
				p1 = p2 = nodemod.nullhex
				if len(commit.parents) > 2:
				raise error.ProgrammingError(
				(
				b"git support can't handle octopus merges, "
				b"found a commit with %d parents :("
				)
				% len(commit.parents)
				)
				if commit.parents:
				p1 = commit.parents[0].id.hex
				if len(commit.parents) == 2:
				p2 = commit.parents[1].id.hex
				db.execute(
				'INSERT INTO changelog (rev, node, p1, p2) VALUES(?, ?, ?, ?)',
				(pos, commit.id.hex, p1, p2),
				)

				num_changedfiles = db.execute(
				"SELECT COUNT(*) from changedfiles WHERE node = ?",
				(commit.id.hex,),
				).fetchone()[0]
				if not num_changedfiles:
				files = {}
				# I think we only need to check p1 for changed files
				# (and therefore linkrevs), because any node that would
				# actually have this commit as a linkrev would be
				# completely new in this rev.
				p1 = commit.parents[0].id.hex if commit.parents else None
				if p1 is not None:
				patchgen = gitrepo.diff(p1, commit.id.hex, flags=_DIFF_FLAGS)
				else:
				patchgen = commit.tree.diff_to_tree(
				swap=True, flags=_DIFF_FLAGS
				)
				new_files = (p.delta.new_file for p in patchgen)
				files = {
				nf.path: nf.id.hex
				for nf in new_files
				if nf.id.raw != nodemod.nullid
				}
				for p, n in files.items():
				# We intentionally set NULLs for any file parentage
				# information so it'll get demand-computed later. We
				# used to do it right here, and it was _very_ slow.
				db.execute(
				'INSERT INTO changedfiles ('
				'node, filename, filenode, p1node, p1filenode, p2node, '
				'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',
				(commit.id.hex, p, n, None, None, None, None),
				)
				db.execute('DELETE FROM heads')
				db.execute('DELETE FROM possible_heads')
				for hid in possible_heads:
				h = hid.hex
				db.execute('INSERT INTO possible_heads (node) VALUES(?)', (h,))
				haschild = db.execute(
				'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?', (h, h)
				).fetchone()[0]
				if not haschild:
				db.execute('INSERT INTO heads (node) VALUES(?)', (h,))

				db.commit()
				if prog is not None:
				prog.complete()


				def get_index(gitrepo, progress_factory=lambda args, *kwargs: None):
				cachepath = os.path.join(
				pycompat.fsencode(gitrepo.path), b'..', b'.hg', b'cache'
				)
				if not os.path.exists(cachepath):
				os.makedirs(cachepath)
				dbpath = os.path.join(cachepath, b'git-commits.sqlite')
				db = _createdb(dbpath)
				# TODO check against gitrepo heads before doing a full index
				# TODO thread a ui.progress call into this layer
				_index_repo(gitrepo, db, progress_factory)
				return db