upstream/mercurial-mirror Commit - r45472:fb2936c5

git: decode node IDs back into Python strings (issue6349)...

Hollis Blanchard -

r45472:fb2936c5 default

parent child

hgext/git/index.py

0 +4 -1

              from __future__ import absolute_import
              import collections
              import os
              import sqlite3
              from mercurial.i18n import _
              from mercurial import (
                  encoding,
                  error,
                  node as nodemod,
                  pycompat,
              )
              from . import gitutil
              # Handle to the pygit2 module as provided by gitutil.
              # NOTE(review): presumably falsy when pygit2 is not installed (the
              # `if pygit2:` guard further down relies on that) -- confirm in gitutil.
              pygit2 = gitutil.get_pygit2()
              # Version number written into the database via `PRAGMA user_version`
              # (last statement of _SCHEMA) and compared against by _createdb().
              _CURRENT_SCHEMA_VERSION = 1
              # SQL used to initialise a brand-new index database. _createdb() splits
              # this text on ';' and executes the pieces one at a time, so no statement
              # (or SQL comment) in it may contain a literal semicolon.
              _SCHEMA = (
                  """
              CREATE TABLE refs (
                -- node and name are unique together. There may be more than one name for
                -- a given node, and there may be no name at all for a given node (in the
                -- case of an anonymous hg head).
                node TEXT NOT NULL,
                name TEXT
              );
              -- The "possible heads" of the repository, which we use to figure out
              -- if we need to re-walk the changelog.
              CREATE TABLE possible_heads (
                node TEXT NOT NULL
              );
              -- The topological heads of the changelog, which hg depends on.
              CREATE TABLE heads (
                node TEXT NOT NULL
              );
              -- A total ordering of the changelog
              CREATE TABLE changelog (
                rev INTEGER NOT NULL PRIMARY KEY,
                node TEXT NOT NULL,
                p1 TEXT,
                p2 TEXT
              );
              CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);
              CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);
              -- Changed files for each commit, which lets us dynamically build
              -- filelogs.
              CREATE TABLE changedfiles (
                node TEXT NOT NULL,
                filename TEXT NOT NULL,
                -- 40 zeroes for deletions
                filenode TEXT NOT NULL,
              -- to handle filelog parentage:
                p1node TEXT,
                p1filenode TEXT,
                p2node TEXT,
                p2filenode TEXT
              );
              CREATE INDEX changedfiles_nodes_idx
                ON changedfiles(node);
              PRAGMA user_version=%d
              """
                  % _CURRENT_SCHEMA_VERSION
              )
              def _createdb(path):
                  """Open the sqlite index at path, creating the schema if it is new.

                  Args:
                    path: location of the database file, in the local encoding.

                  Returns:
                    A sqlite3 connection whose text_factory is bytes and whose
                    journal mode has been switched to WAL.

                  Raises:
                    error.Abort: the database reports a user_version that is neither
                      0 (fresh database) nor _CURRENT_SCHEMA_VERSION.
                  """
                  db = sqlite3.connect(encoding.strfromlocal(path))
                  # TEXT columns come back as bytes; callers decode where they need str.
                  db.text_factory = bytes
                  version_row = db.execute('PRAGMA user_version').fetchone()
                  found_version = version_row[0]
                  if found_version not in (0, _CURRENT_SCHEMA_VERSION):
                      raise error.Abort(_(b'sqlite database has unrecognized version'))
                  if found_version == 0:
                      # A user_version of 0 means nothing has been created yet: run the
                      # schema one statement at a time.
                      for piece in _SCHEMA.split(';'):
                          db.execute(piece.strip())
                      db.commit()
                  db.execute('PRAGMA journal_mode=WAL')
                  return db
              # Sort mode passed to gitrepo.walk() wherever this module walks history.
              # The empty tuple is only a placeholder for when pygit2 is falsy; the real
              # value is the OR of the three pygit2 sort flags below.
              _OUR_ORDER = ()
              if pygit2:
                  _OUR_ORDER = (
                      pygit2.GIT_SORT_TOPOLOGICAL
                      | pygit2.GIT_SORT_TIME
                      | pygit2.GIT_SORT_REVERSE
                  )
              # Passed as `flags=` to the diff calls made while indexing changed files.
              _DIFF_FLAGS = 1 << 21  # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2
              def _find_nearest_ancestor_introducing_node(
                  db, gitrepo, file_path, walk_start, filenode
              ):
                  """Find the nearest ancestor that introduces a file node.

                  Args:
                    db: a handle to our sqlite database.
                    gitrepo: A pygit2.Repository instance.
                    file_path: the path of a file in the repo
                    walk_start: a pygit2.Oid that is a commit where we should start walking
                                for our nearest ancestor.
                    filenode: the hexlified node id of the file revision to look for.

                  Returns:
                    A hexlified SHA that is the commit ID of the next-nearest parent.

                  Raises:
                    error.ProgrammingError: no commit visited by the walk from
                      walk_start is recorded in changedfiles for this file_path and
                      filenode.
                  """
                  assert isinstance(file_path, str), 'file_path must be str, got %r' % type(
                      file_path
                  )
                  assert isinstance(filenode, str), 'filenode must be str, got %r' % type(
                      filenode
                  )
                  # The database hands TEXT back as bytes (text_factory = bytes), so
                  # decode to str to make the values comparable with pygit2's hex ids.
                  parent_options = {
                      row[0].decode('ascii')
                      for row in db.execute(
                          'SELECT node FROM changedfiles '
                          'WHERE filename = ? AND filenode = ?',
                          (file_path, filenode),
                      )
                  }
                  inner_walker = gitrepo.walk(walk_start, _OUR_ORDER)
                  for w in inner_walker:
                      if w.id.hex in parent_options:
                          return w.id.hex
                  # Interpolate the details into the message; passing the tuple as a
                  # separate argument would leave the '%s' placeholders unformatted.
                  raise error.ProgrammingError(
                      'Unable to find introducing commit for %s node %s from %s'
                      % (file_path, filenode, walk_start)
                  )
              def fill_in_filelog(gitrepo, db, startcommit, path, startfilenode):
                  """Populate filelog parent pointers for path, starting at startcommit.

                  Works through a queue of (commit, filenode) pairs. For each pair the
                  parents that contain path are located, the commit that introduced
                  each parent's version of the file is looked up, and the matching
                  changedfiles row is updated with the p1/p2 information. Introducing
                  commits whose own row has no parent information yet are queued too.

                  Args:
                    gitrepo: a pygit2.Repository
                    db: a handle to our sqlite database
                    startcommit: a hexlified node id for the commit to start at
                    path: the path of the file whose parent pointers we should fill in.
                    startfilenode: the hexlified node id of the file at startcommit

                  Raises:
                    error.ProgrammingError: more than two parents of a commit contain
                      path (octopus merge).

                  TODO: make startfilenode optional
                  """
                  assert isinstance(
                      startcommit, str
                  ), 'startcommit must be str, got %r' % type(startcommit)
                  assert isinstance(
                      startfilenode, str
                  ), 'startfilenode must be str, got %r' % type(startfilenode)
                  components = path.split('/')
                  pending = collections.deque()
                  pending.append((startcommit, startfilenode))
                  while pending:
                      cnode, filenode = pending.popleft()
                      commit = gitrepo[cnode]

                      # Collect (introducing commit, filenode) for every parent that
                      # has the file at all.
                      parents = []
                      for parent in commit.parents:
                          entry = parent.tree
                          present = True
                          for comp in components:
                              try:
                                  entry = gitrepo[entry[comp].id]
                              except KeyError:
                                  present = False
                                  break
                          if not present:
                              continue
                          parent_fnode = entry.id.hex
                          introducer = _find_nearest_ancestor_introducing_node(
                              db, gitrepo, path, parent.id, parent_fnode
                          )
                          parents.append((introducer, entry.id.hex))

                      # Queue any introducing commit whose row still lacks parentage.
                      for par, parfnode in parents:
                          cursor = db.execute(
                              'SELECT COUNT(*) FROM changedfiles WHERE '
                              'node = ? AND filename = ? AND filenode = ? AND '
                              'p1node NOT NULL',
                              (par, path, parfnode),
                          )
                          already_filled = int(cursor.fetchone()[0])
                          if not already_filled:
                              assert par is not None
                              pending.append((par, parfnode))

                      null = gitutil.nullgit
                      p1node, p1fnode = parents[0] if parents else (null, null)
                      p2node, p2fnode = (
                          parents[1] if len(parents) == 2 else (null, null)
                      )
                      if len(parents) > 2:
                          raise error.ProgrammingError(
                              b"git support can't handle octopus merges"
                          )
                      row_values = (
                          p1node,
                          p1fnode,
                          p2node,
                          p2fnode,
                          commit.id.hex,
                          path,
                          filenode,
                      )
                      db.execute(
                          'UPDATE changedfiles SET '
                          'p1node = ?, p1filenode = ?, p2node = ?, p2filenode = ? '
                          'WHERE node = ? AND filename = ? AND filenode = ?',
                          row_values,
                      )
                  db.commit()
              def _index_repo(gitrepo, db, progress_factory=lambda *args, **kwargs: None):
                  """Bring the sqlite index in db up to date with the refs of gitrepo.

                  Collects the commits pointed to by local branches, tags, remote
                  branches and refs/hg/ references. If that set matches the cached
                  possible_heads table, nothing is done. Otherwise the changelog table
                  is rebuilt from a full history walk, changed files are recorded for
                  commits that have no changedfiles rows yet, and the heads and
                  possible_heads tables are rewritten.

                  Args:
                    gitrepo: a pygit2.Repository
                    db: a handle to our sqlite database
                    progress_factory: called with a topic (b'refs', then b'commits');
                      may return None, otherwise the result must provide update(pos)
                      and complete().

                  Raises:
                    error.ProgrammingError: a commit with more than two parents is
                      encountered (octopus merge).
                  """
                  # Identify all references so we can tell the walker to visit all of them.
                  all_refs = gitrepo.listall_references()
                  possible_heads = set()
                  prog = progress_factory(b'refs')
                  interesting_prefixes = (
                      'refs/heads/',  # local branch
                      'refs/tags/',  # tag
                      'refs/remotes/',  # remote branch
                      'refs/hg/',  # from this extension
                  )
                  for pos, ref in enumerate(all_refs):
                      if prog is not None:
                          prog.update(pos)
                      if not ref.startswith(interesting_prefixes):
                          continue
                      try:
                          start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)
                      except ValueError:
                          # No commit to be found, so we don't care for hg's purposes.
                          continue
                      possible_heads.add(start.id)
                  # Optimization: if the list of heads hasn't changed, don't
                  # reindex the changelog. This doesn't matter on small
                  # repositories, but on even moderately deep histories (eg cpython)
                  # this is a very important performance win.
                  #
                  # TODO: we should figure out how to incrementally index history
                  # (preferably by detecting rewinds!) so that we don't have to do a
                  # full changelog walk every time a new commit is created.
                  #
                  # The database returns bytes (text_factory = bytes) while pygit2 hex
                  # ids are str, so decode the cached values before comparing the sets.
                  cache_heads = {
                      pycompat.sysstr(x[0])
                      for x in db.execute('SELECT node FROM possible_heads')
                  }
                  cur_cache_heads = {h.hex for h in possible_heads}
                  if cur_cache_heads == cache_heads:
                      return
                  walker = None
                  for start in possible_heads:
                      if walker is None:
                          walker = gitrepo.walk(start, _OUR_ORDER)
                      else:
                          walker.push(start)
                  # Empty out the existing changelog. Even for large-ish histories
                  # we can do the top-level "walk all the commits" dance very
                  # quickly as long as we don't need to figure out the changed files
                  # list.
                  db.execute('DELETE FROM changelog')
                  if prog is not None:
                      prog.complete()
                  prog = progress_factory(b'commits')
                  # This walker is sure to visit all the revisions in history, but
                  # only once. When every ref has disappeared there is no walker at
                  # all; treat that as an empty history instead of iterating None.
                  commits = walker if walker is not None else ()
                  for pos, commit in enumerate(commits):
                      if prog is not None:
                          prog.update(pos)
                      parents = commit.parents
                      node = commit.id.hex
                      p1 = p2 = nodemod.nullhex
                      if len(parents) > 2:
                          raise error.ProgrammingError(
                              (
                                  b"git support can't handle octopus merges, "
                                  b"found a commit with %d parents :("
                              )
                              % len(parents)
                          )
                      if parents:
                          p1 = parents[0].id.hex
                      if len(parents) == 2:
                          p2 = parents[1].id.hex
                      db.execute(
                          'INSERT INTO changelog (rev, node, p1, p2) VALUES(?, ?, ?, ?)',
                          (pos, node, p1, p2),
                      )
                      num_changedfiles = db.execute(
                          "SELECT COUNT(*) from changedfiles WHERE node = ?",
                          (node,),
                      ).fetchone()[0]
                      if num_changedfiles:
                          # Changed files for this commit were recorded by an earlier
                          # indexing run; nothing more to do.
                          continue
                      # I *think* we only need to check p1 for changed files
                      # (and therefore linkrevs), because any node that would
                      # actually have this commit as a linkrev would be
                      # completely new in this rev.
                      if parents:
                          patchgen = gitrepo.diff(
                              parents[0].id.hex, node, flags=_DIFF_FLAGS
                          )
                      else:
                          # Root commit: diff its tree against the empty tree.
                          patchgen = commit.tree.diff_to_tree(
                              swap=True, flags=_DIFF_FLAGS
                          )
                      new_files = (p.delta.new_file for p in patchgen)
                      # Entries whose new side is the null id are skipped.
                      files = {
                          nf.path: nf.id.hex
                          for nf in new_files
                          if nf.id.raw != nodemod.nullid
                      }
                      for p, n in files.items():
                          # We intentionally set NULLs for any file parentage
                          # information so it'll get demand-computed later. We
                          # used to do it right here, and it was _very_ slow.
                          db.execute(
                              'INSERT INTO changedfiles ('
                              'node, filename, filenode, p1node, p1filenode, p2node, '
                              'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',
                              (node, p, n, None, None, None, None),
                          )
                  db.execute('DELETE FROM heads')
                  db.execute('DELETE FROM possible_heads')
                  for hid in possible_heads:
                      h = hid.hex
                      db.execute('INSERT INTO possible_heads (node) VALUES(?)', (h,))
                      # A possible head is a real topological head only when no
                      # changelog entry names it as a parent.
                      haschild = db.execute(
                          'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?', (h, h)
                      ).fetchone()[0]
                      if not haschild:
                          db.execute('INSERT INTO heads (node) VALUES(?)', (h,))
                  db.commit()
                  if prog is not None:
                      prog.complete()
              def get_index(gitrepo, progress_factory=lambda *args, **kwargs: None):
                  """Open the sqlite index belonging to gitrepo and refresh it.

                  The database lives at ../.hg/cache/git-commits.sqlite relative to
                  gitrepo.path; the cache directory is created when it does not exist.

                  Args:
                    gitrepo: a pygit2.Repository
                    progress_factory: forwarded unchanged to _index_repo().

                  Returns:
                    The sqlite database handle produced by _createdb().
                  """
                  git_dir = pycompat.fsencode(gitrepo.path)
                  cachepath = os.path.join(git_dir, b'..', b'.hg', b'cache')
                  if not os.path.exists(cachepath):
                      os.makedirs(cachepath)
                  index_db = _createdb(os.path.join(cachepath, b'git-commits.sqlite'))
                  # TODO check against gitrepo heads before doing a full index
                  # TODO thread a ui.progress call into this layer
                  _index_repo(gitrepo, index_db, progress_factory)
                  return index_db

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No reviewers

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages