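"""Indexing support for Mercurial's git extension.

This module builds and maintains a sqlite cache (``.hg/cache/git-commits.sqlite``)
that maps a pygit2 repository's commit graph into the structures hg expects:
a numbered changelog with parent pointers, head caches, and per-commit
changed-file records that are later turned into filelogs.
"""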
from __future__ import absolute_import

import collections
import os
import sqlite3

from mercurial.i18n import _
from mercurial import (
    encoding,
    error,
    node as nodemod,
    pycompat,
)

from . import gitutil

pygit2 = gitutil.get_pygit2()

_CURRENT_SCHEMA_VERSION = 1
_SCHEMA = (
    """
CREATE TABLE refs (
  -- node and name are unique together. There may be more than one name for
  -- a given node, and there may be no name at all for a given node (in the
  -- case of an anonymous hg head).
  node TEXT NOT NULL,
  name TEXT
);

-- The "possible heads" of the repository, which we use to figure out
-- if we need to re-walk the changelog.
CREATE TABLE possible_heads (
  node TEXT NOT NULL
);

-- The topological heads of the changelog, which hg depends on.
CREATE TABLE heads (
  node TEXT NOT NULL
);

-- A total ordering of the changelog
CREATE TABLE changelog (
  rev INTEGER NOT NULL PRIMARY KEY,
  node TEXT NOT NULL,
  p1 TEXT,
  p2 TEXT
);

CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);
CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);

-- Changed files for each commit, which lets us dynamically build
-- filelogs.
CREATE TABLE changedfiles (
  node TEXT NOT NULL,
  filename TEXT NOT NULL,
  -- 40 zeroes for deletions
  filenode TEXT NOT NULL,
  -- to handle filelog parentage:
  p1node TEXT,
  p1filenode TEXT,
  p2node TEXT,
  p2filenode TEXT
);

CREATE INDEX changedfiles_nodes_idx
  ON changedfiles(node);

PRAGMA user_version=%d
"""
    % _CURRENT_SCHEMA_VERSION
)
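# How the tables above are used in this module:
#  * changelog gives every git commit a dense rev number (its position in
#    the history walk below) plus its p1/p2 parents.
#  * changedfiles rows are inserted with NULL parent columns during indexing
#    and filled in on demand by fill_in_filelog(), since computing filelog
#    parentage eagerly proved very slow.
#  * possible_heads remembers the ref targets seen on the last index run so
#    an unchanged repository can skip the full history walk; heads holds the
#    subset with no children, i.e. the topological heads.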


def _createdb(path):
    # print('open db', path)
    # import traceback
    # traceback.print_stack()
    db = sqlite3.connect(encoding.strfromlocal(path))
    db.text_factory = bytes
    res = db.execute('PRAGMA user_version').fetchone()[0]
    # New database.
    if res == 0:
        for statement in _SCHEMA.split(';'):
            db.execute(statement.strip())
        db.commit()
    elif res == _CURRENT_SCHEMA_VERSION:
        pass
    else:
        raise error.Abort(_(b'sqlite database has unrecognized version'))
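    # WAL journal mode allows readers to keep using the cache while the
    # indexer writes to it.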
    db.execute('PRAGMA journal_mode=WAL')
    return db
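# The walk order used throughout this module: topological with commit time
# as a tie-breaker, reversed so the oldest commit comes out first. Left
# empty when pygit2 isn't available.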
_OUR_ORDER = ()
if pygit2:
    _OUR_ORDER = (
        pygit2.GIT_SORT_TOPOLOGICAL
        | pygit2.GIT_SORT_TIME
        | pygit2.GIT_SORT_REVERSE
    )

_DIFF_FLAGS = 1 << 21  # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2


def _find_nearest_ancestor_introducing_node(
    db, gitrepo, file_path, walk_start, filenode
):
    """Find the nearest ancestor that introduces a file node.

    Args:
      db: a handle to our sqlite database.
      gitrepo: A pygit2.Repository instance.
      file_path: the path of a file in the repo
      walk_start: a pygit2.Oid that is a commit where we should start walking
                  for our nearest ancestor.
      filenode: the hexlified git blob id of the file we're looking for.

    Returns:
      A hexlified SHA that is the commit ID of the nearest ancestor that
      introduces that file node.
    """
    assert isinstance(file_path, str), 'file_path must be str, got %r' % type(
        file_path
    )
    assert isinstance(filenode, str), 'filenode must be str, got %r' % type(
        filenode
    )
    parent_options = {
        row[0].decode('ascii')
        for row in db.execute(
            'SELECT node FROM changedfiles '
            'WHERE filename = ? AND filenode = ?',
            (file_path, filenode),
        )
    }
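    # Walk the ancestors of walk_start in our usual order and return the
    # first commit that the changedfiles table records as touching this
    # filename with this filenode.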
    inner_walker = gitrepo.walk(walk_start, _OUR_ORDER)
    for w in inner_walker:
        if w.id.hex in parent_options:
            return w.id.hex
    raise error.ProgrammingError(
        'Unable to find introducing commit for %s node %s from %s'
        % (file_path, filenode, walk_start)
    )


def fill_in_filelog(gitrepo, db, startcommit, path, startfilenode):
    """Given a starting commit and path, fill in a filelog's parent pointers.

    Args:
      gitrepo: a pygit2.Repository
      db: a handle to our sqlite database
      startcommit: a hexlified node id for the commit to start at
      path: the path of the file whose parent pointers we should fill in.
      startfilenode: the hexlified node id of the file at startcommit

    TODO: make startfilenode optional
    """
    assert isinstance(
        startcommit, str
    ), 'startcommit must be str, got %r' % type(startcommit)
    assert isinstance(
        startfilenode, str
    ), 'startfilenode must be str, got %r' % type(startfilenode)
    visit = collections.deque([(startcommit, startfilenode)])
    while visit:
        cnode, filenode = visit.popleft()
        commit = gitrepo[cnode]
        parents = []
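        # For each git parent, resolve this path in the parent's tree one
        # component at a time; if the file exists there, find the ancestor
        # commit that introduced that exact blob so it can serve as the
        # filelog parent.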
        for parent in commit.parents:
            t = parent.tree
            for comp in path.split('/'):
                try:
                    t = gitrepo[t[comp].id]
                except KeyError:
                    break
            else:
                introducer = _find_nearest_ancestor_introducing_node(
                    db, gitrepo, path, parent.id, t.id.hex
                )
                parents.append((introducer, t.id.hex))
        p1node = p1fnode = p2node = p2fnode = gitutil.nullgit
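        # Keep walking (breadth-first, via the visit queue) into any parent
        # whose own filelog parents haven't been computed yet, i.e. whose
        # changedfiles row still has a NULL p1node.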
        for par, parfnode in parents:
            found = int(
                db.execute(
                    'SELECT COUNT(*) FROM changedfiles WHERE '
                    'node = ? AND filename = ? AND filenode = ? AND '
                    'p1node NOT NULL',
                    (par, path, parfnode),
                ).fetchone()[0]
            )
            if found == 0:
                assert par is not None
                visit.append((par, parfnode))
        if parents:
            p1node, p1fnode = parents[0]
        if len(parents) == 2:
            p2node, p2fnode = parents[1]
        if len(parents) > 2:
            raise error.ProgrammingError(
                b"git support can't handle octopus merges"
            )
        db.execute(
            'UPDATE changedfiles SET '
            'p1node = ?, p1filenode = ?, p2node = ?, p2filenode = ? '
            'WHERE node = ? AND filename = ? AND filenode = ?',
            (p1node, p1fnode, p2node, p2fnode, commit.id.hex, path, filenode),
        )
    db.commit()


def _index_repo(gitrepo, db, progress_factory=lambda *args, **kwargs: None):
    # Identify all references so we can tell the walker to visit all of them.
    all_refs = gitrepo.listall_references()
    possible_heads = set()
    prog = progress_factory(b'refs')
    for pos, ref in enumerate(all_refs):
        if prog is not None:
            prog.update(pos)
        if not (
            ref.startswith('refs/heads/')  # local branch
            or ref.startswith('refs/tags/')  # tag
            or ref.startswith('refs/remotes/')  # remote branch
            or ref.startswith('refs/hg/')  # from this extension
        ):
            continue
        try:
            start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)
        except ValueError:
            # No commit to be found, so we don't care for hg's purposes.
            continue
        possible_heads.add(start.id)
    # Optimization: if the list of heads hasn't changed, don't
    # reindex the changelog. This doesn't matter on small
    # repositories, but on even moderately deep histories (eg cpython)
    # this is a very important performance win.
    #
    # TODO: we should figure out how to incrementally index history
    # (preferably by detecting rewinds!) so that we don't have to do a
    # full changelog walk every time a new commit is created.
    cache_heads = {
        x[0].decode('ascii')
        for x in db.execute('SELECT node FROM possible_heads')
    }
    walker = None
    cur_cache_heads = {h.hex for h in possible_heads}
    if cur_cache_heads == cache_heads:
        return
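    # Seed a single walker with every possible head; Walker.push() adds the
    # extra starting points, so one pass visits every commit reachable from
    # any ref exactly once.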
    for start in possible_heads:
        if walker is None:
            walker = gitrepo.walk(start, _OUR_ORDER)
        else:
            walker.push(start)
    # Empty out the existing changelog. Even for large-ish histories
    # we can do the top-level "walk all the commits" dance very
    # quickly as long as we don't need to figure out the changed files
    # list.
    db.execute('DELETE FROM changelog')
    if prog is not None:
        prog.complete()
    prog = progress_factory(b'commits')
    # This walker is sure to visit all the revisions in history, but
    # only once.
    for pos, commit in enumerate(walker):
        if prog is not None:
            prog.update(pos)
        p1 = p2 = nodemod.nullhex
        if len(commit.parents) > 2:
            raise error.ProgrammingError(
                (
                    b"git support can't handle octopus merges, "
                    b"found a commit with %d parents :("
                )
                % len(commit.parents)
            )
        if commit.parents:
            p1 = commit.parents[0].id.hex
        if len(commit.parents) == 2:
            p2 = commit.parents[1].id.hex
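        # The walk is topological and oldest-first, so parents are always
        # inserted before their children and the enumeration index can serve
        # as the hg-style revision number.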
        db.execute(
            'INSERT INTO changelog (rev, node, p1, p2) VALUES(?, ?, ?, ?)',
            (pos, commit.id.hex, p1, p2),
        )
        num_changedfiles = db.execute(
            "SELECT COUNT(*) from changedfiles WHERE node = ?",
            (commit.id.hex,),
        ).fetchone()[0]
        if not num_changedfiles:
            files = {}
            # I *think* we only need to check p1 for changed files
            # (and therefore linkrevs), because any node that would
            # actually have this commit as a linkrev would be
            # completely new in this rev.
            p1 = commit.parents[0].id.hex if commit.parents else None
            if p1 is not None:
                patchgen = gitrepo.diff(p1, commit.id.hex, flags=_DIFF_FLAGS)
            else:
                patchgen = commit.tree.diff_to_tree(
                    swap=True, flags=_DIFF_FLAGS
                )
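            # A deletion shows up with the null OID as delta.new_file.id, so
            # deleted files are skipped when building the dict below.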
            new_files = (p.delta.new_file for p in patchgen)
            files = {
                nf.path: nf.id.hex
                for nf in new_files
                if nf.id.raw != nodemod.nullid
            }
            for p, n in files.items():
                # We intentionally set NULLs for any file parentage
                # information so it'll get demand-computed later. We
                # used to do it right here, and it was _very_ slow.
                db.execute(
                    'INSERT INTO changedfiles ('
                    'node, filename, filenode, p1node, p1filenode, p2node, '
                    'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',
                    (commit.id.hex, p, n, None, None, None, None),
                )
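    # Rebuild the head caches: every ref target recorded above is a
    # "possible head", and the subset with no children in the freshly
    # rebuilt changelog are the topological heads hg exposes.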
    db.execute('DELETE FROM heads')
    db.execute('DELETE FROM possible_heads')
    for hid in possible_heads:
        h = hid.hex
        db.execute('INSERT INTO possible_heads (node) VALUES(?)', (h,))
        haschild = db.execute(
            'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?', (h, h)
        ).fetchone()[0]
        if not haschild:
            db.execute('INSERT INTO heads (node) VALUES(?)', (h,))
    db.commit()
    if prog is not None:
        prog.complete()


def get_index(gitrepo, progress_factory=lambda *args, **kwargs: None):
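    # pygit2's Repository.path points at the .git directory, so '..' is the
    # working copy root and the cache db lands next to hg's other caches in
    # .hg/cache.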
    cachepath = os.path.join(
        pycompat.fsencode(gitrepo.path), b'..', b'.hg', b'cache'
    )
    if not os.path.exists(cachepath):
        os.makedirs(cachepath)
    dbpath = os.path.join(cachepath, b'git-commits.sqlite')
    db = _createdb(dbpath)
    # TODO check against gitrepo heads before doing a full index
    # TODO thread a ui.progress call into this layer
    _index_repo(gitrepo, db, progress_factory)
    return db