git: decode node IDs back into Python strings (issue6349)...
Hollis Blanchard
r45472:fb2936c5 default
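
The one functional hunk in this change is about Python 3 string types: _createdb() sets db.text_factory = bytes, so node IDs read back from the possible_heads table come out as bytes, while the current heads collected from pygit2 (Oid.hex) are str. The set comparison that decides whether the changelog needs re-walking therefore never matches, and every call re-indexes the repository. Below is a minimal standalone sketch of that mismatch; the in-memory database and the 'deadbeef' node are illustrative only and not part of the patch.

import sqlite3

db = sqlite3.connect(':memory:')
db.text_factory = bytes  # same setting as _createdb() below
db.execute('CREATE TABLE possible_heads (node TEXT NOT NULL)')
db.execute('INSERT INTO possible_heads (node) VALUES(?)', ('deadbeef' * 5,))

# Rows come back as bytes because of text_factory, but pygit2's Oid.hex
# values are str, so the "did the heads change?" comparison always fails.
cache_heads = {x[0] for x in db.execute('SELECT node FROM possible_heads')}
cur_cache_heads = {'deadbeef' * 5}
print(cache_heads == cur_cache_heads)  # False: bytes vs. str

# Decoding each node first -- which is what the patched comprehension does
# via pycompat.sysstr (a plain decode for these ASCII hex IDs) -- makes the
# comparison meaningful again.
cache_heads = {
    row[0].decode('ascii')
    for row in db.execute('SELECT node FROM possible_heads')
}
print(cache_heads == cur_cache_heads)  # True: the re-walk can be skipped
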
@@ -1,350 +1,353 @@
from __future__ import absolute_import

import collections
import os
import sqlite3

from mercurial.i18n import _

from mercurial import (
    encoding,
    error,
    node as nodemod,
    pycompat,
)

from . import gitutil


pygit2 = gitutil.get_pygit2()

_CURRENT_SCHEMA_VERSION = 1
_SCHEMA = (
    """
CREATE TABLE refs (
  -- node and name are unique together. There may be more than one name for
  -- a given node, and there may be no name at all for a given node (in the
  -- case of an anonymous hg head).
  node TEXT NOT NULL,
  name TEXT
);

-- The "possible heads" of the repository, which we use to figure out
-- if we need to re-walk the changelog.
CREATE TABLE possible_heads (
  node TEXT NOT NULL
);

-- The topological heads of the changelog, which hg depends on.
CREATE TABLE heads (
  node TEXT NOT NULL
);

-- A total ordering of the changelog
CREATE TABLE changelog (
  rev INTEGER NOT NULL PRIMARY KEY,
  node TEXT NOT NULL,
  p1 TEXT,
  p2 TEXT
);

CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);
CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);

-- Changed files for each commit, which lets us dynamically build
-- filelogs.
CREATE TABLE changedfiles (
  node TEXT NOT NULL,
  filename TEXT NOT NULL,
  -- 40 zeroes for deletions
  filenode TEXT NOT NULL,
  -- to handle filelog parentage:
  p1node TEXT,
  p1filenode TEXT,
  p2node TEXT,
  p2filenode TEXT
);

CREATE INDEX changedfiles_nodes_idx
  ON changedfiles(node);

PRAGMA user_version=%d
"""
    % _CURRENT_SCHEMA_VERSION
)


def _createdb(path):
    # print('open db', path)
    # import traceback
    # traceback.print_stack()
    db = sqlite3.connect(encoding.strfromlocal(path))
    db.text_factory = bytes

    res = db.execute('PRAGMA user_version').fetchone()[0]

    # New database.
    if res == 0:
        for statement in _SCHEMA.split(';'):
            db.execute(statement.strip())

        db.commit()

    elif res == _CURRENT_SCHEMA_VERSION:
        pass

    else:
        raise error.Abort(_(b'sqlite database has unrecognized version'))

    db.execute('PRAGMA journal_mode=WAL')

    return db


_OUR_ORDER = ()
if pygit2:
    _OUR_ORDER = (
        pygit2.GIT_SORT_TOPOLOGICAL
        | pygit2.GIT_SORT_TIME
        | pygit2.GIT_SORT_REVERSE
    )

_DIFF_FLAGS = 1 << 21  # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2


def _find_nearest_ancestor_introducing_node(
    db, gitrepo, file_path, walk_start, filenode
):
    """Find the nearest ancestor that introduces a file node.

    Args:
      db: a handle to our sqlite database.
      gitrepo: A pygit2.Repository instance.
      file_path: the path of a file in the repo
      walk_start: a pygit2.Oid that is a commit where we should start walking
        for our nearest ancestor.

    Returns:
      A hexlified SHA that is the commit ID of the next-nearest parent.
    """
    assert isinstance(file_path, str), 'file_path must be str, got %r' % type(
        file_path
    )
    assert isinstance(filenode, str), 'filenode must be str, got %r' % type(
        filenode
    )
    parent_options = {
        row[0].decode('ascii')
        for row in db.execute(
            'SELECT node FROM changedfiles '
            'WHERE filename = ? AND filenode = ?',
            (file_path, filenode),
        )
    }
    inner_walker = gitrepo.walk(walk_start, _OUR_ORDER)
    for w in inner_walker:
        if w.id.hex in parent_options:
            return w.id.hex
    raise error.ProgrammingError(
        'Unable to find introducing commit for %s node %s from %s',
        (file_path, filenode, walk_start),
    )


def fill_in_filelog(gitrepo, db, startcommit, path, startfilenode):
    """Given a starting commit and path, fill in a filelog's parent pointers.

    Args:
      gitrepo: a pygit2.Repository
      db: a handle to our sqlite database
      startcommit: a hexlified node id for the commit to start at
      path: the path of the file whose parent pointers we should fill in.
      filenode: the hexlified node id of the file at startcommit

    TODO: make filenode optional
    """
    assert isinstance(
        startcommit, str
    ), 'startcommit must be str, got %r' % type(startcommit)
    assert isinstance(
        startfilenode, str
    ), 'startfilenode must be str, got %r' % type(startfilenode)
    visit = collections.deque([(startcommit, startfilenode)])
    while visit:
        cnode, filenode = visit.popleft()
        commit = gitrepo[cnode]
        parents = []
        for parent in commit.parents:
            t = parent.tree
            for comp in path.split('/'):
                try:
                    t = gitrepo[t[comp].id]
                except KeyError:
                    break
            else:
                introducer = _find_nearest_ancestor_introducing_node(
                    db, gitrepo, path, parent.id, t.id.hex
                )
                parents.append((introducer, t.id.hex))
        p1node = p1fnode = p2node = p2fnode = gitutil.nullgit
        for par, parfnode in parents:
            found = int(
                db.execute(
                    'SELECT COUNT(*) FROM changedfiles WHERE '
                    'node = ? AND filename = ? AND filenode = ? AND '
                    'p1node NOT NULL',
                    (par, path, parfnode),
                ).fetchone()[0]
            )
            if found == 0:
                assert par is not None
                visit.append((par, parfnode))
        if parents:
            p1node, p1fnode = parents[0]
        if len(parents) == 2:
            p2node, p2fnode = parents[1]
        if len(parents) > 2:
            raise error.ProgrammingError(
                b"git support can't handle octopus merges"
            )
        db.execute(
            'UPDATE changedfiles SET '
            'p1node = ?, p1filenode = ?, p2node = ?, p2filenode = ? '
            'WHERE node = ? AND filename = ? AND filenode = ?',
            (p1node, p1fnode, p2node, p2fnode, commit.id.hex, path, filenode),
        )
    db.commit()


def _index_repo(gitrepo, db, progress_factory=lambda *args, **kwargs: None):
    # Identify all references so we can tell the walker to visit all of them.
    all_refs = gitrepo.listall_references()
    possible_heads = set()
    prog = progress_factory(b'refs')
    for pos, ref in enumerate(all_refs):
        if prog is not None:
            prog.update(pos)
        if not (
            ref.startswith('refs/heads/')  # local branch
            or ref.startswith('refs/tags/')  # tag
            or ref.startswith('refs/remotes/')  # remote branch
            or ref.startswith('refs/hg/')  # from this extension
        ):
            continue
        try:
            start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)
        except ValueError:
            # No commit to be found, so we don't care for hg's purposes.
            continue
        possible_heads.add(start.id)
    # Optimization: if the list of heads hasn't changed, don't
    # reindex the changelog. This doesn't matter on small
    # repositories, but on even moderately deep histories (eg cpython)
    # this is a very important performance win.
    #
    # TODO: we should figure out how to incrementally index history
    # (preferably by detecting rewinds!) so that we don't have to do a
    # full changelog walk every time a new commit is created.
-    cache_heads = {x[0] for x in db.execute('SELECT node FROM possible_heads')}
+    cache_heads = {
+        pycompat.sysstr(x[0])
+        for x in db.execute('SELECT node FROM possible_heads')
+    }
    walker = None
    cur_cache_heads = {h.hex for h in possible_heads}
    if cur_cache_heads == cache_heads:
        return
    for start in possible_heads:
        if walker is None:
            walker = gitrepo.walk(start, _OUR_ORDER)
        else:
            walker.push(start)

    # Empty out the existing changelog. Even for large-ish histories
    # we can do the top-level "walk all the commits" dance very
    # quickly as long as we don't need to figure out the changed files
    # list.
    db.execute('DELETE FROM changelog')
    if prog is not None:
        prog.complete()
    prog = progress_factory(b'commits')
    # This walker is sure to visit all the revisions in history, but
    # only once.
    for pos, commit in enumerate(walker):
        if prog is not None:
            prog.update(pos)
        p1 = p2 = nodemod.nullhex
        if len(commit.parents) > 2:
            raise error.ProgrammingError(
                (
                    b"git support can't handle octopus merges, "
                    b"found a commit with %d parents :("
                )
                % len(commit.parents)
            )
        if commit.parents:
            p1 = commit.parents[0].id.hex
        if len(commit.parents) == 2:
            p2 = commit.parents[1].id.hex
        db.execute(
            'INSERT INTO changelog (rev, node, p1, p2) VALUES(?, ?, ?, ?)',
            (pos, commit.id.hex, p1, p2),
        )

        num_changedfiles = db.execute(
            "SELECT COUNT(*) from changedfiles WHERE node = ?",
            (commit.id.hex,),
        ).fetchone()[0]
        if not num_changedfiles:
            files = {}
            # I *think* we only need to check p1 for changed files
            # (and therefore linkrevs), because any node that would
            # actually have this commit as a linkrev would be
            # completely new in this rev.
            p1 = commit.parents[0].id.hex if commit.parents else None
            if p1 is not None:
                patchgen = gitrepo.diff(p1, commit.id.hex, flags=_DIFF_FLAGS)
            else:
                patchgen = commit.tree.diff_to_tree(
                    swap=True, flags=_DIFF_FLAGS
                )
            new_files = (p.delta.new_file for p in patchgen)
            files = {
                nf.path: nf.id.hex
                for nf in new_files
                if nf.id.raw != nodemod.nullid
            }
            for p, n in files.items():
                # We intentionally set NULLs for any file parentage
                # information so it'll get demand-computed later. We
                # used to do it right here, and it was _very_ slow.
                db.execute(
                    'INSERT INTO changedfiles ('
                    'node, filename, filenode, p1node, p1filenode, p2node, '
                    'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',
                    (commit.id.hex, p, n, None, None, None, None),
                )
    db.execute('DELETE FROM heads')
    db.execute('DELETE FROM possible_heads')
    for hid in possible_heads:
        h = hid.hex
        db.execute('INSERT INTO possible_heads (node) VALUES(?)', (h,))
        haschild = db.execute(
            'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?', (h, h)
        ).fetchone()[0]
        if not haschild:
            db.execute('INSERT INTO heads (node) VALUES(?)', (h,))

    db.commit()
    if prog is not None:
        prog.complete()


def get_index(gitrepo, progress_factory=lambda *args, **kwargs: None):
    cachepath = os.path.join(
        pycompat.fsencode(gitrepo.path), b'..', b'.hg', b'cache'
    )
    if not os.path.exists(cachepath):
        os.makedirs(cachepath)
    dbpath = os.path.join(cachepath, b'git-commits.sqlite')
    db = _createdb(dbpath)
    # TODO check against gitrepo heads before doing a full index
    # TODO thread a ui.progress call into this layer
    _index_repo(gitrepo, db, progress_factory)
    return db
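
To see the effect of the fix end to end, here is a rough usage sketch, not from the patch: it assumes pygit2 is installed and that this module is importable as hgext.git.index (its location in the Mercurial tree); the repository path is a placeholder, and get_index() creates the sibling .hg/cache directory for the sqlite file if it is missing.

import pygit2

from hgext.git import index

# Placeholder path; any local git repository will do.
gitrepo = pygit2.Repository('/path/to/repo/.git')

db = index.get_index(gitrepo)  # first call walks the full changelog
db = index.get_index(gitrepo)  # second call: the cached heads now decode to
                               # str and compare equal to the pygit2 heads,
                               # so _index_repo() returns before re-walking
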