##// END OF EJS Templates
git: decode node IDs back into Python strings (issue6349)...
Hollis Blanchard -
r45472:fb2936c5 default
parent child Browse files
Show More
@@ -1,350 +1,353
1 1 from __future__ import absolute_import
2 2
3 3 import collections
4 4 import os
5 5 import sqlite3
6 6
7 7 from mercurial.i18n import _
8 8
9 9 from mercurial import (
10 10 encoding,
11 11 error,
12 12 node as nodemod,
13 13 pycompat,
14 14 )
15 15
16 16 from . import gitutil
17 17
18 18
19 19 pygit2 = gitutil.get_pygit2()
20 20
21 21 _CURRENT_SCHEMA_VERSION = 1
22 22 _SCHEMA = (
23 23 """
24 24 CREATE TABLE refs (
25 25 -- node and name are unique together. There may be more than one name for
26 26 -- a given node, and there may be no name at all for a given node (in the
27 27 -- case of an anonymous hg head).
28 28 node TEXT NOT NULL,
29 29 name TEXT
30 30 );
31 31
32 32 -- The "possible heads" of the repository, which we use to figure out
33 33 -- if we need to re-walk the changelog.
34 34 CREATE TABLE possible_heads (
35 35 node TEXT NOT NULL
36 36 );
37 37
38 38 -- The topological heads of the changelog, which hg depends on.
39 39 CREATE TABLE heads (
40 40 node TEXT NOT NULL
41 41 );
42 42
43 43 -- A total ordering of the changelog
44 44 CREATE TABLE changelog (
45 45 rev INTEGER NOT NULL PRIMARY KEY,
46 46 node TEXT NOT NULL,
47 47 p1 TEXT,
48 48 p2 TEXT
49 49 );
50 50
51 51 CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);
52 52 CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);
53 53
54 54 -- Changed files for each commit, which lets us dynamically build
55 55 -- filelogs.
56 56 CREATE TABLE changedfiles (
57 57 node TEXT NOT NULL,
58 58 filename TEXT NOT NULL,
59 59 -- 40 zeroes for deletions
60 60 filenode TEXT NOT NULL,
61 61 -- to handle filelog parentage:
62 62 p1node TEXT,
63 63 p1filenode TEXT,
64 64 p2node TEXT,
65 65 p2filenode TEXT
66 66 );
67 67
68 68 CREATE INDEX changedfiles_nodes_idx
69 69 ON changedfiles(node);
70 70
71 71 PRAGMA user_version=%d
72 72 """
73 73 % _CURRENT_SCHEMA_VERSION
74 74 )
75 75
76 76
77 77 def _createdb(path):
78 78 # print('open db', path)
79 79 # import traceback
80 80 # traceback.print_stack()
81 81 db = sqlite3.connect(encoding.strfromlocal(path))
82 82 db.text_factory = bytes
83 83
84 84 res = db.execute('PRAGMA user_version').fetchone()[0]
85 85
86 86 # New database.
87 87 if res == 0:
88 88 for statement in _SCHEMA.split(';'):
89 89 db.execute(statement.strip())
90 90
91 91 db.commit()
92 92
93 93 elif res == _CURRENT_SCHEMA_VERSION:
94 94 pass
95 95
96 96 else:
97 97 raise error.Abort(_(b'sqlite database has unrecognized version'))
98 98
99 99 db.execute('PRAGMA journal_mode=WAL')
100 100
101 101 return db
102 102
103 103
104 104 _OUR_ORDER = ()
105 105 if pygit2:
106 106 _OUR_ORDER = (
107 107 pygit2.GIT_SORT_TOPOLOGICAL
108 108 | pygit2.GIT_SORT_TIME
109 109 | pygit2.GIT_SORT_REVERSE
110 110 )
111 111
112 112 _DIFF_FLAGS = 1 << 21 # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2
113 113
114 114
115 115 def _find_nearest_ancestor_introducing_node(
116 116 db, gitrepo, file_path, walk_start, filenode
117 117 ):
118 118 """Find the nearest ancestor that introduces a file node.
119 119
120 120 Args:
121 121 db: a handle to our sqlite database.
122 122 gitrepo: A pygit2.Repository instance.
123 123 file_path: the path of a file in the repo
124 124 walk_start: a pygit2.Oid that is a commit where we should start walking
125 125 for our nearest ancestor.
126 126
127 127 Returns:
128 128 A hexlified SHA that is the commit ID of the next-nearest parent.
129 129 """
130 130 assert isinstance(file_path, str), 'file_path must be str, got %r' % type(
131 131 file_path
132 132 )
133 133 assert isinstance(filenode, str), 'filenode must be str, got %r' % type(
134 134 filenode
135 135 )
136 136 parent_options = {
137 137 row[0].decode('ascii')
138 138 for row in db.execute(
139 139 'SELECT node FROM changedfiles '
140 140 'WHERE filename = ? AND filenode = ?',
141 141 (file_path, filenode),
142 142 )
143 143 }
144 144 inner_walker = gitrepo.walk(walk_start, _OUR_ORDER)
145 145 for w in inner_walker:
146 146 if w.id.hex in parent_options:
147 147 return w.id.hex
148 148 raise error.ProgrammingError(
149 149 'Unable to find introducing commit for %s node %s from %s',
150 150 (file_path, filenode, walk_start),
151 151 )
152 152
153 153
154 154 def fill_in_filelog(gitrepo, db, startcommit, path, startfilenode):
155 155 """Given a starting commit and path, fill in a filelog's parent pointers.
156 156
157 157 Args:
158 158 gitrepo: a pygit2.Repository
159 159 db: a handle to our sqlite database
160 160 startcommit: a hexlified node id for the commit to start at
161 161 path: the path of the file whose parent pointers we should fill in.
162 162 filenode: the hexlified node id of the file at startcommit
163 163
164 164 TODO: make filenode optional
165 165 """
166 166 assert isinstance(
167 167 startcommit, str
168 168 ), 'startcommit must be str, got %r' % type(startcommit)
169 169 assert isinstance(
170 170 startfilenode, str
171 171 ), 'startfilenode must be str, got %r' % type(startfilenode)
172 172 visit = collections.deque([(startcommit, startfilenode)])
173 173 while visit:
174 174 cnode, filenode = visit.popleft()
175 175 commit = gitrepo[cnode]
176 176 parents = []
177 177 for parent in commit.parents:
178 178 t = parent.tree
179 179 for comp in path.split('/'):
180 180 try:
181 181 t = gitrepo[t[comp].id]
182 182 except KeyError:
183 183 break
184 184 else:
185 185 introducer = _find_nearest_ancestor_introducing_node(
186 186 db, gitrepo, path, parent.id, t.id.hex
187 187 )
188 188 parents.append((introducer, t.id.hex))
189 189 p1node = p1fnode = p2node = p2fnode = gitutil.nullgit
190 190 for par, parfnode in parents:
191 191 found = int(
192 192 db.execute(
193 193 'SELECT COUNT(*) FROM changedfiles WHERE '
194 194 'node = ? AND filename = ? AND filenode = ? AND '
195 195 'p1node NOT NULL',
196 196 (par, path, parfnode),
197 197 ).fetchone()[0]
198 198 )
199 199 if found == 0:
200 200 assert par is not None
201 201 visit.append((par, parfnode))
202 202 if parents:
203 203 p1node, p1fnode = parents[0]
204 204 if len(parents) == 2:
205 205 p2node, p2fnode = parents[1]
206 206 if len(parents) > 2:
207 207 raise error.ProgrammingError(
208 208 b"git support can't handle octopus merges"
209 209 )
210 210 db.execute(
211 211 'UPDATE changedfiles SET '
212 212 'p1node = ?, p1filenode = ?, p2node = ?, p2filenode = ? '
213 213 'WHERE node = ? AND filename = ? AND filenode = ?',
214 214 (p1node, p1fnode, p2node, p2fnode, commit.id.hex, path, filenode),
215 215 )
216 216 db.commit()
217 217
218 218
219 219 def _index_repo(gitrepo, db, progress_factory=lambda *args, **kwargs: None):
220 220 # Identify all references so we can tell the walker to visit all of them.
221 221 all_refs = gitrepo.listall_references()
222 222 possible_heads = set()
223 223 prog = progress_factory(b'refs')
224 224 for pos, ref in enumerate(all_refs):
225 225 if prog is not None:
226 226 prog.update(pos)
227 227 if not (
228 228 ref.startswith('refs/heads/') # local branch
229 229 or ref.startswith('refs/tags/') # tag
230 230 or ref.startswith('refs/remotes/') # remote branch
231 231 or ref.startswith('refs/hg/') # from this extension
232 232 ):
233 233 continue
234 234 try:
235 235 start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)
236 236 except ValueError:
237 237 # No commit to be found, so we don't care for hg's purposes.
238 238 continue
239 239 possible_heads.add(start.id)
240 240 # Optimization: if the list of heads hasn't changed, don't
241 241 # reindex, the changelog. This doesn't matter on small
242 242 # repositories, but on even moderately deep histories (eg cpython)
243 243 # this is a very important performance win.
244 244 #
245 245 # TODO: we should figure out how to incrementally index history
246 246 # (preferably by detecting rewinds!) so that we don't have to do a
247 247 # full changelog walk every time a new commit is created.
248 cache_heads = {x[0] for x in db.execute('SELECT node FROM possible_heads')}
248 cache_heads = {
249 pycompat.sysstr(x[0])
250 for x in db.execute('SELECT node FROM possible_heads')
251 }
249 252 walker = None
250 253 cur_cache_heads = {h.hex for h in possible_heads}
251 254 if cur_cache_heads == cache_heads:
252 255 return
253 256 for start in possible_heads:
254 257 if walker is None:
255 258 walker = gitrepo.walk(start, _OUR_ORDER)
256 259 else:
257 260 walker.push(start)
258 261
259 262 # Empty out the existing changelog. Even for large-ish histories
260 263 # we can do the top-level "walk all the commits" dance very
261 264 # quickly as long as we don't need to figure out the changed files
262 265 # list.
263 266 db.execute('DELETE FROM changelog')
264 267 if prog is not None:
265 268 prog.complete()
266 269 prog = progress_factory(b'commits')
267 270 # This walker is sure to visit all the revisions in history, but
268 271 # only once.
269 272 for pos, commit in enumerate(walker):
270 273 if prog is not None:
271 274 prog.update(pos)
272 275 p1 = p2 = nodemod.nullhex
273 276 if len(commit.parents) > 2:
274 277 raise error.ProgrammingError(
275 278 (
276 279 b"git support can't handle octopus merges, "
277 280 b"found a commit with %d parents :("
278 281 )
279 282 % len(commit.parents)
280 283 )
281 284 if commit.parents:
282 285 p1 = commit.parents[0].id.hex
283 286 if len(commit.parents) == 2:
284 287 p2 = commit.parents[1].id.hex
285 288 db.execute(
286 289 'INSERT INTO changelog (rev, node, p1, p2) VALUES(?, ?, ?, ?)',
287 290 (pos, commit.id.hex, p1, p2),
288 291 )
289 292
290 293 num_changedfiles = db.execute(
291 294 "SELECT COUNT(*) from changedfiles WHERE node = ?",
292 295 (commit.id.hex,),
293 296 ).fetchone()[0]
294 297 if not num_changedfiles:
295 298 files = {}
296 299 # I *think* we only need to check p1 for changed files
297 300 # (and therefore linkrevs), because any node that would
298 301 # actually have this commit as a linkrev would be
299 302 # completely new in this rev.
300 303 p1 = commit.parents[0].id.hex if commit.parents else None
301 304 if p1 is not None:
302 305 patchgen = gitrepo.diff(p1, commit.id.hex, flags=_DIFF_FLAGS)
303 306 else:
304 307 patchgen = commit.tree.diff_to_tree(
305 308 swap=True, flags=_DIFF_FLAGS
306 309 )
307 310 new_files = (p.delta.new_file for p in patchgen)
308 311 files = {
309 312 nf.path: nf.id.hex
310 313 for nf in new_files
311 314 if nf.id.raw != nodemod.nullid
312 315 }
313 316 for p, n in files.items():
314 317 # We intentionally set NULLs for any file parentage
315 318 # information so it'll get demand-computed later. We
316 319 # used to do it right here, and it was _very_ slow.
317 320 db.execute(
318 321 'INSERT INTO changedfiles ('
319 322 'node, filename, filenode, p1node, p1filenode, p2node, '
320 323 'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',
321 324 (commit.id.hex, p, n, None, None, None, None),
322 325 )
323 326 db.execute('DELETE FROM heads')
324 327 db.execute('DELETE FROM possible_heads')
325 328 for hid in possible_heads:
326 329 h = hid.hex
327 330 db.execute('INSERT INTO possible_heads (node) VALUES(?)', (h,))
328 331 haschild = db.execute(
329 332 'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?', (h, h)
330 333 ).fetchone()[0]
331 334 if not haschild:
332 335 db.execute('INSERT INTO heads (node) VALUES(?)', (h,))
333 336
334 337 db.commit()
335 338 if prog is not None:
336 339 prog.complete()
337 340
338 341
339 342 def get_index(gitrepo, progress_factory=lambda *args, **kwargs: None):
340 343 cachepath = os.path.join(
341 344 pycompat.fsencode(gitrepo.path), b'..', b'.hg', b'cache'
342 345 )
343 346 if not os.path.exists(cachepath):
344 347 os.makedirs(cachepath)
345 348 dbpath = os.path.join(cachepath, b'git-commits.sqlite')
346 349 db = _createdb(dbpath)
347 350 # TODO check against gitrepo heads before doing a full index
348 351 # TODO thread a ui.progress call into this layer
349 352 _index_repo(gitrepo, db, progress_factory)
350 353 return db
General Comments 0
You need to be logged in to leave comments. Login now