##// END OF EJS Templates
parsers: reduce raw_length when truncating...
Bryan O'Sullivan -
r16784:12a852c7 default
parent child Browse files
Show More
@@ -1,1287 +1,1291 b''
1 1 /*
2 2 parsers.c - efficient content parsing
3 3
4 4 Copyright 2008 Matt Mackall <mpm@selenic.com> and others
5 5
6 6 This software may be used and distributed according to the terms of
7 7 the GNU General Public License, incorporated herein by reference.
8 8 */
9 9
10 10 #include <Python.h>
11 11 #include <ctype.h>
12 12 #include <string.h>
13 13
14 14 #include "util.h"
15 15
16 16 static inline int hexdigit(const char *p, Py_ssize_t off)
17 17 {
18 18 char c = p[off];
19 19
20 20 if (c >= '0' && c <= '9')
21 21 return c - '0';
22 22 if (c >= 'a' && c <= 'f')
23 23 return c - 'a' + 10;
24 24 if (c >= 'A' && c <= 'F')
25 25 return c - 'A' + 10;
26 26
27 27 PyErr_SetString(PyExc_ValueError, "input contains non-hex character");
28 28 return 0;
29 29 }
30 30
31 31 /*
32 32 * Turn a hex-encoded string into binary.
33 33 */
34 34 static PyObject *unhexlify(const char *str, int len)
35 35 {
36 36 PyObject *ret;
37 37 char *d;
38 38 int i;
39 39
40 40 ret = PyBytes_FromStringAndSize(NULL, len / 2);
41 41
42 42 if (!ret)
43 43 return NULL;
44 44
45 45 d = PyBytes_AsString(ret);
46 46
47 47 for (i = 0; i < len;) {
48 48 int hi = hexdigit(str, i++);
49 49 int lo = hexdigit(str, i++);
50 50 *d++ = (hi << 4) | lo;
51 51 }
52 52
53 53 return ret;
54 54 }
55 55
56 56 /*
57 57 * This code assumes that a manifest is stitched together with newline
58 58 * ('\n') characters.
59 59 */
60 60 static PyObject *parse_manifest(PyObject *self, PyObject *args)
61 61 {
62 62 PyObject *mfdict, *fdict;
63 63 char *str, *cur, *start, *zero;
64 64 int len;
65 65
66 66 if (!PyArg_ParseTuple(args, "O!O!s#:parse_manifest",
67 67 &PyDict_Type, &mfdict,
68 68 &PyDict_Type, &fdict,
69 69 &str, &len))
70 70 goto quit;
71 71
72 72 for (start = cur = str, zero = NULL; cur < str + len; cur++) {
73 73 PyObject *file = NULL, *node = NULL;
74 74 PyObject *flags = NULL;
75 75 int nlen;
76 76
77 77 if (!*cur) {
78 78 zero = cur;
79 79 continue;
80 80 }
81 81 else if (*cur != '\n')
82 82 continue;
83 83
84 84 if (!zero) {
85 85 PyErr_SetString(PyExc_ValueError,
86 86 "manifest entry has no separator");
87 87 goto quit;
88 88 }
89 89
90 90 file = PyBytes_FromStringAndSize(start, zero - start);
91 91
92 92 if (!file)
93 93 goto bail;
94 94
95 95 nlen = cur - zero - 1;
96 96
97 97 node = unhexlify(zero + 1, nlen > 40 ? 40 : nlen);
98 98 if (!node)
99 99 goto bail;
100 100
101 101 if (nlen > 40) {
102 102 flags = PyBytes_FromStringAndSize(zero + 41,
103 103 nlen - 40);
104 104 if (!flags)
105 105 goto bail;
106 106
107 107 if (PyDict_SetItem(fdict, file, flags) == -1)
108 108 goto bail;
109 109 }
110 110
111 111 if (PyDict_SetItem(mfdict, file, node) == -1)
112 112 goto bail;
113 113
114 114 start = cur + 1;
115 115 zero = NULL;
116 116
117 117 Py_XDECREF(flags);
118 118 Py_XDECREF(node);
119 119 Py_XDECREF(file);
120 120 continue;
121 121 bail:
122 122 Py_XDECREF(flags);
123 123 Py_XDECREF(node);
124 124 Py_XDECREF(file);
125 125 goto quit;
126 126 }
127 127
128 128 if (len > 0 && *(cur - 1) != '\n') {
129 129 PyErr_SetString(PyExc_ValueError,
130 130 "manifest contains trailing garbage");
131 131 goto quit;
132 132 }
133 133
134 134 Py_INCREF(Py_None);
135 135 return Py_None;
136 136 quit:
137 137 return NULL;
138 138 }
139 139
140 140 static PyObject *parse_dirstate(PyObject *self, PyObject *args)
141 141 {
142 142 PyObject *dmap, *cmap, *parents = NULL, *ret = NULL;
143 143 PyObject *fname = NULL, *cname = NULL, *entry = NULL;
144 144 char *str, *cur, *end, *cpos;
145 145 int state, mode, size, mtime;
146 146 unsigned int flen;
147 147 int len;
148 148
149 149 if (!PyArg_ParseTuple(args, "O!O!s#:parse_dirstate",
150 150 &PyDict_Type, &dmap,
151 151 &PyDict_Type, &cmap,
152 152 &str, &len))
153 153 goto quit;
154 154
155 155 /* read parents */
156 156 if (len < 40)
157 157 goto quit;
158 158
159 159 parents = Py_BuildValue("s#s#", str, 20, str + 20, 20);
160 160 if (!parents)
161 161 goto quit;
162 162
163 163 /* read filenames */
164 164 cur = str + 40;
165 165 end = str + len;
166 166
167 167 while (cur < end - 17) {
168 168 /* unpack header */
169 169 state = *cur;
170 170 mode = getbe32(cur + 1);
171 171 size = getbe32(cur + 5);
172 172 mtime = getbe32(cur + 9);
173 173 flen = getbe32(cur + 13);
174 174 cur += 17;
175 175 if (cur + flen > end || cur + flen < cur) {
176 176 PyErr_SetString(PyExc_ValueError, "overflow in dirstate");
177 177 goto quit;
178 178 }
179 179
180 180 entry = Py_BuildValue("ciii", state, mode, size, mtime);
181 181 if (!entry)
182 182 goto quit;
183 183 PyObject_GC_UnTrack(entry); /* don't waste time with this */
184 184
185 185 cpos = memchr(cur, 0, flen);
186 186 if (cpos) {
187 187 fname = PyBytes_FromStringAndSize(cur, cpos - cur);
188 188 cname = PyBytes_FromStringAndSize(cpos + 1,
189 189 flen - (cpos - cur) - 1);
190 190 if (!fname || !cname ||
191 191 PyDict_SetItem(cmap, fname, cname) == -1 ||
192 192 PyDict_SetItem(dmap, fname, entry) == -1)
193 193 goto quit;
194 194 Py_DECREF(cname);
195 195 } else {
196 196 fname = PyBytes_FromStringAndSize(cur, flen);
197 197 if (!fname ||
198 198 PyDict_SetItem(dmap, fname, entry) == -1)
199 199 goto quit;
200 200 }
201 201 cur += flen;
202 202 Py_DECREF(fname);
203 203 Py_DECREF(entry);
204 204 fname = cname = entry = NULL;
205 205 }
206 206
207 207 ret = parents;
208 208 Py_INCREF(ret);
209 209 quit:
210 210 Py_XDECREF(fname);
211 211 Py_XDECREF(cname);
212 212 Py_XDECREF(entry);
213 213 Py_XDECREF(parents);
214 214 return ret;
215 215 }
216 216
217 217 /*
218 218 * A base-16 trie for fast node->rev mapping.
219 219 *
220 220 * Positive value is index of the next node in the trie
221 221 * Negative value is a leaf: -(rev + 1)
222 222 * Zero is empty
223 223 */
224 224 typedef struct {
225 225 int children[16];
226 226 } nodetree;
227 227
228 228 /*
229 229 * This class has two behaviours.
230 230 *
231 231 * When used in a list-like way (with integer keys), we decode an
232 232 * entry in a RevlogNG index file on demand. Our last entry is a
233 233 * sentinel, always a nullid. We have limited support for
234 234 * integer-keyed insert and delete, only at elements right before the
235 235 * sentinel.
236 236 *
237 237 * With string keys, we lazily perform a reverse mapping from node to
238 238 * rev, using a base-16 trie.
239 239 */
240 240 typedef struct {
241 241 PyObject_HEAD
242 242 /* Type-specific fields go here. */
243 243 PyObject *data; /* raw bytes of index */
244 244 PyObject **cache; /* cached tuples */
245 245 const char **offsets; /* populated on demand */
246 246 Py_ssize_t raw_length; /* original number of elements */
247 247 Py_ssize_t length; /* current number of elements */
248 248 PyObject *added; /* populated on demand */
249 249 nodetree *nt; /* base-16 trie */
250 250 int ntlength; /* # nodes in use */
251 251 int ntcapacity; /* # nodes allocated */
252 252 int ntdepth; /* maximum depth of tree */
253 253 int ntsplits; /* # splits performed */
254 254 int ntrev; /* last rev scanned */
255 255 int ntlookups; /* # lookups */
256 256 int ntmisses; /* # lookups that miss the cache */
257 257 int inlined;
258 258 } indexObject;
259 259
260 260 static Py_ssize_t index_length(const indexObject *self)
261 261 {
262 262 if (self->added == NULL)
263 263 return self->length;
264 264 return self->length + PyList_GET_SIZE(self->added);
265 265 }
266 266
267 267 static PyObject *nullentry;
268 268 static const char nullid[20];
269 269
270 270 static long inline_scan(indexObject *self, const char **offsets);
271 271
272 272 #if LONG_MAX == 0x7fffffffL
273 273 static char *tuple_format = "Kiiiiiis#";
274 274 #else
275 275 static char *tuple_format = "kiiiiiis#";
276 276 #endif
277 277
278 278 /*
279 279 * Return a pointer to the beginning of a RevlogNG record.
280 280 */
281 281 static const char *index_deref(indexObject *self, Py_ssize_t pos)
282 282 {
283 283 if (self->inlined && pos > 0) {
284 284 if (self->offsets == NULL) {
285 285 self->offsets = malloc(self->raw_length *
286 286 sizeof(*self->offsets));
287 287 if (self->offsets == NULL)
288 288 return (const char *)PyErr_NoMemory();
289 289 inline_scan(self, self->offsets);
290 290 }
291 291 return self->offsets[pos];
292 292 }
293 293
294 294 return PyString_AS_STRING(self->data) + pos * 64;
295 295 }
296 296
297 297 /*
298 298 * RevlogNG format (all in big endian, data may be inlined):
299 299 * 6 bytes: offset
300 300 * 2 bytes: flags
301 301 * 4 bytes: compressed length
302 302 * 4 bytes: uncompressed length
303 303 * 4 bytes: base revision
304 304 * 4 bytes: link revision
305 305 * 4 bytes: parent 1 revision
306 306 * 4 bytes: parent 2 revision
307 307 * 32 bytes: nodeid (only 20 bytes used)
308 308 */
309 309 static PyObject *index_get(indexObject *self, Py_ssize_t pos)
310 310 {
311 311 uint64_t offset_flags;
312 312 int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2;
313 313 const char *c_node_id;
314 314 const char *data;
315 315 Py_ssize_t length = index_length(self);
316 316 PyObject *entry;
317 317
318 318 if (pos < 0)
319 319 pos += length;
320 320
321 321 if (pos < 0 || pos >= length) {
322 322 PyErr_SetString(PyExc_IndexError, "revlog index out of range");
323 323 return NULL;
324 324 }
325 325
326 326 if (pos == length - 1) {
327 327 Py_INCREF(nullentry);
328 328 return nullentry;
329 329 }
330 330
331 331 if (pos >= self->length - 1) {
332 332 PyObject *obj;
333 333 obj = PyList_GET_ITEM(self->added, pos - self->length + 1);
334 334 Py_INCREF(obj);
335 335 return obj;
336 336 }
337 337
338 338 if (self->cache) {
339 339 if (self->cache[pos]) {
340 340 Py_INCREF(self->cache[pos]);
341 341 return self->cache[pos];
342 342 }
343 343 } else {
344 344 self->cache = calloc(self->raw_length, sizeof(PyObject *));
345 345 if (self->cache == NULL)
346 346 return PyErr_NoMemory();
347 347 }
348 348
349 349 data = index_deref(self, pos);
350 350 if (data == NULL)
351 351 return NULL;
352 352
353 353 offset_flags = getbe32(data + 4);
354 354 if (pos == 0) /* mask out version number for the first entry */
355 355 offset_flags &= 0xFFFF;
356 356 else {
357 357 uint32_t offset_high = getbe32(data);
358 358 offset_flags |= ((uint64_t)offset_high) << 32;
359 359 }
360 360
361 361 comp_len = getbe32(data + 8);
362 362 uncomp_len = getbe32(data + 12);
363 363 base_rev = getbe32(data + 16);
364 364 link_rev = getbe32(data + 20);
365 365 parent_1 = getbe32(data + 24);
366 366 parent_2 = getbe32(data + 28);
367 367 c_node_id = data + 32;
368 368
369 369 entry = Py_BuildValue(tuple_format, offset_flags, comp_len,
370 370 uncomp_len, base_rev, link_rev,
371 371 parent_1, parent_2, c_node_id, 20);
372 372
373 373 if (entry)
374 374 PyObject_GC_UnTrack(entry);
375 375
376 376 self->cache[pos] = entry;
377 377 Py_INCREF(entry);
378 378
379 379 return entry;
380 380 }
381 381
382 382 /*
383 383 * Return the 20-byte SHA of the node corresponding to the given rev.
384 384 */
385 385 static const char *index_node(indexObject *self, Py_ssize_t pos)
386 386 {
387 387 Py_ssize_t length = index_length(self);
388 388 const char *data;
389 389
390 390 if (pos == length - 1 || pos == INT_MAX)
391 391 return nullid;
392 392
393 393 if (pos >= length)
394 394 return NULL;
395 395
396 396 if (pos >= self->length - 1) {
397 397 PyObject *tuple, *str;
398 398 tuple = PyList_GET_ITEM(self->added, pos - self->length + 1);
399 399 str = PyTuple_GetItem(tuple, 7);
400 400 return str ? PyString_AS_STRING(str) : NULL;
401 401 }
402 402
403 403 data = index_deref(self, pos);
404 404 return data ? data + 32 : NULL;
405 405 }
406 406
407 407 static int nt_insert(indexObject *self, const char *node, int rev);
408 408
409 409 static int node_check(PyObject *obj, char **node, Py_ssize_t *nodelen)
410 410 {
411 411 if (PyString_AsStringAndSize(obj, node, nodelen) == -1)
412 412 return -1;
413 413 if (*nodelen == 20)
414 414 return 0;
415 415 PyErr_SetString(PyExc_ValueError, "20-byte hash required");
416 416 return -1;
417 417 }
418 418
419 419 static PyObject *index_insert(indexObject *self, PyObject *args)
420 420 {
421 421 PyObject *obj;
422 422 char *node;
423 423 long offset;
424 424 Py_ssize_t len, nodelen;
425 425
426 426 if (!PyArg_ParseTuple(args, "lO", &offset, &obj))
427 427 return NULL;
428 428
429 429 if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != 8) {
430 430 PyErr_SetString(PyExc_TypeError, "8-tuple required");
431 431 return NULL;
432 432 }
433 433
434 434 if (node_check(PyTuple_GET_ITEM(obj, 7), &node, &nodelen) == -1)
435 435 return NULL;
436 436
437 437 len = index_length(self);
438 438
439 439 if (offset < 0)
440 440 offset += len;
441 441
442 442 if (offset != len - 1) {
443 443 PyErr_SetString(PyExc_IndexError,
444 444 "insert only supported at index -1");
445 445 return NULL;
446 446 }
447 447
448 448 if (offset > INT_MAX) {
449 449 PyErr_SetString(PyExc_ValueError,
450 450 "currently only 2**31 revs supported");
451 451 return NULL;
452 452 }
453 453
454 454 if (self->added == NULL) {
455 455 self->added = PyList_New(0);
456 456 if (self->added == NULL)
457 457 return NULL;
458 458 }
459 459
460 460 if (PyList_Append(self->added, obj) == -1)
461 461 return NULL;
462 462
463 463 if (self->nt)
464 464 nt_insert(self, node, (int)offset);
465 465
466 466 Py_RETURN_NONE;
467 467 }
468 468
469 469 static void _index_clearcaches(indexObject *self)
470 470 {
471 471 if (self->cache) {
472 472 Py_ssize_t i;
473 473
474 474 for (i = 0; i < self->raw_length; i++)
475 475 Py_CLEAR(self->cache[i]);
476 476 free(self->cache);
477 477 self->cache = NULL;
478 478 }
479 479 if (self->offsets) {
480 480 free(self->offsets);
481 481 self->offsets = NULL;
482 482 }
483 483 if (self->nt) {
484 484 free(self->nt);
485 485 self->nt = NULL;
486 486 }
487 487 }
488 488
489 489 static PyObject *index_clearcaches(indexObject *self)
490 490 {
491 491 _index_clearcaches(self);
492 492 self->ntlength = self->ntcapacity = 0;
493 493 self->ntdepth = self->ntsplits = 0;
494 494 self->ntrev = -1;
495 495 self->ntlookups = self->ntmisses = 0;
496 496 Py_RETURN_NONE;
497 497 }
498 498
499 499 static PyObject *index_stats(indexObject *self)
500 500 {
501 501 PyObject *obj = PyDict_New();
502 502
503 503 if (obj == NULL)
504 504 return NULL;
505 505
506 506 #define istat(__n, __d) \
507 507 if (PyDict_SetItemString(obj, __d, PyInt_FromSsize_t(self->__n)) == -1) \
508 508 goto bail;
509 509
510 510 if (self->added) {
511 511 Py_ssize_t len = PyList_GET_SIZE(self->added);
512 512 if (PyDict_SetItemString(obj, "index entries added",
513 513 PyInt_FromSsize_t(len)) == -1)
514 514 goto bail;
515 515 }
516 516
517 517 if (self->raw_length != self->length - 1)
518 518 istat(raw_length, "revs on disk");
519 519 istat(length, "revs in memory");
520 520 istat(ntcapacity, "node trie capacity");
521 521 istat(ntdepth, "node trie depth");
522 522 istat(ntlength, "node trie count");
523 523 istat(ntlookups, "node trie lookups");
524 524 istat(ntmisses, "node trie misses");
525 525 istat(ntrev, "node trie last rev scanned");
526 526 istat(ntsplits, "node trie splits");
527 527
528 528 #undef istat
529 529
530 530 return obj;
531 531
532 532 bail:
533 533 Py_XDECREF(obj);
534 534 return NULL;
535 535 }
536 536
537 537 static inline int nt_level(const char *node, Py_ssize_t level)
538 538 {
539 539 int v = node[level>>1];
540 540 if (!(level & 1))
541 541 v >>= 4;
542 542 return v & 0xf;
543 543 }
544 544
545 545 /*
546 546 * Return values:
547 547 *
548 548 * -4: match is ambiguous (multiple candidates)
549 549 * -2: not found
550 550 * rest: valid rev
551 551 */
552 552 static int nt_find(indexObject *self, const char *node, Py_ssize_t nodelen,
553 553 int hex)
554 554 {
555 555 int (*getnybble)(const char *, Py_ssize_t) = hex ? hexdigit : nt_level;
556 556 int level, maxlevel, off;
557 557
558 558 if (nodelen == 20 && node[0] == '\0' && memcmp(node, nullid, 20) == 0)
559 559 return -1;
560 560
561 561 if (self->nt == NULL)
562 562 return -2;
563 563
564 564 if (hex)
565 565 maxlevel = nodelen > 40 ? 40 : (int)nodelen;
566 566 else
567 567 maxlevel = nodelen > 20 ? 40 : ((int)nodelen * 2);
568 568
569 569 for (level = off = 0; level < maxlevel; level++) {
570 570 int k = getnybble(node, level);
571 571 nodetree *n = &self->nt[off];
572 572 int v = n->children[k];
573 573
574 574 if (v < 0) {
575 575 const char *n;
576 576 Py_ssize_t i;
577 577
578 578 v = -v - 1;
579 579 n = index_node(self, v);
580 580 if (n == NULL)
581 581 return -2;
582 582 for (i = level; i < maxlevel; i++)
583 583 if (getnybble(node, i) != nt_level(n, i))
584 584 return -2;
585 585 return v;
586 586 }
587 587 if (v == 0)
588 588 return -2;
589 589 off = v;
590 590 }
591 591 /* multiple matches against an ambiguous prefix */
592 592 return -4;
593 593 }
594 594
595 595 static int nt_new(indexObject *self)
596 596 {
597 597 if (self->ntlength == self->ntcapacity) {
598 598 self->ntcapacity *= 2;
599 599 self->nt = realloc(self->nt,
600 600 self->ntcapacity * sizeof(nodetree));
601 601 if (self->nt == NULL) {
602 602 PyErr_SetString(PyExc_MemoryError, "out of memory");
603 603 return -1;
604 604 }
605 605 memset(&self->nt[self->ntlength], 0,
606 606 sizeof(nodetree) * (self->ntcapacity - self->ntlength));
607 607 }
608 608 return self->ntlength++;
609 609 }
610 610
611 611 static int nt_insert(indexObject *self, const char *node, int rev)
612 612 {
613 613 int level = 0;
614 614 int off = 0;
615 615
616 616 while (level < 40) {
617 617 int k = nt_level(node, level);
618 618 nodetree *n;
619 619 int v;
620 620
621 621 n = &self->nt[off];
622 622 v = n->children[k];
623 623
624 624 if (v == 0) {
625 625 n->children[k] = -rev - 1;
626 626 return 0;
627 627 }
628 628 if (v < 0) {
629 629 const char *oldnode = index_node(self, -v - 1);
630 630 int noff;
631 631
632 632 if (!oldnode || !memcmp(oldnode, node, 20)) {
633 633 n->children[k] = -rev - 1;
634 634 return 0;
635 635 }
636 636 noff = nt_new(self);
637 637 if (noff == -1)
638 638 return -1;
639 639 /* self->nt may have been changed by realloc */
640 640 self->nt[off].children[k] = noff;
641 641 off = noff;
642 642 n = &self->nt[off];
643 643 n->children[nt_level(oldnode, ++level)] = v;
644 644 if (level > self->ntdepth)
645 645 self->ntdepth = level;
646 646 self->ntsplits += 1;
647 647 } else {
648 648 level += 1;
649 649 off = v;
650 650 }
651 651 }
652 652
653 653 return -1;
654 654 }
655 655
656 656 static int nt_init(indexObject *self)
657 657 {
658 658 if (self->nt == NULL) {
659 659 self->ntcapacity = self->raw_length < 4
660 660 ? 4 : self->raw_length / 2;
661 661 self->nt = calloc(self->ntcapacity, sizeof(nodetree));
662 662 if (self->nt == NULL) {
663 663 PyErr_NoMemory();
664 664 return -1;
665 665 }
666 666 self->ntlength = 1;
667 667 self->ntrev = (int)index_length(self) - 1;
668 668 self->ntlookups = 1;
669 669 self->ntmisses = 0;
670 670 if (nt_insert(self, nullid, INT_MAX) == -1)
671 671 return -1;
672 672 }
673 673 return 0;
674 674 }
675 675
676 676 /*
677 677 * Return values:
678 678 *
679 679 * -3: error (exception set)
680 680 * -2: not found (no exception set)
681 681 * rest: valid rev
682 682 */
683 683 static int index_find_node(indexObject *self,
684 684 const char *node, Py_ssize_t nodelen)
685 685 {
686 686 int rev;
687 687
688 688 self->ntlookups++;
689 689 rev = nt_find(self, node, nodelen, 0);
690 690 if (rev >= -1)
691 691 return rev;
692 692
693 693 if (nt_init(self) == -1)
694 694 return -3;
695 695
696 696 /*
697 697 * For the first handful of lookups, we scan the entire index,
698 698 * and cache only the matching nodes. This optimizes for cases
699 699 * like "hg tip", where only a few nodes are accessed.
700 700 *
701 701 * After that, we cache every node we visit, using a single
702 702 * scan amortized over multiple lookups. This gives the best
703 703 * bulk performance, e.g. for "hg log".
704 704 */
705 705 if (self->ntmisses++ < 4) {
706 706 for (rev = self->ntrev - 1; rev >= 0; rev--) {
707 707 const char *n = index_node(self, rev);
708 708 if (n == NULL)
709 709 return -2;
710 710 if (memcmp(node, n, nodelen > 20 ? 20 : nodelen) == 0) {
711 711 if (nt_insert(self, n, rev) == -1)
712 712 return -3;
713 713 break;
714 714 }
715 715 }
716 716 } else {
717 717 for (rev = self->ntrev - 1; rev >= 0; rev--) {
718 718 const char *n = index_node(self, rev);
719 719 if (n == NULL) {
720 720 self->ntrev = rev + 1;
721 721 return -2;
722 722 }
723 723 if (nt_insert(self, n, rev) == -1) {
724 724 self->ntrev = rev + 1;
725 725 return -3;
726 726 }
727 727 if (memcmp(node, n, nodelen > 20 ? 20 : nodelen) == 0) {
728 728 break;
729 729 }
730 730 }
731 731 self->ntrev = rev;
732 732 }
733 733
734 734 if (rev >= 0)
735 735 return rev;
736 736 return -2;
737 737 }
738 738
739 739 static PyObject *raise_revlog_error(void)
740 740 {
741 741 static PyObject *errclass;
742 742 PyObject *mod = NULL, *errobj;
743 743
744 744 if (errclass == NULL) {
745 745 PyObject *dict;
746 746
747 747 mod = PyImport_ImportModule("mercurial.error");
748 748 if (mod == NULL)
749 749 goto classfail;
750 750
751 751 dict = PyModule_GetDict(mod);
752 752 if (dict == NULL)
753 753 goto classfail;
754 754
755 755 errclass = PyDict_GetItemString(dict, "RevlogError");
756 756 if (errclass == NULL) {
757 757 PyErr_SetString(PyExc_SystemError,
758 758 "could not find RevlogError");
759 759 goto classfail;
760 760 }
761 761 Py_INCREF(errclass);
762 762 }
763 763
764 764 errobj = PyObject_CallFunction(errclass, NULL);
765 765 if (errobj == NULL)
766 766 return NULL;
767 767 PyErr_SetObject(errclass, errobj);
768 768 return errobj;
769 769
770 770 classfail:
771 771 Py_XDECREF(mod);
772 772 return NULL;
773 773 }
774 774
775 775 static PyObject *index_getitem(indexObject *self, PyObject *value)
776 776 {
777 777 char *node;
778 778 Py_ssize_t nodelen;
779 779 int rev;
780 780
781 781 if (PyInt_Check(value))
782 782 return index_get(self, PyInt_AS_LONG(value));
783 783
784 784 if (node_check(value, &node, &nodelen) == -1)
785 785 return NULL;
786 786 rev = index_find_node(self, node, nodelen);
787 787 if (rev >= -1)
788 788 return PyInt_FromLong(rev);
789 789 if (rev == -2)
790 790 raise_revlog_error();
791 791 return NULL;
792 792 }
793 793
794 794 static int nt_partialmatch(indexObject *self, const char *node,
795 795 Py_ssize_t nodelen)
796 796 {
797 797 int rev;
798 798
799 799 if (nt_init(self) == -1)
800 800 return -3;
801 801
802 802 if (self->ntrev > 0) {
803 803 /* ensure that the radix tree is fully populated */
804 804 for (rev = self->ntrev - 1; rev >= 0; rev--) {
805 805 const char *n = index_node(self, rev);
806 806 if (n == NULL)
807 807 return -2;
808 808 if (nt_insert(self, n, rev) == -1)
809 809 return -3;
810 810 }
811 811 self->ntrev = rev;
812 812 }
813 813
814 814 return nt_find(self, node, nodelen, 1);
815 815 }
816 816
817 817 static PyObject *index_partialmatch(indexObject *self, PyObject *args)
818 818 {
819 819 const char *fullnode;
820 820 int nodelen;
821 821 char *node;
822 822 int rev, i;
823 823
824 824 if (!PyArg_ParseTuple(args, "s#", &node, &nodelen))
825 825 return NULL;
826 826
827 827 if (nodelen < 4) {
828 828 PyErr_SetString(PyExc_ValueError, "key too short");
829 829 return NULL;
830 830 }
831 831
832 832 if (nodelen > 40)
833 833 nodelen = 40;
834 834
835 835 for (i = 0; i < nodelen; i++)
836 836 hexdigit(node, i);
837 837 if (PyErr_Occurred()) {
838 838 /* input contains non-hex characters */
839 839 PyErr_Clear();
840 840 Py_RETURN_NONE;
841 841 }
842 842
843 843 rev = nt_partialmatch(self, node, nodelen);
844 844
845 845 switch (rev) {
846 846 case -4:
847 847 raise_revlog_error();
848 848 case -3:
849 849 return NULL;
850 850 case -2:
851 851 Py_RETURN_NONE;
852 852 case -1:
853 853 return PyString_FromStringAndSize(nullid, 20);
854 854 }
855 855
856 856 fullnode = index_node(self, rev);
857 857 if (fullnode == NULL) {
858 858 PyErr_Format(PyExc_IndexError,
859 859 "could not access rev %d", rev);
860 860 return NULL;
861 861 }
862 862 return PyString_FromStringAndSize(fullnode, 20);
863 863 }
864 864
865 865 static PyObject *index_m_get(indexObject *self, PyObject *args)
866 866 {
867 867 Py_ssize_t nodelen;
868 868 PyObject *val;
869 869 char *node;
870 870 int rev;
871 871
872 872 if (!PyArg_ParseTuple(args, "O", &val))
873 873 return NULL;
874 874 if (node_check(val, &node, &nodelen) == -1)
875 875 return NULL;
876 876 rev = index_find_node(self, node, nodelen);
877 877 if (rev == -3)
878 878 return NULL;
879 879 if (rev == -2)
880 880 Py_RETURN_NONE;
881 881 return PyInt_FromLong(rev);
882 882 }
883 883
884 884 static int index_contains(indexObject *self, PyObject *value)
885 885 {
886 886 char *node;
887 887 Py_ssize_t nodelen;
888 888
889 889 if (PyInt_Check(value)) {
890 890 long rev = PyInt_AS_LONG(value);
891 891 return rev >= -1 && rev < index_length(self);
892 892 }
893 893
894 894 if (node_check(value, &node, &nodelen) == -1)
895 895 return -1;
896 896
897 897 switch (index_find_node(self, node, nodelen)) {
898 898 case -3:
899 899 return -1;
900 900 case -2:
901 901 return 0;
902 902 default:
903 903 return 1;
904 904 }
905 905 }
906 906
907 907 /*
908 908 * Invalidate any trie entries introduced by added revs.
909 909 */
910 910 static void nt_invalidate_added(indexObject *self, Py_ssize_t start)
911 911 {
912 912 Py_ssize_t i, len = PyList_GET_SIZE(self->added);
913 913
914 914 for (i = start; i < len; i++) {
915 915 PyObject *tuple = PyList_GET_ITEM(self->added, i);
916 916 PyObject *node = PyTuple_GET_ITEM(tuple, 7);
917 917
918 918 nt_insert(self, PyString_AS_STRING(node), -1);
919 919 }
920 920
921 921 if (start == 0)
922 922 Py_CLEAR(self->added);
923 923 }
924 924
925 925 /*
926 926 * Delete a numeric range of revs, which must be at the end of the
927 927 * range, but exclude the sentinel nullid entry.
928 928 */
929 929 static int index_slice_del(indexObject *self, PyObject *item)
930 930 {
931 931 Py_ssize_t start, stop, step, slicelength;
932 932 Py_ssize_t length = index_length(self);
933 int ret = 0;
933 934
934 935 if (PySlice_GetIndicesEx((PySliceObject*)item, length,
935 936 &start, &stop, &step, &slicelength) < 0)
936 937 return -1;
937 938
938 939 if (slicelength <= 0)
939 940 return 0;
940 941
941 942 if ((step < 0 && start < stop) || (step > 0 && start > stop))
942 943 stop = start;
943 944
944 945 if (step < 0) {
945 946 stop = start + 1;
946 947 start = stop + step*(slicelength - 1) - 1;
947 948 step = -step;
948 949 }
949 950
950 951 if (step != 1) {
951 952 PyErr_SetString(PyExc_ValueError,
952 953 "revlog index delete requires step size of 1");
953 954 return -1;
954 955 }
955 956
956 957 if (stop != length - 1) {
957 958 PyErr_SetString(PyExc_IndexError,
958 959 "revlog index deletion indices are invalid");
959 960 return -1;
960 961 }
961 962
962 963 if (start < self->length - 1) {
963 964 if (self->nt) {
964 965 Py_ssize_t i;
965 966
966 967 for (i = start + 1; i < self->length - 1; i++) {
967 968 const char *node = index_node(self, i);
968 969
969 970 if (node)
970 971 nt_insert(self, node, -1);
971 972 }
972 973 if (self->added)
973 974 nt_invalidate_added(self, 0);
974 975 if (self->ntrev > start)
975 976 self->ntrev = (int)start;
976 977 }
977 978 self->length = start + 1;
978 return 0;
979 if (start < self->raw_length)
980 self->raw_length = start;
981 goto done;
979 982 }
980 983
981 984 if (self->nt) {
982 985 nt_invalidate_added(self, start - self->length + 1);
983 986 if (self->ntrev > start)
984 987 self->ntrev = (int)start;
985 988 }
986 return self->added
987 ? PyList_SetSlice(self->added, start - self->length + 1,
988 PyList_GET_SIZE(self->added), NULL)
989 : 0;
989 if (self->added)
990 ret = PyList_SetSlice(self->added, start - self->length + 1,
991 PyList_GET_SIZE(self->added), NULL);
992 done:
993 return ret;
990 994 }
991 995
992 996 /*
993 997 * Supported ops:
994 998 *
995 999 * slice deletion
996 1000 * string assignment (extend node->rev mapping)
997 1001 * string deletion (shrink node->rev mapping)
998 1002 */
999 1003 static int index_assign_subscript(indexObject *self, PyObject *item,
1000 1004 PyObject *value)
1001 1005 {
1002 1006 char *node;
1003 1007 Py_ssize_t nodelen;
1004 1008 long rev;
1005 1009
1006 1010 if (PySlice_Check(item) && value == NULL)
1007 1011 return index_slice_del(self, item);
1008 1012
1009 1013 if (node_check(item, &node, &nodelen) == -1)
1010 1014 return -1;
1011 1015
1012 1016 if (value == NULL)
1013 1017 return self->nt ? nt_insert(self, node, -1) : 0;
1014 1018 rev = PyInt_AsLong(value);
1015 1019 if (rev > INT_MAX || rev < 0) {
1016 1020 if (!PyErr_Occurred())
1017 1021 PyErr_SetString(PyExc_ValueError, "rev out of range");
1018 1022 return -1;
1019 1023 }
1020 1024 return nt_insert(self, node, (int)rev);
1021 1025 }
1022 1026
1023 1027 /*
1024 1028 * Find all RevlogNG entries in an index that has inline data. Update
1025 1029 * the optional "offsets" table with those entries.
1026 1030 */
1027 1031 static long inline_scan(indexObject *self, const char **offsets)
1028 1032 {
1029 1033 const char *data = PyString_AS_STRING(self->data);
1030 1034 const char *end = data + PyString_GET_SIZE(self->data);
1031 1035 const long hdrsize = 64;
1032 1036 long incr = hdrsize;
1033 1037 Py_ssize_t len = 0;
1034 1038
1035 1039 while (data + hdrsize <= end) {
1036 1040 uint32_t comp_len;
1037 1041 const char *old_data;
1038 1042 /* 3rd element of header is length of compressed inline data */
1039 1043 comp_len = getbe32(data + 8);
1040 1044 incr = hdrsize + comp_len;
1041 1045 if (incr < hdrsize)
1042 1046 break;
1043 1047 if (offsets)
1044 1048 offsets[len] = data;
1045 1049 len++;
1046 1050 old_data = data;
1047 1051 data += incr;
1048 1052 if (data <= old_data)
1049 1053 break;
1050 1054 }
1051 1055
1052 1056 if (data != end && data + hdrsize != end) {
1053 1057 if (!PyErr_Occurred())
1054 1058 PyErr_SetString(PyExc_ValueError, "corrupt index file");
1055 1059 return -1;
1056 1060 }
1057 1061
1058 1062 return len;
1059 1063 }
1060 1064
1061 1065 static int index_init(indexObject *self, PyObject *args)
1062 1066 {
1063 1067 PyObject *data_obj, *inlined_obj;
1064 1068 Py_ssize_t size;
1065 1069
1066 1070 if (!PyArg_ParseTuple(args, "OO", &data_obj, &inlined_obj))
1067 1071 return -1;
1068 1072 if (!PyString_Check(data_obj)) {
1069 1073 PyErr_SetString(PyExc_TypeError, "data is not a string");
1070 1074 return -1;
1071 1075 }
1072 1076 size = PyString_GET_SIZE(data_obj);
1073 1077
1074 1078 self->inlined = inlined_obj && PyObject_IsTrue(inlined_obj);
1075 1079 self->data = data_obj;
1076 1080 self->cache = NULL;
1077 1081
1078 1082 self->added = NULL;
1079 1083 self->offsets = NULL;
1080 1084 self->nt = NULL;
1081 1085 self->ntlength = self->ntcapacity = 0;
1082 1086 self->ntdepth = self->ntsplits = 0;
1083 1087 self->ntlookups = self->ntmisses = 0;
1084 1088 self->ntrev = -1;
1085 1089 Py_INCREF(self->data);
1086 1090
1087 1091 if (self->inlined) {
1088 1092 long len = inline_scan(self, NULL);
1089 1093 if (len == -1)
1090 1094 goto bail;
1091 1095 self->raw_length = len;
1092 1096 self->length = len + 1;
1093 1097 } else {
1094 1098 if (size % 64) {
1095 1099 PyErr_SetString(PyExc_ValueError, "corrupt index file");
1096 1100 goto bail;
1097 1101 }
1098 1102 self->raw_length = size / 64;
1099 1103 self->length = self->raw_length + 1;
1100 1104 }
1101 1105
1102 1106 return 0;
1103 1107 bail:
1104 1108 return -1;
1105 1109 }
1106 1110
1107 1111 static PyObject *index_nodemap(indexObject *self)
1108 1112 {
1109 1113 Py_INCREF(self);
1110 1114 return (PyObject *)self;
1111 1115 }
1112 1116
1113 1117 static void index_dealloc(indexObject *self)
1114 1118 {
1115 1119 _index_clearcaches(self);
1116 1120 Py_DECREF(self->data);
1117 1121 Py_XDECREF(self->added);
1118 1122 PyObject_Del(self);
1119 1123 }
1120 1124
1121 1125 static PySequenceMethods index_sequence_methods = {
1122 1126 (lenfunc)index_length, /* sq_length */
1123 1127 0, /* sq_concat */
1124 1128 0, /* sq_repeat */
1125 1129 (ssizeargfunc)index_get, /* sq_item */
1126 1130 0, /* sq_slice */
1127 1131 0, /* sq_ass_item */
1128 1132 0, /* sq_ass_slice */
1129 1133 (objobjproc)index_contains, /* sq_contains */
1130 1134 };
1131 1135
1132 1136 static PyMappingMethods index_mapping_methods = {
1133 1137 (lenfunc)index_length, /* mp_length */
1134 1138 (binaryfunc)index_getitem, /* mp_subscript */
1135 1139 (objobjargproc)index_assign_subscript, /* mp_ass_subscript */
1136 1140 };
1137 1141
1138 1142 static PyMethodDef index_methods[] = {
1139 1143 {"clearcaches", (PyCFunction)index_clearcaches, METH_NOARGS,
1140 1144 "clear the index caches"},
1141 1145 {"get", (PyCFunction)index_m_get, METH_VARARGS,
1142 1146 "get an index entry"},
1143 1147 {"insert", (PyCFunction)index_insert, METH_VARARGS,
1144 1148 "insert an index entry"},
1145 1149 {"partialmatch", (PyCFunction)index_partialmatch, METH_VARARGS,
1146 1150 "match a potentially ambiguous node ID"},
1147 1151 {"stats", (PyCFunction)index_stats, METH_NOARGS,
1148 1152 "stats for the index"},
1149 1153 {NULL} /* Sentinel */
1150 1154 };
1151 1155
1152 1156 static PyGetSetDef index_getset[] = {
1153 1157 {"nodemap", (getter)index_nodemap, NULL, "nodemap", NULL},
1154 1158 {NULL} /* Sentinel */
1155 1159 };
1156 1160
1157 1161 static PyTypeObject indexType = {
1158 1162 PyObject_HEAD_INIT(NULL)
1159 1163 0, /* ob_size */
1160 1164 "parsers.index", /* tp_name */
1161 1165 sizeof(indexObject), /* tp_basicsize */
1162 1166 0, /* tp_itemsize */
1163 1167 (destructor)index_dealloc, /* tp_dealloc */
1164 1168 0, /* tp_print */
1165 1169 0, /* tp_getattr */
1166 1170 0, /* tp_setattr */
1167 1171 0, /* tp_compare */
1168 1172 0, /* tp_repr */
1169 1173 0, /* tp_as_number */
1170 1174 &index_sequence_methods, /* tp_as_sequence */
1171 1175 &index_mapping_methods, /* tp_as_mapping */
1172 1176 0, /* tp_hash */
1173 1177 0, /* tp_call */
1174 1178 0, /* tp_str */
1175 1179 0, /* tp_getattro */
1176 1180 0, /* tp_setattro */
1177 1181 0, /* tp_as_buffer */
1178 1182 Py_TPFLAGS_DEFAULT, /* tp_flags */
1179 1183 "revlog index", /* tp_doc */
1180 1184 0, /* tp_traverse */
1181 1185 0, /* tp_clear */
1182 1186 0, /* tp_richcompare */
1183 1187 0, /* tp_weaklistoffset */
1184 1188 0, /* tp_iter */
1185 1189 0, /* tp_iternext */
1186 1190 index_methods, /* tp_methods */
1187 1191 0, /* tp_members */
1188 1192 index_getset, /* tp_getset */
1189 1193 0, /* tp_base */
1190 1194 0, /* tp_dict */
1191 1195 0, /* tp_descr_get */
1192 1196 0, /* tp_descr_set */
1193 1197 0, /* tp_dictoffset */
1194 1198 (initproc)index_init, /* tp_init */
1195 1199 0, /* tp_alloc */
1196 1200 };
1197 1201
1198 1202 /*
1199 1203 * returns a tuple of the form (index, index, cache) with elements as
1200 1204 * follows:
1201 1205 *
1202 1206 * index: an index object that lazily parses RevlogNG records
1203 1207 * cache: if data is inlined, a tuple (index_file_content, 0), else None
1204 1208 *
1205 1209 * added complications are for backwards compatibility
1206 1210 */
1207 1211 static PyObject *parse_index2(PyObject *self, PyObject *args)
1208 1212 {
1209 1213 PyObject *tuple = NULL, *cache = NULL;
1210 1214 indexObject *idx;
1211 1215 int ret;
1212 1216
1213 1217 idx = PyObject_New(indexObject, &indexType);
1214 1218 if (idx == NULL)
1215 1219 goto bail;
1216 1220
1217 1221 ret = index_init(idx, args);
1218 1222 if (ret == -1)
1219 1223 goto bail;
1220 1224
1221 1225 if (idx->inlined) {
1222 1226 cache = Py_BuildValue("iO", 0, idx->data);
1223 1227 if (cache == NULL)
1224 1228 goto bail;
1225 1229 } else {
1226 1230 cache = Py_None;
1227 1231 Py_INCREF(cache);
1228 1232 }
1229 1233
1230 1234 tuple = Py_BuildValue("NN", idx, cache);
1231 1235 if (!tuple)
1232 1236 goto bail;
1233 1237 return tuple;
1234 1238
1235 1239 bail:
1236 1240 Py_XDECREF(idx);
1237 1241 Py_XDECREF(cache);
1238 1242 Py_XDECREF(tuple);
1239 1243 return NULL;
1240 1244 }
1241 1245
1242 1246 static char parsers_doc[] = "Efficient content parsing.";
1243 1247
1244 1248 static PyMethodDef methods[] = {
1245 1249 {"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"},
1246 1250 {"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"},
1247 1251 {"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
1248 1252 {NULL, NULL}
1249 1253 };
1250 1254
1251 1255 static void module_init(PyObject *mod)
1252 1256 {
1253 1257 indexType.tp_new = PyType_GenericNew;
1254 1258 if (PyType_Ready(&indexType) < 0)
1255 1259 return;
1256 1260 Py_INCREF(&indexType);
1257 1261
1258 1262 PyModule_AddObject(mod, "index", (PyObject *)&indexType);
1259 1263
1260 1264 nullentry = Py_BuildValue("iiiiiiis#", 0, 0, 0,
1261 1265 -1, -1, -1, -1, nullid, 20);
1262 1266 if (nullentry)
1263 1267 PyObject_GC_UnTrack(nullentry);
1264 1268 }
1265 1269
1266 1270 #ifdef IS_PY3K
1267 1271 static struct PyModuleDef parsers_module = {
1268 1272 PyModuleDef_HEAD_INIT,
1269 1273 "parsers",
1270 1274 parsers_doc,
1271 1275 -1,
1272 1276 methods
1273 1277 };
1274 1278
1275 1279 PyMODINIT_FUNC PyInit_parsers(void)
1276 1280 {
1277 1281 PyObject *mod = PyModule_Create(&parsers_module);
1278 1282 module_init(mod);
1279 1283 return mod;
1280 1284 }
1281 1285 #else
1282 1286 PyMODINIT_FUNC initparsers(void)
1283 1287 {
1284 1288 PyObject *mod = Py_InitModule3("parsers", methods, parsers_doc);
1285 1289 module_init(mod);
1286 1290 }
1287 1291 #endif
General Comments 0
You need to be logged in to leave comments. Login now