parsers.c
433 lines
| 10.0 KiB
| text/x-c
|
CLexer
/ mercurial / parsers.c
Bryan O'Sullivan
|
r6389 | /* | ||
parsers.c - efficient content parsing | ||||
Copyright 2008 Matt Mackall <mpm@selenic.com> and others | ||||
This software may be used and distributed according to the terms of | ||||
the GNU General Public License, incorporated herein by reference. | ||||
*/ | ||||
#include <Python.h> | ||||
#include <ctype.h> | ||||
#include <string.h> | ||||
static int hexdigit(char c) | ||||
{ | ||||
if (c >= '0' && c <= '9') | ||||
return c - '0'; | ||||
Matt Mackall
|
r7092 | if (c >= 'a' && c <= 'f') | ||
return c - 'a' + 10; | ||||
Bryan O'Sullivan
|
r6389 | if (c >= 'A' && c <= 'F') | ||
return c - 'A' + 10; | ||||
Matt Mackall
|
r7092 | PyErr_SetString(PyExc_ValueError, "input contains non-hex character"); | ||
return 0; | ||||
Bryan O'Sullivan
|
r6389 | } | ||
/* | ||||
* Turn a hex-encoded string into binary. | ||||
*/ | ||||
static PyObject *unhexlify(const char *str, int len) | ||||
{ | ||||
Matt Mackall
|
r7092 | PyObject *ret; | ||
Benoit Boissinot
|
r6395 | const char *c; | ||
char *d; | ||||
Bryan O'Sullivan
|
r6389 | |||
ret = PyString_FromStringAndSize(NULL, len / 2); | ||||
if (!ret) | ||||
Matt Mackall
|
r7092 | return NULL; | ||
Bryan O'Sullivan
|
r6389 | |||
Matt Mackall
|
r7092 | d = PyString_AS_STRING(ret); | ||
Bryan O'Sullivan
|
r6389 | for (c = str; c < str + len;) { | ||
int hi = hexdigit(*c++); | ||||
int lo = hexdigit(*c++); | ||||
*d++ = (hi << 4) | lo; | ||||
} | ||||
Matt Mackall
|
r7091 | |||
Bryan O'Sullivan
|
r6389 | return ret; | ||
} | ||||
/* | ||||
* This code assumes that a manifest is stitched together with newline | ||||
* ('\n') characters. | ||||
*/ | ||||
static PyObject *parse_manifest(PyObject *self, PyObject *args) | ||||
{ | ||||
PyObject *mfdict, *fdict; | ||||
char *str, *cur, *start, *zero; | ||||
int len; | ||||
if (!PyArg_ParseTuple(args, "O!O!s#:parse_manifest", | ||||
&PyDict_Type, &mfdict, | ||||
&PyDict_Type, &fdict, | ||||
&str, &len)) | ||||
goto quit; | ||||
for (start = cur = str, zero = NULL; cur < str + len; cur++) { | ||||
PyObject *file = NULL, *node = NULL; | ||||
PyObject *flags = NULL; | ||||
int nlen; | ||||
if (!*cur) { | ||||
zero = cur; | ||||
continue; | ||||
} | ||||
else if (*cur != '\n') | ||||
continue; | ||||
if (!zero) { | ||||
PyErr_SetString(PyExc_ValueError, | ||||
"manifest entry has no separator"); | ||||
goto quit; | ||||
} | ||||
file = PyString_FromStringAndSize(start, zero - start); | ||||
if (!file) | ||||
goto bail; | ||||
nlen = cur - zero - 1; | ||||
node = unhexlify(zero + 1, nlen > 40 ? 40 : nlen); | ||||
if (!node) | ||||
goto bail; | ||||
if (nlen > 40) { | ||||
flags = PyString_FromStringAndSize(zero + 41, | ||||
nlen - 40); | ||||
if (!flags) | ||||
goto bail; | ||||
if (PyDict_SetItem(fdict, file, flags) == -1) | ||||
goto bail; | ||||
} | ||||
if (PyDict_SetItem(mfdict, file, node) == -1) | ||||
goto bail; | ||||
start = cur + 1; | ||||
zero = NULL; | ||||
Py_XDECREF(flags); | ||||
Py_XDECREF(node); | ||||
Py_XDECREF(file); | ||||
continue; | ||||
bail: | ||||
Py_XDECREF(flags); | ||||
Py_XDECREF(node); | ||||
Py_XDECREF(file); | ||||
goto quit; | ||||
} | ||||
if (len > 0 && *(cur - 1) != '\n') { | ||||
PyErr_SetString(PyExc_ValueError, | ||||
"manifest contains trailing garbage"); | ||||
goto quit; | ||||
} | ||||
Py_INCREF(Py_None); | ||||
return Py_None; | ||||
quit: | ||||
return NULL; | ||||
} | ||||
Matt Mackall
|
r7093 | #ifdef _WIN32 | ||
Matt Mackall
|
r10282 | #ifdef _MSC_VER | ||
Matt Mackall
|
r7093 | /* msvc 6.0 has problems */ | ||
Matt Mackall
|
r10282 | #define inline __inline | ||
Matt Mackall
|
r7093 | typedef unsigned long uint32_t; | ||
Dhruva Krishnamurthy
|
r7122 | typedef unsigned __int64 uint64_t; | ||
Matt Mackall
|
r10282 | #else | ||
#include <stdint.h> | ||||
#endif | ||||
Matt Mackall
|
r7093 | static uint32_t ntohl(uint32_t x) | ||
{ | ||||
return ((x & 0x000000ffUL) << 24) | | ||||
Thomas Arendsen Hein
|
r7190 | ((x & 0x0000ff00UL) << 8) | | ||
((x & 0x00ff0000UL) >> 8) | | ||||
((x & 0xff000000UL) >> 24); | ||||
Matt Mackall
|
r7093 | } | ||
#else | ||||
/* not windows */ | ||||
Matt Mackall
|
r10282 | #include <sys/types.h> | ||
#if defined __BEOS__ && !defined __HAIKU__ | ||||
#include <ByteOrder.h> | ||||
#else | ||||
#include <arpa/inet.h> | ||||
#endif | ||||
#include <inttypes.h> | ||||
Matt Mackall
|
r7093 | #endif | ||
static PyObject *parse_dirstate(PyObject *self, PyObject *args) | ||||
{ | ||||
PyObject *dmap, *cmap, *parents = NULL, *ret = NULL; | ||||
PyObject *fname = NULL, *cname = NULL, *entry = NULL; | ||||
char *str, *cur, *end, *cpos; | ||||
Benoit Boissinot
|
r7174 | int state, mode, size, mtime; | ||
unsigned int flen; | ||||
Matt Mackall
|
r7093 | int len; | ||
char decode[16]; /* for alignment */ | ||||
if (!PyArg_ParseTuple(args, "O!O!s#:parse_dirstate", | ||||
&PyDict_Type, &dmap, | ||||
&PyDict_Type, &cmap, | ||||
&str, &len)) | ||||
goto quit; | ||||
/* read parents */ | ||||
if (len < 40) | ||||
goto quit; | ||||
parents = Py_BuildValue("s#s#", str, 20, str + 20, 20); | ||||
if (!parents) | ||||
goto quit; | ||||
/* read filenames */ | ||||
cur = str + 40; | ||||
end = str + len; | ||||
while (cur < end - 17) { | ||||
/* unpack header */ | ||||
state = *cur; | ||||
memcpy(decode, cur + 1, 16); | ||||
mode = ntohl(*(uint32_t *)(decode)); | ||||
size = ntohl(*(uint32_t *)(decode + 4)); | ||||
mtime = ntohl(*(uint32_t *)(decode + 8)); | ||||
flen = ntohl(*(uint32_t *)(decode + 12)); | ||||
cur += 17; | ||||
Benoit Boissinot
|
r7174 | if (flen > end - cur) { | ||
PyErr_SetString(PyExc_ValueError, "overflow in dirstate"); | ||||
Matt Mackall
|
r7093 | goto quit; | ||
Benoit Boissinot
|
r7174 | } | ||
Matt Mackall
|
r7093 | |||
entry = Py_BuildValue("ciii", state, mode, size, mtime); | ||||
if (!entry) | ||||
goto quit; | ||||
Benoit Boissinot
|
r7175 | PyObject_GC_UnTrack(entry); /* don't waste time with this */ | ||
Matt Mackall
|
r7093 | |||
cpos = memchr(cur, 0, flen); | ||||
if (cpos) { | ||||
fname = PyString_FromStringAndSize(cur, cpos - cur); | ||||
cname = PyString_FromStringAndSize(cpos + 1, | ||||
flen - (cpos - cur) - 1); | ||||
if (!fname || !cname || | ||||
PyDict_SetItem(cmap, fname, cname) == -1 || | ||||
PyDict_SetItem(dmap, fname, entry) == -1) | ||||
goto quit; | ||||
Py_DECREF(cname); | ||||
} else { | ||||
fname = PyString_FromStringAndSize(cur, flen); | ||||
if (!fname || | ||||
PyDict_SetItem(dmap, fname, entry) == -1) | ||||
goto quit; | ||||
} | ||||
cur += flen; | ||||
Py_DECREF(fname); | ||||
Py_DECREF(entry); | ||||
fname = cname = entry = NULL; | ||||
} | ||||
ret = parents; | ||||
Py_INCREF(ret); | ||||
quit: | ||||
Py_XDECREF(fname); | ||||
Py_XDECREF(cname); | ||||
Py_XDECREF(entry); | ||||
Py_XDECREF(parents); | ||||
return ret; | ||||
} | ||||
Bernhard Leiner
|
r7108 | const char nullid[20]; | ||
const int nullrev = -1; | ||||
Benoit Boissinot
|
r7154 | /* create an index tuple, insert into the nodemap */ | ||
static PyObject * _build_idx_entry(PyObject *nodemap, int n, uint64_t offset_flags, | ||||
int comp_len, int uncomp_len, int base_rev, | ||||
int link_rev, int parent_1, int parent_2, | ||||
const char *c_node_id) | ||||
{ | ||||
int err; | ||||
PyObject *entry, *node_id, *n_obj; | ||||
node_id = PyString_FromStringAndSize(c_node_id, 20); | ||||
n_obj = PyInt_FromLong(n); | ||||
if (!node_id || !n_obj) | ||||
err = -1; | ||||
else | ||||
err = PyDict_SetItem(nodemap, node_id, n_obj); | ||||
Py_XDECREF(n_obj); | ||||
if (err) | ||||
goto error_dealloc; | ||||
entry = Py_BuildValue("LiiiiiiN", offset_flags, comp_len, | ||||
uncomp_len, base_rev, link_rev, | ||||
parent_1, parent_2, node_id); | ||||
if (!entry) | ||||
goto error_dealloc; | ||||
PyObject_GC_UnTrack(entry); /* don't waste time with this */ | ||||
return entry; | ||||
error_dealloc: | ||||
Py_XDECREF(node_id); | ||||
return NULL; | ||||
} | ||||
Bernhard Leiner
|
r7108 | /* RevlogNG format (all in big endian, data may be inlined): | ||
* 6 bytes: offset | ||||
* 2 bytes: flags | ||||
* 4 bytes: compressed length | ||||
* 4 bytes: uncompressed length | ||||
* 4 bytes: base revision | ||||
* 4 bytes: link revision | ||||
* 4 bytes: parent 1 revision | ||||
* 4 bytes: parent 2 revision | ||||
* 32 bytes: nodeid (only 20 bytes used) | ||||
*/ | ||||
static int _parse_index_ng (const char *data, int size, int inlined, | ||||
PyObject *index, PyObject *nodemap) | ||||
{ | ||||
Benoit Boissinot
|
r7154 | PyObject *entry; | ||
int n = 0, err; | ||||
uint64_t offset_flags; | ||||
Bernhard Leiner
|
r7108 | int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2; | ||
Benoit Boissinot
|
r7154 | const char *c_node_id; | ||
Bernhard Leiner
|
r7108 | const char *end = data + size; | ||
Thomas Arendsen Hein
|
r7169 | char decode[64]; /* to enforce alignment with inline data */ | ||
Bernhard Leiner
|
r7108 | |||
while (data < end) { | ||||
Benoit Boissinot
|
r7174 | unsigned int step; | ||
Thomas Arendsen Hein
|
r7190 | |||
Thomas Arendsen Hein
|
r7169 | memcpy(decode, data, 64); | ||
Dirkjan Ochtman
|
r7186 | offset_flags = ntohl(*((uint32_t *) (decode + 4))); | ||
if (n == 0) /* mask out version number for the first entry */ | ||||
offset_flags &= 0xFFFF; | ||||
else { | ||||
Matt Mackall
|
r10282 | uint32_t offset_high = ntohl(*((uint32_t *)decode)); | ||
offset_flags |= ((uint64_t)offset_high) << 32; | ||||
Benoit Boissinot
|
r7133 | } | ||
Matt Mackall
|
r10282 | comp_len = ntohl(*((uint32_t *)(decode + 8))); | ||
uncomp_len = ntohl(*((uint32_t *)(decode + 12))); | ||||
base_rev = ntohl(*((uint32_t *)(decode + 16))); | ||||
link_rev = ntohl(*((uint32_t *)(decode + 20))); | ||||
parent_1 = ntohl(*((uint32_t *)(decode + 24))); | ||||
parent_2 = ntohl(*((uint32_t *)(decode + 28))); | ||||
Thomas Arendsen Hein
|
r7169 | c_node_id = decode + 32; | ||
Benoit Boissinot
|
r7154 | |||
entry = _build_idx_entry(nodemap, n, offset_flags, | ||||
comp_len, uncomp_len, base_rev, | ||||
link_rev, parent_1, parent_2, | ||||
c_node_id); | ||||
Bernhard Leiner
|
r7108 | if (!entry) | ||
Benoit Boissinot
|
r7154 | return 0; | ||
Bernhard Leiner
|
r7108 | |||
if (inlined) { | ||||
Benoit Boissinot
|
r7154 | err = PyList_Append(index, entry); | ||
Bernhard Leiner
|
r7108 | Py_DECREF(entry); | ||
Benoit Boissinot
|
r7154 | if (err) | ||
return 0; | ||||
} else | ||||
Bernhard Leiner
|
r7108 | PyList_SET_ITEM(index, n, entry); /* steals reference */ | ||
n++; | ||||
Benoit Boissinot
|
r7174 | step = 64 + (inlined ? comp_len : 0); | ||
if (end - data < step) | ||||
break; | ||||
data += step; | ||||
Bernhard Leiner
|
r7108 | } | ||
Benoit Boissinot
|
r7174 | if (data != end) { | ||
Bernhard Leiner
|
r7108 | if (!PyErr_Occurred()) | ||
PyErr_SetString(PyExc_ValueError, "corrupt index file"); | ||||
Benoit Boissinot
|
r7154 | return 0; | ||
Bernhard Leiner
|
r7108 | } | ||
Thomas Arendsen Hein
|
r7190 | /* create the nullid/nullrev entry in the nodemap and the | ||
Bernhard Leiner
|
r7108 | * magic nullid entry in the index at [-1] */ | ||
Thomas Arendsen Hein
|
r7190 | entry = _build_idx_entry(nodemap, | ||
Benoit Boissinot
|
r7154 | nullrev, 0, 0, 0, -1, -1, -1, -1, nullid); | ||
Bernhard Leiner
|
r7108 | if (!entry) | ||
Benoit Boissinot
|
r7154 | return 0; | ||
Bernhard Leiner
|
r7108 | if (inlined) { | ||
Benoit Boissinot
|
r7154 | err = PyList_Append(index, entry); | ||
Bernhard Leiner
|
r7108 | Py_DECREF(entry); | ||
Benoit Boissinot
|
r7154 | if (err) | ||
return 0; | ||||
} else | ||||
Bernhard Leiner
|
r7108 | PyList_SET_ITEM(index, n, entry); /* steals reference */ | ||
return 1; | ||||
} | ||||
/* This function parses a index file and returns a Python tuple of the | ||||
* following format: (index, nodemap, cache) | ||||
* | ||||
* index: a list of tuples containing the RevlogNG records | ||||
* nodemap: a dict mapping node ids to indices in the index list | ||||
* cache: if data is inlined, a tuple (index_file_content, 0) else None | ||||
*/ | ||||
static PyObject *parse_index(PyObject *self, PyObject *args) | ||||
{ | ||||
const char *data; | ||||
int size, inlined; | ||||
Thomas Arendsen Hein
|
r7190 | PyObject *rval = NULL, *index = NULL, *nodemap = NULL, *cache = NULL; | ||
Bernhard Leiner
|
r7108 | PyObject *data_obj = NULL, *inlined_obj; | ||
if (!PyArg_ParseTuple(args, "s#O", &data, &size, &inlined_obj)) | ||||
return NULL; | ||||
inlined = inlined_obj && PyObject_IsTrue(inlined_obj); | ||||
Thomas Arendsen Hein
|
r7190 | /* If no data is inlined, we know the size of the index list in | ||
* advance: size divided by size of one one revlog record (64 bytes) | ||||
* plus one for the nullid */ | ||||
Bernhard Leiner
|
r7108 | index = inlined ? PyList_New(0) : PyList_New(size / 64 + 1); | ||
if (!index) | ||||
goto quit; | ||||
nodemap = PyDict_New(); | ||||
Benoit Boissinot
|
r7154 | if (!nodemap) | ||
goto quit; | ||||
Bernhard Leiner
|
r7108 | |||
/* set up the cache return value */ | ||||
if (inlined) { | ||||
/* Note that the reference to data_obj is only borrowed */ | ||||
data_obj = PyTuple_GET_ITEM(args, 0); | ||||
cache = Py_BuildValue("iO", 0, data_obj); | ||||
if (!cache) | ||||
goto quit; | ||||
} else { | ||||
cache = Py_None; | ||||
Py_INCREF(Py_None); | ||||
} | ||||
/* actually populate the index and the nodemap with data */ | ||||
if (!_parse_index_ng (data, size, inlined, index, nodemap)) | ||||
goto quit; | ||||
rval = Py_BuildValue("NNN", index, nodemap, cache); | ||||
if (!rval) | ||||
goto quit; | ||||
return rval; | ||||
quit: | ||||
Py_XDECREF(index); | ||||
Py_XDECREF(nodemap); | ||||
Py_XDECREF(cache); | ||||
Py_XDECREF(rval); | ||||
return NULL; | ||||
} | ||||
Bryan O'Sullivan
|
r6389 | static char parsers_doc[] = "Efficient content parsing."; | ||
static PyMethodDef methods[] = { | ||||
{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"}, | ||||
Matt Mackall
|
r7093 | {"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"}, | ||
Bernhard Leiner
|
r7108 | {"parse_index", parse_index, METH_VARARGS, "parse a revlog index\n"}, | ||
Bryan O'Sullivan
|
r6389 | {NULL, NULL} | ||
}; | ||||
PyMODINIT_FUNC initparsers(void) | ||||
{ | ||||
Py_InitModule3("parsers", methods, parsers_doc); | ||||
} | ||||