parsers.c
416 lines
| 9.7 KiB
| text/x-c
|
CLexer
/ mercurial / parsers.c
Bryan O'Sullivan
|
r6389 | /* | ||
parsers.c - efficient content parsing | ||||
Copyright 2008 Matt Mackall <mpm@selenic.com> and others | ||||
This software may be used and distributed according to the terms of | ||||
the GNU General Public License, incorporated herein by reference. | ||||
*/ | ||||
#include <Python.h> | ||||
#include <ctype.h> | ||||
#include <string.h> | ||||
static int hexdigit(char c) | ||||
{ | ||||
if (c >= '0' && c <= '9') | ||||
return c - '0'; | ||||
Matt Mackall
|
r7092 | if (c >= 'a' && c <= 'f') | ||
return c - 'a' + 10; | ||||
Bryan O'Sullivan
|
r6389 | if (c >= 'A' && c <= 'F') | ||
return c - 'A' + 10; | ||||
Matt Mackall
|
r7092 | PyErr_SetString(PyExc_ValueError, "input contains non-hex character"); | ||
return 0; | ||||
Bryan O'Sullivan
|
r6389 | } | ||
/* | ||||
* Turn a hex-encoded string into binary. | ||||
*/ | ||||
static PyObject *unhexlify(const char *str, int len) | ||||
{ | ||||
Matt Mackall
|
r7092 | PyObject *ret; | ||
Benoit Boissinot
|
r6395 | const char *c; | ||
char *d; | ||||
Bryan O'Sullivan
|
r6389 | |||
ret = PyString_FromStringAndSize(NULL, len / 2); | ||||
if (!ret) | ||||
Matt Mackall
|
r7092 | return NULL; | ||
Bryan O'Sullivan
|
r6389 | |||
Matt Mackall
|
r7092 | d = PyString_AS_STRING(ret); | ||
Bryan O'Sullivan
|
r6389 | for (c = str; c < str + len;) { | ||
int hi = hexdigit(*c++); | ||||
int lo = hexdigit(*c++); | ||||
*d++ = (hi << 4) | lo; | ||||
} | ||||
Matt Mackall
|
r7091 | |||
Bryan O'Sullivan
|
r6389 | return ret; | ||
} | ||||
/* | ||||
* This code assumes that a manifest is stitched together with newline | ||||
* ('\n') characters. | ||||
*/ | ||||
static PyObject *parse_manifest(PyObject *self, PyObject *args) | ||||
{ | ||||
PyObject *mfdict, *fdict; | ||||
char *str, *cur, *start, *zero; | ||||
int len; | ||||
if (!PyArg_ParseTuple(args, "O!O!s#:parse_manifest", | ||||
&PyDict_Type, &mfdict, | ||||
&PyDict_Type, &fdict, | ||||
&str, &len)) | ||||
goto quit; | ||||
for (start = cur = str, zero = NULL; cur < str + len; cur++) { | ||||
PyObject *file = NULL, *node = NULL; | ||||
PyObject *flags = NULL; | ||||
int nlen; | ||||
if (!*cur) { | ||||
zero = cur; | ||||
continue; | ||||
} | ||||
else if (*cur != '\n') | ||||
continue; | ||||
if (!zero) { | ||||
PyErr_SetString(PyExc_ValueError, | ||||
"manifest entry has no separator"); | ||||
goto quit; | ||||
} | ||||
file = PyString_FromStringAndSize(start, zero - start); | ||||
if (!file) | ||||
goto bail; | ||||
nlen = cur - zero - 1; | ||||
node = unhexlify(zero + 1, nlen > 40 ? 40 : nlen); | ||||
if (!node) | ||||
goto bail; | ||||
if (nlen > 40) { | ||||
PyObject *flags; | ||||
flags = PyString_FromStringAndSize(zero + 41, | ||||
nlen - 40); | ||||
if (!flags) | ||||
goto bail; | ||||
if (PyDict_SetItem(fdict, file, flags) == -1) | ||||
goto bail; | ||||
} | ||||
if (PyDict_SetItem(mfdict, file, node) == -1) | ||||
goto bail; | ||||
start = cur + 1; | ||||
zero = NULL; | ||||
Py_XDECREF(flags); | ||||
Py_XDECREF(node); | ||||
Py_XDECREF(file); | ||||
continue; | ||||
bail: | ||||
Py_XDECREF(flags); | ||||
Py_XDECREF(node); | ||||
Py_XDECREF(file); | ||||
goto quit; | ||||
} | ||||
if (len > 0 && *(cur - 1) != '\n') { | ||||
PyErr_SetString(PyExc_ValueError, | ||||
"manifest contains trailing garbage"); | ||||
goto quit; | ||||
} | ||||
Py_INCREF(Py_None); | ||||
return Py_None; | ||||
quit: | ||||
return NULL; | ||||
} | ||||
Matt Mackall
|
r7093 | #ifdef _WIN32 | ||
# ifdef _MSC_VER | ||||
/* msvc 6.0 has problems */ | ||||
# define inline __inline | ||||
typedef unsigned long uint32_t; | ||||
Dhruva Krishnamurthy
|
r7122 | typedef unsigned __int64 uint64_t; | ||
Matt Mackall
|
r7093 | # else | ||
# include <stdint.h> | ||||
# endif | ||||
static uint32_t ntohl(uint32_t x) | ||||
{ | ||||
return ((x & 0x000000ffUL) << 24) | | ||||
((x & 0x0000ff00UL) << 8) | | ||||
((x & 0x00ff0000UL) >> 8) | | ||||
((x & 0xff000000UL) >> 24); | ||||
} | ||||
#else | ||||
/* not windows */ | ||||
# include <sys/types.h> | ||||
# if defined __BEOS__ && !defined __HAIKU__ | ||||
# include <ByteOrder.h> | ||||
# else | ||||
# include <arpa/inet.h> | ||||
# endif | ||||
# include <inttypes.h> | ||||
#endif | ||||
static PyObject *parse_dirstate(PyObject *self, PyObject *args) | ||||
{ | ||||
PyObject *dmap, *cmap, *parents = NULL, *ret = NULL; | ||||
PyObject *fname = NULL, *cname = NULL, *entry = NULL; | ||||
char *str, *cur, *end, *cpos; | ||||
int state, mode, size, mtime, flen; | ||||
int len; | ||||
char decode[16]; /* for alignment */ | ||||
if (!PyArg_ParseTuple(args, "O!O!s#:parse_dirstate", | ||||
&PyDict_Type, &dmap, | ||||
&PyDict_Type, &cmap, | ||||
&str, &len)) | ||||
goto quit; | ||||
/* read parents */ | ||||
if (len < 40) | ||||
goto quit; | ||||
parents = Py_BuildValue("s#s#", str, 20, str + 20, 20); | ||||
if (!parents) | ||||
goto quit; | ||||
/* read filenames */ | ||||
cur = str + 40; | ||||
end = str + len; | ||||
while (cur < end - 17) { | ||||
/* unpack header */ | ||||
state = *cur; | ||||
memcpy(decode, cur + 1, 16); | ||||
mode = ntohl(*(uint32_t *)(decode)); | ||||
size = ntohl(*(uint32_t *)(decode + 4)); | ||||
mtime = ntohl(*(uint32_t *)(decode + 8)); | ||||
flen = ntohl(*(uint32_t *)(decode + 12)); | ||||
cur += 17; | ||||
if (cur + flen > end) | ||||
goto quit; | ||||
entry = Py_BuildValue("ciii", state, mode, size, mtime); | ||||
PyObject_GC_UnTrack(entry); /* don't waste time with this */ | ||||
if (!entry) | ||||
goto quit; | ||||
cpos = memchr(cur, 0, flen); | ||||
if (cpos) { | ||||
fname = PyString_FromStringAndSize(cur, cpos - cur); | ||||
cname = PyString_FromStringAndSize(cpos + 1, | ||||
flen - (cpos - cur) - 1); | ||||
if (!fname || !cname || | ||||
PyDict_SetItem(cmap, fname, cname) == -1 || | ||||
PyDict_SetItem(dmap, fname, entry) == -1) | ||||
goto quit; | ||||
Py_DECREF(cname); | ||||
} else { | ||||
fname = PyString_FromStringAndSize(cur, flen); | ||||
if (!fname || | ||||
PyDict_SetItem(dmap, fname, entry) == -1) | ||||
goto quit; | ||||
} | ||||
cur += flen; | ||||
Py_DECREF(fname); | ||||
Py_DECREF(entry); | ||||
fname = cname = entry = NULL; | ||||
} | ||||
ret = parents; | ||||
Py_INCREF(ret); | ||||
quit: | ||||
Py_XDECREF(fname); | ||||
Py_XDECREF(cname); | ||||
Py_XDECREF(entry); | ||||
Py_XDECREF(parents); | ||||
return ret; | ||||
} | ||||
Bernhard Leiner
|
r7108 | |||
static inline uint64_t ntohll(uint64_t x) | ||||
{ | ||||
return (((uint64_t)ntohl((uint32_t)x)) << 32) | | ||||
(uint64_t)ntohl((uint32_t)(x >> 32)); | ||||
} | ||||
const char nullid[20]; | ||||
const int nullrev = -1; | ||||
/* RevlogNG format (all in big endian, data may be inlined): | ||||
* 6 bytes: offset | ||||
* 2 bytes: flags | ||||
* 4 bytes: compressed length | ||||
* 4 bytes: uncompressed length | ||||
* 4 bytes: base revision | ||||
* 4 bytes: link revision | ||||
* 4 bytes: parent 1 revision | ||||
* 4 bytes: parent 2 revision | ||||
* 32 bytes: nodeid (only 20 bytes used) | ||||
*/ | ||||
static int _parse_index_ng (const char *data, int size, int inlined, | ||||
PyObject *index, PyObject *nodemap) | ||||
{ | ||||
PyObject *entry = NULL, *node_id = NULL, *n_obj = NULL; | ||||
PyObject *nullrev_obj = NULL, *nullid_obj = NULL; | ||||
int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2; | ||||
uint64_t offset_flags; | ||||
int n = 0; | ||||
const char *end = data + size; | ||||
while (data < end) { | ||||
offset_flags = ntohll(*((uint64_t *) data)); | ||||
if (n == 0) /* mask out version number for the first entry */ | ||||
offset_flags &= 0xFFFF; | ||||
comp_len = ntohl(*((uint32_t *) (data + 8))); | ||||
uncomp_len = ntohl(*((uint32_t *) (data + 12))); | ||||
base_rev = ntohl(*((uint32_t *) (data + 16))); | ||||
link_rev = ntohl(*((uint32_t *) (data + 20))); | ||||
parent_1 = ntohl(*((uint32_t *) (data + 24))); | ||||
parent_2 = ntohl(*((uint32_t *) (data + 28))); | ||||
node_id = PyString_FromStringAndSize(data + 32, 20); | ||||
n_obj = PyInt_FromLong(n); | ||||
if (!node_id || !n_obj || | ||||
PyDict_SetItem(nodemap, node_id, n_obj) != 0) | ||||
goto quit; | ||||
Py_DECREF(n_obj); | ||||
entry = Py_BuildValue("LiiiiiiN", offset_flags, comp_len, | ||||
uncomp_len, base_rev, link_rev, | ||||
parent_1, parent_2, node_id); | ||||
PyObject_GC_UnTrack(entry); /* don't waste time with this */ | ||||
if (!entry) | ||||
goto quit; | ||||
/* append to or set value in the index list */ | ||||
if (inlined) { | ||||
if (PyList_Append(index, entry) != 0) | ||||
goto quit; | ||||
Py_DECREF(entry); | ||||
} else { | ||||
PyList_SET_ITEM(index, n, entry); /* steals reference */ | ||||
} | ||||
data += 64 + (inlined ? comp_len : 0); | ||||
n++; | ||||
} | ||||
if (data > end) { | ||||
if (!PyErr_Occurred()) | ||||
PyErr_SetString(PyExc_ValueError, "corrupt index file"); | ||||
goto quit; | ||||
} | ||||
/* create the nullid/nullrev entry in the nodemap and the | ||||
* magic nullid entry in the index at [-1] */ | ||||
nullid_obj = PyString_FromStringAndSize(nullid, 20); | ||||
nullrev_obj = PyInt_FromLong(nullrev); | ||||
if (!nodemap || !nullid_obj || !nullrev_obj || | ||||
PyDict_SetItem(nodemap, nullid_obj, nullrev_obj) != 0) | ||||
goto quit; | ||||
Py_DECREF(nullrev_obj); | ||||
entry = Py_BuildValue("iiiiiiiN", 0, 0, 0, -1, -1, -1, -1, nullid_obj); | ||||
PyObject_GC_UnTrack(entry); /* don't waste time with this */ | ||||
if (!entry) | ||||
goto quit; | ||||
if (inlined) { | ||||
if (PyList_Append(index, entry) != 0) | ||||
goto quit; | ||||
Py_DECREF(entry); | ||||
} else { | ||||
PyList_SET_ITEM(index, n, entry); /* steals reference */ | ||||
} | ||||
return 1; | ||||
quit: | ||||
Py_XDECREF(n_obj); | ||||
Py_XDECREF(node_id); | ||||
Py_XDECREF(entry); | ||||
Py_XDECREF(nullrev_obj); | ||||
Py_XDECREF(nullid_obj); | ||||
return 0; | ||||
} | ||||
/* This function parses a index file and returns a Python tuple of the | ||||
* following format: (index, nodemap, cache) | ||||
* | ||||
* index: a list of tuples containing the RevlogNG records | ||||
* nodemap: a dict mapping node ids to indices in the index list | ||||
* cache: if data is inlined, a tuple (index_file_content, 0) else None | ||||
*/ | ||||
static PyObject *parse_index(PyObject *self, PyObject *args) | ||||
{ | ||||
const char *data; | ||||
int size, inlined; | ||||
PyObject *rval = NULL, *index = NULL, *nodemap = NULL, *cache = NULL; | ||||
PyObject *data_obj = NULL, *inlined_obj; | ||||
if (!PyArg_ParseTuple(args, "s#O", &data, &size, &inlined_obj)) | ||||
return NULL; | ||||
inlined = inlined_obj && PyObject_IsTrue(inlined_obj); | ||||
/* If no data is inlined, we know the size of the index list in | ||||
* advance: size divided by size of one one revlog record (64 bytes) | ||||
* plus one for the nullid */ | ||||
index = inlined ? PyList_New(0) : PyList_New(size / 64 + 1); | ||||
if (!index) | ||||
goto quit; | ||||
nodemap = PyDict_New(); | ||||
/* set up the cache return value */ | ||||
if (inlined) { | ||||
/* Note that the reference to data_obj is only borrowed */ | ||||
data_obj = PyTuple_GET_ITEM(args, 0); | ||||
cache = Py_BuildValue("iO", 0, data_obj); | ||||
if (!cache) | ||||
goto quit; | ||||
} else { | ||||
cache = Py_None; | ||||
Py_INCREF(Py_None); | ||||
} | ||||
/* actually populate the index and the nodemap with data */ | ||||
if (!_parse_index_ng (data, size, inlined, index, nodemap)) | ||||
goto quit; | ||||
rval = Py_BuildValue("NNN", index, nodemap, cache); | ||||
if (!rval) | ||||
goto quit; | ||||
return rval; | ||||
quit: | ||||
Py_XDECREF(index); | ||||
Py_XDECREF(nodemap); | ||||
Py_XDECREF(cache); | ||||
Py_XDECREF(rval); | ||||
Py_XDECREF(data_obj); | ||||
return NULL; | ||||
} | ||||
Bryan O'Sullivan
|
r6389 | static char parsers_doc[] = "Efficient content parsing."; | ||
static PyMethodDef methods[] = { | ||||
{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"}, | ||||
Matt Mackall
|
r7093 | {"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"}, | ||
Bernhard Leiner
|
r7108 | {"parse_index", parse_index, METH_VARARGS, "parse a revlog index\n"}, | ||
Bryan O'Sullivan
|
r6389 | {NULL, NULL} | ||
}; | ||||
PyMODINIT_FUNC initparsers(void) | ||||
{ | ||||
Py_InitModule3("parsers", methods, parsers_doc); | ||||
} | ||||