##// END OF EJS Templates
mdiff: replace wscleanup() regexps with C loops...
mdiff: replace wscleanup() regexps with C loops On my system it reduces: hg annotate -w mercurial/commands.py from 36s to less than 8s, to be compared with 6.3s when run without whitespace options.

File last commit:

r15033:2ef2d3a5 stable
r15530:eeac5e17 default
Show More
parsers.c
418 lines | 9.1 KiB | text/x-c | CLexer
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389 /*
parsers.c - efficient content parsing
Copyright 2008 Matt Mackall <mpm@selenic.com> and others
This software may be used and distributed according to the terms of
the GNU General Public License, incorporated herein by reference.
*/
#include <Python.h>
#include <ctype.h>
#include <string.h>
Renato Cunha
parsers.c: Added support for py3k....
r11361 #include "util.h"
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389 static int hexdigit(char c)
{
if (c >= '0' && c <= '9')
return c - '0';
Matt Mackall
parsers: speed up hex decoding for manifests
r7092 if (c >= 'a' && c <= 'f')
return c - 'a' + 10;
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389 if (c >= 'A' && c <= 'F')
return c - 'A' + 10;
Matt Mackall
parsers: speed up hex decoding for manifests
r7092 PyErr_SetString(PyExc_ValueError, "input contains non-hex character");
return 0;
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389 }
/*
* Turn a hex-encoded string into binary.
*/
static PyObject *unhexlify(const char *str, int len)
{
Matt Mackall
parsers: speed up hex decoding for manifests
r7092 PyObject *ret;
Benoit Boissinot
fix const annotation warning
r6395 const char *c;
char *d;
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389
Renato Cunha
parsers.c: Added support for py3k....
r11361 ret = PyBytes_FromStringAndSize(NULL, len / 2);
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389 if (!ret)
Matt Mackall
parsers: speed up hex decoding for manifests
r7092 return NULL;
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389
Renato Cunha
parsers.c: Added support for py3k....
r11361 d = PyBytes_AsString(ret);
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389 for (c = str; c < str + len;) {
int hi = hexdigit(*c++);
int lo = hexdigit(*c++);
*d++ = (hi << 4) | lo;
}
Matt Mackall
parsers: clean up whitespace
r7091
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389 return ret;
}
/*
 * parse_manifest(mfdict, fdict, data) -> None
 *
 * Fill two dictionaries from manifest text.  Every entry has the shape
 *
 *     <file> '\0' <hex node> [<flags>] '\n'
 *
 * For each entry, mfdict[file] is set to the binary form of the first
 * (at most 40) hex digits after the NUL.  Anything following those 40
 * digits, up to the newline, is stored in fdict[file].
 *
 * This code assumes that a manifest is stitched together with newline
 * ('\n') characters.
 *
 * Raises ValueError when an entry has no NUL separator or when the data
 * does not end with a newline.
 */
static PyObject *parse_manifest(PyObject *self, PyObject *args)
{
	PyObject *mfdict, *fdict;
	char *str, *cur, *start, *zero;
	int len;

	if (!PyArg_ParseTuple(args, "O!O!s#:parse_manifest",
			      &PyDict_Type, &mfdict,
			      &PyDict_Type, &fdict,
			      &str, &len))
		goto quit;

	/* start: first byte of the current entry; zero: last NUL seen in it */
	for (start = cur = str, zero = NULL; cur < str + len; cur++) {
		PyObject *file = NULL, *node = NULL;
		PyObject *flags = NULL;
		int nlen;

		if (!*cur) {
			zero = cur;
			continue;
		}
		else if (*cur != '\n')
			continue;

		/* cur is at the newline that terminates the entry */
		if (!zero) {
			PyErr_SetString(PyExc_ValueError,
					"manifest entry has no separator");
			goto quit;
		}

		file = PyBytes_FromStringAndSize(start, zero - start);
		if (!file)
			goto bail;

		/* number of bytes between the NUL and the newline */
		nlen = cur - zero - 1;

		node = unhexlify(zero + 1, nlen > 40 ? 40 : nlen);
		if (!node)
			goto bail;

		if (nlen > 40) {
			flags = PyBytes_FromStringAndSize(zero + 41,
							  nlen - 40);
			if (!flags)
				goto bail;

			if (PyDict_SetItem(fdict, file, flags) == -1)
				goto bail;
		}

		if (PyDict_SetItem(mfdict, file, node) == -1)
			goto bail;

		start = cur + 1;
		zero = NULL;

		/* the dictionaries hold their own references now */
		Py_XDECREF(flags);
		Py_XDECREF(node);
		Py_XDECREF(file);
		continue;
	bail:
		Py_XDECREF(flags);
		Py_XDECREF(node);
		Py_XDECREF(file);
		goto quit;
	}

	if (len > 0 && *(cur - 1) != '\n') {
		PyErr_SetString(PyExc_ValueError,
				"manifest contains trailing garbage");
		goto quit;
	}

	Py_INCREF(Py_None);
	return Py_None;
quit:
	return NULL;
}
Matt Mackall
dirstate: C parsing extension
/*
 * Fixed-width integer types and ntohl() for every supported platform.
 * On Windows a local byte-swapping ntohl() is defined; elsewhere the
 * system headers provide it.
 */
#ifdef _WIN32
#ifdef _MSC_VER
/* msvc 6.0 has problems */
#define inline __inline
typedef unsigned long uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif
/* Reverse the four bytes of x. */
static uint32_t ntohl(uint32_t x)
{
	return ((x & 0x000000ffUL) << 24) |
	       ((x & 0x0000ff00UL) <<  8) |
	       ((x & 0x00ff0000UL) >>  8) |
	       ((x & 0xff000000UL) >> 24);
}
#else
/* not windows */
#include <sys/types.h>
#if defined __BEOS__ && !defined __HAIKU__
#include <ByteOrder.h>
#else
#include <arpa/inet.h>
#endif
#include <inttypes.h>
#endif
/*
 * parse_dirstate(dmap, cmap, data) -> (parent1, parent2)
 *
 * data starts with two 20-byte parent ids, followed by entries of the
 * form
 *
 *      1 byte   state
 *      4 bytes  mode             (big endian)
 *      4 bytes  size             (big endian)
 *      4 bytes  mtime            (big endian)
 *      4 bytes  name length      (big endian)
 *      n bytes  name, optionally "name\0other"
 *
 * For every entry dmap[name] is set to (state, mode, size, mtime).  When
 * the name field contains a NUL, the part after it is stored as
 * cmap[name] (presumably the copy source -- confirm against the caller).
 *
 * Raises ValueError if data is too short to hold the parents or if an
 * entry's name length runs past the end of data.
 */
static PyObject *parse_dirstate(PyObject *self, PyObject *args)
{
	PyObject *dmap, *cmap, *parents = NULL, *ret = NULL;
	PyObject *fname = NULL, *cname = NULL, *entry = NULL;
	char *str, *cur, *end, *cpos;
	int state, mode, size, mtime;
	unsigned int flen;
	int len;
	uint32_t decode[4]; /* for alignment */

	if (!PyArg_ParseTuple(args, "O!O!s#:parse_dirstate",
			      &PyDict_Type, &dmap,
			      &PyDict_Type, &cmap,
			      &str, &len))
		goto quit;

	/* read parents */
	if (len < 40) {
		/* returning NULL without an exception would be an API error */
		PyErr_SetString(PyExc_ValueError,
				"too little data for parents");
		goto quit;
	}

	parents = Py_BuildValue("s#s#", str, 20, str + 20, 20);
	if (!parents)
		goto quit;

	/* read filenames */
	cur = str + 40;
	end = str + len;

	while (cur < end - 17) {
		/* unpack header */
		state = *cur;
		memcpy(decode, cur + 1, 16);
		mode = ntohl(decode[0]);
		size = ntohl(decode[1]);
		mtime = ntohl(decode[2]);
		flen = ntohl(decode[3]);
		cur += 17;

		/*
		 * Compare against the bytes that are left instead of
		 * forming cur + flen, which may overflow the pointer.
		 */
		if (flen > (size_t)(end - cur)) {
			PyErr_SetString(PyExc_ValueError, "overflow in dirstate");
			goto quit;
		}

		entry = Py_BuildValue("ciii", state, mode, size, mtime);
		if (!entry)
			goto quit;
		PyObject_GC_UnTrack(entry); /* don't waste time with this */

		cpos = memchr(cur, 0, flen);
		if (cpos) {
			/* "name\0other": split at the NUL */
			fname = PyBytes_FromStringAndSize(cur, cpos - cur);
			cname = PyBytes_FromStringAndSize(cpos + 1,
							  flen - (cpos - cur) - 1);
			if (!fname || !cname ||
			    PyDict_SetItem(cmap, fname, cname) == -1 ||
			    PyDict_SetItem(dmap, fname, entry) == -1)
				goto quit;
			Py_DECREF(cname);
		} else {
			fname = PyBytes_FromStringAndSize(cur, flen);
			if (!fname ||
			    PyDict_SetItem(dmap, fname, entry) == -1)
				goto quit;
		}
		cur += flen;
		Py_DECREF(fname);
		Py_DECREF(entry);
		/* cleared so the cleanup below does not release them twice */
		fname = cname = entry = NULL;
	}

	ret = parents;
	Py_INCREF(ret);
quit:
	Py_XDECREF(fname);
	Py_XDECREF(cname);
	Py_XDECREF(entry);
	Py_XDECREF(parents);
	return ret;
}
Bernhard Leiner
C implementation of revlog index parsing
/* Node id used for the null entry of a revlog index: twenty zero bytes. */
const char nullid[20] = {0};
/* Revision number constant; not referenced elsewhere in the visible code. */
const int nullrev = -1;
/* RevlogNG format (all in big endian, data may be inlined):
* 6 bytes: offset
* 2 bytes: flags
* 4 bytes: compressed length
* 4 bytes: uncompressed length
* 4 bytes: base revision
* 4 bytes: link revision
* 4 bytes: parent 1 revision
* 4 bytes: parent 2 revision
* 32 bytes: nodeid (only 20 bytes used)
*/
Matt Mackall
revlog: only build the nodemap on demand
r13254 static int _parse_index_ng(const char *data, int size, int inlined,
PyObject *index)
Bernhard Leiner
C implementation of revlog index parsing
r7108 {
Benoit Boissinot
index parser: fix refcounting in case of errors, refactor...
r7154 PyObject *entry;
int n = 0, err;
uint64_t offset_flags;
Bernhard Leiner
C implementation of revlog index parsing
r7108 int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2;
Benoit Boissinot
index parser: fix refcounting in case of errors, refactor...
r7154 const char *c_node_id;
Bernhard Leiner
C implementation of revlog index parsing
r7108 const char *end = data + size;
Matt Mackall
parsers: avoid pointer aliasing...
r15033 uint32_t decode[8]; /* to enforce alignment with inline data */
Bernhard Leiner
C implementation of revlog index parsing
r7108
while (data < end) {
Benoit Boissinot
parsers.c: fix integer overflows...
r7174 unsigned int step;
Thomas Arendsen Hein
Some additional space/tab cleanups
r7190
Matt Mackall
parsers: avoid pointer aliasing...
r15033 memcpy(decode, data, 32);
offset_flags = ntohl(decode[1]);
Dirkjan Ochtman
clean up trailing spaces, leading spaces in C
r7186 if (n == 0) /* mask out version number for the first entry */
offset_flags &= 0xFFFF;
else {
Matt Mackall
parsers: avoid pointer aliasing...
r15033 uint32_t offset_high = ntohl(decode[0]);
Matt Mackall
many, many trivial check-code fixups
r10282 offset_flags |= ((uint64_t)offset_high) << 32;
Benoit Boissinot
revlog parser: use ntohl() instead of ntohll() (fix endianness issues)
r7133 }
Matt Mackall
parsers: avoid pointer aliasing...
r15033 comp_len = ntohl(decode[2]);
uncomp_len = ntohl(decode[3]);
base_rev = ntohl(decode[4]);
link_rev = ntohl(decode[5]);
parent_1 = ntohl(decode[6]);
parent_2 = ntohl(decode[7]);
c_node_id = data + 32;
Benoit Boissinot
index parser: fix refcounting in case of errors, refactor...
r7154
Matt Mackall
revlog: only build the nodemap on demand
r13254 entry = Py_BuildValue("Liiiiiis#", offset_flags, comp_len,
uncomp_len, base_rev, link_rev,
parent_1, parent_2, c_node_id, 20);
Bernhard Leiner
C implementation of revlog index parsing
r7108 if (!entry)
Benoit Boissinot
index parser: fix refcounting in case of errors, refactor...
r7154 return 0;
Bernhard Leiner
C implementation of revlog index parsing
r7108
Matt Mackall
revlog: only build the nodemap on demand
r13254 PyObject_GC_UnTrack(entry); /* don't waste time with this */
Bernhard Leiner
C implementation of revlog index parsing
r7108 if (inlined) {
Benoit Boissinot
index parser: fix refcounting in case of errors, refactor...
r7154 err = PyList_Append(index, entry);
Bernhard Leiner
C implementation of revlog index parsing
r7108 Py_DECREF(entry);
Benoit Boissinot
index parser: fix refcounting in case of errors, refactor...
r7154 if (err)
return 0;
} else
Bernhard Leiner
C implementation of revlog index parsing
r7108 PyList_SET_ITEM(index, n, entry); /* steals reference */
n++;
Benoit Boissinot
parsers.c: fix integer overflows...
r7174 step = 64 + (inlined ? comp_len : 0);
Matt Mackall
parsers: fix some signed comparison issues...
r10449 if (data + step > end || data + step < data)
Benoit Boissinot
parsers.c: fix integer overflows...
r7174 break;
data += step;
Bernhard Leiner
C implementation of revlog index parsing
r7108 }
Benoit Boissinot
parsers.c: fix integer overflows...
r7174 if (data != end) {
Bernhard Leiner
C implementation of revlog index parsing
r7108 if (!PyErr_Occurred())
PyErr_SetString(PyExc_ValueError, "corrupt index file");
Benoit Boissinot
index parser: fix refcounting in case of errors, refactor...
r7154 return 0;
Bernhard Leiner
C implementation of revlog index parsing
r7108 }
Matt Mackall
revlog: only build the nodemap on demand
r13254 /* create the magic nullid entry in the index at [-1] */
entry = Py_BuildValue("Liiiiiis#", (uint64_t)0, 0, 0, -1, -1, -1, -1, nullid, 20);
Bernhard Leiner
C implementation of revlog index parsing
r7108 if (!entry)
Benoit Boissinot
index parser: fix refcounting in case of errors, refactor...
r7154 return 0;
Matt Mackall
revlog: only build the nodemap on demand
r13254
PyObject_GC_UnTrack(entry); /* don't waste time with this */
Bernhard Leiner
C implementation of revlog index parsing
r7108 if (inlined) {
Benoit Boissinot
index parser: fix refcounting in case of errors, refactor...
r7154 err = PyList_Append(index, entry);
Bernhard Leiner
C implementation of revlog index parsing
r7108 Py_DECREF(entry);
Benoit Boissinot
index parser: fix refcounting in case of errors, refactor...
r7154 if (err)
return 0;
} else
Bernhard Leiner
C implementation of revlog index parsing
r7108 PyList_SET_ITEM(index, n, entry); /* steals reference */
return 1;
}
/* This function parses a index file and returns a Python tuple of the
Matt Mackall
revlog: only build the nodemap on demand
r13254 * following format: (index, cache)
Bernhard Leiner
C implementation of revlog index parsing
r7108 *
* index: a list of tuples containing the RevlogNG records
* cache: if data is inlined, a tuple (index_file_content, 0) else None
*/
Matt Mackall
revlog: only build the nodemap on demand
r13254 static PyObject *parse_index2(PyObject *self, PyObject *args)
Bernhard Leiner
C implementation of revlog index parsing
r7108 {
const char *data;
int size, inlined;
Matt Mackall
revlog: only build the nodemap on demand
r13254 PyObject *rval = NULL, *index = NULL, *cache = NULL;
Bernhard Leiner
C implementation of revlog index parsing
r7108 PyObject *data_obj = NULL, *inlined_obj;
if (!PyArg_ParseTuple(args, "s#O", &data, &size, &inlined_obj))
return NULL;
inlined = inlined_obj && PyObject_IsTrue(inlined_obj);
Thomas Arendsen Hein
Some additional space/tab cleanups
r7190 /* If no data is inlined, we know the size of the index list in
Benoit Boissinot
parsers.c: fix comment
r13263 * advance: size divided by the size of one revlog record (64 bytes)
* plus one for nullid */
Bernhard Leiner
C implementation of revlog index parsing
r7108 index = inlined ? PyList_New(0) : PyList_New(size / 64 + 1);
if (!index)
goto quit;
/* set up the cache return value */
if (inlined) {
/* Note that the reference to data_obj is only borrowed */
data_obj = PyTuple_GET_ITEM(args, 0);
cache = Py_BuildValue("iO", 0, data_obj);
if (!cache)
goto quit;
} else {
cache = Py_None;
Py_INCREF(Py_None);
}
Matt Mackall
revlog: only build the nodemap on demand
r13254 /* actually populate the index with data */
if (!_parse_index_ng(data, size, inlined, index))
Bernhard Leiner
C implementation of revlog index parsing
r7108 goto quit;
Matt Mackall
revlog: only build the nodemap on demand
r13254 rval = Py_BuildValue("NN", index, cache);
Bernhard Leiner
C implementation of revlog index parsing
r7108 if (!rval)
goto quit;
return rval;
quit:
Py_XDECREF(index);
Py_XDECREF(cache);
Py_XDECREF(rval);
return NULL;
}
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389 static char parsers_doc[] = "Efficient content parsing.";
static PyMethodDef methods[] = {
{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"},
Matt Mackall
dirstate: C parsing extension
r7093 {"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"},
Matt Mackall
revlog: only build the nodemap on demand
r13254 {"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389 {NULL, NULL}
};
Renato Cunha
parsers.c: Added support for py3k....
r11361 #ifdef IS_PY3K
static struct PyModuleDef parsers_module = {
PyModuleDef_HEAD_INIT,
"parsers",
parsers_doc,
-1,
methods
};
PyMODINIT_FUNC PyInit_parsers(void)
{
return PyModule_Create(&parsers_module);
}
#else
Bryan O'Sullivan
manifest: improve parsing performance by 8x via a new C extension
r6389 PyMODINIT_FUNC initparsers(void)
{
Py_InitModule3("parsers", methods, parsers_doc);
}
Renato Cunha
parsers.c: Added support for py3k....
r11361 #endif