compressiondict.c
392 lines
| 10.8 KiB
| text/x-c
|
CLexer
Gregory Szorc
|
/**
 * Copyright (c) 2016-present, Gregory Szorc
 * All rights reserved.
 *
 * This software may be modified and distributed under the terms
 * of the BSD license. See the LICENSE file for details.
 */
#include "python-zstandard.h" | ||||
extern PyObject* ZstdError; | ||||
/**
 * Train a compression dictionary from a list of bytes samples.
 *
 * Python-level arguments: dict_size, samples, and the optional selectivity,
 * level, notifications and dict_id (all defaulting to 0).
 *
 * Every sample is copied into one contiguous buffer, with a parallel array
 * recording each sample's size, and the pair is handed to
 * ZDICT_trainFromBuffer_advanced() while the GIL is released.
 *
 * Returns a new ZstdCompressionDict that owns the trained dictionary buffer.
 * Returns NULL with a Python exception set when argument parsing fails, when
 * a sample is not a bytes object (ValueError), when a buffer cannot be
 * allocated (MemoryError), when zstd reports a training error (ZstdError),
 * or when the result object cannot be created.
 */
ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
    static char* kwlist[] = {
        "dict_size",
        "samples",
        "selectivity",
        "level",
        "notifications",
        "dict_id",
        NULL
    };
    size_t capacity;
    PyObject* samples;
    Py_ssize_t samplesLen;
    unsigned selectivity = 0;
    int level = 0;
    unsigned notifications = 0;
    unsigned dictID = 0;
    ZDICT_params_t zparams;
    Py_ssize_t sampleIndex;
    Py_ssize_t sampleSize;
    PyObject* sampleItem;
    size_t zresult;
    void* sampleBuffer = NULL;
    void* sampleOffset;
    size_t samplesSize = 0;
    size_t* sampleSizes = NULL;
    void* dict = NULL;
    ZstdCompressionDict* result = NULL;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IiII:train_dictionary",
        kwlist,
        &capacity,
        &PyList_Type, &samples,
        &selectivity, &level, &notifications, &dictID)) {
        return NULL;
    }

    memset(&zparams, 0, sizeof(zparams));

    zparams.selectivityLevel = selectivity;
    zparams.compressionLevel = level;
    zparams.notificationLevel = notifications;
    zparams.dictID = dictID;

    /* Figure out the size of the raw samples */
    samplesLen = PyList_Size(samples);
    for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
        sampleItem = PyList_GetItem(samples, sampleIndex);
        if (!PyBytes_Check(sampleItem)) {
            PyErr_SetString(PyExc_ValueError, "samples must be bytes");
            /* Nothing has been allocated yet, so a direct return is safe. */
            return NULL;
        }
        samplesSize += PyBytes_GET_SIZE(sampleItem);
    }

    /* Now that we know the total size of the raw samples, we can allocate
       a buffer for the raw data */
    sampleBuffer = PyMem_Malloc(samplesSize);
    if (!sampleBuffer) {
        PyErr_NoMemory();
        goto finally;
    }

    sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t));
    if (!sampleSizes) {
        PyErr_NoMemory();
        goto finally;
    }

    sampleOffset = sampleBuffer;
    /* Now iterate again and assemble the samples in the buffer */
    for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
        sampleItem = PyList_GetItem(samples, sampleIndex);
        sampleSize = PyBytes_GET_SIZE(sampleItem);
        sampleSizes[sampleIndex] = sampleSize;
        memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize);
        sampleOffset = (char*)sampleOffset + sampleSize;
    }

    dict = PyMem_Malloc(capacity);
    if (!dict) {
        PyErr_NoMemory();
        goto finally;
    }

    /* TODO consider using dup2() to redirect zstd's stderr writing to a buffer */
    /* Only the private copies made above are touched while the GIL is released. */
    Py_BEGIN_ALLOW_THREADS
    zresult = ZDICT_trainFromBuffer_advanced(dict, capacity,
        sampleBuffer, sampleSizes, (unsigned int)samplesLen,
        zparams);
    Py_END_ALLOW_THREADS

    if (ZDICT_isError(zresult)) {
        PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult));
        goto finally;
    }

    result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType);
    if (!result) {
        /* dict is released in the cleanup section below. */
        goto finally;
    }

    result->dictData = dict;
    result->dictSize = zresult;
    result->d = 0;
    result->k = 0;
    /* Ownership of the buffer moved to result; keep cleanup from freeing it. */
    dict = NULL;

finally:
    /* dict is non-NULL here only on a failure path after its allocation. */
    PyMem_Free(dict);
    PyMem_Free(sampleBuffer);
    PyMem_Free(sampleSizes);

    return result;
}
Gregory Szorc
|
r31796 | ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) { | ||
static char* kwlist[] = { | ||||
"dict_size", | ||||
"samples", | ||||
"k", | ||||
"d", | ||||
"notifications", | ||||
"dict_id", | ||||
"level", | ||||
"optimize", | ||||
"steps", | ||||
"threads", | ||||
NULL | ||||
}; | ||||
size_t capacity; | ||||
PyObject* samples; | ||||
unsigned k = 0; | ||||
unsigned d = 0; | ||||
unsigned notifications = 0; | ||||
unsigned dictID = 0; | ||||
int level = 0; | ||||
PyObject* optimize = NULL; | ||||
unsigned steps = 0; | ||||
int threads = 0; | ||||
COVER_params_t params; | ||||
Py_ssize_t samplesLen; | ||||
Py_ssize_t i; | ||||
size_t samplesSize = 0; | ||||
void* sampleBuffer = NULL; | ||||
size_t* sampleSizes = NULL; | ||||
void* sampleOffset; | ||||
Py_ssize_t sampleSize; | ||||
void* dict = NULL; | ||||
size_t zresult; | ||||
ZstdCompressionDict* result = NULL; | ||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiOIi:train_cover_dictionary", | ||||
kwlist, &capacity, &PyList_Type, &samples, | ||||
&k, &d, ¬ifications, &dictID, &level, &optimize, &steps, &threads)) { | ||||
return NULL; | ||||
} | ||||
if (threads < 0) { | ||||
threads = cpu_count(); | ||||
} | ||||
memset(¶ms, 0, sizeof(params)); | ||||
params.k = k; | ||||
params.d = d; | ||||
params.steps = steps; | ||||
params.nbThreads = threads; | ||||
params.notificationLevel = notifications; | ||||
params.dictID = dictID; | ||||
params.compressionLevel = level; | ||||
/* Figure out total size of input samples. */ | ||||
samplesLen = PyList_Size(samples); | ||||
for (i = 0; i < samplesLen; i++) { | ||||
PyObject* sampleItem = PyList_GET_ITEM(samples, i); | ||||
if (!PyBytes_Check(sampleItem)) { | ||||
PyErr_SetString(PyExc_ValueError, "samples must be bytes"); | ||||
return NULL; | ||||
} | ||||
samplesSize += PyBytes_GET_SIZE(sampleItem); | ||||
} | ||||
sampleBuffer = PyMem_Malloc(samplesSize); | ||||
if (!sampleBuffer) { | ||||
PyErr_NoMemory(); | ||||
goto finally; | ||||
} | ||||
sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t)); | ||||
if (!sampleSizes) { | ||||
PyErr_NoMemory(); | ||||
goto finally; | ||||
} | ||||
sampleOffset = sampleBuffer; | ||||
for (i = 0; i < samplesLen; i++) { | ||||
PyObject* sampleItem = PyList_GET_ITEM(samples, i); | ||||
sampleSize = PyBytes_GET_SIZE(sampleItem); | ||||
sampleSizes[i] = sampleSize; | ||||
memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize); | ||||
sampleOffset = (char*)sampleOffset + sampleSize; | ||||
} | ||||
dict = PyMem_Malloc(capacity); | ||||
if (!dict) { | ||||
PyErr_NoMemory(); | ||||
goto finally; | ||||
} | ||||
Py_BEGIN_ALLOW_THREADS | ||||
if (optimize && PyObject_IsTrue(optimize)) { | ||||
zresult = COVER_optimizeTrainFromBuffer(dict, capacity, | ||||
sampleBuffer, sampleSizes, (unsigned)samplesLen, ¶ms); | ||||
} | ||||
else { | ||||
zresult = COVER_trainFromBuffer(dict, capacity, | ||||
sampleBuffer, sampleSizes, (unsigned)samplesLen, params); | ||||
} | ||||
Py_END_ALLOW_THREADS | ||||
if (ZDICT_isError(zresult)) { | ||||
PyMem_Free(dict); | ||||
PyErr_Format(ZstdError, "cannot train dict: %s", ZDICT_getErrorName(zresult)); | ||||
goto finally; | ||||
} | ||||
result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType); | ||||
if (!result) { | ||||
PyMem_Free(dict); | ||||
goto finally; | ||||
} | ||||
result->dictData = dict; | ||||
result->dictSize = zresult; | ||||
result->d = params.d; | ||||
result->k = params.k; | ||||
finally: | ||||
PyMem_Free(sampleBuffer); | ||||
PyMem_Free(sampleSizes); | ||||
return result; | ||||
} | ||||
Gregory Szorc
|
r30435 | |||
PyDoc_STRVAR(ZstdCompressionDict__doc__, | ||||
"ZstdCompressionDict(data) - Represents a computed compression dictionary\n" | ||||
"\n" | ||||
"This type holds the results of a computed Zstandard compression dictionary.\n" | ||||
"Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n" | ||||
"obtained from another source into the constructor.\n" | ||||
); | ||||
/**
 * tp_init for ZstdCompressionDict: copy caller-supplied dictionary bytes
 * into a buffer owned by the instance.
 *
 * The single positional argument is parsed with "y#" on Python 3 and "s#"
 * on Python 2. Returns 0 on success, or -1 with an exception set when the
 * arguments cannot be parsed or the copy cannot be allocated.
 *
 * NOTE(review): dictData is reset to NULL before anything else, so running
 * __init__ again on an instance that already holds a buffer drops that
 * buffer without freeing it -- confirm whether re-initialization needs to be
 * supported.
 */
static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) {
    const char* data;
    Py_ssize_t dataLen;
    void* copy;
#if PY_MAJOR_VERSION >= 3
    const char* format = "y#:ZstdCompressionDict";
#else
    const char* format = "s#:ZstdCompressionDict";
#endif

    /* Start from an empty dictionary so every failure path leaves the
       instance without data. */
    self->dictData = NULL;
    self->dictSize = 0;

    if (!PyArg_ParseTuple(args, format, &data, &dataLen)) {
        return -1;
    }

    copy = PyMem_Malloc(dataLen);
    if (!copy) {
        PyErr_NoMemory();
        return -1;
    }

    /* Keep a private copy rather than pointing into the argument's storage. */
    memcpy(copy, data, dataLen);
    self->dictData = copy;
    self->dictSize = dataLen;

    return 0;
}
/**
 * tp_dealloc for ZstdCompressionDict: release the dictionary buffer, then
 * the object itself.
 *
 * The type is declared with Py_TPFLAGS_BASETYPE, so this function can run
 * for instances of subclasses. The object memory is therefore returned
 * through the tp_free slot of the instance's actual type instead of a
 * hard-coded PyObject_Del().
 */
static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
    /* PyMem_Free() accepts NULL, so no guard is needed. */
    PyMem_Free(self->dictData);
    self->dictData = NULL;

    Py_TYPE(self)->tp_free((PyObject*)self);
}
/**
 * dict_id() method: return the dictionary ID that ZDICT_getDictID() reports
 * for the stored dictionary data, as a Python integer.
 *
 * The ID is unsigned, so it is converted with PyLong_FromUnsignedLong();
 * converting through a signed long would yield a negative number for IDs
 * at or above 2^31 on platforms where long is 32 bits wide.
 */
static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) {
    unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize);

    return PyLong_FromUnsignedLong(dictID);
}
/**
 * as_bytes() method: return a new bytes object holding a copy of the
 * dictionary data stored on the instance.
 */
static PyObject* ZstdCompressionDict_as_bytes(ZstdCompressionDict* self) {
    const char* raw = self->dictData;
    Py_ssize_t rawLen = self->dictSize;

    return PyBytes_FromStringAndSize(raw, rawLen);
}
static PyMethodDef ZstdCompressionDict_methods[] = { | ||||
{ "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS, | ||||
PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") }, | ||||
{ "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS, | ||||
PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") }, | ||||
{ NULL, NULL } | ||||
}; | ||||
Gregory Szorc
|
r31796 | static PyMemberDef ZstdCompressionDict_members[] = { | ||
{ "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY, | ||||
"segment size" }, | ||||
{ "d", T_UINT, offsetof(ZstdCompressionDict, d), READONLY, | ||||
"dmer size" }, | ||||
{ NULL } | ||||
}; | ||||
Gregory Szorc
|
r30435 | static Py_ssize_t ZstdCompressionDict_length(ZstdCompressionDict* self) { | ||
return self->dictSize; | ||||
} | ||||
static PySequenceMethods ZstdCompressionDict_sq = { | ||||
(lenfunc)ZstdCompressionDict_length, /* sq_length */ | ||||
0, /* sq_concat */ | ||||
0, /* sq_repeat */ | ||||
0, /* sq_item */ | ||||
0, /* sq_ass_item */ | ||||
0, /* sq_contains */ | ||||
0, /* sq_inplace_concat */ | ||||
0 /* sq_inplace_repeat */ | ||||
}; | ||||
PyTypeObject ZstdCompressionDictType = { | ||||
PyVarObject_HEAD_INIT(NULL, 0) | ||||
"zstd.ZstdCompressionDict", /* tp_name */ | ||||
sizeof(ZstdCompressionDict), /* tp_basicsize */ | ||||
0, /* tp_itemsize */ | ||||
(destructor)ZstdCompressionDict_dealloc, /* tp_dealloc */ | ||||
0, /* tp_print */ | ||||
0, /* tp_getattr */ | ||||
0, /* tp_setattr */ | ||||
0, /* tp_compare */ | ||||
0, /* tp_repr */ | ||||
0, /* tp_as_number */ | ||||
&ZstdCompressionDict_sq, /* tp_as_sequence */ | ||||
0, /* tp_as_mapping */ | ||||
0, /* tp_hash */ | ||||
0, /* tp_call */ | ||||
0, /* tp_str */ | ||||
0, /* tp_getattro */ | ||||
0, /* tp_setattro */ | ||||
0, /* tp_as_buffer */ | ||||
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ | ||||
ZstdCompressionDict__doc__, /* tp_doc */ | ||||
0, /* tp_traverse */ | ||||
0, /* tp_clear */ | ||||
0, /* tp_richcompare */ | ||||
0, /* tp_weaklistoffset */ | ||||
0, /* tp_iter */ | ||||
0, /* tp_iternext */ | ||||
ZstdCompressionDict_methods, /* tp_methods */ | ||||
Gregory Szorc
|
r31796 | ZstdCompressionDict_members, /* tp_members */ | ||
Gregory Szorc
|
r30435 | 0, /* tp_getset */ | ||
0, /* tp_base */ | ||||
0, /* tp_dict */ | ||||
0, /* tp_descr_get */ | ||||
0, /* tp_descr_set */ | ||||
0, /* tp_dictoffset */ | ||||
(initproc)ZstdCompressionDict_init, /* tp_init */ | ||||
0, /* tp_alloc */ | ||||
PyType_GenericNew, /* tp_new */ | ||||
}; | ||||
/**
 * Finalize the ZstdCompressionDict type and publish it on the given module
 * under the name "ZstdCompressionDict".
 *
 * Returns silently, without adding anything to the module, if PyType_Ready()
 * fails.
 */
void compressiondict_module_init(PyObject* mod) {
    PyObject* typeObject = (PyObject*)&ZstdCompressionDictType;

    Py_TYPE(&ZstdCompressionDictType) = &PyType_Type;
    if (PyType_Ready(&ZstdCompressionDictType) < 0) {
        return;
    }

    /* Take a reference before handing the type to the module. */
    Py_INCREF(typeObject);
    PyModule_AddObject(mod, "ZstdCompressionDict", typeObject);
}