|
|
/**
|
|
|
* Copyright (c) 2016-present, Gregory Szorc
|
|
|
* All rights reserved.
|
|
|
*
|
|
|
* This software may be modified and distributed under the terms
|
|
|
* of the BSD license. See the LICENSE file for details.
|
|
|
*/
|
|
|
|
|
|
#include "python-zstandard.h"
|
|
|
|
|
|
extern PyObject* ZstdError;
|
|
|
|
|
|
/**
 * Train a compression dictionary from a list of bytes samples.
 *
 * Python-visible arguments (see kwlist): dict_size, samples, and the optional
 * selectivity, level, notifications and dict_id. ``samples`` must be a list
 * whose items are all bytes objects.
 *
 * The samples are copied into a single contiguous buffer, with a parallel
 * array holding the size of each sample, and handed to
 * ZDICT_trainFromBuffer_advanced() with the GIL released.
 *
 * Returns a new ZstdCompressionDict that owns the trained dictionary buffer
 * (its ``d`` and ``k`` members are set to 0), or NULL with a Python exception
 * set: ValueError for a non-bytes sample, MemoryError on allocation failure,
 * ZstdError if training fails.
 */
ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
    static char* kwlist[] = {
        "dict_size",
        "samples",
        "selectivity",
        "level",
        "notifications",
        "dict_id",
        NULL
    };
    size_t capacity;
    PyObject* samples;
    Py_ssize_t samplesLen;
    unsigned selectivity = 0;
    int level = 0;
    unsigned notifications = 0;
    unsigned dictID = 0;
    ZDICT_params_t zparams;
    Py_ssize_t sampleIndex;
    Py_ssize_t sampleSize;
    PyObject* sampleItem;
    size_t zresult;
    void* sampleBuffer = NULL;
    void* sampleOffset;
    size_t samplesSize = 0;
    size_t* sampleSizes = NULL;
    void* dict = NULL;
    ZstdCompressionDict* result = NULL;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IiII:train_dictionary",
        kwlist,
        &capacity,
        &PyList_Type, &samples,
        &selectivity, &level, &notifications, &dictID)) {
        return NULL;
    }

    memset(&zparams, 0, sizeof(zparams));

    zparams.selectivityLevel = selectivity;
    zparams.compressionLevel = level;
    zparams.notificationLevel = notifications;
    zparams.dictID = dictID;

    /* Figure out the size of the raw samples */
    samplesLen = PyList_Size(samples);
    for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
        sampleItem = PyList_GetItem(samples, sampleIndex);
        if (!PyBytes_Check(sampleItem)) {
            PyErr_SetString(PyExc_ValueError, "samples must be bytes");
            /* Nothing has been allocated yet, so a plain return is safe. */
            return NULL;
        }
        samplesSize += PyBytes_GET_SIZE(sampleItem);
    }

    /* Now that we know the total size of the raw samples, we can allocate
       a buffer for the raw data */
    sampleBuffer = PyMem_Malloc(samplesSize);
    if (!sampleBuffer) {
        PyErr_NoMemory();
        goto finally;
    }
    sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t));
    if (!sampleSizes) {
        PyErr_NoMemory();
        goto finally;
    }

    sampleOffset = sampleBuffer;
    /* Now iterate again and assemble the samples in the buffer */
    for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
        sampleItem = PyList_GetItem(samples, sampleIndex);
        sampleSize = PyBytes_GET_SIZE(sampleItem);
        sampleSizes[sampleIndex] = sampleSize;
        memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize);
        sampleOffset = (char*)sampleOffset + sampleSize;
    }

    dict = PyMem_Malloc(capacity);
    if (!dict) {
        PyErr_NoMemory();
        goto finally;
    }

    /* TODO consider using dup2() to redirect zstd's stderr writing to a buffer */
    Py_BEGIN_ALLOW_THREADS
    zresult = ZDICT_trainFromBuffer_advanced(dict, capacity,
        sampleBuffer, sampleSizes, (unsigned int)samplesLen,
        zparams);
    Py_END_ALLOW_THREADS
    if (ZDICT_isError(zresult)) {
        PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult));
        PyMem_Free(dict);
        goto finally;
    }

    result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType);
    if (!result) {
        /* Ownership of dict was never transferred; release it here so the
           buffer does not leak when object allocation fails. */
        PyMem_Free(dict);
        goto finally;
    }

    /* From here on the returned object owns the dictionary buffer. */
    result->dictData = dict;
    result->dictSize = zresult;
    result->d = 0;
    result->k = 0;

finally:
    /* The staging buffers are only needed during training.
       PyMem_Free(NULL) is a no-op. */
    PyMem_Free(sampleBuffer);
    PyMem_Free(sampleSizes);

    return result;
}
|
|
|
|
|
|
/**
 * Train a compression dictionary with the COVER algorithm.
 *
 * Python-visible arguments (see kwlist): dict_size, samples, and the optional
 * k, d, notifications, dict_id, level, optimize, steps and threads.
 * ``samples`` must be a list whose items are all bytes objects. A negative
 * ``threads`` value is replaced by cpu_count().
 *
 * When ``optimize`` is truthy, COVER_optimizeTrainFromBuffer() is used and
 * receives a pointer to the parameters; otherwise COVER_trainFromBuffer() is
 * called with the parameters passed by value. Training runs with the GIL
 * released.
 *
 * Returns a new ZstdCompressionDict that owns the trained dictionary buffer
 * and records ``params.d`` / ``params.k`` as they stand after training, or
 * NULL with a Python exception set: ValueError for a non-bytes sample,
 * MemoryError on allocation failure, ZstdError if training fails.
 */
ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
    static char* kwlist[] = {
        "dict_size",
        "samples",
        "k",
        "d",
        "notifications",
        "dict_id",
        "level",
        "optimize",
        "steps",
        "threads",
        NULL
    };

    size_t capacity;
    PyObject* samples;
    unsigned k = 0;
    unsigned d = 0;
    unsigned notifications = 0;
    unsigned dictID = 0;
    int level = 0;
    PyObject* optimize = NULL;
    int doOptimize = 0;
    unsigned steps = 0;
    int threads = 0;
    COVER_params_t params;
    Py_ssize_t samplesLen;
    Py_ssize_t i;
    size_t samplesSize = 0;
    void* sampleBuffer = NULL;
    size_t* sampleSizes = NULL;
    void* sampleOffset;
    Py_ssize_t sampleSize;
    void* dict = NULL;
    size_t zresult;
    ZstdCompressionDict* result = NULL;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiOIi:train_cover_dictionary",
        kwlist, &capacity, &PyList_Type, &samples,
        &k, &d, &notifications, &dictID, &level, &optimize, &steps, &threads)) {
        return NULL;
    }

    /* Resolve the truthiness of ``optimize`` while the GIL is still held:
       PyObject_IsTrue() may run arbitrary Python code and must not be called
       between Py_BEGIN_ALLOW_THREADS and Py_END_ALLOW_THREADS. */
    if (optimize) {
        doOptimize = PyObject_IsTrue(optimize);
        if (doOptimize < 0) {
            /* Truth testing raised; nothing has been allocated yet. */
            return NULL;
        }
    }

    if (threads < 0) {
        threads = cpu_count();
    }

    memset(&params, 0, sizeof(params));
    params.k = k;
    params.d = d;
    params.steps = steps;
    params.nbThreads = threads;
    params.notificationLevel = notifications;
    params.dictID = dictID;
    params.compressionLevel = level;

    /* Figure out total size of input samples. */
    samplesLen = PyList_Size(samples);
    for (i = 0; i < samplesLen; i++) {
        PyObject* sampleItem = PyList_GET_ITEM(samples, i);

        if (!PyBytes_Check(sampleItem)) {
            PyErr_SetString(PyExc_ValueError, "samples must be bytes");
            /* Nothing has been allocated yet, so a plain return is safe. */
            return NULL;
        }
        samplesSize += PyBytes_GET_SIZE(sampleItem);
    }

    sampleBuffer = PyMem_Malloc(samplesSize);
    if (!sampleBuffer) {
        PyErr_NoMemory();
        goto finally;
    }

    sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t));
    if (!sampleSizes) {
        PyErr_NoMemory();
        goto finally;
    }

    /* Pack every sample back to back and record each sample's size. */
    sampleOffset = sampleBuffer;
    for (i = 0; i < samplesLen; i++) {
        PyObject* sampleItem = PyList_GET_ITEM(samples, i);
        sampleSize = PyBytes_GET_SIZE(sampleItem);
        sampleSizes[i] = sampleSize;
        memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize);
        sampleOffset = (char*)sampleOffset + sampleSize;
    }

    dict = PyMem_Malloc(capacity);
    if (!dict) {
        PyErr_NoMemory();
        goto finally;
    }

    Py_BEGIN_ALLOW_THREADS
    if (doOptimize) {
        zresult = COVER_optimizeTrainFromBuffer(dict, capacity,
            sampleBuffer, sampleSizes, (unsigned)samplesLen, &params);
    }
    else {
        zresult = COVER_trainFromBuffer(dict, capacity,
            sampleBuffer, sampleSizes, (unsigned)samplesLen, params);
    }
    Py_END_ALLOW_THREADS

    if (ZDICT_isError(zresult)) {
        PyMem_Free(dict);
        PyErr_Format(ZstdError, "cannot train dict: %s", ZDICT_getErrorName(zresult));
        goto finally;
    }

    result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType);
    if (!result) {
        PyMem_Free(dict);
        goto finally;
    }

    /* From here on the returned object owns the dictionary buffer. */
    result->dictData = dict;
    result->dictSize = zresult;
    result->d = params.d;
    result->k = params.k;

finally:
    /* The staging buffers are only needed during training.
       PyMem_Free(NULL) is a no-op. */
    PyMem_Free(sampleBuffer);
    PyMem_Free(sampleSizes);

    return result;
}
|
|
|
|
|
|
PyDoc_STRVAR(ZstdCompressionDict__doc__,
|
|
|
"ZstdCompressionDict(data) - Represents a computed compression dictionary\n"
|
|
|
"\n"
|
|
|
"This type holds the results of a computed Zstandard compression dictionary.\n"
|
|
|
"Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n"
|
|
|
"obtained from another source into the constructor.\n"
|
|
|
);
|
|
|
|
|
|
/**
 * tp_init implementation: build a dictionary object from caller-supplied bytes.
 *
 * Parses a single bytes-like argument ("y#" on Python 3, "s#" on Python 2),
 * copies it into a buffer allocated with PyMem_Malloc() and stores the buffer
 * and its length on the instance.
 *
 * Returns 0 on success, or -1 with a Python exception set when argument
 * parsing or the allocation fails; in both failure cases the instance is left
 * with dictData == NULL and dictSize == 0.
 *
 * NOTE(review): dictData is reset to NULL without being freed first, so a
 * second explicit __init__ call on an initialized instance would drop the
 * previous buffer - confirm whether re-initialization needs to be supported.
 */
static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) {
#if PY_MAJOR_VERSION >= 3
    const char* const argFormat = "y#:ZstdCompressionDict";
#else
    const char* const argFormat = "s#:ZstdCompressionDict";
#endif
    const char* data;
    Py_ssize_t dataLen;
    void* copy;

    /* Start from an empty state so that failure paths leave no stale data. */
    self->dictData = NULL;
    self->dictSize = 0;

    if (!PyArg_ParseTuple(args, argFormat, &data, &dataLen)) {
        return -1;
    }

    copy = PyMem_Malloc(dataLen);
    if (copy == NULL) {
        PyErr_NoMemory();
        return -1;
    }

    /* Keep a private copy; the argument's buffer is only borrowed. */
    memcpy(copy, data, dataLen);

    self->dictData = copy;
    self->dictSize = dataLen;

    return 0;
}
|
|
|
|
|
|
/**
 * tp_dealloc implementation: release the dictionary buffer and the object.
 *
 * The type is declared with Py_TPFLAGS_BASETYPE, so instances of Python
 * subclasses also pass through this function. Their memory must be released
 * through the tp_free slot of the instance's actual type rather than by
 * calling PyObject_Del() directly, which is only correct for objects of the
 * base type itself.
 */
static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
    /* PyMem_Free(NULL) is a no-op, so no guard is needed. */
    PyMem_Free(self->dictData);
    self->dictData = NULL;

    Py_TYPE(self)->tp_free((PyObject*)self);
}
|
|
|
|
|
|
/**
 * dict_id() method: return the dictionary ID reported by ZDICT_getDictID()
 * for the stored dictionary data, as a Python integer.
 *
 * The ID is an unsigned value, so it is converted with
 * PyLong_FromUnsignedLong(); going through a signed long would turn IDs of
 * 2**31 or more into negative numbers on platforms where long is 32 bits.
 */
static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) {
    unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize);

    return PyLong_FromUnsignedLong(dictID);
}
|
|
|
|
|
|
/**
 * as_bytes() method: return a new bytes object holding a copy of the raw
 * dictionary data (dictSize bytes starting at dictData).
 */
static PyObject* ZstdCompressionDict_as_bytes(ZstdCompressionDict* self) {
    PyObject* raw;

    raw = PyBytes_FromStringAndSize(self->dictData, self->dictSize);

    return raw;
}
|
|
|
|
|
|
static PyMethodDef ZstdCompressionDict_methods[] = {
|
|
|
{ "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS,
|
|
|
PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") },
|
|
|
{ "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS,
|
|
|
PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") },
|
|
|
{ NULL, NULL }
|
|
|
};
|
|
|
|
|
|
static PyMemberDef ZstdCompressionDict_members[] = {
|
|
|
{ "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY,
|
|
|
"segment size" },
|
|
|
{ "d", T_UINT, offsetof(ZstdCompressionDict, d), READONLY,
|
|
|
"dmer size" },
|
|
|
{ NULL }
|
|
|
};
|
|
|
|
|
|
/**
 * sq_length implementation: len(dict) is the size of the dictionary data
 * in bytes.
 */
static Py_ssize_t ZstdCompressionDict_length(ZstdCompressionDict* self) {
    Py_ssize_t length = (Py_ssize_t)self->dictSize;

    return length;
}
|
|
|
|
|
|
static PySequenceMethods ZstdCompressionDict_sq = {
|
|
|
(lenfunc)ZstdCompressionDict_length, /* sq_length */
|
|
|
0, /* sq_concat */
|
|
|
0, /* sq_repeat */
|
|
|
0, /* sq_item */
|
|
|
0, /* sq_ass_item */
|
|
|
0, /* sq_contains */
|
|
|
0, /* sq_inplace_concat */
|
|
|
0 /* sq_inplace_repeat */
|
|
|
};
|
|
|
|
|
|
PyTypeObject ZstdCompressionDictType = {
|
|
|
PyVarObject_HEAD_INIT(NULL, 0)
|
|
|
"zstd.ZstdCompressionDict", /* tp_name */
|
|
|
sizeof(ZstdCompressionDict), /* tp_basicsize */
|
|
|
0, /* tp_itemsize */
|
|
|
(destructor)ZstdCompressionDict_dealloc, /* tp_dealloc */
|
|
|
0, /* tp_print */
|
|
|
0, /* tp_getattr */
|
|
|
0, /* tp_setattr */
|
|
|
0, /* tp_compare */
|
|
|
0, /* tp_repr */
|
|
|
0, /* tp_as_number */
|
|
|
&ZstdCompressionDict_sq, /* tp_as_sequence */
|
|
|
0, /* tp_as_mapping */
|
|
|
0, /* tp_hash */
|
|
|
0, /* tp_call */
|
|
|
0, /* tp_str */
|
|
|
0, /* tp_getattro */
|
|
|
0, /* tp_setattro */
|
|
|
0, /* tp_as_buffer */
|
|
|
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
|
|
|
ZstdCompressionDict__doc__, /* tp_doc */
|
|
|
0, /* tp_traverse */
|
|
|
0, /* tp_clear */
|
|
|
0, /* tp_richcompare */
|
|
|
0, /* tp_weaklistoffset */
|
|
|
0, /* tp_iter */
|
|
|
0, /* tp_iternext */
|
|
|
ZstdCompressionDict_methods, /* tp_methods */
|
|
|
ZstdCompressionDict_members, /* tp_members */
|
|
|
0, /* tp_getset */
|
|
|
0, /* tp_base */
|
|
|
0, /* tp_dict */
|
|
|
0, /* tp_descr_get */
|
|
|
0, /* tp_descr_set */
|
|
|
0, /* tp_dictoffset */
|
|
|
(initproc)ZstdCompressionDict_init, /* tp_init */
|
|
|
0, /* tp_alloc */
|
|
|
PyType_GenericNew, /* tp_new */
|
|
|
};
|
|
|
|
|
|
/**
 * Finalize the ZstdCompressionDict type and publish it on the given module
 * under the name "ZstdCompressionDict".
 *
 * Returns silently if PyType_Ready() fails; the Python error set by that call
 * is left in place for the caller to detect.
 */
void compressiondict_module_init(PyObject* mod) {
    /* The static initializer leaves the type's ob_type as NULL; set it here.
       Py_TYPE() is no longer assignable on newer CPython releases, so use
       Py_SET_TYPE() where it exists (3.9+) and fall back to the historical
       lvalue form on older interpreters. */
#if PY_VERSION_HEX >= 0x03090000
    Py_SET_TYPE((PyObject*)&ZstdCompressionDictType, &PyType_Type);
#else
    Py_TYPE(&ZstdCompressionDictType) = &PyType_Type;
#endif
    if (PyType_Ready(&ZstdCompressionDictType) < 0) {
        return;
    }

    /* PyModule_AddObject() steals a reference only on success, so take one
       up front and give it back if the call fails. */
    Py_INCREF((PyObject*)&ZstdCompressionDictType);
    if (PyModule_AddObject(mod, "ZstdCompressionDict",
        (PyObject*)&ZstdCompressionDictType) < 0) {
        Py_DECREF((PyObject*)&ZstdCompressionDictType);
    }
}
|
|
|
|