|
|
/**
|
|
|
* Copyright (c) 2016-present, Gregory Szorc
|
|
|
* All rights reserved.
|
|
|
*
|
|
|
* This software may be modified and distributed under the terms
|
|
|
* of the BSD license. See the LICENSE file for details.
|
|
|
*/
|
|
|
|
|
|
#include "python-zstandard.h"
|
|
|
|
|
|
extern PyObject* ZstdError;
|
|
|
|
|
|
/**
 * train_dictionary(dict_size, samples, parameters=None)
 *
 * Train a compression dictionary from a list of bytes samples.
 *
 * Arguments (parsed from args/kwargs):
 *   dict_size   Size in bytes of the buffer handed to the trainer. Must not
 *               be negative.
 *   samples     A list whose items must all be bytes objects.
 *   parameters  Optional DictParametersType instance. Items 0..3 are read as
 *               selectivityLevel, compressionLevel, notificationLevel and
 *               dictID respectively.
 *
 * Returns a new ZstdCompressionDict that owns the trained dictionary buffer,
 * or NULL with a Python exception set (ValueError for bad arguments,
 * MemoryError on allocation failure, ZstdError when training fails).
 *
 * All temporary buffers are released on every exit path. The dictionary
 * buffer is released unless its ownership was transferred to the result.
 */
ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
    static char *kwlist[] = { "dict_size", "samples", "parameters", NULL };
    /* The "n" format unit stores a Py_ssize_t, so parse into one. */
    Py_ssize_t capacity;
    PyObject* samples;
    Py_ssize_t samplesLen;
    PyObject* parameters = NULL;
    ZDICT_params_t zparams;
    Py_ssize_t sampleIndex;
    Py_ssize_t sampleSize;
    PyObject* sampleItem;
    size_t zresult;
    void* sampleBuffer = NULL;
    char* sampleOffset;
    size_t samplesSize = 0;
    size_t* sampleSizes = NULL;
    void* dict = NULL;
    ZstdCompressionDict* result = NULL;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|O!", kwlist,
        &capacity,
        &PyList_Type, &samples,
        (PyObject*)&DictParametersType, &parameters)) {
        return NULL;
    }

    if (capacity < 0) {
        PyErr_SetString(PyExc_ValueError, "dict_size must not be negative");
        return NULL;
    }

    /* Validate parameters first since it is easiest. Start from all zeros. */
    memset(&zparams, 0, sizeof(zparams));

    if (parameters) {
        /* TODO validate data ranges */
        zparams.selectivityLevel = PyLong_AsUnsignedLong(PyTuple_GetItem(parameters, 0));
        zparams.compressionLevel = PyLong_AsLong(PyTuple_GetItem(parameters, 1));
        zparams.notificationLevel = PyLong_AsUnsignedLong(PyTuple_GetItem(parameters, 2));
        zparams.dictID = PyLong_AsUnsignedLong(PyTuple_GetItem(parameters, 3));

        /* Any failed lookup or conversion above leaves an exception set. */
        if (PyErr_Occurred()) {
            return NULL;
        }
    }

    /* Figure out the size of the raw samples */
    samplesLen = PyList_Size(samples);
    for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
        /* PyList_GetItem returns a borrowed reference: nothing to DECREF. */
        sampleItem = PyList_GetItem(samples, sampleIndex);
        if (!PyBytes_Check(sampleItem)) {
            PyErr_SetString(PyExc_ValueError, "samples must be bytes");
            return NULL;
        }
        samplesSize += PyBytes_GET_SIZE(sampleItem);
    }

    /* Now that we know the total size of the raw samples, we can allocate
       a buffer for the raw data. Request at least one byte everywhere since
       malloc(0) is allowed to return NULL, which would be misreported as an
       out-of-memory condition. */
    sampleBuffer = malloc(samplesSize ? samplesSize : 1);
    if (!sampleBuffer) {
        PyErr_NoMemory();
        goto finally;
    }

    sampleSizes = malloc(samplesLen ? samplesLen * sizeof(size_t) : 1);
    if (!sampleSizes) {
        PyErr_NoMemory();
        goto finally;
    }

    /* Now iterate again and assemble the samples in the buffer */
    sampleOffset = sampleBuffer;
    for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
        sampleItem = PyList_GetItem(samples, sampleIndex);
        sampleSize = PyBytes_GET_SIZE(sampleItem);
        sampleSizes[sampleIndex] = sampleSize;
        memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize);
        sampleOffset += sampleSize;
    }

    dict = malloc(capacity ? (size_t)capacity : 1);
    if (!dict) {
        PyErr_NoMemory();
        goto finally;
    }

    zresult = ZDICT_trainFromBuffer_advanced(dict, (size_t)capacity,
        sampleBuffer, sampleSizes, (unsigned int)samplesLen,
        zparams);
    if (ZDICT_isError(zresult)) {
        PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult));
        goto finally;
    }

    result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType);
    if (!result) {
        goto finally;
    }

    /* Ownership of the dictionary buffer moves to the new object. */
    result->dictData = dict;
    result->dictSize = zresult;
    dict = NULL;

finally:
    /* free(NULL) is a no-op, so this is safe from every jump site. */
    free(dict);
    free(sampleSizes);
    free(sampleBuffer);
    return result;
}
|
|
|
|
|
|
|
|
|
PyDoc_STRVAR(ZstdCompressionDict__doc__,
|
|
|
"ZstdCompressionDict(data) - Represents a computed compression dictionary\n"
|
|
|
"\n"
|
|
|
"This type holds the results of a computed Zstandard compression dictionary.\n"
|
|
|
"Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n"
|
|
|
"obtained from another source into the constructor.\n"
|
|
|
);
|
|
|
|
|
|
/**
 * tp_init implementation: ZstdCompressionDict(data).
 *
 * Copies the bytes passed as the single positional argument into a buffer
 * owned by the instance and records its length in dictSize.
 *
 * Returns 0 on success, or -1 with a Python exception set when argument
 * parsing fails or memory cannot be allocated.
 */
static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) {
    /* Python 3 takes a bytes-like object ("y#"); Python 2 a str ("s#"). */
#if PY_MAJOR_VERSION >= 3
    const char* format = "y#";
#else
    const char* format = "s#";
#endif
    const char* source;
    Py_ssize_t sourceSize;

    /* NOTE(review): a buffer left by an earlier __init__ call on the same
       object is dropped here without being freed - confirm whether re-init
       needs to be supported before changing this. */
    self->dictData = NULL;
    self->dictSize = 0;

    if (!PyArg_ParseTuple(args, format, &source, &sourceSize)) {
        return -1;
    }

    /* malloc(0) is allowed to return NULL; always request at least one byte
       so that an empty input is not misreported as an allocation failure. */
    self->dictData = malloc(sourceSize > 0 ? (size_t)sourceSize : 1);
    if (!self->dictData) {
        PyErr_NoMemory();
        return -1;
    }

    memcpy(self->dictData, source, sourceSize);
    self->dictSize = sourceSize;

    return 0;
}
|
|
|
|
|
|
/**
 * tp_dealloc implementation: release the dictionary buffer owned by the
 * instance, then release the object's own memory.
 */
static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
    /* free(NULL) is a no-op, so no guard is needed. */
    free(self->dictData);
    self->dictData = NULL;

    /* The type sets Py_TPFLAGS_BASETYPE, so self may be an instance of a
       subclass whose memory came from a different allocator. Dispatch
       through the concrete type's tp_free instead of calling PyObject_Del
       directly. */
    Py_TYPE(self)->tp_free((PyObject*)self);
}
|
|
|
|
|
|
/**
 * dict_id() -- return the dictionary ID reported by ZDICT_getDictID() for
 * the stored dictionary data as a Python integer.
 */
static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) {
    unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize);

    /* The ID is unsigned. Converting through a signed long would yield a
       negative number for large IDs on platforms where long is 32 bits. */
    return PyLong_FromUnsignedLong(dictID);
}
|
|
|
|
|
|
/**
 * as_bytes() -- build a bytes object from the instance's dictData buffer
 * and dictSize length. Returns whatever PyBytes_FromStringAndSize returns.
 */
static PyObject* ZstdCompressionDict_as_bytes(ZstdCompressionDict* self) {
    PyObject* payload;

    payload = PyBytes_FromStringAndSize(self->dictData, self->dictSize);
    return payload;
}
|
|
|
|
|
|
static PyMethodDef ZstdCompressionDict_methods[] = {
|
|
|
{ "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS,
|
|
|
PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") },
|
|
|
{ "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS,
|
|
|
PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") },
|
|
|
{ NULL, NULL }
|
|
|
};
|
|
|
|
|
|
/**
 * sq_length implementation: len(dict) is the stored dictSize value.
 */
static Py_ssize_t ZstdCompressionDict_length(ZstdCompressionDict* self) {
    Py_ssize_t byteCount = self->dictSize;

    return byteCount;
}
|
|
|
|
|
|
static PySequenceMethods ZstdCompressionDict_sq = {
|
|
|
(lenfunc)ZstdCompressionDict_length, /* sq_length */
|
|
|
0, /* sq_concat */
|
|
|
0, /* sq_repeat */
|
|
|
0, /* sq_item */
|
|
|
0, /* sq_ass_item */
|
|
|
0, /* sq_contains */
|
|
|
0, /* sq_inplace_concat */
|
|
|
0 /* sq_inplace_repeat */
|
|
|
};
|
|
|
|
|
|
PyTypeObject ZstdCompressionDictType = {
|
|
|
PyVarObject_HEAD_INIT(NULL, 0)
|
|
|
"zstd.ZstdCompressionDict", /* tp_name */
|
|
|
sizeof(ZstdCompressionDict), /* tp_basicsize */
|
|
|
0, /* tp_itemsize */
|
|
|
(destructor)ZstdCompressionDict_dealloc, /* tp_dealloc */
|
|
|
0, /* tp_print */
|
|
|
0, /* tp_getattr */
|
|
|
0, /* tp_setattr */
|
|
|
0, /* tp_compare */
|
|
|
0, /* tp_repr */
|
|
|
0, /* tp_as_number */
|
|
|
&ZstdCompressionDict_sq, /* tp_as_sequence */
|
|
|
0, /* tp_as_mapping */
|
|
|
0, /* tp_hash */
|
|
|
0, /* tp_call */
|
|
|
0, /* tp_str */
|
|
|
0, /* tp_getattro */
|
|
|
0, /* tp_setattro */
|
|
|
0, /* tp_as_buffer */
|
|
|
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
|
|
|
ZstdCompressionDict__doc__, /* tp_doc */
|
|
|
0, /* tp_traverse */
|
|
|
0, /* tp_clear */
|
|
|
0, /* tp_richcompare */
|
|
|
0, /* tp_weaklistoffset */
|
|
|
0, /* tp_iter */
|
|
|
0, /* tp_iternext */
|
|
|
ZstdCompressionDict_methods, /* tp_methods */
|
|
|
0, /* tp_members */
|
|
|
0, /* tp_getset */
|
|
|
0, /* tp_base */
|
|
|
0, /* tp_dict */
|
|
|
0, /* tp_descr_get */
|
|
|
0, /* tp_descr_set */
|
|
|
0, /* tp_dictoffset */
|
|
|
(initproc)ZstdCompressionDict_init, /* tp_init */
|
|
|
0, /* tp_alloc */
|
|
|
PyType_GenericNew, /* tp_new */
|
|
|
};
|
|
|
|
|
|
/**
 * Finalize ZstdCompressionDictType and expose it on the given module as
 * the attribute "ZstdCompressionDict".
 *
 * Returns nothing. If PyType_Ready or PyModule_AddObject fails, the function
 * returns early and the type is not added to the module.
 */
void compressiondict_module_init(PyObject* mod) {
    /* Py_TYPE() is not assignable on newer CPython releases; use the
       dedicated setter where it exists. */
#if PY_VERSION_HEX >= 0x03090000
    Py_SET_TYPE(&ZstdCompressionDictType, &PyType_Type);
#else
    Py_TYPE(&ZstdCompressionDictType) = &PyType_Type;
#endif

    if (PyType_Ready(&ZstdCompressionDictType) < 0) {
        return;
    }

    /* PyModule_AddObject steals the reference only on success, so the
       reference taken here must be dropped again if it fails. */
    Py_INCREF((PyObject*)&ZstdCompressionDictType);
    if (PyModule_AddObject(mod, "ZstdCompressionDict",
                           (PyObject*)&ZstdCompressionDictType) < 0) {
        Py_DECREF((PyObject*)&ZstdCompressionDictType);
    }
}
|
|
|
|