compressiondict.c
411 lines
| 11.4 KiB
| text/x-c
|
CLexer
Gregory Szorc
|
/**
 * Copyright (c) 2016-present, Gregory Szorc
 * All rights reserved.
 *
 * This software may be modified and distributed under the terms
 * of the BSD license. See the LICENSE file for details.
 */
#include "python-zstandard.h" | ||||
extern PyObject* ZstdError; | ||||
/**
 * Train a compression dictionary from a list of bytes samples.
 *
 * Python arguments (see kwlist): dict_size and samples are required; k, d,
 * notifications, dict_id, level, steps and threads are optional and default
 * to 0. A negative threads value is replaced by the result of cpu_count().
 *
 * All samples are copied into one contiguous buffer, with their individual
 * lengths recorded in a parallel array, because that is the layout the
 * ZDICT training functions are called with below.
 *
 * Returns a new ZstdCompressionDict that owns the trained dictionary bytes,
 * or NULL with a Python exception set.
 */
ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
    static char* kwlist[] = {
        "dict_size",
        "samples",
        "k",
        "d",
        "notifications",
        "dict_id",
        "level",
        "steps",
        "threads",
        NULL
    };

    size_t capacity;
    PyObject* samples;
    unsigned k = 0;
    unsigned d = 0;
    unsigned notifications = 0;
    unsigned dictID = 0;
    int level = 0;
    unsigned steps = 0;
    int threads = 0;
    ZDICT_cover_params_t params;
    Py_ssize_t samplesLen;
    Py_ssize_t i;
    size_t samplesSize = 0;
    void* sampleBuffer = NULL;
    size_t* sampleSizes = NULL;
    char* sampleOffset;
    Py_ssize_t sampleSize;
    void* dict = NULL;
    size_t zresult;
    ZstdCompressionDict* result = NULL;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiIi:train_dictionary",
        kwlist, &capacity, &PyList_Type, &samples,
        &k, &d, &notifications, &dictID, &level, &steps, &threads)) {
        return NULL;
    }

    if (threads < 0) {
        threads = cpu_count();
    }

    memset(&params, 0, sizeof(params));
    params.k = k;
    params.d = d;
    params.steps = steps;
    params.nbThreads = threads;
    params.zParams.notificationLevel = notifications;
    params.zParams.dictID = dictID;
    params.zParams.compressionLevel = level;

    /* Figure out total size of input samples. */
    samplesLen = PyList_Size(samples);
    for (i = 0; i < samplesLen; i++) {
        PyObject* sampleItem = PyList_GET_ITEM(samples, i);

        if (!PyBytes_Check(sampleItem)) {
            PyErr_SetString(PyExc_ValueError, "samples must be bytes");
            /* Nothing has been allocated yet; the shared exit frees NULLs. */
            goto finally;
        }
        samplesSize += PyBytes_GET_SIZE(sampleItem);
    }

    sampleBuffer = PyMem_Malloc(samplesSize);
    if (!sampleBuffer) {
        PyErr_NoMemory();
        goto finally;
    }

    sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t));
    if (!sampleSizes) {
        PyErr_NoMemory();
        goto finally;
    }

    /* Pack the samples back to back and remember each one's length. */
    sampleOffset = sampleBuffer;
    for (i = 0; i < samplesLen; i++) {
        PyObject* sampleItem = PyList_GET_ITEM(samples, i);

        sampleSize = PyBytes_GET_SIZE(sampleItem);
        sampleSizes[i] = sampleSize;
        memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize);
        sampleOffset += sampleSize;
    }

    dict = PyMem_Malloc(capacity);
    if (!dict) {
        PyErr_NoMemory();
        goto finally;
    }

    /* Training only touches the private copies made above, so the GIL is
       released for its duration. */
    Py_BEGIN_ALLOW_THREADS
    /* No parameters uses the default function, which will use default params
       and call ZDICT_optimizeTrainFromBuffer_cover under the hood.
       Note that steps and threads are not consulted by this check. */
    if (!params.k && !params.d && !params.zParams.compressionLevel
        && !params.zParams.notificationLevel && !params.zParams.dictID) {
        zresult = ZDICT_trainFromBuffer(dict, capacity, sampleBuffer,
            sampleSizes, (unsigned)samplesLen);
    }
    /* Use optimize mode if user controlled steps or threads explicitly. */
    else if (params.steps || params.nbThreads) {
        zresult = ZDICT_optimizeTrainFromBuffer_cover(dict, capacity,
            sampleBuffer, sampleSizes, (unsigned)samplesLen, &params);
    }
    /* Non-optimize mode with explicit control. */
    else {
        zresult = ZDICT_trainFromBuffer_cover(dict, capacity,
            sampleBuffer, sampleSizes, (unsigned)samplesLen, params);
    }
    Py_END_ALLOW_THREADS

    if (ZDICT_isError(zresult)) {
        PyErr_Format(ZstdError, "cannot train dict: %s", ZDICT_getErrorName(zresult));
        goto finally;
    }

    result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType);
    if (!result) {
        goto finally;
    }

    /* PyObject_New does not run tp_init, so every field is set here. */
    result->dictData = dict;
    result->dictSize = zresult;
    result->dictType = ZSTD_dct_fullDict;
    result->d = params.d;
    result->k = params.k;
    result->cdict = NULL;
    result->ddict = NULL;

    /* Ownership of the buffer moved to result; keep the cleanup below from
       freeing it. */
    dict = NULL;

finally:
    /* dict is still non-NULL here only on a failure after its allocation. */
    PyMem_Free(dict);
    PyMem_Free(sampleBuffer);
    PyMem_Free(sampleSizes);
    return result;
}
Gregory Szorc
|
r30435 | |||
Gregory Szorc
|
r37513 | int ensure_ddict(ZstdCompressionDict* dict) { | ||
if (dict->ddict) { | ||||
return 0; | ||||
} | ||||
Py_BEGIN_ALLOW_THREADS | ||||
dict->ddict = ZSTD_createDDict_advanced(dict->dictData, dict->dictSize, | ||||
ZSTD_dlm_byRef, dict->dictType, ZSTD_defaultCMem); | ||||
Py_END_ALLOW_THREADS | ||||
if (!dict->ddict) { | ||||
PyErr_SetString(ZstdError, "could not create decompression dict"); | ||||
return 1; | ||||
} | ||||
return 0; | ||||
} | ||||
Gregory Szorc
|
r30435 | PyDoc_STRVAR(ZstdCompressionDict__doc__, | ||
"ZstdCompressionDict(data) - Represents a computed compression dictionary\n" | ||||
"\n" | ||||
"This type holds the results of a computed Zstandard compression dictionary.\n" | ||||
Gregory Szorc
|
r37513 | "Instances are obtained by calling ``train_dictionary()`` or by passing\n" | ||
"bytes obtained from another source into the constructor.\n" | ||||
Gregory Szorc
|
r30435 | ); | ||
Gregory Szorc
|
r37513 | static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { | ||
static char* kwlist[] = { | ||||
"data", | ||||
"dict_type", | ||||
NULL | ||||
}; | ||||
int result = -1; | ||||
Py_buffer source; | ||||
unsigned dictType = ZSTD_dct_auto; | ||||
Gregory Szorc
|
r30435 | |||
self->dictData = NULL; | ||||
self->dictSize = 0; | ||||
Gregory Szorc
|
r37513 | self->cdict = NULL; | ||
self->ddict = NULL; | ||||
Gregory Szorc
|
r30435 | |||
#if PY_MAJOR_VERSION >= 3 | ||||
Gregory Szorc
|
r37513 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|I:ZstdCompressionDict", | ||
Gregory Szorc
|
r30435 | #else | ||
Gregory Szorc
|
r37513 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|I:ZstdCompressionDict", | ||
Gregory Szorc
|
r30435 | #endif | ||
Gregory Szorc
|
r37513 | kwlist, &source, &dictType)) { | ||
Gregory Szorc
|
r30435 | return -1; | ||
} | ||||
Gregory Szorc
|
r37513 | if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { | ||
PyErr_SetString(PyExc_ValueError, | ||||
"data buffer should be contiguous and have at most one dimension"); | ||||
goto finally; | ||||
} | ||||
if (dictType != ZSTD_dct_auto && dictType != ZSTD_dct_rawContent | ||||
&& dictType != ZSTD_dct_fullDict) { | ||||
PyErr_Format(PyExc_ValueError, | ||||
"invalid dictionary load mode: %d; must use DICT_TYPE_* constants", | ||||
dictType); | ||||
goto finally; | ||||
} | ||||
self->dictType = dictType; | ||||
self->dictData = PyMem_Malloc(source.len); | ||||
Gregory Szorc
|
r30435 | if (!self->dictData) { | ||
PyErr_NoMemory(); | ||||
Gregory Szorc
|
r37513 | goto finally; | ||
Gregory Szorc
|
r30435 | } | ||
Gregory Szorc
|
r37513 | memcpy(self->dictData, source.buf, source.len); | ||
self->dictSize = source.len; | ||||
result = 0; | ||||
Gregory Szorc
|
r30435 | |||
Gregory Szorc
|
r37513 | finally: | ||
PyBuffer_Release(&source); | ||||
return result; | ||||
} | ||||
/**
 * tp_dealloc for ZstdCompressionDict.
 *
 * Frees the precomputed compression and decompression dictionaries, if any,
 * then the raw dictionary bytes, and finally the object itself. In this file
 * the digested dictionaries are created by reference to dictData, so they are
 * released before the bytes they point at.
 */
static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
    if (self->cdict != NULL) {
        ZSTD_freeCDict(self->cdict);
        self->cdict = NULL;
    }

    if (self->ddict != NULL) {
        ZSTD_freeDDict(self->ddict);
        self->ddict = NULL;
    }

    if (self->dictData != NULL) {
        PyMem_Free(self->dictData);
        self->dictData = NULL;
    }

    PyObject_Del(self);
}
Gregory Szorc
|
r37513 | PyDoc_STRVAR(ZstdCompressionDict_precompute_compress__doc__, | ||
"Precompute a dictionary so it can be used by multiple compressors.\n" | ||||
); | ||||
static PyObject* ZstdCompressionDict_precompute_compress(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { | ||||
static char* kwlist[] = { | ||||
"level", | ||||
"compression_params", | ||||
NULL | ||||
}; | ||||
int level = 0; | ||||
ZstdCompressionParametersObject* compressionParams = NULL; | ||||
ZSTD_compressionParameters cParams; | ||||
size_t zresult; | ||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!:precompute_compress", kwlist, | ||||
&level, &ZstdCompressionParametersType, &compressionParams)) { | ||||
return NULL; | ||||
} | ||||
if (level && compressionParams) { | ||||
PyErr_SetString(PyExc_ValueError, | ||||
"must only specify one of level or compression_params"); | ||||
return NULL; | ||||
} | ||||
if (!level && !compressionParams) { | ||||
PyErr_SetString(PyExc_ValueError, | ||||
"must specify one of level or compression_params"); | ||||
return NULL; | ||||
} | ||||
if (self->cdict) { | ||||
zresult = ZSTD_freeCDict(self->cdict); | ||||
self->cdict = NULL; | ||||
if (ZSTD_isError(zresult)) { | ||||
PyErr_Format(ZstdError, "unable to free CDict: %s", | ||||
ZSTD_getErrorName(zresult)); | ||||
return NULL; | ||||
} | ||||
} | ||||
if (level) { | ||||
cParams = ZSTD_getCParams(level, 0, self->dictSize); | ||||
} | ||||
else { | ||||
Gregory Szorc
|
r42237 | if (to_cparams(compressionParams, &cParams)) { | ||
return NULL; | ||||
} | ||||
Gregory Szorc
|
r37513 | } | ||
assert(!self->cdict); | ||||
self->cdict = ZSTD_createCDict_advanced(self->dictData, self->dictSize, | ||||
ZSTD_dlm_byRef, self->dictType, cParams, ZSTD_defaultCMem); | ||||
if (!self->cdict) { | ||||
PyErr_SetString(ZstdError, "unable to precompute dictionary"); | ||||
return NULL; | ||||
} | ||||
Py_RETURN_NONE; | ||||
} | ||||
Gregory Szorc
|
r30435 | static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) { | ||
unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize); | ||||
return PyLong_FromLong(dictID); | ||||
} | ||||
/**
 * as_bytes() -- return a new bytes object holding a copy of the raw
 * dictionary data (dictSize bytes starting at dictData).
 */
static PyObject* ZstdCompressionDict_as_bytes(ZstdCompressionDict* self) {
    PyObject* raw = PyBytes_FromStringAndSize(self->dictData, self->dictSize);

    return raw;
}
static PyMethodDef ZstdCompressionDict_methods[] = { | ||||
{ "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS, | ||||
PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") }, | ||||
{ "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS, | ||||
PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") }, | ||||
Gregory Szorc
|
r37513 | { "precompute_compress", (PyCFunction)ZstdCompressionDict_precompute_compress, | ||
METH_VARARGS | METH_KEYWORDS, ZstdCompressionDict_precompute_compress__doc__ }, | ||||
Gregory Szorc
|
r30435 | { NULL, NULL } | ||
}; | ||||
Gregory Szorc
|
r31796 | static PyMemberDef ZstdCompressionDict_members[] = { | ||
{ "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY, | ||||
"segment size" }, | ||||
{ "d", T_UINT, offsetof(ZstdCompressionDict, d), READONLY, | ||||
"dmer size" }, | ||||
{ NULL } | ||||
}; | ||||
Gregory Szorc
|
r30435 | static Py_ssize_t ZstdCompressionDict_length(ZstdCompressionDict* self) { | ||
return self->dictSize; | ||||
} | ||||
static PySequenceMethods ZstdCompressionDict_sq = { | ||||
(lenfunc)ZstdCompressionDict_length, /* sq_length */ | ||||
0, /* sq_concat */ | ||||
0, /* sq_repeat */ | ||||
0, /* sq_item */ | ||||
0, /* sq_ass_item */ | ||||
0, /* sq_contains */ | ||||
0, /* sq_inplace_concat */ | ||||
0 /* sq_inplace_repeat */ | ||||
}; | ||||
PyTypeObject ZstdCompressionDictType = { | ||||
PyVarObject_HEAD_INIT(NULL, 0) | ||||
"zstd.ZstdCompressionDict", /* tp_name */ | ||||
sizeof(ZstdCompressionDict), /* tp_basicsize */ | ||||
0, /* tp_itemsize */ | ||||
(destructor)ZstdCompressionDict_dealloc, /* tp_dealloc */ | ||||
0, /* tp_print */ | ||||
0, /* tp_getattr */ | ||||
0, /* tp_setattr */ | ||||
0, /* tp_compare */ | ||||
0, /* tp_repr */ | ||||
0, /* tp_as_number */ | ||||
&ZstdCompressionDict_sq, /* tp_as_sequence */ | ||||
0, /* tp_as_mapping */ | ||||
0, /* tp_hash */ | ||||
0, /* tp_call */ | ||||
0, /* tp_str */ | ||||
0, /* tp_getattro */ | ||||
0, /* tp_setattro */ | ||||
0, /* tp_as_buffer */ | ||||
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ | ||||
ZstdCompressionDict__doc__, /* tp_doc */ | ||||
0, /* tp_traverse */ | ||||
0, /* tp_clear */ | ||||
0, /* tp_richcompare */ | ||||
0, /* tp_weaklistoffset */ | ||||
0, /* tp_iter */ | ||||
0, /* tp_iternext */ | ||||
ZstdCompressionDict_methods, /* tp_methods */ | ||||
Gregory Szorc
|
r31796 | ZstdCompressionDict_members, /* tp_members */ | ||
Gregory Szorc
|
r30435 | 0, /* tp_getset */ | ||
0, /* tp_base */ | ||||
0, /* tp_dict */ | ||||
0, /* tp_descr_get */ | ||||
0, /* tp_descr_set */ | ||||
0, /* tp_dictoffset */ | ||||
(initproc)ZstdCompressionDict_init, /* tp_init */ | ||||
0, /* tp_alloc */ | ||||
PyType_GenericNew, /* tp_new */ | ||||
}; | ||||
/**
 * Finalize the ZstdCompressionDict type and expose it on `mod` under the
 * name "ZstdCompressionDict".
 *
 * The function returns void, so failures are not reported to the caller
 * directly; it simply returns early and leaves any Python exception set.
 */
void compressiondict_module_init(PyObject* mod) {
    Py_SET_TYPE(&ZstdCompressionDictType, &PyType_Type);
    if (PyType_Ready(&ZstdCompressionDictType) < 0) {
        return;
    }

    Py_INCREF((PyObject*)&ZstdCompressionDictType);
    if (PyModule_AddObject(mod, "ZstdCompressionDict",
        (PyObject*)&ZstdCompressionDictType) < 0) {
        /* PyModule_AddObject steals the reference only on success, so give
           back the one taken above when it fails. */
        Py_DECREF((PyObject*)&ZstdCompressionDictType);
    }
}