zstandard: vendor python-zstandard 0.10.1...
Gregory Szorc
r40157:73fef626 default

The requested changes are too big and content was truncated.

@@ -0,0 +1,360 b''
1 /**
2 * Copyright (c) 2018-present, Gregory Szorc
3 * All rights reserved.
4 *
5 * This software may be modified and distributed under the terms
6 * of the BSD license. See the LICENSE file for details.
7 */
8
9 #include "python-zstandard.h"
10
11 extern PyObject* ZstdError;
12
13 PyDoc_STRVAR(ZstdCompressionChunkerIterator__doc__,
14 "Iterator of output chunks from ZstdCompressionChunker.\n"
15 );
16
17 static void ZstdCompressionChunkerIterator_dealloc(ZstdCompressionChunkerIterator* self) {
18 Py_XDECREF(self->chunker);
19
20 PyObject_Del(self);
21 }
22
23 static PyObject* ZstdCompressionChunkerIterator_iter(PyObject* self) {
24 Py_INCREF(self);
25 return self;
26 }
27
28 static PyObject* ZstdCompressionChunkerIterator_iternext(ZstdCompressionChunkerIterator* self) {
29 size_t zresult;
30 PyObject* chunk;
31 ZstdCompressionChunker* chunker = self->chunker;
32 ZSTD_EndDirective zFlushMode;
33
34 if (self->mode != compressionchunker_mode_normal && chunker->input.pos != chunker->input.size) {
35 PyErr_SetString(ZstdError, "input should have been fully consumed before calling flush() or finish()");
36 return NULL;
37 }
38
39 if (chunker->finished) {
40 return NULL;
41 }
42
43 /* If we have data left in the input, consume it. */
44 while (chunker->input.pos < chunker->input.size) {
45 Py_BEGIN_ALLOW_THREADS
46 zresult = ZSTD_compress_generic(chunker->compressor->cctx, &chunker->output,
47 &chunker->input, ZSTD_e_continue);
48 Py_END_ALLOW_THREADS
49
50 /* Input is fully consumed. */
51 if (chunker->input.pos == chunker->input.size) {
52 chunker->input.src = NULL;
53 chunker->input.pos = 0;
54 chunker->input.size = 0;
55 PyBuffer_Release(&chunker->inBuffer);
56 }
57
58 if (ZSTD_isError(zresult)) {
59 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
60 return NULL;
61 }
62
63 /* If it produced a full output chunk, emit it. */
64 if (chunker->output.pos == chunker->output.size) {
65 chunk = PyBytes_FromStringAndSize(chunker->output.dst, chunker->output.pos);
66 if (!chunk) {
67 return NULL;
68 }
69
70 chunker->output.pos = 0;
71
72 return chunk;
73 }
74
75 /* Else continue to compress available input data. */
76 }
77
78 /* We also need this here for the special case of an empty input buffer. */
79 if (chunker->input.pos == chunker->input.size) {
80 chunker->input.src = NULL;
81 chunker->input.pos = 0;
82 chunker->input.size = 0;
83 PyBuffer_Release(&chunker->inBuffer);
84 }
85
86 /* No more input data. A partial chunk may be in chunker->output.
87 * If we're in normal compression mode, we're done. Otherwise if we're in
88 * flush or finish mode, we need to emit what data remains.
89 */
90 if (self->mode == compressionchunker_mode_normal) {
91 /* We don't need to set StopIteration. */
92 return NULL;
93 }
94
95 if (self->mode == compressionchunker_mode_flush) {
96 zFlushMode = ZSTD_e_flush;
97 }
98 else if (self->mode == compressionchunker_mode_finish) {
99 zFlushMode = ZSTD_e_end;
100 }
101 else {
102 PyErr_SetString(ZstdError, "unhandled compression mode; this should never happen");
103 return NULL;
104 }
105
106 Py_BEGIN_ALLOW_THREADS
107 zresult = ZSTD_compress_generic(chunker->compressor->cctx, &chunker->output,
108 &chunker->input, zFlushMode);
109 Py_END_ALLOW_THREADS
110
111 if (ZSTD_isError(zresult)) {
112 PyErr_Format(ZstdError, "zstd compress error: %s",
113 ZSTD_getErrorName(zresult));
114 return NULL;
115 }
116
117 if (!zresult && chunker->output.pos == 0) {
118 return NULL;
119 }
120
121 chunk = PyBytes_FromStringAndSize(chunker->output.dst, chunker->output.pos);
122 if (!chunk) {
123 return NULL;
124 }
125
126 chunker->output.pos = 0;
127
128 if (!zresult && self->mode == compressionchunker_mode_finish) {
129 chunker->finished = 1;
130 }
131
132 return chunk;
133 }
134
135 PyTypeObject ZstdCompressionChunkerIteratorType = {
136 PyVarObject_HEAD_INIT(NULL, 0)
137 "zstd.ZstdCompressionChunkerIterator", /* tp_name */
138 sizeof(ZstdCompressionChunkerIterator), /* tp_basicsize */
139 0, /* tp_itemsize */
140 (destructor)ZstdCompressionChunkerIterator_dealloc, /* tp_dealloc */
141 0, /* tp_print */
142 0, /* tp_getattr */
143 0, /* tp_setattr */
144 0, /* tp_compare */
145 0, /* tp_repr */
146 0, /* tp_as_number */
147 0, /* tp_as_sequence */
148 0, /* tp_as_mapping */
149 0, /* tp_hash */
150 0, /* tp_call */
151 0, /* tp_str */
152 0, /* tp_getattro */
153 0, /* tp_setattro */
154 0, /* tp_as_buffer */
155 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
156 ZstdCompressionChunkerIterator__doc__, /* tp_doc */
157 0, /* tp_traverse */
158 0, /* tp_clear */
159 0, /* tp_richcompare */
160 0, /* tp_weaklistoffset */
161 ZstdCompressionChunkerIterator_iter, /* tp_iter */
162 (iternextfunc)ZstdCompressionChunkerIterator_iternext, /* tp_iternext */
163 0, /* tp_methods */
164 0, /* tp_members */
165 0, /* tp_getset */
166 0, /* tp_base */
167 0, /* tp_dict */
168 0, /* tp_descr_get */
169 0, /* tp_descr_set */
170 0, /* tp_dictoffset */
171 0, /* tp_init */
172 0, /* tp_alloc */
173 PyType_GenericNew, /* tp_new */
174 };
175
176 PyDoc_STRVAR(ZstdCompressionChunker__doc__,
177 "Compress chunks iteratively into exact chunk sizes.\n"
178 );
179
180 static void ZstdCompressionChunker_dealloc(ZstdCompressionChunker* self) {
181 PyBuffer_Release(&self->inBuffer);
182 self->input.src = NULL;
183
184 PyMem_Free(self->output.dst);
185 self->output.dst = NULL;
186
187 Py_XDECREF(self->compressor);
188
189 PyObject_Del(self);
190 }
191
192 static ZstdCompressionChunkerIterator* ZstdCompressionChunker_compress(ZstdCompressionChunker* self, PyObject* args, PyObject* kwargs) {
193 static char* kwlist[] = {
194 "data",
195 NULL
196 };
197
198 ZstdCompressionChunkerIterator* result;
199
200 if (self->finished) {
201 PyErr_SetString(ZstdError, "cannot call compress() after compression finished");
202 return NULL;
203 }
204
205 if (self->inBuffer.obj) {
206 PyErr_SetString(ZstdError,
207 "cannot perform operation before consuming output from previous operation");
208 return NULL;
209 }
210
211 #if PY_MAJOR_VERSION >= 3
212 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:compress",
213 #else
214 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:compress",
215 #endif
216 kwlist, &self->inBuffer)) {
217 return NULL;
218 }
219
220 if (!PyBuffer_IsContiguous(&self->inBuffer, 'C') || self->inBuffer.ndim > 1) {
221 PyErr_SetString(PyExc_ValueError,
222 "data buffer should be contiguous and have at most one dimension");
223 PyBuffer_Release(&self->inBuffer);
224 return NULL;
225 }
226
227 result = (ZstdCompressionChunkerIterator*)PyObject_CallObject((PyObject*)&ZstdCompressionChunkerIteratorType, NULL);
228 if (!result) {
229 PyBuffer_Release(&self->inBuffer);
230 return NULL;
231 }
232
233 self->input.src = self->inBuffer.buf;
234 self->input.size = self->inBuffer.len;
235 self->input.pos = 0;
236
237 result->chunker = self;
238 Py_INCREF(result->chunker);
239
240 result->mode = compressionchunker_mode_normal;
241
242 return result;
243 }
244
245 static ZstdCompressionChunkerIterator* ZstdCompressionChunker_finish(ZstdCompressionChunker* self) {
246 ZstdCompressionChunkerIterator* result;
247
248 if (self->finished) {
249 PyErr_SetString(ZstdError, "cannot call finish() after compression finished");
250 return NULL;
251 }
252
253 if (self->inBuffer.obj) {
254 PyErr_SetString(ZstdError,
255 "cannot call finish() before consuming output from previous operation");
256 return NULL;
257 }
258
259 result = (ZstdCompressionChunkerIterator*)PyObject_CallObject((PyObject*)&ZstdCompressionChunkerIteratorType, NULL);
260 if (!result) {
261 return NULL;
262 }
263
264 result->chunker = self;
265 Py_INCREF(result->chunker);
266
267 result->mode = compressionchunker_mode_finish;
268
269 return result;
270 }
271
272 static ZstdCompressionChunkerIterator* ZstdCompressionChunker_flush(ZstdCompressionChunker* self, PyObject* args, PyObject* kwargs) {
273 ZstdCompressionChunkerIterator* result;
274
275 if (self->finished) {
276 PyErr_SetString(ZstdError, "cannot call flush() after compression finished");
277 return NULL;
278 }
279
280 if (self->inBuffer.obj) {
281 PyErr_SetString(ZstdError,
282 "cannot call flush() before consuming output from previous operation");
283 return NULL;
284 }
285
286 result = (ZstdCompressionChunkerIterator*)PyObject_CallObject((PyObject*)&ZstdCompressionChunkerIteratorType, NULL);
287 if (!result) {
288 return NULL;
289 }
290
291 result->chunker = self;
292 Py_INCREF(result->chunker);
293
294 result->mode = compressionchunker_mode_flush;
295
296 return result;
297 }
298
299 static PyMethodDef ZstdCompressionChunker_methods[] = {
300 { "compress", (PyCFunction)ZstdCompressionChunker_compress, METH_VARARGS | METH_KEYWORDS,
301 PyDoc_STR("compress data") },
302 { "finish", (PyCFunction)ZstdCompressionChunker_finish, METH_NOARGS,
303 PyDoc_STR("finish compression operation") },
304 { "flush", (PyCFunction)ZstdCompressionChunker_flush, METH_VARARGS | METH_KEYWORDS,
305 PyDoc_STR("finish compression operation") },
306 { NULL, NULL }
307 };
308
309 PyTypeObject ZstdCompressionChunkerType = {
310 PyVarObject_HEAD_INIT(NULL, 0)
311 "zstd.ZstdCompressionChunkerType", /* tp_name */
312 sizeof(ZstdCompressionChunker), /* tp_basicsize */
313 0, /* tp_itemsize */
314 (destructor)ZstdCompressionChunker_dealloc, /* tp_dealloc */
315 0, /* tp_print */
316 0, /* tp_getattr */
317 0, /* tp_setattr */
318 0, /* tp_compare */
319 0, /* tp_repr */
320 0, /* tp_as_number */
321 0, /* tp_as_sequence */
322 0, /* tp_as_mapping */
323 0, /* tp_hash */
324 0, /* tp_call */
325 0, /* tp_str */
326 0, /* tp_getattro */
327 0, /* tp_setattro */
328 0, /* tp_as_buffer */
329 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
330 ZstdCompressionChunker__doc__, /* tp_doc */
331 0, /* tp_traverse */
332 0, /* tp_clear */
333 0, /* tp_richcompare */
334 0, /* tp_weaklistoffset */
335 0, /* tp_iter */
336 0, /* tp_iternext */
337 ZstdCompressionChunker_methods, /* tp_methods */
338 0, /* tp_members */
339 0, /* tp_getset */
340 0, /* tp_base */
341 0, /* tp_dict */
342 0, /* tp_descr_get */
343 0, /* tp_descr_set */
344 0, /* tp_dictoffset */
345 0, /* tp_init */
346 0, /* tp_alloc */
347 PyType_GenericNew, /* tp_new */
348 };
349
350 void compressionchunker_module_init(PyObject* module) {
351 Py_TYPE(&ZstdCompressionChunkerIteratorType) = &PyType_Type;
352 if (PyType_Ready(&ZstdCompressionChunkerIteratorType) < 0) {
353 return;
354 }
355
356 Py_TYPE(&ZstdCompressionChunkerType) = &PyType_Type;
357 if (PyType_Ready(&ZstdCompressionChunkerType) < 0) {
358 return;
359 }
360 }
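
For orientation, the chunker implemented by this file is exposed in Python as
``ZstdCompressor.chunker()``: ``compress()``, ``flush()``, and ``finish()`` each
return an iterator of output chunks, and the iterator from a previous call must
be fully consumed before the next call (see the "consuming output from previous
operation" errors above). A minimal usage sketch follows; the ``chunk_size``
keyword, the buffer sizes, and the ``write_output()`` helper are illustrative
assumptions, not taken from this diff::

    import zstandard as zstd

    cctx = zstd.ZstdCompressor()
    # chunk_size is an assumed keyword controlling the fixed output chunk size.
    chunker = cctx.chunker(chunk_size=32768)

    with open('input.bin', 'rb') as fh:
        while True:
            data = fh.read(65536)
            if not data:
                break
            # compress() returns an iterator; each yielded chunk is exactly
            # chunk_size bytes.
            for out_chunk in chunker.compress(data):
                # write_output() is a placeholder for whatever consumes chunks.
                write_output(out_chunk)

    # finish() ends the frame; the final chunk(s) may be smaller than chunk_size.
    for out_chunk in chunker.finish():
        write_output(out_chunk)
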
@@ -0,0 +1,44 b''
1 /* ******************************************************************
2 debug
3 Part of FSE library
4 Copyright (C) 2013-present, Yann Collet.
5
6 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
7
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions are
10 met:
11
12 * Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following disclaimer
16 in the documentation and/or other materials provided with the
17 distribution.
18
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 You can contact the author at :
32 - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
33 ****************************************************************** */
34
35
36 /*
37 * This module only hosts one global variable
38 * which can be used to dynamically influence the verbosity of traces,
39 * such as DEBUGLOG and RAWLOG
40 */
41
42 #include "debug.h"
43
44 int g_debuglevel = DEBUGLEVEL;
@@ -0,0 +1,123 b''
1 /* ******************************************************************
2 debug
3 Part of FSE library
4 Copyright (C) 2013-present, Yann Collet.
5
6 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
7
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions are
10 met:
11
12 * Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following disclaimer
16 in the documentation and/or other materials provided with the
17 distribution.
18
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 You can contact the author at :
32 - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
33 ****************************************************************** */
34
35
36 /*
37 * The purpose of this header is to enable debug functions.
38 * They regroup assert(), DEBUGLOG() and RAWLOG() for run-time,
39 * and DEBUG_STATIC_ASSERT() for compile-time.
40 *
41 * By default, DEBUGLEVEL==0, which means run-time debug is disabled.
42 *
43 * Level 1 enables assert() only.
44 * Starting level 2, traces can be generated and pushed to stderr.
45 * The higher the level, the more verbose the traces.
46 *
47 * It's possible to dynamically adjust level using variable g_debug_level,
48 * which is only declared if DEBUGLEVEL>=2,
49 * and is a global variable, not multi-thread protected (use with care)
50 */
51
52 #ifndef DEBUG_H_12987983217
53 #define DEBUG_H_12987983217
54
55 #if defined (__cplusplus)
56 extern "C" {
57 #endif
58
59
60 /* static assert is triggered at compile time, leaving no runtime artefact,
61 * but can only work with compile-time constants.
62 * This variant can only be used inside a function. */
63 #define DEBUG_STATIC_ASSERT(c) (void)sizeof(char[(c) ? 1 : -1])
64
65
66 /* DEBUGLEVEL is expected to be defined externally,
67 * typically through compiler command line.
68 * Value must be a number. */
69 #ifndef DEBUGLEVEL
70 # define DEBUGLEVEL 0
71 #endif
72
73 /* recommended values for DEBUGLEVEL :
74 * 0 : no debug, all run-time functions disabled
75 * 1 : no display, enables assert() only
76 * 2 : reserved, for currently active debug path
77 * 3 : events once per object lifetime (CCtx, CDict, etc.)
78 * 4 : events once per frame
79 * 5 : events once per block
80 * 6 : events once per sequence (verbose)
81 * 7+: events at every position (*very* verbose)
82 *
83 * It's generally inconvenient to output traces > 5.
84 * In which case, it's possible to selectively enable higher verbosity levels
85 * by modifying g_debug_level.
86 */
87
88 #if (DEBUGLEVEL>=1)
89 # include <assert.h>
90 #else
91 # ifndef assert /* assert may be already defined, due to prior #include <assert.h> */
92 # define assert(condition) ((void)0) /* disable assert (default) */
93 # endif
94 #endif
95
96 #if (DEBUGLEVEL>=2)
97 # include <stdio.h>
98 extern int g_debuglevel; /* here, this variable is only declared,
99 it actually lives in debug.c,
100 and is shared by the whole process.
101 It's typically used to enable very verbose levels
102 on selective conditions (such as position in src) */
103
104 # define RAWLOG(l, ...) { \
105 if (l<=g_debuglevel) { \
106 fprintf(stderr, __VA_ARGS__); \
107 } }
108 # define DEBUGLOG(l, ...) { \
109 if (l<=g_debuglevel) { \
110 fprintf(stderr, __FILE__ ": " __VA_ARGS__); \
111 fprintf(stderr, " \n"); \
112 } }
113 #else
114 # define RAWLOG(l, ...) {} /* disabled */
115 # define DEBUGLOG(l, ...) {} /* disabled */
116 #endif
117
118
119 #if defined (__cplusplus)
120 }
121 #endif
122
123 #endif /* DEBUG_H_12987983217 */
@@ -1,88 +1,95 b''
1 1 # Files that just need to be migrated to the formatter.
2 2 # Do not add new files here!
3 3 mercurial/cext/dirs.c
4 4 mercurial/cext/manifest.c
5 5 mercurial/cext/osutil.c
6 6 mercurial/cext/revlog.c
7 7 # Vendored code that we should never format:
8 8 contrib/python-zstandard/c-ext/bufferutil.c
9 contrib/python-zstandard/c-ext/compressionchunker.c
9 10 contrib/python-zstandard/c-ext/compressiondict.c
10 11 contrib/python-zstandard/c-ext/compressionparams.c
11 12 contrib/python-zstandard/c-ext/compressionreader.c
12 13 contrib/python-zstandard/c-ext/compressionwriter.c
13 14 contrib/python-zstandard/c-ext/compressobj.c
14 15 contrib/python-zstandard/c-ext/compressor.c
15 16 contrib/python-zstandard/c-ext/compressoriterator.c
16 17 contrib/python-zstandard/c-ext/constants.c
17 18 contrib/python-zstandard/c-ext/decompressionreader.c
18 19 contrib/python-zstandard/c-ext/decompressionwriter.c
19 20 contrib/python-zstandard/c-ext/decompressobj.c
20 21 contrib/python-zstandard/c-ext/decompressor.c
21 22 contrib/python-zstandard/c-ext/decompressoriterator.c
22 23 contrib/python-zstandard/c-ext/frameparams.c
23 24 contrib/python-zstandard/c-ext/python-zstandard.h
24 25 contrib/python-zstandard/zstd.c
25 26 contrib/python-zstandard/zstd/common/bitstream.h
26 27 contrib/python-zstandard/zstd/common/compiler.h
27 28 contrib/python-zstandard/zstd/common/cpu.h
29 contrib/python-zstandard/zstd/common/debug.c
30 contrib/python-zstandard/zstd/common/debug.h
28 31 contrib/python-zstandard/zstd/common/entropy_common.c
29 32 contrib/python-zstandard/zstd/common/error_private.c
30 33 contrib/python-zstandard/zstd/common/error_private.h
31 34 contrib/python-zstandard/zstd/common/fse_decompress.c
32 35 contrib/python-zstandard/zstd/common/fse.h
33 36 contrib/python-zstandard/zstd/common/huf.h
34 37 contrib/python-zstandard/zstd/common/mem.h
35 38 contrib/python-zstandard/zstd/common/pool.c
36 39 contrib/python-zstandard/zstd/common/pool.h
37 40 contrib/python-zstandard/zstd/common/threading.c
38 41 contrib/python-zstandard/zstd/common/threading.h
39 42 contrib/python-zstandard/zstd/common/xxhash.c
40 43 contrib/python-zstandard/zstd/common/xxhash.h
41 44 contrib/python-zstandard/zstd/common/zstd_common.c
42 45 contrib/python-zstandard/zstd/common/zstd_errors.h
43 46 contrib/python-zstandard/zstd/common/zstd_internal.h
44 47 contrib/python-zstandard/zstd/compress/fse_compress.c
48 contrib/python-zstandard/zstd/compress/hist.c
49 contrib/python-zstandard/zstd/compress/hist.h
45 50 contrib/python-zstandard/zstd/compress/huf_compress.c
46 51 contrib/python-zstandard/zstd/compress/zstd_compress.c
47 52 contrib/python-zstandard/zstd/compress/zstd_compress_internal.h
48 53 contrib/python-zstandard/zstd/compress/zstd_double_fast.c
49 54 contrib/python-zstandard/zstd/compress/zstd_double_fast.h
50 55 contrib/python-zstandard/zstd/compress/zstd_fast.c
51 56 contrib/python-zstandard/zstd/compress/zstd_fast.h
52 57 contrib/python-zstandard/zstd/compress/zstd_lazy.c
53 58 contrib/python-zstandard/zstd/compress/zstd_lazy.h
54 59 contrib/python-zstandard/zstd/compress/zstd_ldm.c
55 60 contrib/python-zstandard/zstd/compress/zstd_ldm.h
56 61 contrib/python-zstandard/zstd/compress/zstdmt_compress.c
57 62 contrib/python-zstandard/zstd/compress/zstdmt_compress.h
58 63 contrib/python-zstandard/zstd/compress/zstd_opt.c
59 64 contrib/python-zstandard/zstd/compress/zstd_opt.h
60 65 contrib/python-zstandard/zstd/decompress/huf_decompress.c
61 66 contrib/python-zstandard/zstd/decompress/zstd_decompress.c
62 67 contrib/python-zstandard/zstd/deprecated/zbuff_common.c
63 68 contrib/python-zstandard/zstd/deprecated/zbuff_compress.c
64 69 contrib/python-zstandard/zstd/deprecated/zbuff_decompress.c
65 70 contrib/python-zstandard/zstd/deprecated/zbuff.h
66 71 contrib/python-zstandard/zstd/dictBuilder/cover.c
72 contrib/python-zstandard/zstd/dictBuilder/cover.h
67 73 contrib/python-zstandard/zstd/dictBuilder/divsufsort.c
68 74 contrib/python-zstandard/zstd/dictBuilder/divsufsort.h
75 contrib/python-zstandard/zstd/dictBuilder/fastcover.c
69 76 contrib/python-zstandard/zstd/dictBuilder/zdict.c
70 77 contrib/python-zstandard/zstd/dictBuilder/zdict.h
71 78 contrib/python-zstandard/zstd/zstd.h
72 79 hgext/fsmonitor/pywatchman/bser.c
73 80 mercurial/thirdparty/xdiff/xdiff.h
74 81 mercurial/thirdparty/xdiff/xdiffi.c
75 82 mercurial/thirdparty/xdiff/xdiffi.h
76 83 mercurial/thirdparty/xdiff/xemit.c
77 84 mercurial/thirdparty/xdiff/xemit.h
78 85 mercurial/thirdparty/xdiff/xhistogram.c
79 86 mercurial/thirdparty/xdiff/xinclude.h
80 87 mercurial/thirdparty/xdiff/xmacros.h
81 88 mercurial/thirdparty/xdiff/xmerge.c
82 89 mercurial/thirdparty/xdiff/xpatience.c
83 90 mercurial/thirdparty/xdiff/xprepare.c
84 91 mercurial/thirdparty/xdiff/xprepare.h
85 92 mercurial/thirdparty/xdiff/xtypes.h
86 93 mercurial/thirdparty/xdiff/xutils.c
87 94 mercurial/thirdparty/xdiff/xutils.h
88 95 mercurial/thirdparty/zope/interface/_zope_interface_coptimizations.c
@@ -1,7 +1,10 b''
1 1 graft c-ext
2 graft debian
2 3 graft zstd
3 4 graft tests
4 5 include make_cffi.py
5 6 include setup_zstd.py
6 7 include zstd.c
8 include zstd_cffi.py
7 9 include LICENSE
10 include NEWS.rst
@@ -1,338 +1,456 b''
1 1 ===============
2 2 Version History
3 3 ===============
4 4
5 5 1.0.0 (not yet released)
6 6 ========================
7 7
8 8 Actions Blocking Release
9 9 ------------------------
10 10
11 11 * compression and decompression APIs that support ``io.rawIOBase`` interface
12 12 (#13).
13 13 * Refactor module names so C and CFFI extensions live under ``zstandard``
14 14 package.
15 15 * Overall API design review.
16 16 * Use Python allocator where possible.
17 17 * Figure out what to do about experimental APIs not implemented by CFFI.
18 18 * APIs for auto adjusting compression parameters based on input size. e.g.
19 19 clamping the window log so it isn't too large for input.
20 20 * Consider allowing compressor and decompressor instances to be thread safe,
21 21 support concurrent operations. Or track when an operation is in progress and
22 22 refuse to let concurrent operations use the same instance.
23 23 * Support for magic-less frames for all decompression operations (``decompress()``
24 24 doesn't work due to sniffing the content size and the lack of a ZSTD API to
25 25 sniff magic-less frames - this should be fixed in 1.3.5.).
26 26 * Audit for complete flushing when ending compression streams.
27 27 * Deprecate legacy APIs.
28 28 * Audit for ability to control read/write sizes on all APIs.
29 29 * Detect memory leaks via bench.py.
30 30 * Remove low-level compression parameters from ``ZstdCompressor.__init__`` and
31 31 require use of ``CompressionParameters``.
32 32 * Expose ``ZSTD_getFrameProgression()`` from more compressor types.
33 * Support modifying compression parameters mid operation when supported by
34 zstd API.
35 * Expose ``ZSTD_CLEVEL_DEFAULT`` constant.
36 * Support ``ZSTD_p_forceAttachDict`` compression parameter.
37 * Use ``ZSTD_CCtx_getParameter()``/``ZSTD_CCtxParam_getParameter()`` for retrieving
38 compression parameters.
39 * Consider exposing ``ZSTDMT_toFlushNow()``.
40 * Expose ``ZDICT_trainFromBuffer_fastCover()``,
41 ``ZDICT_optimizeTrainFromBuffer_fastCover``.
42 * Expose and enforce ``ZSTD_minCLevel()`` for minimum compression level.
43 * Consider a ``chunker()`` API for decompression.
44 * Consider stats for ``chunker()`` API, including finding the last consumed
45 offset of input data.
33 46
34 47 Other Actions Not Blocking Release
35 48 ---------------------------------------
36 49
37 50 * Support for block compression APIs.
38 51 * API for ensuring max memory ceiling isn't exceeded.
39 52 * Move off nose for testing.
40 53
54 0.10.1 (released 2018-10-08)
55 ============================
56
57 Backwards Compatibility Notes
58 -----------------------------
59
60 * ``ZstdCompressor.stream_reader().closed`` is now a property instead of a
61 method (#58).
62 * ``ZstdDecompressor.stream_reader().closed`` is now a property instead of a
63 method (#58).
64
65 Changes
66 -------
67
68 * Stop attempting to package Python 3.6 for Miniconda. The latest version of
69 Miniconda is using Python 3.7. The Python 3.6 Miniconda packages were a lie
70 since they were built against Python 3.7.
71 * ``ZstdCompressor.stream_reader()``'s and ``ZstdDecompressor.stream_reader()``'s
72 ``closed`` attribute is now a read-only property instead of a method. This now
73 properly matches the ``IOBase`` API and allows instances to be used in more
74 places that accept ``IOBase`` instances.
75
76 0.10.0 (released 2018-10-08)
77 ============================
78
79 Backwards Compatibility Notes
80 -----------------------------
81
82 * ``ZstdDecompressor.stream_reader().read()`` now consistently requires an
83 argument in both the C and CFFI backends. Before, the CFFI implementation
84 would assume a default value of ``-1``, which was later rejected.
85 * The ``compress_literals`` argument and attribute has been removed from
86 ``zstd.ZstdCompressionParameters`` because it was removed by the zstd 1.3.5
87 API.
88 * ``ZSTD_CCtx_setParametersUsingCCtxParams()`` is no longer called on every
89 operation performed against ``ZstdCompressor`` instances. The reason for this
90 change is that the zstd 1.3.5 API no longer allows this without calling
91 ``ZSTD_CCtx_resetParameters()`` first. But if we called
92 ``ZSTD_CCtx_resetParameters()`` on every operation, we'd have to redo
93 potentially expensive setup when using dictionaries. We now call
94 ``ZSTD_CCtx_reset()`` on every operation and don't attempt to change
95 compression parameters.
96 * Objects returned by ``ZstdCompressor.stream_reader()`` no longer need to be
97 used as a context manager. The context manager interface still exists and its
98 behavior is unchanged.
99 * Objects returned by ``ZstdDecompressor.stream_reader()`` no longer need to be
100 used as a context manager. The context manager interface still exists and its
101 behavior is unchanged.
102
103 Bug Fixes
104 ---------
105
106 * ``ZstdDecompressor.decompressobj().decompress()`` should now return all data
107 from internal buffers in more scenarios. Before, it was possible for data to
108 remain in internal buffers. This data would be emitted on a subsequent call
109 to ``decompress()``. The overall output stream would still be valid. But if
110 callers were expecting input data to exactly map to output data (say the
111 producer had used ``flush(COMPRESSOBJ_FLUSH_BLOCK)`` and was attempting to
112 map input chunks to output chunks), then the previous behavior would be
113 wrong. The new behavior is such that output from
114 ``flush(COMPRESSOBJ_FLUSH_BLOCK)`` fed into ``decompressobj().decompress()``
115 should produce all available compressed input.
116 * ``ZstdDecompressor.stream_reader().read()`` should no longer segfault after
117 a previous context manager resulted in error (#56).
118 * ``ZstdCompressor.compressobj().flush(COMPRESSOBJ_FLUSH_BLOCK)`` now returns
119 all data necessary to flush a block. Before, it was possible for the
120 ``flush()`` to not emit all data necessary to fully represent a block. This
121 would mean decompressors wouldn't be able to decompress all data that had been
122 fed into the compressor and ``flush()``ed. (#55).
123
124 New Features
125 ------------
126
127 * New module constants ``BLOCKSIZELOG_MAX``, ``BLOCKSIZE_MAX``,
128 ``TARGETLENGTH_MAX`` that expose constants from libzstd.
129 * New ``ZstdCompressor.chunker()`` API for manually feeding data into a
130 compressor and emitting chunks of a fixed size. Like ``compressobj()``, the
131 API doesn't impose restrictions on the input or output types for the
132 data streams. Unlike ``compressobj()``, it ensures output chunks are of a
133 fixed size. This makes this API useful when the compressed output is being
134 fed into an I/O layer, where uniform write sizes are useful.
135 * ``ZstdCompressor.stream_reader()`` no longer needs to be used as a context
136 manager (#34).
137 * ``ZstdDecompressor.stream_reader()`` no longer needs to be used as a context
138 manager (#34).
139 * Bundled zstandard library upgraded from 1.3.4 to 1.3.6.
140
141 Changes
142 -------
143
144 * Added ``zstd_cffi.py`` and ``NEWS.rst`` to ``MANIFEST.in``.
145 * ``zstandard.__version__`` is now defined (#50).
146 * Upgrade pip, setuptools, wheel, and cibuildwheel packages to latest versions.
147 * Upgrade various packages used in CI to latest versions. Notably tox (in
148 order to support Python 3.7).
149 * Use relative paths in setup.py to appease Python 3.7 (#51).
150 * Added CI for Python 3.7.
151
152 0.9.1 (released 2018-06-04)
153 ===========================
154
155 * Debian packaging support.
156 * Fix typo in setup.py (#44).
157 * Support building with mingw compiler (#46).
158
41 159 0.9.0 (released 2018-04-08)
42 160 ===========================
43 161
44 162 Backwards Compatibility Notes
45 163 -----------------------------
46 164
47 165 * CFFI 1.11 or newer is now required (previous requirement was 1.8).
48 166 * The primary module is now ``zstandard``. Please change imports of ``zstd``
49 167 and ``zstd_cffi`` to ``import zstandard``. See the README for more. Support
50 168 for importing the old names will be dropped in the next release.
51 169 * ``ZstdCompressor.read_from()`` and ``ZstdDecompressor.read_from()`` have
52 170 been renamed to ``read_to_iter()``. ``read_from()`` is aliased to the new
53 171 name and will be deleted in a future release.
54 172 * Support for Python 2.6 has been removed.
55 173 * Support for Python 3.3 has been removed.
56 174 * The ``selectivity`` argument to ``train_dictionary()`` has been removed, as
57 175 the feature disappeared from zstd 1.3.
58 176 * Support for legacy dictionaries has been removed. Cover dictionaries are now
59 177 the default. ``train_cover_dictionary()`` has effectively been renamed to
60 178 ``train_dictionary()``.
61 179 * The ``allow_empty`` argument from ``ZstdCompressor.compress()`` has been
62 180 deleted and the method now allows empty inputs to be compressed by default.
63 181 * ``estimate_compression_context_size()`` has been removed. Use
64 182 ``CompressionParameters.estimated_compression_context_size()`` instead.
65 183 * ``get_compression_parameters()`` has been removed. Use
66 184 ``CompressionParameters.from_level()`` instead.
67 185 * The arguments to ``CompressionParameters.__init__()`` have changed. If you
68 186 were using positional arguments before, the positions now map to different
69 187 arguments. It is recommended to use keyword arguments to construct
70 188 ``CompressionParameters`` instances.
71 189 * ``TARGETLENGTH_MAX`` constant has been removed (it disappeared from zstandard
72 190 1.3.4).
73 191 * ``ZstdCompressor.write_to()`` and ``ZstdDecompressor.write_to()`` have been
74 192 renamed to ``ZstdCompressor.stream_writer()`` and
75 193 ``ZstdDecompressor.stream_writer()``, respectively. The old names are still
76 194 aliased, but will be removed in the next major release.
77 195 * Content sizes are written into frame headers by default
78 196 (``ZstdCompressor(write_content_size=True)`` is now the default).
79 197 * ``CompressionParameters`` has been renamed to ``ZstdCompressionParameters``
80 198 for consistency with other types. The old name is an alias and will be removed
81 199 in the next major release.
82 200
83 201 Bug Fixes
84 202 ---------
85 203
86 204 * Fixed memory leak in ``ZstdCompressor.copy_stream()`` (#40) (from 0.8.2).
87 205 * Fixed memory leak in ``ZstdDecompressor.copy_stream()`` (#35) (from 0.8.2).
88 206 * Fixed memory leak of ``ZSTD_DDict`` instances in CFFI's ``ZstdDecompressor``.
89 207
90 208 New Features
91 209 ------------
92 210
93 * Bundlded zstandard library upgraded from 1.1.3 to 1.3.4. This delivers various
211 * Bundled zstandard library upgraded from 1.1.3 to 1.3.4. This delivers various
94 212 bug fixes and performance improvements. It also gives us access to newer
95 213 features.
96 214 * Support for negative compression levels.
97 215 * Support for *long distance matching* (facilitates compression ratios that approach
98 216 LZMA).
99 217 * Support for reading empty zstandard frames (with an embedded content size
100 218 of 0).
101 219 * Support for writing and partial support for reading zstandard frames without a
102 220 magic header.
103 221 * New ``stream_reader()`` API that exposes the ``io.RawIOBase`` interface (allows
104 222 you to ``.read()`` from a file-like object).
105 223 * Several minor features, bug fixes, and performance enhancements.
106 224 * Wheels for Linux and macOS are now provided with releases.
107 225
108 226 Changes
109 227 -------
110 228
111 229 * Functions accepting bytes data now use the buffer protocol and can accept
112 230 more types (like ``memoryview`` and ``bytearray``) (#26).
113 231 * Add #includes so compilation on OS X and BSDs works (#20).
114 232 * New ``ZstdDecompressor.stream_reader()`` API to obtain a read-only i/o stream
115 233 of decompressed data for a source.
116 234 * New ``ZstdCompressor.stream_reader()`` API to obtain a read-only i/o stream of
117 235 compressed data for a source.
118 236 * Renamed ``ZstdDecompressor.read_from()`` to ``ZstdDecompressor.read_to_iter()``.
119 237 The old name is still available.
120 238 * Renamed ``ZstdCompressor.read_from()`` to ``ZstdCompressor.read_to_iter()``.
121 239 ``read_from()`` is still available at its old location.
122 240 * Introduce the ``zstandard`` module to import and re-export the C or CFFI
123 241 *backend* as appropriate. Behavior can be controlled via the
124 242 ``PYTHON_ZSTANDARD_IMPORT_POLICY`` environment variable. See README for
125 243 usage info.
126 244 * Vendored version of zstd upgraded to 1.3.4.
127 245 * Added module constants ``CONTENTSIZE_UNKNOWN`` and ``CONTENTSIZE_ERROR``.
128 246 * Add ``STRATEGY_BTULTRA`` compression strategy constant.
129 247 * Switch from deprecated ``ZSTD_getDecompressedSize()`` to
130 248 ``ZSTD_getFrameContentSize()`` replacement.
131 249 * ``ZstdCompressor.compress()`` can now compress empty inputs without requiring
132 250 special handling.
133 251 * ``ZstdCompressor`` and ``ZstdDecompressor`` now have a ``memory_size()``
134 252 method for determining the current memory utilization of the underlying zstd
135 253 primitive.
136 254 * ``train_dictionary()`` has new arguments and functionality for trying multiple
137 255 variations of COVER parameters and selecting the best one.
138 256 * Added module constants ``LDM_MINMATCH_MIN``, ``LDM_MINMATCH_MAX``, and
139 257 ``LDM_BUCKETSIZELOG_MAX``.
140 258 * Converted all consumers to the zstandard *new advanced API*, which uses
141 259 ``ZSTD_compress_generic()``
142 260 * ``CompressionParameters.__init__`` now accepts several more arguments,
143 261 including support for *long distance matching*.
144 262 * ``ZstdCompressionDict.__init__`` now accepts a ``dict_type`` argument that
145 263 controls how the dictionary should be interpreted. This can be used to
146 264 force the use of *content-only* dictionaries or to require the presence
147 265 of the dictionary magic header.
148 266 * ``ZstdCompressionDict.precompute_compress()`` can be used to precompute the
149 267 compression dictionary so it can efficiently be used with multiple
150 268 ``ZstdCompressor`` instances.
151 269 * Digested dictionaries are now stored in ``ZstdCompressionDict`` instances,
152 270 created automatically on first use, and automatically reused by all
153 271 ``ZstdDecompressor`` instances bound to that dictionary.
154 272 * All meaningful functions now accept keyword arguments.
155 273 * ``ZstdDecompressor.decompressobj()`` now accepts a ``write_size`` argument
156 274 to control how much work to perform on every decompressor invocation.
157 275 * ``ZstdCompressor.write_to()`` now exposes a ``tell()``, which exposes the
158 276 total number of bytes written so far.
159 277 * ``ZstdDecompressor.stream_reader()`` now supports ``seek()`` when moving
160 278 forward in the stream.
161 279 * Removed ``TARGETLENGTH_MAX`` constant.
162 280 * Added ``frame_header_size(data)`` function.
163 281 * Added ``frame_content_size(data)`` function.
164 282 * Consumers of ``ZSTD_decompress*`` have been switched to the new *advanced
165 283 decompression* API.
166 284 * ``ZstdCompressor`` and ``ZstdCompressionParams`` can now be constructed with
167 285 negative compression levels.
168 286 * ``ZstdDecompressor`` now accepts a ``max_window_size`` argument to limit the
169 287 amount of memory required for decompression operations.
170 288 * ``FORMAT_ZSTD1`` and ``FORMAT_ZSTD1_MAGICLESS`` constants to be used with
171 289 the ``format`` compression parameter to control whether the frame magic
172 290 header is written.
173 291 * ``ZstdDecompressor`` now accepts a ``format`` argument to control the
174 292 expected frame format.
175 293 * ``ZstdCompressor`` now has a ``frame_progression()`` method to return
176 294 information about the current compression operation.
177 295 * Error messages in CFFI no longer have ``b''`` literals.
178 296 * Compiler warnings and underlying overflow issues on 32-bit platforms have been
179 297 fixed.
180 298 * Builds in CI now build with compiler warnings as errors. This should hopefully
181 299 fix new compiler warnings from being introduced.
182 300 * Make ``ZstdCompressor(write_content_size=True)`` and
183 301 ``CompressionParameters(write_content_size=True)`` the default.
184 302 * ``CompressionParameters`` has been renamed to ``ZstdCompressionParameters``.
185 303
186 304 0.8.2 (released 2018-02-22)
187 305 ---------------------------
188 306
189 307 * Fixed memory leak in ``ZstdCompressor.copy_stream()`` (#40).
190 308 * Fixed memory leak in ``ZstdDecompressor.copy_stream()`` (#35).
191 309
192 310 0.8.1 (released 2017-04-08)
193 311 ---------------------------
194 312
195 313 * Add #includes so compilation on OS X and BSDs works (#20).
196 314
197 315 0.8.0 (released 2017-03-08)
198 316 ===========================
199 317
200 318 * CompressionParameters now has an estimated_compression_context_size() method.
201 319 zstd.estimate_compression_context_size() is now deprecated and slated for
202 320 removal.
203 321 * Implemented a lot of fuzzing tests.
204 322 * CompressionParameters instances now perform extra validation by calling
205 323 ZSTD_checkCParams() at construction time.
206 324 * multi_compress_to_buffer() API for compressing multiple inputs as a
207 325 single operation, as efficiently as possible.
208 326 * ZSTD_CStream instances are now used across multiple operations on
209 327 ZstdCompressor instances, resulting in much better performance for
210 328 APIs that do streaming.
211 329 * ZSTD_DStream instances are now used across multiple operations on
212 330 ZstdDecompressor instances, resulting in much better performance for
213 331 APIs that do streaming.
214 332 * train_dictionary() now releases the GIL.
215 333 * Support for training dictionaries using the COVER algorithm.
216 334 * multi_decompress_to_buffer() API for decompressing multiple frames as a
217 335 single operation, as efficiently as possible.
218 336 * Support for multi-threaded compression.
219 337 * Disable deprecation warnings when compiling CFFI module.
220 338 * Fixed memory leak in train_dictionary().
221 339 * Removed DictParameters type.
222 340 * train_dictionary() now accepts keyword arguments instead of a
223 341 DictParameters instance to control dictionary generation.
224 342
225 343 0.7.0 (released 2017-02-07)
226 344 ===========================
227 345
228 346 * Added zstd.get_frame_parameters() to obtain info about a zstd frame.
229 347 * Added ZstdDecompressor.decompress_content_dict_chain() for efficient
230 348 decompression of *content-only dictionary chains*.
231 349 * CFFI module fully implemented; all tests run against both C extension and
232 350 CFFI implementation.
233 351 * Vendored version of zstd updated to 1.1.3.
234 352 * ZstdDecompressor.decompress() now uses ZSTD_createDDict_byReference()
235 353 to avoid extra memory allocation of dict data.
236 354 * Add function names to error messages (by using ":name" in PyArg_Parse*
237 355 functions).
238 356 * Reuse decompression context across operations. Previously, we created a
239 357 new ZSTD_DCtx for each decompress(). This was measured to slow down
240 358 decompression by 40-200MB/s. The API guarantees say ZstdDecompressor
241 359 is not thread safe. So we reuse the ZSTD_DCtx across operations and make
242 360 things faster in the process.
243 361 * ZstdCompressor.write_to()'s compress() and flush() methods now return number
244 362 of bytes written.
245 363 * ZstdDecompressor.write_to()'s write() method now returns the number of bytes
246 364 written to the underlying output object.
247 365 * CompressionParameters instances now expose their values as attributes.
248 366 * CompressionParameters instances no longer are subscriptable nor behave
249 367 as tuples (backwards incompatible). Use attributes to obtain values.
250 368 * DictParameters instances now expose their values as attributes.
251 369
252 370 0.6.0 (released 2017-01-14)
253 371 ===========================
254 372
255 373 * Support for legacy zstd protocols (build time opt in feature).
256 374 * Automation improvements to test against Python 3.6, latest versions
257 375 of Tox, more deterministic AppVeyor behavior.
258 376 * CFFI "parser" improved to use a compiler preprocessor instead of rewriting
259 377 source code manually.
260 378 * Vendored version of zstd updated to 1.1.2.
261 379 * Documentation improvements.
262 380 * Introduce a bench.py script for performing (crude) benchmarks.
263 381 * ZSTD_CCtx instances are now reused across multiple compress() operations.
264 382 * ZstdCompressor.write_to() now has a flush() method.
265 383 * ZstdCompressor.compressobj()'s flush() method now accepts an argument to
266 384 flush a block (as opposed to ending the stream).
267 385 * Disallow compress(b'') when writing content sizes by default (issue #11).
268 386
269 387 0.5.2 (released 2016-11-12)
270 388 ===========================
271 389
272 390 * more packaging fixes for source distribution
273 391
274 392 0.5.1 (released 2016-11-12)
275 393 ===========================
276 394
277 395 * setup_zstd.py is included in the source distribution
278 396
279 397 0.5.0 (released 2016-11-10)
280 398 ===========================
281 399
282 400 * Vendored version of zstd updated to 1.1.1.
283 401 * Continuous integration for Python 3.6 and 3.7
284 402 * Continuous integration for Conda
285 403 * Added compression and decompression APIs providing similar interfaces
286 404 to the standard library ``zlib`` and ``bz2`` modules. This allows
287 405 coding to a common interface.
288 406 * ``zstd.__version__`` is now defined.
289 407 * ``read_from()`` on various APIs now accepts objects implementing the buffer
290 408 protocol.
291 409 * ``read_from()`` has gained a ``skip_bytes`` argument. This allows callers
292 410 to pass in an existing buffer with a header without having to create a
293 411 slice or a new object.
294 412 * Implemented ``ZstdCompressionDict.as_bytes()``.
295 413 * Python's memory allocator is now used instead of ``malloc()``.
296 414 * Low-level zstd data structures are reused in more instances, cutting down
297 415 on overhead for certain operations.
298 416 * ``distutils`` boilerplate for obtaining an ``Extension`` instance
299 417 has now been refactored into a standalone ``setup_zstd.py`` file. This
300 418 allows other projects with ``setup.py`` files to reuse the
301 419 ``distutils`` code for this project without copying code.
302 420 * The monolithic ``zstd.c`` file has been split into a header file defining
303 421 types and separate ``.c`` source files for the implementation.
304 422
305 423 Older History
306 424 =============
307 425
308 426 2016-08-31 - Zstandard 1.0.0 is released and Gregory starts hacking on a
309 427 Python extension for use by the Mercurial project. A very hacky prototype
310 428 is sent to the mercurial-devel list for RFC.
311 429
312 430 2016-09-03 - Most functionality from Zstandard C API implemented. Source
313 431 code published on https://github.com/indygreg/python-zstandard. Travis-CI
314 432 automation configured. 0.0.1 release on PyPI.
315 433
316 434 2016-09-05 - After the API was rounded out a bit and support for Python
317 435 2.6 and 2.7 was added, version 0.1 was released to PyPI.
318 436
319 437 2016-09-05 - After the compressor and decompressor APIs were changed, 0.2
320 438 was released to PyPI.
321 439
322 440 2016-09-10 - 0.3 is released with a bunch of new features. ZstdCompressor
323 441 now accepts arguments controlling frame parameters. The source size can now
324 442 be declared when performing streaming compression. ZstdDecompressor.decompress()
325 443 is implemented. Compression dictionaries are now cached when using the simple
326 444 compression and decompression APIs. Memory size APIs added.
327 445 ZstdCompressor.read_from() and ZstdDecompressor.read_from() have been
328 446 implemented. This rounds out the major compression/decompression APIs planned
329 447 by the author.
330 448
331 449 2016-10-02 - 0.3.3 is released with a bug fix for read_from not fully
332 450 decoding a zstd frame (issue #2).
333 451
334 452 2016-10-02 - 0.4.0 is released with zstd 1.1.0, support for custom read and
335 453 write buffer sizes, and a few bug fixes involving failure to read/write
336 454 all data when buffer sizes were too small to hold remaining data.
337 455
338 456 2016-11-10 - 0.5.0 is released with zstd 1.1.1 and other enhancements.
@@ -1,1420 +1,1495 b''
1 1 ================
2 2 python-zstandard
3 3 ================
4 4
5 5 This project provides Python bindings for interfacing with the
6 6 `Zstandard <http://www.zstd.net>`_ compression library. A C extension
7 7 and CFFI interface are provided.
8 8
9 9 The primary goal of the project is to provide a rich interface to the
10 10 underlying C API through a Pythonic interface while not sacrificing
11 11 performance. This means exposing most of the features and flexibility
12 12 of the C API while not sacrificing usability or safety that Python provides.
13 13
14 14 The canonical home for this project lives in a Mercurial repository run by
15 15 the author. For convenience, that repository is frequently synchronized to
16 16 https://github.com/indygreg/python-zstandard.
17 17
18 18 | |ci-status| |win-ci-status|
19 19
20 20 Requirements
21 21 ============
22 22
23 23 This extension is designed to run with Python 2.7, 3.4, 3.5, and 3.6
24 24 on common platforms (Linux, Windows, and OS X). x86 and x86_64 are well-tested
25 25 on Windows. Only x86_64 is well-tested on Linux and macOS.
26 26
27 27 Installing
28 28 ==========
29 29
30 30 This package is uploaded to PyPI at https://pypi.python.org/pypi/zstandard.
31 31 So, to install this package::
32 32
33 33 $ pip install zstandard
34 34
35 35 Binary wheels are made available for some platforms. If you need to
36 36 install from a source distribution, all you should need is a working C
37 37 compiler and the Python development headers/libraries. On many Linux
38 38 distributions, you can install a ``python-dev`` or ``python-devel``
39 39 package to provide these dependencies.
40 40
41 41 Packages are also uploaded to Anaconda Cloud at
42 42 https://anaconda.org/indygreg/zstandard. See that URL for how to install
43 43 this package with ``conda``.
44 44
45 45 Performance
46 46 ===========
47 47
48 48 zstandard is a highly tunable compression algorithm. In its default settings
49 49 (compression level 3), it will be faster at compression and decompression and
50 50 will have better compression ratios than zlib on most data sets. When tuned
51 51 for speed, it approaches lz4's speed and ratios. When tuned for compression
52 52 ratio, it approaches lzma ratios and compression speed, but decompression
53 53 speed is much faster. See the official zstandard documentation for more.
54 54
55 55 zstandard and this library support multi-threaded compression. There is a
56 56 mechanism to compress large inputs using multiple threads.
57 57
58 58 The performance of this library is usually very similar to what the zstandard
59 59 C API can deliver. Overhead in this library is due to general Python overhead
60 60 and can't easily be avoided by *any* zstandard Python binding. This library
61 61 exposes multiple APIs for performing compression and decompression so callers
62 62 can pick an API suitable for their need. Contrast with the compression
63 63 modules in Python's standard library (like ``zlib``), which only offer limited
64 64 mechanisms for performing operations. The API flexibility means consumers can
65 65 choose to use APIs that facilitate zero copying or minimize Python object
66 66 creation and garbage collection overhead.
67 67
68 68 This library is capable of single-threaded throughputs well over 1 GB/s. For
69 69 exact numbers, measure yourself. The source code repository has a ``bench.py``
70 70 script that can be used to measure things.
71 71
72 72 API
73 73 ===
74 74
75 75 To interface with Zstandard, simply import the ``zstandard`` module::
76 76
77 77 import zstandard
78 78
79 79 It is a popular convention to alias the module as a different name for
80 80 brevity::
81 81
82 82 import zstandard as zstd
83 83
84 84 This module attempts to import and use either the C extension or CFFI
85 85 implementation. On Python platforms known to support C extensions (like
86 86 CPython), it raises an ImportError if the C extension cannot be imported.
87 87 On Python platforms known to not support C extensions (like PyPy), it only
88 88 attempts to import the CFFI implementation and raises ImportError if that
89 89 can't be done. On other platforms, it first tries to import the C extension
90 90 then falls back to CFFI if that fails and raises ImportError if CFFI fails.
91 91
92 92 To change the module import behavior, a ``PYTHON_ZSTANDARD_IMPORT_POLICY``
93 93 environment variable can be set. The following values are accepted:
94 94
95 95 default
96 96 The behavior described above.
97 97 cffi_fallback
98 98 Always try to import the C extension then fall back to CFFI if that
99 99 fails.
100 100 cext
101 101 Only attempt to import the C extension.
102 102 cffi
103 103 Only attempt to import the CFFI implementation.
104 104
105 105 In addition, the ``zstandard`` module exports a ``backend`` attribute
106 106 containing the string name of the backend being used. It will be one
107 107 of ``cext`` or ``cffi`` (for *C extension* and *cffi*, respectively).
108 108
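
A minimal sketch of influencing and inspecting backend selection (the policy
value shown is one of the accepted values listed above; the environment
variable presumably needs to be set before the module is first imported)::

    import os

    # Must be in the environment before the first import of zstandard.
    os.environ['PYTHON_ZSTANDARD_IMPORT_POLICY'] = 'cffi_fallback'

    import zstandard as zstd

    # One of 'cext' or 'cffi', depending on which backend was loaded.
    print(zstd.backend)
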
109 109 The types, functions, and attributes exposed by the ``zstandard`` module
110 110 are documented in the sections below.
111 111
112 112 .. note::
113 113
114 114 The documentation in this section makes references to various zstd
115 115 concepts and functionality. The source repository contains a
116 116 ``docs/concepts.rst`` file explaining these in more detail.
117 117
118 118 ZstdCompressor
119 119 --------------
120 120
121 121 The ``ZstdCompressor`` class provides an interface for performing
122 122 compression operations. Each instance is essentially a wrapper around a
123 123 ``ZSTD_CCtx`` from the C API.
124 124
125 125 Each instance is associated with parameters that control compression
126 126 behavior. These come from the following named arguments (all optional):
127 127
128 128 level
129 129 Integer compression level. Valid values are between 1 and 22.
130 130 dict_data
131 131 Compression dictionary to use.
132 132
133 133 Note: When using dictionary data and ``compress()`` is called multiple
134 134 times, the ``ZstdCompressionParameters`` derived from an integer
135 135 compression ``level`` and the first compressed data's size will be reused
136 136 for all subsequent operations. This may not be desirable if source data
137 137 size varies significantly.
138 138 compression_params
139 139 A ``ZstdCompressionParameters`` instance defining compression settings.
140 140 write_checksum
141 141 Whether a 4 byte checksum should be written with the compressed data.
142 142 Defaults to False. If True, the decompressor can verify that decompressed
143 143 data matches the original input data.
144 144 write_content_size
145 145 Whether the size of the uncompressed data will be written into the
146 146 header of compressed data. Defaults to True. The data will only be
147 147 written if the compressor knows the size of the input data. This is
148 148 often not true for streaming compression.
149 149 write_dict_id
150 150 Whether to write the dictionary ID into the compressed data.
151 151 Defaults to True. The dictionary ID is only written if a dictionary
152 152 is being used.
153 153 threads
154 154 Enables and sets the number of threads to use for multi-threaded compression
155 155 operations. Defaults to 0, which means to use single-threaded compression.
156 156 Negative values will resolve to the number of logical CPUs in the system.
157 157 Read below for more info on multi-threaded compression. This argument only
158 158 controls thread count for operations that operate on individual pieces of
159 159 data. APIs that spawn multiple threads for working on multiple pieces of
160 160 data have their own ``threads`` argument.
161 161
162 162 ``compression_params`` is mutually exclusive with ``level``, ``write_checksum``,
163 163 ``write_content_size``, ``write_dict_id``, and ``threads``.
164 164
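For example, a compressor configured via these arguments might look like this
(a minimal sketch; the particular values are illustrative)::

    import zstandard as zstd

    # Level 10, embed a content checksum, and resolve the thread count from
    # the number of logical CPUs (negative values do that, per the above).
    cctx = zstd.ZstdCompressor(level=10, write_checksum=True, threads=-1)
    compressed = cctx.compress(b'data to compress')
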
165 165 Unless specified otherwise, assume that no two methods of ``ZstdCompressor``
166 166 instances can be called from multiple Python threads simultaneously. In other
167 167 words, assume instances are not thread safe unless stated otherwise.
168 168
169 169 Utility Methods
170 170 ^^^^^^^^^^^^^^^
171 171
172 172 ``frame_progression()`` returns a 3-tuple containing the number of bytes
173 173 ingested, consumed, and produced by the current compression operation.
174 174
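For example (a minimal sketch; the counters reflect the current compression
operation)::

    cctx = zstd.ZstdCompressor()
    ingested, consumed, produced = cctx.frame_progression()
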
175 175 ``memory_size()`` obtains the memory utilization of the underlying zstd
176 176 compression context, in bytes.::
177 177
178 178 cctx = zstd.ZstdCompressor()
179 179 memory = cctx.memory_size()
180 180
181 181 Simple API
182 182 ^^^^^^^^^^
183 183
184 184 ``compress(data)`` compresses and returns data as a one-shot operation.::
185 185
186 186 cctx = zstd.ZstdCompressor()
187 187 compressed = cctx.compress(b'data to compress')
188 188
189 189 The ``data`` argument can be any object that implements the *buffer protocol*.
190 190
191 191 Stream Reader API
192 192 ^^^^^^^^^^^^^^^^^
193 193
194 194 ``stream_reader(source)`` can be used to obtain an object conforming to the
195 195 ``io.RawIOBase`` interface for reading compressed output as a stream::
196 196
197 197 with open(path, 'rb') as fh:
198 198 cctx = zstd.ZstdCompressor()
199 reader = cctx.stream_reader(fh)
200 while True:
201 chunk = reader.read(16384)
202 if not chunk:
203 break
204
205 # Do something with compressed chunk.
206
207 Instances can also be used as context managers::
208
209 with open(path, 'rb') as fh:
199 210 with cctx.stream_reader(fh) as reader:
200 211 while True:
201 212 chunk = reader.read(16384)
202 213 if not chunk:
203 214 break
204 215
205 216 # Do something with compressed chunk.
206 217
207 The stream can only be read within a context manager. When the context
208 manager exits, the stream is closed and the underlying resource is
209 released and future operations against the compression stream will fail.
218 When the context manager exits or ``close()`` is called, the stream is closed,
219 underlying resources are released, and future operations against the compression
220 stream will fail.
210 221
211 222 The ``source`` argument to ``stream_reader()`` can be any object with a
212 223 ``read(size)`` method or any object implementing the *buffer protocol*.
213 224
214 225 ``stream_reader()`` accepts a ``size`` argument specifying how large the input
215 226 stream is. This is used to adjust compression parameters so they are
216 227 tailored to the source size.::
217 228
218 229 with open(path, 'rb') as fh:
219 230 cctx = zstd.ZstdCompressor()
220 231 with cctx.stream_reader(fh, size=os.stat(path).st_size) as reader:
221 232 ...
222 233
223 234 If the ``source`` is a stream, you can specify how large ``read()`` requests
224 235 to that stream should be via the ``read_size`` argument. It defaults to
225 236 ``zstandard.COMPRESSION_RECOMMENDED_INPUT_SIZE``.::
226 237
227 238 with open(path, 'rb') as fh:
228 239 cctx = zstd.ZstdCompressor()
229 240 # Will perform fh.read(8192) when obtaining data to feed into the
230 241 # compressor.
231 242 with cctx.stream_reader(fh, read_size=8192) as reader:
232 243 ...
233 244
234 245 The stream returned by ``stream_reader()`` is neither writable nor seekable
235 246 (even if the underlying source is seekable). ``readline()`` and
236 247 ``readlines()`` are not implemented because they don't make sense for
237 248 compressed data. ``tell()`` returns the number of compressed bytes
238 249 emitted so far.
239 250
240 251 Streaming Input API
241 252 ^^^^^^^^^^^^^^^^^^^
242 253
243 254 ``stream_writer(fh)`` (which behaves as a context manager) allows you to *stream*
244 255 data into a compressor.::
245 256
246 257 cctx = zstd.ZstdCompressor(level=10)
247 258 with cctx.stream_writer(fh) as compressor:
248 259 compressor.write(b'chunk 0')
249 260 compressor.write(b'chunk 1')
250 261 ...
251 262
252 263 The argument to ``stream_writer()`` must have a ``write(data)`` method. As
253 264 compressed data is available, ``write()`` will be called with the compressed
254 265 data as its argument. Many common Python types implement ``write()``, including
255 266 open file handles and ``io.BytesIO``.
256 267
257 268 ``stream_writer()`` returns an object representing a streaming compressor
258 269 instance. It **must** be used as a context manager. That object's
259 270 ``write(data)`` method is used to feed data into the compressor.
260 271
261 272 A ``flush()`` method can be called to evict whatever data remains within the
262 273 compressor's internal state into the output object. This may result in 0 or
263 274 more ``write()`` calls to the output object.
264 275
265 276 Both ``write()`` and ``flush()`` return the number of bytes written to the
266 277 object's ``write()``. In many cases, small inputs do not accumulate enough
267 278 data to cause a write and ``write()`` will return ``0``.
268 279
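A minimal sketch of flushing mid-stream (``fh`` is assumed to be a writable
file object)::

   cctx = zstd.ZstdCompressor()
   with cctx.stream_writer(fh) as compressor:
       count = compressor.write(b'chunk 0')
       # Force whatever compressed data is buffered out to ``fh``.
       count += compressor.flush()
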
269 280 If the size of the data being fed to this streaming compressor is known,
270 281 you can declare it before compression begins::
271 282
272 283 cctx = zstd.ZstdCompressor()
273 284 with cctx.stream_writer(fh, size=data_len) as compressor:
274 285 compressor.write(chunk0)
275 286 compressor.write(chunk1)
276 287 ...
277 288
278 289 Declaring the size of the source data allows compression parameters to
279 290 be tuned. And if ``write_content_size`` is used, it also results in the
280 291 content size being written into the frame header of the output data.
281 292
282 293 The size of the chunks passed to the destination's ``write()`` can be specified::
283 294
284 295 cctx = zstd.ZstdCompressor()
285 296 with cctx.stream_writer(fh, write_size=32768) as compressor:
286 297 ...
287 298
288 299 To see how much memory is being used by the streaming compressor::
289 300
290 301 cctx = zstd.ZstdCompressor()
291 302 with cctx.stream_writer(fh) as compressor:
292 303 ...
293 304 byte_size = compressor.memory_size()
294 305
295 306 The total number of bytes written so far is exposed via ``tell()``::
296 307
297 308 cctx = zstd.ZstdCompressor()
298 309 with cctx.stream_writer(fh) as compressor:
299 310 ...
300 311 total_written = compressor.tell()
301 312
302 313 Streaming Output API
303 314 ^^^^^^^^^^^^^^^^^^^^
304 315
305 316 ``read_to_iter(reader)`` provides a mechanism to stream data out of a
306 317 compressor as an iterator of data chunks.::
307 318
308 319 cctx = zstd.ZstdCompressor()
309 320 for chunk in cctx.read_to_iter(fh):
310 321 # Do something with emitted data.
311 322
312 323 ``read_to_iter()`` accepts an object that has a ``read(size)`` method or
313 324 conforms to the buffer protocol.
314 325
315 326 Uncompressed data is fetched from the source either by calling ``read(size)``
316 327 or by fetching a slice of data from the object directly (in the case where
317 328 the buffer protocol is being used). The returned iterator consists of chunks
318 329 of compressed data.
319 330
320 331 If reading from the source via ``read()``, ``read()`` will be called until
321 332 it raises or returns an empty bytes (``b''``). It is perfectly valid for
322 333 the source to deliver fewer bytes than were requested by ``read(size)``.
323 334
324 335 Like ``stream_writer()``, ``read_to_iter()`` also accepts a ``size`` argument
325 336 declaring the size of the input stream::
326 337
327 338 cctx = zstd.ZstdCompressor()
328 339 for chunk in cctx.read_to_iter(fh, size=some_int):
329 340 pass
330 341
331 342 You can also control the size that data is ``read()`` from the source and
332 343 the ideal size of output chunks::
333 344
334 345 cctx = zstd.ZstdCompressor()
335 346 for chunk in cctx.read_to_iter(fh, read_size=16384, write_size=8192):
336 347 pass
337 348
338 349 Unlike ``stream_writer()``, ``read_to_iter()`` does not give direct control
339 350 over the sizes of chunks fed into the compressor. Instead, chunk sizes will
340 351 be whatever the object being read from delivers. These will often be of a
341 352 uniform size.
342 353
343 354 Stream Copying API
344 355 ^^^^^^^^^^^^^^^^^^
345 356
346 357 ``copy_stream(ifh, ofh)`` can be used to copy data between 2 streams while
347 358 compressing it.::
348 359
349 360 cctx = zstd.ZstdCompressor()
350 361 cctx.copy_stream(ifh, ofh)
351 362
352 363 For example, say you wish to compress a file::
353 364
354 365 cctx = zstd.ZstdCompressor()
355 366 with open(input_path, 'rb') as ifh, open(output_path, 'wb') as ofh:
356 367 cctx.copy_stream(ifh, ofh)
357 368
358 369 It is also possible to declare the size of the source stream::
359 370
360 371 cctx = zstd.ZstdCompressor()
361 372 cctx.copy_stream(ifh, ofh, size=len_of_input)
362 373
363 374 You can also specify how large the chunks ``read()`` from and ``write()`` to
364 375 the streams should be::
365 376
366 377 cctx = zstd.ZstdCompressor()
367 378 cctx.copy_stream(ifh, ofh, read_size=32768, write_size=16384)
368 379
369 380 The stream copier returns a 2-tuple of bytes read and written::
370 381
371 382 cctx = zstd.ZstdCompressor()
372 383 read_count, write_count = cctx.copy_stream(ifh, ofh)
373 384
374 385 Compressor API
375 386 ^^^^^^^^^^^^^^
376 387
377 388 ``compressobj()`` returns an object that exposes ``compress(data)`` and
378 389 ``flush()`` methods. Each returns compressed data or an empty bytes.
379 390
380 391 The purpose of ``compressobj()`` is to provide an API-compatible interface
381 392 with ``zlib.compressobj``, ``bz2.BZ2Compressor``, etc. This allows callers to
382 393 swap in different compressor objects while using the same API.
383 394
384 395 ``flush()`` accepts an optional argument indicating how to end the stream.
385 396 ``zstd.COMPRESSOBJ_FLUSH_FINISH`` (the default) ends the compression stream.
386 397 Once this type of flush is performed, ``compress()`` and ``flush()`` can
387 398 no longer be called. This type of flush **must** be called to end the
388 399 compression context. If not called, returned data may be incomplete.
389 400
390 401 A ``zstd.COMPRESSOBJ_FLUSH_BLOCK`` argument to ``flush()`` will flush a
391 402 zstd block. Flushes of this type can be performed multiple times. The next
392 403 call to ``compress()`` will begin a new zstd block.
393 404
394 405 Here is how this API should be used::
395 406
396 407 cctx = zstd.ZstdCompressor()
397 408 cobj = cctx.compressobj()
398 409 data = cobj.compress(b'raw input 0')
399 410 data = cobj.compress(b'raw input 1')
400 411 data = cobj.flush()
401 412
402 413 Or to flush blocks::
403 414
404 415 cctx = zstd.ZstdCompressor()
405 416 cobj = cctx.compressobj()
406 417 data = cobj.compress(b'chunk in first block')
407 418 data = cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK)
408 419 data = cobj.compress(b'chunk in second block')
409 420 data = cobj.flush()
410 421
411 422 For best performance, keep input chunks under 256KB. This avoids
412 423 extra allocations for a large output object.
413 424
414 425 It is possible to declare the input size of the data that will be fed into
415 426 the compressor::
416 427
417 428 cctx = zstd.ZstdCompressor()
418 429 cobj = cctx.compressobj(size=6)
419 430 data = cobj.compress(b'foobar')
420 431 data = cobj.flush()
421 432
433 Chunker API
434 ^^^^^^^^^^^
435
436 ``chunker(size=None, chunk_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE)`` returns
437 an object that can be used to iteratively feed chunks of data into a compressor
438 and produce output chunks of a uniform size.
439
440 The object returned by ``chunker()`` exposes the following methods:
441
442 ``compress(data)``
443 Feeds new input data into the compressor.
444
445 ``flush()``
446 Flushes all data currently in the compressor.
447
448 ``finish()``
449 Signals the end of input data. No new data can be compressed after this
450 method is called.
451
452 ``compress()``, ``flush()``, and ``finish()`` all return an iterator of
453 ``bytes`` instances holding compressed data. The iterator may be empty. Callers
454 MUST iterate through all elements of the returned iterator before performing
455 another operation on the object.
456
457 All chunks emitted by ``compress()`` will have a length of ``chunk_size``.
458
459 ``flush()`` and ``finish()`` may return a final chunk smaller than
460 ``chunk_size``.
461
462 Here is how the API should be used::
463
464 cctx = zstd.ZstdCompressor()
465 chunker = cctx.chunker(chunk_size=32768)
466
467 with open(path, 'rb') as fh:
468 while True:
469 in_chunk = fh.read(32768)
470 if not in_chunk:
471 break
472
473 for out_chunk in chunker.compress(in_chunk):
474 # Do something with output chunk of size 32768.
475
476 for out_chunk in chunker.finish():
477 # Do something with output chunks that finalize the zstd frame.
478
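``flush()`` can be used mid-stream to bound latency, e.g. when sending
compressed data over a network. A minimal sketch, where ``send()`` is a
hypothetical function delivering bytes to a peer::

   for out_chunk in chunker.compress(in_chunk):
       send(out_chunk)

   # Force out whatever data the compressor is currently holding. The final
   # chunk may be smaller than chunk_size.
   for out_chunk in chunker.flush():
       send(out_chunk)
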
479 The ``chunker()`` API is often a better alternative to ``compressobj()``.
480
481 ``compressobj()`` will emit output data as it is available. This results in a
482 *stream* of output chunks of varying sizes. The consistency of the output chunk
483 size with ``chunker()`` is more appropriate for many usages, such as sending
484 compressed data to a socket.
485
486 ``compressobj()`` may also perform extra memory reallocations in order to
487 dynamically adjust the sizes of the output chunks. Since ``chunker()`` output
488 chunks are all the same size (except for flushed or final chunks), there is
489 less memory allocation overhead.
490
422 491 Batch Compression API
423 492 ^^^^^^^^^^^^^^^^^^^^^
424 493
425 494 (Experimental. Not yet supported in CFFI bindings.)
426 495
427 496 ``multi_compress_to_buffer(data, [threads=0])`` performs compression of multiple
428 497 inputs as a single operation.
429 498
430 499 Data to be compressed can be passed as a ``BufferWithSegmentsCollection``, a
431 500 ``BufferWithSegments``, or a list containing byte like objects. Each element of
432 501 the container will be compressed individually using the configured parameters
433 502 on the ``ZstdCompressor`` instance.
434 503
435 504 The ``threads`` argument controls how many threads to use for compression. The
436 505 default is ``0`` which means to use a single thread. Negative values use the
437 506 number of logical CPUs in the machine.
438 507
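A minimal sketch::

   cctx = zstd.ZstdCompressor()
   results = cctx.multi_compress_to_buffer(
       [b'input 0', b'input 1', b'input 2'], threads=-1)
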
439 508 The function returns a ``BufferWithSegmentsCollection``. This type represents
440 509 N discrete memory allocations, each holding 1 or more compressed frames.
441 510
442 511 Output data is written to shared memory buffers. This means that unlike
443 512 regular Python objects, a reference to *any* object within the collection
444 513 keeps the shared buffer and therefore memory backing it alive. This can have
445 514 undesirable effects on process memory usage.
446 515
447 516 The API and behavior of this function is experimental and will likely change.
448 517 Known deficiencies include:
449 518
450 519 * If asked to use multiple threads, it will always spawn that many threads,
451 520 even if the input is too small to use them. It should automatically lower
452 521 the thread count when the extra threads would just add overhead.
453 522 * The buffer allocation strategy is fixed. There is room to make it dynamic,
454 523 perhaps even to allow one output buffer per input, facilitating a variation
455 524 of the API to return a list without the adverse effects of shared memory
456 525 buffers.
457 526
458 527 ZstdDecompressor
459 528 ----------------
460 529
461 530 The ``ZstdDecompressor`` class provides an interface for performing
462 531 decompression. It is effectively a wrapper around the ``ZSTD_DCtx`` type from
463 532 the C API.
464 533
465 534 Each instance is associated with parameters that control decompression. These
466 535 come from the following named arguments (all optional):
467 536
468 537 dict_data
469 538 Compression dictionary to use.
470 539 max_window_size
471 540 Sets an upper limit on the window size for decompression operations in
472 541 kibibytes. This setting can be used to prevent large memory allocations
473 542 for inputs using large compression windows.
474 543 format
475 544 Set the format of data for the decoder. By default, this is
476 545 ``zstd.FORMAT_ZSTD1``. It can be set to ``zstd.FORMAT_ZSTD1_MAGICLESS`` to
477 546 allow decoding frames without the 4 byte magic header. Not all decompression
478 547 APIs support this mode.
479 548
480 549 The interface of this class is very similar to ``ZstdCompressor`` (by design).
481 550
482 551 Unless specified otherwise, assume that no two methods of ``ZstdDecompressor``
483 552 instances can be called from multiple Python threads simultaneously. In other
484 553 words, assume instances are not thread safe unless stated otherwise.
485 554
486 555 Utility Methods
487 556 ^^^^^^^^^^^^^^^
488 557
489 558 ``memory_size()`` obtains the size of the underlying zstd decompression context,
490 559 in bytes.::
491 560
492 561 dctx = zstd.ZstdDecompressor()
493 562 size = dctx.memory_size()
494 563
495 564 Simple API
496 565 ^^^^^^^^^^
497 566
498 567 ``decompress(data)`` can be used to decompress an entire compressed zstd
499 568 frame in a single operation.::
500 569
501 570 dctx = zstd.ZstdDecompressor()
502 571 decompressed = dctx.decompress(data)
503 572
504 573 By default, ``decompress(data)`` will only work on data written with the content
505 574 size encoded in its header (this is the default behavior of
506 575 ``ZstdCompressor().compress()`` but may not be true for streaming compression). If
507 576 compressed data without an embedded content size is seen, ``zstd.ZstdError`` will
508 577 be raised.
509 578
510 579 If the compressed data doesn't have its content size embedded within it,
511 580 decompression can be attempted by specifying the ``max_output_size``
512 581 argument.::
513 582
514 583 dctx = zstd.ZstdDecompressor()
515 584 uncompressed = dctx.decompress(data, max_output_size=1048576)
516 585
517 586 Ideally, ``max_output_size`` will be identical to the decompressed output
518 587 size.
519 588
520 589 If ``max_output_size`` is too small to hold the decompressed data,
521 590 ``zstd.ZstdError`` will be raised.
522 591
523 592 If ``max_output_size`` is larger than the decompressed data, the allocated
524 593 output buffer will be resized to only use the space required.
525 594
526 595 Please note that an allocation of the requested ``max_output_size`` will be
527 596 performed every time the method is called. Setting to a very large value could
528 597 result in a lot of work for the memory allocator and may result in
529 598 ``MemoryError`` being raised if the allocation fails.
530 599
531 600 .. important::
532 601
533 602 If the exact size of decompressed data is unknown (not passed in explicitly
534 603 and not stored in the zstandard frame), for performance reasons it is
535 604 encouraged to use a streaming API.
536 605
537 606 Stream Reader API
538 607 ^^^^^^^^^^^^^^^^^
539 608
540 609 ``stream_reader(source)`` can be used to obtain an object conforming to the
541 610 ``io.RawIOBase`` interface for reading decompressed output as a stream::
542 611
543 612 with open(path, 'rb') as fh:
544 613 dctx = zstd.ZstdDecompressor()
545 with dctx.stream_reader(fh) as reader:
546 while True:
547 chunk = reader.read(16384)
548 if not chunk:
549 break
614 reader = dctx.stream_reader(fh)
615 while True:
616 chunk = reader.read(16384)
617 if not chunk:
618 break
619
620 # Do something with decompressed chunk.
550 621
551 # Do something with decompressed chunk.
622 The stream can also be used as a context manager::
552 623
553 The stream can only be read within a context manager. When the context
554 manager exits, the stream is closed and the underlying resource is
555 released and future operations against the stream will fail.
624 with open(path, 'rb') as fh:
625 dctx = zstd.ZstdDecompressor()
626 with dctx.stream_reader(fh) as reader:
627 ...
628
629 When used as a context manager, the stream is closed and the underlying
630 resources are released when the context manager exits. Future operations against
631 the stream will fail.
556 632
557 633 The ``source`` argument to ``stream_reader()`` can be any object with a
558 634 ``read(size)`` method or any object implementing the *buffer protocol*.
559 635
560 636 If the ``source`` is a stream, you can specify how large ``read()`` requests
561 637 to that stream should be via the ``read_size`` argument. It defaults to
562 638 ``zstandard.DECOMPRESSION_RECOMMENDED_INPUT_SIZE``.::
563 639
564 640 with open(path, 'rb') as fh:
565 641 dctx = zstd.ZstdDecompressor()
566 642 # Will perform fh.read(8192) when obtaining data for the decompressor.
567 643 with dctx.stream_reader(fh, read_size=8192) as reader:
568 644 ...
569 645
570 646 The stream returned by ``stream_reader()`` is not writable.
571 647
572 648 The stream returned by ``stream_reader()`` is *partially* seekable.
573 649 Absolute and relative positions (``SEEK_SET`` and ``SEEK_CUR``) forward
574 650 of the current position are allowed. Offsets behind the current read
575 651 position and offsets relative to the end of stream are not allowed and
576 652 will raise ``ValueError`` if attempted.
577 653
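For example, a minimal sketch of seeking forward (assumes ``os`` is imported
and ``path`` names a file containing a zstd frame)::

   with open(path, 'rb') as fh:
       dctx = zstd.ZstdDecompressor()
       reader = dctx.stream_reader(fh)
       # Skip the first 4096 decompressed bytes, then read.
       reader.seek(4096, os.SEEK_CUR)
       chunk = reader.read(16384)
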
578 654 ``tell()`` returns the number of decompressed bytes read so far.
579 655
580 656 Not all I/O methods are implemented. Notably missing is support for
581 657 ``readline()``, ``readlines()``, and linewise iteration support. Support for
582 658 these is planned for a future release.
583 659
584 660 Streaming Input API
585 661 ^^^^^^^^^^^^^^^^^^^
586 662
587 663 ``stream_writer(fh)`` can be used to incrementally send compressed data to a
588 664 decompressor.::
589 665
590 666 dctx = zstd.ZstdDecompressor()
591 667 with dctx.stream_writer(fh) as decompressor:
592 668 decompressor.write(compressed_data)
593 669
594 670 This behaves similarly to ``zstd.ZstdCompressor``: compressed data is written to
595 671 the decompressor by calling ``write(data)`` and decompressed output is written
596 672 to the output object by calling its ``write(data)`` method.
597 673
598 674 Calls to ``write()`` will return the number of bytes written to the output
599 675 object. Not all inputs will result in bytes being written, so return values
600 676 of ``0`` are possible.
601 677
602 678 The size of the chunks passed to the destination's ``write()`` can be specified::
603 679
604 680 dctx = zstd.ZstdDecompressor()
605 681 with dctx.stream_writer(fh, write_size=16384) as decompressor:
606 682 pass
607 683
608 684 You can see how much memory is being used by the decompressor::
609 685
610 686 dctx = zstd.ZstdDecompressor()
611 687 with dctx.stream_writer(fh) as decompressor:
612 688 byte_size = decompressor.memory_size()
613 689
614 690 Streaming Output API
615 691 ^^^^^^^^^^^^^^^^^^^^
616 692
617 693 ``read_to_iter(fh)`` provides a mechanism to stream decompressed data out of a
618 694 compressed source as an iterator of data chunks.::
619 695
620 696 dctx = zstd.ZstdDecompressor()
621 697 for chunk in dctx.read_to_iter(fh):
622 698 # Do something with original data.
623 699
624 700 ``read_to_iter()`` accepts an object with a ``read(size)`` method that will
625 701 return compressed bytes or an object conforming to the buffer protocol that
626 702 can expose its data as a contiguous range of bytes.
627 703
628 704 ``read_to_iter()`` returns an iterator whose elements are chunks of the
629 705 decompressed data.
630 706
631 707 The size of requested ``read()`` from the source can be specified::
632 708
633 709 dctx = zstd.ZstdDecompressor()
634 710 for chunk in dctx.read_to_iter(fh, read_size=16384):
635 711 pass
636 712
637 713 It is also possible to skip leading bytes in the input data::
638 714
639 715 dctx = zstd.ZstdDecompressor()
640 716 for chunk in dctx.read_to_iter(fh, skip_bytes=1):
641 717 pass
642 718
643 719 .. tip::
644 720
645 721 Skipping leading bytes is useful if the source data contains extra
646 722 *header* data. Traditionally, you would need to create a slice or
647 723 ``memoryview`` of the data you want to decompress. This would create
648 724 overhead. It is more efficient to pass the offset into this API.
649 725
650 726 Similarly to ``ZstdCompressor.read_to_iter()``, the consumer of the iterator
651 727 controls when data is decompressed. If the iterator isn't consumed,
652 728 decompression is put on hold.
653 729
654 730 When ``read_to_iter()`` is passed an object conforming to the buffer protocol,
655 731 the behavior may seem similar to what occurs when the simple decompression
656 732 API is used. However, this API works when the decompressed size is unknown.
657 733 Furthermore, if feeding large inputs, the decompressor will work in chunks
658 734 instead of performing a single operation.
659 735
660 736 Stream Copying API
661 737 ^^^^^^^^^^^^^^^^^^
662 738
663 739 ``copy_stream(ifh, ofh)`` can be used to copy data across 2 streams while
664 740 performing decompression.::
665 741
666 742 dctx = zstd.ZstdDecompressor()
667 743 dctx.copy_stream(ifh, ofh)
668 744
669 745 e.g. to decompress a file to another file::
670 746
671 747 dctx = zstd.ZstdDecompressor()
672 748 with open(input_path, 'rb') as ifh, open(output_path, 'wb') as ofh:
673 749 dctx.copy_stream(ifh, ofh)
674 750
675 751 The size of the chunks ``read()`` from and ``write()`` to the streams can be
676 752 specified::
677 753
678 754 dctx = zstd.ZstdDecompressor()
679 755 dctx.copy_stream(ifh, ofh, read_size=8192, write_size=16384)
680 756
681 757 Decompressor API
682 758 ^^^^^^^^^^^^^^^^
683 759
684 760 ``decompressobj()`` returns an object that exposes a ``decompress(data)``
685 761 method. Compressed data chunks are fed into ``decompress(data)`` and
686 762 uncompressed output (or an empty bytes) is returned. Output from subsequent
687 763 calls needs to be concatenated to reassemble the full decompressed byte
688 764 sequence.
689 765
690 766 The purpose of ``decompressobj()`` is to provide an API-compatible interface
691 767 with ``zlib.decompressobj`` and ``bz2.BZ2Decompressor``. This allows callers
692 768 to swap in different decompressor objects while using the same API.
693 769
694 770 Each object is single use: once an input frame is decoded, ``decompress()``
695 771 can no longer be called.
696 772
697 773 Here is how this API should be used::
698 774
699 775 dctx = zstd.ZstdDecompressor()
700 776 dobj = dctx.decompressobj()
701 777 data = dobj.decompress(compressed_chunk_0)
702 778 data = dobj.decompress(compressed_chunk_1)
703 779
704 780 By default, calls to ``decompress()`` write output data in chunks of size
705 781 ``DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE``. These chunks are concatenated
706 782 before being returned to the caller. It is possible to define the size of
707 783 these temporary chunks by passing ``write_size`` to ``decompressobj()``::
708 784
709 785 dctx = zstd.ZstdDecompressor()
710 786 dobj = dctx.decompressobj(write_size=1048576)
711 787
712 788 .. note::
713 789
714 790 Because calls to ``decompress()`` may need to perform multiple
715 791 memory (re)allocations, this streaming decompression API isn't as
716 792 efficient as other APIs.
717 793
718 794 Batch Decompression API
719 795 ^^^^^^^^^^^^^^^^^^^^^^^
720 796
721 797 (Experimental. Not yet supported in CFFI bindings.)
722 798
723 799 ``multi_decompress_to_buffer()`` performs decompression of multiple
724 800 frames as a single operation and returns a ``BufferWithSegmentsCollection``
725 801 containing decompressed data for all inputs.
726 802
727 803 Compressed frames can be passed to the function as a ``BufferWithSegments``,
728 804 a ``BufferWithSegmentsCollection``, or as a list containing objects that
729 805 conform to the buffer protocol. For best performance, pass a
730 806 ``BufferWithSegmentsCollection`` or a ``BufferWithSegments``, as
731 807 minimal input validation will be done for that type. If calling from
732 808 Python (as opposed to C), constructing one of these instances may add
733 809 overhead, cancelling out the benefit of the reduced validation for list
734 810 inputs.::
735 811
736 812 dctx = zstd.ZstdDecompressor()
737 813 results = dctx.multi_decompress_to_buffer([b'...', b'...'])
738 814
739 815 The decompressed size of each frame MUST be discoverable. It can either be
740 816 embedded within the zstd frame (``write_content_size=True`` argument to
741 817 ``ZstdCompressor``) or passed in via the ``decompressed_sizes`` argument.
742 818
743 819 The ``decompressed_sizes`` argument is an object conforming to the buffer
744 820 protocol which holds an array of 64-bit unsigned integers in the machine's
745 821 native format defining the decompressed sizes of each frame. If this argument
746 822 is passed, it avoids having to scan each frame for its decompressed size.
747 823 This frame scanning can add noticeable overhead in some scenarios.::
748 824
749 825 frames = [...]
750 826 sizes = struct.pack('=QQQQ', len0, len1, len2, len3)
751 827
752 828 dctx = zstd.ZstdDecompressor()
753 829 results = dctx.multi_decompress_to_buffer(frames, decompressed_sizes=sizes)
754 830
755 831 The ``threads`` argument controls the number of threads to use to perform
756 832 decompression operations. The default (``0``) or the value ``1`` means to
757 833 use a single thread. Negative values use the number of logical CPUs in the
758 834 machine.
759 835
760 836 .. note::
761 837
762 838 It is possible to pass a ``mmap.mmap()`` instance into this function by
763 839 wrapping it with a ``BufferWithSegments`` instance (which will define the
764 840 offsets of frames within the memory mapped region).
765 841
766 842 This function is logically equivalent to performing ``dctx.decompress()``
767 843 on each input frame and returning the result.
768 844
769 845 This function exists to perform decompression on multiple frames as fast
770 846 as possible by having as little overhead as possible. Since decompression is
771 847 performed as a single operation and since the decompressed output is stored in
772 848 a single buffer, extra memory allocations, Python objects, and Python function
773 849 calls are avoided. This is ideal for scenarios where callers know up front that
774 850 they need to access data for multiple frames, such as when *delta chains* are
775 851 being used.
776 852
777 853 Currently, the implementation always spawns multiple threads when requested,
778 854 even if the amount of work to do is small. In the future, it will be smarter
779 855 about avoiding threads and their associated overhead when the amount of
780 856 work to do is small.
781 857
782 858 Prefix Dictionary Chain Decompression
783 859 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
784 860
785 861 ``decompress_content_dict_chain(frames)`` performs decompression of a list of
786 862 zstd frames produced using chained *prefix* dictionary compression. Such
787 863 a list of frames is produced by compressing discrete inputs where each
788 864 non-initial input is compressed with a *prefix* dictionary consisting of the
789 865 content of the previous input.
790 866
791 867 For example, say you have the following inputs::
792 868
793 869 inputs = [b'input 1', b'input 2', b'input 3']
794 870
795 871 The zstd frame chain consists of:
796 872
797 873 1. ``b'input 1'`` compressed in standalone/discrete mode
798 874 2. ``b'input 2'`` compressed using ``b'input 1'`` as a *prefix* dictionary
799 875 3. ``b'input 3'`` compressed using ``b'input 2'`` as a *prefix* dictionary
800 876
801 877 Each zstd frame **must** have the content size written.
802 878
803 879 The following Python code can be used to produce a *prefix dictionary chain*::
804 880
805 881 def make_chain(inputs):
806 882 frames = []
807 883
808 884 # First frame is compressed in standalone/discrete mode.
809 885 zctx = zstd.ZstdCompressor()
810 886 frames.append(zctx.compress(inputs[0]))
811 887
812 888 # Subsequent frames use the previous fulltext as a prefix dictionary
813 889 for i, raw in enumerate(inputs[1:]):
814 890 dict_data = zstd.ZstdCompressionDict(
815 891 inputs[i], dict_type=zstd.DICT_TYPE_RAWCONTENT)
816 892 zctx = zstd.ZstdCompressor(dict_data=dict_data)
817 893 frames.append(zctx.compress(raw))
818 894
819 895 return frames
820 896
821 897 ``decompress_content_dict_chain()`` returns the uncompressed data of the last
822 898 element in the input chain.
823 899
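Given the ``make_chain()`` helper above, decompressing the chain back to the
final input is a single call (a minimal sketch)::

   frames = make_chain(inputs)

   dctx = zstd.ZstdDecompressor()
   last = dctx.decompress_content_dict_chain(frames)
   assert last == inputs[-1]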
824 900
825 901 .. note::
826 902
827 903 It is possible to implement *prefix dictionary chain* decompression
828 904 on top of other APIs. However, this function will likely be faster -
829 905 especially for long input chains - as it avoids the overhead of instantiating
830 906 and passing around intermediate objects between C and Python.
831 907
832 908 Multi-Threaded Compression
833 909 --------------------------
834 910
835 911 ``ZstdCompressor`` accepts a ``threads`` argument that controls the number
836 912 of threads to use for compression. The way this works is that input is split
837 913 into segments and each segment is fed into a worker pool for compression. Once
838 914 a segment is compressed, it is flushed/appended to the output.
839 915
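Multi-threaded compression is requested via the ``threads`` argument. A
minimal sketch, where ``data`` is assumed to be a large ``bytes`` instance::

   # Negative values resolve to the number of logical CPUs.
   cctx = zstd.ZstdCompressor(threads=-1)
   compressed = cctx.compress(data)
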
840 916 .. note::
841 917
842 918 These threads are created at the C layer and are not Python threads. So they
843 919 work outside the GIL. It is therefore possible to CPU saturate multiple cores
844 920 from Python.
845 921
846 922 The segment size for multi-threaded compression is chosen from the window size
847 923 of the compressor. This is derived from the ``window_log`` attribute of a
848 924 ``ZstdCompressionParameters`` instance. By default, segment sizes are in the 1+MB
849 925 range.
850 926
851 927 If multi-threaded compression is requested and the input is smaller than the
852 928 configured segment size, only a single compression thread will be used. If the
853 929 input is smaller than the segment size multiplied by the thread pool size or
854 930 if data cannot be delivered to the compressor fast enough, not all requested
855 931 compressor threads may be active simultaneously.
856 932
857 933 Compared to non-multi-threaded compression, multi-threaded compression has
858 934 higher per-operation overhead. This includes extra memory operations,
859 935 thread creation, lock acquisition, etc.
860 936
861 937 Due to the nature of multi-threaded compression using *N* compression
862 938 *states*, the output from multi-threaded compression will likely be larger
863 939 than non-multi-threaded compression. The difference is usually small. But
864 940 there is a CPU/wall time versus size trade off that may warrant investigation.
865 941
866 942 Output from multi-threaded compression does not require any special handling
867 943 on the decompression side. To the decompressor, data generated with single
868 944 threaded compressor looks the same as data generated by a multi-threaded
869 945 compressor and does not require any special handling or additional resource
870 946 requirements.
871 947
872 948 Dictionary Creation and Management
873 949 ----------------------------------
874 950
875 951 Compression dictionaries are represented with the ``ZstdCompressionDict`` type.
876 952
877 953 Instances can be constructed from bytes::
878 954
879 955 dict_data = zstd.ZstdCompressionDict(data)
880 956
881 957 It is possible to construct a dictionary from *any* data. If the data doesn't
882 958 begin with a magic header, it will be treated as a *prefix* dictionary.
883 959 *Prefix* dictionaries allow compression operations to reference raw data
884 960 within the dictionary.
885 961
886 962 It is possible to force the use of *prefix* dictionaries or to require a
887 963 dictionary header::
888 964
889 965 dict_data = zstd.ZstdCompressionDict(data,
890 966 dict_type=zstd.DICT_TYPE_RAWCONTENT)
891 967
892 968 dict_data = zstd.ZstdCompressionDict(data,
893 969 dict_type=zstd.DICT_TYPE_FULLDICT)
894 970
895 971 You can see how many bytes are in the dictionary by calling ``len()``::
896 972
897 973 dict_data = zstd.train_dictionary(size, samples)
898 974 dict_size = len(dict_data) # will not be larger than ``size``
899 975
900 976 Once you have a dictionary, you can pass it to the objects performing
901 977 compression and decompression::
902 978
903 979 dict_data = zstd.train_dictionary(131072, samples)
904 980
905 981 cctx = zstd.ZstdCompressor(dict_data=dict_data)
906 982 for source_data in input_data:
907 983 compressed = cctx.compress(source_data)
908 984 # Do something with compressed data.
909 985
910 986 dctx = zstd.ZstdDecompressor(dict_data=dict_data)
911 987 for compressed_data in input_data:
912 988 buffer = io.BytesIO()
913 989 with dctx.stream_writer(buffer) as decompressor:
914 990 decompressor.write(compressed_data)
915 991 # Do something with raw data in ``buffer``.
916 992
917 993 Dictionaries have unique integer IDs. You can retrieve this ID via::
918 994
919 995 dict_id = zstd.dictionary_id(dict_data)
920 996
921 997 You can obtain the raw data in the dict (useful for persisting and constructing
922 998 a ``ZstdCompressionDict`` later) via ``as_bytes()``::
923 999
924 1000 dict_data = zstd.train_dictionary(size, samples)
925 1001 raw_data = dict_data.as_bytes()
926 1002
927 1003 By default, when a ``ZstdCompressionDict`` is *attached* to a
928 1004 ``ZstdCompressor``, each ``ZstdCompressor`` performs work to prepare the
929 1005 dictionary for use. This is fine if only 1 compression operation is being
930 1006 performed or if the ``ZstdCompressor`` is being reused for multiple operations.
931 1007 But if multiple ``ZstdCompressor`` instances are being used with the dictionary,
932 1008 this can add overhead.
933 1009
934 1010 It is possible to *precompute* the dictionary so it can readily be consumed
935 1011 by multiple ``ZstdCompressor`` instances::
936 1012
937 1013 d = zstd.ZstdCompressionDict(data)
938 1014
939 1015 # Precompute for compression level 3.
940 1016 d.precompute_compress(level=3)
941 1017
942 1018 # Precompute with specific compression parameters.
943 1019 params = zstd.ZstdCompressionParameters(...)
944 1020 d.precompute_compress(compression_params=params)
945 1021
946 1022 .. note::
947 1023
948 1024 When a dictionary is precomputed, the compression parameters used to
949 1025 precompute the dictionary overwrite some of the compression parameters
950 1026 specified to ``ZstdCompressor.__init__``.
951 1027
952 1028 Training Dictionaries
953 1029 ^^^^^^^^^^^^^^^^^^^^^
954 1030
955 1031 Unless using *prefix* dictionaries, dictionary data is produced by *training*
956 1032 on existing data::
957 1033
958 1034 dict_data = zstd.train_dictionary(size, samples)
959 1035
960 1036 This takes a target dictionary size and list of bytes instances and creates and
961 1037 returns a ``ZstdCompressionDict``.
962 1038
963 1039 The dictionary training mechanism is known as *cover*. More details about it are
964 1040 available in the paper *Effective Construction of Relative Lempel-Ziv
965 1041 Dictionaries* (authors: Liao, Petri, Moffat, Wirth).
966 1042
967 1043 The cover algorithm takes parameters ``k`` and ``d``. These are the
968 1044 *segment size* and *dmer size*, respectively. The returned dictionary
969 1045 instance created by this function has ``k`` and ``d`` attributes
970 1046 containing the values for these parameters. If a ``ZstdCompressionDict``
971 1047 is constructed from raw bytes data (a content-only dictionary), the
972 1048 ``k`` and ``d`` attributes will be ``0``.
973 1049
974 1050 The segment and dmer size parameters to the cover algorithm can either be
975 1051 specified manually or ``train_dictionary()`` can try multiple values
976 1052 and pick the best one, where *best* means the smallest compressed data size.
977 1053 This latter mode is called *optimization* mode.
978 1054
979 1055 If none of ``k``, ``d``, ``steps``, ``threads``, ``level``, ``notifications``,
980 1056 or ``dict_id`` (basically anything from the underlying ``ZDICT_cover_params_t``
981 1057 struct) are defined, *optimization* mode is used with default parameter
982 1058 values.
983 1059
984 1060 If ``steps`` or ``threads`` are defined, then *optimization* mode is engaged
985 1061 with explicit control over those parameters. Specifying ``threads=0`` or
986 1062 ``threads=1`` can be used to engage *optimization* mode if other parameters
987 1063 are not defined.
988 1064
989 1065 Otherwise, non-*optimization* mode is used with the parameters specified.
990 1066
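For example, a minimal sketch explicitly engaging *optimization* mode
(``samples`` is assumed to be a list of ``bytes`` instances)::

   dict_data = zstd.train_dictionary(
       131072,
       samples,
       threads=-1,  # Try parameter variations using all logical CPUs.
       level=3,
   )
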
991 1067 This function takes the following arguments:
992 1068
993 1069 dict_size
994 1070 Target size in bytes of the dictionary to generate.
995 1071 samples
996 1072 A list of bytes holding samples the dictionary will be trained from.
997 1073 k
998 1074 Parameter to cover algorithm defining the segment size. A reasonable range
999 1075 is [16, 2048+].
1000 1076 d
1001 1077 Parameter to cover algorithm defining the dmer size. A reasonable range is
1002 1078 [6, 16]. ``d`` must be less than or equal to ``k``.
1003 1079 dict_id
1004 1080 Integer dictionary ID for the produced dictionary. Default is 0, which uses
1005 1081 a random value.
1006 1082 steps
1007 1083 Number of steps through ``k`` values to perform when trying parameter
1008 1084 variations.
1009 1085 threads
1010 1086 Number of threads to use when trying parameter variations. Default is 0,
1011 1087 which means to use a single thread. A negative value can be specified to
1012 1088 use as many threads as there are detected logical CPUs.
1013 1089 level
1014 1090 Integer target compression level when trying parameter variations.
1015 1091 notifications
1016 1092 Controls writing of informational messages to ``stderr``. ``0`` (the
1017 1093 default) means to write nothing. ``1`` writes errors. ``2`` writes
1018 1094 progression info. ``3`` writes more details. And ``4`` writes all info.
1019 1095
1020 1096 Explicit Compression Parameters
1021 1097 -------------------------------
1022 1098
1023 1099 Zstandard offers a high-level *compression level* that maps to lower-level
1024 1100 compression parameters. For many consumers, this numeric level is the only
1025 1101 compression setting you'll need to touch.
1026 1102
1027 1103 But for advanced use cases, it might be desirable to tweak these lower-level
1028 1104 settings.
1029 1105
1030 1106 The ``ZstdCompressionParameters`` type represents these low-level compression
1031 1107 settings.
1032 1108
1033 1109 Instances of this type can be constructed from a myriad of keyword arguments
1034 1110 (defined below) for complete low-level control over each adjustable
1035 1111 compression setting.
1036 1112
1037 1113 From a higher level, one can construct a ``ZstdCompressionParameters`` instance
1038 1114 given a desired compression level and target input and dictionary size
1039 1115 using ``ZstdCompressionParameters.from_level()``. e.g.::
1040 1116
1041 1117 # Derive compression settings for compression level 7.
1042 1118 params = zstd.ZstdCompressionParameters.from_level(7)
1043 1119
1044 1120 # With an input size of 1MB
1045 1121 params = zstd.ZstdCompressionParameters.from_level(7, source_size=1048576)
1046 1122
1047 1123 Using ``from_level()``, it is also possible to override individual compression
1048 1124 parameters or to define additional settings that aren't automatically derived.
1049 1125 e.g.::
1050 1126
1051 1127 params = zstd.ZstdCompressionParameters.from_level(4, window_log=10)
1052 1128 params = zstd.ZstdCompressionParameters.from_level(5, threads=4)
1053 1129
1054 1130 Or you can define low-level compression settings directly::
1055 1131
1056 1132 params = zstd.ZstdCompressionParameters(window_log=12, enable_ldm=True)
1057 1133
1058 1134 Once a ``ZstdCompressionParameters`` instance is obtained, it can be used to
1059 1135 configure a compressor::
1060 1136
1061 1137 cctx = zstd.ZstdCompressor(compression_params=params)
1062 1138
1063 1139 The named arguments and attributes of ``ZstdCompressionParameters`` are as
1064 1140 follows:
1065 1141
1066 1142 * format
1067 1143 * compression_level
1068 1144 * window_log
1069 1145 * hash_log
1070 1146 * chain_log
1071 1147 * search_log
1072 1148 * min_match
1073 1149 * target_length
1074 1150 * compression_strategy
1075 1151 * write_content_size
1076 1152 * write_checksum
1077 1153 * write_dict_id
1078 1154 * job_size
1079 1155 * overlap_size_log
1080 * compress_literals
1081 1156 * force_max_window
1082 1157 * enable_ldm
1083 1158 * ldm_hash_log
1084 1159 * ldm_min_match
1085 1160 * ldm_bucket_size_log
1086 1161 * ldm_hash_every_log
1087 1162 * threads
1088 1163
1089 1164 Some of these are very low-level settings. It may help to consult the official
1090 1165 zstandard documentation for their behavior. Look for the ``ZSTD_p_*`` constants
1091 1166 in ``zstd.h`` (https://github.com/facebook/zstd/blob/dev/lib/zstd.h).
1092 1167
1093 1168 Frame Inspection
1094 1169 ----------------
1095 1170
1096 1171 Data emitted from zstd compression is encapsulated in a *frame*. This frame
1097 1172 begins with a 4 byte *magic number* header followed by 2 to 14 bytes describing
1098 1173 the frame in more detail. For more info, see
1099 1174 https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md.
1100 1175
1101 1176 ``zstd.get_frame_parameters(data)`` parses a zstd *frame* header from a bytes
1102 1177 instance and returns a ``FrameParameters`` object describing the frame.
1103 1178
1104 1179 Depending on which fields are present in the frame and their values, the
1105 1180 length of the frame parameters varies. If insufficient bytes are passed
1106 1181 in to fully parse the frame parameters, ``ZstdError`` is raised. To ensure
1107 1182 frame parameters can be parsed, pass in at least 18 bytes.
1108 1183
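For example::

   cctx = zstd.ZstdCompressor(write_checksum=True)
   frame = cctx.compress(b'data to compress')

   params = zstd.get_frame_parameters(frame)
   # params.content_size, params.window_size, params.dict_id, and
   # params.has_checksum are now available (see below).
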
1109 1184 ``FrameParameters`` instances have the following attributes:
1110 1185
1111 1186 content_size
1112 1187 Integer size of original, uncompressed content. This will be ``0`` if the
1113 1188 original content size isn't written to the frame (controlled with the
1114 1189 ``write_content_size`` argument to ``ZstdCompressor``) or if the input
1115 1190 content size was ``0``.
1116 1191
1117 1192 window_size
1118 1193 Integer size of maximum back-reference distance in compressed data.
1119 1194
1120 1195 dict_id
1121 1196 Integer of dictionary ID used for compression. ``0`` if no dictionary
1122 1197 ID was used or if the dictionary ID was ``0``.
1123 1198
1124 1199 has_checksum
1125 1200 Bool indicating whether a 4 byte content checksum is stored at the end
1126 1201 of the frame.
1127 1202
1128 1203 ``zstd.frame_header_size(data)`` returns the size of the zstandard frame
1129 1204 header.
1130 1205
1131 1206 ``zstd.frame_content_size(data)`` returns the content size as parsed from
1132 1207 the frame header. ``-1`` means the content size is unknown. ``0`` means
1133 1208 an empty frame. The content size is usually correct, but it is not
1134 1209 guaranteed to be accurate.
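
For example::

   frame = zstd.ZstdCompressor().compress(b'data to compress')

   header_size = zstd.frame_header_size(frame)
   content_size = zstd.frame_content_size(frame)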
1135 1210
1136 1211 Misc Functionality
1137 1212 ------------------
1138 1213
1139 1214 estimate_decompression_context_size()
1140 1215 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1141 1216
1142 1217 Estimate the memory size requirements for a decompressor instance.
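
For example::

   size = zstd.estimate_decompression_context_size()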
1143 1218
1144 1219 Constants
1145 1220 ---------
1146 1221
1147 1222 The following module constants/attributes are exposed:
1148 1223
1149 1224 ZSTD_VERSION
1150 1225 This module attribute exposes a 3-tuple of the Zstandard version. e.g.
1151 1226 ``(1, 0, 0)``
1152 1227 MAX_COMPRESSION_LEVEL
1153 1228 Integer max compression level accepted by compression functions
1154 1229 COMPRESSION_RECOMMENDED_INPUT_SIZE
1155 1230 Recommended chunk size to feed to compressor functions
1156 1231 COMPRESSION_RECOMMENDED_OUTPUT_SIZE
1157 1232 Recommended chunk size for compression output
1158 1233 DECOMPRESSION_RECOMMENDED_INPUT_SIZE
1159 1234 Recommended chunk size to feed into decompressor functions
1160 1235 DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE
1161 1236 Recommended chunk size for decompression output
1162 1237
1163 1238 FRAME_HEADER
1164 1239 bytes containing header of the Zstandard frame
1165 1240 MAGIC_NUMBER
1166 1241 Frame header as an integer
1167 1242
1168 1243 CONTENTSIZE_UNKNOWN
1169 1244 Value for content size when the content size is unknown.
1170 1245 CONTENTSIZE_ERROR
1171 1246 Value for content size when content size couldn't be determined.
1172 1247
1173 1248 WINDOWLOG_MIN
1174 1249 Minimum value for compression parameter
1175 1250 WINDOWLOG_MAX
1176 1251 Maximum value for compression parameter
1177 1252 CHAINLOG_MIN
1178 1253 Minimum value for compression parameter
1179 1254 CHAINLOG_MAX
1180 1255 Maximum value for compression parameter
1181 1256 HASHLOG_MIN
1182 1257 Minimum value for compression parameter
1183 1258 HASHLOG_MAX
1184 1259 Maximum value for compression parameter
1185 1260 SEARCHLOG_MIN
1186 1261 Minimum value for compression parameter
1187 1262 SEARCHLOG_MAX
1188 1263 Maximum value for compression parameter
1189 1264 SEARCHLENGTH_MIN
1190 1265 Minimum value for compression parameter
1191 1266 SEARCHLENGTH_MAX
1192 1267 Maximum value for compression parameter
1193 1268 TARGETLENGTH_MIN
1194 1269 Minimum value for compression parameter
1195 1270 STRATEGY_FAST
1196 1271 Compression strategy
1197 1272 STRATEGY_DFAST
1198 1273 Compression strategy
1199 1274 STRATEGY_GREEDY
1200 1275 Compression strategy
1201 1276 STRATEGY_LAZY
1202 1277 Compression strategy
1203 1278 STRATEGY_LAZY2
1204 1279 Compression strategy
1205 1280 STRATEGY_BTLAZY2
1206 1281 Compression strategy
1207 1282 STRATEGY_BTOPT
1208 1283 Compression strategy
1209 1284 STRATEGY_BTULTRA
1210 1285 Compression strategy
1211 1286
1212 1287 FORMAT_ZSTD1
1213 1288 Zstandard frame format
1214 1289 FORMAT_ZSTD1_MAGICLESS
1215 1290 Zstandard frame format without magic header
1216 1291
1217 1292 Performance Considerations
1218 1293 --------------------------
1219 1294
1220 1295 The ``ZstdCompressor`` and ``ZstdDecompressor`` types maintain state to a
1221 1296 persistent compression or decompression *context*. Reusing a ``ZstdCompressor``
1222 1297 or ``ZstdDecompressor`` instance for multiple operations is faster than
1223 1298 instantiating a new ``ZstdCompressor`` or ``ZstdDecompressor`` for each
1224 1299 operation. The differences are magnified as the size of data decreases. For
1225 1300 example, the difference between *context* reuse and non-reuse for 100,000
1226 1301 100 byte inputs will be significant (possibly over 10x faster to reuse contexts)
1227 1302 whereas 10 100,000,000 byte inputs will be more similar in speed (because the
1228 1303 time spent doing compression dwarfs time spent creating new *contexts*).
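
For example, a minimal sketch preferring reuse (``inputs`` is assumed to be an
iterable of small ``bytes`` objects)::

   cctx = zstd.ZstdCompressor()
   compressed = [cctx.compress(chunk) for chunk in inputs]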
1229 1304
1230 1305 Buffer Types
1231 1306 ------------
1232 1307
1233 1308 The API exposes a handful of custom types for interfacing with memory buffers.
1234 1309 The primary goal of these types is to facilitate efficient multi-object
1235 1310 operations.
1236 1311
1237 1312 The essential idea is to have a single memory allocation provide backing
1238 1313 storage for multiple logical objects. This has 2 main advantages: fewer
1239 1314 allocations and optimal memory access patterns. This avoids having to allocate
1240 1315 a Python object for each logical object and furthermore ensures that access of
1241 1316 data for objects can be sequential (read: fast) in memory.
1242 1317
1243 1318 BufferWithSegments
1244 1319 ^^^^^^^^^^^^^^^^^^
1245 1320
1246 1321 The ``BufferWithSegments`` type represents a memory buffer containing N
1247 1322 discrete items of known lengths (segments). It is essentially a fixed size
1248 1323 memory address and an array of 2-tuples of ``(offset, length)`` 64-bit
1249 1324 unsigned native endian integers defining the byte offset and length of each
1250 1325 segment within the buffer.
1251 1326
1252 1327 Instances behave like containers.
1253 1328
1254 1329 ``len()`` returns the number of segments within the instance.
1255 1330
1256 1331 ``o[index]`` or ``__getitem__`` obtains a ``BufferSegment`` representing an
1257 1332 individual segment within the backing buffer. That returned object references
1258 1333 (not copies) memory. This means that iterating all objects doesn't copy
1259 1334 data within the buffer.
1260 1335
1261 1336 The ``.size`` attribute contains the total size in bytes of the backing
1262 1337 buffer.
1263 1338
1264 1339 Instances conform to the buffer protocol. So a reference to the backing bytes
1265 1340 can be obtained via ``memoryview(o)``. A *copy* of the backing bytes can also
1266 1341 be obtained via ``.tobytes()``.
1267 1342
1268 1343 The ``.segments`` attribute exposes the array of ``(offset, length)`` for
1269 1344 segments within the buffer. It is a ``BufferSegments`` type.
1270 1345
1271 1346 BufferSegment
1272 1347 ^^^^^^^^^^^^^
1273 1348
1274 1349 The ``BufferSegment`` type represents a segment within a ``BufferWithSegments``.
1275 1350 It is essentially a reference to N bytes within a ``BufferWithSegments``.
1276 1351
1277 1352 ``len()`` returns the length of the segment in bytes.
1278 1353
1279 1354 ``.offset`` contains the byte offset of this segment within its parent
1280 1355 ``BufferWithSegments`` instance.
1281 1356
1282 1357 The object conforms to the buffer protocol. ``.tobytes()`` can be called to
1283 1358 obtain a ``bytes`` instance with a copy of the backing bytes.
1284 1359
1285 1360 BufferSegments
1286 1361 ^^^^^^^^^^^^^^
1287 1362
1288 1363 This type represents an array of ``(offset, length)`` integers defining segments
1289 1364 within a ``BufferWithSegments``.
1290 1365
1291 1366 The array members are 64-bit unsigned integers using host/native bit order.
1292 1367
1293 1368 Instances conform to the buffer protocol.
1294 1369
1295 1370 BufferWithSegmentsCollection
1296 1371 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1297 1372
1298 1373 The ``BufferWithSegmentsCollection`` type represents a virtual spanning view
1299 1374 of multiple ``BufferWithSegments`` instances.
1300 1375
1301 1376 Instances are constructed from 1 or more ``BufferWithSegments`` instances. The
1302 1377 resulting object behaves like an ordered sequence whose members are the
1303 1378 segments within each ``BufferWithSegments``.
1304 1379
1305 1380 ``len()`` returns the number of segments within all ``BufferWithSegments``
1306 1381 instances.
1307 1382
1308 1383 ``o[index]`` and ``__getitem__(index)`` return the ``BufferSegment`` at
1309 1384 that offset as if all ``BufferWithSegments`` instances were a single
1310 1385 entity.
1311 1386
1312 1387 If the object is composed of 2 ``BufferWithSegments`` instances with the
1313 1388 first having 2 segments and the second having 3 segments, then ``b[0]``
1314 1389 and ``b[1]`` access segments in the first object and ``b[2]``, ``b[3]``,
1315 1390 and ``b[4]`` access segments from the second.
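
For example, a minimal sketch iterating segments from a collection produced by
``multi_compress_to_buffer()``::

   cctx = zstd.ZstdCompressor()
   collection = cctx.multi_compress_to_buffer([b'input 0', b'input 1'])

   for i in range(len(collection)):
       segment = collection[i]
       # The segment references memory in a shared buffer; tobytes() copies it.
       frame = segment.tobytes()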
1316 1391
1317 1392 Choosing an API
1318 1393 ===============
1319 1394
1320 1395 There are multiple APIs for performing compression and decompression. This is
1321 1396 because different applications have different needs and the library wants to
1322 1397 facilitate optimal use in as many use cases as possible.
1323 1398
1324 1399 From a high-level, APIs are divided into *one-shot* and *streaming*: either you
1325 1400 are operating on all data at once or you operate on it piecemeal.
1326 1401
1327 1402 The *one-shot* APIs are useful for small data, where the input or output
1328 1403 size is known. (The size can come from a buffer length, file size, or
1329 1404 stored in the zstd frame header.) A limitation of the *one-shot* APIs is that
1330 1405 input and output must fit in memory simultaneously. For, say, a 4 GB input,
1331 1406 this is often not feasible.
1332 1407
1333 1408 The *one-shot* APIs also perform all work as a single operation. So, if you
1334 1409 feed it large input, it could take a long time for the function to return.
1335 1410
1336 1411 The streaming APIs do not have the limitations of the simple API. But the
1337 1412 price you pay for this flexibility is that they are more complex than a
1338 1413 single function call.
1339 1414
1340 1415 The streaming APIs put the caller in control of compression and decompression
1341 1416 behavior by allowing them to directly control either the input or output side
1342 1417 of the operation.
1343 1418
1344 1419 With the *streaming input*, *compressor*, and *decompressor* APIs, the caller
1345 1420 has full control over the input to the compression or decompression stream.
1346 1421 They can directly choose when new data is operated on.
1347 1422
1348 1423 With the *streaming output* APIs, the caller has full control over the output
1349 1424 of the compression or decompression stream. It can choose when to receive
1350 1425 new data.
1351 1426
1352 1427 When using the *streaming* APIs that operate on file-like or stream objects,
1353 1428 it is important to consider what happens in that object when I/O is requested.
1354 1429 There is potential for long pauses as data is read or written from the
1355 1430 underlying stream (say from interacting with a filesystem or network). This
1356 1431 could add considerable overhead.
1357 1432
1358 1433 Thread Safety
1359 1434 =============
1360 1435
1361 1436 ``ZstdCompressor`` and ``ZstdDecompressor`` instances have no guarantees
1362 1437 about thread safety. Do not operate on the same ``ZstdCompressor`` and
1363 1438 ``ZstdDecompressor`` instance simultaneously from different threads. It is
1364 1439 fine to have different threads call into a single instance, just not at the
1365 1440 same time.
1366 1441
1367 1442 Some operations require multiple function calls to complete. e.g. streaming
1368 1443 operations. A single ``ZstdCompressor`` or ``ZstdDecompressor`` cannot be used
1369 1444 for simultaneously active operations. e.g. you must not start a streaming
1370 1445 operation when another streaming operation is already active.
1371 1446
1372 1447 The C extension releases the GIL during non-trivial calls into the zstd C
1373 1448 API. Non-trivial calls are notably compression and decompression. Trivial
1374 1449 calls are things like parsing frame parameters. Where the GIL is released
1375 1450 is considered an implementation detail and can change in any release.
1376 1451
1377 1452 APIs that accept bytes-like objects don't enforce that the underlying object
1378 1453 is read-only. However, it is assumed that the passed object is read-only for
1379 1454 the duration of the function call. It is possible to pass a mutable object
1380 1455 (like a ``bytearray``) to e.g. ``ZstdCompressor.compress()``, have the GIL
1381 1456 released, and mutate the object from another thread. Such a race condition
1382 1457 is a bug in the consumer of python-zstandard. Most Python data types are
1383 1458 immutable, so unless you are doing something fancy, you don't need to
1384 1459 worry about this.
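
If a mutable buffer might be modified by another thread, one defensive option
(a sketch, not something the API requires) is to snapshot it into immutable
``bytes`` before handing it to the compressor::

    import zstandard as zstd

    cctx = zstd.ZstdCompressor()
    buf = bytearray(b"x" * (1 << 20))

    # bytes(buf) copies the data, so later mutations of ``buf`` by other
    # threads cannot race with compress() while the GIL is released.
    frame = cctx.compress(bytes(buf))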
1385 1460
1386 1461 Note on Zstandard's *Experimental* API
1387 1462 ======================================
1388 1463
1389 1464 Many of the Zstandard APIs used by this module are marked as *experimental*
1390 1465 within the Zstandard project.
1391 1466
1392 1467 It is unclear how Zstandard's C API will evolve over time, especially with
1393 1468 regard to this *experimental* functionality. We will try to maintain
1394 1469 backwards compatibility at the Python API level. However, we cannot
1395 1470 guarantee this for things not under our control.
1396 1471
1397 1472 Since a copy of the Zstandard source code is distributed with this
1398 1473 module and since we compile against it, the behavior of a specific
1399 1474 version of this module should be constant for all of time. So if you
1400 1475 pin the version of this module used in your projects (which is a Python
1401 1476 best practice), you should be shielded from unwanted future changes.
1402 1477
1403 1478 Donate
1404 1479 ======
1405 1480
1406 1481 A lot of time has been invested in this project by the author.
1407 1482
1408 1483 If you find this project useful and would like to thank the author for
1409 1484 their work, consider donating some money. Any amount is appreciated.
1410 1485
1411 1486 .. image:: https://www.paypalobjects.com/en_US/i/btn/btn_donate_LG.gif
1412 1487 :target: https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=gregory%2eszorc%40gmail%2ecom&lc=US&item_name=python%2dzstandard&currency_code=USD&bn=PP%2dDonationsBF%3abtn_donate_LG%2egif%3aNonHosted
1413 1488 :alt: Donate via PayPal
1414 1489
1415 1490 .. |ci-status| image:: https://travis-ci.org/indygreg/python-zstandard.svg?branch=master
1416 1491 :target: https://travis-ci.org/indygreg/python-zstandard
1417 1492
1418 1493 .. |win-ci-status| image:: https://ci.appveyor.com/api/projects/status/github/indygreg/python-zstandard?svg=true
1419 1494 :target: https://ci.appveyor.com/project/indygreg/python-zstandard
1420 1495 :alt: Windows build status
@@ -1,502 +1,477 b''
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 int set_parameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, unsigned value) {
14 14 size_t zresult = ZSTD_CCtxParam_setParameter(params, param, value);
15 15 if (ZSTD_isError(zresult)) {
16 16 PyErr_Format(ZstdError, "unable to set compression context parameter: %s",
17 17 ZSTD_getErrorName(zresult));
18 18 return 1;
19 19 }
20 20
21 21 return 0;
22 22 }
23 23
24 24 #define TRY_SET_PARAMETER(params, param, value) if (set_parameter(params, param, value)) return -1;
25 25
26 26 int set_parameters(ZSTD_CCtx_params* params, ZstdCompressionParametersObject* obj) {
27 27 TRY_SET_PARAMETER(params, ZSTD_p_format, obj->format);
28 28 TRY_SET_PARAMETER(params, ZSTD_p_compressionLevel, (unsigned)obj->compressionLevel);
29 29 TRY_SET_PARAMETER(params, ZSTD_p_windowLog, obj->windowLog);
30 30 TRY_SET_PARAMETER(params, ZSTD_p_hashLog, obj->hashLog);
31 31 TRY_SET_PARAMETER(params, ZSTD_p_chainLog, obj->chainLog);
32 32 TRY_SET_PARAMETER(params, ZSTD_p_searchLog, obj->searchLog);
33 33 TRY_SET_PARAMETER(params, ZSTD_p_minMatch, obj->minMatch);
34 34 TRY_SET_PARAMETER(params, ZSTD_p_targetLength, obj->targetLength);
35 35 TRY_SET_PARAMETER(params, ZSTD_p_compressionStrategy, obj->compressionStrategy);
36 36 TRY_SET_PARAMETER(params, ZSTD_p_contentSizeFlag, obj->contentSizeFlag);
37 37 TRY_SET_PARAMETER(params, ZSTD_p_checksumFlag, obj->checksumFlag);
38 38 TRY_SET_PARAMETER(params, ZSTD_p_dictIDFlag, obj->dictIDFlag);
39 39 TRY_SET_PARAMETER(params, ZSTD_p_nbWorkers, obj->threads);
40 40 TRY_SET_PARAMETER(params, ZSTD_p_jobSize, obj->jobSize);
41 41 TRY_SET_PARAMETER(params, ZSTD_p_overlapSizeLog, obj->overlapSizeLog);
42 TRY_SET_PARAMETER(params, ZSTD_p_compressLiterals, obj->compressLiterals);
43 42 TRY_SET_PARAMETER(params, ZSTD_p_forceMaxWindow, obj->forceMaxWindow);
44 43 TRY_SET_PARAMETER(params, ZSTD_p_enableLongDistanceMatching, obj->enableLongDistanceMatching);
45 44 TRY_SET_PARAMETER(params, ZSTD_p_ldmHashLog, obj->ldmHashLog);
46 45 TRY_SET_PARAMETER(params, ZSTD_p_ldmMinMatch, obj->ldmMinMatch);
47 46 TRY_SET_PARAMETER(params, ZSTD_p_ldmBucketSizeLog, obj->ldmBucketSizeLog);
48 47 TRY_SET_PARAMETER(params, ZSTD_p_ldmHashEveryLog, obj->ldmHashEveryLog);
49 48
50 49 return 0;
51 50 }
52 51
53 52 int reset_params(ZstdCompressionParametersObject* params) {
54 53 if (params->params) {
55 54 ZSTD_CCtxParams_reset(params->params);
56 55 }
57 56 else {
58 57 params->params = ZSTD_createCCtxParams();
59 58 if (!params->params) {
60 59 PyErr_NoMemory();
61 60 return 1;
62 61 }
63 62 }
64 63
65 64 return set_parameters(params->params, params);
66 65 }
67 66
68 67 static int ZstdCompressionParameters_init(ZstdCompressionParametersObject* self, PyObject* args, PyObject* kwargs) {
69 68 static char* kwlist[] = {
70 69 "format",
71 70 "compression_level",
72 71 "window_log",
73 72 "hash_log",
74 73 "chain_log",
75 74 "search_log",
76 75 "min_match",
77 76 "target_length",
78 77 "compression_strategy",
79 78 "write_content_size",
80 79 "write_checksum",
81 80 "write_dict_id",
82 81 "job_size",
83 82 "overlap_size_log",
84 83 "force_max_window",
85 84 "enable_ldm",
86 85 "ldm_hash_log",
87 86 "ldm_min_match",
88 87 "ldm_bucket_size_log",
89 88 "ldm_hash_every_log",
90 89 "threads",
91 "compress_literals",
92 90 NULL
93 91 };
94 92
95 93 unsigned format = 0;
96 94 int compressionLevel = 0;
97 95 unsigned windowLog = 0;
98 96 unsigned hashLog = 0;
99 97 unsigned chainLog = 0;
100 98 unsigned searchLog = 0;
101 99 unsigned minMatch = 0;
102 100 unsigned targetLength = 0;
103 101 unsigned compressionStrategy = 0;
104 102 unsigned contentSizeFlag = 1;
105 103 unsigned checksumFlag = 0;
106 104 unsigned dictIDFlag = 0;
107 105 unsigned jobSize = 0;
108 106 unsigned overlapSizeLog = 0;
109 107 unsigned forceMaxWindow = 0;
110 108 unsigned enableLDM = 0;
111 109 unsigned ldmHashLog = 0;
112 110 unsigned ldmMinMatch = 0;
113 111 unsigned ldmBucketSizeLog = 0;
114 112 unsigned ldmHashEveryLog = 0;
115 113 int threads = 0;
116 114
117 /* Setting value 0 has the effect of disabling. So we use -1 as a default
118 * to detect whether to set. Then we automatically derive the expected value
119 * based on the level, just like zstandard does itself. */
120 int compressLiterals = -1;
121
122 115 if (!PyArg_ParseTupleAndKeywords(args, kwargs,
123 "|IiIIIIIIIIIIIIIIIIIIii:CompressionParameters",
116 "|IiIIIIIIIIIIIIIIIIIIi:CompressionParameters",
124 117 kwlist, &format, &compressionLevel, &windowLog, &hashLog, &chainLog,
125 118 &searchLog, &minMatch, &targetLength, &compressionStrategy,
126 119 &contentSizeFlag, &checksumFlag, &dictIDFlag, &jobSize, &overlapSizeLog,
127 120 &forceMaxWindow, &enableLDM, &ldmHashLog, &ldmMinMatch, &ldmBucketSizeLog,
128 &ldmHashEveryLog, &threads, &compressLiterals)) {
121 &ldmHashEveryLog, &threads)) {
129 122 return -1;
130 123 }
131 124
132 125 if (threads < 0) {
133 126 threads = cpu_count();
134 127 }
135 128
136 if (compressLiterals < 0) {
137 compressLiterals = compressionLevel >= 0;
138 }
139
140 129 self->format = format;
141 130 self->compressionLevel = compressionLevel;
142 131 self->windowLog = windowLog;
143 132 self->hashLog = hashLog;
144 133 self->chainLog = chainLog;
145 134 self->searchLog = searchLog;
146 135 self->minMatch = minMatch;
147 136 self->targetLength = targetLength;
148 137 self->compressionStrategy = compressionStrategy;
149 138 self->contentSizeFlag = contentSizeFlag;
150 139 self->checksumFlag = checksumFlag;
151 140 self->dictIDFlag = dictIDFlag;
152 141 self->threads = threads;
153 142 self->jobSize = jobSize;
154 143 self->overlapSizeLog = overlapSizeLog;
155 self->compressLiterals = compressLiterals;
156 144 self->forceMaxWindow = forceMaxWindow;
157 145 self->enableLongDistanceMatching = enableLDM;
158 146 self->ldmHashLog = ldmHashLog;
159 147 self->ldmMinMatch = ldmMinMatch;
160 148 self->ldmBucketSizeLog = ldmBucketSizeLog;
161 149 self->ldmHashEveryLog = ldmHashEveryLog;
162 150
163 151 if (reset_params(self)) {
164 152 return -1;
165 153 }
166 154
167 155 return 0;
168 156 }
169 157
170 158 PyDoc_STRVAR(ZstdCompressionParameters_from_level__doc__,
171 159 "Create a CompressionParameters from a compression level and target sizes\n"
172 160 );
173 161
174 162 ZstdCompressionParametersObject* CompressionParameters_from_level(PyObject* undef, PyObject* args, PyObject* kwargs) {
175 163 int managedKwargs = 0;
176 164 int level;
177 165 PyObject* sourceSize = NULL;
178 166 PyObject* dictSize = NULL;
179 167 unsigned PY_LONG_LONG iSourceSize = 0;
180 168 Py_ssize_t iDictSize = 0;
181 169 PyObject* val;
182 170 ZSTD_compressionParameters params;
183 171 ZstdCompressionParametersObject* result = NULL;
184 172 int res;
185 173
186 174 if (!PyArg_ParseTuple(args, "i:from_level",
187 175 &level)) {
188 176 return NULL;
189 177 }
190 178
191 179 if (!kwargs) {
192 180 kwargs = PyDict_New();
193 181 if (!kwargs) {
194 182 return NULL;
195 183 }
196 184 managedKwargs = 1;
197 185 }
198 186
199 187 sourceSize = PyDict_GetItemString(kwargs, "source_size");
200 188 if (sourceSize) {
201 189 #if PY_MAJOR_VERSION >= 3
202 190 iSourceSize = PyLong_AsUnsignedLongLong(sourceSize);
203 191 if (iSourceSize == (unsigned PY_LONG_LONG)(-1)) {
204 192 goto cleanup;
205 193 }
206 194 #else
207 195 iSourceSize = PyInt_AsUnsignedLongLongMask(sourceSize);
208 196 #endif
209 197
210 198 PyDict_DelItemString(kwargs, "source_size");
211 199 }
212 200
213 201 dictSize = PyDict_GetItemString(kwargs, "dict_size");
214 202 if (dictSize) {
215 203 #if PY_MAJOR_VERSION >= 3
216 204 iDictSize = PyLong_AsSsize_t(dictSize);
217 205 #else
218 206 iDictSize = PyInt_AsSsize_t(dictSize);
219 207 #endif
220 208 if (iDictSize == -1) {
221 209 goto cleanup;
222 210 }
223 211
224 212 PyDict_DelItemString(kwargs, "dict_size");
225 213 }
226 214
227 215
228 216 params = ZSTD_getCParams(level, iSourceSize, iDictSize);
229 217
230 218 /* Values derived from the input level and sizes are passed along to the
231 219 constructor. But only if a value doesn't already exist. */
232 220 val = PyDict_GetItemString(kwargs, "window_log");
233 221 if (!val) {
234 222 val = PyLong_FromUnsignedLong(params.windowLog);
235 223 if (!val) {
236 224 goto cleanup;
237 225 }
238 226 PyDict_SetItemString(kwargs, "window_log", val);
239 227 Py_DECREF(val);
240 228 }
241 229
242 230 val = PyDict_GetItemString(kwargs, "chain_log");
243 231 if (!val) {
244 232 val = PyLong_FromUnsignedLong(params.chainLog);
245 233 if (!val) {
246 234 goto cleanup;
247 235 }
248 236 PyDict_SetItemString(kwargs, "chain_log", val);
249 237 Py_DECREF(val);
250 238 }
251 239
252 240 val = PyDict_GetItemString(kwargs, "hash_log");
253 241 if (!val) {
254 242 val = PyLong_FromUnsignedLong(params.hashLog);
255 243 if (!val) {
256 244 goto cleanup;
257 245 }
258 246 PyDict_SetItemString(kwargs, "hash_log", val);
259 247 Py_DECREF(val);
260 248 }
261 249
262 250 val = PyDict_GetItemString(kwargs, "search_log");
263 251 if (!val) {
264 252 val = PyLong_FromUnsignedLong(params.searchLog);
265 253 if (!val) {
266 254 goto cleanup;
267 255 }
268 256 PyDict_SetItemString(kwargs, "search_log", val);
269 257 Py_DECREF(val);
270 258 }
271 259
272 260 val = PyDict_GetItemString(kwargs, "min_match");
273 261 if (!val) {
274 262 val = PyLong_FromUnsignedLong(params.searchLength);
275 263 if (!val) {
276 264 goto cleanup;
277 265 }
278 266 PyDict_SetItemString(kwargs, "min_match", val);
279 267 Py_DECREF(val);
280 268 }
281 269
282 270 val = PyDict_GetItemString(kwargs, "target_length");
283 271 if (!val) {
284 272 val = PyLong_FromUnsignedLong(params.targetLength);
285 273 if (!val) {
286 274 goto cleanup;
287 275 }
288 276 PyDict_SetItemString(kwargs, "target_length", val);
289 277 Py_DECREF(val);
290 278 }
291 279
292 280 val = PyDict_GetItemString(kwargs, "compression_strategy");
293 281 if (!val) {
294 282 val = PyLong_FromUnsignedLong(params.strategy);
295 283 if (!val) {
296 284 goto cleanup;
297 285 }
298 286 PyDict_SetItemString(kwargs, "compression_strategy", val);
299 287 Py_DECREF(val);
300 288 }
301 289
302 val = PyDict_GetItemString(kwargs, "compress_literals");
303 if (!val) {
304 val = PyLong_FromLong(level >= 0 ? 1 : 0);
305 if (!val) {
306 goto cleanup;
307 }
308 PyDict_SetItemString(kwargs, "compress_literals", val);
309 Py_DECREF(val);
310 }
311
312 290 result = PyObject_New(ZstdCompressionParametersObject, &ZstdCompressionParametersType);
313 291 if (!result) {
314 292 goto cleanup;
315 293 }
316 294
317 295 result->params = NULL;
318 296
319 297 val = PyTuple_New(0);
320 298 if (!val) {
321 299 Py_CLEAR(result);
322 300 goto cleanup;
323 301 }
324 302
325 303 res = ZstdCompressionParameters_init(result, val, kwargs);
326 304 Py_DECREF(val);
327 305
328 306 if (res) {
329 307 Py_CLEAR(result);
330 308 goto cleanup;
331 309 }
332 310
333 311 cleanup:
334 312 if (managedKwargs) {
335 313 Py_DECREF(kwargs);
336 314 }
337 315
338 316 return result;
339 317 }
340 318
341 319 PyDoc_STRVAR(ZstdCompressionParameters_estimated_compression_context_size__doc__,
342 320 "Estimate the size in bytes of a compression context for compression parameters\n"
343 321 );
344 322
345 323 PyObject* ZstdCompressionParameters_estimated_compression_context_size(ZstdCompressionParametersObject* self) {
346 324 return PyLong_FromSize_t(ZSTD_estimateCCtxSize_usingCCtxParams(self->params));
347 325 }
348 326
349 327 PyDoc_STRVAR(ZstdCompressionParameters__doc__,
350 328 "ZstdCompressionParameters: low-level control over zstd compression");
351 329
352 330 static void ZstdCompressionParameters_dealloc(ZstdCompressionParametersObject* self) {
353 331 if (self->params) {
354 332 ZSTD_freeCCtxParams(self->params);
355 333 self->params = NULL;
356 334 }
357 335
358 336 PyObject_Del(self);
359 337 }
360 338
361 339 static PyMethodDef ZstdCompressionParameters_methods[] = {
362 340 {
363 341 "from_level",
364 342 (PyCFunction)CompressionParameters_from_level,
365 343 METH_VARARGS | METH_KEYWORDS | METH_STATIC,
366 344 ZstdCompressionParameters_from_level__doc__
367 345 },
368 346 {
369 347 "estimated_compression_context_size",
370 348 (PyCFunction)ZstdCompressionParameters_estimated_compression_context_size,
371 349 METH_NOARGS,
372 350 ZstdCompressionParameters_estimated_compression_context_size__doc__
373 351 },
374 352 { NULL, NULL }
375 353 };
376 354
377 355 static PyMemberDef ZstdCompressionParameters_members[] = {
378 356 { "format", T_UINT,
379 357 offsetof(ZstdCompressionParametersObject, format), READONLY,
380 358 "compression format" },
381 359 { "compression_level", T_INT,
382 360 offsetof(ZstdCompressionParametersObject, compressionLevel), READONLY,
383 361 "compression level" },
384 362 { "window_log", T_UINT,
385 363 offsetof(ZstdCompressionParametersObject, windowLog), READONLY,
386 364 "window log" },
387 365 { "hash_log", T_UINT,
388 366 offsetof(ZstdCompressionParametersObject, hashLog), READONLY,
389 367 "hash log" },
390 368 { "chain_log", T_UINT,
391 369 offsetof(ZstdCompressionParametersObject, chainLog), READONLY,
392 370 "chain log" },
393 371 { "search_log", T_UINT,
394 372 offsetof(ZstdCompressionParametersObject, searchLog), READONLY,
395 373 "search log" },
396 374 { "min_match", T_UINT,
397 375 offsetof(ZstdCompressionParametersObject, minMatch), READONLY,
398 376 "search length" },
399 377 { "target_length", T_UINT,
400 378 offsetof(ZstdCompressionParametersObject, targetLength), READONLY,
401 379 "target length" },
402 380 { "compression_strategy", T_UINT,
403 381 offsetof(ZstdCompressionParametersObject, compressionStrategy), READONLY,
404 382 "compression strategy" },
405 383 { "write_content_size", T_UINT,
406 384 offsetof(ZstdCompressionParametersObject, contentSizeFlag), READONLY,
407 385 "whether to write content size in frames" },
408 386 { "write_checksum", T_UINT,
409 387 offsetof(ZstdCompressionParametersObject, checksumFlag), READONLY,
410 388 "whether to write checksum in frames" },
411 389 { "write_dict_id", T_UINT,
412 390 offsetof(ZstdCompressionParametersObject, dictIDFlag), READONLY,
413 391 "whether to write dictionary ID in frames" },
414 392 { "threads", T_UINT,
415 393 offsetof(ZstdCompressionParametersObject, threads), READONLY,
416 394 "number of threads to use" },
417 395 { "job_size", T_UINT,
418 396 offsetof(ZstdCompressionParametersObject, jobSize), READONLY,
419 397 "size of compression job when using multiple threads" },
420 398 { "overlap_size_log", T_UINT,
421 399 offsetof(ZstdCompressionParametersObject, overlapSizeLog), READONLY,
422 400 "Size of previous input reloaded at the beginning of each job" },
423 { "compress_literals", T_UINT,
424 offsetof(ZstdCompressionParametersObject, compressLiterals), READONLY,
425 "whether Huffman compression of literals is in use" },
426 401 { "force_max_window", T_UINT,
427 402 offsetof(ZstdCompressionParametersObject, forceMaxWindow), READONLY,
428 403 "force back references to remain smaller than window size" },
429 404 { "enable_ldm", T_UINT,
430 405 offsetof(ZstdCompressionParametersObject, enableLongDistanceMatching), READONLY,
431 406 "whether to enable long distance matching" },
432 407 { "ldm_hash_log", T_UINT,
433 408 offsetof(ZstdCompressionParametersObject, ldmHashLog), READONLY,
434 409 "Size of the table for long distance matching, as a power of 2" },
435 410 { "ldm_min_match", T_UINT,
436 411 offsetof(ZstdCompressionParametersObject, ldmMinMatch), READONLY,
437 412 "minimum size of searched matches for long distance matcher" },
438 413 { "ldm_bucket_size_log", T_UINT,
439 414 offsetof(ZstdCompressionParametersObject, ldmBucketSizeLog), READONLY,
440 415 "log size of each bucket in the LDM hash table for collision resolution" },
441 416 { "ldm_hash_every_log", T_UINT,
442 417 offsetof(ZstdCompressionParametersObject, ldmHashEveryLog), READONLY,
443 418 "frequency of inserting/looking up entries in the LDM hash table" },
444 419 { NULL }
445 420 };
446 421
447 422 PyTypeObject ZstdCompressionParametersType = {
448 423 PyVarObject_HEAD_INIT(NULL, 0)
449 424 "ZstdCompressionParameters", /* tp_name */
450 425 sizeof(ZstdCompressionParametersObject), /* tp_basicsize */
451 426 0, /* tp_itemsize */
452 427 (destructor)ZstdCompressionParameters_dealloc, /* tp_dealloc */
453 428 0, /* tp_print */
454 429 0, /* tp_getattr */
455 430 0, /* tp_setattr */
456 431 0, /* tp_compare */
457 432 0, /* tp_repr */
458 433 0, /* tp_as_number */
459 434 0, /* tp_as_sequence */
460 435 0, /* tp_as_mapping */
461 436 0, /* tp_hash */
462 437 0, /* tp_call */
463 438 0, /* tp_str */
464 439 0, /* tp_getattro */
465 440 0, /* tp_setattro */
466 441 0, /* tp_as_buffer */
467 442 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
468 443 ZstdCompressionParameters__doc__, /* tp_doc */
469 444 0, /* tp_traverse */
470 445 0, /* tp_clear */
471 446 0, /* tp_richcompare */
472 447 0, /* tp_weaklistoffset */
473 448 0, /* tp_iter */
474 449 0, /* tp_iternext */
475 450 ZstdCompressionParameters_methods, /* tp_methods */
476 451 ZstdCompressionParameters_members, /* tp_members */
477 452 0, /* tp_getset */
478 453 0, /* tp_base */
479 454 0, /* tp_dict */
480 455 0, /* tp_descr_get */
481 456 0, /* tp_descr_set */
482 457 0, /* tp_dictoffset */
483 458 (initproc)ZstdCompressionParameters_init, /* tp_init */
484 459 0, /* tp_alloc */
485 460 PyType_GenericNew, /* tp_new */
486 461 };
487 462
488 463 void compressionparams_module_init(PyObject* mod) {
489 464 Py_TYPE(&ZstdCompressionParametersType) = &PyType_Type;
490 465 if (PyType_Ready(&ZstdCompressionParametersType) < 0) {
491 466 return;
492 467 }
493 468
494 469 Py_INCREF(&ZstdCompressionParametersType);
495 470 PyModule_AddObject(mod, "ZstdCompressionParameters",
496 471 (PyObject*)&ZstdCompressionParametersType);
497 472
498 473 /* TODO remove deprecated alias. */
499 474 Py_INCREF(&ZstdCompressionParametersType);
500 475 PyModule_AddObject(mod, "CompressionParameters",
501 476 (PyObject*)&ZstdCompressionParametersType);
502 477 }
@@ -1,405 +1,386 b''
1 1 /**
2 2 * Copyright (c) 2017-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 static void set_unsupported_operation(void) {
14 14 PyObject* iomod;
15 15 PyObject* exc;
16 16
17 17 iomod = PyImport_ImportModule("io");
18 18 if (NULL == iomod) {
19 19 return;
20 20 }
21 21
22 22 exc = PyObject_GetAttrString(iomod, "UnsupportedOperation");
23 23 if (NULL == exc) {
24 24 Py_DECREF(iomod);
25 25 return;
26 26 }
27 27
28 28 PyErr_SetNone(exc);
29 29 Py_DECREF(exc);
30 30 Py_DECREF(iomod);
31 31 }
32 32
33 33 static void reader_dealloc(ZstdCompressionReader* self) {
34 34 Py_XDECREF(self->compressor);
35 35 Py_XDECREF(self->reader);
36 36
37 37 if (self->buffer.buf) {
38 38 PyBuffer_Release(&self->buffer);
39 39 memset(&self->buffer, 0, sizeof(self->buffer));
40 40 }
41 41
42 42 PyObject_Del(self);
43 43 }
44 44
45 45 static ZstdCompressionReader* reader_enter(ZstdCompressionReader* self) {
46 size_t zresult;
47
48 46 if (self->entered) {
49 47 PyErr_SetString(PyExc_ValueError, "cannot __enter__ multiple times");
50 48 return NULL;
51 49 }
52 50
53 zresult = ZSTD_CCtx_setPledgedSrcSize(self->compressor->cctx, self->sourceSize);
54 if (ZSTD_isError(zresult)) {
55 PyErr_Format(ZstdError, "error setting source size: %s",
56 ZSTD_getErrorName(zresult));
57 return NULL;
58 }
59
60 51 self->entered = 1;
61 52
62 53 Py_INCREF(self);
63 54 return self;
64 55 }
65 56
66 57 static PyObject* reader_exit(ZstdCompressionReader* self, PyObject* args) {
67 58 PyObject* exc_type;
68 59 PyObject* exc_value;
69 60 PyObject* exc_tb;
70 61
71 62 if (!PyArg_ParseTuple(args, "OOO:__exit__", &exc_type, &exc_value, &exc_tb)) {
72 63 return NULL;
73 64 }
74 65
75 66 self->entered = 0;
76 67 self->closed = 1;
77 68
78 69 /* Release resources associated with source. */
79 70 Py_CLEAR(self->reader);
80 71 if (self->buffer.buf) {
81 72 PyBuffer_Release(&self->buffer);
82 73 memset(&self->buffer, 0, sizeof(self->buffer));
83 74 }
84 75
85 76 Py_CLEAR(self->compressor);
86 77
87 78 Py_RETURN_FALSE;
88 79 }
89 80
90 81 static PyObject* reader_readable(ZstdCompressionReader* self) {
91 82 Py_RETURN_TRUE;
92 83 }
93 84
94 85 static PyObject* reader_writable(ZstdCompressionReader* self) {
95 86 Py_RETURN_FALSE;
96 87 }
97 88
98 89 static PyObject* reader_seekable(ZstdCompressionReader* self) {
99 90 Py_RETURN_FALSE;
100 91 }
101 92
102 93 static PyObject* reader_readline(PyObject* self, PyObject* args) {
103 94 set_unsupported_operation();
104 95 return NULL;
105 96 }
106 97
107 98 static PyObject* reader_readlines(PyObject* self, PyObject* args) {
108 99 set_unsupported_operation();
109 100 return NULL;
110 101 }
111 102
112 103 static PyObject* reader_write(PyObject* self, PyObject* args) {
113 104 PyErr_SetString(PyExc_OSError, "stream is not writable");
114 105 return NULL;
115 106 }
116 107
117 108 static PyObject* reader_writelines(PyObject* self, PyObject* args) {
118 109 PyErr_SetString(PyExc_OSError, "stream is not writable");
119 110 return NULL;
120 111 }
121 112
122 113 static PyObject* reader_isatty(PyObject* self) {
123 114 Py_RETURN_FALSE;
124 115 }
125 116
126 117 static PyObject* reader_flush(PyObject* self) {
127 118 Py_RETURN_NONE;
128 119 }
129 120
130 121 static PyObject* reader_close(ZstdCompressionReader* self) {
131 122 self->closed = 1;
132 123 Py_RETURN_NONE;
133 124 }
134 125
135 static PyObject* reader_closed(ZstdCompressionReader* self) {
136 if (self->closed) {
137 Py_RETURN_TRUE;
138 }
139 else {
140 Py_RETURN_FALSE;
141 }
142 }
143
144 126 static PyObject* reader_tell(ZstdCompressionReader* self) {
145 127 /* TODO should this raise OSError since stream isn't seekable? */
146 128 return PyLong_FromUnsignedLongLong(self->bytesCompressed);
147 129 }
148 130
149 131 static PyObject* reader_read(ZstdCompressionReader* self, PyObject* args, PyObject* kwargs) {
150 132 static char* kwlist[] = {
151 133 "size",
152 134 NULL
153 135 };
154 136
155 137 Py_ssize_t size = -1;
156 138 PyObject* result = NULL;
157 139 char* resultBuffer;
158 140 Py_ssize_t resultSize;
159 141 size_t zresult;
160 142 size_t oldPos;
161 143
162 if (!self->entered) {
163 PyErr_SetString(ZstdError, "read() must be called from an active context manager");
164 return NULL;
165 }
166
167 144 if (self->closed) {
168 145 PyErr_SetString(PyExc_ValueError, "stream is closed");
169 146 return NULL;
170 147 }
171 148
172 149 if (self->finishedOutput) {
173 150 return PyBytes_FromStringAndSize("", 0);
174 151 }
175 152
176 153 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n", kwlist, &size)) {
177 154 return NULL;
178 155 }
179 156
180 157 if (size < 1) {
181 158 PyErr_SetString(PyExc_ValueError, "cannot read negative or size 0 amounts");
182 159 return NULL;
183 160 }
184 161
185 162 result = PyBytes_FromStringAndSize(NULL, size);
186 163 if (NULL == result) {
187 164 return NULL;
188 165 }
189 166
190 167 PyBytes_AsStringAndSize(result, &resultBuffer, &resultSize);
191 168
192 169 self->output.dst = resultBuffer;
193 170 self->output.size = resultSize;
194 171 self->output.pos = 0;
195 172
196 173 readinput:
197 174
198 175 /* If we have data left over, consume it. */
199 176 if (self->input.pos < self->input.size) {
200 177 oldPos = self->output.pos;
201 178
202 179 Py_BEGIN_ALLOW_THREADS
203 180 zresult = ZSTD_compress_generic(self->compressor->cctx,
204 181 &self->output, &self->input, ZSTD_e_continue);
205 182
206 183 Py_END_ALLOW_THREADS
207 184
208 185 self->bytesCompressed += self->output.pos - oldPos;
209 186
210 187 /* Input exhausted. Clear out state tracking. */
211 188 if (self->input.pos == self->input.size) {
212 189 memset(&self->input, 0, sizeof(self->input));
213 190 Py_CLEAR(self->readResult);
214 191
215 192 if (self->buffer.buf) {
216 193 self->finishedInput = 1;
217 194 }
218 195 }
219 196
220 197 if (ZSTD_isError(zresult)) {
221 198 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
222 199 return NULL;
223 200 }
224 201
225 202 if (self->output.pos) {
226 203 /* If no more room in output, emit it. */
227 204 if (self->output.pos == self->output.size) {
228 205 memset(&self->output, 0, sizeof(self->output));
229 206 return result;
230 207 }
231 208
232 209 /*
233 210 * There is room in the output. We fall through to below, which will either
234 211 * get more input for us or will attempt to end the stream.
235 212 */
236 213 }
237 214
238 215 /* Fall through to gather more input. */
239 216 }
240 217
241 218 if (!self->finishedInput) {
242 219 if (self->reader) {
243 220 Py_buffer buffer;
244 221
245 222 assert(self->readResult == NULL);
246 223 self->readResult = PyObject_CallMethod(self->reader, "read",
247 224 "k", self->readSize);
248 225 if (self->readResult == NULL) {
249 226 return NULL;
250 227 }
251 228
252 229 memset(&buffer, 0, sizeof(buffer));
253 230
254 231 if (0 != PyObject_GetBuffer(self->readResult, &buffer, PyBUF_CONTIG_RO)) {
255 232 return NULL;
256 233 }
257 234
258 235 /* EOF */
259 236 if (0 == buffer.len) {
260 237 self->finishedInput = 1;
261 238 Py_CLEAR(self->readResult);
262 239 }
263 240 else {
264 241 self->input.src = buffer.buf;
265 242 self->input.size = buffer.len;
266 243 self->input.pos = 0;
267 244 }
268 245
269 246 PyBuffer_Release(&buffer);
270 247 }
271 248 else {
272 249 assert(self->buffer.buf);
273 250
274 251 self->input.src = self->buffer.buf;
275 252 self->input.size = self->buffer.len;
276 253 self->input.pos = 0;
277 254 }
278 255 }
279 256
280 257 if (self->input.size) {
281 258 goto readinput;
282 259 }
283 260
284 261 /* Else EOF */
285 262 oldPos = self->output.pos;
286 263
287 264 zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output,
288 265 &self->input, ZSTD_e_end);
289 266
290 267 self->bytesCompressed += self->output.pos - oldPos;
291 268
292 269 if (ZSTD_isError(zresult)) {
293 270 PyErr_Format(ZstdError, "error ending compression stream: %s",
294 271 ZSTD_getErrorName(zresult));
295 272 return NULL;
296 273 }
297 274
298 275 assert(self->output.pos);
299 276
300 277 if (0 == zresult) {
301 278 self->finishedOutput = 1;
302 279 }
303 280
304 281 if (safe_pybytes_resize(&result, self->output.pos)) {
305 282 Py_XDECREF(result);
306 283 return NULL;
307 284 }
308 285
309 286 memset(&self->output, 0, sizeof(self->output));
310 287
311 288 return result;
312 289 }
313 290
314 291 static PyObject* reader_readall(PyObject* self) {
315 292 PyErr_SetNone(PyExc_NotImplementedError);
316 293 return NULL;
317 294 }
318 295
319 296 static PyObject* reader_iter(PyObject* self) {
320 297 set_unsupported_operation();
321 298 return NULL;
322 299 }
323 300
324 301 static PyObject* reader_iternext(PyObject* self) {
325 302 set_unsupported_operation();
326 303 return NULL;
327 304 }
328 305
329 306 static PyMethodDef reader_methods[] = {
330 307 { "__enter__", (PyCFunction)reader_enter, METH_NOARGS,
331 308 PyDoc_STR("Enter a compression context") },
332 309 { "__exit__", (PyCFunction)reader_exit, METH_VARARGS,
333 310 PyDoc_STR("Exit a compression context") },
334 311 { "close", (PyCFunction)reader_close, METH_NOARGS,
335 312 PyDoc_STR("Close the stream so it cannot perform any more operations") },
336 { "closed", (PyCFunction)reader_closed, METH_NOARGS,
337 PyDoc_STR("Whether stream is closed") },
338 313 { "flush", (PyCFunction)reader_flush, METH_NOARGS, PyDoc_STR("no-ops") },
339 314 { "isatty", (PyCFunction)reader_isatty, METH_NOARGS, PyDoc_STR("Returns False") },
340 315 { "readable", (PyCFunction)reader_readable, METH_NOARGS,
341 316 PyDoc_STR("Returns True") },
342 317 { "read", (PyCFunction)reader_read, METH_VARARGS | METH_KEYWORDS, PyDoc_STR("read compressed data") },
343 318 { "readall", (PyCFunction)reader_readall, METH_NOARGS, PyDoc_STR("Not implemented") },
344 319 { "readline", (PyCFunction)reader_readline, METH_VARARGS, PyDoc_STR("Not implemented") },
345 320 { "readlines", (PyCFunction)reader_readlines, METH_VARARGS, PyDoc_STR("Not implemented") },
346 321 { "seekable", (PyCFunction)reader_seekable, METH_NOARGS,
347 322 PyDoc_STR("Returns False") },
348 323 { "tell", (PyCFunction)reader_tell, METH_NOARGS,
349 324 PyDoc_STR("Returns current number of bytes compressed") },
350 325 { "writable", (PyCFunction)reader_writable, METH_NOARGS,
351 326 PyDoc_STR("Returns False") },
352 327 { "write", reader_write, METH_VARARGS, PyDoc_STR("Raises OSError") },
353 328 { "writelines", reader_writelines, METH_VARARGS, PyDoc_STR("Not implemented") },
354 329 { NULL, NULL }
355 330 };
356 331
332 static PyMemberDef reader_members[] = {
333 { "closed", T_BOOL, offsetof(ZstdCompressionReader, closed),
334 READONLY, "whether stream is closed" },
335 { NULL }
336 };
337
357 338 PyTypeObject ZstdCompressionReaderType = {
358 339 PyVarObject_HEAD_INIT(NULL, 0)
359 340 "zstd.ZstdCompressionReader", /* tp_name */
360 341 sizeof(ZstdCompressionReader), /* tp_basicsize */
361 342 0, /* tp_itemsize */
362 343 (destructor)reader_dealloc, /* tp_dealloc */
363 344 0, /* tp_print */
364 345 0, /* tp_getattr */
365 346 0, /* tp_setattr */
366 347 0, /* tp_compare */
367 348 0, /* tp_repr */
368 349 0, /* tp_as_number */
369 350 0, /* tp_as_sequence */
370 351 0, /* tp_as_mapping */
371 352 0, /* tp_hash */
372 353 0, /* tp_call */
373 354 0, /* tp_str */
374 355 0, /* tp_getattro */
375 356 0, /* tp_setattro */
376 357 0, /* tp_as_buffer */
377 358 Py_TPFLAGS_DEFAULT, /* tp_flags */
378 359 0, /* tp_doc */
379 360 0, /* tp_traverse */
380 361 0, /* tp_clear */
381 362 0, /* tp_richcompare */
382 363 0, /* tp_weaklistoffset */
383 364 reader_iter, /* tp_iter */
384 365 reader_iternext, /* tp_iternext */
385 366 reader_methods, /* tp_methods */
386 0, /* tp_members */
367 reader_members, /* tp_members */
387 368 0, /* tp_getset */
388 369 0, /* tp_base */
389 370 0, /* tp_dict */
390 371 0, /* tp_descr_get */
391 372 0, /* tp_descr_set */
392 373 0, /* tp_dictoffset */
393 374 0, /* tp_init */
394 375 0, /* tp_alloc */
395 376 PyType_GenericNew, /* tp_new */
396 377 };
397 378
398 379 void compressionreader_module_init(PyObject* mod) {
399 380 /* TODO make reader a sub-class of io.RawIOBase */
400 381
401 382 Py_TYPE(&ZstdCompressionReaderType) = &PyType_Type;
402 383 if (PyType_Ready(&ZstdCompressionReaderType) < 0) {
403 384 return;
404 385 }
405 386 }
@@ -1,315 +1,316 b''
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 PyDoc_STRVAR(ZstdCompresssionWriter__doc__,
14 14 """A context manager used for writing compressed output to a writer.\n"
15 15 );
16 16
17 17 static void ZstdCompressionWriter_dealloc(ZstdCompressionWriter* self) {
18 18 Py_XDECREF(self->compressor);
19 19 Py_XDECREF(self->writer);
20 20
21 21 PyObject_Del(self);
22 22 }
23 23
24 24 static PyObject* ZstdCompressionWriter_enter(ZstdCompressionWriter* self) {
25 25 size_t zresult;
26 26
27 27 if (self->entered) {
28 28 PyErr_SetString(ZstdError, "cannot __enter__ multiple times");
29 29 return NULL;
30 30 }
31 31
32 32 zresult = ZSTD_CCtx_setPledgedSrcSize(self->compressor->cctx, self->sourceSize);
33 33 if (ZSTD_isError(zresult)) {
34 34 PyErr_Format(ZstdError, "error setting source size: %s",
35 35 ZSTD_getErrorName(zresult));
36 36 return NULL;
37 37 }
38 38
39 39 self->entered = 1;
40 40
41 41 Py_INCREF(self);
42 42 return (PyObject*)self;
43 43 }
44 44
45 45 static PyObject* ZstdCompressionWriter_exit(ZstdCompressionWriter* self, PyObject* args) {
46 46 PyObject* exc_type;
47 47 PyObject* exc_value;
48 48 PyObject* exc_tb;
49 49 size_t zresult;
50 50
51 51 ZSTD_outBuffer output;
52 52 PyObject* res;
53 53
54 54 if (!PyArg_ParseTuple(args, "OOO:__exit__", &exc_type, &exc_value, &exc_tb)) {
55 55 return NULL;
56 56 }
57 57
58 58 self->entered = 0;
59 59
60 60 if (exc_type == Py_None && exc_value == Py_None && exc_tb == Py_None) {
61 61 ZSTD_inBuffer inBuffer;
62 62
63 63 inBuffer.src = NULL;
64 64 inBuffer.size = 0;
65 65 inBuffer.pos = 0;
66 66
67 67 output.dst = PyMem_Malloc(self->outSize);
68 68 if (!output.dst) {
69 69 return PyErr_NoMemory();
70 70 }
71 71 output.size = self->outSize;
72 72 output.pos = 0;
73 73
74 74 while (1) {
75 75 zresult = ZSTD_compress_generic(self->compressor->cctx, &output, &inBuffer, ZSTD_e_end);
76 76 if (ZSTD_isError(zresult)) {
77 77 PyErr_Format(ZstdError, "error ending compression stream: %s",
78 78 ZSTD_getErrorName(zresult));
79 79 PyMem_Free(output.dst);
80 80 return NULL;
81 81 }
82 82
83 83 if (output.pos) {
84 84 #if PY_MAJOR_VERSION >= 3
85 85 res = PyObject_CallMethod(self->writer, "write", "y#",
86 86 #else
87 87 res = PyObject_CallMethod(self->writer, "write", "s#",
88 88 #endif
89 89 output.dst, output.pos);
90 90 Py_XDECREF(res);
91 91 }
92 92
93 93 if (!zresult) {
94 94 break;
95 95 }
96 96
97 97 output.pos = 0;
98 98 }
99 99
100 100 PyMem_Free(output.dst);
101 101 }
102 102
103 103 Py_RETURN_FALSE;
104 104 }
105 105
106 106 static PyObject* ZstdCompressionWriter_memory_size(ZstdCompressionWriter* self) {
107 107 return PyLong_FromSize_t(ZSTD_sizeof_CCtx(self->compressor->cctx));
108 108 }
109 109
110 110 static PyObject* ZstdCompressionWriter_write(ZstdCompressionWriter* self, PyObject* args, PyObject* kwargs) {
111 111 static char* kwlist[] = {
112 112 "data",
113 113 NULL
114 114 };
115 115
116 116 PyObject* result = NULL;
117 117 Py_buffer source;
118 118 size_t zresult;
119 119 ZSTD_inBuffer input;
120 120 ZSTD_outBuffer output;
121 121 PyObject* res;
122 122 Py_ssize_t totalWrite = 0;
123 123
124 124 #if PY_MAJOR_VERSION >= 3
125 125 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:write",
126 126 #else
127 127 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:write",
128 128 #endif
129 129 kwlist, &source)) {
130 130 return NULL;
131 131 }
132 132
133 133 if (!self->entered) {
134 134 PyErr_SetString(ZstdError, "compress must be called from an active context manager");
135 135 goto finally;
136 136 }
137 137
138 138 if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) {
139 139 PyErr_SetString(PyExc_ValueError,
140 140 "data buffer should be contiguous and have at most one dimension");
141 141 goto finally;
142 142 }
143 143
144 144 output.dst = PyMem_Malloc(self->outSize);
145 145 if (!output.dst) {
146 146 PyErr_NoMemory();
147 147 goto finally;
148 148 }
149 149 output.size = self->outSize;
150 150 output.pos = 0;
151 151
152 152 input.src = source.buf;
153 153 input.size = source.len;
154 154 input.pos = 0;
155 155
156 156 while ((ssize_t)input.pos < source.len) {
157 157 Py_BEGIN_ALLOW_THREADS
158 158 zresult = ZSTD_compress_generic(self->compressor->cctx, &output, &input, ZSTD_e_continue);
159 159 Py_END_ALLOW_THREADS
160 160
161 161 if (ZSTD_isError(zresult)) {
162 162 PyMem_Free(output.dst);
163 163 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
164 164 goto finally;
165 165 }
166 166
167 167 /* Copy data from output buffer to writer. */
168 168 if (output.pos) {
169 169 #if PY_MAJOR_VERSION >= 3
170 170 res = PyObject_CallMethod(self->writer, "write", "y#",
171 171 #else
172 172 res = PyObject_CallMethod(self->writer, "write", "s#",
173 173 #endif
174 174 output.dst, output.pos);
175 175 Py_XDECREF(res);
176 176 totalWrite += output.pos;
177 177 self->bytesCompressed += output.pos;
178 178 }
179 179 output.pos = 0;
180 180 }
181 181
182 182 PyMem_Free(output.dst);
183 183
184 184 result = PyLong_FromSsize_t(totalWrite);
185 185
186 186 finally:
187 187 PyBuffer_Release(&source);
188 188 return result;
189 189 }
190 190
191 191 static PyObject* ZstdCompressionWriter_flush(ZstdCompressionWriter* self, PyObject* args) {
192 192 size_t zresult;
193 193 ZSTD_outBuffer output;
194 194 ZSTD_inBuffer input;
195 195 PyObject* res;
196 196 Py_ssize_t totalWrite = 0;
197 197
198 198 if (!self->entered) {
199 199 PyErr_SetString(ZstdError, "flush must be called from an active context manager");
200 200 return NULL;
201 201 }
202 202
203 203 input.src = NULL;
204 204 input.size = 0;
205 205 input.pos = 0;
206 206
207 207 output.dst = PyMem_Malloc(self->outSize);
208 208 if (!output.dst) {
209 209 return PyErr_NoMemory();
210 210 }
211 211 output.size = self->outSize;
212 212 output.pos = 0;
213 213
214 214 while (1) {
215 215 Py_BEGIN_ALLOW_THREADS
216 216 zresult = ZSTD_compress_generic(self->compressor->cctx, &output, &input, ZSTD_e_flush);
217 217 Py_END_ALLOW_THREADS
218 218
219 219 if (ZSTD_isError(zresult)) {
220 220 PyMem_Free(output.dst);
221 221 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
222 222 return NULL;
223 223 }
224 224
225 if (!output.pos) {
226 break;
227 }
228
229 225 /* Copy data from output buffer to writer. */
230 226 if (output.pos) {
231 227 #if PY_MAJOR_VERSION >= 3
232 228 res = PyObject_CallMethod(self->writer, "write", "y#",
233 229 #else
234 230 res = PyObject_CallMethod(self->writer, "write", "s#",
235 231 #endif
236 232 output.dst, output.pos);
237 233 Py_XDECREF(res);
238 234 totalWrite += output.pos;
239 235 self->bytesCompressed += output.pos;
240 236 }
237
241 238 output.pos = 0;
239
240 if (!zresult) {
241 break;
242 }
242 243 }
243 244
244 245 PyMem_Free(output.dst);
245 246
246 247 return PyLong_FromSsize_t(totalWrite);
247 248 }
248 249
249 250 static PyObject* ZstdCompressionWriter_tell(ZstdCompressionWriter* self) {
250 251 return PyLong_FromUnsignedLongLong(self->bytesCompressed);
251 252 }
252 253
253 254 static PyMethodDef ZstdCompressionWriter_methods[] = {
254 255 { "__enter__", (PyCFunction)ZstdCompressionWriter_enter, METH_NOARGS,
255 256 PyDoc_STR("Enter a compression context.") },
256 257 { "__exit__", (PyCFunction)ZstdCompressionWriter_exit, METH_VARARGS,
257 258 PyDoc_STR("Exit a compression context.") },
258 259 { "memory_size", (PyCFunction)ZstdCompressionWriter_memory_size, METH_NOARGS,
259 260 PyDoc_STR("Obtain the memory size of the underlying compressor") },
260 261 { "write", (PyCFunction)ZstdCompressionWriter_write, METH_VARARGS | METH_KEYWORDS,
261 262 PyDoc_STR("Compress data") },
262 263 { "flush", (PyCFunction)ZstdCompressionWriter_flush, METH_NOARGS,
263 264 PyDoc_STR("Flush data and finish a zstd frame") },
264 265 { "tell", (PyCFunction)ZstdCompressionWriter_tell, METH_NOARGS,
265 266 PyDoc_STR("Returns current number of bytes compressed") },
266 267 { NULL, NULL }
267 268 };
268 269
269 270 PyTypeObject ZstdCompressionWriterType = {
270 271 PyVarObject_HEAD_INIT(NULL, 0)
271 272 "zstd.ZstdCompressionWriter", /* tp_name */
272 273 sizeof(ZstdCompressionWriter), /* tp_basicsize */
273 274 0, /* tp_itemsize */
274 275 (destructor)ZstdCompressionWriter_dealloc, /* tp_dealloc */
275 276 0, /* tp_print */
276 277 0, /* tp_getattr */
277 278 0, /* tp_setattr */
278 279 0, /* tp_compare */
279 280 0, /* tp_repr */
280 281 0, /* tp_as_number */
281 282 0, /* tp_as_sequence */
282 283 0, /* tp_as_mapping */
283 284 0, /* tp_hash */
284 285 0, /* tp_call */
285 286 0, /* tp_str */
286 287 0, /* tp_getattro */
287 288 0, /* tp_setattro */
288 289 0, /* tp_as_buffer */
289 290 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
290 291 ZstdCompresssionWriter__doc__, /* tp_doc */
291 292 0, /* tp_traverse */
292 293 0, /* tp_clear */
293 294 0, /* tp_richcompare */
294 295 0, /* tp_weaklistoffset */
295 296 0, /* tp_iter */
296 297 0, /* tp_iternext */
297 298 ZstdCompressionWriter_methods, /* tp_methods */
298 299 0, /* tp_members */
299 300 0, /* tp_getset */
300 301 0, /* tp_base */
301 302 0, /* tp_dict */
302 303 0, /* tp_descr_get */
303 304 0, /* tp_descr_set */
304 305 0, /* tp_dictoffset */
305 306 0, /* tp_init */
306 307 0, /* tp_alloc */
307 308 PyType_GenericNew, /* tp_new */
308 309 };
309 310
310 311 void compressionwriter_module_init(PyObject* mod) {
311 312 Py_TYPE(&ZstdCompressionWriterType) = &PyType_Type;
312 313 if (PyType_Ready(&ZstdCompressionWriterType) < 0) {
313 314 return;
314 315 }
315 316 }
@@ -1,273 +1,256 b''
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 PyDoc_STRVAR(ZstdCompressionObj__doc__,
14 14 "Perform compression using a standard library compatible API.\n"
15 15 );
16 16
17 17 static void ZstdCompressionObj_dealloc(ZstdCompressionObj* self) {
18 18 PyMem_Free(self->output.dst);
19 19 self->output.dst = NULL;
20 20
21 21 Py_XDECREF(self->compressor);
22 22
23 23 PyObject_Del(self);
24 24 }
25 25
26 26 static PyObject* ZstdCompressionObj_compress(ZstdCompressionObj* self, PyObject* args, PyObject* kwargs) {
27 27 static char* kwlist[] = {
28 28 "data",
29 29 NULL
30 30 };
31 31
32 32 Py_buffer source;
33 33 ZSTD_inBuffer input;
34 34 size_t zresult;
35 35 PyObject* result = NULL;
36 36 Py_ssize_t resultSize = 0;
37 37
38 38 if (self->finished) {
39 39 PyErr_SetString(ZstdError, "cannot call compress() after compressor finished");
40 40 return NULL;
41 41 }
42 42
43 43 #if PY_MAJOR_VERSION >= 3
44 44 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:compress",
45 45 #else
46 46 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:compress",
47 47 #endif
48 48 kwlist, &source)) {
49 49 return NULL;
50 50 }
51 51
52 52 if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) {
53 53 PyErr_SetString(PyExc_ValueError,
54 54 "data buffer should be contiguous and have at most one dimension");
55 55 goto finally;
56 56 }
57 57
58 58 input.src = source.buf;
59 59 input.size = source.len;
60 60 input.pos = 0;
61 61
62 62 while ((ssize_t)input.pos < source.len) {
63 63 Py_BEGIN_ALLOW_THREADS
64 64 zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output,
65 65 &input, ZSTD_e_continue);
66 66 Py_END_ALLOW_THREADS
67 67
68 68 if (ZSTD_isError(zresult)) {
69 69 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
70 70 Py_CLEAR(result);
71 71 goto finally;
72 72 }
73 73
74 74 if (self->output.pos) {
75 75 if (result) {
76 76 resultSize = PyBytes_GET_SIZE(result);
77 77
78 78 if (safe_pybytes_resize(&result, resultSize + self->output.pos)) {
79 79 Py_CLEAR(result);
80 80 goto finally;
81 81 }
82 82
83 83 memcpy(PyBytes_AS_STRING(result) + resultSize,
84 84 self->output.dst, self->output.pos);
85 85 }
86 86 else {
87 87 result = PyBytes_FromStringAndSize(self->output.dst, self->output.pos);
88 88 if (!result) {
89 89 goto finally;
90 90 }
91 91 }
92 92
93 93 self->output.pos = 0;
94 94 }
95 95 }
96 96
97 97 if (NULL == result) {
98 98 result = PyBytes_FromString("");
99 99 }
100 100
101 101 finally:
102 102 PyBuffer_Release(&source);
103 103
104 104 return result;
105 105 }
106 106
107 107 static PyObject* ZstdCompressionObj_flush(ZstdCompressionObj* self, PyObject* args, PyObject* kwargs) {
108 108 static char* kwlist[] = {
109 109 "flush_mode",
110 110 NULL
111 111 };
112 112
113 113 int flushMode = compressorobj_flush_finish;
114 114 size_t zresult;
115 115 PyObject* result = NULL;
116 116 Py_ssize_t resultSize = 0;
117 117 ZSTD_inBuffer input;
118 ZSTD_EndDirective zFlushMode;
118 119
119 120 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i:flush", kwlist, &flushMode)) {
120 121 return NULL;
121 122 }
122 123
123 124 if (flushMode != compressorobj_flush_finish && flushMode != compressorobj_flush_block) {
124 125 PyErr_SetString(PyExc_ValueError, "flush mode not recognized");
125 126 return NULL;
126 127 }
127 128
128 129 if (self->finished) {
129 130 PyErr_SetString(ZstdError, "compressor object already finished");
130 131 return NULL;
131 132 }
132 133
134 switch (flushMode) {
135 case compressorobj_flush_block:
136 zFlushMode = ZSTD_e_flush;
137 break;
138
139 case compressorobj_flush_finish:
140 zFlushMode = ZSTD_e_end;
141 self->finished = 1;
142 break;
143
144 default:
145 PyErr_SetString(ZstdError, "unhandled flush mode");
146 return NULL;
147 }
148
133 149 assert(self->output.pos == 0);
134 150
135 151 input.src = NULL;
136 152 input.size = 0;
137 153 input.pos = 0;
138 154
139 if (flushMode == compressorobj_flush_block) {
140 /* The output buffer is of size ZSTD_CStreamOutSize(), which is
141 guaranteed to hold a full block. */
155 while (1) {
142 156 Py_BEGIN_ALLOW_THREADS
143 zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output,
144 &input, ZSTD_e_flush);
157 zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output,
158 &input, zFlushMode);
145 159 Py_END_ALLOW_THREADS
146 160
147 161 if (ZSTD_isError(zresult)) {
148 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
149 return NULL;
150 }
151
152 /* Output buffer is guaranteed to hold full block. */
153 assert(zresult == 0);
154
155 if (self->output.pos) {
156 result = PyBytes_FromStringAndSize(self->output.dst, self->output.pos);
157 if (!result) {
158 return NULL;
159 }
160 }
161
162 self->output.pos = 0;
163
164 if (result) {
165 return result;
166 }
167 else {
168 return PyBytes_FromString("");
169 }
170 }
171
172 assert(flushMode == compressorobj_flush_finish);
173 self->finished = 1;
174
175 while (1) {
176 zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output,
177 &input, ZSTD_e_end);
178 if (ZSTD_isError(zresult)) {
179 162 PyErr_Format(ZstdError, "error ending compression stream: %s",
180 163 ZSTD_getErrorName(zresult));
181 164 return NULL;
182 165 }
183 166
184 167 if (self->output.pos) {
185 168 if (result) {
186 169 resultSize = PyBytes_GET_SIZE(result);
187 170
188 171 if (safe_pybytes_resize(&result, resultSize + self->output.pos)) {
189 172 Py_XDECREF(result);
190 173 return NULL;
191 174 }
192 175
193 176 memcpy(PyBytes_AS_STRING(result) + resultSize,
194 177 self->output.dst, self->output.pos);
195 178 }
196 179 else {
197 180 result = PyBytes_FromStringAndSize(self->output.dst, self->output.pos);
198 181 if (!result) {
199 182 return NULL;
200 183 }
201 184 }
202 185
203 186 self->output.pos = 0;
204 187 }
205 188
206 189 if (!zresult) {
207 190 break;
208 191 }
209 192 }
210 193
211 194 if (result) {
212 195 return result;
213 196 }
214 197 else {
215 198 return PyBytes_FromString("");
216 199 }
217 200 }
218 201
219 202 static PyMethodDef ZstdCompressionObj_methods[] = {
220 203 { "compress", (PyCFunction)ZstdCompressionObj_compress, METH_VARARGS | METH_KEYWORDS,
221 204 PyDoc_STR("compress data") },
222 205 { "flush", (PyCFunction)ZstdCompressionObj_flush, METH_VARARGS | METH_KEYWORDS,
223 206 PyDoc_STR("finish compression operation") },
224 207 { NULL, NULL }
225 208 };
226 209
227 210 PyTypeObject ZstdCompressionObjType = {
228 211 PyVarObject_HEAD_INIT(NULL, 0)
229 212 "zstd.ZstdCompressionObj", /* tp_name */
230 213 sizeof(ZstdCompressionObj), /* tp_basicsize */
231 214 0, /* tp_itemsize */
232 215 (destructor)ZstdCompressionObj_dealloc, /* tp_dealloc */
233 216 0, /* tp_print */
234 217 0, /* tp_getattr */
235 218 0, /* tp_setattr */
236 219 0, /* tp_compare */
237 220 0, /* tp_repr */
238 221 0, /* tp_as_number */
239 222 0, /* tp_as_sequence */
240 223 0, /* tp_as_mapping */
241 224 0, /* tp_hash */
242 225 0, /* tp_call */
243 226 0, /* tp_str */
244 227 0, /* tp_getattro */
245 228 0, /* tp_setattro */
246 229 0, /* tp_as_buffer */
247 230 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
248 231 ZstdCompressionObj__doc__, /* tp_doc */
249 232 0, /* tp_traverse */
250 233 0, /* tp_clear */
251 234 0, /* tp_richcompare */
252 235 0, /* tp_weaklistoffset */
253 236 0, /* tp_iter */
254 237 0, /* tp_iternext */
255 238 ZstdCompressionObj_methods, /* tp_methods */
256 239 0, /* tp_members */
257 240 0, /* tp_getset */
258 241 0, /* tp_base */
259 242 0, /* tp_dict */
260 243 0, /* tp_descr_get */
261 244 0, /* tp_descr_set */
262 245 0, /* tp_dictoffset */
263 246 0, /* tp_init */
264 247 0, /* tp_alloc */
265 248 PyType_GenericNew, /* tp_new */
266 249 };
267 250
268 251 void compressobj_module_init(PyObject* module) {
269 252 Py_TYPE(&ZstdCompressionObjType) = &PyType_Type;
270 253 if (PyType_Ready(&ZstdCompressionObjType) < 0) {
271 254 return;
272 255 }
273 256 }
@@ -1,1604 +1,1651 b''
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10 #include "pool.h"
11 11
12 12 extern PyObject* ZstdError;
13 13
14 int ensure_cctx(ZstdCompressor* compressor) {
14 int setup_cctx(ZstdCompressor* compressor) {
15 15 size_t zresult;
16 16
17 17 assert(compressor);
18 18 assert(compressor->cctx);
19 19 assert(compressor->params);
20 20
21 ZSTD_CCtx_reset(compressor->cctx);
22
23 21 zresult = ZSTD_CCtx_setParametersUsingCCtxParams(compressor->cctx, compressor->params);
24 22 if (ZSTD_isError(zresult)) {
25 23 PyErr_Format(ZstdError, "could not set compression parameters: %s",
26 24 ZSTD_getErrorName(zresult));
27 25 return 1;
28 26 }
29 27
30 28 if (compressor->dict) {
31 29 if (compressor->dict->cdict) {
32 30 zresult = ZSTD_CCtx_refCDict(compressor->cctx, compressor->dict->cdict);
33 31 }
34 32 else {
35 33 zresult = ZSTD_CCtx_loadDictionary_advanced(compressor->cctx,
36 34 compressor->dict->dictData, compressor->dict->dictSize,
37 35 ZSTD_dlm_byRef, compressor->dict->dictType);
38 36 }
39 37 if (ZSTD_isError(zresult)) {
40 38 PyErr_Format(ZstdError, "could not load compression dictionary: %s",
41 39 ZSTD_getErrorName(zresult));
42 40 return 1;
43 41 }
44 42 }
45 43
46 44 return 0;
47 45 }
48 46
49 47 static PyObject* frame_progression(ZSTD_CCtx* cctx) {
50 48 PyObject* result = NULL;
51 49 PyObject* value;
52 50 ZSTD_frameProgression progression;
53 51
54 52 result = PyTuple_New(3);
55 53 if (!result) {
56 54 return NULL;
57 55 }
58 56
59 57 progression = ZSTD_getFrameProgression(cctx);
60 58
61 59 value = PyLong_FromUnsignedLongLong(progression.ingested);
62 60 if (!value) {
63 61 Py_DECREF(result);
64 62 return NULL;
65 63 }
66 64
67 65 PyTuple_SET_ITEM(result, 0, value);
68 66
69 67 value = PyLong_FromUnsignedLongLong(progression.consumed);
70 68 if (!value) {
71 69 Py_DECREF(result);
72 70 return NULL;
73 71 }
74 72
75 73 PyTuple_SET_ITEM(result, 1, value);
76 74
77 75 value = PyLong_FromUnsignedLongLong(progression.produced);
78 76 if (!value) {
79 77 Py_DECREF(result);
80 78 return NULL;
81 79 }
82 80
83 81 PyTuple_SET_ITEM(result, 2, value);
84 82
85 83 return result;
86 84 }
87 85
88 86 PyDoc_STRVAR(ZstdCompressor__doc__,
89 87 "ZstdCompressor(level=None, dict_data=None, compression_params=None)\n"
90 88 "\n"
91 89 "Create an object used to perform Zstandard compression.\n"
92 90 "\n"
93 91 "An instance can compress data various ways. Instances can be used multiple\n"
94 92 "times. Each compression operation will use the compression parameters\n"
95 93 "defined at construction time.\n"
96 94 "\n"
97 95 "Compression can be configured via the following names arguments:\n"
98 96 "\n"
99 97 "level\n"
100 98 " Integer compression level.\n"
101 99 "dict_data\n"
102 100 " A ``ZstdCompressionDict`` to be used to compress with dictionary data.\n"
103 101 "compression_params\n"
104 102 " A ``CompressionParameters`` instance defining low-level compression"
105 103 " parameters. If defined, this will overwrite the ``level`` argument.\n"
106 104 "write_checksum\n"
107 105 " If True, a 4 byte content checksum will be written with the compressed\n"
108 106 " data, allowing the decompressor to perform content verification.\n"
109 107 "write_content_size\n"
110 108 " If True (the default), the decompressed content size will be included in\n"
111 109 " the header of the compressed data. This data will only be written if the\n"
112 110 " compressor knows the size of the input data.\n"
113 111 "write_dict_id\n"
114 112 " Determines whether the dictionary ID will be written into the compressed\n"
115 113 " data. Defaults to True. Only adds content to the compressed data if\n"
116 114 " a dictionary is being used.\n"
117 115 "threads\n"
118 116 " Number of threads to use to compress data concurrently. When set,\n"
119 117 " compression operations are performed on multiple threads. The default\n"
120 118 " value (0) disables multi-threaded compression. A value of ``-1`` means to\n"
121 119 " set the number of threads to the number of detected logical CPUs.\n"
122 120 );
123 121
124 122 static int ZstdCompressor_init(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
125 123 static char* kwlist[] = {
126 124 "level",
127 125 "dict_data",
128 126 "compression_params",
129 127 "write_checksum",
130 128 "write_content_size",
131 129 "write_dict_id",
132 130 "threads",
133 131 NULL
134 132 };
135 133
136 134 int level = 3;
137 135 ZstdCompressionDict* dict = NULL;
138 136 ZstdCompressionParametersObject* params = NULL;
139 137 PyObject* writeChecksum = NULL;
140 138 PyObject* writeContentSize = NULL;
141 139 PyObject* writeDictID = NULL;
142 140 int threads = 0;
143 141
144 142 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!O!OOOi:ZstdCompressor",
145 143 kwlist, &level, &ZstdCompressionDictType, &dict,
146 144 &ZstdCompressionParametersType, &params,
147 145 &writeChecksum, &writeContentSize, &writeDictID, &threads)) {
148 146 return -1;
149 147 }
150 148
151 149 if (level > ZSTD_maxCLevel()) {
152 150 PyErr_Format(PyExc_ValueError, "level must be less than %d",
153 151 ZSTD_maxCLevel() + 1);
154 152 return -1;
155 153 }
156 154
157 155 if (threads < 0) {
158 156 threads = cpu_count();
159 157 }
160 158
161 159 /* We create a ZSTD_CCtx for reuse among multiple operations to reduce the
162 160 overhead of each compression operation. */
163 161 self->cctx = ZSTD_createCCtx();
164 162 if (!self->cctx) {
165 163 PyErr_NoMemory();
166 164 return -1;
167 165 }
168 166
169 167 /* TODO stuff the original parameters away somewhere so we can reset later. This
170 168 will allow us to do things like automatically adjust cparams based on input
171 169 size (assuming zstd isn't doing that internally). */
172 170
173 171 self->params = ZSTD_createCCtxParams();
174 172 if (!self->params) {
175 173 PyErr_NoMemory();
176 174 return -1;
177 175 }
178 176
179 177 if (params && writeChecksum) {
180 178 PyErr_SetString(PyExc_ValueError,
181 179 "cannot define compression_params and write_checksum");
182 180 return -1;
183 181 }
184 182
185 183 if (params && writeContentSize) {
186 184 PyErr_SetString(PyExc_ValueError,
187 185 "cannot define compression_params and write_content_size");
188 186 return -1;
189 187 }
190 188
191 189 if (params && writeDictID) {
192 190 PyErr_SetString(PyExc_ValueError,
193 191 "cannot define compression_params and write_dict_id");
194 192 return -1;
195 193 }
196 194
197 195 if (params && threads) {
198 196 PyErr_SetString(PyExc_ValueError,
199 197 "cannot define compression_params and threads");
200 198 return -1;
201 199 }
202 200
203 201 if (params) {
204 202 if (set_parameters(self->params, params)) {
205 203 return -1;
206 204 }
207 205 }
208 206 else {
209 207 if (set_parameter(self->params, ZSTD_p_compressionLevel, level)) {
210 208 return -1;
211 209 }
212 210
213 211 if (set_parameter(self->params, ZSTD_p_contentSizeFlag,
214 212 writeContentSize ? PyObject_IsTrue(writeContentSize) : 1)) {
215 213 return -1;
216 214 }
217 215
218 216 if (set_parameter(self->params, ZSTD_p_checksumFlag,
219 217 writeChecksum ? PyObject_IsTrue(writeChecksum) : 0)) {
220 218 return -1;
221 219 }
222 220
223 221 if (set_parameter(self->params, ZSTD_p_dictIDFlag,
224 222 writeDictID ? PyObject_IsTrue(writeDictID) : 1)) {
225 223 return -1;
226 224 }
227 225
228 226 if (threads) {
229 227 if (set_parameter(self->params, ZSTD_p_nbWorkers, threads)) {
230 228 return -1;
231 229 }
232 230 }
233 231 }
234 232
235 233 if (dict) {
236 234 self->dict = dict;
237 235 Py_INCREF(dict);
238 236 }
239 237
240 if (ensure_cctx(self)) {
241 return -1;
242 }
238 if (setup_cctx(self)) {
239 return -1;
240 }
243 241
244 242 return 0;
245 243 }
246 244
247 245 static void ZstdCompressor_dealloc(ZstdCompressor* self) {
248 246 if (self->cctx) {
249 247 ZSTD_freeCCtx(self->cctx);
250 248 self->cctx = NULL;
251 249 }
252 250
253 251 if (self->params) {
254 252 ZSTD_freeCCtxParams(self->params);
255 253 self->params = NULL;
256 254 }
257 255
258 256 Py_XDECREF(self->dict);
259 257 PyObject_Del(self);
260 258 }
261 259
262 260 PyDoc_STRVAR(ZstdCompressor_memory_size__doc__,
263 261 "memory_size()\n"
264 262 "\n"
265 263 "Obtain the memory usage of this compressor, in bytes.\n"
266 264 );
267 265
268 266 static PyObject* ZstdCompressor_memory_size(ZstdCompressor* self) {
269 267 if (self->cctx) {
270 268 return PyLong_FromSize_t(ZSTD_sizeof_CCtx(self->cctx));
271 269 }
272 270 else {
273 271 PyErr_SetString(ZstdError, "no compressor context found; this should never happen");
274 272 return NULL;
275 273 }
276 274 }
277 275
278 276 PyDoc_STRVAR(ZstdCompressor_frame_progression__doc__,
279 277 "frame_progression()\n"
280 278 "\n"
281 279 "Return information on how much work the compressor has done.\n"
282 280 "\n"
283 281 "Returns a 3-tuple of (ingested, consumed, produced).\n"
284 282 );
285 283
286 284 static PyObject* ZstdCompressor_frame_progression(ZstdCompressor* self) {
287 285 return frame_progression(self->cctx);
288 286 }
289 287
290 288 PyDoc_STRVAR(ZstdCompressor_copy_stream__doc__,
291 289 "copy_stream(ifh, ofh[, size=0, read_size=default, write_size=default])\n"
292 290 "compress data between streams\n"
293 291 "\n"
294 292 "Data will be read from ``ifh``, compressed, and written to ``ofh``.\n"
295 293 "``ifh`` must have a ``read(size)`` method. ``ofh`` must have a ``write(data)``\n"
296 294 "method.\n"
297 295 "\n"
298 296 "An optional ``size`` argument specifies the size of the source stream.\n"
299 297 "If defined, compression parameters will be tuned based on the size.\n"
300 298 "\n"
301 299 "Optional arguments ``read_size`` and ``write_size`` define the chunk sizes\n"
302 300 "of ``read()`` and ``write()`` operations, respectively. By default, they use\n"
303 301 "the default compression stream input and output sizes, respectively.\n"
304 302 );
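A usage sketch matching the docstring above; ``io.BytesIO`` stands in for arbitrary stream objects, and the return value is the (read, written) byte totals assembled by this function:

    import io
    import zstandard as zstd

    cctx = zstd.ZstdCompressor()
    source = io.BytesIO(b"data to compress" * 1024)
    dest = io.BytesIO()
    bytes_read, bytes_written = cctx.copy_stream(source, dest)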
305 303
306 304 static PyObject* ZstdCompressor_copy_stream(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
307 305 static char* kwlist[] = {
308 306 "ifh",
309 307 "ofh",
310 308 "size",
311 309 "read_size",
312 310 "write_size",
313 311 NULL
314 312 };
315 313
316 314 PyObject* source;
317 315 PyObject* dest;
318 316 unsigned long long sourceSize = ZSTD_CONTENTSIZE_UNKNOWN;
319 317 size_t inSize = ZSTD_CStreamInSize();
320 318 size_t outSize = ZSTD_CStreamOutSize();
321 319 ZSTD_inBuffer input;
322 320 ZSTD_outBuffer output;
323 321 Py_ssize_t totalRead = 0;
324 322 Py_ssize_t totalWrite = 0;
325 323 char* readBuffer;
326 324 Py_ssize_t readSize;
327 325 PyObject* readResult = NULL;
328 326 PyObject* res = NULL;
329 327 size_t zresult;
330 328 PyObject* writeResult;
331 329 PyObject* totalReadPy;
332 330 PyObject* totalWritePy;
333 331
334 332 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|Kkk:copy_stream", kwlist,
335 333 &source, &dest, &sourceSize, &inSize, &outSize)) {
336 334 return NULL;
337 335 }
338 336
339 337 if (!PyObject_HasAttrString(source, "read")) {
340 338 PyErr_SetString(PyExc_ValueError, "first argument must have a read() method");
341 339 return NULL;
342 340 }
343 341
344 342 if (!PyObject_HasAttrString(dest, "write")) {
345 343 PyErr_SetString(PyExc_ValueError, "second argument must have a write() method");
346 344 return NULL;
347 345 }
348 346
349 if (ensure_cctx(self)) {
350 return NULL;
351 }
347 ZSTD_CCtx_reset(self->cctx);
352 348
353 349 zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, sourceSize);
354 350 if (ZSTD_isError(zresult)) {
355 351 PyErr_Format(ZstdError, "error setting source size: %s",
356 352 ZSTD_getErrorName(zresult));
357 353 return NULL;
358 354 }
359 355
360 356 /* Prevent free on uninitialized memory in finally. */
361 357 output.dst = PyMem_Malloc(outSize);
362 358 if (!output.dst) {
363 359 PyErr_NoMemory();
364 360 res = NULL;
365 361 goto finally;
366 362 }
367 363 output.size = outSize;
368 364 output.pos = 0;
369 365
370 366 input.src = NULL;
371 367 input.size = 0;
372 368 input.pos = 0;
373 369
374 370 while (1) {
375 371 /* Try to read from source stream. */
376 372 readResult = PyObject_CallMethod(source, "read", "n", inSize);
377 373 if (!readResult) {
378 374 PyErr_SetString(ZstdError, "could not read() from source");
379 375 goto finally;
380 376 }
381 377
382 378 PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize);
383 379
384 380 /* If no data was read, we're at EOF. */
385 381 if (0 == readSize) {
386 382 break;
387 383 }
388 384
389 385 totalRead += readSize;
390 386
391 387 /* Send data to compressor */
392 388 input.src = readBuffer;
393 389 input.size = readSize;
394 390 input.pos = 0;
395 391
396 392 while (input.pos < input.size) {
397 393 Py_BEGIN_ALLOW_THREADS
398 394 zresult = ZSTD_compress_generic(self->cctx, &output, &input, ZSTD_e_continue);
399 395 Py_END_ALLOW_THREADS
400 396
401 397 if (ZSTD_isError(zresult)) {
402 398 res = NULL;
403 399 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
404 400 goto finally;
405 401 }
406 402
407 403 if (output.pos) {
408 404 #if PY_MAJOR_VERSION >= 3
409 405 writeResult = PyObject_CallMethod(dest, "write", "y#",
410 406 #else
411 407 writeResult = PyObject_CallMethod(dest, "write", "s#",
412 408 #endif
413 409 output.dst, output.pos);
414 410 Py_XDECREF(writeResult);
415 411 totalWrite += output.pos;
416 412 output.pos = 0;
417 413 }
418 414 }
419 415
420 416 Py_CLEAR(readResult);
421 417 }
422 418
423 419 /* We've finished reading. Now flush the compressor stream. */
424 420 assert(input.pos == input.size);
425 421
426 422 while (1) {
427 423 Py_BEGIN_ALLOW_THREADS
428 424 zresult = ZSTD_compress_generic(self->cctx, &output, &input, ZSTD_e_end);
429 425 Py_END_ALLOW_THREADS
430 426
431 427 if (ZSTD_isError(zresult)) {
432 428 PyErr_Format(ZstdError, "error ending compression stream: %s",
433 429 ZSTD_getErrorName(zresult));
434 430 res = NULL;
435 431 goto finally;
436 432 }
437 433
438 434 if (output.pos) {
439 435 #if PY_MAJOR_VERSION >= 3
440 436 writeResult = PyObject_CallMethod(dest, "write", "y#",
441 437 #else
442 438 writeResult = PyObject_CallMethod(dest, "write", "s#",
443 439 #endif
444 440 output.dst, output.pos);
445 441 totalWrite += output.pos;
446 442 Py_XDECREF(writeResult);
447 443 output.pos = 0;
448 444 }
449 445
450 446 if (!zresult) {
451 447 break;
452 448 }
453 449 }
454 450
455 451 totalReadPy = PyLong_FromSsize_t(totalRead);
456 452 totalWritePy = PyLong_FromSsize_t(totalWrite);
457 453 res = PyTuple_Pack(2, totalReadPy, totalWritePy);
458 454 Py_DECREF(totalReadPy);
459 455 Py_DECREF(totalWritePy);
460 456
461 457 finally:
462 458 if (output.dst) {
463 459 PyMem_Free(output.dst);
464 460 }
465 461
466 462 Py_XDECREF(readResult);
467 463
468 464 return res;
469 465 }
470 466
471 467 PyDoc_STRVAR(ZstdCompressor_stream_reader__doc__,
472 468 "stream_reader(source, [size=0])\n"
473 469 "\n"
474 470 "Obtain an object that behaves like an I/O stream.\n"
475 471 "\n"
476 472 "The source object can be any object with a ``read(size)`` method\n"
477 473 "or an object that conforms to the buffer protocol.\n"
478 474 );
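An illustrative sketch of the documented behavior (assuming the returned reader is driven as a context manager, as in earlier releases); ``read()`` yields compressed bytes:

    import io
    import zstandard as zstd

    cctx = zstd.ZstdCompressor()
    source = io.BytesIO(b"data to compress" * 1024)
    with cctx.stream_reader(source) as reader:
        while True:
            chunk = reader.read(16384)
            if not chunk:
                break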
479 475
480 476 static ZstdCompressionReader* ZstdCompressor_stream_reader(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
481 477 static char* kwlist[] = {
482 478 "source",
483 479 "size",
484 480 "read_size",
485 481 NULL
486 482 };
487 483
488 484 PyObject* source;
489 485 unsigned long long sourceSize = ZSTD_CONTENTSIZE_UNKNOWN;
490 486 size_t readSize = ZSTD_CStreamInSize();
491 487 ZstdCompressionReader* result = NULL;
488 size_t zresult;
492 489
493 490 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Kk:stream_reader", kwlist,
494 491 &source, &sourceSize, &readSize)) {
495 492 return NULL;
496 493 }
497 494
498 495 result = (ZstdCompressionReader*)PyObject_CallObject((PyObject*)&ZstdCompressionReaderType, NULL);
499 496 if (!result) {
500 497 return NULL;
501 498 }
502 499
503 500 if (PyObject_HasAttrString(source, "read")) {
504 501 result->reader = source;
505 502 Py_INCREF(source);
506 503 result->readSize = readSize;
507 504 }
508 505 else if (1 == PyObject_CheckBuffer(source)) {
509 506 if (0 != PyObject_GetBuffer(source, &result->buffer, PyBUF_CONTIG_RO)) {
510 507 goto except;
511 508 }
512 509
513 510 assert(result->buffer.len >= 0);
514 511
515 512 sourceSize = result->buffer.len;
516 513 }
517 514 else {
518 515 PyErr_SetString(PyExc_TypeError,
519 516 "must pass an object with a read() method or that conforms to the buffer protocol");
520 517 goto except;
521 518 }
522 519
523 if (ensure_cctx(self)) {
520 ZSTD_CCtx_reset(self->cctx);
521
522 zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, sourceSize);
523 if (ZSTD_isError(zresult)) {
524 PyErr_Format(ZstdError, "error setting source size: %s",
525 ZSTD_getErrorName(zresult));
524 526 goto except;
525 527 }
526 528
527 529 result->compressor = self;
528 530 Py_INCREF(self);
529 result->sourceSize = sourceSize;
530 531
531 532 return result;
532 533
533 534 except:
534 535 Py_CLEAR(result);
535 536
536 537 return NULL;
537 538 }
538 539
539 540 PyDoc_STRVAR(ZstdCompressor_compress__doc__,
540 541 "compress(data)\n"
541 542 "\n"
542 543 "Compress data in a single operation.\n"
543 544 "\n"
544 545 "This is the simplest mechanism to perform compression: simply pass in a\n"
545 546 "value and get a compressed value back. It is almost the most prone to abuse.\n"
546 547 "The input and output values must fit in memory, so passing in very large\n"
547 548 "values can result in excessive memory usage. For this reason, one of the\n"
548 549 "streaming based APIs is preferred for larger values.\n"
549 550 );
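The one-shot call described above, as a sketch:

    import zstandard as zstd

    cctx = zstd.ZstdCompressor(level=3)
    compressed = cctx.compress(b"data to compress")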
550 551
551 552 static PyObject* ZstdCompressor_compress(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
552 553 static char* kwlist[] = {
553 554 "data",
554 555 NULL
555 556 };
556 557
557 558 Py_buffer source;
558 559 size_t destSize;
559 560 PyObject* output = NULL;
560 561 size_t zresult;
561 562 ZSTD_outBuffer outBuffer;
562 563 ZSTD_inBuffer inBuffer;
563 564
564 565 #if PY_MAJOR_VERSION >= 3
565 566 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|O:compress",
566 567 #else
567 568 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|O:compress",
568 569 #endif
569 570 kwlist, &source)) {
570 571 return NULL;
571 572 }
572 573
573 574 if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) {
574 575 PyErr_SetString(PyExc_ValueError,
575 576 "data buffer should be contiguous and have at most one dimension");
576 577 goto finally;
577 578 }
578 579
579 if (ensure_cctx(self)) {
580 goto finally;
581 }
580 ZSTD_CCtx_reset(self->cctx);
582 581
583 582 destSize = ZSTD_compressBound(source.len);
584 583 output = PyBytes_FromStringAndSize(NULL, destSize);
585 584 if (!output) {
586 585 goto finally;
587 586 }
588 587
589 588 zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, source.len);
590 589 if (ZSTD_isError(zresult)) {
591 590 PyErr_Format(ZstdError, "error setting source size: %s",
592 591 ZSTD_getErrorName(zresult));
593 592 Py_CLEAR(output);
594 593 goto finally;
595 594 }
596 595
597 596 inBuffer.src = source.buf;
598 597 inBuffer.size = source.len;
599 598 inBuffer.pos = 0;
600 599
601 600 outBuffer.dst = PyBytes_AsString(output);
602 601 outBuffer.size = destSize;
603 602 outBuffer.pos = 0;
604 603
605 604 Py_BEGIN_ALLOW_THREADS
606 605 /* By avoiding ZSTD_compress(), we don't necessarily write out content
607 606 size. This means the argument to ZstdCompressor to control frame
608 607 parameters is honored. */
609 608 zresult = ZSTD_compress_generic(self->cctx, &outBuffer, &inBuffer, ZSTD_e_end);
610 609 Py_END_ALLOW_THREADS
611 610
612 611 if (ZSTD_isError(zresult)) {
613 612 PyErr_Format(ZstdError, "cannot compress: %s", ZSTD_getErrorName(zresult));
614 613 Py_CLEAR(output);
615 614 goto finally;
616 615 }
617 616 else if (zresult) {
618 617 PyErr_SetString(ZstdError, "unexpected partial frame flush");
619 618 Py_CLEAR(output);
620 619 goto finally;
621 620 }
622 621
623 622 Py_SIZE(output) = outBuffer.pos;
624 623
625 624 finally:
626 625 PyBuffer_Release(&source);
627 626 return output;
628 627 }
629 628
630 629 PyDoc_STRVAR(ZstdCompressionObj__doc__,
631 630 "compressobj()\n"
632 631 "\n"
633 632 "Return an object exposing ``compress(data)`` and ``flush()`` methods.\n"
634 633 "\n"
635 634 "The returned object exposes an API similar to ``zlib.compressobj`` and\n"
636 635 "``bz2.BZ2Compressor`` so that callers can swap in the zstd compressor\n"
637 636 "without changing how compression is performed.\n"
638 637 );
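A sketch of the ``zlib``-style usage described above:

    import zstandard as zstd

    cobj = zstd.ZstdCompressor().compressobj()
    frame = cobj.compress(b"chunk 1") + cobj.compress(b"chunk 2") + cobj.flush()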
639 638
640 639 static ZstdCompressionObj* ZstdCompressor_compressobj(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
641 640 static char* kwlist[] = {
642 641 "size",
643 642 NULL
644 643 };
645 644
646 645 unsigned long long inSize = ZSTD_CONTENTSIZE_UNKNOWN;
647 646 size_t outSize = ZSTD_CStreamOutSize();
648 647 ZstdCompressionObj* result = NULL;
649 648 size_t zresult;
650 649
651 650 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|K:compressobj", kwlist, &inSize)) {
652 651 return NULL;
653 652 }
654 653
655 if (ensure_cctx(self)) {
656 return NULL;
657 }
654 ZSTD_CCtx_reset(self->cctx);
658 655
659 656 zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, inSize);
660 657 if (ZSTD_isError(zresult)) {
661 658 PyErr_Format(ZstdError, "error setting source size: %s",
662 659 ZSTD_getErrorName(zresult));
663 660 return NULL;
664 661 }
665 662
666 663 result = (ZstdCompressionObj*)PyObject_CallObject((PyObject*)&ZstdCompressionObjType, NULL);
667 664 if (!result) {
668 665 return NULL;
669 666 }
670 667
671 668 result->output.dst = PyMem_Malloc(outSize);
672 669 if (!result->output.dst) {
673 670 PyErr_NoMemory();
674 671 Py_DECREF(result);
675 672 return NULL;
676 673 }
677 674 result->output.size = outSize;
678 675 result->compressor = self;
679 676 Py_INCREF(result->compressor);
680 677
681 678 return result;
682 679 }
683 680
684 681 PyDoc_STRVAR(ZstdCompressor_read_to_iter__doc__,
685 682 "read_to_iter(reader, [size=0, read_size=default, write_size=default])\n"
686 683 "Read uncompressed data from a reader and return an iterator\n"
687 684 "\n"
688 685 "Returns an iterator of compressed data produced from reading from ``reader``.\n"
689 686 "\n"
690 687 "Uncompressed data will be obtained from ``reader`` by calling the\n"
691 688 "``read(size)`` method of it. The source data will be streamed into a\n"
692 689 "compressor. As compressed data is available, it will be exposed to the\n"
693 690 "iterator.\n"
694 691 "\n"
695 692 "Data is read from the source in chunks of ``read_size``. Compressed chunks\n"
696 693 "are at most ``write_size`` bytes. Both values default to the zstd input and\n"
697 694 "and output defaults, respectively.\n"
698 695 "\n"
699 696 "The caller is partially in control of how fast data is fed into the\n"
700 697 "compressor by how it consumes the returned iterator. The compressor will\n"
701 698 "not consume from the reader unless the caller consumes from the iterator.\n"
702 699 );
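A sketch of the pull-based iteration described above (``io.BytesIO`` stands in for any object with a ``read(size)`` method):

    import io
    import zstandard as zstd

    cctx = zstd.ZstdCompressor()
    source = io.BytesIO(b"data to compress" * 1024)
    compressed = b"".join(cctx.read_to_iter(source, read_size=16384))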
703 700
704 701 static ZstdCompressorIterator* ZstdCompressor_read_to_iter(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
705 702 static char* kwlist[] = {
706 703 "reader",
707 704 "size",
708 705 "read_size",
709 706 "write_size",
710 707 NULL
711 708 };
712 709
713 710 PyObject* reader;
714 711 unsigned long long sourceSize = ZSTD_CONTENTSIZE_UNKNOWN;
715 712 size_t inSize = ZSTD_CStreamInSize();
716 713 size_t outSize = ZSTD_CStreamOutSize();
717 714 ZstdCompressorIterator* result;
718 715 size_t zresult;
719 716
720 717 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Kkk:read_to_iter", kwlist,
721 718 &reader, &sourceSize, &inSize, &outSize)) {
722 719 return NULL;
723 720 }
724 721
725 722 result = (ZstdCompressorIterator*)PyObject_CallObject((PyObject*)&ZstdCompressorIteratorType, NULL);
726 723 if (!result) {
727 724 return NULL;
728 725 }
729 726 if (PyObject_HasAttrString(reader, "read")) {
730 727 result->reader = reader;
731 728 Py_INCREF(result->reader);
732 729 }
733 730 else if (1 == PyObject_CheckBuffer(reader)) {
734 731 if (0 != PyObject_GetBuffer(reader, &result->buffer, PyBUF_CONTIG_RO)) {
735 732 goto except;
736 733 }
737 734
738 735 sourceSize = result->buffer.len;
739 736 }
740 737 else {
741 738 PyErr_SetString(PyExc_ValueError,
742 739 "must pass an object with a read() method or conforms to buffer protocol");
743 740 goto except;
744 741 }
745 742
746 if (ensure_cctx(self)) {
747 return NULL;
748 }
743 ZSTD_CCtx_reset(self->cctx);
749 744
750 745 zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, sourceSize);
751 746 if (ZSTD_isError(zresult)) {
752 747 PyErr_Format(ZstdError, "error setting source size: %s",
753 748 ZSTD_getErrorName(zresult));
754 749 return NULL;
755 750 }
756 751
757 752 result->compressor = self;
758 753 Py_INCREF(result->compressor);
759 754
760 755 result->inSize = inSize;
761 756 result->outSize = outSize;
762 757
763 758 result->output.dst = PyMem_Malloc(outSize);
764 759 if (!result->output.dst) {
765 760 PyErr_NoMemory();
766 761 goto except;
767 762 }
768 763 result->output.size = outSize;
769 764
770 765 goto finally;
771 766
772 767 except:
773 768 Py_CLEAR(result);
774 769
775 770 finally:
776 771 return result;
777 772 }
778 773
779 774 PyDoc_STRVAR(ZstdCompressor_stream_writer___doc__,
780 775 "Create a context manager to write compressed data to an object.\n"
781 776 "\n"
782 777 "The passed object must have a ``write()`` method.\n"
783 778 "\n"
784 779 "The caller feeds input data to the object by calling ``compress(data)``.\n"
785 780 "Compressed data is written to the argument given to this function.\n"
786 781 "\n"
787 782 "The function takes an optional ``size`` argument indicating the total size\n"
788 783 "of the eventual input. If specified, the size will influence compression\n"
789 784 "parameter tuning and could result in the size being written into the\n"
790 785 "header of the compressed data.\n"
791 786 "\n"
792 787 "An optional ``write_size`` argument is also accepted. It defines the maximum\n"
793 788 "byte size of chunks fed to ``write()``. By default, it uses the zstd default\n"
794 789 "for a compressor output stream.\n"
795 790 );
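A sketch of the context-manager usage described above; the frame is finalized when the ``with`` block exits:

    import io
    import zstandard as zstd

    cctx = zstd.ZstdCompressor()
    dest = io.BytesIO()
    with cctx.stream_writer(dest) as compressor:
        compressor.write(b"data to compress")
    frame = dest.getvalue()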
796 791
797 792 static ZstdCompressionWriter* ZstdCompressor_stream_writer(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
798 793 static char* kwlist[] = {
799 794 "writer",
800 795 "size",
801 796 "write_size",
802 797 NULL
803 798 };
804 799
805 800 PyObject* writer;
806 801 ZstdCompressionWriter* result;
807 802 unsigned long long sourceSize = ZSTD_CONTENTSIZE_UNKNOWN;
808 803 size_t outSize = ZSTD_CStreamOutSize();
809 804
810 805 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Kk:stream_writer", kwlist,
811 806 &writer, &sourceSize, &outSize)) {
812 807 return NULL;
813 808 }
814 809
815 810 if (!PyObject_HasAttrString(writer, "write")) {
816 811 PyErr_SetString(PyExc_ValueError, "must pass an object with a write() method");
817 812 return NULL;
818 813 }
819 814
820 if (ensure_cctx(self)) {
821 return NULL;
822 }
815 ZSTD_CCtx_reset(self->cctx);
823 816
824 817 result = (ZstdCompressionWriter*)PyObject_CallObject((PyObject*)&ZstdCompressionWriterType, NULL);
825 818 if (!result) {
826 819 return NULL;
827 820 }
828 821
829 822 result->compressor = self;
830 823 Py_INCREF(result->compressor);
831 824
832 825 result->writer = writer;
833 826 Py_INCREF(result->writer);
834 827
835 828 result->sourceSize = sourceSize;
836 829 result->outSize = outSize;
837 830 result->bytesCompressed = 0;
838 831
839 832 return result;
840 833 }
841 834
835 PyDoc_STRVAR(ZstdCompressor_chunker__doc__,
836 "Create an object for iterative compressing to same-sized chunks.\n"
837 );
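The chunker added here is pull-based: ``compress()``, ``flush()`` and ``finish()`` each return an iterator of ``chunk_size``-byte chunks (the final chunk may be smaller). A sketch, with arbitrary input data:

    import zstandard as zstd

    cctx = zstd.ZstdCompressor()
    chunker = cctx.chunker(chunk_size=32768)

    chunks = []
    for data in (b"first block", b"second block"):
        chunks.extend(chunker.compress(data))
    chunks.extend(chunker.finish())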
838
839 static ZstdCompressionChunker* ZstdCompressor_chunker(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
840 static char* kwlist[] = {
841 "size",
842 "chunk_size",
843 NULL
844 };
845
846 unsigned long long sourceSize = ZSTD_CONTENTSIZE_UNKNOWN;
847 size_t chunkSize = ZSTD_CStreamOutSize();
848 ZstdCompressionChunker* chunker;
849 size_t zresult;
850
851 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|Kk:chunker", kwlist,
852 &sourceSize, &chunkSize)) {
853 return NULL;
854 }
855
856 ZSTD_CCtx_reset(self->cctx);
857
858 zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, sourceSize);
859 if (ZSTD_isError(zresult)) {
860 PyErr_Format(ZstdError, "error setting source size: %s",
861 ZSTD_getErrorName(zresult));
862 return NULL;
863 }
864
865 chunker = (ZstdCompressionChunker*)PyObject_CallObject((PyObject*)&ZstdCompressionChunkerType, NULL);
866 if (!chunker) {
867 return NULL;
868 }
869
870 chunker->output.dst = PyMem_Malloc(chunkSize);
871 if (!chunker->output.dst) {
872 PyErr_NoMemory();
873 Py_DECREF(chunker);
874 return NULL;
875 }
876 chunker->output.size = chunkSize;
877 chunker->output.pos = 0;
878
879 chunker->compressor = self;
880 Py_INCREF(chunker->compressor);
881
882 chunker->chunkSize = chunkSize;
883
884 return chunker;
885 }
886
842 887 typedef struct {
843 888 void* sourceData;
844 889 size_t sourceSize;
845 890 } DataSource;
846 891
847 892 typedef struct {
848 893 DataSource* sources;
849 894 Py_ssize_t sourcesSize;
850 895 unsigned long long totalSourceSize;
851 896 } DataSources;
852 897
853 898 typedef struct {
854 899 void* dest;
855 900 Py_ssize_t destSize;
856 901 BufferSegment* segments;
857 902 Py_ssize_t segmentsSize;
858 903 } DestBuffer;
859 904
860 905 typedef enum {
861 906 WorkerError_none = 0,
862 907 WorkerError_zstd = 1,
863 908 WorkerError_no_memory = 2,
864 909 WorkerError_nospace = 3,
865 910 } WorkerError;
866 911
867 912 /**
868 913 * Holds state for an individual worker performing multi_compress_to_buffer work.
869 914 */
870 915 typedef struct {
871 916 /* Used for compression. */
872 917 ZSTD_CCtx* cctx;
873 918
874 919 /* What to compress. */
875 920 DataSource* sources;
876 921 Py_ssize_t sourcesSize;
877 922 Py_ssize_t startOffset;
878 923 Py_ssize_t endOffset;
879 924 unsigned long long totalSourceSize;
880 925
881 926 /* Result storage. */
882 927 DestBuffer* destBuffers;
883 928 Py_ssize_t destCount;
884 929
885 930 /* Error tracking. */
886 931 WorkerError error;
887 932 size_t zresult;
888 933 Py_ssize_t errorOffset;
889 934 } WorkerState;
890 935
891 936 static void compress_worker(WorkerState* state) {
892 937 Py_ssize_t inputOffset = state->startOffset;
893 938 Py_ssize_t remainingItems = state->endOffset - state->startOffset + 1;
894 939 Py_ssize_t currentBufferStartOffset = state->startOffset;
895 940 size_t zresult;
896 941 void* newDest;
897 942 size_t allocationSize;
898 943 size_t boundSize;
899 944 Py_ssize_t destOffset = 0;
900 945 DataSource* sources = state->sources;
901 946 DestBuffer* destBuffer;
902 947
903 948 assert(!state->destBuffers);
904 949 assert(0 == state->destCount);
905 950
906 951 /*
907 952 * The total size of the compressed data is unknown until we actually
908 953 * compress data. That means we can't pre-allocate the exact size we need.
909 954 *
910 955 * There is a cost to every allocation and reallocation. So, it is in our
911 956 * interest to minimize the number of allocations.
912 957 *
913 958 * There is also a cost to too few allocations. If allocations are too
914 959 * large they may fail. If buffers are shared and the inputs fall out of
915 960 * use at different times, then a reference to one segment
916 961 * in the buffer will keep the entire buffer alive. This leads to excessive
917 962 * memory usage.
918 963 *
919 964 * Our current strategy is to assume a compression ratio of 16:1 and
920 965 * allocate buffers of that size, rounded up to the nearest power of 2
921 966 * (because computers like round numbers). That ratio is greater than what
922 967 * most inputs achieve. This is by design: we don't want to over-allocate.
923 968 * But we don't want to under-allocate and lead to too many buffers either.
924 969 */
925 970
926 971 state->destCount = 1;
927 972
928 973 state->destBuffers = calloc(1, sizeof(DestBuffer));
929 974 if (NULL == state->destBuffers) {
930 975 state->error = WorkerError_no_memory;
931 976 return;
932 977 }
933 978
934 979 destBuffer = &state->destBuffers[state->destCount - 1];
935 980
936 981 /*
937 982 * Rather than track bounds and grow the segments buffer, allocate space
938 983 * to hold remaining items then truncate when we're done with it.
939 984 */
940 985 destBuffer->segments = calloc(remainingItems, sizeof(BufferSegment));
941 986 if (NULL == destBuffer->segments) {
942 987 state->error = WorkerError_no_memory;
943 988 return;
944 989 }
945 990
946 991 destBuffer->segmentsSize = remainingItems;
947 992
948 993 assert(state->totalSourceSize <= SIZE_MAX);
949 994 allocationSize = roundpow2((size_t)state->totalSourceSize >> 4);
950 995
951 996 /* If the maximum size of the output is larger than that, round up. */
952 997 boundSize = ZSTD_compressBound(sources[inputOffset].sourceSize);
953 998
954 999 if (boundSize > allocationSize) {
955 1000 allocationSize = roundpow2(boundSize);
956 1001 }
957 1002
958 1003 destBuffer->dest = malloc(allocationSize);
959 1004 if (NULL == destBuffer->dest) {
960 1005 state->error = WorkerError_no_memory;
961 1006 return;
962 1007 }
963 1008
964 1009 destBuffer->destSize = allocationSize;
965 1010
966 1011 for (inputOffset = state->startOffset; inputOffset <= state->endOffset; inputOffset++) {
967 1012 void* source = sources[inputOffset].sourceData;
968 1013 size_t sourceSize = sources[inputOffset].sourceSize;
969 1014 size_t destAvailable;
970 1015 void* dest;
971 1016 ZSTD_outBuffer opOutBuffer;
972 1017 ZSTD_inBuffer opInBuffer;
973 1018
974 1019 destAvailable = destBuffer->destSize - destOffset;
975 1020 boundSize = ZSTD_compressBound(sourceSize);
976 1021
977 1022 /*
978 1023 * Not enough space in current buffer to hold largest compressed output.
979 1024 * So allocate and switch to a new output buffer.
980 1025 */
981 1026 if (boundSize > destAvailable) {
982 1027 /*
983 1028 * The downsizing of the existing buffer is optional. It should be cheap
984 1029 * (unlike growing). So we just do it.
985 1030 */
986 1031 if (destAvailable) {
987 1032 newDest = realloc(destBuffer->dest, destOffset);
988 1033 if (NULL == newDest) {
989 1034 state->error = WorkerError_no_memory;
990 1035 return;
991 1036 }
992 1037
993 1038 destBuffer->dest = newDest;
994 1039 destBuffer->destSize = destOffset;
995 1040 }
996 1041
997 1042 /* Truncate segments buffer. */
998 1043 newDest = realloc(destBuffer->segments,
999 1044 (inputOffset - currentBufferStartOffset + 1) * sizeof(BufferSegment));
1000 1045 if (NULL == newDest) {
1001 1046 state->error = WorkerError_no_memory;
1002 1047 return;
1003 1048 }
1004 1049
1005 1050 destBuffer->segments = newDest;
1006 1051 destBuffer->segmentsSize = inputOffset - currentBufferStartOffset;
1007 1052
1008 1053 /* Grow space for new struct. */
1009 1054 /* TODO consider over-allocating so we don't do this every time. */
1010 1055 newDest = realloc(state->destBuffers, (state->destCount + 1) * sizeof(DestBuffer));
1011 1056 if (NULL == newDest) {
1012 1057 state->error = WorkerError_no_memory;
1013 1058 return;
1014 1059 }
1015 1060
1016 1061 state->destBuffers = newDest;
1017 1062 state->destCount++;
1018 1063
1019 1064 destBuffer = &state->destBuffers[state->destCount - 1];
1020 1065
1021 1066 /* Don't take any chances with non-NULL pointers. */
1022 1067 memset(destBuffer, 0, sizeof(DestBuffer));
1023 1068
1024 1069 /**
1025 1070 * We could dynamically update allocation size based on work done so far.
1026 1071 * For now, keep it simple.
1027 1072 */
1028 1073 assert(state->totalSourceSize <= SIZE_MAX);
1029 1074 allocationSize = roundpow2((size_t)state->totalSourceSize >> 4);
1030 1075
1031 1076 if (boundSize > allocationSize) {
1032 1077 allocationSize = roundpow2(boundSize);
1033 1078 }
1034 1079
1035 1080 destBuffer->dest = malloc(allocationSize);
1036 1081 if (NULL == destBuffer->dest) {
1037 1082 state->error = WorkerError_no_memory;
1038 1083 return;
1039 1084 }
1040 1085
1041 1086 destBuffer->destSize = allocationSize;
1042 1087 destAvailable = allocationSize;
1043 1088 destOffset = 0;
1044 1089
1045 1090 destBuffer->segments = calloc(remainingItems, sizeof(BufferSegment));
1046 1091 if (NULL == destBuffer->segments) {
1047 1092 state->error = WorkerError_no_memory;
1048 1093 return;
1049 1094 }
1050 1095
1051 1096 destBuffer->segmentsSize = remainingItems;
1052 1097 currentBufferStartOffset = inputOffset;
1053 1098 }
1054 1099
1055 1100 dest = (char*)destBuffer->dest + destOffset;
1056 1101
1057 1102 opInBuffer.src = source;
1058 1103 opInBuffer.size = sourceSize;
1059 1104 opInBuffer.pos = 0;
1060 1105
1061 1106 opOutBuffer.dst = dest;
1062 1107 opOutBuffer.size = destAvailable;
1063 1108 opOutBuffer.pos = 0;
1064 1109
1065 1110 zresult = ZSTD_CCtx_setPledgedSrcSize(state->cctx, sourceSize);
1066 1111 if (ZSTD_isError(zresult)) {
1067 1112 state->error = WorkerError_zstd;
1068 1113 state->zresult = zresult;
1069 1114 state->errorOffset = inputOffset;
1070 1115 break;
1071 1116 }
1072 1117
1073 1118 zresult = ZSTD_compress_generic(state->cctx, &opOutBuffer, &opInBuffer, ZSTD_e_end);
1074 1119 if (ZSTD_isError(zresult)) {
1075 1120 state->error = WorkerError_zstd;
1076 1121 state->zresult = zresult;
1077 1122 state->errorOffset = inputOffset;
1078 1123 break;
1079 1124 }
1080 1125 else if (zresult) {
1081 1126 state->error = WorkerError_nospace;
1082 1127 state->errorOffset = inputOffset;
1083 1128 break;
1084 1129 }
1085 1130
1086 1131 destBuffer->segments[inputOffset - currentBufferStartOffset].offset = destOffset;
1087 1132 destBuffer->segments[inputOffset - currentBufferStartOffset].length = opOutBuffer.pos;
1088 1133
1089 1134 destOffset += opOutBuffer.pos;
1090 1135 remainingItems--;
1091 1136 }
1092 1137
1093 1138 if (destBuffer->destSize > destOffset) {
1094 1139 newDest = realloc(destBuffer->dest, destOffset);
1095 1140 if (NULL == newDest) {
1096 1141 state->error = WorkerError_no_memory;
1097 1142 return;
1098 1143 }
1099 1144
1100 1145 destBuffer->dest = newDest;
1101 1146 destBuffer->destSize = destOffset;
1102 1147 }
1103 1148 }
1104 1149
1105 1150 ZstdBufferWithSegmentsCollection* compress_from_datasources(ZstdCompressor* compressor,
1106 1151 DataSources* sources, Py_ssize_t threadCount) {
1107 1152 unsigned long long bytesPerWorker;
1108 1153 POOL_ctx* pool = NULL;
1109 1154 WorkerState* workerStates = NULL;
1110 1155 Py_ssize_t i;
1111 1156 unsigned long long workerBytes = 0;
1112 1157 Py_ssize_t workerStartOffset = 0;
1113 1158 Py_ssize_t currentThread = 0;
1114 1159 int errored = 0;
1115 1160 Py_ssize_t segmentsCount = 0;
1116 1161 Py_ssize_t segmentIndex;
1117 1162 PyObject* segmentsArg = NULL;
1118 1163 ZstdBufferWithSegments* buffer;
1119 1164 ZstdBufferWithSegmentsCollection* result = NULL;
1120 1165
1121 1166 assert(sources->sourcesSize > 0);
1122 1167 assert(sources->totalSourceSize > 0);
1123 1168 assert(threadCount >= 1);
1124 1169
1125 1170 /* More threads than inputs makes no sense. */
1126 1171 threadCount = sources->sourcesSize < threadCount ? sources->sourcesSize
1127 1172 : threadCount;
1128 1173
1129 1174 /* TODO lower thread count when input size is too small and threads would add
1130 1175 overhead. */
1131 1176
1132 1177 workerStates = PyMem_Malloc(threadCount * sizeof(WorkerState));
1133 1178 if (NULL == workerStates) {
1134 1179 PyErr_NoMemory();
1135 1180 goto finally;
1136 1181 }
1137 1182
1138 1183 memset(workerStates, 0, threadCount * sizeof(WorkerState));
1139 1184
1140 1185 if (threadCount > 1) {
1141 1186 pool = POOL_create(threadCount, 1);
1142 1187 if (NULL == pool) {
1143 1188 PyErr_SetString(ZstdError, "could not initialize zstd thread pool");
1144 1189 goto finally;
1145 1190 }
1146 1191 }
1147 1192
1148 1193 bytesPerWorker = sources->totalSourceSize / threadCount;
1149 1194
1150 1195 for (i = 0; i < threadCount; i++) {
1151 1196 size_t zresult;
1152 1197
1153 1198 workerStates[i].cctx = ZSTD_createCCtx();
1154 1199 if (!workerStates[i].cctx) {
1155 1200 PyErr_NoMemory();
1156 1201 goto finally;
1157 1202 }
1158 1203
1159 1204 zresult = ZSTD_CCtx_setParametersUsingCCtxParams(workerStates[i].cctx,
1160 1205 compressor->params);
1161 1206 if (ZSTD_isError(zresult)) {
1162 1207 PyErr_Format(ZstdError, "could not set compression parameters: %s",
1163 1208 ZSTD_getErrorName(zresult));
1164 1209 goto finally;
1165 1210 }
1166 1211
1167 1212 if (compressor->dict) {
1168 1213 if (compressor->dict->cdict) {
1169 1214 zresult = ZSTD_CCtx_refCDict(workerStates[i].cctx, compressor->dict->cdict);
1170 1215 }
1171 1216 else {
1172 1217 zresult = ZSTD_CCtx_loadDictionary_advanced(
1173 1218 workerStates[i].cctx,
1174 1219 compressor->dict->dictData,
1175 1220 compressor->dict->dictSize,
1176 1221 ZSTD_dlm_byRef,
1177 1222 compressor->dict->dictType);
1178 1223 }
1179 1224
1180 1225 if (ZSTD_isError(zresult)) {
1181 1226 PyErr_Format(ZstdError, "could not load compression dictionary: %s",
1182 1227 ZSTD_getErrorName(zresult));
1183 1228 goto finally;
1184 1229 }
1185 1230
1186 1231 }
1187 1232
1188 1233 workerStates[i].sources = sources->sources;
1189 1234 workerStates[i].sourcesSize = sources->sourcesSize;
1190 1235 }
1191 1236
1192 1237 Py_BEGIN_ALLOW_THREADS
1193 1238 for (i = 0; i < sources->sourcesSize; i++) {
1194 1239 workerBytes += sources->sources[i].sourceSize;
1195 1240
1196 1241 /*
1197 1242 * The last worker/thread needs to handle all remaining work. Don't
1198 1243 * trigger it prematurely. Defer to the block outside of the loop
1199 1244 * to run the last worker/thread. But do still process this loop
1200 1245 * so workerBytes is correct.
1201 1246 */
1202 1247 if (currentThread == threadCount - 1) {
1203 1248 continue;
1204 1249 }
1205 1250
1206 1251 if (workerBytes >= bytesPerWorker) {
1207 1252 assert(currentThread < threadCount);
1208 1253 workerStates[currentThread].totalSourceSize = workerBytes;
1209 1254 workerStates[currentThread].startOffset = workerStartOffset;
1210 1255 workerStates[currentThread].endOffset = i;
1211 1256
1212 1257 if (threadCount > 1) {
1213 1258 POOL_add(pool, (POOL_function)compress_worker, &workerStates[currentThread]);
1214 1259 }
1215 1260 else {
1216 1261 compress_worker(&workerStates[currentThread]);
1217 1262 }
1218 1263
1219 1264 currentThread++;
1220 1265 workerStartOffset = i + 1;
1221 1266 workerBytes = 0;
1222 1267 }
1223 1268 }
1224 1269
1225 1270 if (workerBytes) {
1226 1271 assert(currentThread < threadCount);
1227 1272 workerStates[currentThread].totalSourceSize = workerBytes;
1228 1273 workerStates[currentThread].startOffset = workerStartOffset;
1229 1274 workerStates[currentThread].endOffset = sources->sourcesSize - 1;
1230 1275
1231 1276 if (threadCount > 1) {
1232 1277 POOL_add(pool, (POOL_function)compress_worker, &workerStates[currentThread]);
1233 1278 }
1234 1279 else {
1235 1280 compress_worker(&workerStates[currentThread]);
1236 1281 }
1237 1282 }
1238 1283
1239 1284 if (threadCount > 1) {
1240 1285 POOL_free(pool);
1241 1286 pool = NULL;
1242 1287 }
1243 1288
1244 1289 Py_END_ALLOW_THREADS
1245 1290
1246 1291 for (i = 0; i < threadCount; i++) {
1247 1292 switch (workerStates[i].error) {
1248 1293 case WorkerError_no_memory:
1249 1294 PyErr_NoMemory();
1250 1295 errored = 1;
1251 1296 break;
1252 1297
1253 1298 case WorkerError_zstd:
1254 1299 PyErr_Format(ZstdError, "error compressing item %zd: %s",
1255 1300 workerStates[i].errorOffset, ZSTD_getErrorName(workerStates[i].zresult));
1256 1301 errored = 1;
1257 1302 break;
1258 1303
1259 1304 case WorkerError_nospace:
1260 1305 PyErr_Format(ZstdError, "error compressing item %zd: not enough space in output",
1261 1306 workerStates[i].errorOffset);
1262 1307 errored = 1;
1263 1308 break;
1264 1309
1265 1310 default:
1266 1311 ;
1267 1312 }
1268 1313
1269 1314 if (errored) {
1270 1315 break;
1271 1316 }
1272 1317
1273 1318 }
1274 1319
1275 1320 if (errored) {
1276 1321 goto finally;
1277 1322 }
1278 1323
1279 1324 segmentsCount = 0;
1280 1325 for (i = 0; i < threadCount; i++) {
1281 1326 WorkerState* state = &workerStates[i];
1282 1327 segmentsCount += state->destCount;
1283 1328 }
1284 1329
1285 1330 segmentsArg = PyTuple_New(segmentsCount);
1286 1331 if (NULL == segmentsArg) {
1287 1332 goto finally;
1288 1333 }
1289 1334
1290 1335 segmentIndex = 0;
1291 1336
1292 1337 for (i = 0; i < threadCount; i++) {
1293 1338 Py_ssize_t j;
1294 1339 WorkerState* state = &workerStates[i];
1295 1340
1296 1341 for (j = 0; j < state->destCount; j++) {
1297 1342 DestBuffer* destBuffer = &state->destBuffers[j];
1298 1343 buffer = BufferWithSegments_FromMemory(destBuffer->dest, destBuffer->destSize,
1299 1344 destBuffer->segments, destBuffer->segmentsSize);
1300 1345
1301 1346 if (NULL == buffer) {
1302 1347 goto finally;
1303 1348 }
1304 1349
1305 1350 /* Tell instance to use free() instead of PyMem_Free(). */
1306 1351 buffer->useFree = 1;
1307 1352
1308 1353 /*
1309 1354 * BufferWithSegments_FromMemory takes ownership of the backing memory.
1310 1355 * Unset it here so it doesn't get freed below.
1311 1356 */
1312 1357 destBuffer->dest = NULL;
1313 1358 destBuffer->segments = NULL;
1314 1359
1315 1360 PyTuple_SET_ITEM(segmentsArg, segmentIndex++, (PyObject*)buffer);
1316 1361 }
1317 1362 }
1318 1363
1319 1364 result = (ZstdBufferWithSegmentsCollection*)PyObject_CallObject(
1320 1365 (PyObject*)&ZstdBufferWithSegmentsCollectionType, segmentsArg);
1321 1366
1322 1367 finally:
1323 1368 Py_CLEAR(segmentsArg);
1324 1369
1325 1370 if (pool) {
1326 1371 POOL_free(pool);
1327 1372 }
1328 1373
1329 1374 if (workerStates) {
1330 1375 Py_ssize_t j;
1331 1376
1332 1377 for (i = 0; i < threadCount; i++) {
1333 1378 WorkerState state = workerStates[i];
1334 1379
1335 1380 if (state.cctx) {
1336 1381 ZSTD_freeCCtx(state.cctx);
1337 1382 }
1338 1383
1339 1384 /* malloc() is used in worker thread. */
1340 1385
1341 1386 for (j = 0; j < state.destCount; j++) {
1342 1387 if (state.destBuffers) {
1343 1388 free(state.destBuffers[j].dest);
1344 1389 free(state.destBuffers[j].segments);
1345 1390 }
1346 1391 }
1347 1392
1348 1393
1349 1394 free(state.destBuffers);
1350 1395 }
1351 1396
1352 1397 PyMem_Free(workerStates);
1353 1398 }
1354 1399
1355 1400 return result;
1356 1401 }
1357 1402
1358 1403 PyDoc_STRVAR(ZstdCompressor_multi_compress_to_buffer__doc__,
1359 1404 "Compress multiple pieces of data as a single operation\n"
1360 1405 "\n"
1361 1406 "Receives a ``BufferWithSegmentsCollection``, a ``BufferWithSegments``, or\n"
1362 1407 "a list of bytes like objects holding data to compress.\n"
1363 1408 "\n"
1364 1409 "Returns a ``BufferWithSegmentsCollection`` holding compressed data.\n"
1365 1410 "\n"
1366 1411 "This function is optimized to perform multiple compression operations as\n"
1367 1412 "as possible with as little overhead as possbile.\n"
1368 1413 );
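A sketch of the documented call; item access assumes the ``BufferWithSegments`` indexing semantics defined elsewhere in the package (each item is one compressed frame):

    import zstandard as zstd

    cctx = zstd.ZstdCompressor()
    collection = cctx.multi_compress_to_buffer([b"foo" * 100, b"bar" * 100], threads=2)
    frames = [collection[i].tobytes() for i in range(len(collection))]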
1369 1414
1370 1415 static ZstdBufferWithSegmentsCollection* ZstdCompressor_multi_compress_to_buffer(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
1371 1416 static char* kwlist[] = {
1372 1417 "data",
1373 1418 "threads",
1374 1419 NULL
1375 1420 };
1376 1421
1377 1422 PyObject* data;
1378 1423 int threads = 0;
1379 1424 Py_buffer* dataBuffers = NULL;
1380 1425 DataSources sources;
1381 1426 Py_ssize_t i;
1382 1427 Py_ssize_t sourceCount = 0;
1383 1428 ZstdBufferWithSegmentsCollection* result = NULL;
1384 1429
1385 1430 memset(&sources, 0, sizeof(sources));
1386 1431
1387 1432 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|i:multi_compress_to_buffer", kwlist,
1388 1433 &data, &threads)) {
1389 1434 return NULL;
1390 1435 }
1391 1436
1392 1437 if (threads < 0) {
1393 1438 threads = cpu_count();
1394 1439 }
1395 1440
1396 1441 if (threads < 2) {
1397 1442 threads = 1;
1398 1443 }
1399 1444
1400 1445 if (PyObject_TypeCheck(data, &ZstdBufferWithSegmentsType)) {
1401 1446 ZstdBufferWithSegments* buffer = (ZstdBufferWithSegments*)data;
1402 1447
1403 1448 sources.sources = PyMem_Malloc(buffer->segmentCount * sizeof(DataSource));
1404 1449 if (NULL == sources.sources) {
1405 1450 PyErr_NoMemory();
1406 1451 goto finally;
1407 1452 }
1408 1453
1409 1454 for (i = 0; i < buffer->segmentCount; i++) {
1410 1455 if (buffer->segments[i].length > SIZE_MAX) {
1411 1456 PyErr_Format(PyExc_ValueError,
1412 1457 "buffer segment %zd is too large for this platform", i);
1413 1458 goto finally;
1414 1459 }
1415 1460
1416 1461 sources.sources[i].sourceData = (char*)buffer->data + buffer->segments[i].offset;
1417 1462 sources.sources[i].sourceSize = (size_t)buffer->segments[i].length;
1418 1463 sources.totalSourceSize += buffer->segments[i].length;
1419 1464 }
1420 1465
1421 1466 sources.sourcesSize = buffer->segmentCount;
1422 1467 }
1423 1468 else if (PyObject_TypeCheck(data, &ZstdBufferWithSegmentsCollectionType)) {
1424 1469 Py_ssize_t j;
1425 1470 Py_ssize_t offset = 0;
1426 1471 ZstdBufferWithSegments* buffer;
1427 1472 ZstdBufferWithSegmentsCollection* collection = (ZstdBufferWithSegmentsCollection*)data;
1428 1473
1429 1474 sourceCount = BufferWithSegmentsCollection_length(collection);
1430 1475
1431 1476 sources.sources = PyMem_Malloc(sourceCount * sizeof(DataSource));
1432 1477 if (NULL == sources.sources) {
1433 1478 PyErr_NoMemory();
1434 1479 goto finally;
1435 1480 }
1436 1481
1437 1482 for (i = 0; i < collection->bufferCount; i++) {
1438 1483 buffer = collection->buffers[i];
1439 1484
1440 1485 for (j = 0; j < buffer->segmentCount; j++) {
1441 1486 if (buffer->segments[j].length > SIZE_MAX) {
1442 1487 PyErr_Format(PyExc_ValueError,
1443 1488 "buffer segment %zd in buffer %zd is too large for this platform",
1444 1489 j, i);
1445 1490 goto finally;
1446 1491 }
1447 1492
1448 1493 sources.sources[offset].sourceData = (char*)buffer->data + buffer->segments[j].offset;
1449 1494 sources.sources[offset].sourceSize = (size_t)buffer->segments[j].length;
1450 1495 sources.totalSourceSize += buffer->segments[j].length;
1451 1496
1452 1497 offset++;
1453 1498 }
1454 1499 }
1455 1500
1456 1501 sources.sourcesSize = sourceCount;
1457 1502 }
1458 1503 else if (PyList_Check(data)) {
1459 1504 sourceCount = PyList_GET_SIZE(data);
1460 1505
1461 1506 sources.sources = PyMem_Malloc(sourceCount * sizeof(DataSource));
1462 1507 if (NULL == sources.sources) {
1463 1508 PyErr_NoMemory();
1464 1509 goto finally;
1465 1510 }
1466 1511
1467 1512 dataBuffers = PyMem_Malloc(sourceCount * sizeof(Py_buffer));
1468 1513 if (NULL == dataBuffers) {
1469 1514 PyErr_NoMemory();
1470 1515 goto finally;
1471 1516 }
1472 1517
1473 1518 memset(dataBuffers, 0, sourceCount * sizeof(Py_buffer));
1474 1519
1475 1520 for (i = 0; i < sourceCount; i++) {
1476 1521 if (0 != PyObject_GetBuffer(PyList_GET_ITEM(data, i),
1477 1522 &dataBuffers[i], PyBUF_CONTIG_RO)) {
1478 1523 PyErr_Clear();
1479 1524 PyErr_Format(PyExc_TypeError, "item %zd not a bytes like object", i);
1480 1525 goto finally;
1481 1526 }
1482 1527
1483 1528 sources.sources[i].sourceData = dataBuffers[i].buf;
1484 1529 sources.sources[i].sourceSize = dataBuffers[i].len;
1485 1530 sources.totalSourceSize += dataBuffers[i].len;
1486 1531 }
1487 1532
1488 1533 sources.sourcesSize = sourceCount;
1489 1534 }
1490 1535 else {
1491 1536 PyErr_SetString(PyExc_TypeError, "argument must be list of BufferWithSegments");
1492 1537 goto finally;
1493 1538 }
1494 1539
1495 1540 if (0 == sources.sourcesSize) {
1496 1541 PyErr_SetString(PyExc_ValueError, "no source elements found");
1497 1542 goto finally;
1498 1543 }
1499 1544
1500 1545 if (0 == sources.totalSourceSize) {
1501 1546 PyErr_SetString(PyExc_ValueError, "source elements are empty");
1502 1547 goto finally;
1503 1548 }
1504 1549
1505 1550 if (sources.totalSourceSize > SIZE_MAX) {
1506 1551 PyErr_SetString(PyExc_ValueError, "sources are too large for this platform");
1507 1552 goto finally;
1508 1553 }
1509 1554
1510 1555 result = compress_from_datasources(self, &sources, threads);
1511 1556
1512 1557 finally:
1513 1558 PyMem_Free(sources.sources);
1514 1559
1515 1560 if (dataBuffers) {
1516 1561 for (i = 0; i < sourceCount; i++) {
1517 1562 PyBuffer_Release(&dataBuffers[i]);
1518 1563 }
1519 1564
1520 1565 PyMem_Free(dataBuffers);
1521 1566 }
1522 1567
1523 1568 return result;
1524 1569 }
1525 1570
1526 1571 static PyMethodDef ZstdCompressor_methods[] = {
1572 { "chunker", (PyCFunction)ZstdCompressor_chunker,
1573 METH_VARARGS | METH_KEYWORDS, ZstdCompressor_chunker__doc__ },
1527 1574 { "compress", (PyCFunction)ZstdCompressor_compress,
1528 1575 METH_VARARGS | METH_KEYWORDS, ZstdCompressor_compress__doc__ },
1529 1576 { "compressobj", (PyCFunction)ZstdCompressor_compressobj,
1530 1577 METH_VARARGS | METH_KEYWORDS, ZstdCompressionObj__doc__ },
1531 1578 { "copy_stream", (PyCFunction)ZstdCompressor_copy_stream,
1532 1579 METH_VARARGS | METH_KEYWORDS, ZstdCompressor_copy_stream__doc__ },
1533 1580 { "stream_reader", (PyCFunction)ZstdCompressor_stream_reader,
1534 1581 METH_VARARGS | METH_KEYWORDS, ZstdCompressor_stream_reader__doc__ },
1535 1582 { "stream_writer", (PyCFunction)ZstdCompressor_stream_writer,
1536 1583 METH_VARARGS | METH_KEYWORDS, ZstdCompressor_stream_writer___doc__ },
1537 1584 { "read_to_iter", (PyCFunction)ZstdCompressor_read_to_iter,
1538 1585 METH_VARARGS | METH_KEYWORDS, ZstdCompressor_read_to_iter__doc__ },
1539 1586 /* TODO Remove deprecated API */
1540 1587 { "read_from", (PyCFunction)ZstdCompressor_read_to_iter,
1541 1588 METH_VARARGS | METH_KEYWORDS, ZstdCompressor_read_to_iter__doc__ },
1542 1589 /* TODO remove deprecated API */
1543 1590 { "write_to", (PyCFunction)ZstdCompressor_stream_writer,
1544 1591 METH_VARARGS | METH_KEYWORDS, ZstdCompressor_stream_writer___doc__ },
1545 1592 { "multi_compress_to_buffer", (PyCFunction)ZstdCompressor_multi_compress_to_buffer,
1546 1593 METH_VARARGS | METH_KEYWORDS, ZstdCompressor_multi_compress_to_buffer__doc__ },
1547 1594 { "memory_size", (PyCFunction)ZstdCompressor_memory_size,
1548 1595 METH_NOARGS, ZstdCompressor_memory_size__doc__ },
1549 1596 { "frame_progression", (PyCFunction)ZstdCompressor_frame_progression,
1550 1597 METH_NOARGS, ZstdCompressor_frame_progression__doc__ },
1551 1598 { NULL, NULL }
1552 1599 };
1553 1600
1554 1601 PyTypeObject ZstdCompressorType = {
1555 1602 PyVarObject_HEAD_INIT(NULL, 0)
1556 1603 "zstd.ZstdCompressor", /* tp_name */
1557 1604 sizeof(ZstdCompressor), /* tp_basicsize */
1558 1605 0, /* tp_itemsize */
1559 1606 (destructor)ZstdCompressor_dealloc, /* tp_dealloc */
1560 1607 0, /* tp_print */
1561 1608 0, /* tp_getattr */
1562 1609 0, /* tp_setattr */
1563 1610 0, /* tp_compare */
1564 1611 0, /* tp_repr */
1565 1612 0, /* tp_as_number */
1566 1613 0, /* tp_as_sequence */
1567 1614 0, /* tp_as_mapping */
1568 1615 0, /* tp_hash */
1569 1616 0, /* tp_call */
1570 1617 0, /* tp_str */
1571 1618 0, /* tp_getattro */
1572 1619 0, /* tp_setattro */
1573 1620 0, /* tp_as_buffer */
1574 1621 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
1575 1622 ZstdCompressor__doc__, /* tp_doc */
1576 1623 0, /* tp_traverse */
1577 1624 0, /* tp_clear */
1578 1625 0, /* tp_richcompare */
1579 1626 0, /* tp_weaklistoffset */
1580 1627 0, /* tp_iter */
1581 1628 0, /* tp_iternext */
1582 1629 ZstdCompressor_methods, /* tp_methods */
1583 1630 0, /* tp_members */
1584 1631 0, /* tp_getset */
1585 1632 0, /* tp_base */
1586 1633 0, /* tp_dict */
1587 1634 0, /* tp_descr_get */
1588 1635 0, /* tp_descr_set */
1589 1636 0, /* tp_dictoffset */
1590 1637 (initproc)ZstdCompressor_init, /* tp_init */
1591 1638 0, /* tp_alloc */
1592 1639 PyType_GenericNew, /* tp_new */
1593 1640 };
1594 1641
1595 1642 void compressor_module_init(PyObject* mod) {
1596 1643 Py_TYPE(&ZstdCompressorType) = &PyType_Type;
1597 1644 if (PyType_Ready(&ZstdCompressorType) < 0) {
1598 1645 return;
1599 1646 }
1600 1647
1601 1648 Py_INCREF((PyObject*)&ZstdCompressorType);
1602 1649 PyModule_AddObject(mod, "ZstdCompressor",
1603 1650 (PyObject*)&ZstdCompressorType);
1604 1651 }
@@ -1,102 +1,103 b''
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 static char frame_header[] = {
14 14 '\x28',
15 15 '\xb5',
16 16 '\x2f',
17 17 '\xfd',
18 18 };
19 19
20 20 void constants_module_init(PyObject* mod) {
21 21 PyObject* version;
22 22 PyObject* zstdVersion;
23 23 PyObject* frameHeader;
24 24
25 25 #if PY_MAJOR_VERSION >= 3
26 26 version = PyUnicode_FromString(PYTHON_ZSTANDARD_VERSION);
27 27 #else
28 28 version = PyString_FromString(PYTHON_ZSTANDARD_VERSION);
29 29 #endif
30 Py_INCREF(version);
31 30 PyModule_AddObject(mod, "__version__", version);
32 31
33 32 ZstdError = PyErr_NewException("zstd.ZstdError", NULL, NULL);
34 33 PyModule_AddObject(mod, "ZstdError", ZstdError);
35 34
36 35 PyModule_AddIntConstant(mod, "COMPRESSOBJ_FLUSH_FINISH", compressorobj_flush_finish);
37 36 PyModule_AddIntConstant(mod, "COMPRESSOBJ_FLUSH_BLOCK", compressorobj_flush_block);
38 37
39 38 /* For now, the version is a simple tuple instead of a dedicated type. */
40 39 zstdVersion = PyTuple_New(3);
41 40 PyTuple_SetItem(zstdVersion, 0, PyLong_FromLong(ZSTD_VERSION_MAJOR));
42 41 PyTuple_SetItem(zstdVersion, 1, PyLong_FromLong(ZSTD_VERSION_MINOR));
43 42 PyTuple_SetItem(zstdVersion, 2, PyLong_FromLong(ZSTD_VERSION_RELEASE));
44 Py_INCREF(zstdVersion);
45 43 PyModule_AddObject(mod, "ZSTD_VERSION", zstdVersion);
46 44
47 45 frameHeader = PyBytes_FromStringAndSize(frame_header, sizeof(frame_header));
48 46 if (frameHeader) {
49 47 PyModule_AddObject(mod, "FRAME_HEADER", frameHeader);
50 48 }
51 49 else {
52 50 PyErr_Format(PyExc_ValueError, "could not create frame header object");
53 51 }
54 52
55 53 PyModule_AddObject(mod, "CONTENTSIZE_UNKNOWN",
56 54 PyLong_FromUnsignedLongLong(ZSTD_CONTENTSIZE_UNKNOWN));
57 55 PyModule_AddObject(mod, "CONTENTSIZE_ERROR",
58 56 PyLong_FromUnsignedLongLong(ZSTD_CONTENTSIZE_ERROR));
59 57
60 58 PyModule_AddIntConstant(mod, "MAX_COMPRESSION_LEVEL", ZSTD_maxCLevel());
61 59 PyModule_AddIntConstant(mod, "COMPRESSION_RECOMMENDED_INPUT_SIZE",
62 60 (long)ZSTD_CStreamInSize());
63 61 PyModule_AddIntConstant(mod, "COMPRESSION_RECOMMENDED_OUTPUT_SIZE",
64 62 (long)ZSTD_CStreamOutSize());
65 63 PyModule_AddIntConstant(mod, "DECOMPRESSION_RECOMMENDED_INPUT_SIZE",
66 64 (long)ZSTD_DStreamInSize());
67 65 PyModule_AddIntConstant(mod, "DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE",
68 66 (long)ZSTD_DStreamOutSize());
69 67
70 68 PyModule_AddIntConstant(mod, "MAGIC_NUMBER", ZSTD_MAGICNUMBER);
69 PyModule_AddIntConstant(mod, "BLOCKSIZELOG_MAX", ZSTD_BLOCKSIZELOG_MAX);
70 PyModule_AddIntConstant(mod, "BLOCKSIZE_MAX", ZSTD_BLOCKSIZE_MAX);
71 71 PyModule_AddIntConstant(mod, "WINDOWLOG_MIN", ZSTD_WINDOWLOG_MIN);
72 72 PyModule_AddIntConstant(mod, "WINDOWLOG_MAX", ZSTD_WINDOWLOG_MAX);
73 73 PyModule_AddIntConstant(mod, "CHAINLOG_MIN", ZSTD_CHAINLOG_MIN);
74 74 PyModule_AddIntConstant(mod, "CHAINLOG_MAX", ZSTD_CHAINLOG_MAX);
75 75 PyModule_AddIntConstant(mod, "HASHLOG_MIN", ZSTD_HASHLOG_MIN);
76 76 PyModule_AddIntConstant(mod, "HASHLOG_MAX", ZSTD_HASHLOG_MAX);
77 77 PyModule_AddIntConstant(mod, "HASHLOG3_MAX", ZSTD_HASHLOG3_MAX);
78 78 PyModule_AddIntConstant(mod, "SEARCHLOG_MIN", ZSTD_SEARCHLOG_MIN);
79 79 PyModule_AddIntConstant(mod, "SEARCHLOG_MAX", ZSTD_SEARCHLOG_MAX);
80 80 PyModule_AddIntConstant(mod, "SEARCHLENGTH_MIN", ZSTD_SEARCHLENGTH_MIN);
81 81 PyModule_AddIntConstant(mod, "SEARCHLENGTH_MAX", ZSTD_SEARCHLENGTH_MAX);
82 82 PyModule_AddIntConstant(mod, "TARGETLENGTH_MIN", ZSTD_TARGETLENGTH_MIN);
83 PyModule_AddIntConstant(mod, "TARGETLENGTH_MAX", ZSTD_TARGETLENGTH_MAX);
83 84 PyModule_AddIntConstant(mod, "LDM_MINMATCH_MIN", ZSTD_LDM_MINMATCH_MIN);
84 85 PyModule_AddIntConstant(mod, "LDM_MINMATCH_MAX", ZSTD_LDM_MINMATCH_MAX);
85 86 PyModule_AddIntConstant(mod, "LDM_BUCKETSIZELOG_MAX", ZSTD_LDM_BUCKETSIZELOG_MAX);
86 87
87 88 PyModule_AddIntConstant(mod, "STRATEGY_FAST", ZSTD_fast);
88 89 PyModule_AddIntConstant(mod, "STRATEGY_DFAST", ZSTD_dfast);
89 90 PyModule_AddIntConstant(mod, "STRATEGY_GREEDY", ZSTD_greedy);
90 91 PyModule_AddIntConstant(mod, "STRATEGY_LAZY", ZSTD_lazy);
91 92 PyModule_AddIntConstant(mod, "STRATEGY_LAZY2", ZSTD_lazy2);
92 93 PyModule_AddIntConstant(mod, "STRATEGY_BTLAZY2", ZSTD_btlazy2);
93 94 PyModule_AddIntConstant(mod, "STRATEGY_BTOPT", ZSTD_btopt);
94 95 PyModule_AddIntConstant(mod, "STRATEGY_BTULTRA", ZSTD_btultra);
95 96
96 97 PyModule_AddIntConstant(mod, "DICT_TYPE_AUTO", ZSTD_dct_auto);
97 98 PyModule_AddIntConstant(mod, "DICT_TYPE_RAWCONTENT", ZSTD_dct_rawContent);
98 99 PyModule_AddIntConstant(mod, "DICT_TYPE_FULLDICT", ZSTD_dct_fullDict);
99 100
100 101 PyModule_AddIntConstant(mod, "FORMAT_ZSTD1", ZSTD_f_zstd1);
101 102 PyModule_AddIntConstant(mod, "FORMAT_ZSTD1_MAGICLESS", ZSTD_f_zstd1_magicless);
102 103 }
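From Python, the values registered here surface as module-level attributes, for example (illustrative):

    import zstandard as zstd

    assert zstd.FRAME_HEADER == b"\x28\xb5\x2f\xfd"
    max_level = zstd.MAX_COMPRESSION_LEVEL
    in_size = zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE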
@@ -1,459 +1,440 b''
1 1 /**
2 2 * Copyright (c) 2017-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 static void set_unsupported_operation(void) {
14 14 PyObject* iomod;
15 15 PyObject* exc;
16 16
17 17 iomod = PyImport_ImportModule("io");
18 18 if (NULL == iomod) {
19 19 return;
20 20 }
21 21
22 22 exc = PyObject_GetAttrString(iomod, "UnsupportedOperation");
23 23 if (NULL == exc) {
24 24 Py_DECREF(iomod);
25 25 return;
26 26 }
27 27
28 28 PyErr_SetNone(exc);
29 29 Py_DECREF(exc);
30 30 Py_DECREF(iomod);
31 31 }
32 32
33 33 static void reader_dealloc(ZstdDecompressionReader* self) {
34 34 Py_XDECREF(self->decompressor);
35 35 Py_XDECREF(self->reader);
36 36
37 37 if (self->buffer.buf) {
38 38 PyBuffer_Release(&self->buffer);
39 39 }
40 40
41 41 PyObject_Del(self);
42 42 }
43 43
44 44 static ZstdDecompressionReader* reader_enter(ZstdDecompressionReader* self) {
45 45 if (self->entered) {
46 46 PyErr_SetString(PyExc_ValueError, "cannot __enter__ multiple times");
47 47 return NULL;
48 48 }
49 49
50 if (ensure_dctx(self->decompressor, 1)) {
51 return NULL;
52 }
53
54 50 self->entered = 1;
55 51
56 52 Py_INCREF(self);
57 53 return self;
58 54 }
59 55
60 56 static PyObject* reader_exit(ZstdDecompressionReader* self, PyObject* args) {
61 57 PyObject* exc_type;
62 58 PyObject* exc_value;
63 59 PyObject* exc_tb;
64 60
65 61 if (!PyArg_ParseTuple(args, "OOO:__exit__", &exc_type, &exc_value, &exc_tb)) {
66 62 return NULL;
67 63 }
68 64
69 65 self->entered = 0;
70 66 self->closed = 1;
71 67
72 68 /* Release resources. */
73 69 Py_CLEAR(self->reader);
74 70 if (self->buffer.buf) {
75 71 PyBuffer_Release(&self->buffer);
76 72 memset(&self->buffer, 0, sizeof(self->buffer));
77 73 }
78 74
79 75 Py_CLEAR(self->decompressor);
80 76
81 77 Py_RETURN_FALSE;
82 78 }
83 79
84 80 static PyObject* reader_readable(PyObject* self) {
85 81 Py_RETURN_TRUE;
86 82 }
87 83
88 84 static PyObject* reader_writable(PyObject* self) {
89 85 Py_RETURN_FALSE;
90 86 }
91 87
92 88 static PyObject* reader_seekable(PyObject* self) {
93 89 Py_RETURN_TRUE;
94 90 }
95 91
96 92 static PyObject* reader_close(ZstdDecompressionReader* self) {
97 93 self->closed = 1;
98 94 Py_RETURN_NONE;
99 95 }
100 96
101 static PyObject* reader_closed(ZstdDecompressionReader* self) {
102 if (self->closed) {
103 Py_RETURN_TRUE;
104 }
105 else {
106 Py_RETURN_FALSE;
107 }
108 }
109
110 97 static PyObject* reader_flush(PyObject* self) {
111 98 Py_RETURN_NONE;
112 99 }
113 100
114 101 static PyObject* reader_isatty(PyObject* self) {
115 102 Py_RETURN_FALSE;
116 103 }
117 104
118 105 static PyObject* reader_read(ZstdDecompressionReader* self, PyObject* args, PyObject* kwargs) {
119 106 static char* kwlist[] = {
120 107 "size",
121 108 NULL
122 109 };
123 110
124 111 Py_ssize_t size = -1;
125 112 PyObject* result = NULL;
126 113 char* resultBuffer;
127 114 Py_ssize_t resultSize;
128 115 ZSTD_outBuffer output;
129 116 size_t zresult;
130 117
131 if (!self->entered) {
132 PyErr_SetString(ZstdError, "read() must be called from an active context manager");
133 return NULL;
134 }
135
136 118 if (self->closed) {
137 119 PyErr_SetString(PyExc_ValueError, "stream is closed");
138 120 return NULL;
139 121 }
140 122
141 123 if (self->finishedOutput) {
142 124 return PyBytes_FromStringAndSize("", 0);
143 125 }
144 126
145 127 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n", kwlist, &size)) {
146 128 return NULL;
147 129 }
148 130
149 131 if (size < 1) {
150 132 PyErr_SetString(PyExc_ValueError, "cannot read negative or size 0 amounts");
151 133 return NULL;
152 134 }
153 135
154 136 result = PyBytes_FromStringAndSize(NULL, size);
155 137 if (NULL == result) {
156 138 return NULL;
157 139 }
158 140
159 141 PyBytes_AsStringAndSize(result, &resultBuffer, &resultSize);
160 142
161 143 output.dst = resultBuffer;
162 144 output.size = resultSize;
163 145 output.pos = 0;
164 146
165 147 readinput:
166 148
167 149 /* Consume input data left over from last time. */
168 150 if (self->input.pos < self->input.size) {
169 151 Py_BEGIN_ALLOW_THREADS
170 152 zresult = ZSTD_decompress_generic(self->decompressor->dctx,
171 153 &output, &self->input);
172 154 Py_END_ALLOW_THREADS
173 155
174 156 /* Input exhausted. Clear our state tracking. */
175 157 if (self->input.pos == self->input.size) {
176 158 memset(&self->input, 0, sizeof(self->input));
177 159 Py_CLEAR(self->readResult);
178 160
179 161 if (self->buffer.buf) {
180 162 self->finishedInput = 1;
181 163 }
182 164 }
183 165
184 166 if (ZSTD_isError(zresult)) {
185 167 PyErr_Format(ZstdError, "zstd decompress error: %s", ZSTD_getErrorName(zresult));
186 168 return NULL;
187 169 }
188 170 else if (0 == zresult) {
189 171 self->finishedOutput = 1;
190 172 }
191 173
192 174 /* We fulfilled the full read request. Emit it. */
193 175 if (output.pos && output.pos == output.size) {
194 176 self->bytesDecompressed += output.size;
195 177 return result;
196 178 }
197 179
198 180 /*
199 181 * There is more room in the output. Fall through to try to collect
200 182 * more data so we can try to fill the output.
201 183 */
202 184 }
203 185
204 186 if (!self->finishedInput) {
205 187 if (self->reader) {
206 188 Py_buffer buffer;
207 189
208 190 assert(self->readResult == NULL);
209 191 self->readResult = PyObject_CallMethod(self->reader, "read",
210 192 "k", self->readSize);
211 193 if (NULL == self->readResult) {
212 194 return NULL;
213 195 }
214 196
215 197 memset(&buffer, 0, sizeof(buffer));
216 198
217 199 if (0 != PyObject_GetBuffer(self->readResult, &buffer, PyBUF_CONTIG_RO)) {
218 200 return NULL;
219 201 }
220 202
221 203 /* EOF */
222 204 if (0 == buffer.len) {
223 205 self->finishedInput = 1;
224 206 Py_CLEAR(self->readResult);
225 207 }
226 208 else {
227 209 self->input.src = buffer.buf;
228 210 self->input.size = buffer.len;
229 211 self->input.pos = 0;
230 212 }
231 213
232 214 PyBuffer_Release(&buffer);
233 215 }
234 216 else {
235 217 assert(self->buffer.buf);
236 218 /*
236 218 * We should only get here once since the above block will exhaust
237 219 * the source buffer until finishedInput is set.
239 221 */
240 222 assert(self->input.src == NULL);
241 223
242 224 self->input.src = self->buffer.buf;
243 225 self->input.size = self->buffer.len;
244 226 self->input.pos = 0;
245 227 }
246 228 }
247 229
248 230 if (self->input.size) {
249 231 goto readinput;
250 232 }
251 233
252 234 /* EOF */
253 235 self->bytesDecompressed += output.pos;
254 236
255 237 if (safe_pybytes_resize(&result, output.pos)) {
256 238 Py_XDECREF(result);
257 239 return NULL;
258 240 }
259 241
260 242 return result;
261 243 }
262 244
263 245 static PyObject* reader_readall(PyObject* self) {
264 246 PyErr_SetNone(PyExc_NotImplementedError);
265 247 return NULL;
266 248 }
267 249
268 250 static PyObject* reader_readline(PyObject* self) {
269 251 PyErr_SetNone(PyExc_NotImplementedError);
270 252 return NULL;
271 253 }
272 254
273 255 static PyObject* reader_readlines(PyObject* self) {
274 256 PyErr_SetNone(PyExc_NotImplementedError);
275 257 return NULL;
276 258 }
277 259
278 260 static PyObject* reader_seek(ZstdDecompressionReader* self, PyObject* args) {
279 261 Py_ssize_t pos;
280 262 int whence = 0;
281 263 unsigned long long readAmount = 0;
282 264 size_t defaultOutSize = ZSTD_DStreamOutSize();
283 265
284 if (!self->entered) {
285 PyErr_SetString(ZstdError, "seek() must be called from an active context manager");
286 return NULL;
287 }
288
289 266 if (self->closed) {
290 267 PyErr_SetString(PyExc_ValueError, "stream is closed");
291 268 return NULL;
292 269 }
293 270
294 271 if (!PyArg_ParseTuple(args, "n|i:seek", &pos, &whence)) {
295 272 return NULL;
296 273 }
297 274
298 275 if (whence == SEEK_SET) {
299 276 if (pos < 0) {
300 277 PyErr_SetString(PyExc_ValueError,
301 278 "cannot seek to negative position with SEEK_SET");
302 279 return NULL;
303 280 }
304 281
305 282 if ((unsigned long long)pos < self->bytesDecompressed) {
306 283 PyErr_SetString(PyExc_ValueError,
307 284 "cannot seek zstd decompression stream backwards");
308 285 return NULL;
309 286 }
310 287
311 288 readAmount = pos - self->bytesDecompressed;
312 289 }
313 290 else if (whence == SEEK_CUR) {
314 291 if (pos < 0) {
315 292 PyErr_SetString(PyExc_ValueError,
316 293 "cannot seek zstd decompression stream backwards");
317 294 return NULL;
318 295 }
319 296
320 297 readAmount = pos;
321 298 }
322 299 else if (whence == SEEK_END) {
323 300 /* We /could/ support this with pos==0. But let's not do that until someone
324 301 needs it. */
325 302 PyErr_SetString(PyExc_ValueError,
326 303 "zstd decompression streams cannot be seeked with SEEK_END");
327 304 return NULL;
328 305 }
329 306
330 307 /* It is a bit inefficient to do this via the Python API. But since there
331 308 is a bit of state tracking involved to read from this type, it is the
332 309 easiest to implement. */
333 310 while (readAmount) {
334 311 Py_ssize_t readSize;
335 312 PyObject* readResult = PyObject_CallMethod((PyObject*)self, "read", "K",
336 313 readAmount < defaultOutSize ? readAmount : defaultOutSize);
337 314
338 315 if (!readResult) {
339 316 return NULL;
340 317 }
341 318
342 319 readSize = PyBytes_GET_SIZE(readResult);
343 320
344 321 /* Empty read means EOF. */
345 322 if (!readSize) {
346 323 break;
347 324 }
348 325
349 326 readAmount -= readSize;
350 327 }
351 328
352 329 return PyLong_FromUnsignedLongLong(self->bytesDecompressed);
353 330 }
354 331
355 332 static PyObject* reader_tell(ZstdDecompressionReader* self) {
356 333 /* TODO should this raise OSError since stream isn't seekable? */
357 334 return PyLong_FromUnsignedLongLong(self->bytesDecompressed);
358 335 }
359 336
360 337 static PyObject* reader_write(PyObject* self, PyObject* args) {
361 338 set_unsupported_operation();
362 339 return NULL;
363 340 }
364 341
365 342 static PyObject* reader_writelines(PyObject* self, PyObject* args) {
366 343 set_unsupported_operation();
367 344 return NULL;
368 345 }
369 346
370 347 static PyObject* reader_iter(PyObject* self) {
371 348 PyErr_SetNone(PyExc_NotImplementedError);
372 349 return NULL;
373 350 }
374 351
375 352 static PyObject* reader_iternext(PyObject* self) {
376 353 PyErr_SetNone(PyExc_NotImplementedError);
377 354 return NULL;
378 355 }
379 356
380 357 static PyMethodDef reader_methods[] = {
381 358 { "__enter__", (PyCFunction)reader_enter, METH_NOARGS,
382 359 PyDoc_STR("Enter a compression context") },
383 360 { "__exit__", (PyCFunction)reader_exit, METH_VARARGS,
384 361 PyDoc_STR("Exit a compression context") },
385 362 { "close", (PyCFunction)reader_close, METH_NOARGS,
386 363 PyDoc_STR("Close the stream so it cannot perform any more operations") },
387 { "closed", (PyCFunction)reader_closed, METH_NOARGS,
388 PyDoc_STR("Whether stream is closed") },
389 364 { "flush", (PyCFunction)reader_flush, METH_NOARGS, PyDoc_STR("no-ops") },
390 365 { "isatty", (PyCFunction)reader_isatty, METH_NOARGS, PyDoc_STR("Returns False") },
391 366 { "readable", (PyCFunction)reader_readable, METH_NOARGS,
392 367 PyDoc_STR("Returns True") },
393 368 { "read", (PyCFunction)reader_read, METH_VARARGS | METH_KEYWORDS,
394 369 PyDoc_STR("read compressed data") },
395 370 { "readall", (PyCFunction)reader_readall, METH_NOARGS, PyDoc_STR("Not implemented") },
396 371 { "readline", (PyCFunction)reader_readline, METH_NOARGS, PyDoc_STR("Not implemented") },
397 372 { "readlines", (PyCFunction)reader_readlines, METH_NOARGS, PyDoc_STR("Not implemented") },
398 373 { "seek", (PyCFunction)reader_seek, METH_VARARGS, PyDoc_STR("Seek the stream") },
399 374 { "seekable", (PyCFunction)reader_seekable, METH_NOARGS,
400 375 PyDoc_STR("Returns True") },
401 376 { "tell", (PyCFunction)reader_tell, METH_NOARGS,
402 377 PyDoc_STR("Returns current number of bytes compressed") },
403 378 { "writable", (PyCFunction)reader_writable, METH_NOARGS,
404 379 PyDoc_STR("Returns False") },
405 380 { "write", (PyCFunction)reader_write, METH_VARARGS, PyDoc_STR("unsupported operation") },
406 381 { "writelines", (PyCFunction)reader_writelines, METH_VARARGS, PyDoc_STR("unsupported operation") },
407 382 { NULL, NULL }
408 383 };
409 384
385 static PyMemberDef reader_members[] = {
386 { "closed", T_BOOL, offsetof(ZstdDecompressionReader, closed),
387 READONLY, "whether stream is closed" },
388 { NULL }
389 };
390
410 391 PyTypeObject ZstdDecompressionReaderType = {
411 392 PyVarObject_HEAD_INIT(NULL, 0)
412 393 "zstd.ZstdDecompressionReader", /* tp_name */
413 394 sizeof(ZstdDecompressionReader), /* tp_basicsize */
414 395 0, /* tp_itemsize */
415 396 (destructor)reader_dealloc, /* tp_dealloc */
416 397 0, /* tp_print */
417 398 0, /* tp_getattr */
418 399 0, /* tp_setattr */
419 400 0, /* tp_compare */
420 401 0, /* tp_repr */
421 402 0, /* tp_as_number */
422 403 0, /* tp_as_sequence */
423 404 0, /* tp_as_mapping */
424 405 0, /* tp_hash */
425 406 0, /* tp_call */
426 407 0, /* tp_str */
427 408 0, /* tp_getattro */
428 409 0, /* tp_setattro */
429 410 0, /* tp_as_buffer */
430 411 Py_TPFLAGS_DEFAULT, /* tp_flags */
431 412 0, /* tp_doc */
432 413 0, /* tp_traverse */
433 414 0, /* tp_clear */
434 415 0, /* tp_richcompare */
435 416 0, /* tp_weaklistoffset */
436 417 reader_iter, /* tp_iter */
437 418 reader_iternext, /* tp_iternext */
438 419 reader_methods, /* tp_methods */
439 0, /* tp_members */
420 reader_members, /* tp_members */
440 421 0, /* tp_getset */
441 422 0, /* tp_base */
442 423 0, /* tp_dict */
443 424 0, /* tp_descr_get */
444 425 0, /* tp_descr_set */
445 426 0, /* tp_dictoffset */
446 427 0, /* tp_init */
447 428 0, /* tp_alloc */
448 429 PyType_GenericNew, /* tp_new */
449 430 };
450 431
451 432
452 433 void decompressionreader_module_init(PyObject* mod) {
453 434 /* TODO make reader a sub-class of io.RawIOBase */
454 435
455 436 Py_TYPE(&ZstdDecompressionReaderType) = &PyType_Type;
456 437 if (PyType_Ready(&ZstdDecompressionReaderType) < 0) {
457 438 return;
458 439 }
459 440 }
@@ -1,174 +1,185 b''
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 PyDoc_STRVAR(DecompressionObj__doc__,
14 14 "Perform decompression using a standard library compatible API.\n"
15 15 );
16 16
17 17 static void DecompressionObj_dealloc(ZstdDecompressionObj* self) {
18 18 Py_XDECREF(self->decompressor);
19 19
20 20 PyObject_Del(self);
21 21 }
22 22
23 23 static PyObject* DecompressionObj_decompress(ZstdDecompressionObj* self, PyObject* args, PyObject* kwargs) {
24 24 static char* kwlist[] = {
25 25 "data",
26 26 NULL
27 27 };
28 28
29 29 Py_buffer source;
30 30 size_t zresult;
31 31 ZSTD_inBuffer input;
32 32 ZSTD_outBuffer output;
33 33 PyObject* result = NULL;
34 34 Py_ssize_t resultSize = 0;
35 35
36 output.dst = NULL;
37
36 38 if (self->finished) {
37 39 PyErr_SetString(ZstdError, "cannot use a decompressobj multiple times");
38 40 return NULL;
39 41 }
40 42
41 43 #if PY_MAJOR_VERSION >= 3
42 44 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:decompress",
43 45 #else
44 46 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:decompress",
45 47 #endif
46 48 kwlist, &source)) {
47 49 return NULL;
48 50 }
49 51
50 52 if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) {
51 53 PyErr_SetString(PyExc_ValueError,
52 54 "data buffer should be contiguous and have at most one dimension");
53 55 goto finally;
54 56 }
55 57
58 /* Special case of empty input. Output will always be empty. */
59 if (source.len == 0) {
60 result = PyBytes_FromString("");
61 goto finally;
62 }
63
56 64 input.src = source.buf;
57 65 input.size = source.len;
58 66 input.pos = 0;
59 67
60 68 output.dst = PyMem_Malloc(self->outSize);
61 69 if (!output.dst) {
62 70 PyErr_NoMemory();
63 71 goto except;
64 72 }
65 73 output.size = self->outSize;
66 74 output.pos = 0;
67 75
68 /* Read input until exhausted. */
69 while (input.pos < input.size) {
76 while (1) {
70 77 Py_BEGIN_ALLOW_THREADS
71 78 zresult = ZSTD_decompress_generic(self->decompressor->dctx, &output, &input);
72 79 Py_END_ALLOW_THREADS
73 80
74 81 if (ZSTD_isError(zresult)) {
75 82 PyErr_Format(ZstdError, "zstd decompressor error: %s",
76 83 ZSTD_getErrorName(zresult));
77 84 goto except;
78 85 }
79 86
80 87 if (0 == zresult) {
81 88 self->finished = 1;
82 89 }
83 90
84 91 if (output.pos) {
85 92 if (result) {
86 93 resultSize = PyBytes_GET_SIZE(result);
87 94 if (-1 == safe_pybytes_resize(&result, resultSize + output.pos)) {
88 95 Py_XDECREF(result);
89 96 goto except;
90 97 }
91 98
92 99 memcpy(PyBytes_AS_STRING(result) + resultSize,
93 100 output.dst, output.pos);
94 101 }
95 102 else {
96 103 result = PyBytes_FromStringAndSize(output.dst, output.pos);
97 104 if (!result) {
98 105 goto except;
99 106 }
100 107 }
108 }
101 109
102 output.pos = 0;
110 if (zresult == 0 || (input.pos == input.size && output.pos == 0)) {
111 break;
103 112 }
113
114 output.pos = 0;
104 115 }
105 116
106 117 if (!result) {
107 118 result = PyBytes_FromString("");
108 119 }
109 120
110 121 goto finally;
111 122
112 123 except:
113 124 Py_CLEAR(result);
114 125
115 126 finally:
116 127 PyMem_Free(output.dst);
117 128 PyBuffer_Release(&source);
118 129
119 130 return result;
120 131 }
121 132
122 133 static PyMethodDef DecompressionObj_methods[] = {
123 134 { "decompress", (PyCFunction)DecompressionObj_decompress,
124 135 METH_VARARGS | METH_KEYWORDS, PyDoc_STR("decompress data") },
125 136 { NULL, NULL }
126 137 };
127 138
128 139 PyTypeObject ZstdDecompressionObjType = {
129 140 PyVarObject_HEAD_INIT(NULL, 0)
130 141 "zstd.ZstdDecompressionObj", /* tp_name */
131 142 sizeof(ZstdDecompressionObj), /* tp_basicsize */
132 143 0, /* tp_itemsize */
133 144 (destructor)DecompressionObj_dealloc, /* tp_dealloc */
134 145 0, /* tp_print */
135 146 0, /* tp_getattr */
136 147 0, /* tp_setattr */
137 148 0, /* tp_compare */
138 149 0, /* tp_repr */
139 150 0, /* tp_as_number */
140 151 0, /* tp_as_sequence */
141 152 0, /* tp_as_mapping */
142 153 0, /* tp_hash */
143 154 0, /* tp_call */
144 155 0, /* tp_str */
145 156 0, /* tp_getattro */
146 157 0, /* tp_setattro */
147 158 0, /* tp_as_buffer */
148 159 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
149 160 DecompressionObj__doc__, /* tp_doc */
150 161 0, /* tp_traverse */
151 162 0, /* tp_clear */
152 163 0, /* tp_richcompare */
153 164 0, /* tp_weaklistoffset */
154 165 0, /* tp_iter */
155 166 0, /* tp_iternext */
156 167 DecompressionObj_methods, /* tp_methods */
157 168 0, /* tp_members */
158 169 0, /* tp_getset */
159 170 0, /* tp_base */
160 171 0, /* tp_dict */
161 172 0, /* tp_descr_get */
162 173 0, /* tp_descr_set */
163 174 0, /* tp_dictoffset */
164 175 0, /* tp_init */
165 176 0, /* tp_alloc */
166 177 PyType_GenericNew, /* tp_new */
167 178 };
168 179
169 180 void decompressobj_module_init(PyObject* module) {
170 181 Py_TYPE(&ZstdDecompressionObjType) = &PyType_Type;
171 182 if (PyType_Ready(&ZstdDecompressionObjType) < 0) {
172 183 return;
173 184 }
174 185 }
@@ -1,1803 +1,1807 b''
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10 #include "pool.h"
11 11
12 12 extern PyObject* ZstdError;
13 13
14 14 /**
15 15 * Ensure the ZSTD_DCtx on a decompressor is initiated and ready for a new operation.
16 16 */
17 17 int ensure_dctx(ZstdDecompressor* decompressor, int loadDict) {
18 18 size_t zresult;
19 19
20 20 ZSTD_DCtx_reset(decompressor->dctx);
21 21
22 22 if (decompressor->maxWindowSize) {
23 23 zresult = ZSTD_DCtx_setMaxWindowSize(decompressor->dctx, decompressor->maxWindowSize);
24 24 if (ZSTD_isError(zresult)) {
25 25 PyErr_Format(ZstdError, "unable to set max window size: %s",
26 26 ZSTD_getErrorName(zresult));
27 27 return 1;
28 28 }
29 29 }
30 30
31 31 zresult = ZSTD_DCtx_setFormat(decompressor->dctx, decompressor->format);
32 32 if (ZSTD_isError(zresult)) {
33 33 PyErr_Format(ZstdError, "unable to set decoding format: %s",
34 34 ZSTD_getErrorName(zresult));
35 35 return 1;
36 36 }
37 37
38 38 if (loadDict && decompressor->dict) {
39 39 if (ensure_ddict(decompressor->dict)) {
40 40 return 1;
41 41 }
42 42
43 43 zresult = ZSTD_DCtx_refDDict(decompressor->dctx, decompressor->dict->ddict);
44 44 if (ZSTD_isError(zresult)) {
45 45 PyErr_Format(ZstdError, "unable to reference prepared dictionary: %s",
46 46 ZSTD_getErrorName(zresult));
47 47 return 1;
48 48 }
49 49 }
50 50
51 51 return 0;
52 52 }
53 53
54 54 PyDoc_STRVAR(Decompressor__doc__,
55 55 "ZstdDecompressor(dict_data=None)\n"
56 56 "\n"
57 57 "Create an object used to perform Zstandard decompression.\n"
58 58 "\n"
59 59 "An instance can perform multiple decompression operations."
60 60 );
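/* Illustrative Python usage (editor's sketch, not part of the vendored file;
 * assumes the module is imported as ``zstd`` and ``frame`` holds a complete
 * zstd frame):
 *
 *     dctx = zstd.ZstdDecompressor()
 *     data = dctx.decompress(frame)
 *     # The same instance can then be reused, e.g. via dctx.stream_reader()
 *     # or dctx.decompressobj().
 */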
61 61
62 62 static int Decompressor_init(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
63 63 static char* kwlist[] = {
64 64 "dict_data",
65 65 "max_window_size",
66 66 "format",
67 67 NULL
68 68 };
69 69
70 70 ZstdCompressionDict* dict = NULL;
71 71 size_t maxWindowSize = 0;
72 72 ZSTD_format_e format = ZSTD_f_zstd1;
73 73
74 74 self->dctx = NULL;
75 75 self->dict = NULL;
76 76
77 77 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!II:ZstdDecompressor", kwlist,
78 78 &ZstdCompressionDictType, &dict, &maxWindowSize, &format)) {
79 79 return -1;
80 80 }
81 81
82 82 self->dctx = ZSTD_createDCtx();
83 83 if (!self->dctx) {
84 84 PyErr_NoMemory();
85 85 goto except;
86 86 }
87 87
88 88 self->maxWindowSize = maxWindowSize;
89 89 self->format = format;
90 90
91 91 if (dict) {
92 92 self->dict = dict;
93 93 Py_INCREF(dict);
94 94 }
95 95
96 96 if (ensure_dctx(self, 1)) {
97 97 goto except;
98 98 }
99 99
100 100 return 0;
101 101
102 102 except:
103 103 Py_CLEAR(self->dict);
104 104
105 105 if (self->dctx) {
106 106 ZSTD_freeDCtx(self->dctx);
107 107 self->dctx = NULL;
108 108 }
109 109
110 110 return -1;
111 111 }
112 112
113 113 static void Decompressor_dealloc(ZstdDecompressor* self) {
114 114 Py_CLEAR(self->dict);
115 115
116 116 if (self->dctx) {
117 117 ZSTD_freeDCtx(self->dctx);
118 118 self->dctx = NULL;
119 119 }
120 120
121 121 PyObject_Del(self);
122 122 }
123 123
124 124 PyDoc_STRVAR(Decompressor_memory_size__doc__,
125 125 "memory_size() -- Size of decompression context, in bytes\n"
126 126 );
127 127
128 128 static PyObject* Decompressor_memory_size(ZstdDecompressor* self) {
129 129 if (self->dctx) {
130 130 return PyLong_FromSize_t(ZSTD_sizeof_DCtx(self->dctx));
131 131 }
132 132 else {
133 133 PyErr_SetString(ZstdError, "no decompressor context found; this should never happen");
134 134 return NULL;
135 135 }
136 136 }
137 137
138 138 PyDoc_STRVAR(Decompressor_copy_stream__doc__,
139 139 "copy_stream(ifh, ofh[, read_size=default, write_size=default]) -- decompress data between streams\n"
140 140 "\n"
141 141 "Compressed data will be read from ``ifh``, decompressed, and written to\n"
142 142 "``ofh``. ``ifh`` must have a ``read(size)`` method. ``ofh`` must have a\n"
143 143 "``write(data)`` method.\n"
144 144 "\n"
145 145 "The optional ``read_size`` and ``write_size`` arguments control the chunk\n"
146 146 "size of data that is ``read()`` and ``write()`` between streams. They default\n"
147 147 "to the default input and output sizes of zstd decompressor streams.\n"
148 148 );
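/* Illustrative Python usage of copy_stream() (editor's sketch; ``src`` and
 * ``dst`` are assumed to be binary file objects, with ``src`` positioned at
 * the start of a zstd frame):
 *
 *     dctx = zstd.ZstdDecompressor()
 *     bytes_read, bytes_written = dctx.copy_stream(src, dst)
 */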
149 149
150 150 static PyObject* Decompressor_copy_stream(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
151 151 static char* kwlist[] = {
152 152 "ifh",
153 153 "ofh",
154 154 "read_size",
155 155 "write_size",
156 156 NULL
157 157 };
158 158
159 159 PyObject* source;
160 160 PyObject* dest;
161 161 size_t inSize = ZSTD_DStreamInSize();
162 162 size_t outSize = ZSTD_DStreamOutSize();
163 163 ZSTD_inBuffer input;
164 164 ZSTD_outBuffer output;
165 165 Py_ssize_t totalRead = 0;
166 166 Py_ssize_t totalWrite = 0;
167 167 char* readBuffer;
168 168 Py_ssize_t readSize;
169 169 PyObject* readResult = NULL;
170 170 PyObject* res = NULL;
171 171 size_t zresult = 0;
172 172 PyObject* writeResult;
173 173 PyObject* totalReadPy;
174 174 PyObject* totalWritePy;
175 175
176 176 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|kk:copy_stream", kwlist,
177 177 &source, &dest, &inSize, &outSize)) {
178 178 return NULL;
179 179 }
180 180
181 181 if (!PyObject_HasAttrString(source, "read")) {
182 182 PyErr_SetString(PyExc_ValueError, "first argument must have a read() method");
183 183 return NULL;
184 184 }
185 185
186 186 if (!PyObject_HasAttrString(dest, "write")) {
187 187 PyErr_SetString(PyExc_ValueError, "second argument must have a write() method");
188 188 return NULL;
189 189 }
190 190
191 191 /* Prevent free on uninitialized memory in finally. */
192 192 output.dst = NULL;
193 193
194 194 if (ensure_dctx(self, 1)) {
195 195 res = NULL;
196 196 goto finally;
197 197 }
198 198
199 199 output.dst = PyMem_Malloc(outSize);
200 200 if (!output.dst) {
201 201 PyErr_NoMemory();
202 202 res = NULL;
203 203 goto finally;
204 204 }
205 205 output.size = outSize;
206 206 output.pos = 0;
207 207
208 208 /* Read source stream until EOF */
209 209 while (1) {
210 210 readResult = PyObject_CallMethod(source, "read", "n", inSize);
211 211 if (!readResult) {
212 212 PyErr_SetString(ZstdError, "could not read() from source");
213 213 goto finally;
214 214 }
215 215
216 216 PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize);
217 217
218 218 /* If no data was read, we're at EOF. */
219 219 if (0 == readSize) {
220 220 break;
221 221 }
222 222
223 223 totalRead += readSize;
224 224
225 225 /* Send data to decompressor */
226 226 input.src = readBuffer;
227 227 input.size = readSize;
228 228 input.pos = 0;
229 229
230 230 while (input.pos < input.size) {
231 231 Py_BEGIN_ALLOW_THREADS
232 232 zresult = ZSTD_decompress_generic(self->dctx, &output, &input);
233 233 Py_END_ALLOW_THREADS
234 234
235 235 if (ZSTD_isError(zresult)) {
236 236 PyErr_Format(ZstdError, "zstd decompressor error: %s",
237 237 ZSTD_getErrorName(zresult));
238 238 res = NULL;
239 239 goto finally;
240 240 }
241 241
242 242 if (output.pos) {
243 243 #if PY_MAJOR_VERSION >= 3
244 244 writeResult = PyObject_CallMethod(dest, "write", "y#",
245 245 #else
246 246 writeResult = PyObject_CallMethod(dest, "write", "s#",
247 247 #endif
248 248 output.dst, output.pos);
249 249
250 250 Py_XDECREF(writeResult);
251 251 totalWrite += output.pos;
252 252 output.pos = 0;
253 253 }
254 254 }
255 255
256 256 Py_CLEAR(readResult);
257 257 }
258 258
259 259 /* Source stream is exhausted. Finish up. */
260 260
261 261 totalReadPy = PyLong_FromSsize_t(totalRead);
262 262 totalWritePy = PyLong_FromSsize_t(totalWrite);
263 263 res = PyTuple_Pack(2, totalReadPy, totalWritePy);
264 264 Py_DECREF(totalReadPy);
265 265 Py_DECREF(totalWritePy);
266 266
267 267 finally:
268 268 if (output.dst) {
269 269 PyMem_Free(output.dst);
270 270 }
271 271
272 272 Py_XDECREF(readResult);
273 273
274 274 return res;
275 275 }
276 276
277 277 PyDoc_STRVAR(Decompressor_decompress__doc__,
278 278 "decompress(data[, max_output_size=None]) -- Decompress data in its entirety\n"
279 279 "\n"
280 280 "This method will decompress the entirety of the argument and return the\n"
281 281 "result.\n"
282 282 "\n"
283 283 "The input bytes are expected to contain a full Zstandard frame (something\n"
284 284 "compressed with ``ZstdCompressor.compress()`` or similar). If the input does\n"
285 285 "not contain a full frame, an exception will be raised.\n"
286 286 "\n"
287 287 "If the frame header of the compressed data does not contain the content size\n"
288 288 "``max_output_size`` must be specified or ``ZstdError`` will be raised. An\n"
289 289 "allocation of size ``max_output_size`` will be performed and an attempt will\n"
290 290 "be made to perform decompression into that buffer. If the buffer is too\n"
291 291 "small or cannot be allocated, ``ZstdError`` will be raised. The buffer will\n"
292 292 "be resized if it is too large.\n"
293 293 "\n"
294 294 "Uncompressed data could be much larger than compressed data. As a result,\n"
295 295 "calling this function could result in a very large memory allocation being\n"
296 296 "performed to hold the uncompressed data. Therefore it is **highly**\n"
297 297 "recommended to use a streaming decompression method instead of this one.\n"
298 298 );
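/* Illustrative Python usage of decompress() (editor's sketch; ``frame`` is
 * assumed to hold a complete zstd frame):
 *
 *     dctx = zstd.ZstdDecompressor()
 *     data = dctx.decompress(frame)
 *
 *     # If the frame header does not record the content size, an upper
 *     # bound on the output must be supplied:
 *     data = dctx.decompress(frame, max_output_size=1048576)
 */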
299 299
300 300 PyObject* Decompressor_decompress(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
301 301 static char* kwlist[] = {
302 302 "data",
303 303 "max_output_size",
304 304 NULL
305 305 };
306 306
307 307 Py_buffer source;
308 308 Py_ssize_t maxOutputSize = 0;
309 309 unsigned long long decompressedSize;
310 310 size_t destCapacity;
311 311 PyObject* result = NULL;
312 312 size_t zresult;
313 313 ZSTD_outBuffer outBuffer;
314 314 ZSTD_inBuffer inBuffer;
315 315
316 316 #if PY_MAJOR_VERSION >= 3
317 317 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|n:decompress",
318 318 #else
319 319 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|n:decompress",
320 320 #endif
321 321 kwlist, &source, &maxOutputSize)) {
322 322 return NULL;
323 323 }
324 324
325 325 if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) {
326 326 PyErr_SetString(PyExc_ValueError,
327 327 "data buffer should be contiguous and have at most one dimension");
328 328 goto finally;
329 329 }
330 330
331 331 if (ensure_dctx(self, 1)) {
332 332 goto finally;
333 333 }
334 334
335 335 decompressedSize = ZSTD_getFrameContentSize(source.buf, source.len);
336 336
337 337 if (ZSTD_CONTENTSIZE_ERROR == decompressedSize) {
338 338 PyErr_SetString(ZstdError, "error determining content size from frame header");
339 339 goto finally;
340 340 }
341 341 /* Special case of empty frame. */
342 342 else if (0 == decompressedSize) {
343 343 result = PyBytes_FromStringAndSize("", 0);
344 344 goto finally;
345 345 }
346 346 /* Missing content size in frame header. */
347 347 if (ZSTD_CONTENTSIZE_UNKNOWN == decompressedSize) {
348 348 if (0 == maxOutputSize) {
349 349 PyErr_SetString(ZstdError, "could not determine content size in frame header");
350 350 goto finally;
351 351 }
352 352
353 353 result = PyBytes_FromStringAndSize(NULL, maxOutputSize);
354 354 destCapacity = maxOutputSize;
355 355 decompressedSize = 0;
356 356 }
357 357 /* Size is recorded in frame header. */
358 358 else {
359 359 assert(SIZE_MAX >= PY_SSIZE_T_MAX);
360 360 if (decompressedSize > PY_SSIZE_T_MAX) {
361 361 PyErr_SetString(ZstdError, "frame is too large to decompress on this platform");
362 362 goto finally;
363 363 }
364 364
365 365 result = PyBytes_FromStringAndSize(NULL, (Py_ssize_t)decompressedSize);
366 366 destCapacity = (size_t)decompressedSize;
367 367 }
368 368
369 369 if (!result) {
370 370 goto finally;
371 371 }
372 372
373 373 outBuffer.dst = PyBytes_AsString(result);
374 374 outBuffer.size = destCapacity;
375 375 outBuffer.pos = 0;
376 376
377 377 inBuffer.src = source.buf;
378 378 inBuffer.size = source.len;
379 379 inBuffer.pos = 0;
380 380
381 381 Py_BEGIN_ALLOW_THREADS
382 382 zresult = ZSTD_decompress_generic(self->dctx, &outBuffer, &inBuffer);
383 383 Py_END_ALLOW_THREADS
384 384
385 385 if (ZSTD_isError(zresult)) {
386 386 PyErr_Format(ZstdError, "decompression error: %s", ZSTD_getErrorName(zresult));
387 387 Py_CLEAR(result);
388 388 goto finally;
389 389 }
390 390 else if (zresult) {
391 391 PyErr_Format(ZstdError, "decompression error: did not decompress full frame");
392 392 Py_CLEAR(result);
393 393 goto finally;
394 394 }
395 395 else if (decompressedSize && outBuffer.pos != decompressedSize) {
396 396 PyErr_Format(ZstdError, "decompression error: decompressed %zu bytes; expected %llu",
397 397 outBuffer.pos, decompressedSize);
398 398 Py_CLEAR(result);
399 399 goto finally;
400 400 }
401 401 else if (outBuffer.pos < destCapacity) {
402 402 if (safe_pybytes_resize(&result, outBuffer.pos)) {
403 403 Py_CLEAR(result);
404 404 goto finally;
405 405 }
406 406 }
407 407
408 408 finally:
409 409 PyBuffer_Release(&source);
410 410 return result;
411 411 }
412 412
413 413 PyDoc_STRVAR(Decompressor_decompressobj__doc__,
414 414 "decompressobj([write_size=default])\n"
415 415 "\n"
416 416 "Incrementally feed data into a decompressor.\n"
417 417 "\n"
418 418 "The returned object exposes a ``decompress(data)`` method. This makes it\n"
419 419 "compatible with ``zlib.decompressobj`` and ``bz2.BZ2Decompressor`` so that\n"
420 420 "callers can swap in the zstd decompressor while using the same API.\n"
421 421 );
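/* Illustrative Python usage of decompressobj() (editor's sketch; ``chunks``
 * is assumed to be an iterable of bytes that together form one zstd frame):
 *
 *     dctx = zstd.ZstdDecompressor()
 *     dobj = dctx.decompressobj()
 *     data = b"".join(dobj.decompress(chunk) for chunk in chunks)
 */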
422 422
423 423 static ZstdDecompressionObj* Decompressor_decompressobj(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
424 424 static char* kwlist[] = {
425 425 "write_size",
426 426 NULL
427 427 };
428 428
429 429 ZstdDecompressionObj* result = NULL;
430 430 size_t outSize = ZSTD_DStreamOutSize();
431 431
432 432 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|k:decompressobj", kwlist, &outSize)) {
433 433 return NULL;
434 434 }
435 435
436 436 if (!outSize) {
437 437 PyErr_SetString(PyExc_ValueError, "write_size must be positive");
438 438 return NULL;
439 439 }
440 440
441 441 result = (ZstdDecompressionObj*)PyObject_CallObject((PyObject*)&ZstdDecompressionObjType, NULL);
442 442 if (!result) {
443 443 return NULL;
444 444 }
445 445
446 446 if (ensure_dctx(self, 1)) {
447 447 Py_DECREF(result);
448 448 return NULL;
449 449 }
450 450
451 451 result->decompressor = self;
452 452 Py_INCREF(result->decompressor);
453 453 result->outSize = outSize;
454 454
455 455 return result;
456 456 }
457 457
458 458 PyDoc_STRVAR(Decompressor_read_to_iter__doc__,
459 459 "read_to_iter(reader[, read_size=default, write_size=default, skip_bytes=0])\n"
460 460 "Read compressed data and return an iterator\n"
461 461 "\n"
462 462 "Returns an iterator of decompressed data chunks produced from reading from\n"
463 463 "the ``reader``.\n"
464 464 "\n"
465 465 "Compressed data will be obtained from ``reader`` by calling the\n"
466 466 "``read(size)`` method of it. The source data will be streamed into a\n"
467 467 "decompressor. As decompressed data is available, it will be exposed to the\n"
468 468 "returned iterator.\n"
469 469 "\n"
470 470 "Data is ``read()`` in chunks of size ``read_size`` and exposed to the\n"
471 471 "iterator in chunks of size ``write_size``. The default values are the input\n"
472 472 "and output sizes for a zstd streaming decompressor.\n"
473 473 "\n"
474 474 "There is also support for skipping the first ``skip_bytes`` of data from\n"
475 475 "the source.\n"
476 476 );
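/* Illustrative Python usage of read_to_iter() (editor's sketch; ``fh`` is
 * assumed to be a binary file object positioned at a zstd frame and
 * ``process`` a placeholder for caller code):
 *
 *     dctx = zstd.ZstdDecompressor()
 *     for chunk in dctx.read_to_iter(fh):
 *         process(chunk)
 */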
477 477
478 478 static ZstdDecompressorIterator* Decompressor_read_to_iter(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
479 479 static char* kwlist[] = {
480 480 "reader",
481 481 "read_size",
482 482 "write_size",
483 483 "skip_bytes",
484 484 NULL
485 485 };
486 486
487 487 PyObject* reader;
488 488 size_t inSize = ZSTD_DStreamInSize();
489 489 size_t outSize = ZSTD_DStreamOutSize();
490 490 ZstdDecompressorIterator* result;
491 491 size_t skipBytes = 0;
492 492
493 493 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk:read_to_iter", kwlist,
494 494 &reader, &inSize, &outSize, &skipBytes)) {
495 495 return NULL;
496 496 }
497 497
498 498 if (skipBytes >= inSize) {
499 499 PyErr_SetString(PyExc_ValueError,
500 500 "skip_bytes must be smaller than read_size");
501 501 return NULL;
502 502 }
503 503
504 504 result = (ZstdDecompressorIterator*)PyObject_CallObject((PyObject*)&ZstdDecompressorIteratorType, NULL);
505 505 if (!result) {
506 506 return NULL;
507 507 }
508 508
509 509 if (PyObject_HasAttrString(reader, "read")) {
510 510 result->reader = reader;
511 511 Py_INCREF(result->reader);
512 512 }
513 513 else if (1 == PyObject_CheckBuffer(reader)) {
514 514 /* Object claims it is a buffer. Try to get a handle to it. */
515 515 if (0 != PyObject_GetBuffer(reader, &result->buffer, PyBUF_CONTIG_RO)) {
516 516 goto except;
517 517 }
518 518 }
519 519 else {
520 520 PyErr_SetString(PyExc_ValueError,
521 521 "must pass an object with a read() method or conforms to buffer protocol");
522 522 goto except;
523 523 }
524 524
525 525 result->decompressor = self;
526 526 Py_INCREF(result->decompressor);
527 527
528 528 result->inSize = inSize;
529 529 result->outSize = outSize;
530 530 result->skipBytes = skipBytes;
531 531
532 532 if (ensure_dctx(self, 1)) {
533 533 goto except;
534 534 }
535 535
536 536 result->input.src = PyMem_Malloc(inSize);
537 537 if (!result->input.src) {
538 538 PyErr_NoMemory();
539 539 goto except;
540 540 }
541 541
542 542 goto finally;
543 543
544 544 except:
545 545 Py_CLEAR(result);
546 546
547 547 finally:
548 548
549 549 return result;
550 550 }
551 551
552 552 PyDoc_STRVAR(Decompressor_stream_reader__doc__,
553 553 "stream_reader(source, [read_size=default])\n"
554 554 "\n"
555 555 "Obtain an object that behaves like an I/O stream that can be used for\n"
556 556 "reading decompressed output from an object.\n"
557 557 "\n"
558 558 "The source object can be any object with a ``read(size)`` method or that\n"
559 559 "conforms to the buffer protocol.\n"
560 560 );
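/* Illustrative Python usage of stream_reader() (editor's sketch; ``fh`` is
 * assumed to be a binary file object containing a zstd frame):
 *
 *     dctx = zstd.ZstdDecompressor()
 *     with dctx.stream_reader(fh) as reader:
 *         while True:
 *             chunk = reader.read(16384)
 *             if not chunk:
 *                 break
 */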
561 561
562 562 static ZstdDecompressionReader* Decompressor_stream_reader(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
563 563 static char* kwlist[] = {
564 564 "source",
565 565 "read_size",
566 566 NULL
567 567 };
568 568
569 569 PyObject* source;
570 570 size_t readSize = ZSTD_DStreamInSize();
571 571 ZstdDecompressionReader* result;
572 572
573 573 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k:stream_reader", kwlist,
574 574 &source, &readSize)) {
575 575 return NULL;
576 576 }
577 577
578 if (ensure_dctx(self, 1)) {
579 return NULL;
580 }
581
578 582 result = (ZstdDecompressionReader*)PyObject_CallObject((PyObject*)&ZstdDecompressionReaderType, NULL);
579 583 if (NULL == result) {
580 584 return NULL;
581 585 }
582 586
583 587 if (PyObject_HasAttrString(source, "read")) {
584 588 result->reader = source;
585 589 Py_INCREF(source);
586 590 result->readSize = readSize;
587 591 }
588 592 else if (1 == PyObject_CheckBuffer(source)) {
589 593 if (0 != PyObject_GetBuffer(source, &result->buffer, PyBUF_CONTIG_RO)) {
590 594 Py_CLEAR(result);
591 595 return NULL;
592 596 }
593 597 }
594 598 else {
595 599 PyErr_SetString(PyExc_TypeError,
596 600 "must pass an object with a read() method or that conforms to the buffer protocol");
597 601 Py_CLEAR(result);
598 602 return NULL;
599 603 }
600 604
601 605 result->decompressor = self;
602 606 Py_INCREF(self);
603 607
604 608 return result;
605 609 }
606 610
607 611 PyDoc_STRVAR(Decompressor_stream_writer__doc__,
608 612 "Create a context manager to write decompressed data to an object.\n"
609 613 "\n"
610 614 "The passed object must have a ``write()`` method.\n"
611 615 "\n"
612 616 "The caller feeds intput data to the object by calling ``write(data)``.\n"
613 617 "Decompressed data is written to the argument given as it is decompressed.\n"
614 618 "\n"
615 619 "An optional ``write_size`` argument defines the size of chunks to\n"
616 620 "``write()`` to the writer. It defaults to the default output size for a zstd\n"
617 621 "streaming decompressor.\n"
618 622 );
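/* Illustrative Python usage of stream_writer() (editor's sketch; ``ofh`` is
 * assumed to be a writable binary file object and ``compressed_chunks`` an
 * iterable of bytes holding zstd-compressed data):
 *
 *     dctx = zstd.ZstdDecompressor()
 *     with dctx.stream_writer(ofh) as decompressor:
 *         for chunk in compressed_chunks:
 *             decompressor.write(chunk)
 */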
619 623
620 624 static ZstdDecompressionWriter* Decompressor_stream_writer(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
621 625 static char* kwlist[] = {
622 626 "writer",
623 627 "write_size",
624 628 NULL
625 629 };
626 630
627 631 PyObject* writer;
628 632 size_t outSize = ZSTD_DStreamOutSize();
629 633 ZstdDecompressionWriter* result;
630 634
631 635 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k:stream_writer", kwlist,
632 636 &writer, &outSize)) {
633 637 return NULL;
634 638 }
635 639
636 640 if (!PyObject_HasAttrString(writer, "write")) {
637 641 PyErr_SetString(PyExc_ValueError, "must pass an object with a write() method");
638 642 return NULL;
639 643 }
640 644
641 645 result = (ZstdDecompressionWriter*)PyObject_CallObject((PyObject*)&ZstdDecompressionWriterType, NULL);
642 646 if (!result) {
643 647 return NULL;
644 648 }
645 649
646 650 result->decompressor = self;
647 651 Py_INCREF(result->decompressor);
648 652
649 653 result->writer = writer;
650 654 Py_INCREF(result->writer);
651 655
652 656 result->outSize = outSize;
653 657
654 658 return result;
655 659 }
656 660
657 661 PyDoc_STRVAR(Decompressor_decompress_content_dict_chain__doc__,
658 662 "Decompress a series of chunks using the content dictionary chaining technique\n"
659 663 );
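/* Illustrative Python usage of decompress_content_dict_chain() (editor's
 * sketch; ``frames`` is assumed to be a list of bytes in which frame N was
 * compressed using the decompressed output of frame N-1 as a prefix
 * dictionary):
 *
 *     dctx = zstd.ZstdDecompressor()
 *     last_fulltext = dctx.decompress_content_dict_chain(frames)
 */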
660 664
661 665 static PyObject* Decompressor_decompress_content_dict_chain(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
662 666 static char* kwlist[] = {
663 667 "frames",
664 668 NULL
665 669 };
666 670
667 671 PyObject* chunks;
668 672 Py_ssize_t chunksLen;
669 673 Py_ssize_t chunkIndex;
670 674 char parity = 0;
671 675 PyObject* chunk;
672 676 char* chunkData;
673 677 Py_ssize_t chunkSize;
674 678 size_t zresult;
675 679 ZSTD_frameHeader frameHeader;
676 680 void* buffer1 = NULL;
677 681 size_t buffer1Size = 0;
678 682 size_t buffer1ContentSize = 0;
679 683 void* buffer2 = NULL;
680 684 size_t buffer2Size = 0;
681 685 size_t buffer2ContentSize = 0;
682 686 void* destBuffer = NULL;
683 687 PyObject* result = NULL;
684 688 ZSTD_outBuffer outBuffer;
685 689 ZSTD_inBuffer inBuffer;
686 690
687 691 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!:decompress_content_dict_chain",
688 692 kwlist, &PyList_Type, &chunks)) {
689 693 return NULL;
690 694 }
691 695
692 696 chunksLen = PyList_Size(chunks);
693 697 if (!chunksLen) {
694 698 PyErr_SetString(PyExc_ValueError, "empty input chain");
695 699 return NULL;
696 700 }
697 701
698 702 /* The first chunk should not be using a dictionary. We handle it specially. */
699 703 chunk = PyList_GetItem(chunks, 0);
700 704 if (!PyBytes_Check(chunk)) {
701 705 PyErr_SetString(PyExc_ValueError, "chunk 0 must be bytes");
702 706 return NULL;
703 707 }
704 708
705 709 /* We require that all chunks be zstd frames and that they have content size set. */
706 710 PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize);
707 711 zresult = ZSTD_getFrameHeader(&frameHeader, (void*)chunkData, chunkSize);
708 712 if (ZSTD_isError(zresult)) {
709 713 PyErr_SetString(PyExc_ValueError, "chunk 0 is not a valid zstd frame");
710 714 return NULL;
711 715 }
712 716 else if (zresult) {
713 717 PyErr_SetString(PyExc_ValueError, "chunk 0 is too small to contain a zstd frame");
714 718 return NULL;
715 719 }
716 720
717 721 if (ZSTD_CONTENTSIZE_UNKNOWN == frameHeader.frameContentSize) {
718 722 PyErr_SetString(PyExc_ValueError, "chunk 0 missing content size in frame");
719 723 return NULL;
720 724 }
721 725
722 726 assert(ZSTD_CONTENTSIZE_ERROR != frameHeader.frameContentSize);
723 727
724 728 /* We check against PY_SSIZE_T_MAX here because we ultimately cast the
725 729 * result to a Python object and it's length can be no greater than
726 730 * Py_ssize_t. In theory, we could have an intermediate frame that is
727 731 * larger. But a) why would this API be used for frames that large b)
728 732 * it isn't worth the complexity to support. */
729 733 assert(SIZE_MAX >= PY_SSIZE_T_MAX);
730 734 if (frameHeader.frameContentSize > PY_SSIZE_T_MAX) {
731 735 PyErr_SetString(PyExc_ValueError,
732 736 "chunk 0 is too large to decompress on this platform");
733 737 return NULL;
734 738 }
735 739
736 740 if (ensure_dctx(self, 0)) {
737 741 goto finally;
738 742 }
739 743
740 744 buffer1Size = (size_t)frameHeader.frameContentSize;
741 745 buffer1 = PyMem_Malloc(buffer1Size);
742 746 if (!buffer1) {
743 747 goto finally;
744 748 }
745 749
746 750 outBuffer.dst = buffer1;
747 751 outBuffer.size = buffer1Size;
748 752 outBuffer.pos = 0;
749 753
750 754 inBuffer.src = chunkData;
751 755 inBuffer.size = chunkSize;
752 756 inBuffer.pos = 0;
753 757
754 758 Py_BEGIN_ALLOW_THREADS
755 759 zresult = ZSTD_decompress_generic(self->dctx, &outBuffer, &inBuffer);
756 760 Py_END_ALLOW_THREADS
757 761 if (ZSTD_isError(zresult)) {
758 762 PyErr_Format(ZstdError, "could not decompress chunk 0: %s", ZSTD_getErrorName(zresult));
759 763 goto finally;
760 764 }
761 765 else if (zresult) {
762 766 PyErr_Format(ZstdError, "chunk 0 did not decompress full frame");
763 767 goto finally;
764 768 }
765 769
766 770 buffer1ContentSize = outBuffer.pos;
767 771
768 772 /* Special case of a simple chain. */
769 773 if (1 == chunksLen) {
770 774 result = PyBytes_FromStringAndSize(buffer1, buffer1Size);
771 775 goto finally;
772 776 }
773 777
774 778 /* This should ideally look at next chunk. But this is slightly simpler. */
775 779 buffer2Size = (size_t)frameHeader.frameContentSize;
776 780 buffer2 = PyMem_Malloc(buffer2Size);
777 781 if (!buffer2) {
778 782 goto finally;
779 783 }
780 784
781 785 /* For each subsequent chunk, use the previous fulltext as a content dictionary.
782 786 Our strategy is to have 2 buffers. One holds the previous fulltext (to be
783 787 used as a content dictionary) and the other holds the new fulltext. The
784 788 buffers grow when needed but never decrease in size. This limits the
785 789 memory allocator overhead.
786 790 */
787 791 for (chunkIndex = 1; chunkIndex < chunksLen; chunkIndex++) {
788 792 chunk = PyList_GetItem(chunks, chunkIndex);
789 793 if (!PyBytes_Check(chunk)) {
790 794 PyErr_Format(PyExc_ValueError, "chunk %zd must be bytes", chunkIndex);
791 795 goto finally;
792 796 }
793 797
794 798 PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize);
795 799 zresult = ZSTD_getFrameHeader(&frameHeader, (void*)chunkData, chunkSize);
796 800 if (ZSTD_isError(zresult)) {
797 801 PyErr_Format(PyExc_ValueError, "chunk %zd is not a valid zstd frame", chunkIndex);
798 802 goto finally;
799 803 }
800 804 else if (zresult) {
801 805 PyErr_Format(PyExc_ValueError, "chunk %zd is too small to contain a zstd frame", chunkIndex);
802 806 goto finally;
803 807 }
804 808
805 809 if (ZSTD_CONTENTSIZE_UNKNOWN == frameHeader.frameContentSize) {
806 810 PyErr_Format(PyExc_ValueError, "chunk %zd missing content size in frame", chunkIndex);
807 811 goto finally;
808 812 }
809 813
810 814 assert(ZSTD_CONTENTSIZE_ERROR != frameHeader.frameContentSize);
811 815
812 816 if (frameHeader.frameContentSize > PY_SSIZE_T_MAX) {
813 817 PyErr_Format(PyExc_ValueError,
814 818 "chunk %zd is too large to decompress on this platform", chunkIndex);
815 819 goto finally;
816 820 }
817 821
818 822 inBuffer.src = chunkData;
819 823 inBuffer.size = chunkSize;
820 824 inBuffer.pos = 0;
821 825
822 826 parity = chunkIndex % 2;
823 827
824 828 /* This could definitely be abstracted to reduce code duplication. */
825 829 if (parity) {
826 830 /* Resize destination buffer to hold larger content. */
827 831 if (buffer2Size < frameHeader.frameContentSize) {
828 832 buffer2Size = (size_t)frameHeader.frameContentSize;
829 833 destBuffer = PyMem_Realloc(buffer2, buffer2Size);
830 834 if (!destBuffer) {
831 835 goto finally;
832 836 }
833 837 buffer2 = destBuffer;
834 838 }
835 839
836 840 Py_BEGIN_ALLOW_THREADS
837 841 zresult = ZSTD_DCtx_refPrefix_advanced(self->dctx,
838 842 buffer1, buffer1ContentSize, ZSTD_dct_rawContent);
839 843 Py_END_ALLOW_THREADS
840 844 if (ZSTD_isError(zresult)) {
841 845 PyErr_Format(ZstdError,
842 846 "failed to load prefix dictionary at chunk %zd", chunkIndex);
843 847 goto finally;
844 848 }
845 849
846 850 outBuffer.dst = buffer2;
847 851 outBuffer.size = buffer2Size;
848 852 outBuffer.pos = 0;
849 853
850 854 Py_BEGIN_ALLOW_THREADS
851 855 zresult = ZSTD_decompress_generic(self->dctx, &outBuffer, &inBuffer);
852 856 Py_END_ALLOW_THREADS
853 857 if (ZSTD_isError(zresult)) {
854 858 PyErr_Format(ZstdError, "could not decompress chunk %zd: %s",
855 859 chunkIndex, ZSTD_getErrorName(zresult));
856 860 goto finally;
857 861 }
858 862 else if (zresult) {
859 863 PyErr_Format(ZstdError, "chunk %zd did not decompress full frame",
860 864 chunkIndex);
861 865 goto finally;
862 866 }
863 867
864 868 buffer2ContentSize = outBuffer.pos;
865 869 }
866 870 else {
867 871 if (buffer1Size < frameHeader.frameContentSize) {
868 872 buffer1Size = (size_t)frameHeader.frameContentSize;
869 873 destBuffer = PyMem_Realloc(buffer1, buffer1Size);
870 874 if (!destBuffer) {
871 875 goto finally;
872 876 }
873 877 buffer1 = destBuffer;
874 878 }
875 879
876 880 Py_BEGIN_ALLOW_THREADS
877 881 zresult = ZSTD_DCtx_refPrefix_advanced(self->dctx,
878 882 buffer2, buffer2ContentSize, ZSTD_dct_rawContent);
879 883 Py_END_ALLOW_THREADS
880 884 if (ZSTD_isError(zresult)) {
881 885 PyErr_Format(ZstdError,
882 886 "failed to load prefix dictionary at chunk %zd", chunkIndex);
883 887 goto finally;
884 888 }
885 889
886 890 outBuffer.dst = buffer1;
887 891 outBuffer.size = buffer1Size;
888 892 outBuffer.pos = 0;
889 893
890 894 Py_BEGIN_ALLOW_THREADS
891 895 zresult = ZSTD_decompress_generic(self->dctx, &outBuffer, &inBuffer);
892 896 Py_END_ALLOW_THREADS
893 897 if (ZSTD_isError(zresult)) {
894 898 PyErr_Format(ZstdError, "could not decompress chunk %zd: %s",
895 899 chunkIndex, ZSTD_getErrorName(zresult));
896 900 goto finally;
897 901 }
898 902 else if (zresult) {
899 903 PyErr_Format(ZstdError, "chunk %zd did not decompress full frame",
900 904 chunkIndex);
901 905 goto finally;
902 906 }
903 907
904 908 buffer1ContentSize = outBuffer.pos;
905 909 }
906 910 }
907 911
908 912 result = PyBytes_FromStringAndSize(parity ? buffer2 : buffer1,
909 913 parity ? buffer2ContentSize : buffer1ContentSize);
910 914
911 915 finally:
912 916 if (buffer2) {
913 917 PyMem_Free(buffer2);
914 918 }
915 919 if (buffer1) {
916 920 PyMem_Free(buffer1);
917 921 }
918 922
919 923 return result;
920 924 }
921 925
922 926 typedef struct {
923 927 void* sourceData;
924 928 size_t sourceSize;
925 929 size_t destSize;
926 930 } FramePointer;
927 931
928 932 typedef struct {
929 933 FramePointer* frames;
930 934 Py_ssize_t framesSize;
931 935 unsigned long long compressedSize;
932 936 } FrameSources;
933 937
934 938 typedef struct {
935 939 void* dest;
936 940 Py_ssize_t destSize;
937 941 BufferSegment* segments;
938 942 Py_ssize_t segmentsSize;
939 943 } DestBuffer;
940 944
941 945 typedef enum {
942 946 WorkerError_none = 0,
943 947 WorkerError_zstd = 1,
944 948 WorkerError_memory = 2,
945 949 WorkerError_sizeMismatch = 3,
946 950 WorkerError_unknownSize = 4,
947 951 } WorkerError;
948 952
949 953 typedef struct {
950 954 /* Source records and length */
951 955 FramePointer* framePointers;
952 956 /* Which records to process. */
953 957 Py_ssize_t startOffset;
954 958 Py_ssize_t endOffset;
955 959 unsigned long long totalSourceSize;
956 960
957 961 /* Compression state and settings. */
958 962 ZSTD_DCtx* dctx;
959 963 int requireOutputSizes;
960 964
961 965 /* Output storage. */
962 966 DestBuffer* destBuffers;
963 967 Py_ssize_t destCount;
964 968
965 969 /* Item that error occurred on. */
966 970 Py_ssize_t errorOffset;
967 971 /* If an error occurred. */
968 972 WorkerError error;
969 973 /* result from zstd decompression operation */
970 974 size_t zresult;
971 975 } WorkerState;
972 976
973 977 static void decompress_worker(WorkerState* state) {
974 978 size_t allocationSize;
975 979 DestBuffer* destBuffer;
976 980 Py_ssize_t frameIndex;
977 981 Py_ssize_t localOffset = 0;
978 982 Py_ssize_t currentBufferStartIndex = state->startOffset;
979 983 Py_ssize_t remainingItems = state->endOffset - state->startOffset + 1;
980 984 void* tmpBuf;
981 985 Py_ssize_t destOffset = 0;
982 986 FramePointer* framePointers = state->framePointers;
983 987 size_t zresult;
984 988 unsigned long long totalOutputSize = 0;
985 989
986 990 assert(NULL == state->destBuffers);
987 991 assert(0 == state->destCount);
988 992 assert(state->endOffset - state->startOffset >= 0);
989 993
990 994 /* We could get here due to the way work is allocated. Ideally we wouldn't
991 995 get here. But that would require a bit of a refactor in the caller. */
992 996 if (state->totalSourceSize > SIZE_MAX) {
993 997 state->error = WorkerError_memory;
994 998 state->errorOffset = 0;
995 999 return;
996 1000 }
997 1001
998 1002 /*
999 1003 * We need to allocate a buffer to hold decompressed data. How we do this
1000 1004 * depends on what we know about the output. The following scenarios are
1001 1005 * possible:
1002 1006 *
1003 1007 * 1. All structs defining frames declare the output size.
1004 1008 * 2. The decompressed size is embedded within the zstd frame.
1005 1009 * 3. The decompressed size is not stored anywhere.
1006 1010 *
1007 1011 * For now, we only support #1 and #2.
1008 1012 */
1009 1013
1010 1014 /* Resolve output segments. */
1011 1015 for (frameIndex = state->startOffset; frameIndex <= state->endOffset; frameIndex++) {
1012 1016 FramePointer* fp = &framePointers[frameIndex];
1013 1017 unsigned long long decompressedSize;
1014 1018
1015 1019 if (0 == fp->destSize) {
1016 1020 decompressedSize = ZSTD_getFrameContentSize(fp->sourceData, fp->sourceSize);
1017 1021
1018 1022 if (ZSTD_CONTENTSIZE_ERROR == decompressedSize) {
1019 1023 state->error = WorkerError_unknownSize;
1020 1024 state->errorOffset = frameIndex;
1021 1025 return;
1022 1026 }
1023 1027 else if (ZSTD_CONTENTSIZE_UNKNOWN == decompressedSize) {
1024 1028 if (state->requireOutputSizes) {
1025 1029 state->error = WorkerError_unknownSize;
1026 1030 state->errorOffset = frameIndex;
1027 1031 return;
1028 1032 }
1029 1033
1030 1034 /* This will fail the assert for .destSize > 0 below. */
1031 1035 decompressedSize = 0;
1032 1036 }
1033 1037
1034 1038 if (decompressedSize > SIZE_MAX) {
1035 1039 state->error = WorkerError_memory;
1036 1040 state->errorOffset = frameIndex;
1037 1041 return;
1038 1042 }
1039 1043
1040 1044 fp->destSize = (size_t)decompressedSize;
1041 1045 }
1042 1046
1043 1047 totalOutputSize += fp->destSize;
1044 1048 }
1045 1049
1046 1050 state->destBuffers = calloc(1, sizeof(DestBuffer));
1047 1051 if (NULL == state->destBuffers) {
1048 1052 state->error = WorkerError_memory;
1049 1053 return;
1050 1054 }
1051 1055
1052 1056 state->destCount = 1;
1053 1057
1054 1058 destBuffer = &state->destBuffers[state->destCount - 1];
1055 1059
1056 1060 assert(framePointers[state->startOffset].destSize > 0); /* For now. */
1057 1061
1058 1062 allocationSize = roundpow2((size_t)state->totalSourceSize);
1059 1063
1060 1064 if (framePointers[state->startOffset].destSize > allocationSize) {
1061 1065 allocationSize = roundpow2(framePointers[state->startOffset].destSize);
1062 1066 }
1063 1067
1064 1068 destBuffer->dest = malloc(allocationSize);
1065 1069 if (NULL == destBuffer->dest) {
1066 1070 state->error = WorkerError_memory;
1067 1071 return;
1068 1072 }
1069 1073
1070 1074 destBuffer->destSize = allocationSize;
1071 1075
1072 1076 destBuffer->segments = calloc(remainingItems, sizeof(BufferSegment));
1073 1077 if (NULL == destBuffer->segments) {
1074 1078 /* Caller will free state->dest as part of cleanup. */
1075 1079 state->error = WorkerError_memory;
1076 1080 return;
1077 1081 }
1078 1082
1079 1083 destBuffer->segmentsSize = remainingItems;
1080 1084
1081 1085 for (frameIndex = state->startOffset; frameIndex <= state->endOffset; frameIndex++) {
1082 1086 ZSTD_outBuffer outBuffer;
1083 1087 ZSTD_inBuffer inBuffer;
1084 1088 const void* source = framePointers[frameIndex].sourceData;
1085 1089 const size_t sourceSize = framePointers[frameIndex].sourceSize;
1086 1090 void* dest;
1087 1091 const size_t decompressedSize = framePointers[frameIndex].destSize;
1088 1092 size_t destAvailable = destBuffer->destSize - destOffset;
1089 1093
1090 1094 assert(decompressedSize > 0); /* For now. */
1091 1095
1092 1096 /*
1093 1097 * Not enough space in the current buffer. Finish the current buffer, then
1094 1098 * allocate and switch to a new one.
1095 1099 */
1096 1100 if (decompressedSize > destAvailable) {
1097 1101 /*
1098 1102 * Shrinking the destination buffer is optional. But it should be cheap,
1099 1103 * so we just do it.
1100 1104 */
1101 1105 if (destAvailable) {
1102 1106 tmpBuf = realloc(destBuffer->dest, destOffset);
1103 1107 if (NULL == tmpBuf) {
1104 1108 state->error = WorkerError_memory;
1105 1109 return;
1106 1110 }
1107 1111
1108 1112 destBuffer->dest = tmpBuf;
1109 1113 destBuffer->destSize = destOffset;
1110 1114 }
1111 1115
1112 1116 /* Truncate segments buffer. */
1113 1117 tmpBuf = realloc(destBuffer->segments,
1114 1118 (frameIndex - currentBufferStartIndex) * sizeof(BufferSegment));
1115 1119 if (NULL == tmpBuf) {
1116 1120 state->error = WorkerError_memory;
1117 1121 return;
1118 1122 }
1119 1123
1120 1124 destBuffer->segments = tmpBuf;
1121 1125 destBuffer->segmentsSize = frameIndex - currentBufferStartIndex;
1122 1126
1123 1127 /* Grow space for new DestBuffer. */
1124 1128 tmpBuf = realloc(state->destBuffers, (state->destCount + 1) * sizeof(DestBuffer));
1125 1129 if (NULL == tmpBuf) {
1126 1130 state->error = WorkerError_memory;
1127 1131 return;
1128 1132 }
1129 1133
1130 1134 state->destBuffers = tmpBuf;
1131 1135 state->destCount++;
1132 1136
1133 1137 destBuffer = &state->destBuffers[state->destCount - 1];
1134 1138
1135 1139 /* Don't take any chances with non-NULL pointers. */
1136 1140 memset(destBuffer, 0, sizeof(DestBuffer));
1137 1141
1138 1142 allocationSize = roundpow2((size_t)state->totalSourceSize);
1139 1143
1140 1144 if (decompressedSize > allocationSize) {
1141 1145 allocationSize = roundpow2(decompressedSize);
1142 1146 }
1143 1147
1144 1148 destBuffer->dest = malloc(allocationSize);
1145 1149 if (NULL == destBuffer->dest) {
1146 1150 state->error = WorkerError_memory;
1147 1151 return;
1148 1152 }
1149 1153
1150 1154 destBuffer->destSize = allocationSize;
1151 1155 destAvailable = allocationSize;
1152 1156 destOffset = 0;
1153 1157 localOffset = 0;
1154 1158
1155 1159 destBuffer->segments = calloc(remainingItems, sizeof(BufferSegment));
1156 1160 if (NULL == destBuffer->segments) {
1157 1161 state->error = WorkerError_memory;
1158 1162 return;
1159 1163 }
1160 1164
1161 1165 destBuffer->segmentsSize = remainingItems;
1162 1166 currentBufferStartIndex = frameIndex;
1163 1167 }
1164 1168
1165 1169 dest = (char*)destBuffer->dest + destOffset;
1166 1170
1167 1171 outBuffer.dst = dest;
1168 1172 outBuffer.size = decompressedSize;
1169 1173 outBuffer.pos = 0;
1170 1174
1171 1175 inBuffer.src = source;
1172 1176 inBuffer.size = sourceSize;
1173 1177 inBuffer.pos = 0;
1174 1178
1175 1179 zresult = ZSTD_decompress_generic(state->dctx, &outBuffer, &inBuffer);
1176 1180 if (ZSTD_isError(zresult)) {
1177 1181 state->error = WorkerError_zstd;
1178 1182 state->zresult = zresult;
1179 1183 state->errorOffset = frameIndex;
1180 1184 return;
1181 1185 }
1182 1186 else if (zresult || outBuffer.pos != decompressedSize) {
1183 1187 state->error = WorkerError_sizeMismatch;
1184 1188 state->zresult = outBuffer.pos;
1185 1189 state->errorOffset = frameIndex;
1186 1190 return;
1187 1191 }
1188 1192
1189 1193 destBuffer->segments[localOffset].offset = destOffset;
1190 1194 destBuffer->segments[localOffset].length = outBuffer.pos;
1191 1195 destOffset += outBuffer.pos;
1192 1196 localOffset++;
1193 1197 remainingItems--;
1194 1198 }
1195 1199
1196 1200 if (destBuffer->destSize > destOffset) {
1197 1201 tmpBuf = realloc(destBuffer->dest, destOffset);
1198 1202 if (NULL == tmpBuf) {
1199 1203 state->error = WorkerError_memory;
1200 1204 return;
1201 1205 }
1202 1206
1203 1207 destBuffer->dest = tmpBuf;
1204 1208 destBuffer->destSize = destOffset;
1205 1209 }
1206 1210 }
1207 1211
1208 1212 ZstdBufferWithSegmentsCollection* decompress_from_framesources(ZstdDecompressor* decompressor, FrameSources* frames,
1209 1213 Py_ssize_t threadCount) {
1210 1214 Py_ssize_t i = 0;
1211 1215 int errored = 0;
1212 1216 Py_ssize_t segmentsCount;
1213 1217 ZstdBufferWithSegments* bws = NULL;
1214 1218 PyObject* resultArg = NULL;
1215 1219 Py_ssize_t resultIndex;
1216 1220 ZstdBufferWithSegmentsCollection* result = NULL;
1217 1221 FramePointer* framePointers = frames->frames;
1218 1222 unsigned long long workerBytes = 0;
1219 1223 Py_ssize_t currentThread = 0;
1220 1224 Py_ssize_t workerStartOffset = 0;
1221 1225 POOL_ctx* pool = NULL;
1222 1226 WorkerState* workerStates = NULL;
1223 1227 unsigned long long bytesPerWorker;
1224 1228
1225 1229 /* Caller should normalize 0 and negative values to 1 or larger. */
1226 1230 assert(threadCount >= 1);
1227 1231
1228 1232 /* More threads than inputs makes no sense under any conditions. */
1229 1233 threadCount = frames->framesSize < threadCount ? frames->framesSize
1230 1234 : threadCount;
1231 1235
1232 1236 /* TODO lower thread count if input size is too small and threads would just
1233 1237 add overhead. */
1234 1238
1235 1239 if (decompressor->dict) {
1236 1240 if (ensure_ddict(decompressor->dict)) {
1237 1241 return NULL;
1238 1242 }
1239 1243 }
1240 1244
1241 1245 /* If threadCount==1, we don't start a thread pool. But we do leverage the
1242 1246 same API for dispatching work. */
1243 1247 workerStates = PyMem_Malloc(threadCount * sizeof(WorkerState));
1244 1248 if (NULL == workerStates) {
1245 1249 PyErr_NoMemory();
1246 1250 goto finally;
1247 1251 }
1248 1252
1249 1253 memset(workerStates, 0, threadCount * sizeof(WorkerState));
1250 1254
1251 1255 if (threadCount > 1) {
1252 1256 pool = POOL_create(threadCount, 1);
1253 1257 if (NULL == pool) {
1254 1258 PyErr_SetString(ZstdError, "could not initialize zstd thread pool");
1255 1259 goto finally;
1256 1260 }
1257 1261 }
1258 1262
1259 1263 bytesPerWorker = frames->compressedSize / threadCount;
1260 1264
1261 1265 if (bytesPerWorker > SIZE_MAX) {
1262 1266 PyErr_SetString(ZstdError, "too much data per worker for this platform");
1263 1267 goto finally;
1264 1268 }
1265 1269
1266 1270 for (i = 0; i < threadCount; i++) {
1267 1271 size_t zresult;
1268 1272
1269 1273 workerStates[i].dctx = ZSTD_createDCtx();
1270 1274 if (NULL == workerStates[i].dctx) {
1271 1275 PyErr_NoMemory();
1272 1276 goto finally;
1273 1277 }
1274 1278
1275 1279 ZSTD_copyDCtx(workerStates[i].dctx, decompressor->dctx);
1276 1280
1277 1281 if (decompressor->dict) {
1278 1282 zresult = ZSTD_DCtx_refDDict(workerStates[i].dctx, decompressor->dict->ddict);
1279 1283 if (zresult) {
1280 1284 PyErr_Format(ZstdError, "unable to reference prepared dictionary: %s",
1281 1285 ZSTD_getErrorName(zresult));
1282 1286 goto finally;
1283 1287 }
1284 1288 }
1285 1289
1286 1290 workerStates[i].framePointers = framePointers;
1287 1291 workerStates[i].requireOutputSizes = 1;
1288 1292 }
1289 1293
1290 1294 Py_BEGIN_ALLOW_THREADS
1291 1295 /* There are many ways to split work among workers.
1292 1296
1293 1297 For now, we take a simple approach of splitting work so each worker
1294 1298 gets roughly the same number of input bytes. This will result in more
1295 1299 starvation than running N>threadCount jobs. But it avoids complications
1296 1300 around state tracking, which could involve extra locking.
1297 1301 */
1298 1302 for (i = 0; i < frames->framesSize; i++) {
1299 1303 workerBytes += frames->frames[i].sourceSize;
1300 1304
1301 1305 /*
1302 1306 * The last worker/thread needs to handle all remaining work. Don't
1303 1307 * trigger it prematurely. Defer to the block outside of the loop.
1304 1308 * (But still process this loop so workerBytes is correct.)
1305 1309 */
1306 1310 if (currentThread == threadCount - 1) {
1307 1311 continue;
1308 1312 }
1309 1313
1310 1314 if (workerBytes >= bytesPerWorker) {
1311 1315 workerStates[currentThread].startOffset = workerStartOffset;
1312 1316 workerStates[currentThread].endOffset = i;
1313 1317 workerStates[currentThread].totalSourceSize = workerBytes;
1314 1318
1315 1319 if (threadCount > 1) {
1316 1320 POOL_add(pool, (POOL_function)decompress_worker, &workerStates[currentThread]);
1317 1321 }
1318 1322 else {
1319 1323 decompress_worker(&workerStates[currentThread]);
1320 1324 }
1321 1325 currentThread++;
1322 1326 workerStartOffset = i + 1;
1323 1327 workerBytes = 0;
1324 1328 }
1325 1329 }
1326 1330
1327 1331 if (workerBytes) {
1328 1332 workerStates[currentThread].startOffset = workerStartOffset;
1329 1333 workerStates[currentThread].endOffset = frames->framesSize - 1;
1330 1334 workerStates[currentThread].totalSourceSize = workerBytes;
1331 1335
1332 1336 if (threadCount > 1) {
1333 1337 POOL_add(pool, (POOL_function)decompress_worker, &workerStates[currentThread]);
1334 1338 }
1335 1339 else {
1336 1340 decompress_worker(&workerStates[currentThread]);
1337 1341 }
1338 1342 }
1339 1343
1340 1344 if (threadCount > 1) {
1341 1345 POOL_free(pool);
1342 1346 pool = NULL;
1343 1347 }
1344 1348 Py_END_ALLOW_THREADS
1345 1349
1346 1350 for (i = 0; i < threadCount; i++) {
1347 1351 switch (workerStates[i].error) {
1348 1352 case WorkerError_none:
1349 1353 break;
1350 1354
1351 1355 case WorkerError_zstd:
1352 1356 PyErr_Format(ZstdError, "error decompressing item %zd: %s",
1353 1357 workerStates[i].errorOffset, ZSTD_getErrorName(workerStates[i].zresult));
1354 1358 errored = 1;
1355 1359 break;
1356 1360
1357 1361 case WorkerError_memory:
1358 1362 PyErr_NoMemory();
1359 1363 errored = 1;
1360 1364 break;
1361 1365
1362 1366 case WorkerError_sizeMismatch:
1363 1367 PyErr_Format(ZstdError, "error decompressing item %zd: decompressed %zu bytes; expected %zu",
1364 1368 workerStates[i].errorOffset, workerStates[i].zresult,
1365 1369 framePointers[workerStates[i].errorOffset].destSize);
1366 1370 errored = 1;
1367 1371 break;
1368 1372
1369 1373 case WorkerError_unknownSize:
1370 1374 PyErr_Format(PyExc_ValueError, "could not determine decompressed size of item %zd",
1371 1375 workerStates[i].errorOffset);
1372 1376 errored = 1;
1373 1377 break;
1374 1378
1375 1379 default:
1376 1380 PyErr_Format(ZstdError, "unhandled error type: %d; this is a bug",
1377 1381 workerStates[i].error);
1378 1382 errored = 1;
1379 1383 break;
1380 1384 }
1381 1385
1382 1386 if (errored) {
1383 1387 break;
1384 1388 }
1385 1389 }
1386 1390
1387 1391 if (errored) {
1388 1392 goto finally;
1389 1393 }
1390 1394
1391 1395 segmentsCount = 0;
1392 1396 for (i = 0; i < threadCount; i++) {
1393 1397 segmentsCount += workerStates[i].destCount;
1394 1398 }
1395 1399
1396 1400 resultArg = PyTuple_New(segmentsCount);
1397 1401 if (NULL == resultArg) {
1398 1402 goto finally;
1399 1403 }
1400 1404
1401 1405 resultIndex = 0;
1402 1406
1403 1407 for (i = 0; i < threadCount; i++) {
1404 1408 Py_ssize_t bufferIndex;
1405 1409 WorkerState* state = &workerStates[i];
1406 1410
1407 1411 for (bufferIndex = 0; bufferIndex < state->destCount; bufferIndex++) {
1408 1412 DestBuffer* destBuffer = &state->destBuffers[bufferIndex];
1409 1413
1410 1414 bws = BufferWithSegments_FromMemory(destBuffer->dest, destBuffer->destSize,
1411 1415 destBuffer->segments, destBuffer->segmentsSize);
1412 1416 if (NULL == bws) {
1413 1417 goto finally;
1414 1418 }
1415 1419
1416 1420 /*
1417 1421 * Memory for buffer and segments was allocated using malloc() in worker
1418 1422 * and the memory is transferred to the BufferWithSegments instance. So
1419 1423 * tell instance to use free() and NULL the reference in the state struct
1420 1424 * so it isn't freed below.
1421 1425 */
1422 1426 bws->useFree = 1;
1423 1427 destBuffer->dest = NULL;
1424 1428 destBuffer->segments = NULL;
1425 1429
1426 1430 PyTuple_SET_ITEM(resultArg, resultIndex++, (PyObject*)bws);
1427 1431 }
1428 1432 }
1429 1433
1430 1434 result = (ZstdBufferWithSegmentsCollection*)PyObject_CallObject(
1431 1435 (PyObject*)&ZstdBufferWithSegmentsCollectionType, resultArg);
1432 1436
1433 1437 finally:
1434 1438 Py_CLEAR(resultArg);
1435 1439
1436 1440 if (workerStates) {
1437 1441 for (i = 0; i < threadCount; i++) {
1438 1442 Py_ssize_t bufferIndex;
1439 1443 WorkerState* state = &workerStates[i];
1440 1444
1441 1445 if (state->dctx) {
1442 1446 ZSTD_freeDCtx(state->dctx);
1443 1447 }
1444 1448
1445 1449 for (bufferIndex = 0; bufferIndex < state->destCount; bufferIndex++) {
1446 1450 if (state->destBuffers) {
1447 1451 /*
1448 1452 * Will be NULL if memory was transferred to a BufferWithSegments.
1449 1453 * Otherwise it is left over after an error occurred.
1450 1454 */
1451 1455 free(state->destBuffers[bufferIndex].dest);
1452 1456 free(state->destBuffers[bufferIndex].segments);
1453 1457 }
1454 1458 }
1455 1459
1456 1460 free(state->destBuffers);
1457 1461 }
1458 1462
1459 1463 PyMem_Free(workerStates);
1460 1464 }
1461 1465
1462 1466 POOL_free(pool);
1463 1467
1464 1468 return result;
1465 1469 }
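The comment in decompress_from_framesources() above describes splitting work so that each worker receives roughly the same number of input bytes. The following Python sketch only illustrates that partitioning strategy; the function name and its use are hypothetical and not part of the library:

    def split_frames_by_bytes(frame_sizes, thread_count):
        # Mirror of the C loop: accumulate frames until a worker holds at
        # least total/thread_count bytes, then start the next worker. The
        # final worker takes whatever remains.
        per_worker = sum(frame_sizes) // thread_count
        assignments = []  # inclusive (start_index, end_index) ranges
        start = 0
        worker_bytes = 0
        for i, size in enumerate(frame_sizes):
            worker_bytes += size
            # The last worker must handle all remaining work.
            if len(assignments) == thread_count - 1:
                continue
            if worker_bytes >= per_worker:
                assignments.append((start, i))
                start = i + 1
                worker_bytes = 0
        if worker_bytes:
            assignments.append((start, len(frame_sizes) - 1))
        return assignments

    # Example: six frames split across two workers.
    assert split_frames_by_bytes([100, 50, 200, 25, 25, 100], 2) == [(0, 2), (3, 5)]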
1466 1470
1467 1471 PyDoc_STRVAR(Decompressor_multi_decompress_to_buffer__doc__,
1468 1472 "Decompress multiple frames to output buffers\n"
1469 1473 "\n"
1470 1474 "Receives a ``BufferWithSegments``, a ``BufferWithSegmentsCollection`` or a\n"
1471 1475 "list of bytes-like objects. Each item in the passed collection should be a\n"
1472 1476 "compressed zstd frame.\n"
1473 1477 "\n"
1474 1478 "Unless ``decompressed_sizes`` is specified, the content size *must* be\n"
1475 1479 "written into the zstd frame header. If ``decompressed_sizes`` is specified,\n"
1476 1480 "it is an object conforming to the buffer protocol that represents an array\n"
1477 1481 "of 64-bit unsigned integers in the machine's native format. Specifying\n"
1478 1482 "``decompressed_sizes`` avoids a pre-scan of each frame to determine its\n"
1479 1483 "output size.\n"
1480 1484 "\n"
1481 1485 "Returns a ``BufferWithSegmentsCollection`` containing the decompressed\n"
1482 1486 "data. All decompressed data is allocated in a single memory buffer. The\n"
1483 1487 "``BufferWithSegments`` instance tracks which objects are at which offsets\n"
1484 1488 "and their respective lengths.\n"
1485 1489 "\n"
1486 1490 "The ``threads`` argument controls how many threads to use for operations.\n"
1487 1491 "Negative values will use the same number of threads as logical CPUs on the\n"
1488 1492 "machine.\n"
1489 1493 );
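A usage sketch for the API described by the docstring above. This is illustrative only; it assumes the module is imported as ``zstandard`` and that the frames were produced with content sizes written into their headers (the default):

    import struct
    import zstandard as zstd

    cctx = zstd.ZstdCompressor()
    frames = [cctx.compress(b'foo' * 64), cctx.compress(b'bar' * 128)]

    dctx = zstd.ZstdDecompressor()

    # Content sizes are recorded in the frame headers, so the simplest call
    # just passes the list of frames.
    result = dctx.multi_decompress_to_buffer(frames)
    assert result[0].tobytes() == b'foo' * 64

    # Optionally pass decompressed_sizes as an array of native-format 64-bit
    # unsigned integers to skip scanning each frame header, and threads=-1 to
    # use one thread per logical CPU.
    sizes = struct.pack('=2Q', 3 * 64, 3 * 128)
    result = dctx.multi_decompress_to_buffer(frames, decompressed_sizes=sizes,
                                             threads=-1)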
1490 1494
1491 1495 static ZstdBufferWithSegmentsCollection* Decompressor_multi_decompress_to_buffer(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
1492 1496 static char* kwlist[] = {
1493 1497 "frames",
1494 1498 "decompressed_sizes",
1495 1499 "threads",
1496 1500 NULL
1497 1501 };
1498 1502
1499 1503 PyObject* frames;
1500 1504 Py_buffer frameSizes;
1501 1505 int threads = 0;
1502 1506 Py_ssize_t frameCount;
1503 1507 Py_buffer* frameBuffers = NULL;
1504 1508 FramePointer* framePointers = NULL;
1505 1509 unsigned long long* frameSizesP = NULL;
1506 1510 unsigned long long totalInputSize = 0;
1507 1511 FrameSources frameSources;
1508 1512 ZstdBufferWithSegmentsCollection* result = NULL;
1509 1513 Py_ssize_t i;
1510 1514
1511 1515 memset(&frameSizes, 0, sizeof(frameSizes));
1512 1516
1513 1517 #if PY_MAJOR_VERSION >= 3
1514 1518 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|y*i:multi_decompress_to_buffer",
1515 1519 #else
1516 1520 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s*i:multi_decompress_to_buffer",
1517 1521 #endif
1518 1522 kwlist, &frames, &frameSizes, &threads)) {
1519 1523 return NULL;
1520 1524 }
1521 1525
1522 1526 if (frameSizes.buf) {
1523 1527 if (!PyBuffer_IsContiguous(&frameSizes, 'C') || frameSizes.ndim > 1) {
1524 1528 PyErr_SetString(PyExc_ValueError, "decompressed_sizes buffer should be contiguous and have a single dimension");
1525 1529 goto finally;
1526 1530 }
1527 1531
1528 1532 frameSizesP = (unsigned long long*)frameSizes.buf;
1529 1533 }
1530 1534
1531 1535 if (threads < 0) {
1532 1536 threads = cpu_count();
1533 1537 }
1534 1538
1535 1539 if (threads < 2) {
1536 1540 threads = 1;
1537 1541 }
1538 1542
1539 1543 if (PyObject_TypeCheck(frames, &ZstdBufferWithSegmentsType)) {
1540 1544 ZstdBufferWithSegments* buffer = (ZstdBufferWithSegments*)frames;
1541 1545 frameCount = buffer->segmentCount;
1542 1546
1543 1547 if (frameSizes.buf && frameSizes.len != frameCount * (Py_ssize_t)sizeof(unsigned long long)) {
1544 1548 PyErr_Format(PyExc_ValueError, "decompressed_sizes size mismatch; expected %zd, got %zd",
1545 1549 frameCount * sizeof(unsigned long long), frameSizes.len);
1546 1550 goto finally;
1547 1551 }
1548 1552
1549 1553 framePointers = PyMem_Malloc(frameCount * sizeof(FramePointer));
1550 1554 if (!framePointers) {
1551 1555 PyErr_NoMemory();
1552 1556 goto finally;
1553 1557 }
1554 1558
1555 1559 for (i = 0; i < frameCount; i++) {
1556 1560 void* sourceData;
1557 1561 unsigned long long sourceSize;
1558 1562 unsigned long long decompressedSize = 0;
1559 1563
1560 1564 if (buffer->segments[i].offset + buffer->segments[i].length > buffer->dataSize) {
1561 1565 PyErr_Format(PyExc_ValueError, "item %zd has offset outside memory area", i);
1562 1566 goto finally;
1563 1567 }
1564 1568
1565 1569 sourceData = (char*)buffer->data + buffer->segments[i].offset;
1566 1570 sourceSize = buffer->segments[i].length;
1567 1571 totalInputSize += sourceSize;
1568 1572
1569 1573 if (frameSizesP) {
1570 1574 decompressedSize = frameSizesP[i];
1571 1575 }
1572 1576
1573 1577 if (sourceSize > SIZE_MAX) {
1574 1578 PyErr_Format(PyExc_ValueError,
1575 1579 "item %zd is too large for this platform", i);
1576 1580 goto finally;
1577 1581 }
1578 1582
1579 1583 if (decompressedSize > SIZE_MAX) {
1580 1584 PyErr_Format(PyExc_ValueError,
1581 1585 "decompressed size of item %zd is too large for this platform", i);
1582 1586 goto finally;
1583 1587 }
1584 1588
1585 1589 framePointers[i].sourceData = sourceData;
1586 1590 framePointers[i].sourceSize = (size_t)sourceSize;
1587 1591 framePointers[i].destSize = (size_t)decompressedSize;
1588 1592 }
1589 1593 }
1590 1594 else if (PyObject_TypeCheck(frames, &ZstdBufferWithSegmentsCollectionType)) {
1591 1595 Py_ssize_t offset = 0;
1592 1596 ZstdBufferWithSegments* buffer;
1593 1597 ZstdBufferWithSegmentsCollection* collection = (ZstdBufferWithSegmentsCollection*)frames;
1594 1598
1595 1599 frameCount = BufferWithSegmentsCollection_length(collection);
1596 1600
1597 1601 if (frameSizes.buf && frameSizes.len != frameCount) {
1598 1602 PyErr_Format(PyExc_ValueError,
1599 1603 "decompressed_sizes size mismatch; expected %zd; got %zd",
1600 1604 frameCount * sizeof(unsigned long long), frameSizes.len);
1601 1605 goto finally;
1602 1606 }
1603 1607
1604 1608 framePointers = PyMem_Malloc(frameCount * sizeof(FramePointer));
1605 1609 if (NULL == framePointers) {
1606 1610 PyErr_NoMemory();
1607 1611 goto finally;
1608 1612 }
1609 1613
1610 1614 /* Iterate the data structure directly because it is faster. */
1611 1615 for (i = 0; i < collection->bufferCount; i++) {
1612 1616 Py_ssize_t segmentIndex;
1613 1617 buffer = collection->buffers[i];
1614 1618
1615 1619 for (segmentIndex = 0; segmentIndex < buffer->segmentCount; segmentIndex++) {
1616 1620 unsigned long long decompressedSize = frameSizesP ? frameSizesP[offset] : 0;
1617 1621
1618 1622 if (buffer->segments[segmentIndex].offset + buffer->segments[segmentIndex].length > buffer->dataSize) {
1619 1623 PyErr_Format(PyExc_ValueError, "item %zd has offset outside memory area",
1620 1624 offset);
1621 1625 goto finally;
1622 1626 }
1623 1627
1624 1628 if (buffer->segments[segmentIndex].length > SIZE_MAX) {
1625 1629 PyErr_Format(PyExc_ValueError,
1626 1630 "item %zd in buffer %zd is too large for this platform",
1627 1631 segmentIndex, i);
1628 1632 goto finally;
1629 1633 }
1630 1634
1631 1635 if (decompressedSize > SIZE_MAX) {
1632 1636 PyErr_Format(PyExc_ValueError,
1633 1637 "decompressed size of item %zd in buffer %zd is too large for this platform",
1634 1638 segmentIndex, i);
1635 1639 goto finally;
1636 1640 }
1637 1641
1638 1642 totalInputSize += buffer->segments[segmentIndex].length;
1639 1643
1640 1644 framePointers[offset].sourceData = (char*)buffer->data + buffer->segments[segmentIndex].offset;
1641 1645 framePointers[offset].sourceSize = (size_t)buffer->segments[segmentIndex].length;
1642 1646 framePointers[offset].destSize = (size_t)decompressedSize;
1643 1647
1644 1648 offset++;
1645 1649 }
1646 1650 }
1647 1651 }
1648 1652 else if (PyList_Check(frames)) {
1649 1653 frameCount = PyList_GET_SIZE(frames);
1650 1654
1651 1655 if (frameSizes.buf && frameSizes.len != frameCount * (Py_ssize_t)sizeof(unsigned long long)) {
1652 1656 PyErr_Format(PyExc_ValueError, "decompressed_sizes size mismatch; expected %zd, got %zd",
1653 1657 frameCount * sizeof(unsigned long long), frameSizes.len);
1654 1658 goto finally;
1655 1659 }
1656 1660
1657 1661 framePointers = PyMem_Malloc(frameCount * sizeof(FramePointer));
1658 1662 if (!framePointers) {
1659 1663 PyErr_NoMemory();
1660 1664 goto finally;
1661 1665 }
1662 1666
1663 1667 frameBuffers = PyMem_Malloc(frameCount * sizeof(Py_buffer));
1664 1668 if (NULL == frameBuffers) {
1665 1669 PyErr_NoMemory();
1666 1670 goto finally;
1667 1671 }
1668 1672
1669 1673 memset(frameBuffers, 0, frameCount * sizeof(Py_buffer));
1670 1674
1671 1675 /* Do a pass to assemble info about our input buffers and output sizes. */
1672 1676 for (i = 0; i < frameCount; i++) {
1673 1677 unsigned long long decompressedSize = frameSizesP ? frameSizesP[i] : 0;
1674 1678
1675 1679 if (0 != PyObject_GetBuffer(PyList_GET_ITEM(frames, i),
1676 1680 &frameBuffers[i], PyBUF_CONTIG_RO)) {
1677 1681 PyErr_Clear();
1678 1682 PyErr_Format(PyExc_TypeError, "item %zd not a bytes like object", i);
1679 1683 goto finally;
1680 1684 }
1681 1685
1682 1686 if (decompressedSize > SIZE_MAX) {
1683 1687 PyErr_Format(PyExc_ValueError,
1684 1688 "decompressed size of item %zd is too large for this platform", i);
1685 1689 goto finally;
1686 1690 }
1687 1691
1688 1692 totalInputSize += frameBuffers[i].len;
1689 1693
1690 1694 framePointers[i].sourceData = frameBuffers[i].buf;
1691 1695 framePointers[i].sourceSize = frameBuffers[i].len;
1692 1696 framePointers[i].destSize = (size_t)decompressedSize;
1693 1697 }
1694 1698 }
1695 1699 else {
1696 1700 PyErr_SetString(PyExc_TypeError, "argument must be list or BufferWithSegments");
1697 1701 goto finally;
1698 1702 }
1699 1703
1700 1704 /* We now have an array with info about our inputs and outputs. Feed it into
1701 1705 our generic decompression function. */
1702 1706 frameSources.frames = framePointers;
1703 1707 frameSources.framesSize = frameCount;
1704 1708 frameSources.compressedSize = totalInputSize;
1705 1709
1706 1710 result = decompress_from_framesources(self, &frameSources, threads);
1707 1711
1708 1712 finally:
1709 1713 if (frameSizes.buf) {
1710 1714 PyBuffer_Release(&frameSizes);
1711 1715 }
1712 1716 PyMem_Free(framePointers);
1713 1717
1714 1718 if (frameBuffers) {
1715 1719 for (i = 0; i < frameCount; i++) {
1716 1720 PyBuffer_Release(&frameBuffers[i]);
1717 1721 }
1718 1722
1719 1723 PyMem_Free(frameBuffers);
1720 1724 }
1721 1725
1722 1726 return result;
1723 1727 }
1724 1728
1725 1729 static PyMethodDef Decompressor_methods[] = {
1726 1730 { "copy_stream", (PyCFunction)Decompressor_copy_stream, METH_VARARGS | METH_KEYWORDS,
1727 1731 Decompressor_copy_stream__doc__ },
1728 1732 { "decompress", (PyCFunction)Decompressor_decompress, METH_VARARGS | METH_KEYWORDS,
1729 1733 Decompressor_decompress__doc__ },
1730 1734 { "decompressobj", (PyCFunction)Decompressor_decompressobj, METH_VARARGS | METH_KEYWORDS,
1731 1735 Decompressor_decompressobj__doc__ },
1732 1736 { "read_to_iter", (PyCFunction)Decompressor_read_to_iter, METH_VARARGS | METH_KEYWORDS,
1733 1737 Decompressor_read_to_iter__doc__ },
1734 1738 /* TODO Remove deprecated API */
1735 1739 { "read_from", (PyCFunction)Decompressor_read_to_iter, METH_VARARGS | METH_KEYWORDS,
1736 1740 Decompressor_read_to_iter__doc__ },
1737 1741 { "stream_reader", (PyCFunction)Decompressor_stream_reader,
1738 1742 METH_VARARGS | METH_KEYWORDS, Decompressor_stream_reader__doc__ },
1739 1743 { "stream_writer", (PyCFunction)Decompressor_stream_writer, METH_VARARGS | METH_KEYWORDS,
1740 1744 Decompressor_stream_writer__doc__ },
1741 1745 /* TODO remove deprecated API */
1742 1746 { "write_to", (PyCFunction)Decompressor_stream_writer, METH_VARARGS | METH_KEYWORDS,
1743 1747 Decompressor_stream_writer__doc__ },
1744 1748 { "decompress_content_dict_chain", (PyCFunction)Decompressor_decompress_content_dict_chain,
1745 1749 METH_VARARGS | METH_KEYWORDS, Decompressor_decompress_content_dict_chain__doc__ },
1746 1750 { "multi_decompress_to_buffer", (PyCFunction)Decompressor_multi_decompress_to_buffer,
1747 1751 METH_VARARGS | METH_KEYWORDS, Decompressor_multi_decompress_to_buffer__doc__ },
1748 1752 { "memory_size", (PyCFunction)Decompressor_memory_size, METH_NOARGS,
1749 1753 Decompressor_memory_size__doc__ },
1750 1754 { NULL, NULL }
1751 1755 };
1752 1756
1753 1757 PyTypeObject ZstdDecompressorType = {
1754 1758 PyVarObject_HEAD_INIT(NULL, 0)
1755 1759 "zstd.ZstdDecompressor", /* tp_name */
1756 1760 sizeof(ZstdDecompressor), /* tp_basicsize */
1757 1761 0, /* tp_itemsize */
1758 1762 (destructor)Decompressor_dealloc, /* tp_dealloc */
1759 1763 0, /* tp_print */
1760 1764 0, /* tp_getattr */
1761 1765 0, /* tp_setattr */
1762 1766 0, /* tp_compare */
1763 1767 0, /* tp_repr */
1764 1768 0, /* tp_as_number */
1765 1769 0, /* tp_as_sequence */
1766 1770 0, /* tp_as_mapping */
1767 1771 0, /* tp_hash */
1768 1772 0, /* tp_call */
1769 1773 0, /* tp_str */
1770 1774 0, /* tp_getattro */
1771 1775 0, /* tp_setattro */
1772 1776 0, /* tp_as_buffer */
1773 1777 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
1774 1778 Decompressor__doc__, /* tp_doc */
1775 1779 0, /* tp_traverse */
1776 1780 0, /* tp_clear */
1777 1781 0, /* tp_richcompare */
1778 1782 0, /* tp_weaklistoffset */
1779 1783 0, /* tp_iter */
1780 1784 0, /* tp_iternext */
1781 1785 Decompressor_methods, /* tp_methods */
1782 1786 0, /* tp_members */
1783 1787 0, /* tp_getset */
1784 1788 0, /* tp_base */
1785 1789 0, /* tp_dict */
1786 1790 0, /* tp_descr_get */
1787 1791 0, /* tp_descr_set */
1788 1792 0, /* tp_dictoffset */
1789 1793 (initproc)Decompressor_init, /* tp_init */
1790 1794 0, /* tp_alloc */
1791 1795 PyType_GenericNew, /* tp_new */
1792 1796 };
1793 1797
1794 1798 void decompressor_module_init(PyObject* mod) {
1795 1799 Py_TYPE(&ZstdDecompressorType) = &PyType_Type;
1796 1800 if (PyType_Ready(&ZstdDecompressorType) < 0) {
1797 1801 return;
1798 1802 }
1799 1803
1800 1804 Py_INCREF((PyObject*)&ZstdDecompressorType);
1801 1805 PyModule_AddObject(mod, "ZstdDecompressor",
1802 1806 (PyObject*)&ZstdDecompressorType);
1803 1807 }
@@ -1,346 +1,373 b''
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #define PY_SSIZE_T_CLEAN
10 10 #include <Python.h>
11 11 #include "structmember.h"
12 12
13 13 #define ZSTD_STATIC_LINKING_ONLY
14 14 #define ZDICT_STATIC_LINKING_ONLY
15 15 #include <zstd.h>
16 16 #include <zdict.h>
17 17
18 #define PYTHON_ZSTANDARD_VERSION "0.9.0"
18 /* Remember to change the string in zstandard/__init__ as well */
19 #define PYTHON_ZSTANDARD_VERSION "0.10.1"
19 20
20 21 typedef enum {
21 22 compressorobj_flush_finish,
22 23 compressorobj_flush_block,
23 24 } CompressorObj_Flush;
24 25
25 26 /*
26 27 Represents a ZstdCompressionParameters type.
27 28
28 29 This type holds all the low-level compression parameters that can be set.
29 30 */
30 31 typedef struct {
31 32 PyObject_HEAD
32 33 ZSTD_CCtx_params* params;
33 34 unsigned format;
34 35 int compressionLevel;
35 36 unsigned windowLog;
36 37 unsigned hashLog;
37 38 unsigned chainLog;
38 39 unsigned searchLog;
39 40 unsigned minMatch;
40 41 unsigned targetLength;
41 42 unsigned compressionStrategy;
42 43 unsigned contentSizeFlag;
43 44 unsigned checksumFlag;
44 45 unsigned dictIDFlag;
45 46 unsigned threads;
46 47 unsigned jobSize;
47 48 unsigned overlapSizeLog;
48 unsigned compressLiterals;
49 49 unsigned forceMaxWindow;
50 50 unsigned enableLongDistanceMatching;
51 51 unsigned ldmHashLog;
52 52 unsigned ldmMinMatch;
53 53 unsigned ldmBucketSizeLog;
54 54 unsigned ldmHashEveryLog;
55 55 } ZstdCompressionParametersObject;
56 56
57 57 extern PyTypeObject ZstdCompressionParametersType;
58 58
59 59 /*
60 60 Represents a FrameParameters type.
61 61
62 62 This type is basically a wrapper around ZSTD_frameParams.
63 63 */
64 64 typedef struct {
65 65 PyObject_HEAD
66 66 unsigned long long frameContentSize;
67 67 unsigned long long windowSize;
68 68 unsigned dictID;
69 69 char checksumFlag;
70 70 } FrameParametersObject;
71 71
72 72 extern PyTypeObject FrameParametersType;
73 73
74 74 /*
75 75 Represents a ZstdCompressionDict type.
76 76
77 77 Instances hold data used for a zstd compression dictionary.
78 78 */
79 79 typedef struct {
80 80 PyObject_HEAD
81 81
82 82 /* Pointer to dictionary data. Owned by self. */
83 83 void* dictData;
84 84 /* Size of dictionary data. */
85 85 size_t dictSize;
86 86 ZSTD_dictContentType_e dictType;
87 87 /* k parameter for cover dictionaries. Only populated by train_cover_dict(). */
88 88 unsigned k;
89 89 /* d parameter for cover dictionaries. Only populated by train_cover_dict(). */
90 90 unsigned d;
91 91 /* Digested dictionary, suitable for reuse. */
92 92 ZSTD_CDict* cdict;
93 93 ZSTD_DDict* ddict;
94 94 } ZstdCompressionDict;
95 95
96 96 extern PyTypeObject ZstdCompressionDictType;
97 97
98 98 /*
99 99 Represents a ZstdCompressor type.
100 100 */
101 101 typedef struct {
102 102 PyObject_HEAD
103 103
104 104 /* Number of threads to use for operations. */
105 105 unsigned int threads;
106 106 /* Pointer to compression dictionary to use. NULL if not using dictionary
107 107 compression. */
108 108 ZstdCompressionDict* dict;
109 109 /* Compression context to use. Populated during object construction. */
110 110 ZSTD_CCtx* cctx;
111 111 /* Compression parameters in use. */
112 112 ZSTD_CCtx_params* params;
113 113 } ZstdCompressor;
114 114
115 115 extern PyTypeObject ZstdCompressorType;
116 116
117 117 typedef struct {
118 118 PyObject_HEAD
119 119
120 120 ZstdCompressor* compressor;
121 121 ZSTD_outBuffer output;
122 122 int finished;
123 123 } ZstdCompressionObj;
124 124
125 125 extern PyTypeObject ZstdCompressionObjType;
126 126
127 127 typedef struct {
128 128 PyObject_HEAD
129 129
130 130 ZstdCompressor* compressor;
131 131 PyObject* writer;
132 132 unsigned long long sourceSize;
133 133 size_t outSize;
134 134 int entered;
135 135 unsigned long long bytesCompressed;
136 136 } ZstdCompressionWriter;
137 137
138 138 extern PyTypeObject ZstdCompressionWriterType;
139 139
140 140 typedef struct {
141 141 PyObject_HEAD
142 142
143 143 ZstdCompressor* compressor;
144 144 PyObject* reader;
145 145 Py_buffer buffer;
146 146 Py_ssize_t bufferOffset;
147 147 size_t inSize;
148 148 size_t outSize;
149 149
150 150 ZSTD_inBuffer input;
151 151 ZSTD_outBuffer output;
152 152 int finishedOutput;
153 153 int finishedInput;
154 154 PyObject* readResult;
155 155 } ZstdCompressorIterator;
156 156
157 157 extern PyTypeObject ZstdCompressorIteratorType;
158 158
159 159 typedef struct {
160 160 PyObject_HEAD
161 161
162 162 ZstdCompressor* compressor;
163 163 PyObject* reader;
164 164 Py_buffer buffer;
165 unsigned long long sourceSize;
166 165 size_t readSize;
167 166
168 167 int entered;
169 168 int closed;
170 169 unsigned long long bytesCompressed;
171 170
172 171 ZSTD_inBuffer input;
173 172 ZSTD_outBuffer output;
174 173 int finishedInput;
175 174 int finishedOutput;
176 175 PyObject* readResult;
177 176 } ZstdCompressionReader;
178 177
179 178 extern PyTypeObject ZstdCompressionReaderType;
180 179
181 180 typedef struct {
182 181 PyObject_HEAD
183 182
183 ZstdCompressor* compressor;
184 ZSTD_inBuffer input;
185 ZSTD_outBuffer output;
186 Py_buffer inBuffer;
187 int finished;
188 size_t chunkSize;
189 } ZstdCompressionChunker;
190
191 extern PyTypeObject ZstdCompressionChunkerType;
192
193 typedef enum {
194 compressionchunker_mode_normal,
195 compressionchunker_mode_flush,
196 compressionchunker_mode_finish,
197 } CompressionChunkerMode;
198
199 typedef struct {
200 PyObject_HEAD
201
202 ZstdCompressionChunker* chunker;
203 CompressionChunkerMode mode;
204 } ZstdCompressionChunkerIterator;
205
206 extern PyTypeObject ZstdCompressionChunkerIteratorType;
207
208 typedef struct {
209 PyObject_HEAD
210
184 211 ZSTD_DCtx* dctx;
185 212 ZstdCompressionDict* dict;
186 213 size_t maxWindowSize;
187 214 ZSTD_format_e format;
188 215 } ZstdDecompressor;
189 216
190 217 extern PyTypeObject ZstdDecompressorType;
191 218
192 219 typedef struct {
193 220 PyObject_HEAD
194 221
195 222 ZstdDecompressor* decompressor;
196 223 size_t outSize;
197 224 int finished;
198 225 } ZstdDecompressionObj;
199 226
200 227 extern PyTypeObject ZstdDecompressionObjType;
201 228
202 229 typedef struct {
203 230 PyObject_HEAD
204 231
205 232 /* Parent decompressor to which this object is associated. */
206 233 ZstdDecompressor* decompressor;
207 234 /* Object to read() from (if reading from a stream). */
208 235 PyObject* reader;
209 236 /* Size for read() operations on reader. */
210 237 size_t readSize;
211 238 /* Buffer to read from (if reading from a buffer). */
212 239 Py_buffer buffer;
213 240
214 241 /* Whether the context manager is active. */
215 242 int entered;
216 243 /* Whether we've closed the stream. */
217 244 int closed;
218 245
219 246 /* Number of bytes decompressed and returned to user. */
220 247 unsigned long long bytesDecompressed;
221 248
222 249 /* Tracks data going into decompressor. */
223 250 ZSTD_inBuffer input;
224 251
225 252 /* Holds output from read() operation on reader. */
226 253 PyObject* readResult;
227 254
228 255 /* Whether all input has been sent to the decompressor. */
229 256 int finishedInput;
230 257 /* Whether all output has been flushed from the decompressor. */
231 258 int finishedOutput;
232 259 } ZstdDecompressionReader;
233 260
234 261 extern PyTypeObject ZstdDecompressionReaderType;
235 262
236 263 typedef struct {
237 264 PyObject_HEAD
238 265
239 266 ZstdDecompressor* decompressor;
240 267 PyObject* writer;
241 268 size_t outSize;
242 269 int entered;
243 270 } ZstdDecompressionWriter;
244 271
245 272 extern PyTypeObject ZstdDecompressionWriterType;
246 273
247 274 typedef struct {
248 275 PyObject_HEAD
249 276
250 277 ZstdDecompressor* decompressor;
251 278 PyObject* reader;
252 279 Py_buffer buffer;
253 280 Py_ssize_t bufferOffset;
254 281 size_t inSize;
255 282 size_t outSize;
256 283 size_t skipBytes;
257 284 ZSTD_inBuffer input;
258 285 ZSTD_outBuffer output;
259 286 Py_ssize_t readCount;
260 287 int finishedInput;
261 288 int finishedOutput;
262 289 } ZstdDecompressorIterator;
263 290
264 291 extern PyTypeObject ZstdDecompressorIteratorType;
265 292
266 293 typedef struct {
267 294 int errored;
268 295 PyObject* chunk;
269 296 } DecompressorIteratorResult;
270 297
271 298 typedef struct {
272 299 /* The public API is that these are 64-bit unsigned integers. So these can't
273 300 * be size_t, even though values larger than SIZE_MAX or PY_SSIZE_T_MAX may
274 301 * be nonsensical for this platform. */
275 302 unsigned long long offset;
276 303 unsigned long long length;
277 304 } BufferSegment;
278 305
279 306 typedef struct {
280 307 PyObject_HEAD
281 308
282 309 PyObject* parent;
283 310 BufferSegment* segments;
284 311 Py_ssize_t segmentCount;
285 312 } ZstdBufferSegments;
286 313
287 314 extern PyTypeObject ZstdBufferSegmentsType;
288 315
289 316 typedef struct {
290 317 PyObject_HEAD
291 318
292 319 PyObject* parent;
293 320 void* data;
294 321 Py_ssize_t dataSize;
295 322 unsigned long long offset;
296 323 } ZstdBufferSegment;
297 324
298 325 extern PyTypeObject ZstdBufferSegmentType;
299 326
300 327 typedef struct {
301 328 PyObject_HEAD
302 329
303 330 Py_buffer parent;
304 331 void* data;
305 332 unsigned long long dataSize;
306 333 BufferSegment* segments;
307 334 Py_ssize_t segmentCount;
308 335 int useFree;
309 336 } ZstdBufferWithSegments;
310 337
311 338 extern PyTypeObject ZstdBufferWithSegmentsType;
312 339
313 340 /**
314 341 * An ordered collection of BufferWithSegments exposed as a squashed collection.
315 342 *
316 343 * This type provides a virtual view spanning multiple BufferWithSegments
317 344 * instances. It allows multiple instances to be "chained" together and
318 345 * exposed as a single collection. e.g. if there are 2 buffers holding
319 346 * 10 segments each, then o[14] will access the 5th segment in the 2nd buffer.
320 347 */
321 348 typedef struct {
322 349 PyObject_HEAD
323 350
324 351 /* An array of buffers that should be exposed through this instance. */
325 352 ZstdBufferWithSegments** buffers;
326 353 /* Number of elements in buffers array. */
327 354 Py_ssize_t bufferCount;
328 355 /* Array of first offset in each buffer instance. 0th entry corresponds
329 356 to number of elements in the 0th buffer. 1st entry corresponds to the
330 357 sum of elements in 0th and 1st buffers. */
331 358 Py_ssize_t* firstElements;
332 359 } ZstdBufferWithSegmentsCollection;
333 360
334 361 extern PyTypeObject ZstdBufferWithSegmentsCollectionType;
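The comment above explains how a flat index into the collection resolves to a segment inside one of the chained buffers, using the cumulative counts stored in firstElements. A Python sketch of that lookup (the helper is hypothetical and only illustrates the arithmetic; the C code may implement it differently):

    import bisect

    def resolve_index(first_elements, index):
        # first_elements[i] holds the total number of segments in buffers
        # 0..i, e.g. two buffers of 10 segments each give [10, 20].
        if index < 0 or index >= first_elements[-1]:
            raise IndexError(index)
        buffer_index = bisect.bisect_right(first_elements, index)
        preceding = first_elements[buffer_index - 1] if buffer_index else 0
        return buffer_index, index - preceding

    # Two buffers of 10 segments each: flat index 14 is the 5th segment
    # (index 4) of the 2nd buffer (index 1), matching the o[14] example.
    assert resolve_index([10, 20], 14) == (1, 4)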
335 362
336 363 int set_parameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, unsigned value);
337 364 int set_parameters(ZSTD_CCtx_params* params, ZstdCompressionParametersObject* obj);
338 365 FrameParametersObject* get_frame_parameters(PyObject* self, PyObject* args, PyObject* kwargs);
339 366 int ensure_ddict(ZstdCompressionDict* dict);
340 367 int ensure_dctx(ZstdDecompressor* decompressor, int loadDict);
341 368 ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs);
342 369 ZstdBufferWithSegments* BufferWithSegments_FromMemory(void* data, unsigned long long dataSize, BufferSegment* segments, Py_ssize_t segmentsSize);
343 370 Py_ssize_t BufferWithSegmentsCollection_length(ZstdBufferWithSegmentsCollection*);
344 371 int cpu_count(void);
345 372 size_t roundpow2(size_t);
346 373 int safe_pybytes_resize(PyObject** obj, Py_ssize_t size);
@@ -1,196 +1,199 b''
1 1 # Copyright (c) 2016-present, Gregory Szorc
2 2 # All rights reserved.
3 3 #
4 4 # This software may be modified and distributed under the terms
5 5 # of the BSD license. See the LICENSE file for details.
6 6
7 7 from __future__ import absolute_import
8 8
9 9 import cffi
10 10 import distutils.ccompiler
11 11 import os
12 12 import re
13 13 import subprocess
14 14 import tempfile
15 15
16 16
17 17 HERE = os.path.abspath(os.path.dirname(__file__))
18 18
19 19 SOURCES = ['zstd/%s' % p for p in (
20 'common/debug.c',
20 21 'common/entropy_common.c',
21 22 'common/error_private.c',
22 23 'common/fse_decompress.c',
23 24 'common/pool.c',
24 25 'common/threading.c',
25 26 'common/xxhash.c',
26 27 'common/zstd_common.c',
27 28 'compress/fse_compress.c',
29 'compress/hist.c',
28 30 'compress/huf_compress.c',
29 31 'compress/zstd_compress.c',
30 32 'compress/zstd_double_fast.c',
31 33 'compress/zstd_fast.c',
32 34 'compress/zstd_lazy.c',
33 35 'compress/zstd_ldm.c',
34 36 'compress/zstd_opt.c',
35 37 'compress/zstdmt_compress.c',
36 38 'decompress/huf_decompress.c',
37 39 'decompress/zstd_decompress.c',
38 40 'dictBuilder/cover.c',
41 'dictBuilder/fastcover.c',
39 42 'dictBuilder/divsufsort.c',
40 43 'dictBuilder/zdict.c',
41 44 )]
42 45
43 46 # Headers whose preprocessed output will be fed into cdef().
44 47 HEADERS = [os.path.join(HERE, 'zstd', *p) for p in (
45 48 ('zstd.h',),
46 49 ('dictBuilder', 'zdict.h'),
47 50 )]
48 51
49 52 INCLUDE_DIRS = [os.path.join(HERE, d) for d in (
50 53 'zstd',
51 54 'zstd/common',
52 55 'zstd/compress',
53 56 'zstd/decompress',
54 57 'zstd/dictBuilder',
55 58 )]
56 59
57 60 # cffi can't parse some of the primitives in zstd.h. So we invoke the
58 61 # preprocessor and feed its output into cffi.
59 62 compiler = distutils.ccompiler.new_compiler()
60 63
61 64 # Needed for MSVC.
62 65 if hasattr(compiler, 'initialize'):
63 66 compiler.initialize()
64 67
65 68 # Distutils doesn't set compiler.preprocessor, so invoke the preprocessor
66 69 # manually.
67 70 if compiler.compiler_type == 'unix':
68 71 args = list(compiler.executables['compiler'])
69 72 args.extend([
70 73 '-E',
71 74 '-DZSTD_STATIC_LINKING_ONLY',
72 75 '-DZDICT_STATIC_LINKING_ONLY',
73 76 ])
74 77 elif compiler.compiler_type == 'msvc':
75 78 args = [compiler.cc]
76 79 args.extend([
77 80 '/EP',
78 81 '/DZSTD_STATIC_LINKING_ONLY',
79 82 '/DZDICT_STATIC_LINKING_ONLY',
80 83 ])
81 84 else:
82 85 raise Exception('unsupported compiler type: %s' % compiler.compiler_type)
83 86
84 87 def preprocess(path):
85 88 with open(path, 'rb') as fh:
86 89 lines = []
87 90 it = iter(fh)
88 91
89 92 for l in it:
90 93 # zstd.h includes <stddef.h>, which is also included by cffi's
91 94 # boilerplate. This can lead to duplicate declarations. So we strip
92 95 # this include from the preprocessor invocation.
93 96 #
94 97 # The same thing happens for including zstd.h, so give it the same
95 98 # treatment.
96 99 #
97 100 # We define ZSTD_STATIC_LINKING_ONLY, which is redundant with the inline
98 101 # #define in zstdmt_compress.h and results in a compiler warning. So drop
99 102 # the inline #define.
100 103 if l.startswith((b'#include <stddef.h>',
101 104 b'#include "zstd.h"',
102 105 b'#define ZSTD_STATIC_LINKING_ONLY')):
103 106 continue
104 107
105 108 # ZSTDLIB_API may not be defined if we dropped zstd.h. It isn't
106 109 # important so just filter it out.
107 110 if l.startswith(b'ZSTDLIB_API'):
108 111 l = l[len(b'ZSTDLIB_API '):]
109 112
110 113 lines.append(l)
111 114
112 115 fd, input_file = tempfile.mkstemp(suffix='.h')
113 116 os.write(fd, b''.join(lines))
114 117 os.close(fd)
115 118
116 119 try:
117 120 process = subprocess.Popen(args + [input_file], stdout=subprocess.PIPE)
118 121 output = process.communicate()[0]
119 122 ret = process.poll()
120 123 if ret:
121 124 raise Exception('preprocessor exited with error')
122 125
123 126 return output
124 127 finally:
125 128 os.unlink(input_file)
126 129
127 130
128 131 def normalize_output(output):
129 132 lines = []
130 133 for line in output.splitlines():
131 134 # CFFI's parser doesn't like __attribute__ on UNIX compilers.
132 135 if line.startswith(b'__attribute__ ((visibility ("default"))) '):
133 136 line = line[len(b'__attribute__ ((visibility ("default"))) '):]
134 137
135 138 if line.startswith(b'__attribute__((deprecated('):
136 139 continue
137 140 elif b'__declspec(deprecated(' in line:
138 141 continue
139 142
140 143 lines.append(line)
141 144
142 145 return b'\n'.join(lines)
143 146
144 147
145 148 ffi = cffi.FFI()
146 149 # zstd.h uses a possibly undefined MIN(). Define it until
147 150 # https://github.com/facebook/zstd/issues/976 is fixed.
148 151 # *_DISABLE_DEPRECATE_WARNINGS prevents the compiler from emitting a warning
149 152 # when cffi uses the function. Since we statically link against zstd, even
150 153 # if we use the deprecated functions it shouldn't be a huge problem.
151 154 ffi.set_source('_zstd_cffi', '''
152 155 #define MIN(a,b) ((a)<(b) ? (a) : (b))
153 156 #define ZSTD_STATIC_LINKING_ONLY
154 157 #include <zstd.h>
155 158 #define ZDICT_STATIC_LINKING_ONLY
156 159 #define ZDICT_DISABLE_DEPRECATE_WARNINGS
157 160 #include <zdict.h>
158 161 ''', sources=SOURCES,
159 162 include_dirs=INCLUDE_DIRS,
160 163 extra_compile_args=['-DZSTD_MULTITHREAD'])
161 164
162 165 DEFINE = re.compile(b'^\\#define ([a-zA-Z0-9_]+) ')
163 166
164 167 sources = []
165 168
166 169 # Feed normalized preprocessor output for headers into the cdef parser.
167 170 for header in HEADERS:
168 171 preprocessed = preprocess(header)
169 172 sources.append(normalize_output(preprocessed))
170 173
171 174 # #define's are effectively erased as part of going through the preprocessor.
172 175 # So perform a manual pass to re-add those to the cdef source.
173 176 with open(header, 'rb') as fh:
174 177 for line in fh:
175 178 line = line.strip()
176 179 m = DEFINE.match(line)
177 180 if not m:
178 181 continue
179 182
180 183 if m.group(1) == b'ZSTD_STATIC_LINKING_ONLY':
181 184 continue
182 185
183 186 # The parser doesn't like some constants with complex values.
184 187 if m.group(1) in (b'ZSTD_LIB_VERSION', b'ZSTD_VERSION_STRING'):
185 188 continue
186 189
187 190 # The ... is magic syntax by the cdef parser to resolve the
188 191 # value at compile time.
189 192 sources.append(m.group(0) + b' ...')
190 193
191 194 cdeflines = b'\n'.join(sources).splitlines()
192 195 cdeflines = [l for l in cdeflines if l.strip()]
193 196 ffi.cdef(b'\n'.join(cdeflines).decode('latin1'))
194 197
195 198 if __name__ == '__main__':
196 199 ffi.compile()
@@ -1,160 +1,188 b''
1 1 # Copyright (c) 2016-present, Gregory Szorc
2 2 # All rights reserved.
3 3 #
4 4 # This software may be modified and distributed under the terms
5 5 # of the BSD license. See the LICENSE file for details.
6 6
7 7 import distutils.ccompiler
8 8 import os
9 import sys
10 9
11 10 from distutils.extension import Extension
12 11
13 12
14 13 zstd_sources = ['zstd/%s' % p for p in (
14 'common/debug.c',
15 15 'common/entropy_common.c',
16 16 'common/error_private.c',
17 17 'common/fse_decompress.c',
18 18 'common/pool.c',
19 19 'common/threading.c',
20 20 'common/xxhash.c',
21 21 'common/zstd_common.c',
22 22 'compress/fse_compress.c',
23 'compress/hist.c',
23 24 'compress/huf_compress.c',
24 25 'compress/zstd_compress.c',
25 26 'compress/zstd_double_fast.c',
26 27 'compress/zstd_fast.c',
27 28 'compress/zstd_lazy.c',
28 29 'compress/zstd_ldm.c',
29 30 'compress/zstd_opt.c',
30 31 'compress/zstdmt_compress.c',
31 32 'decompress/huf_decompress.c',
32 33 'decompress/zstd_decompress.c',
33 34 'dictBuilder/cover.c',
34 35 'dictBuilder/divsufsort.c',
36 'dictBuilder/fastcover.c',
35 37 'dictBuilder/zdict.c',
36 38 )]
37 39
38 40 zstd_sources_legacy = ['zstd/%s' % p for p in (
39 41 'deprecated/zbuff_common.c',
40 42 'deprecated/zbuff_compress.c',
41 43 'deprecated/zbuff_decompress.c',
42 44 'legacy/zstd_v01.c',
43 45 'legacy/zstd_v02.c',
44 46 'legacy/zstd_v03.c',
45 47 'legacy/zstd_v04.c',
46 48 'legacy/zstd_v05.c',
47 49 'legacy/zstd_v06.c',
48 50 'legacy/zstd_v07.c'
49 51 )]
50 52
51 53 zstd_includes = [
52 54 'zstd',
53 55 'zstd/common',
54 56 'zstd/compress',
55 57 'zstd/decompress',
56 58 'zstd/dictBuilder',
57 59 ]
58 60
59 61 zstd_includes_legacy = [
60 62 'zstd/deprecated',
61 63 'zstd/legacy',
62 64 ]
63 65
64 66 ext_includes = [
65 67 'c-ext',
66 68 'zstd/common',
67 69 ]
68 70
69 71 ext_sources = [
70 72 'zstd/common/pool.c',
71 73 'zstd/common/threading.c',
72 74 'zstd.c',
73 75 'c-ext/bufferutil.c',
74 76 'c-ext/compressiondict.c',
75 77 'c-ext/compressobj.c',
76 78 'c-ext/compressor.c',
77 79 'c-ext/compressoriterator.c',
80 'c-ext/compressionchunker.c',
78 81 'c-ext/compressionparams.c',
79 82 'c-ext/compressionreader.c',
80 83 'c-ext/compressionwriter.c',
81 84 'c-ext/constants.c',
82 85 'c-ext/decompressobj.c',
83 86 'c-ext/decompressor.c',
84 87 'c-ext/decompressoriterator.c',
85 88 'c-ext/decompressionreader.c',
86 89 'c-ext/decompressionwriter.c',
87 90 'c-ext/frameparams.c',
88 91 ]
89 92
90 93 zstd_depends = [
91 94 'c-ext/python-zstandard.h',
92 95 ]
93 96
94 97
95 98 def get_c_extension(support_legacy=False, system_zstd=False, name='zstd',
96 warnings_as_errors=False):
97 """Obtain a distutils.extension.Extension for the C extension."""
98 root = os.path.abspath(os.path.dirname(__file__))
99 warnings_as_errors=False, root=None):
100 """Obtain a distutils.extension.Extension for the C extension.
101
102 ``support_legacy`` controls whether to compile in legacy zstd format support.
103
104 ``system_zstd`` controls whether to compile against the system zstd library.
105 For this to work, the system zstd library and headers must match what
106 python-zstandard is coded against exactly.
107
108 ``name`` is the module name of the C extension to produce.
109
110 ``warnings_as_errors`` controls whether compiler warnings are turned into
111 compiler errors.
99 112
100 sources = set([os.path.join(root, p) for p in ext_sources])
113 ``root`` defines a root path that source paths should be computed relative
114 to. This should be the directory containing the main ``setup.py`` that is
115 being invoked. If not defined, paths will be relative to this file.
116 """
117 actual_root = os.path.abspath(os.path.dirname(__file__))
118 root = root or actual_root
119
120 sources = set([os.path.join(actual_root, p) for p in ext_sources])
101 121 if not system_zstd:
102 sources.update([os.path.join(root, p) for p in zstd_sources])
122 sources.update([os.path.join(actual_root, p) for p in zstd_sources])
103 123 if support_legacy:
104 sources.update([os.path.join(root, p) for p in zstd_sources_legacy])
124 sources.update([os.path.join(actual_root, p)
125 for p in zstd_sources_legacy])
105 126 sources = list(sources)
106 127
107 include_dirs = set([os.path.join(root, d) for d in ext_includes])
128 include_dirs = set([os.path.join(actual_root, d) for d in ext_includes])
108 129 if not system_zstd:
109 include_dirs.update([os.path.join(root, d) for d in zstd_includes])
130 include_dirs.update([os.path.join(actual_root, d)
131 for d in zstd_includes])
110 132 if support_legacy:
111 include_dirs.update([os.path.join(root, d) for d in zstd_includes_legacy])
133 include_dirs.update([os.path.join(actual_root, d)
134 for d in zstd_includes_legacy])
112 135 include_dirs = list(include_dirs)
113 136
114 depends = [os.path.join(root, p) for p in zstd_depends]
137 depends = [os.path.join(actual_root, p) for p in zstd_depends]
115 138
116 139 compiler = distutils.ccompiler.new_compiler()
117 140
118 141 # Needed for MSVC.
119 142 if hasattr(compiler, 'initialize'):
120 143 compiler.initialize()
121 144
122 145 if compiler.compiler_type == 'unix':
123 146 compiler_type = 'unix'
124 147 elif compiler.compiler_type == 'msvc':
125 148 compiler_type = 'msvc'
126 149 elif compiler.compiler_type == 'mingw32':
127 150 compiler_type = 'mingw32'
128 151 else:
129 152 raise Exception('unhandled compiler type: %s' %
130 153 compiler.compiler_type)
131 154
132 155 extra_args = ['-DZSTD_MULTITHREAD']
133 156
134 157 if not system_zstd:
135 158 extra_args.append('-DZSTDLIB_VISIBILITY=')
136 159 extra_args.append('-DZDICTLIB_VISIBILITY=')
137 160 extra_args.append('-DZSTDERRORLIB_VISIBILITY=')
138 161
139 162 if compiler_type == 'unix':
140 163 extra_args.append('-fvisibility=hidden')
141 164
142 165 if not system_zstd and support_legacy:
143 166 extra_args.append('-DZSTD_LEGACY_SUPPORT=1')
144 167
145 168 if warnings_as_errors:
146 169 if compiler_type in ('unix', 'mingw32'):
147 170 extra_args.append('-Werror')
148 171 elif compiler_type == 'msvc':
149 172 extra_args.append('/WX')
150 173 else:
151 174 assert False
152 175
153 176 libraries = ['zstd'] if system_zstd else []
154 177
178 # Python 3.7 doesn't like absolute paths. So normalize to relative.
179 sources = [os.path.relpath(p, root) for p in sources]
180 include_dirs = [os.path.relpath(p, root) for p in include_dirs]
181 depends = [os.path.relpath(p, root) for p in depends]
182
155 183 # TODO compile with optimizations.
156 184 return Extension(name, sources,
157 185 include_dirs=include_dirs,
158 186 depends=depends,
159 187 extra_compile_args=extra_args,
160 188 libraries=libraries)
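A minimal sketch of how an embedding ``setup.py`` might consume get_c_extension(), assuming this module is importable as ``setup_zstd`` from the invoking directory; the package metadata shown is illustrative:

    import os
    from distutils.core import setup

    import setup_zstd

    ROOT = os.path.abspath(os.path.dirname(__file__))

    setup(
        name='example-zstd-bindings',
        version='0.0',
        ext_modules=[setup_zstd.get_c_extension(support_legacy=True,
                                                name='zstd',
                                                root=ROOT)],
    )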
@@ -1,1266 +1,1463 b''
1 1 import hashlib
2 2 import io
3 3 import struct
4 4 import sys
5 5 import tarfile
6 6 import unittest
7 7
8 8 import zstandard as zstd
9 9
10 10 from .common import (
11 11 make_cffi,
12 12 OpCountingBytesIO,
13 13 )
14 14
15 15
16 16 if sys.version_info[0] >= 3:
17 17 next = lambda it: it.__next__()
18 18 else:
19 19 next = lambda it: it.next()
20 20
21 21
22 22 def multithreaded_chunk_size(level, source_size=0):
23 23 params = zstd.ZstdCompressionParameters.from_level(level,
24 24 source_size=source_size)
25 25
26 26 return 1 << (params.window_log + 2)
27 27
28 28
29 29 @make_cffi
30 30 class TestCompressor(unittest.TestCase):
31 31 def test_level_bounds(self):
32 32 with self.assertRaises(ValueError):
33 33 zstd.ZstdCompressor(level=23)
34 34
35 35 def test_memory_size(self):
36 36 cctx = zstd.ZstdCompressor(level=1)
37 37 self.assertGreater(cctx.memory_size(), 100)
38 38
39 39
40 40 @make_cffi
41 41 class TestCompressor_compress(unittest.TestCase):
42 42 def test_compress_empty(self):
43 43 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
44 44 result = cctx.compress(b'')
45 45 self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
46 46 params = zstd.get_frame_parameters(result)
47 47 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
48 48 self.assertEqual(params.window_size, 524288)
49 49 self.assertEqual(params.dict_id, 0)
50 50 self.assertFalse(params.has_checksum, 0)
51 51
52 52 cctx = zstd.ZstdCompressor()
53 53 result = cctx.compress(b'')
54 54 self.assertEqual(result, b'\x28\xb5\x2f\xfd\x20\x00\x01\x00\x00')
55 55 params = zstd.get_frame_parameters(result)
56 56 self.assertEqual(params.content_size, 0)
57 57
58 58 def test_input_types(self):
59 59 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
60 60 expected = b'\x28\xb5\x2f\xfd\x00\x00\x19\x00\x00\x66\x6f\x6f'
61 61
62 62 mutable_array = bytearray(3)
63 63 mutable_array[:] = b'foo'
64 64
65 65 sources = [
66 66 memoryview(b'foo'),
67 67 bytearray(b'foo'),
68 68 mutable_array,
69 69 ]
70 70
71 71 for source in sources:
72 72 self.assertEqual(cctx.compress(source), expected)
73 73
74 74 def test_compress_large(self):
75 75 chunks = []
76 76 for i in range(255):
77 77 chunks.append(struct.Struct('>B').pack(i) * 16384)
78 78
79 79 cctx = zstd.ZstdCompressor(level=3, write_content_size=False)
80 80 result = cctx.compress(b''.join(chunks))
81 81 self.assertEqual(len(result), 999)
82 82 self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd')
83 83
84 84 # This matches the test for read_to_iter() below.
85 85 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
86 86 result = cctx.compress(b'f' * zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE + b'o')
87 87 self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x40\x54\x00\x00'
88 88 b'\x10\x66\x66\x01\x00\xfb\xff\x39\xc0'
89 89 b'\x02\x09\x00\x00\x6f')
90 90
91 91 def test_negative_level(self):
92 92 cctx = zstd.ZstdCompressor(level=-4)
93 93 result = cctx.compress(b'foo' * 256)
94 94
95 95 def test_no_magic(self):
96 96 params = zstd.ZstdCompressionParameters.from_level(
97 97 1, format=zstd.FORMAT_ZSTD1)
98 98 cctx = zstd.ZstdCompressor(compression_params=params)
99 99 magic = cctx.compress(b'foobar')
100 100
101 101 params = zstd.ZstdCompressionParameters.from_level(
102 102 1, format=zstd.FORMAT_ZSTD1_MAGICLESS)
103 103 cctx = zstd.ZstdCompressor(compression_params=params)
104 104 no_magic = cctx.compress(b'foobar')
105 105
106 106 self.assertEqual(magic[0:4], b'\x28\xb5\x2f\xfd')
107 107 self.assertEqual(magic[4:], no_magic)
108 108
109 109 def test_write_checksum(self):
110 110 cctx = zstd.ZstdCompressor(level=1)
111 111 no_checksum = cctx.compress(b'foobar')
112 112 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
113 113 with_checksum = cctx.compress(b'foobar')
114 114
115 115 self.assertEqual(len(with_checksum), len(no_checksum) + 4)
116 116
117 117 no_params = zstd.get_frame_parameters(no_checksum)
118 118 with_params = zstd.get_frame_parameters(with_checksum)
119 119
120 120 self.assertFalse(no_params.has_checksum)
121 121 self.assertTrue(with_params.has_checksum)
122 122
123 123 def test_write_content_size(self):
124 124 cctx = zstd.ZstdCompressor(level=1)
125 125 with_size = cctx.compress(b'foobar' * 256)
126 126 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
127 127 no_size = cctx.compress(b'foobar' * 256)
128 128
129 129 self.assertEqual(len(with_size), len(no_size) + 1)
130 130
131 131 no_params = zstd.get_frame_parameters(no_size)
132 132 with_params = zstd.get_frame_parameters(with_size)
133 133 self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
134 134 self.assertEqual(with_params.content_size, 1536)
135 135
136 136 def test_no_dict_id(self):
137 137 samples = []
138 138 for i in range(128):
139 139 samples.append(b'foo' * 64)
140 140 samples.append(b'bar' * 64)
141 141 samples.append(b'foobar' * 64)
142 142
143 143 d = zstd.train_dictionary(1024, samples)
144 144
145 145 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
146 146 with_dict_id = cctx.compress(b'foobarfoobar')
147 147
148 148 cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False)
149 149 no_dict_id = cctx.compress(b'foobarfoobar')
150 150
151 151 self.assertEqual(len(with_dict_id), len(no_dict_id) + 4)
152 152
153 153 no_params = zstd.get_frame_parameters(no_dict_id)
154 154 with_params = zstd.get_frame_parameters(with_dict_id)
155 155 self.assertEqual(no_params.dict_id, 0)
156 self.assertEqual(with_params.dict_id, 1387616518)
156 self.assertEqual(with_params.dict_id, 1880053135)
157 157
158 158 def test_compress_dict_multiple(self):
159 159 samples = []
160 160 for i in range(128):
161 161 samples.append(b'foo' * 64)
162 162 samples.append(b'bar' * 64)
163 163 samples.append(b'foobar' * 64)
164 164
165 165 d = zstd.train_dictionary(8192, samples)
166 166
167 167 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
168 168
169 169 for i in range(32):
170 170 cctx.compress(b'foo bar foobar foo bar foobar')
171 171
172 172 def test_dict_precompute(self):
173 173 samples = []
174 174 for i in range(128):
175 175 samples.append(b'foo' * 64)
176 176 samples.append(b'bar' * 64)
177 177 samples.append(b'foobar' * 64)
178 178
179 179 d = zstd.train_dictionary(8192, samples)
180 180 d.precompute_compress(level=1)
181 181
182 182 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
183 183
184 184 for i in range(32):
185 185 cctx.compress(b'foo bar foobar foo bar foobar')
186 186
187 187 def test_multithreaded(self):
188 188 chunk_size = multithreaded_chunk_size(1)
189 189 source = b''.join([b'x' * chunk_size, b'y' * chunk_size])
190 190
191 191 cctx = zstd.ZstdCompressor(level=1, threads=2)
192 192 compressed = cctx.compress(source)
193 193
194 194 params = zstd.get_frame_parameters(compressed)
195 195 self.assertEqual(params.content_size, chunk_size * 2)
196 196 self.assertEqual(params.dict_id, 0)
197 197 self.assertFalse(params.has_checksum)
198 198
199 199 dctx = zstd.ZstdDecompressor()
200 200 self.assertEqual(dctx.decompress(compressed), source)
201 201
202 202 def test_multithreaded_dict(self):
203 203 samples = []
204 204 for i in range(128):
205 205 samples.append(b'foo' * 64)
206 206 samples.append(b'bar' * 64)
207 207 samples.append(b'foobar' * 64)
208 208
209 209 d = zstd.train_dictionary(1024, samples)
210 210
211 211 cctx = zstd.ZstdCompressor(dict_data=d, threads=2)
212 212
213 213 result = cctx.compress(b'foo')
214 214 params = zstd.get_frame_parameters(result);
215 215 self.assertEqual(params.content_size, 3);
216 216 self.assertEqual(params.dict_id, d.dict_id())
217 217
218 218 self.assertEqual(result,
219 b'\x28\xb5\x2f\xfd\x23\x06\x59\xb5\x52\x03\x19\x00\x00'
219 b'\x28\xb5\x2f\xfd\x23\x8f\x55\x0f\x70\x03\x19\x00\x00'
220 220 b'\x66\x6f\x6f')
221 221
222 222 def test_multithreaded_compression_params(self):
223 223 params = zstd.ZstdCompressionParameters.from_level(0, threads=2)
224 224 cctx = zstd.ZstdCompressor(compression_params=params)
225 225
226 226 result = cctx.compress(b'foo')
227 227 params = zstd.get_frame_parameters(result);
228 228 self.assertEqual(params.content_size, 3);
229 229
230 230 self.assertEqual(result,
231 231 b'\x28\xb5\x2f\xfd\x20\x03\x19\x00\x00\x66\x6f\x6f')
232 232
233 233
234 234 @make_cffi
235 235 class TestCompressor_compressobj(unittest.TestCase):
236 236 def test_compressobj_empty(self):
237 237 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
238 238 cobj = cctx.compressobj()
239 239 self.assertEqual(cobj.compress(b''), b'')
240 240 self.assertEqual(cobj.flush(),
241 241 b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
242 242
243 243 def test_input_types(self):
244 244 expected = b'\x28\xb5\x2f\xfd\x00\x48\x19\x00\x00\x66\x6f\x6f'
245 245 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
246 246
247 247 mutable_array = bytearray(3)
248 248 mutable_array[:] = b'foo'
249 249
250 250 sources = [
251 251 memoryview(b'foo'),
252 252 bytearray(b'foo'),
253 253 mutable_array,
254 254 ]
255 255
256 256 for source in sources:
257 257 cobj = cctx.compressobj()
258 258 self.assertEqual(cobj.compress(source), b'')
259 259 self.assertEqual(cobj.flush(), expected)
260 260
261 261 def test_compressobj_large(self):
262 262 chunks = []
263 263 for i in range(255):
264 264 chunks.append(struct.Struct('>B').pack(i) * 16384)
265 265
266 266 cctx = zstd.ZstdCompressor(level=3)
267 267 cobj = cctx.compressobj()
268 268
269 269 result = cobj.compress(b''.join(chunks)) + cobj.flush()
270 270 self.assertEqual(len(result), 999)
271 271 self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd')
272 272
273 273 params = zstd.get_frame_parameters(result)
274 274 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
275 275 self.assertEqual(params.window_size, 1048576)
276 276 self.assertEqual(params.dict_id, 0)
277 277 self.assertFalse(params.has_checksum)
278 278
279 279 def test_write_checksum(self):
280 280 cctx = zstd.ZstdCompressor(level=1)
281 281 cobj = cctx.compressobj()
282 282 no_checksum = cobj.compress(b'foobar') + cobj.flush()
283 283 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
284 284 cobj = cctx.compressobj()
285 285 with_checksum = cobj.compress(b'foobar') + cobj.flush()
286 286
287 287 no_params = zstd.get_frame_parameters(no_checksum)
288 288 with_params = zstd.get_frame_parameters(with_checksum)
289 289 self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
290 290 self.assertEqual(with_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
291 291 self.assertEqual(no_params.dict_id, 0)
292 292 self.assertEqual(with_params.dict_id, 0)
293 293 self.assertFalse(no_params.has_checksum)
294 294 self.assertTrue(with_params.has_checksum)
295 295
296 296 self.assertEqual(len(with_checksum), len(no_checksum) + 4)
297 297
298 298 def test_write_content_size(self):
299 299 cctx = zstd.ZstdCompressor(level=1)
300 300 cobj = cctx.compressobj(size=len(b'foobar' * 256))
301 301 with_size = cobj.compress(b'foobar' * 256) + cobj.flush()
302 302 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
303 303 cobj = cctx.compressobj(size=len(b'foobar' * 256))
304 304 no_size = cobj.compress(b'foobar' * 256) + cobj.flush()
305 305
306 306 no_params = zstd.get_frame_parameters(no_size)
307 307 with_params = zstd.get_frame_parameters(with_size)
308 308 self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
309 309 self.assertEqual(with_params.content_size, 1536)
310 310 self.assertEqual(no_params.dict_id, 0)
311 311 self.assertEqual(with_params.dict_id, 0)
312 312 self.assertFalse(no_params.has_checksum)
313 313 self.assertFalse(with_params.has_checksum)
314 314
315 315 self.assertEqual(len(with_size), len(no_size) + 1)
316 316
317 317 def test_compress_after_finished(self):
318 318 cctx = zstd.ZstdCompressor()
319 319 cobj = cctx.compressobj()
320 320
321 321 cobj.compress(b'foo')
322 322 cobj.flush()
323 323
324 324 with self.assertRaisesRegexp(zstd.ZstdError, 'cannot call compress\(\) after compressor'):
325 325 cobj.compress(b'foo')
326 326
327 327 with self.assertRaisesRegexp(zstd.ZstdError, 'compressor object already finished'):
328 328 cobj.flush()
329 329
330 330 def test_flush_block_repeated(self):
331 331 cctx = zstd.ZstdCompressor(level=1)
332 332 cobj = cctx.compressobj()
333 333
334 334 self.assertEqual(cobj.compress(b'foo'), b'')
335 335 self.assertEqual(cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK),
336 336 b'\x28\xb5\x2f\xfd\x00\x48\x18\x00\x00foo')
337 337 self.assertEqual(cobj.compress(b'bar'), b'')
338 338 # 3 byte header plus content.
339 self.assertEqual(cobj.flush(), b'\x19\x00\x00bar')
339 self.assertEqual(cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK),
340 b'\x18\x00\x00bar')
341 self.assertEqual(cobj.flush(), b'\x01\x00\x00')
340 342
341 343 def test_flush_empty_block(self):
342 344 cctx = zstd.ZstdCompressor(write_checksum=True)
343 345 cobj = cctx.compressobj()
344 346
345 347 cobj.compress(b'foobar')
346 348 cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK)
347 349 # No-op if no block is active (this is internal to zstd).
348 350 self.assertEqual(cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK), b'')
349 351
350 352 trailing = cobj.flush()
351 353 # 3 bytes block header + 4 bytes frame checksum
352 354 self.assertEqual(len(trailing), 7)
353 355 header = trailing[0:3]
354 356 self.assertEqual(header, b'\x01\x00\x00')
355 357
356 358 def test_multithreaded(self):
357 359 source = io.BytesIO()
358 360 source.write(b'a' * 1048576)
359 361 source.write(b'b' * 1048576)
360 362 source.write(b'c' * 1048576)
361 363 source.seek(0)
362 364
363 365 cctx = zstd.ZstdCompressor(level=1, threads=2)
364 366 cobj = cctx.compressobj()
365 367
366 368 chunks = []
367 369 while True:
368 370 d = source.read(8192)
369 371 if not d:
370 372 break
371 373
372 374 chunks.append(cobj.compress(d))
373 375
374 376 chunks.append(cobj.flush())
375 377
376 378 compressed = b''.join(chunks)
377 379
378 380 self.assertEqual(len(compressed), 295)
379 381
380 382 def test_frame_progression(self):
381 383 cctx = zstd.ZstdCompressor()
382 384
383 385 self.assertEqual(cctx.frame_progression(), (0, 0, 0))
384 386
385 387 cobj = cctx.compressobj()
386 388
387 389 cobj.compress(b'foobar')
388 390 self.assertEqual(cctx.frame_progression(), (6, 0, 0))
389 391
390 392 cobj.flush()
391 393 self.assertEqual(cctx.frame_progression(), (6, 6, 15))
392 394
393 395 def test_bad_size(self):
394 396 cctx = zstd.ZstdCompressor()
395 397
396 398 cobj = cctx.compressobj(size=2)
397 399 with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'):
398 400 cobj.compress(b'foo')
399 401
400 402 # Try another operation on this instance.
401 403 with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'):
402 404 cobj.compress(b'aa')
403 405
404 406 # Try another operation on the compressor.
405 407 cctx.compressobj(size=4)
406 408 cctx.compress(b'foobar')
407 409
408 410
409 411 @make_cffi
410 412 class TestCompressor_copy_stream(unittest.TestCase):
411 413 def test_no_read(self):
412 414 source = object()
413 415 dest = io.BytesIO()
414 416
415 417 cctx = zstd.ZstdCompressor()
416 418 with self.assertRaises(ValueError):
417 419 cctx.copy_stream(source, dest)
418 420
419 421 def test_no_write(self):
420 422 source = io.BytesIO()
421 423 dest = object()
422 424
423 425 cctx = zstd.ZstdCompressor()
424 426 with self.assertRaises(ValueError):
425 427 cctx.copy_stream(source, dest)
426 428
427 429 def test_empty(self):
428 430 source = io.BytesIO()
429 431 dest = io.BytesIO()
430 432
431 433 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
432 434 r, w = cctx.copy_stream(source, dest)
433 435 self.assertEqual(int(r), 0)
434 436 self.assertEqual(w, 9)
435 437
436 438 self.assertEqual(dest.getvalue(),
437 439 b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
438 440
439 441 def test_large_data(self):
440 442 source = io.BytesIO()
441 443 for i in range(255):
442 444 source.write(struct.Struct('>B').pack(i) * 16384)
443 445 source.seek(0)
444 446
445 447 dest = io.BytesIO()
446 448 cctx = zstd.ZstdCompressor()
447 449 r, w = cctx.copy_stream(source, dest)
448 450
449 451 self.assertEqual(r, 255 * 16384)
450 452 self.assertEqual(w, 999)
451 453
452 454 params = zstd.get_frame_parameters(dest.getvalue())
453 455 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
454 456 self.assertEqual(params.window_size, 1048576)
455 457 self.assertEqual(params.dict_id, 0)
456 458 self.assertFalse(params.has_checksum)
457 459
458 460 def test_write_checksum(self):
459 461 source = io.BytesIO(b'foobar')
460 462 no_checksum = io.BytesIO()
461 463
462 464 cctx = zstd.ZstdCompressor(level=1)
463 465 cctx.copy_stream(source, no_checksum)
464 466
465 467 source.seek(0)
466 468 with_checksum = io.BytesIO()
467 469 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
468 470 cctx.copy_stream(source, with_checksum)
469 471
470 472 self.assertEqual(len(with_checksum.getvalue()),
471 473 len(no_checksum.getvalue()) + 4)
472 474
473 475 no_params = zstd.get_frame_parameters(no_checksum.getvalue())
474 476 with_params = zstd.get_frame_parameters(with_checksum.getvalue())
475 477 self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
476 478 self.assertEqual(with_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
477 479 self.assertEqual(no_params.dict_id, 0)
478 480 self.assertEqual(with_params.dict_id, 0)
479 481 self.assertFalse(no_params.has_checksum)
480 482 self.assertTrue(with_params.has_checksum)
481 483
482 484 def test_write_content_size(self):
483 485 source = io.BytesIO(b'foobar' * 256)
484 486 no_size = io.BytesIO()
485 487
486 488 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
487 489 cctx.copy_stream(source, no_size)
488 490
489 491 source.seek(0)
490 492 with_size = io.BytesIO()
491 493 cctx = zstd.ZstdCompressor(level=1)
492 494 cctx.copy_stream(source, with_size)
493 495
494 496 # Source content size is unknown, so no content size written.
495 497 self.assertEqual(len(with_size.getvalue()),
496 498 len(no_size.getvalue()))
497 499
498 500 source.seek(0)
499 501 with_size = io.BytesIO()
500 502 cctx.copy_stream(source, with_size, size=len(source.getvalue()))
501 503
502 504 # We specified source size, so content size header is present.
503 505 self.assertEqual(len(with_size.getvalue()),
504 506 len(no_size.getvalue()) + 1)
505 507
506 508 no_params = zstd.get_frame_parameters(no_size.getvalue())
507 509 with_params = zstd.get_frame_parameters(with_size.getvalue())
508 510 self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
509 511 self.assertEqual(with_params.content_size, 1536)
510 512 self.assertEqual(no_params.dict_id, 0)
511 513 self.assertEqual(with_params.dict_id, 0)
512 514 self.assertFalse(no_params.has_checksum)
513 515 self.assertFalse(with_params.has_checksum)
514 516
515 517 def test_read_write_size(self):
516 518 source = OpCountingBytesIO(b'foobarfoobar')
517 519 dest = OpCountingBytesIO()
518 520 cctx = zstd.ZstdCompressor()
519 521 r, w = cctx.copy_stream(source, dest, read_size=1, write_size=1)
520 522
521 523 self.assertEqual(r, len(source.getvalue()))
522 524 self.assertEqual(w, 21)
523 525 self.assertEqual(source._read_count, len(source.getvalue()) + 1)
524 526 self.assertEqual(dest._write_count, len(dest.getvalue()))
525 527
526 528 def test_multithreaded(self):
527 529 source = io.BytesIO()
528 530 source.write(b'a' * 1048576)
529 531 source.write(b'b' * 1048576)
530 532 source.write(b'c' * 1048576)
531 533 source.seek(0)
532 534
533 535 dest = io.BytesIO()
534 536 cctx = zstd.ZstdCompressor(threads=2, write_content_size=False)
535 537 r, w = cctx.copy_stream(source, dest)
536 538 self.assertEqual(r, 3145728)
537 539 self.assertEqual(w, 295)
538 540
539 541 params = zstd.get_frame_parameters(dest.getvalue())
540 542 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
541 543 self.assertEqual(params.dict_id, 0)
542 544 self.assertFalse(params.has_checksum)
543 545
544 546 # Writing content size and checksum works.
545 547 cctx = zstd.ZstdCompressor(threads=2, write_checksum=True)
546 548 dest = io.BytesIO()
547 549 source.seek(0)
548 550 cctx.copy_stream(source, dest, size=len(source.getvalue()))
549 551
550 552 params = zstd.get_frame_parameters(dest.getvalue())
551 553 self.assertEqual(params.content_size, 3145728)
552 554 self.assertEqual(params.dict_id, 0)
553 555 self.assertTrue(params.has_checksum)
554 556
555 557 def test_bad_size(self):
556 558 source = io.BytesIO()
557 559 source.write(b'a' * 32768)
558 560 source.write(b'b' * 32768)
559 561 source.seek(0)
560 562
561 563 dest = io.BytesIO()
562 564
563 565 cctx = zstd.ZstdCompressor()
564 566
565 567 with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'):
566 568 cctx.copy_stream(source, dest, size=42)
567 569
568 570 # Try another operation on this compressor.
569 571 source.seek(0)
570 572 dest = io.BytesIO()
571 573 cctx.copy_stream(source, dest)
572 574
573 575
574 576 @make_cffi
575 577 class TestCompressor_stream_reader(unittest.TestCase):
576 578 def test_context_manager(self):
577 579 cctx = zstd.ZstdCompressor()
578 580
579 reader = cctx.stream_reader(b'foo' * 60)
580 with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'):
581 reader.read(10)
582
583 581 with cctx.stream_reader(b'foo') as reader:
584 582 with self.assertRaisesRegexp(ValueError, 'cannot __enter__ multiple times'):
585 583 with reader as reader2:
586 584 pass
587 585
586 def test_no_context_manager(self):
587 cctx = zstd.ZstdCompressor()
588
589 reader = cctx.stream_reader(b'foo')
590 reader.read(4)
591 self.assertFalse(reader.closed)
592
593 reader.close()
594 self.assertTrue(reader.closed)
595 with self.assertRaisesRegexp(ValueError, 'stream is closed'):
596 reader.read(1)
597
588 598 def test_not_implemented(self):
589 599 cctx = zstd.ZstdCompressor()
590 600
591 601 with cctx.stream_reader(b'foo' * 60) as reader:
592 602 with self.assertRaises(io.UnsupportedOperation):
593 603 reader.readline()
594 604
595 605 with self.assertRaises(io.UnsupportedOperation):
596 606 reader.readlines()
597 607
598 608 # This could probably be implemented someday.
599 609 with self.assertRaises(NotImplementedError):
600 610 reader.readall()
601 611
602 612 with self.assertRaises(io.UnsupportedOperation):
603 613 iter(reader)
604 614
605 615 with self.assertRaises(io.UnsupportedOperation):
606 616 next(reader)
607 617
608 618 with self.assertRaises(OSError):
609 619 reader.writelines([])
610 620
611 621 with self.assertRaises(OSError):
612 622 reader.write(b'foo')
613 623
614 624 def test_constant_methods(self):
615 625 cctx = zstd.ZstdCompressor()
616 626
617 627 with cctx.stream_reader(b'boo') as reader:
618 628 self.assertTrue(reader.readable())
619 629 self.assertFalse(reader.writable())
620 630 self.assertFalse(reader.seekable())
621 631 self.assertFalse(reader.isatty())
632 self.assertFalse(reader.closed)
622 633 self.assertIsNone(reader.flush())
634 self.assertFalse(reader.closed)
635
636 self.assertTrue(reader.closed)
623 637
624 638 def test_read_closed(self):
625 639 cctx = zstd.ZstdCompressor()
626 640
627 641 with cctx.stream_reader(b'foo' * 60) as reader:
628 642 reader.close()
643 self.assertTrue(reader.closed)
629 644 with self.assertRaisesRegexp(ValueError, 'stream is closed'):
630 645 reader.read(10)
631 646
632 647 def test_read_bad_size(self):
633 648 cctx = zstd.ZstdCompressor()
634 649
635 650 with cctx.stream_reader(b'foo') as reader:
636 651 with self.assertRaisesRegexp(ValueError, 'cannot read negative or size 0 amounts'):
637 652 reader.read(-1)
638 653
639 654 with self.assertRaisesRegexp(ValueError, 'cannot read negative or size 0 amounts'):
640 655 reader.read(0)
641 656
642 657 def test_read_buffer(self):
643 658 cctx = zstd.ZstdCompressor()
644 659
645 660 source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60])
646 661 frame = cctx.compress(source)
647 662
648 663 with cctx.stream_reader(source) as reader:
649 664 self.assertEqual(reader.tell(), 0)
650 665
651 666 # We should get entire frame in one read.
652 667 result = reader.read(8192)
653 668 self.assertEqual(result, frame)
654 669 self.assertEqual(reader.tell(), len(result))
655 670 self.assertEqual(reader.read(), b'')
656 671 self.assertEqual(reader.tell(), len(result))
657 672
658 673 def test_read_buffer_small_chunks(self):
659 674 cctx = zstd.ZstdCompressor()
660 675
661 676 source = b'foo' * 60
662 677 chunks = []
663 678
664 679 with cctx.stream_reader(source) as reader:
665 680 self.assertEqual(reader.tell(), 0)
666 681
667 682 while True:
668 683 chunk = reader.read(1)
669 684 if not chunk:
670 685 break
671 686
672 687 chunks.append(chunk)
673 688 self.assertEqual(reader.tell(), sum(map(len, chunks)))
674 689
675 690 self.assertEqual(b''.join(chunks), cctx.compress(source))
676 691
677 692 def test_read_stream(self):
678 693 cctx = zstd.ZstdCompressor()
679 694
680 695 source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60])
681 696 frame = cctx.compress(source)
682 697
683 698 with cctx.stream_reader(io.BytesIO(source), size=len(source)) as reader:
684 699 self.assertEqual(reader.tell(), 0)
685 700
686 701 chunk = reader.read(8192)
687 702 self.assertEqual(chunk, frame)
688 703 self.assertEqual(reader.tell(), len(chunk))
689 704 self.assertEqual(reader.read(), b'')
690 705 self.assertEqual(reader.tell(), len(chunk))
691 706
692 707 def test_read_stream_small_chunks(self):
693 708 cctx = zstd.ZstdCompressor()
694 709
695 710 source = b'foo' * 60
696 711 chunks = []
697 712
698 713 with cctx.stream_reader(io.BytesIO(source), size=len(source)) as reader:
699 714 self.assertEqual(reader.tell(), 0)
700 715
701 716 while True:
702 717 chunk = reader.read(1)
703 718 if not chunk:
704 719 break
705 720
706 721 chunks.append(chunk)
707 722 self.assertEqual(reader.tell(), sum(map(len, chunks)))
708 723
709 724 self.assertEqual(b''.join(chunks), cctx.compress(source))
710 725
711 726 def test_read_after_exit(self):
712 727 cctx = zstd.ZstdCompressor()
713 728
714 729 with cctx.stream_reader(b'foo' * 60) as reader:
715 730 while reader.read(8192):
716 731 pass
717 732
718 with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'):
733 with self.assertRaisesRegexp(ValueError, 'stream is closed'):
719 734 reader.read(10)
720 735
721 736 def test_bad_size(self):
722 737 cctx = zstd.ZstdCompressor()
723 738
724 739 source = io.BytesIO(b'foobar')
725 740
726 741 with cctx.stream_reader(source, size=2) as reader:
727 742 with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'):
728 743 reader.read(10)
729 744
730 745 # Try another compression operation.
731 746 with cctx.stream_reader(source, size=42):
732 747 pass
733 748
734 749
735 750 @make_cffi
736 751 class TestCompressor_stream_writer(unittest.TestCase):
737 752 def test_empty(self):
738 753 buffer = io.BytesIO()
739 754 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
740 755 with cctx.stream_writer(buffer) as compressor:
741 756 compressor.write(b'')
742 757
743 758 result = buffer.getvalue()
744 759 self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
745 760
746 761 params = zstd.get_frame_parameters(result)
747 762 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
748 763 self.assertEqual(params.window_size, 524288)
749 764 self.assertEqual(params.dict_id, 0)
750 765 self.assertFalse(params.has_checksum)
751 766
752 767 def test_input_types(self):
753 768 expected = b'\x28\xb5\x2f\xfd\x00\x48\x19\x00\x00\x66\x6f\x6f'
754 769 cctx = zstd.ZstdCompressor(level=1)
755 770
756 771 mutable_array = bytearray(3)
757 772 mutable_array[:] = b'foo'
758 773
759 774 sources = [
760 775 memoryview(b'foo'),
761 776 bytearray(b'foo'),
762 777 mutable_array,
763 778 ]
764 779
765 780 for source in sources:
766 781 buffer = io.BytesIO()
767 782 with cctx.stream_writer(buffer) as compressor:
768 783 compressor.write(source)
769 784
770 785 self.assertEqual(buffer.getvalue(), expected)
771 786
772 787 def test_multiple_compress(self):
773 788 buffer = io.BytesIO()
774 789 cctx = zstd.ZstdCompressor(level=5)
775 790 with cctx.stream_writer(buffer) as compressor:
776 791 self.assertEqual(compressor.write(b'foo'), 0)
777 792 self.assertEqual(compressor.write(b'bar'), 0)
778 793 self.assertEqual(compressor.write(b'x' * 8192), 0)
779 794
780 795 result = buffer.getvalue()
781 796 self.assertEqual(result,
782 797 b'\x28\xb5\x2f\xfd\x00\x50\x75\x00\x00\x38\x66\x6f'
783 798 b'\x6f\x62\x61\x72\x78\x01\x00\xfc\xdf\x03\x23')
784 799
785 800 def test_dictionary(self):
786 801 samples = []
787 802 for i in range(128):
788 803 samples.append(b'foo' * 64)
789 804 samples.append(b'bar' * 64)
790 805 samples.append(b'foobar' * 64)
791 806
792 807 d = zstd.train_dictionary(8192, samples)
793 808
794 809 h = hashlib.sha1(d.as_bytes()).hexdigest()
795 self.assertEqual(h, '3040faa0ddc37d50e71a4dd28052cb8db5d9d027')
810 self.assertEqual(h, '2b3b6428da5bf2c9cc9d4bb58ba0bc5990dd0e79')
796 811
797 812 buffer = io.BytesIO()
798 813 cctx = zstd.ZstdCompressor(level=9, dict_data=d)
799 814 with cctx.stream_writer(buffer) as compressor:
800 815 self.assertEqual(compressor.write(b'foo'), 0)
801 816 self.assertEqual(compressor.write(b'bar'), 0)
802 817 self.assertEqual(compressor.write(b'foo' * 16384), 0)
803 818
804 819 compressed = buffer.getvalue()
805 820
806 821 params = zstd.get_frame_parameters(compressed)
807 822 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
808 823 self.assertEqual(params.window_size, 2097152)
809 824 self.assertEqual(params.dict_id, d.dict_id())
810 825 self.assertFalse(params.has_checksum)
811 self.assertEqual(compressed,
812 b'\x28\xb5\x2f\xfd\x03\x58\x06\x59\xb5\x52\x5d\x00'
813 b'\x00\x00\x02\xfc\x3d\x3f\xd9\xb0\x51\x03\x45\x89')
826
827 h = hashlib.sha1(compressed).hexdigest()
828 self.assertEqual(h, '23f88344263678478f5f82298e0a5d1833125786')
829
830 source = b'foo' + b'bar' + (b'foo' * 16384)
831
832 dctx = zstd.ZstdDecompressor(dict_data=d)
833
834 self.assertEqual(dctx.decompress(compressed, max_output_size=len(source)),
835 source)
814 836
815 837 def test_compression_params(self):
816 838 params = zstd.ZstdCompressionParameters(
817 839 window_log=20,
818 840 chain_log=6,
819 841 hash_log=12,
820 842 min_match=5,
821 843 search_log=4,
822 844 target_length=10,
823 845 compression_strategy=zstd.STRATEGY_FAST)
824 846
825 847 buffer = io.BytesIO()
826 848 cctx = zstd.ZstdCompressor(compression_params=params)
827 849 with cctx.stream_writer(buffer) as compressor:
828 850 self.assertEqual(compressor.write(b'foo'), 0)
829 851 self.assertEqual(compressor.write(b'bar'), 0)
830 852 self.assertEqual(compressor.write(b'foobar' * 16384), 0)
831 853
832 854 compressed = buffer.getvalue()
833 855
834 856 params = zstd.get_frame_parameters(compressed)
835 857 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
836 858 self.assertEqual(params.window_size, 1048576)
837 859 self.assertEqual(params.dict_id, 0)
838 860 self.assertFalse(params.has_checksum)
839 861
840 862 h = hashlib.sha1(compressed).hexdigest()
841 863 self.assertEqual(h, '2a8111d72eb5004cdcecbdac37da9f26720d30ef')
842 864
843 865 def test_write_checksum(self):
844 866 no_checksum = io.BytesIO()
845 867 cctx = zstd.ZstdCompressor(level=1)
846 868 with cctx.stream_writer(no_checksum) as compressor:
847 869 self.assertEqual(compressor.write(b'foobar'), 0)
848 870
849 871 with_checksum = io.BytesIO()
850 872 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
851 873 with cctx.stream_writer(with_checksum) as compressor:
852 874 self.assertEqual(compressor.write(b'foobar'), 0)
853 875
854 876 no_params = zstd.get_frame_parameters(no_checksum.getvalue())
855 877 with_params = zstd.get_frame_parameters(with_checksum.getvalue())
856 878 self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
857 879 self.assertEqual(with_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
858 880 self.assertEqual(no_params.dict_id, 0)
859 881 self.assertEqual(with_params.dict_id, 0)
860 882 self.assertFalse(no_params.has_checksum)
861 883 self.assertTrue(with_params.has_checksum)
862 884
863 885 self.assertEqual(len(with_checksum.getvalue()),
864 886 len(no_checksum.getvalue()) + 4)
865 887
866 888 def test_write_content_size(self):
867 889 no_size = io.BytesIO()
868 890 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
869 891 with cctx.stream_writer(no_size) as compressor:
870 892 self.assertEqual(compressor.write(b'foobar' * 256), 0)
871 893
872 894 with_size = io.BytesIO()
873 895 cctx = zstd.ZstdCompressor(level=1)
874 896 with cctx.stream_writer(with_size) as compressor:
875 897 self.assertEqual(compressor.write(b'foobar' * 256), 0)
876 898
877 899 # Source size is not known in streaming mode, so header not
878 900 # written.
879 901 self.assertEqual(len(with_size.getvalue()),
880 902 len(no_size.getvalue()))
881 903
882 904 # Declaring size will write the header.
883 905 with_size = io.BytesIO()
884 906 with cctx.stream_writer(with_size, size=len(b'foobar' * 256)) as compressor:
885 907 self.assertEqual(compressor.write(b'foobar' * 256), 0)
886 908
887 909 no_params = zstd.get_frame_parameters(no_size.getvalue())
888 910 with_params = zstd.get_frame_parameters(with_size.getvalue())
889 911 self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
890 912 self.assertEqual(with_params.content_size, 1536)
891 913 self.assertEqual(no_params.dict_id, 0)
892 914 self.assertEqual(with_params.dict_id, 0)
893 915 self.assertFalse(no_params.has_checksum)
894 916 self.assertFalse(with_params.has_checksum)
895 917
896 918 self.assertEqual(len(with_size.getvalue()),
897 919 len(no_size.getvalue()) + 1)
898 920
899 921 def test_no_dict_id(self):
900 922 samples = []
901 923 for i in range(128):
902 924 samples.append(b'foo' * 64)
903 925 samples.append(b'bar' * 64)
904 926 samples.append(b'foobar' * 64)
905 927
906 928 d = zstd.train_dictionary(1024, samples)
907 929
908 930 with_dict_id = io.BytesIO()
909 931 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
910 932 with cctx.stream_writer(with_dict_id) as compressor:
911 933 self.assertEqual(compressor.write(b'foobarfoobar'), 0)
912 934
913 935 self.assertEqual(with_dict_id.getvalue()[4:5], b'\x03')
914 936
915 937 cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False)
916 938 no_dict_id = io.BytesIO()
917 939 with cctx.stream_writer(no_dict_id) as compressor:
918 940 self.assertEqual(compressor.write(b'foobarfoobar'), 0)
919 941
920 942 self.assertEqual(no_dict_id.getvalue()[4:5], b'\x00')
921 943
922 944 no_params = zstd.get_frame_parameters(no_dict_id.getvalue())
923 945 with_params = zstd.get_frame_parameters(with_dict_id.getvalue())
924 946 self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
925 947 self.assertEqual(with_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
926 948 self.assertEqual(no_params.dict_id, 0)
927 949 self.assertEqual(with_params.dict_id, d.dict_id())
928 950 self.assertFalse(no_params.has_checksum)
929 951 self.assertFalse(with_params.has_checksum)
930 952
931 953 self.assertEqual(len(with_dict_id.getvalue()),
932 954 len(no_dict_id.getvalue()) + 4)
933 955
934 956 def test_memory_size(self):
935 957 cctx = zstd.ZstdCompressor(level=3)
936 958 buffer = io.BytesIO()
937 959 with cctx.stream_writer(buffer) as compressor:
938 960 compressor.write(b'foo')
939 961 size = compressor.memory_size()
940 962
941 963 self.assertGreater(size, 100000)
942 964
943 965 def test_write_size(self):
944 966 cctx = zstd.ZstdCompressor(level=3)
945 967 dest = OpCountingBytesIO()
946 968 with cctx.stream_writer(dest, write_size=1) as compressor:
947 969 self.assertEqual(compressor.write(b'foo'), 0)
948 970 self.assertEqual(compressor.write(b'bar'), 0)
949 971 self.assertEqual(compressor.write(b'foobar'), 0)
950 972
951 973 self.assertEqual(len(dest.getvalue()), dest._write_count)
952 974
953 975 def test_flush_repeated(self):
954 976 cctx = zstd.ZstdCompressor(level=3)
955 977 dest = OpCountingBytesIO()
956 978 with cctx.stream_writer(dest) as compressor:
957 979 self.assertEqual(compressor.write(b'foo'), 0)
958 980 self.assertEqual(dest._write_count, 0)
959 981 self.assertEqual(compressor.flush(), 12)
960 982 self.assertEqual(dest._write_count, 1)
961 983 self.assertEqual(compressor.write(b'bar'), 0)
962 984 self.assertEqual(dest._write_count, 1)
963 985 self.assertEqual(compressor.flush(), 6)
964 986 self.assertEqual(dest._write_count, 2)
965 987 self.assertEqual(compressor.write(b'baz'), 0)
966 988
967 989 self.assertEqual(dest._write_count, 3)
968 990
969 991 def test_flush_empty_block(self):
970 992 cctx = zstd.ZstdCompressor(level=3, write_checksum=True)
971 993 dest = OpCountingBytesIO()
972 994 with cctx.stream_writer(dest) as compressor:
973 995 self.assertEqual(compressor.write(b'foobar' * 8192), 0)
974 996 count = dest._write_count
975 997 offset = dest.tell()
976 998 self.assertEqual(compressor.flush(), 23)
977 999 self.assertGreater(dest._write_count, count)
978 1000 self.assertGreater(dest.tell(), offset)
979 1001 offset = dest.tell()
980 1002 # Ending the write here should cause an empty block to be written
981 1003 # to denote end of frame.
982 1004
983 1005 trailing = dest.getvalue()[offset:]
984 1006 # 3 bytes block header + 4 bytes frame checksum
985 1007 self.assertEqual(len(trailing), 7)
986 1008
987 1009 header = trailing[0:3]
988 1010 self.assertEqual(header, b'\x01\x00\x00')
989 1011
990 1012 def test_multithreaded(self):
991 1013 dest = io.BytesIO()
992 1014 cctx = zstd.ZstdCompressor(threads=2)
993 1015 with cctx.stream_writer(dest) as compressor:
994 1016 compressor.write(b'a' * 1048576)
995 1017 compressor.write(b'b' * 1048576)
996 1018 compressor.write(b'c' * 1048576)
997 1019
998 1020 self.assertEqual(len(dest.getvalue()), 295)
999 1021
1000 1022 def test_tell(self):
1001 1023 dest = io.BytesIO()
1002 1024 cctx = zstd.ZstdCompressor()
1003 1025 with cctx.stream_writer(dest) as compressor:
1004 1026 self.assertEqual(compressor.tell(), 0)
1005 1027
1006 1028 for i in range(256):
1007 1029 compressor.write(b'foo' * (i + 1))
1008 1030 self.assertEqual(compressor.tell(), dest.tell())
1009 1031
1010 1032 def test_bad_size(self):
1011 1033 cctx = zstd.ZstdCompressor()
1012 1034
1013 1035 dest = io.BytesIO()
1014 1036
1015 1037 with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'):
1016 1038 with cctx.stream_writer(dest, size=2) as compressor:
1017 1039 compressor.write(b'foo')
1018 1040
1019 1041 # Test another operation.
1020 1042 with cctx.stream_writer(dest, size=42):
1021 1043 pass
1022 1044
1023 1045 def test_tarfile_compat(self):
1024 1046 raise unittest.SkipTest('not yet fully working')
1025 1047
1026 1048 dest = io.BytesIO()
1027 1049 cctx = zstd.ZstdCompressor()
1028 1050 with cctx.stream_writer(dest) as compressor:
1029 1051 with tarfile.open('tf', mode='w', fileobj=compressor) as tf:
1030 1052 tf.add(__file__, 'test_compressor.py')
1031 1053
1032 1054 dest.seek(0)
1033 1055
1034 1056 dctx = zstd.ZstdDecompressor()
1035 1057 with dctx.stream_reader(dest) as reader:
1036 1058 with tarfile.open(mode='r:', fileobj=reader) as tf:
1037 1059 for member in tf:
1038 1060 self.assertEqual(member.name, 'test_compressor.py')
1039 1061
1040 1062 @make_cffi
1041 1063 class TestCompressor_read_to_iter(unittest.TestCase):
1042 1064 def test_type_validation(self):
1043 1065 cctx = zstd.ZstdCompressor()
1044 1066
1045 1067 # Object with read() works.
1046 1068 for chunk in cctx.read_to_iter(io.BytesIO()):
1047 1069 pass
1048 1070
1049 1071 # Buffer protocol works.
1050 1072 for chunk in cctx.read_to_iter(b'foobar'):
1051 1073 pass
1052 1074
1053 1075 with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'):
1054 1076 for chunk in cctx.read_to_iter(True):
1055 1077 pass
1056 1078
1057 1079 def test_read_empty(self):
1058 1080 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
1059 1081
1060 1082 source = io.BytesIO()
1061 1083 it = cctx.read_to_iter(source)
1062 1084 chunks = list(it)
1063 1085 self.assertEqual(len(chunks), 1)
1064 1086 compressed = b''.join(chunks)
1065 1087 self.assertEqual(compressed, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
1066 1088
1067 1089 # And again with the buffer protocol.
1068 1090 it = cctx.read_to_iter(b'')
1069 1091 chunks = list(it)
1070 1092 self.assertEqual(len(chunks), 1)
1071 1093 compressed2 = b''.join(chunks)
1072 1094 self.assertEqual(compressed2, compressed)
1073 1095
1074 1096 def test_read_large(self):
1075 1097 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
1076 1098
1077 1099 source = io.BytesIO()
1078 1100 source.write(b'f' * zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE)
1079 1101 source.write(b'o')
1080 1102 source.seek(0)
1081 1103
1082 1104 # Creating an iterator should not perform any compression until
1083 1105 # first read.
1084 1106 it = cctx.read_to_iter(source, size=len(source.getvalue()))
1085 1107 self.assertEqual(source.tell(), 0)
1086 1108
1087 1109 # We should have exactly 2 output chunks.
1088 1110 chunks = []
1089 1111 chunk = next(it)
1090 1112 self.assertIsNotNone(chunk)
1091 1113 self.assertEqual(source.tell(), zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE)
1092 1114 chunks.append(chunk)
1093 1115 chunk = next(it)
1094 1116 self.assertIsNotNone(chunk)
1095 1117 chunks.append(chunk)
1096 1118
1097 1119 self.assertEqual(source.tell(), len(source.getvalue()))
1098 1120
1099 1121 with self.assertRaises(StopIteration):
1100 1122 next(it)
1101 1123
1102 1124 # And again for good measure.
1103 1125 with self.assertRaises(StopIteration):
1104 1126 next(it)
1105 1127
1106 1128 # We should get the same output as the one-shot compression mechanism.
1107 1129 self.assertEqual(b''.join(chunks), cctx.compress(source.getvalue()))
1108 1130
1109 1131 params = zstd.get_frame_parameters(b''.join(chunks))
1110 1132 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
1111 1133 self.assertEqual(params.window_size, 262144)
1112 1134 self.assertEqual(params.dict_id, 0)
1113 1135 self.assertFalse(params.has_checksum)
1114 1136
1115 1137 # Now check the buffer protocol.
1116 1138 it = cctx.read_to_iter(source.getvalue())
1117 1139 chunks = list(it)
1118 1140 self.assertEqual(len(chunks), 2)
1119 1141
1120 1142 params = zstd.get_frame_parameters(b''.join(chunks))
1121 1143 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
1122 1144 #self.assertEqual(params.window_size, 262144)
1123 1145 self.assertEqual(params.dict_id, 0)
1124 1146 self.assertFalse(params.has_checksum)
1125 1147
1126 1148 self.assertEqual(b''.join(chunks), cctx.compress(source.getvalue()))
1127 1149
1128 1150 def test_read_write_size(self):
1129 1151 source = OpCountingBytesIO(b'foobarfoobar')
1130 1152 cctx = zstd.ZstdCompressor(level=3)
1131 1153 for chunk in cctx.read_to_iter(source, read_size=1, write_size=1):
1132 1154 self.assertEqual(len(chunk), 1)
1133 1155
1134 1156 self.assertEqual(source._read_count, len(source.getvalue()) + 1)
1135 1157
1136 1158 def test_multithreaded(self):
1137 1159 source = io.BytesIO()
1138 1160 source.write(b'a' * 1048576)
1139 1161 source.write(b'b' * 1048576)
1140 1162 source.write(b'c' * 1048576)
1141 1163 source.seek(0)
1142 1164
1143 1165 cctx = zstd.ZstdCompressor(threads=2)
1144 1166
1145 1167 compressed = b''.join(cctx.read_to_iter(source))
1146 1168 self.assertEqual(len(compressed), 295)
1147 1169
1148 1170 def test_bad_size(self):
1149 1171 cctx = zstd.ZstdCompressor()
1150 1172
1151 1173 source = io.BytesIO(b'a' * 42)
1152 1174
1153 1175 with self.assertRaisesRegexp(zstd.ZstdError, 'Src size is incorrect'):
1154 1176 b''.join(cctx.read_to_iter(source, size=2))
1155 1177
1156 1178 # Test another operation on errored compressor.
1157 1179 b''.join(cctx.read_to_iter(source))
1158 1180
1159 1181
1182 @make_cffi
1183 class TestCompressor_chunker(unittest.TestCase):
1184 def test_empty(self):
1185 cctx = zstd.ZstdCompressor(write_content_size=False)
1186 chunker = cctx.chunker()
1187
1188 it = chunker.compress(b'')
1189
1190 with self.assertRaises(StopIteration):
1191 next(it)
1192
1193 it = chunker.finish()
1194
1195 self.assertEqual(next(it), b'\x28\xb5\x2f\xfd\x00\x50\x01\x00\x00')
1196
1197 with self.assertRaises(StopIteration):
1198 next(it)
1199
1200 def test_simple_input(self):
1201 cctx = zstd.ZstdCompressor()
1202 chunker = cctx.chunker()
1203
1204 it = chunker.compress(b'foobar')
1205
1206 with self.assertRaises(StopIteration):
1207 next(it)
1208
1209 it = chunker.compress(b'baz' * 30)
1210
1211 with self.assertRaises(StopIteration):
1212 next(it)
1213
1214 it = chunker.finish()
1215
1216 self.assertEqual(next(it),
1217 b'\x28\xb5\x2f\xfd\x00\x50\x7d\x00\x00\x48\x66\x6f'
1218 b'\x6f\x62\x61\x72\x62\x61\x7a\x01\x00\xe4\xe4\x8e')
1219
1220 with self.assertRaises(StopIteration):
1221 next(it)
1222
1223 def test_input_size(self):
1224 cctx = zstd.ZstdCompressor()
1225 chunker = cctx.chunker(size=1024)
1226
1227 it = chunker.compress(b'x' * 1000)
1228
1229 with self.assertRaises(StopIteration):
1230 next(it)
1231
1232 it = chunker.compress(b'y' * 24)
1233
1234 with self.assertRaises(StopIteration):
1235 next(it)
1236
1237 chunks = list(chunker.finish())
1238
1239 self.assertEqual(chunks, [
1240 b'\x28\xb5\x2f\xfd\x60\x00\x03\x65\x00\x00\x18\x78\x78\x79\x02\x00'
1241 b'\xa0\x16\xe3\x2b\x80\x05'
1242 ])
1243
1244 dctx = zstd.ZstdDecompressor()
1245
1246 self.assertEqual(dctx.decompress(b''.join(chunks)),
1247 (b'x' * 1000) + (b'y' * 24))
1248
1249 def test_small_chunk_size(self):
1250 cctx = zstd.ZstdCompressor()
1251 chunker = cctx.chunker(chunk_size=1)
1252
1253 chunks = list(chunker.compress(b'foo' * 1024))
1254 self.assertEqual(chunks, [])
1255
1256 chunks = list(chunker.finish())
1257 self.assertTrue(all(len(chunk) == 1 for chunk in chunks))
1258
1259 self.assertEqual(
1260 b''.join(chunks),
1261 b'\x28\xb5\x2f\xfd\x00\x50\x55\x00\x00\x18\x66\x6f\x6f\x01\x00'
1262 b'\xfa\xd3\x77\x43')
1263
1264 dctx = zstd.ZstdDecompressor()
1265 self.assertEqual(dctx.decompress(b''.join(chunks),
1266 max_output_size=10000),
1267 b'foo' * 1024)
1268
1269 def test_input_types(self):
1270 cctx = zstd.ZstdCompressor()
1271
1272 mutable_array = bytearray(3)
1273 mutable_array[:] = b'foo'
1274
1275 sources = [
1276 memoryview(b'foo'),
1277 bytearray(b'foo'),
1278 mutable_array,
1279 ]
1280
1281 for source in sources:
1282 chunker = cctx.chunker()
1283
1284 self.assertEqual(list(chunker.compress(source)), [])
1285 self.assertEqual(list(chunker.finish()), [
1286 b'\x28\xb5\x2f\xfd\x00\x50\x19\x00\x00\x66\x6f\x6f'
1287 ])
1288
1289 def test_flush(self):
1290 cctx = zstd.ZstdCompressor()
1291 chunker = cctx.chunker()
1292
1293 self.assertEqual(list(chunker.compress(b'foo' * 1024)), [])
1294 self.assertEqual(list(chunker.compress(b'bar' * 1024)), [])
1295
1296 chunks1 = list(chunker.flush())
1297
1298 self.assertEqual(chunks1, [
1299 b'\x28\xb5\x2f\xfd\x00\x50\x8c\x00\x00\x30\x66\x6f\x6f\x62\x61\x72'
1300 b'\x02\x00\xfa\x03\xfe\xd0\x9f\xbe\x1b\x02'
1301 ])
1302
1303 self.assertEqual(list(chunker.flush()), [])
1304 self.assertEqual(list(chunker.flush()), [])
1305
1306 self.assertEqual(list(chunker.compress(b'baz' * 1024)), [])
1307
1308 chunks2 = list(chunker.flush())
1309 self.assertEqual(len(chunks2), 1)
1310
1311 chunks3 = list(chunker.finish())
1312 self.assertEqual(len(chunks3), 1)
1313
1314 dctx = zstd.ZstdDecompressor()
1315
1316 self.assertEqual(dctx.decompress(b''.join(chunks1 + chunks2 + chunks3),
1317 max_output_size=10000),
1318 (b'foo' * 1024) + (b'bar' * 1024) + (b'baz' * 1024))
1319
1320 def test_compress_after_finish(self):
1321 cctx = zstd.ZstdCompressor()
1322 chunker = cctx.chunker()
1323
1324 list(chunker.compress(b'foo'))
1325 list(chunker.finish())
1326
1327 with self.assertRaisesRegexp(
1328 zstd.ZstdError,
1329 'cannot call compress\(\) after compression finished'):
1330 list(chunker.compress(b'foo'))
1331
1332 def test_flush_after_finish(self):
1333 cctx = zstd.ZstdCompressor()
1334 chunker = cctx.chunker()
1335
1336 list(chunker.compress(b'foo'))
1337 list(chunker.finish())
1338
1339 with self.assertRaisesRegexp(
1340 zstd.ZstdError,
1341 'cannot call flush\(\) after compression finished'):
1342 list(chunker.flush())
1343
1344 def test_finish_after_finish(self):
1345 cctx = zstd.ZstdCompressor()
1346 chunker = cctx.chunker()
1347
1348 list(chunker.compress(b'foo'))
1349 list(chunker.finish())
1350
1351 with self.assertRaisesRegexp(
1352 zstd.ZstdError,
1353 'cannot call finish\(\) after compression finished'):
1354 list(chunker.finish())
1355
1356
1160 1357 class TestCompressor_multi_compress_to_buffer(unittest.TestCase):
1161 1358 def test_invalid_inputs(self):
1162 1359 cctx = zstd.ZstdCompressor()
1163 1360
1164 1361 with self.assertRaises(TypeError):
1165 1362 cctx.multi_compress_to_buffer(True)
1166 1363
1167 1364 with self.assertRaises(TypeError):
1168 1365 cctx.multi_compress_to_buffer((1, 2))
1169 1366
1170 1367 with self.assertRaisesRegexp(TypeError, 'item 0 not a bytes like object'):
1171 1368 cctx.multi_compress_to_buffer([u'foo'])
1172 1369
1173 1370 def test_empty_input(self):
1174 1371 cctx = zstd.ZstdCompressor()
1175 1372
1176 1373 with self.assertRaisesRegexp(ValueError, 'no source elements found'):
1177 1374 cctx.multi_compress_to_buffer([])
1178 1375
1179 1376 with self.assertRaisesRegexp(ValueError, 'source elements are empty'):
1180 1377 cctx.multi_compress_to_buffer([b'', b'', b''])
1181 1378
1182 1379 def test_list_input(self):
1183 1380 cctx = zstd.ZstdCompressor(write_checksum=True)
1184 1381
1185 1382 original = [b'foo' * 12, b'bar' * 6]
1186 1383 frames = [cctx.compress(c) for c in original]
1187 1384 b = cctx.multi_compress_to_buffer(original)
1188 1385
1189 1386 self.assertIsInstance(b, zstd.BufferWithSegmentsCollection)
1190 1387
1191 1388 self.assertEqual(len(b), 2)
1192 1389 self.assertEqual(b.size(), 44)
1193 1390
1194 1391 self.assertEqual(b[0].tobytes(), frames[0])
1195 1392 self.assertEqual(b[1].tobytes(), frames[1])
1196 1393
1197 1394 def test_buffer_with_segments_input(self):
1198 1395 cctx = zstd.ZstdCompressor(write_checksum=True)
1199 1396
1200 1397 original = [b'foo' * 4, b'bar' * 6]
1201 1398 frames = [cctx.compress(c) for c in original]
1202 1399
1203 1400 offsets = struct.pack('=QQQQ', 0, len(original[0]),
1204 1401 len(original[0]), len(original[1]))
1205 1402 segments = zstd.BufferWithSegments(b''.join(original), offsets)
1206 1403
1207 1404 result = cctx.multi_compress_to_buffer(segments)
1208 1405
1209 1406 self.assertEqual(len(result), 2)
1210 1407 self.assertEqual(result.size(), 47)
1211 1408
1212 1409 self.assertEqual(result[0].tobytes(), frames[0])
1213 1410 self.assertEqual(result[1].tobytes(), frames[1])
1214 1411
1215 1412 def test_buffer_with_segments_collection_input(self):
1216 1413 cctx = zstd.ZstdCompressor(write_checksum=True)
1217 1414
1218 1415 original = [
1219 1416 b'foo1',
1220 1417 b'foo2' * 2,
1221 1418 b'foo3' * 3,
1222 1419 b'foo4' * 4,
1223 1420 b'foo5' * 5,
1224 1421 ]
1225 1422
1226 1423 frames = [cctx.compress(c) for c in original]
1227 1424
1228 1425 b = b''.join([original[0], original[1]])
1229 1426 b1 = zstd.BufferWithSegments(b, struct.pack('=QQQQ',
1230 1427 0, len(original[0]),
1231 1428 len(original[0]), len(original[1])))
1232 1429 b = b''.join([original[2], original[3], original[4]])
1233 1430 b2 = zstd.BufferWithSegments(b, struct.pack('=QQQQQQ',
1234 1431 0, len(original[2]),
1235 1432 len(original[2]), len(original[3]),
1236 1433 len(original[2]) + len(original[3]), len(original[4])))
1237 1434
1238 1435 c = zstd.BufferWithSegmentsCollection(b1, b2)
1239 1436
1240 1437 result = cctx.multi_compress_to_buffer(c)
1241 1438
1242 1439 self.assertEqual(len(result), len(frames))
1243 1440
1244 1441 for i, frame in enumerate(frames):
1245 1442 self.assertEqual(result[i].tobytes(), frame)
1246 1443
1247 1444 def test_multiple_threads(self):
1248 1445 # threads argument will cause multi-threaded ZSTD APIs to be used, which will
1249 1446 # make output different.
1250 1447 refcctx = zstd.ZstdCompressor(write_checksum=True)
1251 1448 reference = [refcctx.compress(b'x' * 64), refcctx.compress(b'y' * 64)]
1252 1449
1253 1450 cctx = zstd.ZstdCompressor(write_checksum=True)
1254 1451
1255 1452 frames = []
1256 1453 frames.extend(b'x' * 64 for i in range(256))
1257 1454 frames.extend(b'y' * 64 for i in range(256))
1258 1455
1259 1456 result = cctx.multi_compress_to_buffer(frames, threads=-1)
1260 1457
1261 1458 self.assertEqual(len(result), 512)
1262 1459 for i in range(512):
1263 1460 if i < 256:
1264 1461 self.assertEqual(result[i].tobytes(), reference[0])
1265 1462 else:
1266 1463 self.assertEqual(result[i].tobytes(), reference[1])
@@ -1,188 +1,320 @@
1 1 import io
2 2 import os
3 3 import unittest
4 4
5 5 try:
6 6 import hypothesis
7 7 import hypothesis.strategies as strategies
8 8 except ImportError:
9 9 raise unittest.SkipTest('hypothesis not available')
10 10
11 11 import zstandard as zstd
12 12
13 13 from . common import (
14 14 make_cffi,
15 15 random_input_data,
16 16 )
17 17
18 18
19 19 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
20 20 @make_cffi
21 21 class TestCompressor_stream_reader_fuzzing(unittest.TestCase):
22 22 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
23 23 level=strategies.integers(min_value=1, max_value=5),
24 24 source_read_size=strategies.integers(1, 16384),
25 25 read_sizes=strategies.data())
26 26 def test_stream_source_read_variance(self, original, level, source_read_size,
27 27 read_sizes):
28 28 refctx = zstd.ZstdCompressor(level=level)
29 29 ref_frame = refctx.compress(original)
30 30
31 31 cctx = zstd.ZstdCompressor(level=level)
32 32 with cctx.stream_reader(io.BytesIO(original), size=len(original),
33 33 read_size=source_read_size) as reader:
34 34 chunks = []
35 35 while True:
36 36 read_size = read_sizes.draw(strategies.integers(1, 16384))
37 37 chunk = reader.read(read_size)
38 38
39 39 if not chunk:
40 40 break
41 41 chunks.append(chunk)
42 42
43 43 self.assertEqual(b''.join(chunks), ref_frame)
44 44
45 45 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
46 46 level=strategies.integers(min_value=1, max_value=5),
47 47 source_read_size=strategies.integers(1, 16384),
48 48 read_sizes=strategies.data())
49 49 def test_buffer_source_read_variance(self, original, level, source_read_size,
50 50 read_sizes):
51 51
52 52 refctx = zstd.ZstdCompressor(level=level)
53 53 ref_frame = refctx.compress(original)
54 54
55 55 cctx = zstd.ZstdCompressor(level=level)
56 56 with cctx.stream_reader(original, size=len(original),
57 57 read_size=source_read_size) as reader:
58 58 chunks = []
59 59 while True:
60 60 read_size = read_sizes.draw(strategies.integers(1, 16384))
61 61 chunk = reader.read(read_size)
62 62 if not chunk:
63 63 break
64 64 chunks.append(chunk)
65 65
66 66 self.assertEqual(b''.join(chunks), ref_frame)
67 67
68 68
69 69 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
70 70 @make_cffi
71 71 class TestCompressor_stream_writer_fuzzing(unittest.TestCase):
72 72 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
73 73 level=strategies.integers(min_value=1, max_value=5),
74 74 write_size=strategies.integers(min_value=1, max_value=1048576))
75 75 def test_write_size_variance(self, original, level, write_size):
76 76 refctx = zstd.ZstdCompressor(level=level)
77 77 ref_frame = refctx.compress(original)
78 78
79 79 cctx = zstd.ZstdCompressor(level=level)
80 80 b = io.BytesIO()
81 81 with cctx.stream_writer(b, size=len(original), write_size=write_size) as compressor:
82 82 compressor.write(original)
83 83
84 84 self.assertEqual(b.getvalue(), ref_frame)
85 85
86 86
87 87 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
88 88 @make_cffi
89 89 class TestCompressor_copy_stream_fuzzing(unittest.TestCase):
90 90 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
91 91 level=strategies.integers(min_value=1, max_value=5),
92 92 read_size=strategies.integers(min_value=1, max_value=1048576),
93 93 write_size=strategies.integers(min_value=1, max_value=1048576))
94 94 def test_read_write_size_variance(self, original, level, read_size, write_size):
95 95 refctx = zstd.ZstdCompressor(level=level)
96 96 ref_frame = refctx.compress(original)
97 97
98 98 cctx = zstd.ZstdCompressor(level=level)
99 99 source = io.BytesIO(original)
100 100 dest = io.BytesIO()
101 101
102 102 cctx.copy_stream(source, dest, size=len(original), read_size=read_size,
103 103 write_size=write_size)
104 104
105 105 self.assertEqual(dest.getvalue(), ref_frame)
106 106
107 107
108 108 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
109 109 @make_cffi
110 110 class TestCompressor_compressobj_fuzzing(unittest.TestCase):
111 111 @hypothesis.settings(
112 112 suppress_health_check=[hypothesis.HealthCheck.large_base_example])
113 113 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
114 114 level=strategies.integers(min_value=1, max_value=5),
115 115 chunk_sizes=strategies.data())
116 116 def test_random_input_sizes(self, original, level, chunk_sizes):
117 117 refctx = zstd.ZstdCompressor(level=level)
118 118 ref_frame = refctx.compress(original)
119 119
120 120 cctx = zstd.ZstdCompressor(level=level)
121 121 cobj = cctx.compressobj(size=len(original))
122 122
123 123 chunks = []
124 124 i = 0
125 125 while True:
126 126 chunk_size = chunk_sizes.draw(strategies.integers(1, 4096))
127 127 source = original[i:i + chunk_size]
128 128 if not source:
129 129 break
130 130
131 131 chunks.append(cobj.compress(source))
132 132 i += chunk_size
133 133
134 134 chunks.append(cobj.flush())
135 135
136 136 self.assertEqual(b''.join(chunks), ref_frame)
137 137
138 @hypothesis.settings(
139 suppress_health_check=[hypothesis.HealthCheck.large_base_example])
140 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
141 level=strategies.integers(min_value=1, max_value=5),
142 chunk_sizes=strategies.data(),
143 flushes=strategies.data())
144 def test_flush_block(self, original, level, chunk_sizes, flushes):
145 cctx = zstd.ZstdCompressor(level=level)
146 cobj = cctx.compressobj()
147
148 dctx = zstd.ZstdDecompressor()
149 dobj = dctx.decompressobj()
150
151 compressed_chunks = []
152 decompressed_chunks = []
153 i = 0
154 while True:
155 input_size = chunk_sizes.draw(strategies.integers(1, 4096))
156 source = original[i:i + input_size]
157 if not source:
158 break
159
160 i += input_size
161
162 chunk = cobj.compress(source)
163 compressed_chunks.append(chunk)
164 decompressed_chunks.append(dobj.decompress(chunk))
165
166 if not flushes.draw(strategies.booleans()):
167 continue
168
169 chunk = cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK)
170 compressed_chunks.append(chunk)
171 decompressed_chunks.append(dobj.decompress(chunk))
172
173 self.assertEqual(b''.join(decompressed_chunks), original[0:i])
174
175 chunk = cobj.flush(zstd.COMPRESSOBJ_FLUSH_FINISH)
176 compressed_chunks.append(chunk)
177 decompressed_chunks.append(dobj.decompress(chunk))
178
179 self.assertEqual(dctx.decompress(b''.join(compressed_chunks),
180 max_output_size=len(original)),
181 original)
182 self.assertEqual(b''.join(decompressed_chunks), original)
138 183
139 184 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
140 185 @make_cffi
141 186 class TestCompressor_read_to_iter_fuzzing(unittest.TestCase):
142 187 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
143 188 level=strategies.integers(min_value=1, max_value=5),
144 189 read_size=strategies.integers(min_value=1, max_value=4096),
145 190 write_size=strategies.integers(min_value=1, max_value=4096))
146 191 def test_read_write_size_variance(self, original, level, read_size, write_size):
147 192 refcctx = zstd.ZstdCompressor(level=level)
148 193 ref_frame = refcctx.compress(original)
149 194
150 195 source = io.BytesIO(original)
151 196
152 197 cctx = zstd.ZstdCompressor(level=level)
153 198 chunks = list(cctx.read_to_iter(source, size=len(original),
154 199 read_size=read_size,
155 200 write_size=write_size))
156 201
157 202 self.assertEqual(b''.join(chunks), ref_frame)
158 203
159 204
160 205 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
161 206 class TestCompressor_multi_compress_to_buffer_fuzzing(unittest.TestCase):
162 207 @hypothesis.given(original=strategies.lists(strategies.sampled_from(random_input_data()),
163 208 min_size=1, max_size=1024),
164 209 threads=strategies.integers(min_value=1, max_value=8),
165 210 use_dict=strategies.booleans())
166 211 def test_data_equivalence(self, original, threads, use_dict):
167 212 kwargs = {}
168 213
169 214 # Use a content dictionary because it is cheap to create.
170 215 if use_dict:
171 216 kwargs['dict_data'] = zstd.ZstdCompressionDict(original[0])
172 217
173 218 cctx = zstd.ZstdCompressor(level=1,
174 219 write_checksum=True,
175 220 **kwargs)
176 221
177 222 result = cctx.multi_compress_to_buffer(original, threads=-1)
178 223
179 224 self.assertEqual(len(result), len(original))
180 225
181 226 # The frame produced via the batch APIs may not be bit identical to that
182 227 # produced by compress() because compression parameters are adjusted
183 228 # from the first input in batch mode. So the only thing we can do is
184 229 # verify the decompressed data matches the input.
185 230 dctx = zstd.ZstdDecompressor(**kwargs)
186 231
187 232 for i, frame in enumerate(result):
188 233 self.assertEqual(dctx.decompress(frame), original[i])
234
235
236 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
237 @make_cffi
238 class TestCompressor_chunker_fuzzing(unittest.TestCase):
239 @hypothesis.settings(
240 suppress_health_check=[hypothesis.HealthCheck.large_base_example])
241 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
242 level=strategies.integers(min_value=1, max_value=5),
243 chunk_size=strategies.integers(
244 min_value=1,
245 max_value=32 * 1048576),
246 input_sizes=strategies.data())
247 def test_random_input_sizes(self, original, level, chunk_size, input_sizes):
248 cctx = zstd.ZstdCompressor(level=level)
249 chunker = cctx.chunker(chunk_size=chunk_size)
250
251 chunks = []
252 i = 0
253 while True:
254 input_size = input_sizes.draw(strategies.integers(1, 4096))
255 source = original[i:i + input_size]
256 if not source:
257 break
258
259 chunks.extend(chunker.compress(source))
260 i += input_size
261
262 chunks.extend(chunker.finish())
263
264 dctx = zstd.ZstdDecompressor()
265
266 self.assertEqual(dctx.decompress(b''.join(chunks),
267 max_output_size=len(original)),
268 original)
269
270 self.assertTrue(all(len(chunk) == chunk_size for chunk in chunks[:-1]))
271
272 @hypothesis.settings(
273 suppress_health_check=[hypothesis.HealthCheck.large_base_example])
274 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
275 level=strategies.integers(min_value=1, max_value=5),
276 chunk_size=strategies.integers(
277 min_value=1,
278 max_value=32 * 1048576),
279 input_sizes=strategies.data(),
280 flushes=strategies.data())
281 def test_flush_block(self, original, level, chunk_size, input_sizes,
282 flushes):
283 cctx = zstd.ZstdCompressor(level=level)
284 chunker = cctx.chunker(chunk_size=chunk_size)
285
286 dctx = zstd.ZstdDecompressor()
287 dobj = dctx.decompressobj()
288
289 compressed_chunks = []
290 decompressed_chunks = []
291 i = 0
292 while True:
293 input_size = input_sizes.draw(strategies.integers(1, 4096))
294 source = original[i:i + input_size]
295 if not source:
296 break
297
298 i += input_size
299
300 chunks = list(chunker.compress(source))
301 compressed_chunks.extend(chunks)
302 decompressed_chunks.append(dobj.decompress(b''.join(chunks)))
303
304 if not flushes.draw(strategies.booleans()):
305 continue
306
307 chunks = list(chunker.flush())
308 compressed_chunks.extend(chunks)
309 decompressed_chunks.append(dobj.decompress(b''.join(chunks)))
310
311 self.assertEqual(b''.join(decompressed_chunks), original[0:i])
312
313 chunks = list(chunker.finish())
314 compressed_chunks.extend(chunks)
315 decompressed_chunks.append(dobj.decompress(b''.join(chunks)))
316
317 self.assertEqual(dctx.decompress(b''.join(compressed_chunks),
318 max_output_size=len(original)),
319 original)
320 self.assertEqual(b''.join(decompressed_chunks), original)
\ No newline at end of file
@@ -1,202 +1,194 @@
1 1 import sys
2 2 import unittest
3 3
4 4 import zstandard as zstd
5 5
6 6 from . common import (
7 7 make_cffi,
8 8 )
9 9
10 10
11 11 @make_cffi
12 12 class TestCompressionParameters(unittest.TestCase):
13 13 def test_bounds(self):
14 14 zstd.ZstdCompressionParameters(window_log=zstd.WINDOWLOG_MIN,
15 15 chain_log=zstd.CHAINLOG_MIN,
16 16 hash_log=zstd.HASHLOG_MIN,
17 17 search_log=zstd.SEARCHLOG_MIN,
18 18 min_match=zstd.SEARCHLENGTH_MIN + 1,
19 19 target_length=zstd.TARGETLENGTH_MIN,
20 20 compression_strategy=zstd.STRATEGY_FAST)
21 21
22 22 zstd.ZstdCompressionParameters(window_log=zstd.WINDOWLOG_MAX,
23 23 chain_log=zstd.CHAINLOG_MAX,
24 24 hash_log=zstd.HASHLOG_MAX,
25 25 search_log=zstd.SEARCHLOG_MAX,
26 26 min_match=zstd.SEARCHLENGTH_MAX - 1,
27 target_length=zstd.TARGETLENGTH_MAX,
27 28 compression_strategy=zstd.STRATEGY_BTULTRA)
28 29
29 30 def test_from_level(self):
30 31 p = zstd.ZstdCompressionParameters.from_level(1)
31 32 self.assertIsInstance(p, zstd.CompressionParameters)
32 33
33 34 self.assertEqual(p.window_log, 19)
34 35
35 36 p = zstd.ZstdCompressionParameters.from_level(-4)
36 37 self.assertEqual(p.window_log, 19)
37 self.assertEqual(p.compress_literals, 0)
38 38
39 39 def test_members(self):
40 40 p = zstd.ZstdCompressionParameters(window_log=10,
41 41 chain_log=6,
42 42 hash_log=7,
43 43 search_log=4,
44 44 min_match=5,
45 45 target_length=8,
46 46 compression_strategy=1)
47 47 self.assertEqual(p.window_log, 10)
48 48 self.assertEqual(p.chain_log, 6)
49 49 self.assertEqual(p.hash_log, 7)
50 50 self.assertEqual(p.search_log, 4)
51 51 self.assertEqual(p.min_match, 5)
52 52 self.assertEqual(p.target_length, 8)
53 53 self.assertEqual(p.compression_strategy, 1)
54 54
55 55 p = zstd.ZstdCompressionParameters(compression_level=2)
56 56 self.assertEqual(p.compression_level, 2)
57 57
58 58 p = zstd.ZstdCompressionParameters(threads=4)
59 59 self.assertEqual(p.threads, 4)
60 60
61 61 p = zstd.ZstdCompressionParameters(threads=2, job_size=1048576,
62 62 overlap_size_log=6)
63 63 self.assertEqual(p.threads, 2)
64 64 self.assertEqual(p.job_size, 1048576)
65 65 self.assertEqual(p.overlap_size_log, 6)
66 66
67 p = zstd.ZstdCompressionParameters(compression_level=2)
68 self.assertEqual(p.compress_literals, 1)
69
70 p = zstd.ZstdCompressionParameters(compress_literals=False)
71 self.assertEqual(p.compress_literals, 0)
72
73 67 p = zstd.ZstdCompressionParameters(compression_level=-1)
74 68 self.assertEqual(p.compression_level, -1)
75 self.assertEqual(p.compress_literals, 0)
76 69
77 p = zstd.ZstdCompressionParameters(compression_level=-2, compress_literals=True)
70 p = zstd.ZstdCompressionParameters(compression_level=-2)
78 71 self.assertEqual(p.compression_level, -2)
79 self.assertEqual(p.compress_literals, 1)
80 72
81 73 p = zstd.ZstdCompressionParameters(force_max_window=True)
82 74 self.assertEqual(p.force_max_window, 1)
83 75
84 76 p = zstd.ZstdCompressionParameters(enable_ldm=True)
85 77 self.assertEqual(p.enable_ldm, 1)
86 78
87 79 p = zstd.ZstdCompressionParameters(ldm_hash_log=7)
88 80 self.assertEqual(p.ldm_hash_log, 7)
89 81
90 82 p = zstd.ZstdCompressionParameters(ldm_min_match=6)
91 83 self.assertEqual(p.ldm_min_match, 6)
92 84
93 85 p = zstd.ZstdCompressionParameters(ldm_bucket_size_log=7)
94 86 self.assertEqual(p.ldm_bucket_size_log, 7)
95 87
96 88 p = zstd.ZstdCompressionParameters(ldm_hash_every_log=8)
97 89 self.assertEqual(p.ldm_hash_every_log, 8)
98 90
99 91 def test_estimated_compression_context_size(self):
100 92 p = zstd.ZstdCompressionParameters(window_log=20,
101 93 chain_log=16,
102 94 hash_log=17,
103 95 search_log=1,
104 96 min_match=5,
105 97 target_length=16,
106 98 compression_strategy=zstd.STRATEGY_DFAST)
107 99
108 100 # 32-bit has slightly different values from 64-bit.
109 101 self.assertAlmostEqual(p.estimated_compression_context_size(), 1294072,
110 102 delta=250)
111 103
112 104
113 105 @make_cffi
114 106 class TestFrameParameters(unittest.TestCase):
115 107 def test_invalid_type(self):
116 108 with self.assertRaises(TypeError):
117 109 zstd.get_frame_parameters(None)
118 110
119 111 # Python 3 doesn't appear to convert unicode to Py_buffer.
120 112 if sys.version_info[0] >= 3:
121 113 with self.assertRaises(TypeError):
122 114 zstd.get_frame_parameters(u'foobarbaz')
123 115 else:
124 116 # CPython will convert unicode to Py_buffer. But CFFI won't.
125 117 if zstd.backend == 'cffi':
126 118 with self.assertRaises(TypeError):
127 119 zstd.get_frame_parameters(u'foobarbaz')
128 120 else:
129 121 with self.assertRaises(zstd.ZstdError):
130 122 zstd.get_frame_parameters(u'foobarbaz')
131 123
132 124 def test_invalid_input_sizes(self):
133 125 with self.assertRaisesRegexp(zstd.ZstdError, 'not enough data for frame'):
134 126 zstd.get_frame_parameters(b'')
135 127
136 128 with self.assertRaisesRegexp(zstd.ZstdError, 'not enough data for frame'):
137 129 zstd.get_frame_parameters(zstd.FRAME_HEADER)
138 130
139 131 def test_invalid_frame(self):
140 132 with self.assertRaisesRegexp(zstd.ZstdError, 'Unknown frame descriptor'):
141 133 zstd.get_frame_parameters(b'foobarbaz')
142 134
143 135 def test_attributes(self):
144 136 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x00\x00')
145 137 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
146 138 self.assertEqual(params.window_size, 1024)
147 139 self.assertEqual(params.dict_id, 0)
148 140 self.assertFalse(params.has_checksum)
149 141
150 142 # Lowest 2 bits indicate the size of the dictionary ID field. Here, the dict id is 1 byte.
151 143 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x01\x00\xff')
152 144 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
153 145 self.assertEqual(params.window_size, 1024)
154 146 self.assertEqual(params.dict_id, 255)
155 147 self.assertFalse(params.has_checksum)
156 148
157 149 # The 3rd lowest bit indicates whether a checksum is present.
158 150 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x04\x00')
159 151 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
160 152 self.assertEqual(params.window_size, 1024)
161 153 self.assertEqual(params.dict_id, 0)
162 154 self.assertTrue(params.has_checksum)
163 155
164 156 # Upper 2 bits indicate content size.
165 157 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x40\x00\xff\x00')
166 158 self.assertEqual(params.content_size, 511)
167 159 self.assertEqual(params.window_size, 1024)
168 160 self.assertEqual(params.dict_id, 0)
169 161 self.assertFalse(params.has_checksum)
170 162
171 163 # Window descriptor is 2nd byte after frame header.
172 164 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x00\x40')
173 165 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
174 166 self.assertEqual(params.window_size, 262144)
175 167 self.assertEqual(params.dict_id, 0)
176 168 self.assertFalse(params.has_checksum)
177 169
178 170 # Set multiple things.
179 171 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x45\x40\x0f\x10\x00')
180 172 self.assertEqual(params.content_size, 272)
181 173 self.assertEqual(params.window_size, 262144)
182 174 self.assertEqual(params.dict_id, 15)
183 175 self.assertTrue(params.has_checksum)
184 176
185 177 def test_input_types(self):
186 178 v = zstd.FRAME_HEADER + b'\x00\x00'
187 179
188 180 mutable_array = bytearray(len(v))
189 181 mutable_array[:] = v
190 182
191 183 sources = [
192 184 memoryview(v),
193 185 bytearray(v),
194 186 mutable_array,
195 187 ]
196 188
197 189 for source in sources:
198 190 params = zstd.get_frame_parameters(source)
199 191 self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
200 192 self.assertEqual(params.window_size, 1024)
201 193 self.assertEqual(params.dict_id, 0)
202 194 self.assertFalse(params.has_checksum)
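
# A minimal sketch of inspecting a frame header with get_frame_parameters(),
# using only the APIs exercised above; a default compressor writes the content
# size, uses no dictionary and omits the checksum.
import zstandard as zstd

frame = zstd.ZstdCompressor().compress(b'data' * 64)
params = zstd.get_frame_parameters(frame)

assert params.content_size == 256   # content size is written by default
assert params.dict_id == 0          # no dictionary was used
assert not params.has_checksum      # checksums are off by default
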
@@ -1,75 +1,75 @@
1 1 import io
2 2 import os
3 3 import sys
4 4 import unittest
5 5
6 6 try:
7 7 import hypothesis
8 8 import hypothesis.strategies as strategies
9 9 except ImportError:
10 10 raise unittest.SkipTest('hypothesis not available')
11 11
12 12 import zstandard as zstd
13 13
14 14 from .common import (
15 15 make_cffi,
16 16 )
17 17
18 18
19 19 s_windowlog = strategies.integers(min_value=zstd.WINDOWLOG_MIN,
20 20 max_value=zstd.WINDOWLOG_MAX)
21 21 s_chainlog = strategies.integers(min_value=zstd.CHAINLOG_MIN,
22 22 max_value=zstd.CHAINLOG_MAX)
23 23 s_hashlog = strategies.integers(min_value=zstd.HASHLOG_MIN,
24 24 max_value=zstd.HASHLOG_MAX)
25 25 s_searchlog = strategies.integers(min_value=zstd.SEARCHLOG_MIN,
26 26 max_value=zstd.SEARCHLOG_MAX)
27 27 s_searchlength = strategies.integers(min_value=zstd.SEARCHLENGTH_MIN,
28 28 max_value=zstd.SEARCHLENGTH_MAX)
29 29 s_targetlength = strategies.integers(min_value=zstd.TARGETLENGTH_MIN,
30 max_value=2**32)
30 max_value=zstd.TARGETLENGTH_MAX)
31 31 s_strategy = strategies.sampled_from((zstd.STRATEGY_FAST,
32 32 zstd.STRATEGY_DFAST,
33 33 zstd.STRATEGY_GREEDY,
34 34 zstd.STRATEGY_LAZY,
35 35 zstd.STRATEGY_LAZY2,
36 36 zstd.STRATEGY_BTLAZY2,
37 37 zstd.STRATEGY_BTOPT,
38 38 zstd.STRATEGY_BTULTRA))
39 39
40 40
41 41 @make_cffi
42 42 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
43 43 class TestCompressionParametersHypothesis(unittest.TestCase):
44 44 @hypothesis.given(s_windowlog, s_chainlog, s_hashlog, s_searchlog,
45 45 s_searchlength, s_targetlength, s_strategy)
46 46 def test_valid_init(self, windowlog, chainlog, hashlog, searchlog,
47 47 searchlength, targetlength, strategy):
48 48 zstd.ZstdCompressionParameters(window_log=windowlog,
49 49 chain_log=chainlog,
50 50 hash_log=hashlog,
51 51 search_log=searchlog,
52 52 min_match=searchlength,
53 53 target_length=targetlength,
54 54 compression_strategy=strategy)
55 55
56 56 @hypothesis.given(s_windowlog, s_chainlog, s_hashlog, s_searchlog,
57 57 s_searchlength, s_targetlength, s_strategy)
58 58 def test_estimated_compression_context_size(self, windowlog, chainlog,
59 59 hashlog, searchlog,
60 60 searchlength, targetlength,
61 61 strategy):
62 62 if searchlength == zstd.SEARCHLENGTH_MIN and strategy in (zstd.STRATEGY_FAST, zstd.STRATEGY_GREEDY):
63 63 searchlength += 1
64 64 elif searchlength == zstd.SEARCHLENGTH_MAX and strategy != zstd.STRATEGY_FAST:
65 65 searchlength -= 1
66 66
67 67 p = zstd.ZstdCompressionParameters(window_log=windowlog,
68 68 chain_log=chainlog,
69 69 hash_log=hashlog,
70 70 search_log=searchlog,
71 71 min_match=searchlength,
72 72 target_length=targetlength,
73 73 compression_strategy=strategy)
74 74 size = p.estimated_compression_context_size()
75 75
@@ -1,1139 +1,1178 @@
1 1 import io
2 2 import os
3 3 import random
4 4 import struct
5 5 import sys
6 6 import unittest
7 7
8 8 import zstandard as zstd
9 9
10 10 from .common import (
11 11 generate_samples,
12 12 make_cffi,
13 13 OpCountingBytesIO,
14 14 )
15 15
16 16
17 17 if sys.version_info[0] >= 3:
18 18 next = lambda it: it.__next__()
19 19 else:
20 20 next = lambda it: it.next()
21 21
22 22
23 23 @make_cffi
24 24 class TestFrameHeaderSize(unittest.TestCase):
25 25 def test_empty(self):
26 26 with self.assertRaisesRegexp(
27 27 zstd.ZstdError, 'could not determine frame header size: Src size '
28 28 'is incorrect'):
29 29 zstd.frame_header_size(b'')
30 30
31 31 def test_too_small(self):
32 32 with self.assertRaisesRegexp(
33 33 zstd.ZstdError, 'could not determine frame header size: Src size '
34 34 'is incorrect'):
35 35 zstd.frame_header_size(b'foob')
36 36
37 37 def test_basic(self):
38 38 # It doesn't matter that it isn't a valid frame.
39 39 self.assertEqual(zstd.frame_header_size(b'long enough but no magic'), 6)
40 40
41 41
42 42 @make_cffi
43 43 class TestFrameContentSize(unittest.TestCase):
44 44 def test_empty(self):
45 45 with self.assertRaisesRegexp(zstd.ZstdError,
46 46 'error when determining content size'):
47 47 zstd.frame_content_size(b'')
48 48
49 49 def test_too_small(self):
50 50 with self.assertRaisesRegexp(zstd.ZstdError,
51 51 'error when determining content size'):
52 52 zstd.frame_content_size(b'foob')
53 53
54 54 def test_bad_frame(self):
55 55 with self.assertRaisesRegexp(zstd.ZstdError,
56 56 'error when determining content size'):
57 57 zstd.frame_content_size(b'invalid frame header')
58 58
59 59 def test_unknown(self):
60 60 cctx = zstd.ZstdCompressor(write_content_size=False)
61 61 frame = cctx.compress(b'foobar')
62 62
63 63 self.assertEqual(zstd.frame_content_size(frame), -1)
64 64
65 65 def test_empty_frame(self):
66 66 cctx = zstd.ZstdCompressor()
67 67 frame = cctx.compress(b'')
68 68
69 69 self.assertEqual(zstd.frame_content_size(frame), 0)
70 70
71 71 def test_basic(self):
72 72 cctx = zstd.ZstdCompressor()
73 73 frame = cctx.compress(b'foobar')
74 74
75 75 self.assertEqual(zstd.frame_content_size(frame), 6)
76 76
77 77
78 78 @make_cffi
79 79 class TestDecompressor(unittest.TestCase):
80 80 def test_memory_size(self):
81 81 dctx = zstd.ZstdDecompressor()
82 82
83 83 self.assertGreater(dctx.memory_size(), 100)
84 84
85 85
86 86 @make_cffi
87 87 class TestDecompressor_decompress(unittest.TestCase):
88 88 def test_empty_input(self):
89 89 dctx = zstd.ZstdDecompressor()
90 90
91 91 with self.assertRaisesRegexp(zstd.ZstdError, 'error determining content size from frame header'):
92 92 dctx.decompress(b'')
93 93
94 94 def test_invalid_input(self):
95 95 dctx = zstd.ZstdDecompressor()
96 96
97 97 with self.assertRaisesRegexp(zstd.ZstdError, 'error determining content size from frame header'):
98 98 dctx.decompress(b'foobar')
99 99
100 100 def test_input_types(self):
101 101 cctx = zstd.ZstdCompressor(level=1)
102 102 compressed = cctx.compress(b'foo')
103 103
104 104 mutable_array = bytearray(len(compressed))
105 105 mutable_array[:] = compressed
106 106
107 107 sources = [
108 108 memoryview(compressed),
109 109 bytearray(compressed),
110 110 mutable_array,
111 111 ]
112 112
113 113 dctx = zstd.ZstdDecompressor()
114 114 for source in sources:
115 115 self.assertEqual(dctx.decompress(source), b'foo')
116 116
117 117 def test_no_content_size_in_frame(self):
118 118 cctx = zstd.ZstdCompressor(write_content_size=False)
119 119 compressed = cctx.compress(b'foobar')
120 120
121 121 dctx = zstd.ZstdDecompressor()
122 122 with self.assertRaisesRegexp(zstd.ZstdError, 'could not determine content size in frame header'):
123 123 dctx.decompress(compressed)
124 124
125 125 def test_content_size_present(self):
126 126 cctx = zstd.ZstdCompressor()
127 127 compressed = cctx.compress(b'foobar')
128 128
129 129 dctx = zstd.ZstdDecompressor()
130 130 decompressed = dctx.decompress(compressed)
131 131 self.assertEqual(decompressed, b'foobar')
132 132
133 133 def test_empty_roundtrip(self):
134 134 cctx = zstd.ZstdCompressor()
135 135 compressed = cctx.compress(b'')
136 136
137 137 dctx = zstd.ZstdDecompressor()
138 138 decompressed = dctx.decompress(compressed)
139 139
140 140 self.assertEqual(decompressed, b'')
141 141
142 142 def test_max_output_size(self):
143 143 cctx = zstd.ZstdCompressor(write_content_size=False)
144 144 source = b'foobar' * 256
145 145 compressed = cctx.compress(source)
146 146
147 147 dctx = zstd.ZstdDecompressor()
148 148 # Will fit into buffer exactly the size of input.
149 149 decompressed = dctx.decompress(compressed, max_output_size=len(source))
150 150 self.assertEqual(decompressed, source)
151 151
152 152 # Input size - 1 fails
153 153 with self.assertRaisesRegexp(zstd.ZstdError,
154 154 'decompression error: did not decompress full frame'):
155 155 dctx.decompress(compressed, max_output_size=len(source) - 1)
156 156
157 157 # Input size + 1 works
158 158 decompressed = dctx.decompress(compressed, max_output_size=len(source) + 1)
159 159 self.assertEqual(decompressed, source)
160 160
161 161 # A much larger buffer works.
162 162 decompressed = dctx.decompress(compressed, max_output_size=len(source) * 64)
163 163 self.assertEqual(decompressed, source)
164 164
165 165 def test_stupidly_large_output_buffer(self):
166 166 cctx = zstd.ZstdCompressor(write_content_size=False)
167 167 compressed = cctx.compress(b'foobar' * 256)
168 168 dctx = zstd.ZstdDecompressor()
169 169
170 170 # Will get OverflowError on some Python distributions that can't
171 171 # handle really large integers.
172 172 with self.assertRaises((MemoryError, OverflowError)):
173 173 dctx.decompress(compressed, max_output_size=2**62)
174 174
175 175 def test_dictionary(self):
176 176 samples = []
177 177 for i in range(128):
178 178 samples.append(b'foo' * 64)
179 179 samples.append(b'bar' * 64)
180 180 samples.append(b'foobar' * 64)
181 181
182 182 d = zstd.train_dictionary(8192, samples)
183 183
184 184 orig = b'foobar' * 16384
185 185 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
186 186 compressed = cctx.compress(orig)
187 187
188 188 dctx = zstd.ZstdDecompressor(dict_data=d)
189 189 decompressed = dctx.decompress(compressed)
190 190
191 191 self.assertEqual(decompressed, orig)
192 192
193 193 def test_dictionary_multiple(self):
194 194 samples = []
195 195 for i in range(128):
196 196 samples.append(b'foo' * 64)
197 197 samples.append(b'bar' * 64)
198 198 samples.append(b'foobar' * 64)
199 199
200 200 d = zstd.train_dictionary(8192, samples)
201 201
202 202 sources = (b'foobar' * 8192, b'foo' * 8192, b'bar' * 8192)
203 203 compressed = []
204 204 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
205 205 for source in sources:
206 206 compressed.append(cctx.compress(source))
207 207
208 208 dctx = zstd.ZstdDecompressor(dict_data=d)
209 209 for i in range(len(sources)):
210 210 decompressed = dctx.decompress(compressed[i])
211 211 self.assertEqual(decompressed, sources[i])
212 212
213 213 def test_max_window_size(self):
214 214 with open(__file__, 'rb') as fh:
215 215 source = fh.read()
216 216
217 217 # If we write a content size, the decompressor engages single pass
218 218 # mode and the window size doesn't come into play. So omit it here.
219 219 cctx = zstd.ZstdCompressor(write_content_size=False)
220 220 frame = cctx.compress(source)
221 221
222 222 dctx = zstd.ZstdDecompressor(max_window_size=1)
223 223
224 224 with self.assertRaisesRegexp(
225 225 zstd.ZstdError, 'decompression error: Frame requires too much memory'):
226 226 dctx.decompress(frame, max_output_size=len(source))
227 227
228 228
229 229 @make_cffi
230 230 class TestDecompressor_copy_stream(unittest.TestCase):
231 231 def test_no_read(self):
232 232 source = object()
233 233 dest = io.BytesIO()
234 234
235 235 dctx = zstd.ZstdDecompressor()
236 236 with self.assertRaises(ValueError):
237 237 dctx.copy_stream(source, dest)
238 238
239 239 def test_no_write(self):
240 240 source = io.BytesIO()
241 241 dest = object()
242 242
243 243 dctx = zstd.ZstdDecompressor()
244 244 with self.assertRaises(ValueError):
245 245 dctx.copy_stream(source, dest)
246 246
247 247 def test_empty(self):
248 248 source = io.BytesIO()
249 249 dest = io.BytesIO()
250 250
251 251 dctx = zstd.ZstdDecompressor()
252 252 # TODO should this raise an error?
253 253 r, w = dctx.copy_stream(source, dest)
254 254
255 255 self.assertEqual(r, 0)
256 256 self.assertEqual(w, 0)
257 257 self.assertEqual(dest.getvalue(), b'')
258 258
259 259 def test_large_data(self):
260 260 source = io.BytesIO()
261 261 for i in range(255):
262 262 source.write(struct.Struct('>B').pack(i) * 16384)
263 263 source.seek(0)
264 264
265 265 compressed = io.BytesIO()
266 266 cctx = zstd.ZstdCompressor()
267 267 cctx.copy_stream(source, compressed)
268 268
269 269 compressed.seek(0)
270 270 dest = io.BytesIO()
271 271 dctx = zstd.ZstdDecompressor()
272 272 r, w = dctx.copy_stream(compressed, dest)
273 273
274 274 self.assertEqual(r, len(compressed.getvalue()))
275 275 self.assertEqual(w, len(source.getvalue()))
276 276
277 277 def test_read_write_size(self):
278 278 source = OpCountingBytesIO(zstd.ZstdCompressor().compress(
279 279 b'foobarfoobar'))
280 280
281 281 dest = OpCountingBytesIO()
282 282 dctx = zstd.ZstdDecompressor()
283 283 r, w = dctx.copy_stream(source, dest, read_size=1, write_size=1)
284 284
285 285 self.assertEqual(r, len(source.getvalue()))
286 286 self.assertEqual(w, len(b'foobarfoobar'))
287 287 self.assertEqual(source._read_count, len(source.getvalue()) + 1)
288 288 self.assertEqual(dest._write_count, len(dest.getvalue()))
289 289
290 290
291 291 @make_cffi
292 292 class TestDecompressor_stream_reader(unittest.TestCase):
293 293 def test_context_manager(self):
294 294 dctx = zstd.ZstdDecompressor()
295 295
296 reader = dctx.stream_reader(b'foo')
297 with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'):
298 reader.read(1)
299
300 296 with dctx.stream_reader(b'foo') as reader:
301 297 with self.assertRaisesRegexp(ValueError, 'cannot __enter__ multiple times'):
302 298 with reader as reader2:
303 299 pass
304 300
305 301 def test_not_implemented(self):
306 302 dctx = zstd.ZstdDecompressor()
307 303
308 304 with dctx.stream_reader(b'foo') as reader:
309 305 with self.assertRaises(NotImplementedError):
310 306 reader.readline()
311 307
312 308 with self.assertRaises(NotImplementedError):
313 309 reader.readlines()
314 310
315 311 with self.assertRaises(NotImplementedError):
316 312 reader.readall()
317 313
318 314 with self.assertRaises(NotImplementedError):
319 315 iter(reader)
320 316
321 317 with self.assertRaises(NotImplementedError):
322 318 next(reader)
323 319
324 320 with self.assertRaises(io.UnsupportedOperation):
325 321 reader.write(b'foo')
326 322
327 323 with self.assertRaises(io.UnsupportedOperation):
328 324 reader.writelines([])
329 325
330 326 def test_constant_methods(self):
331 327 dctx = zstd.ZstdDecompressor()
332 328
333 329 with dctx.stream_reader(b'foo') as reader:
330 self.assertFalse(reader.closed)
334 331 self.assertTrue(reader.readable())
335 332 self.assertFalse(reader.writable())
336 333 self.assertTrue(reader.seekable())
337 334 self.assertFalse(reader.isatty())
335 self.assertFalse(reader.closed)
338 336 self.assertIsNone(reader.flush())
337 self.assertFalse(reader.closed)
338
339 self.assertTrue(reader.closed)
339 340
340 341 def test_read_closed(self):
341 342 dctx = zstd.ZstdDecompressor()
342 343
343 344 with dctx.stream_reader(b'foo') as reader:
344 345 reader.close()
346 self.assertTrue(reader.closed)
345 347 with self.assertRaisesRegexp(ValueError, 'stream is closed'):
346 348 reader.read(1)
347 349
348 350 def test_bad_read_size(self):
349 351 dctx = zstd.ZstdDecompressor()
350 352
351 353 with dctx.stream_reader(b'foo') as reader:
352 354 with self.assertRaisesRegexp(ValueError, 'cannot read negative or size 0 amounts'):
353 355 reader.read(-1)
354 356
355 357 with self.assertRaisesRegexp(ValueError, 'cannot read negative or size 0 amounts'):
356 358 reader.read(0)
357 359
358 360 def test_read_buffer(self):
359 361 cctx = zstd.ZstdCompressor()
360 362
361 363 source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60])
362 364 frame = cctx.compress(source)
363 365
364 366 dctx = zstd.ZstdDecompressor()
365 367
366 368 with dctx.stream_reader(frame) as reader:
367 369 self.assertEqual(reader.tell(), 0)
368 370
369 371 # We should get the entire frame in one read.
370 372 result = reader.read(8192)
371 373 self.assertEqual(result, source)
372 374 self.assertEqual(reader.tell(), len(source))
373 375
374 376 # Read after EOF should return empty bytes.
375 self.assertEqual(reader.read(), b'')
377 self.assertEqual(reader.read(1), b'')
376 378 self.assertEqual(reader.tell(), len(result))
377 379
378 self.assertTrue(reader.closed())
380 self.assertTrue(reader.closed)
379 381
380 382 def test_read_buffer_small_chunks(self):
381 383 cctx = zstd.ZstdCompressor()
382 384 source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60])
383 385 frame = cctx.compress(source)
384 386
385 387 dctx = zstd.ZstdDecompressor()
386 388 chunks = []
387 389
388 390 with dctx.stream_reader(frame, read_size=1) as reader:
389 391 while True:
390 392 chunk = reader.read(1)
391 393 if not chunk:
392 394 break
393 395
394 396 chunks.append(chunk)
395 397 self.assertEqual(reader.tell(), sum(map(len, chunks)))
396 398
397 399 self.assertEqual(b''.join(chunks), source)
398 400
399 401 def test_read_stream(self):
400 402 cctx = zstd.ZstdCompressor()
401 403 source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60])
402 404 frame = cctx.compress(source)
403 405
404 406 dctx = zstd.ZstdDecompressor()
405 407 with dctx.stream_reader(io.BytesIO(frame)) as reader:
406 408 self.assertEqual(reader.tell(), 0)
407 409
408 410 chunk = reader.read(8192)
409 411 self.assertEqual(chunk, source)
410 412 self.assertEqual(reader.tell(), len(source))
411 self.assertEqual(reader.read(), b'')
413 self.assertEqual(reader.read(1), b'')
412 414 self.assertEqual(reader.tell(), len(source))
415 self.assertFalse(reader.closed)
416
417 self.assertTrue(reader.closed)
413 418
414 419 def test_read_stream_small_chunks(self):
415 420 cctx = zstd.ZstdCompressor()
416 421 source = b''.join([b'foo' * 60, b'bar' * 60, b'baz' * 60])
417 422 frame = cctx.compress(source)
418 423
419 424 dctx = zstd.ZstdDecompressor()
420 425 chunks = []
421 426
422 427 with dctx.stream_reader(io.BytesIO(frame), read_size=1) as reader:
423 428 while True:
424 429 chunk = reader.read(1)
425 430 if not chunk:
426 431 break
427 432
428 433 chunks.append(chunk)
429 434 self.assertEqual(reader.tell(), sum(map(len, chunks)))
430 435
431 436 self.assertEqual(b''.join(chunks), source)
432 437
433 438 def test_read_after_exit(self):
434 439 cctx = zstd.ZstdCompressor()
435 440 frame = cctx.compress(b'foo' * 60)
436 441
437 442 dctx = zstd.ZstdDecompressor()
438 443
439 444 with dctx.stream_reader(frame) as reader:
440 445 while reader.read(16):
441 446 pass
442 447
443 with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'):
448 self.assertTrue(reader.closed)
449
450 with self.assertRaisesRegexp(ValueError, 'stream is closed'):
444 451 reader.read(10)
445 452
446 453 def test_illegal_seeks(self):
447 454 cctx = zstd.ZstdCompressor()
448 455 frame = cctx.compress(b'foo' * 60)
449 456
450 457 dctx = zstd.ZstdDecompressor()
451 458
452 459 with dctx.stream_reader(frame) as reader:
453 460 with self.assertRaisesRegexp(ValueError,
454 461 'cannot seek to negative position'):
455 462 reader.seek(-1, os.SEEK_SET)
456 463
457 464 reader.read(1)
458 465
459 466 with self.assertRaisesRegexp(
460 467 ValueError, 'cannot seek zstd decompression stream backwards'):
461 468 reader.seek(0, os.SEEK_SET)
462 469
463 470 with self.assertRaisesRegexp(
464 471 ValueError, 'cannot seek zstd decompression stream backwards'):
465 472 reader.seek(-1, os.SEEK_CUR)
466 473
467 474 with self.assertRaisesRegexp(
468 475 ValueError,
469 476 'zstd decompression streams cannot be seeked with SEEK_END'):
470 477 reader.seek(0, os.SEEK_END)
471 478
472 479 reader.close()
473 480
474 481 with self.assertRaisesRegexp(ValueError, 'stream is closed'):
475 482 reader.seek(4, os.SEEK_SET)
476 483
477 with self.assertRaisesRegexp(
478 zstd.ZstdError, 'seek\(\) must be called from an active context'):
484 with self.assertRaisesRegexp(ValueError, 'stream is closed'):
479 485 reader.seek(0)
480 486
481 487 def test_seek(self):
482 488 source = b'foobar' * 60
483 489 cctx = zstd.ZstdCompressor()
484 490 frame = cctx.compress(source)
485 491
486 492 dctx = zstd.ZstdDecompressor()
487 493
488 494 with dctx.stream_reader(frame) as reader:
489 495 reader.seek(3)
490 496 self.assertEqual(reader.read(3), b'bar')
491 497
492 498 reader.seek(4, os.SEEK_CUR)
493 499 self.assertEqual(reader.read(2), b'ar')
494 500
501 def test_no_context_manager(self):
502 source = b'foobar' * 60
503 cctx = zstd.ZstdCompressor()
504 frame = cctx.compress(source)
505
506 dctx = zstd.ZstdDecompressor()
507 reader = dctx.stream_reader(frame)
508
509 self.assertEqual(reader.read(6), b'foobar')
510 self.assertEqual(reader.read(18), b'foobar' * 3)
511 self.assertFalse(reader.closed)
512
513 # Calling close prevents subsequent use.
514 reader.close()
515 self.assertTrue(reader.closed)
516
517 with self.assertRaisesRegexp(ValueError, 'stream is closed'):
518 reader.read(6)
519
520 def test_read_after_error(self):
521 source = io.BytesIO(b'')
522 dctx = zstd.ZstdDecompressor()
523
524 reader = dctx.stream_reader(source)
525
526 with reader:
527 with self.assertRaises(TypeError):
528 reader.read()
529
530 with reader:
531 with self.assertRaisesRegexp(ValueError, 'stream is closed'):
532 reader.read(100)
533
495 534
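# A minimal sketch of stream_reader() usage matching the tests above: read
# decompressed bytes through the context manager, using only the APIs
# exercised in this class.
import io
import zstandard as zstd

frame = zstd.ZstdCompressor().compress(b'foobar' * 60)
dctx = zstd.ZstdDecompressor()

with dctx.stream_reader(io.BytesIO(frame)) as reader:
    assert reader.read(8192) == b'foobar' * 60
    assert reader.read(1) == b''    # EOF

# Exiting the context manager closes the reader; further reads raise ValueError.
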
496 535 @make_cffi
497 536 class TestDecompressor_decompressobj(unittest.TestCase):
498 537 def test_simple(self):
499 538 data = zstd.ZstdCompressor(level=1).compress(b'foobar')
500 539
501 540 dctx = zstd.ZstdDecompressor()
502 541 dobj = dctx.decompressobj()
503 542 self.assertEqual(dobj.decompress(data), b'foobar')
504 543
505 544 def test_input_types(self):
506 545 compressed = zstd.ZstdCompressor(level=1).compress(b'foo')
507 546
508 547 dctx = zstd.ZstdDecompressor()
509 548
510 549 mutable_array = bytearray(len(compressed))
511 550 mutable_array[:] = compressed
512 551
513 552 sources = [
514 553 memoryview(compressed),
515 554 bytearray(compressed),
516 555 mutable_array,
517 556 ]
518 557
519 558 for source in sources:
520 559 dobj = dctx.decompressobj()
521 560 self.assertEqual(dobj.decompress(source), b'foo')
522 561
523 562 def test_reuse(self):
524 563 data = zstd.ZstdCompressor(level=1).compress(b'foobar')
525 564
526 565 dctx = zstd.ZstdDecompressor()
527 566 dobj = dctx.decompressobj()
528 567 dobj.decompress(data)
529 568
530 569 with self.assertRaisesRegexp(zstd.ZstdError, 'cannot use a decompressobj'):
531 570 dobj.decompress(data)
532 571
533 572 def test_bad_write_size(self):
534 573 dctx = zstd.ZstdDecompressor()
535 574
536 575 with self.assertRaisesRegexp(ValueError, 'write_size must be positive'):
537 576 dctx.decompressobj(write_size=0)
538 577
539 578 def test_write_size(self):
540 579 source = b'foo' * 64 + b'bar' * 128
541 580 data = zstd.ZstdCompressor(level=1).compress(source)
542 581
543 582 dctx = zstd.ZstdDecompressor()
544 583
545 584 for i in range(128):
546 585 dobj = dctx.decompressobj(write_size=i + 1)
547 586 self.assertEqual(dobj.decompress(data), source)
548 587
549 588 def decompress_via_writer(data):
550 589 buffer = io.BytesIO()
551 590 dctx = zstd.ZstdDecompressor()
552 591 with dctx.stream_writer(buffer) as decompressor:
553 592 decompressor.write(data)
554 593 return buffer.getvalue()
555 594
556 595
557 596 @make_cffi
558 597 class TestDecompressor_stream_writer(unittest.TestCase):
559 598 def test_empty_roundtrip(self):
560 599 cctx = zstd.ZstdCompressor()
561 600 empty = cctx.compress(b'')
562 601 self.assertEqual(decompress_via_writer(empty), b'')
563 602
564 603 def test_input_types(self):
565 604 cctx = zstd.ZstdCompressor(level=1)
566 605 compressed = cctx.compress(b'foo')
567 606
568 607 mutable_array = bytearray(len(compressed))
569 608 mutable_array[:] = compressed
570 609
571 610 sources = [
572 611 memoryview(compressed),
573 612 bytearray(compressed),
574 613 mutable_array,
575 614 ]
576 615
577 616 dctx = zstd.ZstdDecompressor()
578 617 for source in sources:
579 618 buffer = io.BytesIO()
580 619 with dctx.stream_writer(buffer) as decompressor:
581 620 decompressor.write(source)
582 621
583 622 self.assertEqual(buffer.getvalue(), b'foo')
584 623
585 624 def test_large_roundtrip(self):
586 625 chunks = []
587 626 for i in range(255):
588 627 chunks.append(struct.Struct('>B').pack(i) * 16384)
589 628 orig = b''.join(chunks)
590 629 cctx = zstd.ZstdCompressor()
591 630 compressed = cctx.compress(orig)
592 631
593 632 self.assertEqual(decompress_via_writer(compressed), orig)
594 633
595 634 def test_multiple_calls(self):
596 635 chunks = []
597 636 for i in range(255):
598 637 for j in range(255):
599 638 chunks.append(struct.Struct('>B').pack(j) * i)
600 639
601 640 orig = b''.join(chunks)
602 641 cctx = zstd.ZstdCompressor()
603 642 compressed = cctx.compress(orig)
604 643
605 644 buffer = io.BytesIO()
606 645 dctx = zstd.ZstdDecompressor()
607 646 with dctx.stream_writer(buffer) as decompressor:
608 647 pos = 0
609 648 while pos < len(compressed):
610 649 pos2 = pos + 8192
611 650 decompressor.write(compressed[pos:pos2])
612 651 pos += 8192
613 652 self.assertEqual(buffer.getvalue(), orig)
614 653
615 654 def test_dictionary(self):
616 655 samples = []
617 656 for i in range(128):
618 657 samples.append(b'foo' * 64)
619 658 samples.append(b'bar' * 64)
620 659 samples.append(b'foobar' * 64)
621 660
622 661 d = zstd.train_dictionary(8192, samples)
623 662
624 663 orig = b'foobar' * 16384
625 664 buffer = io.BytesIO()
626 665 cctx = zstd.ZstdCompressor(dict_data=d)
627 666 with cctx.stream_writer(buffer) as compressor:
628 667 self.assertEqual(compressor.write(orig), 0)
629 668
630 669 compressed = buffer.getvalue()
631 670 buffer = io.BytesIO()
632 671
633 672 dctx = zstd.ZstdDecompressor(dict_data=d)
634 673 with dctx.stream_writer(buffer) as decompressor:
635 674 self.assertEqual(decompressor.write(compressed), len(orig))
636 675
637 676 self.assertEqual(buffer.getvalue(), orig)
638 677
639 678 def test_memory_size(self):
640 679 dctx = zstd.ZstdDecompressor()
641 680 buffer = io.BytesIO()
642 681 with dctx.stream_writer(buffer) as decompressor:
643 682 size = decompressor.memory_size()
644 683
645 684 self.assertGreater(size, 100000)
646 685
647 686 def test_write_size(self):
648 687 source = zstd.ZstdCompressor().compress(b'foobarfoobar')
649 688 dest = OpCountingBytesIO()
650 689 dctx = zstd.ZstdDecompressor()
651 690 with dctx.stream_writer(dest, write_size=1) as decompressor:
652 691 s = struct.Struct('>B')
653 692 for c in source:
654 693 if not isinstance(c, str):
655 694 c = s.pack(c)
656 695 decompressor.write(c)
657 696
658 697 self.assertEqual(dest.getvalue(), b'foobarfoobar')
659 698 self.assertEqual(dest._write_count, len(dest.getvalue()))
660 699
661 700
662 701 @make_cffi
663 702 class TestDecompressor_read_to_iter(unittest.TestCase):
664 703 def test_type_validation(self):
665 704 dctx = zstd.ZstdDecompressor()
666 705
667 706 # Object with read() works.
668 707 dctx.read_to_iter(io.BytesIO())
669 708
670 709 # Buffer protocol works.
671 710 dctx.read_to_iter(b'foobar')
672 711
673 712 with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'):
674 713 b''.join(dctx.read_to_iter(True))
675 714
676 715 def test_empty_input(self):
677 716 dctx = zstd.ZstdDecompressor()
678 717
679 718 source = io.BytesIO()
680 719 it = dctx.read_to_iter(source)
681 720 # TODO this is arguably wrong. Should get an error about a missing frame.
682 721 with self.assertRaises(StopIteration):
683 722 next(it)
684 723
685 724 it = dctx.read_to_iter(b'')
686 725 with self.assertRaises(StopIteration):
687 726 next(it)
688 727
689 728 def test_invalid_input(self):
690 729 dctx = zstd.ZstdDecompressor()
691 730
692 731 source = io.BytesIO(b'foobar')
693 732 it = dctx.read_to_iter(source)
694 733 with self.assertRaisesRegexp(zstd.ZstdError, 'Unknown frame descriptor'):
695 734 next(it)
696 735
697 736 it = dctx.read_to_iter(b'foobar')
698 737 with self.assertRaisesRegexp(zstd.ZstdError, 'Unknown frame descriptor'):
699 738 next(it)
700 739
701 740 def test_empty_roundtrip(self):
702 741 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
703 742 empty = cctx.compress(b'')
704 743
705 744 source = io.BytesIO(empty)
706 745 source.seek(0)
707 746
708 747 dctx = zstd.ZstdDecompressor()
709 748 it = dctx.read_to_iter(source)
710 749
711 750 # No chunks should be emitted since there is no data.
712 751 with self.assertRaises(StopIteration):
713 752 next(it)
714 753
715 754 # Again for good measure.
716 755 with self.assertRaises(StopIteration):
717 756 next(it)
718 757
719 758 def test_skip_bytes_too_large(self):
720 759 dctx = zstd.ZstdDecompressor()
721 760
722 761 with self.assertRaisesRegexp(ValueError, 'skip_bytes must be smaller than read_size'):
723 762 b''.join(dctx.read_to_iter(b'', skip_bytes=1, read_size=1))
724 763
725 764 with self.assertRaisesRegexp(ValueError, 'skip_bytes larger than first input chunk'):
726 765 b''.join(dctx.read_to_iter(b'foobar', skip_bytes=10))
727 766
728 767 def test_skip_bytes(self):
729 768 cctx = zstd.ZstdCompressor(write_content_size=False)
730 769 compressed = cctx.compress(b'foobar')
731 770
732 771 dctx = zstd.ZstdDecompressor()
733 772 output = b''.join(dctx.read_to_iter(b'hdr' + compressed, skip_bytes=3))
734 773 self.assertEqual(output, b'foobar')
735 774
736 775 def test_large_output(self):
737 776 source = io.BytesIO()
738 777 source.write(b'f' * zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE)
739 778 source.write(b'o')
740 779 source.seek(0)
741 780
742 781 cctx = zstd.ZstdCompressor(level=1)
743 782 compressed = io.BytesIO(cctx.compress(source.getvalue()))
744 783 compressed.seek(0)
745 784
746 785 dctx = zstd.ZstdDecompressor()
747 786 it = dctx.read_to_iter(compressed)
748 787
749 788 chunks = []
750 789 chunks.append(next(it))
751 790 chunks.append(next(it))
752 791
753 792 with self.assertRaises(StopIteration):
754 793 next(it)
755 794
756 795 decompressed = b''.join(chunks)
757 796 self.assertEqual(decompressed, source.getvalue())
758 797
759 798 # And again with buffer protocol.
760 799 it = dctx.read_to_iter(compressed.getvalue())
761 800 chunks = []
762 801 chunks.append(next(it))
763 802 chunks.append(next(it))
764 803
765 804 with self.assertRaises(StopIteration):
766 805 next(it)
767 806
768 807 decompressed = b''.join(chunks)
769 808 self.assertEqual(decompressed, source.getvalue())
770 809
771 810 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
772 811 def test_large_input(self):
773 812 bytes = list(struct.Struct('>B').pack(i) for i in range(256))
774 813 compressed = io.BytesIO()
775 814 input_size = 0
776 815 cctx = zstd.ZstdCompressor(level=1)
777 816 with cctx.stream_writer(compressed) as compressor:
778 817 while True:
779 818 compressor.write(random.choice(bytes))
780 819 input_size += 1
781 820
782 821 have_compressed = len(compressed.getvalue()) > zstd.DECOMPRESSION_RECOMMENDED_INPUT_SIZE
783 822 have_raw = input_size > zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE * 2
784 823 if have_compressed and have_raw:
785 824 break
786 825
787 826 compressed.seek(0)
788 827 self.assertGreater(len(compressed.getvalue()),
789 828 zstd.DECOMPRESSION_RECOMMENDED_INPUT_SIZE)
790 829
791 830 dctx = zstd.ZstdDecompressor()
792 831 it = dctx.read_to_iter(compressed)
793 832
794 833 chunks = []
795 834 chunks.append(next(it))
796 835 chunks.append(next(it))
797 836 chunks.append(next(it))
798 837
799 838 with self.assertRaises(StopIteration):
800 839 next(it)
801 840
802 841 decompressed = b''.join(chunks)
803 842 self.assertEqual(len(decompressed), input_size)
804 843
805 844 # And again with buffer protocol.
806 845 it = dctx.read_to_iter(compressed.getvalue())
807 846
808 847 chunks = []
809 848 chunks.append(next(it))
810 849 chunks.append(next(it))
811 850 chunks.append(next(it))
812 851
813 852 with self.assertRaises(StopIteration):
814 853 next(it)
815 854
816 855 decompressed = b''.join(chunks)
817 856 self.assertEqual(len(decompressed), input_size)
818 857
819 858 def test_interesting(self):
820 859 # Found this edge case via fuzzing.
821 860 cctx = zstd.ZstdCompressor(level=1)
822 861
823 862 source = io.BytesIO()
824 863
825 864 compressed = io.BytesIO()
826 865 with cctx.stream_writer(compressed) as compressor:
827 866 for i in range(256):
828 867 chunk = b'\0' * 1024
829 868 compressor.write(chunk)
830 869 source.write(chunk)
831 870
832 871 dctx = zstd.ZstdDecompressor()
833 872
834 873 simple = dctx.decompress(compressed.getvalue(),
835 874 max_output_size=len(source.getvalue()))
836 875 self.assertEqual(simple, source.getvalue())
837 876
838 877 compressed.seek(0)
839 878 streamed = b''.join(dctx.read_to_iter(compressed))
840 879 self.assertEqual(streamed, source.getvalue())
841 880
842 881 def test_read_write_size(self):
843 882 source = OpCountingBytesIO(zstd.ZstdCompressor().compress(b'foobarfoobar'))
844 883 dctx = zstd.ZstdDecompressor()
845 884 for chunk in dctx.read_to_iter(source, read_size=1, write_size=1):
846 885 self.assertEqual(len(chunk), 1)
847 886
848 887 self.assertEqual(source._read_count, len(source.getvalue()))
849 888
850 889 def test_magic_less(self):
851 890 params = zstd.CompressionParameters.from_level(
852 891 1, format=zstd.FORMAT_ZSTD1_MAGICLESS)
853 892 cctx = zstd.ZstdCompressor(compression_params=params)
854 893 frame = cctx.compress(b'foobar')
855 894
856 895 self.assertNotEqual(frame[0:4], b'\x28\xb5\x2f\xfd')
857 896
858 897 dctx = zstd.ZstdDecompressor()
859 898 with self.assertRaisesRegexp(
860 899 zstd.ZstdError, 'error determining content size from frame header'):
861 900 dctx.decompress(frame)
862 901
863 902 dctx = zstd.ZstdDecompressor(format=zstd.FORMAT_ZSTD1_MAGICLESS)
864 903 res = b''.join(dctx.read_to_iter(frame))
865 904 self.assertEqual(res, b'foobar')
866 905
867 906
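# A minimal sketch of read_to_iter(): it lazily yields decompressed chunks
# from a file object or buffer, using only the APIs exercised above.
import io
import zstandard as zstd

frame = zstd.ZstdCompressor(level=1).compress(b'foobar' * 1024)
dctx = zstd.ZstdDecompressor()

output = []
for chunk in dctx.read_to_iter(io.BytesIO(frame)):
    output.append(chunk)

assert b''.join(output) == b'foobar' * 1024
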
868 907 @make_cffi
869 908 class TestDecompressor_content_dict_chain(unittest.TestCase):
870 909 def test_bad_inputs_simple(self):
871 910 dctx = zstd.ZstdDecompressor()
872 911
873 912 with self.assertRaises(TypeError):
874 913 dctx.decompress_content_dict_chain(b'foo')
875 914
876 915 with self.assertRaises(TypeError):
877 916 dctx.decompress_content_dict_chain((b'foo', b'bar'))
878 917
879 918 with self.assertRaisesRegexp(ValueError, 'empty input chain'):
880 919 dctx.decompress_content_dict_chain([])
881 920
882 921 with self.assertRaisesRegexp(ValueError, 'chunk 0 must be bytes'):
883 922 dctx.decompress_content_dict_chain([u'foo'])
884 923
885 924 with self.assertRaisesRegexp(ValueError, 'chunk 0 must be bytes'):
886 925 dctx.decompress_content_dict_chain([True])
887 926
888 927 with self.assertRaisesRegexp(ValueError, 'chunk 0 is too small to contain a zstd frame'):
889 928 dctx.decompress_content_dict_chain([zstd.FRAME_HEADER])
890 929
891 930 with self.assertRaisesRegexp(ValueError, 'chunk 0 is not a valid zstd frame'):
892 931 dctx.decompress_content_dict_chain([b'foo' * 8])
893 932
894 933 no_size = zstd.ZstdCompressor(write_content_size=False).compress(b'foo' * 64)
895 934
896 935 with self.assertRaisesRegexp(ValueError, 'chunk 0 missing content size in frame'):
897 936 dctx.decompress_content_dict_chain([no_size])
898 937
899 938 # Corrupt first frame.
900 939 frame = zstd.ZstdCompressor().compress(b'foo' * 64)
901 940 frame = frame[0:12] + frame[15:]
902 941 with self.assertRaisesRegexp(zstd.ZstdError,
903 942 'chunk 0 did not decompress full frame'):
904 943 dctx.decompress_content_dict_chain([frame])
905 944
906 945 def test_bad_subsequent_input(self):
907 946 initial = zstd.ZstdCompressor().compress(b'foo' * 64)
908 947
909 948 dctx = zstd.ZstdDecompressor()
910 949
911 950 with self.assertRaisesRegexp(ValueError, 'chunk 1 must be bytes'):
912 951 dctx.decompress_content_dict_chain([initial, u'foo'])
913 952
914 953 with self.assertRaisesRegexp(ValueError, 'chunk 1 must be bytes'):
915 954 dctx.decompress_content_dict_chain([initial, None])
916 955
917 956 with self.assertRaisesRegexp(ValueError, 'chunk 1 is too small to contain a zstd frame'):
918 957 dctx.decompress_content_dict_chain([initial, zstd.FRAME_HEADER])
919 958
920 959 with self.assertRaisesRegexp(ValueError, 'chunk 1 is not a valid zstd frame'):
921 960 dctx.decompress_content_dict_chain([initial, b'foo' * 8])
922 961
923 962 no_size = zstd.ZstdCompressor(write_content_size=False).compress(b'foo' * 64)
924 963
925 964 with self.assertRaisesRegexp(ValueError, 'chunk 1 missing content size in frame'):
926 965 dctx.decompress_content_dict_chain([initial, no_size])
927 966
928 967 # Corrupt second frame.
929 968 cctx = zstd.ZstdCompressor(dict_data=zstd.ZstdCompressionDict(b'foo' * 64))
930 969 frame = cctx.compress(b'bar' * 64)
931 970 frame = frame[0:12] + frame[15:]
932 971
933 972 with self.assertRaisesRegexp(zstd.ZstdError, 'chunk 1 did not decompress full frame'):
934 973 dctx.decompress_content_dict_chain([initial, frame])
935 974
936 975 def test_simple(self):
937 976 original = [
938 977 b'foo' * 64,
939 978 b'foobar' * 64,
940 979 b'baz' * 64,
941 980 b'foobaz' * 64,
942 981 b'foobarbaz' * 64,
943 982 ]
944 983
945 984 chunks = []
946 985 chunks.append(zstd.ZstdCompressor().compress(original[0]))
947 986 for i, chunk in enumerate(original[1:]):
948 987 d = zstd.ZstdCompressionDict(original[i])
949 988 cctx = zstd.ZstdCompressor(dict_data=d)
950 989 chunks.append(cctx.compress(chunk))
951 990
952 991 for i in range(1, len(original)):
953 992 chain = chunks[0:i]
954 993 expected = original[i - 1]
955 994 dctx = zstd.ZstdDecompressor()
956 995 decompressed = dctx.decompress_content_dict_chain(chain)
957 996 self.assertEqual(decompressed, expected)
958 997
959 998
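# A minimal sketch of decompress_content_dict_chain(), following test_simple
# above: each frame is compressed with the previous uncompressed chunk as its
# dictionary, and replaying the chain recovers the last chunk.
import zstandard as zstd

versions = [b'foo' * 64, b'foobar' * 64, b'foobarbaz' * 64]

frames = [zstd.ZstdCompressor().compress(versions[0])]
for prev, cur in zip(versions, versions[1:]):
    d = zstd.ZstdCompressionDict(prev)
    frames.append(zstd.ZstdCompressor(dict_data=d).compress(cur))

dctx = zstd.ZstdDecompressor()
assert dctx.decompress_content_dict_chain(frames) == versions[-1]
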
960 999 # TODO enable for CFFI
961 1000 class TestDecompressor_multi_decompress_to_buffer(unittest.TestCase):
962 1001 def test_invalid_inputs(self):
963 1002 dctx = zstd.ZstdDecompressor()
964 1003
965 1004 with self.assertRaises(TypeError):
966 1005 dctx.multi_decompress_to_buffer(True)
967 1006
968 1007 with self.assertRaises(TypeError):
969 1008 dctx.multi_decompress_to_buffer((1, 2))
970 1009
971 1010 with self.assertRaisesRegexp(TypeError, 'item 0 not a bytes like object'):
972 1011 dctx.multi_decompress_to_buffer([u'foo'])
973 1012
974 1013 with self.assertRaisesRegexp(ValueError, 'could not determine decompressed size of item 0'):
975 1014 dctx.multi_decompress_to_buffer([b'foobarbaz'])
976 1015
977 1016 def test_list_input(self):
978 1017 cctx = zstd.ZstdCompressor()
979 1018
980 1019 original = [b'foo' * 4, b'bar' * 6]
981 1020 frames = [cctx.compress(d) for d in original]
982 1021
983 1022 dctx = zstd.ZstdDecompressor()
984 1023 result = dctx.multi_decompress_to_buffer(frames)
985 1024
986 1025 self.assertEqual(len(result), len(frames))
987 1026 self.assertEqual(result.size(), sum(map(len, original)))
988 1027
989 1028 for i, data in enumerate(original):
990 1029 self.assertEqual(result[i].tobytes(), data)
991 1030
992 1031 self.assertEqual(result[0].offset, 0)
993 1032 self.assertEqual(len(result[0]), 12)
994 1033 self.assertEqual(result[1].offset, 12)
995 1034 self.assertEqual(len(result[1]), 18)
996 1035
997 1036 def test_list_input_frame_sizes(self):
998 1037 cctx = zstd.ZstdCompressor()
999 1038
1000 1039 original = [b'foo' * 4, b'bar' * 6, b'baz' * 8]
1001 1040 frames = [cctx.compress(d) for d in original]
1002 1041 sizes = struct.pack('=' + 'Q' * len(original), *map(len, original))
1003 1042
1004 1043 dctx = zstd.ZstdDecompressor()
1005 1044 result = dctx.multi_decompress_to_buffer(frames, decompressed_sizes=sizes)
1006 1045
1007 1046 self.assertEqual(len(result), len(frames))
1008 1047 self.assertEqual(result.size(), sum(map(len, original)))
1009 1048
1010 1049 for i, data in enumerate(original):
1011 1050 self.assertEqual(result[i].tobytes(), data)
1012 1051
1013 1052 def test_buffer_with_segments_input(self):
1014 1053 cctx = zstd.ZstdCompressor()
1015 1054
1016 1055 original = [b'foo' * 4, b'bar' * 6]
1017 1056 frames = [cctx.compress(d) for d in original]
1018 1057
1019 1058 dctx = zstd.ZstdDecompressor()
1020 1059
1021 1060 segments = struct.pack('=QQQQ', 0, len(frames[0]), len(frames[0]), len(frames[1]))
1022 1061 b = zstd.BufferWithSegments(b''.join(frames), segments)
1023 1062
1024 1063 result = dctx.multi_decompress_to_buffer(b)
1025 1064
1026 1065 self.assertEqual(len(result), len(frames))
1027 1066 self.assertEqual(result[0].offset, 0)
1028 1067 self.assertEqual(len(result[0]), 12)
1029 1068 self.assertEqual(result[1].offset, 12)
1030 1069 self.assertEqual(len(result[1]), 18)
1031 1070
1032 1071 def test_buffer_with_segments_sizes(self):
1033 1072 cctx = zstd.ZstdCompressor(write_content_size=False)
1034 1073 original = [b'foo' * 4, b'bar' * 6, b'baz' * 8]
1035 1074 frames = [cctx.compress(d) for d in original]
1036 1075 sizes = struct.pack('=' + 'Q' * len(original), *map(len, original))
1037 1076
1038 1077 segments = struct.pack('=QQQQQQ', 0, len(frames[0]),
1039 1078 len(frames[0]), len(frames[1]),
1040 1079 len(frames[0]) + len(frames[1]), len(frames[2]))
1041 1080 b = zstd.BufferWithSegments(b''.join(frames), segments)
1042 1081
1043 1082 dctx = zstd.ZstdDecompressor()
1044 1083 result = dctx.multi_decompress_to_buffer(b, decompressed_sizes=sizes)
1045 1084
1046 1085 self.assertEqual(len(result), len(frames))
1047 1086 self.assertEqual(result.size(), sum(map(len, original)))
1048 1087
1049 1088 for i, data in enumerate(original):
1050 1089 self.assertEqual(result[i].tobytes(), data)
1051 1090
1052 1091 def test_buffer_with_segments_collection_input(self):
1053 1092 cctx = zstd.ZstdCompressor()
1054 1093
1055 1094 original = [
1056 1095 b'foo0' * 2,
1057 1096 b'foo1' * 3,
1058 1097 b'foo2' * 4,
1059 1098 b'foo3' * 5,
1060 1099 b'foo4' * 6,
1061 1100 ]
1062 1101
1063 1102 frames = cctx.multi_compress_to_buffer(original)
1064 1103
1065 1104 # Check round trip.
1066 1105 dctx = zstd.ZstdDecompressor()
1067 1106 decompressed = dctx.multi_decompress_to_buffer(frames, threads=3)
1068 1107
1069 1108 self.assertEqual(len(decompressed), len(original))
1070 1109
1071 1110 for i, data in enumerate(original):
1072 1111 self.assertEqual(data, decompressed[i].tobytes())
1073 1112
1074 1113 # And a manual mode.
1075 1114 b = b''.join([frames[0].tobytes(), frames[1].tobytes()])
1076 1115 b1 = zstd.BufferWithSegments(b, struct.pack('=QQQQ',
1077 1116 0, len(frames[0]),
1078 1117 len(frames[0]), len(frames[1])))
1079 1118
1080 1119 b = b''.join([frames[2].tobytes(), frames[3].tobytes(), frames[4].tobytes()])
1081 1120 b2 = zstd.BufferWithSegments(b, struct.pack('=QQQQQQ',
1082 1121 0, len(frames[2]),
1083 1122 len(frames[2]), len(frames[3]),
1084 1123 len(frames[2]) + len(frames[3]), len(frames[4])))
1085 1124
1086 1125 c = zstd.BufferWithSegmentsCollection(b1, b2)
1087 1126
1088 1127 dctx = zstd.ZstdDecompressor()
1089 1128 decompressed = dctx.multi_decompress_to_buffer(c)
1090 1129
1091 1130 self.assertEqual(len(decompressed), 5)
1092 1131 for i in range(5):
1093 1132 self.assertEqual(decompressed[i].tobytes(), original[i])
1094 1133
1095 1134 def test_dict(self):
1096 1135 d = zstd.train_dictionary(16384, generate_samples(), k=64, d=16)
1097 1136
1098 1137 cctx = zstd.ZstdCompressor(dict_data=d, level=1)
1099 1138 frames = [cctx.compress(s) for s in generate_samples()]
1100 1139
1101 1140 dctx = zstd.ZstdDecompressor(dict_data=d)
1102 1141 result = dctx.multi_decompress_to_buffer(frames)
1103 1142 self.assertEqual([o.tobytes() for o in result], generate_samples())
1104 1143
1105 1144 def test_multiple_threads(self):
1106 1145 cctx = zstd.ZstdCompressor()
1107 1146
1108 1147 frames = []
1109 1148 frames.extend(cctx.compress(b'x' * 64) for i in range(256))
1110 1149 frames.extend(cctx.compress(b'y' * 64) for i in range(256))
1111 1150
1112 1151 dctx = zstd.ZstdDecompressor()
1113 1152 result = dctx.multi_decompress_to_buffer(frames, threads=-1)
1114 1153
1115 1154 self.assertEqual(len(result), len(frames))
1116 1155 self.assertEqual(result.size(), 2 * 64 * 256)
1117 1156 self.assertEqual(result[0].tobytes(), b'x' * 64)
1118 1157 self.assertEqual(result[256].tobytes(), b'y' * 64)
1119 1158
1120 1159 def test_item_failure(self):
1121 1160 cctx = zstd.ZstdCompressor()
1122 1161 frames = [cctx.compress(b'x' * 128), cctx.compress(b'y' * 128)]
1123 1162
1124 1163 frames[1] = frames[1][0:15] + b'extra' + frames[1][15:]
1125 1164
1126 1165 dctx = zstd.ZstdDecompressor()
1127 1166
1128 1167 with self.assertRaisesRegexp(zstd.ZstdError,
1129 1168 'error decompressing item 1: ('
1130 1169 'Corrupted block|'
1131 1170 'Destination buffer is too small)'):
1132 1171 dctx.multi_decompress_to_buffer(frames)
1133 1172
1134 1173 with self.assertRaisesRegexp(zstd.ZstdError,
1135 1174 'error decompressing item 1: ('
1136 1175 'Corrupted block|'
1137 1176 'Destination buffer is too small)'):
1138 1177 dctx.multi_decompress_to_buffer(frames, threads=2)
1139 1178
@@ -1,59 +1,64 @@
1 1 from __future__ import unicode_literals
2 2
3 3 import unittest
4 4
5 5 import zstandard as zstd
6 6
7 7 from . common import (
8 8 make_cffi,
9 9 )
10 10
11 11
12 12 @make_cffi
13 13 class TestModuleAttributes(unittest.TestCase):
14 14 def test_version(self):
15 self.assertEqual(zstd.ZSTD_VERSION, (1, 3, 4))
15 self.assertEqual(zstd.ZSTD_VERSION, (1, 3, 6))
16
17 self.assertEqual(zstd.__version__, '0.10.1')
16 18
17 19 def test_constants(self):
18 20 self.assertEqual(zstd.MAX_COMPRESSION_LEVEL, 22)
19 21 self.assertEqual(zstd.FRAME_HEADER, b'\x28\xb5\x2f\xfd')
20 22
21 23 def test_hasattr(self):
22 24 attrs = (
23 25 'CONTENTSIZE_UNKNOWN',
24 26 'CONTENTSIZE_ERROR',
25 27 'COMPRESSION_RECOMMENDED_INPUT_SIZE',
26 28 'COMPRESSION_RECOMMENDED_OUTPUT_SIZE',
27 29 'DECOMPRESSION_RECOMMENDED_INPUT_SIZE',
28 30 'DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE',
29 31 'MAGIC_NUMBER',
32 'BLOCKSIZELOG_MAX',
33 'BLOCKSIZE_MAX',
30 34 'WINDOWLOG_MIN',
31 35 'WINDOWLOG_MAX',
32 36 'CHAINLOG_MIN',
33 37 'CHAINLOG_MAX',
34 38 'HASHLOG_MIN',
35 39 'HASHLOG_MAX',
36 40 'HASHLOG3_MAX',
37 41 'SEARCHLOG_MIN',
38 42 'SEARCHLOG_MAX',
39 43 'SEARCHLENGTH_MIN',
40 44 'SEARCHLENGTH_MAX',
41 45 'TARGETLENGTH_MIN',
46 'TARGETLENGTH_MAX',
42 47 'LDM_MINMATCH_MIN',
43 48 'LDM_MINMATCH_MAX',
44 49 'LDM_BUCKETSIZELOG_MAX',
45 50 'STRATEGY_FAST',
46 51 'STRATEGY_DFAST',
47 52 'STRATEGY_GREEDY',
48 53 'STRATEGY_LAZY',
49 54 'STRATEGY_LAZY2',
50 55 'STRATEGY_BTLAZY2',
51 56 'STRATEGY_BTOPT',
52 57 'STRATEGY_BTULTRA',
53 58 'DICT_TYPE_AUTO',
54 59 'DICT_TYPE_RAWCONTENT',
55 60 'DICT_TYPE_FULLDICT',
56 61 )
57 62
58 63 for a in attrs:
59 64 self.assertTrue(hasattr(zstd, a), a)
@@ -1,87 +1,88 @@
1 1 import struct
2 2 import sys
3 3 import unittest
4 4
5 5 import zstandard as zstd
6 6
7 7 from . common import (
8 8 generate_samples,
9 9 make_cffi,
10 10 )
11 11
12 12 if sys.version_info[0] >= 3:
13 13 int_type = int
14 14 else:
15 15 int_type = long
16 16
17 17
18 18 @make_cffi
19 19 class TestTrainDictionary(unittest.TestCase):
20 20 def test_no_args(self):
21 21 with self.assertRaises(TypeError):
22 22 zstd.train_dictionary()
23 23
24 24 def test_bad_args(self):
25 25 with self.assertRaises(TypeError):
26 26 zstd.train_dictionary(8192, u'foo')
27 27
28 28 with self.assertRaises(ValueError):
29 29 zstd.train_dictionary(8192, [u'foo'])
30 30
31 31 def test_no_params(self):
32 32 d = zstd.train_dictionary(8192, generate_samples())
33 33 self.assertIsInstance(d.dict_id(), int_type)
34 34
35 35 # The dictionary ID may be different across platforms.
36 36 expected = b'\x37\xa4\x30\xec' + struct.pack('<I', d.dict_id())
37 37
38 38 data = d.as_bytes()
39 39 self.assertEqual(data[0:8], expected)
40 40
41 41 def test_basic(self):
42 42 d = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)
43 43 self.assertIsInstance(d.dict_id(), int_type)
44 44
45 45 data = d.as_bytes()
46 46 self.assertEqual(data[0:4], b'\x37\xa4\x30\xec')
47 47
48 48 self.assertEqual(d.k, 64)
49 49 self.assertEqual(d.d, 16)
50 50
51 51 def test_set_dict_id(self):
52 52 d = zstd.train_dictionary(8192, generate_samples(), k=64, d=16,
53 53 dict_id=42)
54 54 self.assertEqual(d.dict_id(), 42)
55 55
56 56 def test_optimize(self):
57 57 d = zstd.train_dictionary(8192, generate_samples(), threads=-1, steps=1,
58 58 d=16)
59 59
60 self.assertEqual(d.k, 50)
60 # This varies by platform.
61 self.assertIn(d.k, (50, 2000))
61 62 self.assertEqual(d.d, 16)
62 63
63 64 @make_cffi
64 65 class TestCompressionDict(unittest.TestCase):
65 66 def test_bad_mode(self):
66 67 with self.assertRaisesRegexp(ValueError, 'invalid dictionary load mode'):
67 68 zstd.ZstdCompressionDict(b'foo', dict_type=42)
68 69
69 70 def test_bad_precompute_compress(self):
70 71 d = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)
71 72
72 73 with self.assertRaisesRegexp(ValueError, 'must specify one of level or '):
73 74 d.precompute_compress()
74 75
75 76 with self.assertRaisesRegexp(ValueError, 'must only specify one of level or '):
76 77 d.precompute_compress(level=3,
77 78 compression_params=zstd.CompressionParameters())
78 79
79 80 def test_precompute_compress_rawcontent(self):
80 81 d = zstd.ZstdCompressionDict(b'dictcontent' * 64,
81 82 dict_type=zstd.DICT_TYPE_RAWCONTENT)
82 83 d.precompute_compress(level=1)
83 84
84 85 d = zstd.ZstdCompressionDict(b'dictcontent' * 64,
85 86 dict_type=zstd.DICT_TYPE_FULLDICT)
86 87 with self.assertRaisesRegexp(zstd.ZstdError, 'unable to precompute dictionary'):
87 88 d.precompute_compress(level=1)
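
# A minimal sketch of training and using a dictionary, mirroring the samples
# used by the tests above; real training wants many representative samples.
import zstandard as zstd

samples = []
for i in range(128):
    samples.append(b'foo' * 64)
    samples.append(b'bar' * 64)
    samples.append(b'foobar' * 64)

d = zstd.train_dictionary(8192, samples)

cctx = zstd.ZstdCompressor(level=1, dict_data=d)
dctx = zstd.ZstdDecompressor(dict_data=d)

frame = cctx.compress(b'foobar' * 256)
assert dctx.decompress(frame) == b'foobar' * 256
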
@@ -1,62 +1,65 @@
1 1 # Copyright (c) 2017-present, Gregory Szorc
2 2 # All rights reserved.
3 3 #
4 4 # This software may be modified and distributed under the terms
5 5 # of the BSD license. See the LICENSE file for details.
6 6
7 7 """Python interface to the Zstandard (zstd) compression library."""
8 8
9 9 from __future__ import absolute_import, unicode_literals
10 10
11 11 # This module serves 2 roles:
12 12 #
13 13 # 1) Export the C or CFFI "backend" through a central module.
14 14 # 2) Implement additional functionality built on top of C or CFFI backend.
15 15
16 16 import os
17 17 import platform
18 18
19 19 # Some Python implementations don't support C extensions. That's why we have
20 20 # a CFFI implementation in the first place. The code here imports one of our
21 21 # "backends" then re-exports the symbols from this module. For convenience,
22 22 # we support falling back to the CFFI backend if the C extension can't be
23 23 # imported. But for performance reasons, we only do this on unknown Python
24 24 # implementations. Notably, for CPython we require the C extension by default.
25 25 # Because someone will inevitably want special behavior, the behavior is
26 26 # configurable via an environment variable. A potentially better way to handle
27 27 # this is to import a special ``__importpolicy__`` module or something
28 28 # defining a variable and `setup.py` could write the file with whatever
29 29 # policy was specified at build time. Until someone needs it, we go with
30 30 # the hacky but simple environment variable approach.
31 31 _module_policy = os.environ.get('PYTHON_ZSTANDARD_IMPORT_POLICY', 'default')
32 32
33 33 if _module_policy == 'default':
34 34 if platform.python_implementation() in ('CPython',):
35 35 from zstd import *
36 36 backend = 'cext'
37 37 elif platform.python_implementation() in ('PyPy',):
38 38 from zstd_cffi import *
39 39 backend = 'cffi'
40 40 else:
41 41 try:
42 42 from zstd import *
43 43 backend = 'cext'
44 44 except ImportError:
45 45 from zstd_cffi import *
46 46 backend = 'cffi'
47 47 elif _module_policy == 'cffi_fallback':
48 48 try:
49 49 from zstd import *
50 50 backend = 'cext'
51 51 except ImportError:
52 52 from zstd_cffi import *
53 53 backend = 'cffi'
54 54 elif _module_policy == 'cext':
55 55 from zstd import *
56 56 backend = 'cext'
57 57 elif _module_policy == 'cffi':
58 58 from zstd_cffi import *
59 59 backend = 'cffi'
60 60 else:
61 61 raise ImportError('unknown module import policy: %s; use default, cffi_fallback, '
62 62 'cext, or cffi' % _module_policy)
63
64 # Keep this in sync with python-zstandard.h.
65 __version__ = '0.10.1'
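Because the import policy above is read from the environment at import time, backend selection has to happen before the first import. A small sketch, assuming the package is importable as zstandard:

import os

# Must be set before the module is imported for the policy to take effect.
os.environ['PYTHON_ZSTANDARD_IMPORT_POLICY'] = 'cffi_fallback'

import zstandard

print(zstandard.backend)      # 'cext' if the C extension loaded, otherwise 'cffi'
print(zstandard.__version__)  # '0.10.1', kept in sync with python-zstandard.h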
@@ -1,342 +1,344 b''
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 /* A Python C extension for Zstandard. */
10 10
11 11 #if defined(_WIN32)
12 12 #define WIN32_LEAN_AND_MEAN
13 13 #include <Windows.h>
14 14 #elif defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__DragonFly__)
15 15 #include <sys/types.h>
16 16 #include <sys/sysctl.h>
17 17 #endif
18 18
19 19 #include "python-zstandard.h"
20 20
21 21 PyObject *ZstdError;
22 22
23 23 PyDoc_STRVAR(estimate_decompression_context_size__doc__,
24 24 "estimate_decompression_context_size()\n"
25 25 "\n"
26 26 "Estimate the amount of memory allocated to a decompression context.\n"
27 27 );
28 28
29 29 static PyObject* estimate_decompression_context_size(PyObject* self) {
30 30 return PyLong_FromSize_t(ZSTD_estimateDCtxSize());
31 31 }
32 32
33 33 PyDoc_STRVAR(frame_content_size__doc__,
34 34 "frame_content_size(data)\n"
35 35 "\n"
36 36 "Obtain the decompressed size of a frame."
37 37 );
38 38
39 39 static PyObject* frame_content_size(PyObject* self, PyObject* args, PyObject* kwargs) {
40 40 static char* kwlist[] = {
41 41 "source",
42 42 NULL
43 43 };
44 44
45 45 Py_buffer source;
46 46 PyObject* result = NULL;
47 47 unsigned long long size;
48 48
49 49 #if PY_MAJOR_VERSION >= 3
50 50 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:frame_content_size",
51 51 #else
52 52 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:frame_content_size",
53 53 #endif
54 54 kwlist, &source)) {
55 55 return NULL;
56 56 }
57 57
58 58 if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) {
59 59 PyErr_SetString(PyExc_ValueError,
60 60 "data buffer should be contiguous and have at most one dimension");
61 61 goto finally;
62 62 }
63 63
64 64 size = ZSTD_getFrameContentSize(source.buf, source.len);
65 65
66 66 if (size == ZSTD_CONTENTSIZE_ERROR) {
67 67 PyErr_SetString(ZstdError, "error when determining content size");
68 68 }
69 69 else if (size == ZSTD_CONTENTSIZE_UNKNOWN) {
70 70 result = PyLong_FromLong(-1);
71 71 }
72 72 else {
73 73 result = PyLong_FromUnsignedLongLong(size);
74 74 }
75 75
76 76 finally:
77 77 PyBuffer_Release(&source);
78 78
79 79 return result;
80 80 }
81 81
82 82 PyDoc_STRVAR(frame_header_size__doc__,
83 83 "frame_header_size(data)\n"
84 84 "\n"
85 85 "Obtain the size of a frame header.\n"
86 86 );
87 87
88 88 static PyObject* frame_header_size(PyObject* self, PyObject* args, PyObject* kwargs) {
89 89 static char* kwlist[] = {
90 90 "source",
91 91 NULL
92 92 };
93 93
94 94 Py_buffer source;
95 95 PyObject* result = NULL;
96 96 size_t zresult;
97 97
98 98 #if PY_MAJOR_VERSION >= 3
99 99 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:frame_header_size",
100 100 #else
101 101 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:frame_header_size",
102 102 #endif
103 103 kwlist, &source)) {
104 104 return NULL;
105 105 }
106 106
107 107 if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) {
108 108 PyErr_SetString(PyExc_ValueError,
109 109 "data buffer should be contiguous and have at most one dimension");
110 110 goto finally;
111 111 }
112 112
113 113 zresult = ZSTD_frameHeaderSize(source.buf, source.len);
114 114 if (ZSTD_isError(zresult)) {
115 115 PyErr_Format(ZstdError, "could not determine frame header size: %s",
116 116 ZSTD_getErrorName(zresult));
117 117 }
118 118 else {
119 119 result = PyLong_FromSize_t(zresult);
120 120 }
121 121
122 122 finally:
123 123
124 124 PyBuffer_Release(&source);
125 125
126 126 return result;
127 127 }
128 128
129 129 PyDoc_STRVAR(get_frame_parameters__doc__,
130 130 "get_frame_parameters(data)\n"
131 131 "\n"
132 132 "Obtains a ``FrameParameters`` instance by parsing data.\n");
133 133
134 134 PyDoc_STRVAR(train_dictionary__doc__,
135 135 "train_dictionary(dict_size, samples, k=None, d=None, steps=None,\n"
136 136 " threads=None,notifications=0, dict_id=0, level=0)\n"
137 137 "\n"
138 138 "Train a dictionary from sample data using the COVER algorithm.\n"
139 139 "\n"
140 140 "A compression dictionary of size ``dict_size`` will be created from the\n"
141 141 "iterable of ``samples``. The raw dictionary bytes will be returned.\n"
142 142 "\n"
143 143 "The COVER algorithm has 2 parameters: ``k`` and ``d``. These control the\n"
144 144 "*segment size* and *dmer size*. A reasonable range for ``k`` is\n"
145 145 "``[16, 2048+]``. A reasonable range for ``d`` is ``[6, 16]``.\n"
146 146 "``d`` must be less than or equal to ``k``.\n"
147 147 "\n"
148 148 "``steps`` can be specified to control the number of steps through potential\n"
149 149 "values of ``k`` and ``d`` to try. ``k`` and ``d`` will only be varied if\n"
150 150 "those arguments are not defined. i.e. if ``d`` is ``8``, then only ``k``\n"
151 151 "will be varied in this mode.\n"
152 152 "\n"
153 153 "``threads`` can specify how many threads to use to test various ``k`` and\n"
154 154 "``d`` values. ``-1`` will use as many threads as available CPUs. By default,\n"
155 155 "a single thread is used.\n"
156 156 "\n"
157 157 "When ``k`` and ``d`` are not defined, default values are used and the\n"
158 158 "algorithm will perform multiple iterations - or steps - to try to find\n"
159 159 "ideal parameters. If both ``k`` and ``d`` are specified, then those values\n"
160 160 "will be used. ``steps`` or ``threads`` triggers optimization mode to test\n"
161 161 "multiple ``k`` and ``d`` variations.\n"
162 162 );
163 163
164 164 static char zstd_doc[] = "Interface to zstandard";
165 165
166 166 static PyMethodDef zstd_methods[] = {
167 167 { "estimate_decompression_context_size", (PyCFunction)estimate_decompression_context_size,
168 168 METH_NOARGS, estimate_decompression_context_size__doc__ },
169 169 { "frame_content_size", (PyCFunction)frame_content_size,
170 170 METH_VARARGS | METH_KEYWORDS, frame_content_size__doc__ },
171 171 { "frame_header_size", (PyCFunction)frame_header_size,
172 172 METH_VARARGS | METH_KEYWORDS, frame_header_size__doc__ },
173 173 { "get_frame_parameters", (PyCFunction)get_frame_parameters,
174 174 METH_VARARGS | METH_KEYWORDS, get_frame_parameters__doc__ },
175 175 { "train_dictionary", (PyCFunction)train_dictionary,
176 176 METH_VARARGS | METH_KEYWORDS, train_dictionary__doc__ },
177 177 { NULL, NULL }
178 178 };
179 179
180 180 void bufferutil_module_init(PyObject* mod);
181 181 void compressobj_module_init(PyObject* mod);
182 182 void compressor_module_init(PyObject* mod);
183 183 void compressionparams_module_init(PyObject* mod);
184 184 void constants_module_init(PyObject* mod);
185 void compressionchunker_module_init(PyObject* mod);
185 186 void compressiondict_module_init(PyObject* mod);
186 187 void compressionreader_module_init(PyObject* mod);
187 188 void compressionwriter_module_init(PyObject* mod);
188 189 void compressoriterator_module_init(PyObject* mod);
189 190 void decompressor_module_init(PyObject* mod);
190 191 void decompressobj_module_init(PyObject* mod);
191 192 void decompressionreader_module_init(PyObject *mod);
192 193 void decompressionwriter_module_init(PyObject* mod);
193 194 void decompressoriterator_module_init(PyObject* mod);
194 195 void frameparams_module_init(PyObject* mod);
195 196
196 197 void zstd_module_init(PyObject* m) {
197 198 /* python-zstandard relies on unstable zstd C API features. This means
198 199 that changes in zstd may break expectations in python-zstandard.
199 200
200 201 python-zstandard is distributed with a copy of the zstd sources.
201 202 python-zstandard is only guaranteed to work with the bundled version
202 203 of zstd.
203 204
204 205 However, downstream redistributors or packagers may unbundle zstd
205 206 from python-zstandard. This can result in a mismatch between zstd
206 207 versions and API semantics. This essentially "voids the warranty"
207 208 of python-zstandard and may cause undefined behavior.
208 209
209 210 We detect this mismatch here and refuse to load the module if this
210 211 scenario is detected.
211 212 */
212 if (ZSTD_VERSION_NUMBER != 10304 || ZSTD_versionNumber() != 10304) {
213 if (ZSTD_VERSION_NUMBER != 10306 || ZSTD_versionNumber() != 10306) {
213 214 PyErr_SetString(PyExc_ImportError, "zstd C API mismatch; Python bindings not compiled against expected zstd version");
214 215 return;
215 216 }
216 217
217 218 bufferutil_module_init(m);
218 219 compressionparams_module_init(m);
219 220 compressiondict_module_init(m);
220 221 compressobj_module_init(m);
221 222 compressor_module_init(m);
223 compressionchunker_module_init(m);
222 224 compressionreader_module_init(m);
223 225 compressionwriter_module_init(m);
224 226 compressoriterator_module_init(m);
225 227 constants_module_init(m);
226 228 decompressor_module_init(m);
227 229 decompressobj_module_init(m);
228 230 decompressionreader_module_init(m);
229 231 decompressionwriter_module_init(m);
230 232 decompressoriterator_module_init(m);
231 233 frameparams_module_init(m);
232 234 }
233 235
234 236 #if defined(__GNUC__) && (__GNUC__ >= 4)
235 237 # define PYTHON_ZSTD_VISIBILITY __attribute__ ((visibility ("default")))
236 238 #else
237 239 # define PYTHON_ZSTD_VISIBILITY
238 240 #endif
239 241
240 242 #if PY_MAJOR_VERSION >= 3
241 243 static struct PyModuleDef zstd_module = {
242 244 PyModuleDef_HEAD_INIT,
243 245 "zstd",
244 246 zstd_doc,
245 247 -1,
246 248 zstd_methods
247 249 };
248 250
249 251 PYTHON_ZSTD_VISIBILITY PyMODINIT_FUNC PyInit_zstd(void) {
250 252 PyObject *m = PyModule_Create(&zstd_module);
251 253 if (m) {
252 254 zstd_module_init(m);
253 255 if (PyErr_Occurred()) {
254 256 Py_DECREF(m);
255 257 m = NULL;
256 258 }
257 259 }
258 260 return m;
259 261 }
260 262 #else
261 263 PYTHON_ZSTD_VISIBILITY PyMODINIT_FUNC initzstd(void) {
262 264 PyObject *m = Py_InitModule3("zstd", zstd_methods, zstd_doc);
263 265 if (m) {
264 266 zstd_module_init(m);
265 267 }
266 268 }
267 269 #endif
268 270
269 271 /* Attempt to resolve the number of CPUs in the system. */
270 272 int cpu_count() {
271 273 int count = 0;
272 274
273 275 #if defined(_WIN32)
274 276 SYSTEM_INFO si;
275 277 si.dwNumberOfProcessors = 0;
276 278 GetSystemInfo(&si);
277 279 count = si.dwNumberOfProcessors;
278 280 #elif defined(__APPLE__)
279 281 int num;
280 282 size_t size = sizeof(int);
281 283
282 284 if (0 == sysctlbyname("hw.logicalcpu", &num, &size, NULL, 0)) {
283 285 count = num;
284 286 }
285 287 #elif defined(__linux__)
286 288 count = sysconf(_SC_NPROCESSORS_ONLN);
287 289 #elif defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__DragonFly__)
288 290 int mib[2];
289 291 size_t len = sizeof(count);
290 292 mib[0] = CTL_HW;
291 293 mib[1] = HW_NCPU;
292 294 if (0 != sysctl(mib, 2, &count, &len, NULL, 0)) {
293 295 count = 0;
294 296 }
295 297 #elif defined(__hpux)
296 298 count = mpctl(MPC_GETNUMSPUS, NULL, NULL);
297 299 #endif
298 300
299 301 return count;
300 302 }
301 303
302 304 size_t roundpow2(size_t i) {
303 305 i--;
304 306 i |= i >> 1;
305 307 i |= i >> 2;
306 308 i |= i >> 4;
307 309 i |= i >> 8;
308 310 i |= i >> 16;
309 311 i++;
310 312
311 313 return i;
312 314 }
313 315
314 316 /* Safer version of _PyBytes_Resize().
315 317 *
316 318 * _PyBytes_Resize() only works if the refcount is 1. In some scenarios,
317 319 * we can get an object with a refcount > 1, even if it was just created
318 320 * with PyBytes_FromStringAndSize()! That's because (at least) CPython
319 321 * pre-allocates PyBytes instances of size 1 for every possible byte value.
320 322 *
321 323 * If non-0 is returned, obj may or may not be NULL.
322 324 */
323 325 int safe_pybytes_resize(PyObject** obj, Py_ssize_t size) {
324 326 PyObject* tmp;
325 327
326 328 if ((*obj)->ob_refcnt == 1) {
327 329 return _PyBytes_Resize(obj, size);
328 330 }
329 331
330 332 tmp = PyBytes_FromStringAndSize(NULL, size);
331 333 if (!tmp) {
332 334 return -1;
333 335 }
334 336
335 337 memcpy(PyBytes_AS_STRING(tmp), PyBytes_AS_STRING(*obj),
336 338 PyBytes_GET_SIZE(*obj));
337 339
338 340 Py_DECREF(*obj);
339 341 *obj = tmp;
340 342
341 343 return 0;
342 344 } No newline at end of file
@@ -1,471 +1,458 b''
1 1 /* ******************************************************************
2 2 bitstream
3 3 Part of FSE library
4 header file (to include)
5 Copyright (C) 2013-2017, Yann Collet.
4 Copyright (C) 2013-present, Yann Collet.
6 5
7 6 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
8 7
9 8 Redistribution and use in source and binary forms, with or without
10 9 modification, are permitted provided that the following conditions are
11 10 met:
12 11
13 12 * Redistributions of source code must retain the above copyright
14 13 notice, this list of conditions and the following disclaimer.
15 14 * Redistributions in binary form must reproduce the above
16 15 copyright notice, this list of conditions and the following disclaimer
17 16 in the documentation and/or other materials provided with the
18 17 distribution.
19 18
20 19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 20 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 21 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 22 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 23 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 24 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 25 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 26 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 27 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 28 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 29 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 30
32 31 You can contact the author at :
33 32 - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
34 33 ****************************************************************** */
35 34 #ifndef BITSTREAM_H_MODULE
36 35 #define BITSTREAM_H_MODULE
37 36
38 37 #if defined (__cplusplus)
39 38 extern "C" {
40 39 #endif
41 40
42 41 /*
43 42 * This API consists of small unitary functions, which must be inlined for best performance.
44 43 * Since link-time-optimization is not available for all compilers,
45 44 * these functions are defined into a .h to be included.
46 45 */
47 46
48 47 /*-****************************************
49 48 * Dependencies
50 49 ******************************************/
51 50 #include "mem.h" /* unaligned access routines */
51 #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */
52 52 #include "error_private.h" /* error codes and messages */
53 53
54 54
55 /*-*************************************
56 * Debug
57 ***************************************/
58 #if defined(BIT_DEBUG) && (BIT_DEBUG>=1)
59 # include <assert.h>
60 #else
61 # ifndef assert
62 # define assert(condition) ((void)0)
63 # endif
64 #endif
65
66
67 55 /*=========================================
68 56 * Target specific
69 57 =========================================*/
70 58 #if defined(__BMI__) && defined(__GNUC__)
71 59 # include <immintrin.h> /* support for bextr (experimental) */
72 60 #endif
73 61
74 62 #define STREAM_ACCUMULATOR_MIN_32 25
75 63 #define STREAM_ACCUMULATOR_MIN_64 57
76 64 #define STREAM_ACCUMULATOR_MIN ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
77 65
78 66
79 67 /*-******************************************
80 68 * bitStream encoding API (write forward)
81 69 ********************************************/
82 70 /* bitStream can mix input from multiple sources.
83 71 * A critical property of these streams is that they encode and decode in **reverse** direction.
84 72 * So the first bit sequence you add will be the last to be read, like a LIFO stack.
85 73 */
86 typedef struct
87 {
74 typedef struct {
88 75 size_t bitContainer;
89 76 unsigned bitPos;
90 77 char* startPtr;
91 78 char* ptr;
92 79 char* endPtr;
93 80 } BIT_CStream_t;
94 81
95 82 MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
96 83 MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
97 84 MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC);
98 85 MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
99 86
100 87 /* Start with initCStream, providing the size of buffer to write into.
101 88 * bitStream will never write outside of this buffer.
102 89 * `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code.
103 90 *
104 91 * bits are first added to a local register.
105 92 * Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
106 93 * Writing data into memory is an explicit operation, performed by the flushBits function.
107 94  * Hence keep track of how many bits are potentially stored into the local register to avoid register overflow.
108 95 * After a flushBits, a maximum of 7 bits might still be stored into local register.
109 96 *
110 97 * Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
111 98 *
112 99 * Last operation is to close the bitStream.
113 100 * The function returns the final size of CStream in bytes.
114 101 * If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
115 102 */
116 103
117 104
118 105 /*-********************************************
119 106 * bitStream decoding API (read backward)
120 107 **********************************************/
121 typedef struct
122 {
108 typedef struct {
123 109 size_t bitContainer;
124 110 unsigned bitsConsumed;
125 111 const char* ptr;
126 112 const char* start;
127 113 const char* limitPtr;
128 114 } BIT_DStream_t;
129 115
130 116 typedef enum { BIT_DStream_unfinished = 0,
131 117 BIT_DStream_endOfBuffer = 1,
132 118 BIT_DStream_completed = 2,
133 119 BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */
134 120 /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
135 121
136 122 MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
137 123 MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
138 124 MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
139 125 MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
140 126
141 127
142 128 /* Start by invoking BIT_initDStream().
143 129 * A chunk of the bitStream is then stored into a local register.
144 130 * Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
145 131 * You can then retrieve bitFields stored into the local register, **in reverse order**.
146 132 * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
147 133  * A reload guarantees a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
148 134 * Otherwise, it can be less than that, so proceed accordingly.
149 135 * Checking if DStream has reached its end can be performed with BIT_endOfDStream().
150 136 */
151 137
152 138
153 139 /*-****************************************
154 140 * unsafe API
155 141 ******************************************/
156 142 MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
157 143 /* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
158 144
159 145 MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
160 146 /* unsafe version; does not check buffer overflow */
161 147
162 148 MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
163 149 /* faster, but works only if nbBits >= 1 */
164 150
165 151
166 152
167 153 /*-**************************************************************
168 154 * Internal functions
169 155 ****************************************************************/
170 156 MEM_STATIC unsigned BIT_highbit32 (U32 val)
171 157 {
172 158 assert(val != 0);
173 159 {
174 160 # if defined(_MSC_VER) /* Visual */
175 161 unsigned long r=0;
176 162 _BitScanReverse ( &r, val );
177 163 return (unsigned) r;
178 164 # elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */
179 165 return 31 - __builtin_clz (val);
180 166 # else /* Software version */
181 167 static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29,
182 168 11, 14, 16, 18, 22, 25, 3, 30,
183 169 8, 12, 20, 28, 15, 17, 24, 7,
184 170 19, 27, 23, 6, 26, 5, 4, 31 };
185 171 U32 v = val;
186 172 v |= v >> 1;
187 173 v |= v >> 2;
188 174 v |= v >> 4;
189 175 v |= v >> 8;
190 176 v |= v >> 16;
191 177 return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
192 178 # endif
193 179 }
194 180 }
195 181
196 182 /*===== Local Constants =====*/
197 183 static const unsigned BIT_mask[] = {
198 184 0, 1, 3, 7, 0xF, 0x1F,
199 185 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
200 186 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF,
201 187 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,
202 188 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
203 189 0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */
204 190 #define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0]))
205 191
206 192 /*-**************************************************************
207 193 * bitStream encoding
208 194 ****************************************************************/
209 195 /*! BIT_initCStream() :
210 196 * `dstCapacity` must be > sizeof(size_t)
211 197 * @return : 0 if success,
212 198 * otherwise an error code (can be tested using ERR_isError()) */
213 199 MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
214 200 void* startPtr, size_t dstCapacity)
215 201 {
216 202 bitC->bitContainer = 0;
217 203 bitC->bitPos = 0;
218 204 bitC->startPtr = (char*)startPtr;
219 205 bitC->ptr = bitC->startPtr;
220 206 bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer);
221 207 if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall);
222 208 return 0;
223 209 }
224 210
225 211 /*! BIT_addBits() :
226 212 * can add up to 31 bits into `bitC`.
227 213 * Note : does not check for register overflow ! */
228 214 MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
229 215 size_t value, unsigned nbBits)
230 216 {
231 217 MEM_STATIC_ASSERT(BIT_MASK_SIZE == 32);
232 218 assert(nbBits < BIT_MASK_SIZE);
233 219 assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
234 220 bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
235 221 bitC->bitPos += nbBits;
236 222 }
237 223
238 224 /*! BIT_addBitsFast() :
239 * works only if `value` is _clean_, meaning all high bits above nbBits are 0 */
225 * works only if `value` is _clean_,
226 * meaning all high bits above nbBits are 0 */
240 227 MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC,
241 228 size_t value, unsigned nbBits)
242 229 {
243 230 assert((value>>nbBits) == 0);
244 231 assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
245 232 bitC->bitContainer |= value << bitC->bitPos;
246 233 bitC->bitPos += nbBits;
247 234 }
248 235
249 236 /*! BIT_flushBitsFast() :
250 237 * assumption : bitContainer has not overflowed
251 238 * unsafe version; does not check buffer overflow */
252 239 MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC)
253 240 {
254 241 size_t const nbBytes = bitC->bitPos >> 3;
255 242 assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
256 243 MEM_writeLEST(bitC->ptr, bitC->bitContainer);
257 244 bitC->ptr += nbBytes;
258 245 assert(bitC->ptr <= bitC->endPtr);
259 246 bitC->bitPos &= 7;
260 247 bitC->bitContainer >>= nbBytes*8;
261 248 }
262 249
263 250 /*! BIT_flushBits() :
264 251 * assumption : bitContainer has not overflowed
265 252  * safe version; checks for buffer overflow and prevents it.
266 253 * note : does not signal buffer overflow.
267 254 * overflow will be revealed later on using BIT_closeCStream() */
268 255 MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC)
269 256 {
270 257 size_t const nbBytes = bitC->bitPos >> 3;
271 258 assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
272 259 MEM_writeLEST(bitC->ptr, bitC->bitContainer);
273 260 bitC->ptr += nbBytes;
274 261 if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
275 262 bitC->bitPos &= 7;
276 263 bitC->bitContainer >>= nbBytes*8;
277 264 }
278 265
279 266 /*! BIT_closeCStream() :
280 267 * @return : size of CStream, in bytes,
281 268 * or 0 if it could not fit into dstBuffer */
282 269 MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC)
283 270 {
284 271 BIT_addBitsFast(bitC, 1, 1); /* endMark */
285 272 BIT_flushBits(bitC);
286 273 if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
287 274 return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
288 275 }
289 276
290 277
291 278 /*-********************************************************
292 279 * bitStream decoding
293 280 **********************************************************/
294 281 /*! BIT_initDStream() :
295 282 * Initialize a BIT_DStream_t.
296 283 * `bitD` : a pointer to an already allocated BIT_DStream_t structure.
297 284 * `srcSize` must be the *exact* size of the bitStream, in bytes.
298 285 * @return : size of stream (== srcSize), or an errorCode if a problem is detected
299 286 */
300 287 MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
301 288 {
302 289 if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
303 290
304 291 bitD->start = (const char*)srcBuffer;
305 292 bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer);
306 293
307 294 if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */
308 295 bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
309 296 bitD->bitContainer = MEM_readLEST(bitD->ptr);
310 297 { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
311 298 bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */
312 299 if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
313 300 } else {
314 301 bitD->ptr = bitD->start;
315 302 bitD->bitContainer = *(const BYTE*)(bitD->start);
316 303 switch(srcSize)
317 304 {
318 305 case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
319 306 /* fall-through */
320 307
321 308 case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
322 309 /* fall-through */
323 310
324 311 case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
325 312 /* fall-through */
326 313
327 314 case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24;
328 315 /* fall-through */
329 316
330 317 case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16;
331 318 /* fall-through */
332 319
333 320 case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8;
334 321 /* fall-through */
335 322
336 323 default: break;
337 324 }
338 325 { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
339 326 bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
340 327 if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */
341 328 }
342 329 bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
343 330 }
344 331
345 332 return srcSize;
346 333 }
347 334
348 335 MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
349 336 {
350 337 return bitContainer >> start;
351 338 }
352 339
353 340 MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
354 341 {
355 342 #if defined(__BMI__) && defined(__GNUC__) && __GNUC__*1000+__GNUC_MINOR__ >= 4008 /* experimental */
356 343 # if defined(__x86_64__)
357 344 if (sizeof(bitContainer)==8)
358 345 return _bextr_u64(bitContainer, start, nbBits);
359 346 else
360 347 # endif
361 348 return _bextr_u32(bitContainer, start, nbBits);
362 349 #else
363 350 assert(nbBits < BIT_MASK_SIZE);
364 351 return (bitContainer >> start) & BIT_mask[nbBits];
365 352 #endif
366 353 }
367 354
368 355 MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
369 356 {
370 357 assert(nbBits < BIT_MASK_SIZE);
371 358 return bitContainer & BIT_mask[nbBits];
372 359 }
373 360
374 361 /*! BIT_lookBits() :
375 362 * Provides next n bits from local register.
376 363 * local register is not modified.
377 364 * On 32-bits, maxNbBits==24.
378 365 * On 64-bits, maxNbBits==56.
379 366 * @return : value extracted */
380 367 MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits)
381 368 {
382 369 #if defined(__BMI__) && defined(__GNUC__) /* experimental; fails if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8 */
383 370 return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits);
384 371 #else
385 372 U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
386 373 return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask);
387 374 #endif
388 375 }
389 376
390 377 /*! BIT_lookBitsFast() :
391 378 * unsafe version; only works if nbBits >= 1 */
392 379 MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
393 380 {
394 381 U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
395 382 assert(nbBits >= 1);
396 383 return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
397 384 }
398 385
399 386 MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
400 387 {
401 388 bitD->bitsConsumed += nbBits;
402 389 }
403 390
404 391 /*! BIT_readBits() :
405 392 * Read (consume) next n bits from local register and update.
406 393 * Pay attention to not read more than nbBits contained into local register.
407 394 * @return : extracted value. */
408 395 MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)
409 396 {
410 397 size_t const value = BIT_lookBits(bitD, nbBits);
411 398 BIT_skipBits(bitD, nbBits);
412 399 return value;
413 400 }
414 401
415 402 /*! BIT_readBitsFast() :
416 403  * unsafe version; works only if nbBits >= 1 */
417 404 MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
418 405 {
419 406 size_t const value = BIT_lookBitsFast(bitD, nbBits);
420 407 assert(nbBits >= 1);
421 408 BIT_skipBits(bitD, nbBits);
422 409 return value;
423 410 }
424 411
425 412 /*! BIT_reloadDStream() :
426 413 * Refill `bitD` from buffer previously set in BIT_initDStream() .
427 414 * This function is safe, it guarantees it will not read beyond src buffer.
428 415 * @return : status of `BIT_DStream_t` internal register.
429 416 * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
430 417 MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
431 418 {
432 419 if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */
433 420 return BIT_DStream_overflow;
434 421
435 422 if (bitD->ptr >= bitD->limitPtr) {
436 423 bitD->ptr -= bitD->bitsConsumed >> 3;
437 424 bitD->bitsConsumed &= 7;
438 425 bitD->bitContainer = MEM_readLEST(bitD->ptr);
439 426 return BIT_DStream_unfinished;
440 427 }
441 428 if (bitD->ptr == bitD->start) {
442 429 if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
443 430 return BIT_DStream_completed;
444 431 }
445 432 /* start < ptr < limitPtr */
446 433 { U32 nbBytes = bitD->bitsConsumed >> 3;
447 434 BIT_DStream_status result = BIT_DStream_unfinished;
448 435 if (bitD->ptr - nbBytes < bitD->start) {
449 436 nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */
450 437 result = BIT_DStream_endOfBuffer;
451 438 }
452 439 bitD->ptr -= nbBytes;
453 440 bitD->bitsConsumed -= nbBytes*8;
454 441 bitD->bitContainer = MEM_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */
455 442 return result;
456 443 }
457 444 }
458 445
459 446 /*! BIT_endOfDStream() :
460 447 * @return : 1 if DStream has _exactly_ reached its end (all bits consumed).
461 448 */
462 449 MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
463 450 {
464 451 return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
465 452 }
466 453
467 454 #if defined (__cplusplus)
468 455 }
469 456 #endif
470 457
471 458 #endif /* BITSTREAM_H_MODULE */
@@ -1,111 +1,133 b''
1 1 /*
2 2 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
3 3 * All rights reserved.
4 4 *
5 5 * This source code is licensed under both the BSD-style license (found in the
6 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 7 * in the COPYING file in the root directory of this source tree).
8 8 * You may select, at your option, one of the above-listed licenses.
9 9 */
10 10
11 11 #ifndef ZSTD_COMPILER_H
12 12 #define ZSTD_COMPILER_H
13 13
14 14 /*-*******************************************************
15 15 * Compiler specifics
16 16 *********************************************************/
17 17 /* force inlining */
18 18 #if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
19 19 # define INLINE_KEYWORD inline
20 20 #else
21 21 # define INLINE_KEYWORD
22 22 #endif
23 23
24 24 #if defined(__GNUC__)
25 25 # define FORCE_INLINE_ATTR __attribute__((always_inline))
26 26 #elif defined(_MSC_VER)
27 27 # define FORCE_INLINE_ATTR __forceinline
28 28 #else
29 29 # define FORCE_INLINE_ATTR
30 30 #endif
31 31
32 32 /**
33 33 * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
34 34  * parameters. They must be inlined for the compiler to eliminate the constant
35 35 * branches.
36 36 */
37 37 #define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
38 38 /**
39 39 * HINT_INLINE is used to help the compiler generate better code. It is *not*
40 40 * used for "templates", so it can be tweaked based on the compilers
41 41 * performance.
42 42 *
43 43 * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the
44 44 * always_inline attribute.
45 45 *
46 46  * clang up to 5.0.0 (trunk) benefits tremendously from the always_inline
47 47 * attribute.
48 48 */
49 49 #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
50 50 # define HINT_INLINE static INLINE_KEYWORD
51 51 #else
52 52 # define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
53 53 #endif
54 54
55 55 /* force no inlining */
56 56 #ifdef _MSC_VER
57 57 # define FORCE_NOINLINE static __declspec(noinline)
58 58 #else
59 59 # ifdef __GNUC__
60 60 # define FORCE_NOINLINE static __attribute__((__noinline__))
61 61 # else
62 62 # define FORCE_NOINLINE static
63 63 # endif
64 64 #endif
65 65
66 66 /* target attribute */
67 67 #ifndef __has_attribute
68 68 #define __has_attribute(x) 0 /* Compatibility with non-clang compilers. */
69 69 #endif
70 70 #if defined(__GNUC__)
71 71 # define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
72 72 #else
73 73 # define TARGET_ATTRIBUTE(target)
74 74 #endif
75 75
76 76 /* Enable runtime BMI2 dispatch based on the CPU.
77 77 * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
78 78 */
79 79 #ifndef DYNAMIC_BMI2
80 #if (defined(__clang__) && __has_attribute(__target__)) \
80 #if ((defined(__clang__) && __has_attribute(__target__)) \
81 81 || (defined(__GNUC__) \
82 && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) \
82 && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
83 83 && (defined(__x86_64__) || defined(_M_X86)) \
84 84 && !defined(__BMI2__)
85 85 # define DYNAMIC_BMI2 1
86 86 #else
87 87 # define DYNAMIC_BMI2 0
88 88 #endif
89 89 #endif
90 90
91 /* prefetch */
92 #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
93 # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
94 # define PREFETCH(ptr) _mm_prefetch((const char*)ptr, _MM_HINT_T0)
95 #elif defined(__GNUC__)
96 # define PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0)
91 /* prefetch
92  * can be disabled by declaring the NO_PREFETCH macro.
93 * All prefetch invocations use a single default locality 2,
94 * generating instruction prefetcht1,
95 * which, according to Intel, means "load data into L2 cache".
96 * This is a good enough "middle ground" for the time being,
97 * though in theory, it would be better to specialize locality depending on data being prefetched.
98 * Tests could not determine any sensible difference based on locality value. */
99 #if defined(NO_PREFETCH)
100 # define PREFETCH(ptr) (void)(ptr) /* disabled */
97 101 #else
98 # define PREFETCH(ptr) /* disabled */
99 #endif
102 # if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
103 # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
104 # define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
105 # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
106 # define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
107 # else
108 # define PREFETCH(ptr) (void)(ptr) /* disabled */
109 # endif
110 #endif /* NO_PREFETCH */
111
112 #define CACHELINE_SIZE 64
113
114 #define PREFETCH_AREA(p, s) { \
115 const char* const _ptr = (const char*)(p); \
116 size_t const _size = (size_t)(s); \
117 size_t _pos; \
118 for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \
119 PREFETCH(_ptr + _pos); \
120 } \
121 }
100 122
101 123 /* disable warnings */
102 124 #ifdef _MSC_VER /* Visual Studio */
103 125 # include <intrin.h> /* For Visual 2005 */
104 126 # pragma warning(disable : 4100) /* disable: C4100: unreferenced formal parameter */
105 127 # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
106 128 # pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */
107 129 # pragma warning(disable : 4214) /* disable: C4214: non-int bitfields */
108 130 # pragma warning(disable : 4324) /* disable: C4324: padded structure */
109 131 #endif
110 132
111 133 #endif /* ZSTD_COMPILER_H */
@@ -1,216 +1,215 b''
1 1 /*
2 2 * Copyright (c) 2018-present, Facebook, Inc.
3 3 * All rights reserved.
4 4 *
5 5 * This source code is licensed under both the BSD-style license (found in the
6 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 7 * in the COPYING file in the root directory of this source tree).
8 8 * You may select, at your option, one of the above-listed licenses.
9 9 */
10 10
11 11 #ifndef ZSTD_COMMON_CPU_H
12 12 #define ZSTD_COMMON_CPU_H
13 13
14 14 /**
15 15 * Implementation taken from folly/CpuId.h
16 16 * https://github.com/facebook/folly/blob/master/folly/CpuId.h
17 17 */
18 18
19 19 #include <string.h>
20 20
21 21 #include "mem.h"
22 22
23 23 #ifdef _MSC_VER
24 24 #include <intrin.h>
25 25 #endif
26 26
27 27 typedef struct {
28 28 U32 f1c;
29 29 U32 f1d;
30 30 U32 f7b;
31 31 U32 f7c;
32 32 } ZSTD_cpuid_t;
33 33
34 34 MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
35 35 U32 f1c = 0;
36 36 U32 f1d = 0;
37 37 U32 f7b = 0;
38 38 U32 f7c = 0;
39 #ifdef _MSC_VER
39 #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
40 40 int reg[4];
41 41 __cpuid((int*)reg, 0);
42 42 {
43 43 int const n = reg[0];
44 44 if (n >= 1) {
45 45 __cpuid((int*)reg, 1);
46 46 f1c = (U32)reg[2];
47 47 f1d = (U32)reg[3];
48 48 }
49 49 if (n >= 7) {
50 50 __cpuidex((int*)reg, 7, 0);
51 51 f7b = (U32)reg[1];
52 52 f7c = (U32)reg[2];
53 53 }
54 54 }
55 55 #elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
56 56 /* The following block like the normal cpuid branch below, but gcc
57 57 * reserves ebx for use of its pic register so we must specially
58 58 * handle the save and restore to avoid clobbering the register
59 59 */
60 60 U32 n;
61 61 __asm__(
62 62 "pushl %%ebx\n\t"
63 63 "cpuid\n\t"
64 64 "popl %%ebx\n\t"
65 65 : "=a"(n)
66 66 : "a"(0)
67 67 : "ecx", "edx");
68 68 if (n >= 1) {
69 69 U32 f1a;
70 70 __asm__(
71 71 "pushl %%ebx\n\t"
72 72 "cpuid\n\t"
73 73 "popl %%ebx\n\t"
74 74 : "=a"(f1a), "=c"(f1c), "=d"(f1d)
75 : "a"(1)
76 :);
75 : "a"(1));
77 76 }
78 77 if (n >= 7) {
79 78 __asm__(
80 79 "pushl %%ebx\n\t"
81 80 "cpuid\n\t"
82 81 "movl %%ebx, %%eax\n\r"
83 82 "popl %%ebx"
84 83 : "=a"(f7b), "=c"(f7c)
85 84 : "a"(7), "c"(0)
86 85 : "edx");
87 86 }
88 87 #elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
89 88 U32 n;
90 89 __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx");
91 90 if (n >= 1) {
92 91 U32 f1a;
93 92 __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx");
94 93 }
95 94 if (n >= 7) {
96 95 U32 f7a;
97 96 __asm__("cpuid"
98 97 : "=a"(f7a), "=b"(f7b), "=c"(f7c)
99 98 : "a"(7), "c"(0)
100 99 : "edx");
101 100 }
102 101 #endif
103 102 {
104 103 ZSTD_cpuid_t cpuid;
105 104 cpuid.f1c = f1c;
106 105 cpuid.f1d = f1d;
107 106 cpuid.f7b = f7b;
108 107 cpuid.f7c = f7c;
109 108 return cpuid;
110 109 }
111 110 }
112 111
113 112 #define X(name, r, bit) \
114 113 MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) { \
115 114 return ((cpuid.r) & (1U << bit)) != 0; \
116 115 }
117 116
118 117 /* cpuid(1): Processor Info and Feature Bits. */
119 118 #define C(name, bit) X(name, f1c, bit)
120 119 C(sse3, 0)
121 120 C(pclmuldq, 1)
122 121 C(dtes64, 2)
123 122 C(monitor, 3)
124 123 C(dscpl, 4)
125 124 C(vmx, 5)
126 125 C(smx, 6)
127 126 C(eist, 7)
128 127 C(tm2, 8)
129 128 C(ssse3, 9)
130 129 C(cnxtid, 10)
131 130 C(fma, 12)
132 131 C(cx16, 13)
133 132 C(xtpr, 14)
134 133 C(pdcm, 15)
135 134 C(pcid, 17)
136 135 C(dca, 18)
137 136 C(sse41, 19)
138 137 C(sse42, 20)
139 138 C(x2apic, 21)
140 139 C(movbe, 22)
141 140 C(popcnt, 23)
142 141 C(tscdeadline, 24)
143 142 C(aes, 25)
144 143 C(xsave, 26)
145 144 C(osxsave, 27)
146 145 C(avx, 28)
147 146 C(f16c, 29)
148 147 C(rdrand, 30)
149 148 #undef C
150 149 #define D(name, bit) X(name, f1d, bit)
151 150 D(fpu, 0)
152 151 D(vme, 1)
153 152 D(de, 2)
154 153 D(pse, 3)
155 154 D(tsc, 4)
156 155 D(msr, 5)
157 156 D(pae, 6)
158 157 D(mce, 7)
159 158 D(cx8, 8)
160 159 D(apic, 9)
161 160 D(sep, 11)
162 161 D(mtrr, 12)
163 162 D(pge, 13)
164 163 D(mca, 14)
165 164 D(cmov, 15)
166 165 D(pat, 16)
167 166 D(pse36, 17)
168 167 D(psn, 18)
169 168 D(clfsh, 19)
170 169 D(ds, 21)
171 170 D(acpi, 22)
172 171 D(mmx, 23)
173 172 D(fxsr, 24)
174 173 D(sse, 25)
175 174 D(sse2, 26)
176 175 D(ss, 27)
177 176 D(htt, 28)
178 177 D(tm, 29)
179 178 D(pbe, 31)
180 179 #undef D
181 180
182 181 /* cpuid(7): Extended Features. */
183 182 #define B(name, bit) X(name, f7b, bit)
184 183 B(bmi1, 3)
185 184 B(hle, 4)
186 185 B(avx2, 5)
187 186 B(smep, 7)
188 187 B(bmi2, 8)
189 188 B(erms, 9)
190 189 B(invpcid, 10)
191 190 B(rtm, 11)
192 191 B(mpx, 14)
193 192 B(avx512f, 16)
194 193 B(avx512dq, 17)
195 194 B(rdseed, 18)
196 195 B(adx, 19)
197 196 B(smap, 20)
198 197 B(avx512ifma, 21)
199 198 B(pcommit, 22)
200 199 B(clflushopt, 23)
201 200 B(clwb, 24)
202 201 B(avx512pf, 26)
203 202 B(avx512er, 27)
204 203 B(avx512cd, 28)
205 204 B(sha, 29)
206 205 B(avx512bw, 30)
207 206 B(avx512vl, 31)
208 207 #undef B
209 208 #define C(name, bit) X(name, f7c, bit)
210 209 C(prefetchwt1, 0)
211 210 C(avx512vbmi, 1)
212 211 #undef C
213 212
214 213 #undef X
215 214
216 215 #endif /* ZSTD_COMMON_CPU_H */
@@ -1,221 +1,236 b''
1 1 /*
2 2 Common functions of New Generation Entropy library
3 3 Copyright (C) 2016, Yann Collet.
4 4
5 5 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
6 6
7 7 Redistribution and use in source and binary forms, with or without
8 8 modification, are permitted provided that the following conditions are
9 9 met:
10 10
11 11 * Redistributions of source code must retain the above copyright
12 12 notice, this list of conditions and the following disclaimer.
13 13 * Redistributions in binary form must reproduce the above
14 14 copyright notice, this list of conditions and the following disclaimer
15 15 in the documentation and/or other materials provided with the
16 16 distribution.
17 17
18 18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 19 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 22 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 29
30 30 You can contact the author at :
31 31 - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
32 32 - Public forum : https://groups.google.com/forum/#!forum/lz4c
33 33 *************************************************************************** */
34 34
35 35 /* *************************************
36 36 * Dependencies
37 37 ***************************************/
38 38 #include "mem.h"
39 39 #include "error_private.h" /* ERR_*, ERROR */
40 40 #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */
41 41 #include "fse.h"
42 42 #define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */
43 43 #include "huf.h"
44 44
45 45
46 46 /*=== Version ===*/
47 47 unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
48 48
49 49
50 50 /*=== Error Management ===*/
51 51 unsigned FSE_isError(size_t code) { return ERR_isError(code); }
52 52 const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
53 53
54 54 unsigned HUF_isError(size_t code) { return ERR_isError(code); }
55 55 const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
56 56
57 57
58 58 /*-**************************************************************
59 59 * FSE NCount encoding-decoding
60 60 ****************************************************************/
61 61 size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
62 62 const void* headerBuffer, size_t hbSize)
63 63 {
64 64 const BYTE* const istart = (const BYTE*) headerBuffer;
65 65 const BYTE* const iend = istart + hbSize;
66 66 const BYTE* ip = istart;
67 67 int nbBits;
68 68 int remaining;
69 69 int threshold;
70 70 U32 bitStream;
71 71 int bitCount;
72 72 unsigned charnum = 0;
73 73 int previous0 = 0;
74 74
75 if (hbSize < 4) return ERROR(srcSize_wrong);
75 if (hbSize < 4) {
76 /* This function only works when hbSize >= 4 */
77 char buffer[4];
78 memset(buffer, 0, sizeof(buffer));
79 memcpy(buffer, headerBuffer, hbSize);
80 { size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr,
81 buffer, sizeof(buffer));
82 if (FSE_isError(countSize)) return countSize;
83 if (countSize > hbSize) return ERROR(corruption_detected);
84 return countSize;
85 } }
86 assert(hbSize >= 4);
87
88 /* init */
89 memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0])); /* all symbols not present in NCount have a frequency of 0 */
76 90 bitStream = MEM_readLE32(ip);
77 91 nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */
78 92 if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
79 93 bitStream >>= 4;
80 94 bitCount = 4;
81 95 *tableLogPtr = nbBits;
82 96 remaining = (1<<nbBits)+1;
83 97 threshold = 1<<nbBits;
84 98 nbBits++;
85 99
86 100 while ((remaining>1) & (charnum<=*maxSVPtr)) {
87 101 if (previous0) {
88 102 unsigned n0 = charnum;
89 103 while ((bitStream & 0xFFFF) == 0xFFFF) {
90 104 n0 += 24;
91 105 if (ip < iend-5) {
92 106 ip += 2;
93 107 bitStream = MEM_readLE32(ip) >> bitCount;
94 108 } else {
95 109 bitStream >>= 16;
96 110 bitCount += 16;
97 111 } }
98 112 while ((bitStream & 3) == 3) {
99 113 n0 += 3;
100 114 bitStream >>= 2;
101 115 bitCount += 2;
102 116 }
103 117 n0 += bitStream & 3;
104 118 bitCount += 2;
105 119 if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
106 120 while (charnum < n0) normalizedCounter[charnum++] = 0;
107 121 if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
122 assert((bitCount >> 3) <= 3); /* For first condition to work */
108 123 ip += bitCount>>3;
109 124 bitCount &= 7;
110 125 bitStream = MEM_readLE32(ip) >> bitCount;
111 126 } else {
112 127 bitStream >>= 2;
113 128 } }
114 129 { int const max = (2*threshold-1) - remaining;
115 130 int count;
116 131
117 132 if ((bitStream & (threshold-1)) < (U32)max) {
118 133 count = bitStream & (threshold-1);
119 134 bitCount += nbBits-1;
120 135 } else {
121 136 count = bitStream & (2*threshold-1);
122 137 if (count >= threshold) count -= max;
123 138 bitCount += nbBits;
124 139 }
125 140
126 141 count--; /* extra accuracy */
127 142 remaining -= count < 0 ? -count : count; /* -1 means +1 */
128 143 normalizedCounter[charnum++] = (short)count;
129 144 previous0 = !count;
130 145 while (remaining < threshold) {
131 146 nbBits--;
132 147 threshold >>= 1;
133 148 }
134 149
135 150 if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
136 151 ip += bitCount>>3;
137 152 bitCount &= 7;
138 153 } else {
139 154 bitCount -= (int)(8 * (iend - 4 - ip));
140 155 ip = iend - 4;
141 156 }
142 157 bitStream = MEM_readLE32(ip) >> (bitCount & 31);
143 158 } } /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
144 159 if (remaining != 1) return ERROR(corruption_detected);
145 160 if (bitCount > 32) return ERROR(corruption_detected);
146 161 *maxSVPtr = charnum-1;
147 162
148 163 ip += (bitCount+7)>>3;
149 164 return ip-istart;
150 165 }
151 166
152 167
153 168 /*! HUF_readStats() :
154 169 Read compact Huffman tree, saved by HUF_writeCTable().
155 170 `huffWeight` is destination buffer.
156 171 `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
157 172 @return : size read from `src` , or an error Code .
158 173 Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
159 174 */
160 175 size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
161 176 U32* nbSymbolsPtr, U32* tableLogPtr,
162 177 const void* src, size_t srcSize)
163 178 {
164 179 U32 weightTotal;
165 180 const BYTE* ip = (const BYTE*) src;
166 181 size_t iSize;
167 182 size_t oSize;
168 183
169 184 if (!srcSize) return ERROR(srcSize_wrong);
170 185 iSize = ip[0];
171 186 /* memset(huffWeight, 0, hwSize); *//* is not necessary, even though some analyzer complain ... */
172 187
173 188 if (iSize >= 128) { /* special header */
174 189 oSize = iSize - 127;
175 190 iSize = ((oSize+1)/2);
176 191 if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
177 192 if (oSize >= hwSize) return ERROR(corruption_detected);
178 193 ip += 1;
179 194 { U32 n;
180 195 for (n=0; n<oSize; n+=2) {
181 196 huffWeight[n] = ip[n/2] >> 4;
182 197 huffWeight[n+1] = ip[n/2] & 15;
183 198 } } }
184 199 else { /* header compressed with FSE (normal case) */
185 200 FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)]; /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
186 201 if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
187 202 oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6); /* max (hwSize-1) values decoded, as last one is implied */
188 203 if (FSE_isError(oSize)) return oSize;
189 204 }
190 205
191 206 /* collect weight stats */
192 207 memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
193 208 weightTotal = 0;
194 209 { U32 n; for (n=0; n<oSize; n++) {
195 210 if (huffWeight[n] >= HUF_TABLELOG_MAX) return ERROR(corruption_detected);
196 211 rankStats[huffWeight[n]]++;
197 212 weightTotal += (1 << huffWeight[n]) >> 1;
198 213 } }
199 214 if (weightTotal == 0) return ERROR(corruption_detected);
200 215
201 216 /* get last non-null symbol weight (implied, total must be 2^n) */
202 217 { U32 const tableLog = BIT_highbit32(weightTotal) + 1;
203 218 if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
204 219 *tableLogPtr = tableLog;
205 220 /* determine last weight */
206 221 { U32 const total = 1 << tableLog;
207 222 U32 const rest = total - weightTotal;
208 223 U32 const verif = 1 << BIT_highbit32(rest);
209 224 U32 const lastWeight = BIT_highbit32(rest) + 1;
210 225 if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */
211 226 huffWeight[oSize] = (BYTE)lastWeight;
212 227 rankStats[lastWeight]++;
213 228 } }
214 229
215 230 /* check tree construction validity */
216 231 if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected); /* by construction : at least 2 elts of rank 1, must be even */
217 232
218 233 /* results */
219 234 *nbSymbolsPtr = (U32)(oSize+1);
220 235 return iSize+1;
221 236 }