##// END OF EJS Templates
zstd: vendor python-zstandard 0.8.0...
Gregory Szorc -
r31796:e0dc4053 default
parent child Browse files
Show More
This diff has been collapsed as it changes many lines, (770 lines changed) Show them Hide them
@@ -0,0 +1,770
1 /**
2 * Copyright (c) 2017-present, Gregory Szorc
3 * All rights reserved.
4 *
5 * This software may be modified and distributed under the terms
6 * of the BSD license. See the LICENSE file for details.
7 */
8
9 #include "python-zstandard.h"
10
11 extern PyObject* ZstdError;
12
13 PyDoc_STRVAR(BufferWithSegments__doc__,
14 "BufferWithSegments - A memory buffer holding known sub-segments.\n"
15 "\n"
16 "This type represents a contiguous chunk of memory containing N discrete\n"
17 "items within sub-segments of that memory.\n"
18 "\n"
19 "Segments within the buffer are stored as an array of\n"
20 "``(offset, length)`` pairs, where each element is an unsigned 64-bit\n"
21 "integer using the host/native bit order representation.\n"
22 "\n"
23 "The type exists to facilitate operations against N>1 items without the\n"
24 "overhead of Python object creation and management.\n"
25 );
26
27 static void BufferWithSegments_dealloc(ZstdBufferWithSegments* self) {
28 /* Backing memory is either canonically owned by a Py_buffer or by us. */
29 if (self->parent.buf) {
30 PyBuffer_Release(&self->parent);
31 }
32 else if (self->useFree) {
33 free(self->data);
34 }
35 else {
36 PyMem_Free(self->data);
37 }
38
39 self->data = NULL;
40
41 if (self->useFree) {
42 free(self->segments);
43 }
44 else {
45 PyMem_Free(self->segments);
46 }
47
48 self->segments = NULL;
49
50 PyObject_Del(self);
51 }
52
53 static int BufferWithSegments_init(ZstdBufferWithSegments* self, PyObject* args, PyObject* kwargs) {
54 static char* kwlist[] = {
55 "data",
56 "segments",
57 NULL
58 };
59
60 Py_buffer segments;
61 Py_ssize_t segmentCount;
62 Py_ssize_t i;
63
64 memset(&self->parent, 0, sizeof(self->parent));
65
66 #if PY_MAJOR_VERSION >= 3
67 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*y*:BufferWithSegments",
68 #else
69 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*s*:BufferWithSegments",
70 #endif
71 kwlist, &self->parent, &segments)) {
72 return -1;
73 }
74
75 if (!PyBuffer_IsContiguous(&self->parent, 'C') || self->parent.ndim > 1) {
76 PyErr_SetString(PyExc_ValueError, "data buffer should be contiguous and have a single dimension");
77 goto except;
78 }
79
80 if (!PyBuffer_IsContiguous(&segments, 'C') || segments.ndim > 1) {
81 PyErr_SetString(PyExc_ValueError, "segments buffer should be contiguous and have a single dimension");
82 goto except;
83 }
84
85 if (segments.len % sizeof(BufferSegment)) {
86 PyErr_Format(PyExc_ValueError, "segments array size is not a multiple of %lu",
87 sizeof(BufferSegment));
88 goto except;
89 }
90
91 segmentCount = segments.len / sizeof(BufferSegment);
92
93 /* Validate segments data, as blindly trusting it could lead to arbitrary
94 memory access. */
95 for (i = 0; i < segmentCount; i++) {
96 BufferSegment* segment = &((BufferSegment*)(segments.buf))[i];
97
98 if (segment->offset + segment->length > (unsigned long long)self->parent.len) {
99 PyErr_SetString(PyExc_ValueError, "offset within segments array references memory outside buffer");
100 goto except;
101 return -1;
102 }
103 }
104
105 /* Make a copy of the segments data. It is cheap to do so and is a guard
106 against caller changing offsets, which has security implications. */
107 self->segments = PyMem_Malloc(segments.len);
108 if (!self->segments) {
109 PyErr_NoMemory();
110 goto except;
111 }
112
113 memcpy(self->segments, segments.buf, segments.len);
114 PyBuffer_Release(&segments);
115
116 self->data = self->parent.buf;
117 self->dataSize = self->parent.len;
118 self->segmentCount = segmentCount;
119
120 return 0;
121
122 except:
123 PyBuffer_Release(&self->parent);
124 PyBuffer_Release(&segments);
125 return -1;
126 };
127
128 /**
129 * Construct a BufferWithSegments from existing memory and offsets.
130 *
131 * Ownership of the backing memory and BufferSegments will be transferred to
132 * the created object and freed when the BufferWithSegments is destroyed.
133 */
134 ZstdBufferWithSegments* BufferWithSegments_FromMemory(void* data, unsigned long long dataSize,
135 BufferSegment* segments, Py_ssize_t segmentsSize) {
136 ZstdBufferWithSegments* result = NULL;
137 Py_ssize_t i;
138
139 if (NULL == data) {
140 PyErr_SetString(PyExc_ValueError, "data is NULL");
141 return NULL;
142 }
143
144 if (NULL == segments) {
145 PyErr_SetString(PyExc_ValueError, "segments is NULL");
146 return NULL;
147 }
148
149 for (i = 0; i < segmentsSize; i++) {
150 BufferSegment* segment = &segments[i];
151
152 if (segment->offset + segment->length > dataSize) {
153 PyErr_SetString(PyExc_ValueError, "offset in segments overflows buffer size");
154 return NULL;
155 }
156 }
157
158 result = PyObject_New(ZstdBufferWithSegments, &ZstdBufferWithSegmentsType);
159 if (NULL == result) {
160 return NULL;
161 }
162
163 result->useFree = 0;
164
165 memset(&result->parent, 0, sizeof(result->parent));
166 result->data = data;
167 result->dataSize = dataSize;
168 result->segments = segments;
169 result->segmentCount = segmentsSize;
170
171 return result;
172 }
173
174 static Py_ssize_t BufferWithSegments_length(ZstdBufferWithSegments* self) {
175 return self->segmentCount;
176 }
177
178 static ZstdBufferSegment* BufferWithSegments_item(ZstdBufferWithSegments* self, Py_ssize_t i) {
179 ZstdBufferSegment* result = NULL;
180
181 if (i < 0) {
182 PyErr_SetString(PyExc_IndexError, "offset must be non-negative");
183 return NULL;
184 }
185
186 if (i >= self->segmentCount) {
187 PyErr_Format(PyExc_IndexError, "offset must be less than %zd", self->segmentCount);
188 return NULL;
189 }
190
191 result = (ZstdBufferSegment*)PyObject_CallObject((PyObject*)&ZstdBufferSegmentType, NULL);
192 if (NULL == result) {
193 return NULL;
194 }
195
196 result->parent = (PyObject*)self;
197 Py_INCREF(self);
198
199 result->data = (char*)self->data + self->segments[i].offset;
200 result->dataSize = self->segments[i].length;
201 result->offset = self->segments[i].offset;
202
203 return result;
204 }
205
206 #if PY_MAJOR_VERSION >= 3
207 static int BufferWithSegments_getbuffer(ZstdBufferWithSegments* self, Py_buffer* view, int flags) {
208 return PyBuffer_FillInfo(view, (PyObject*)self, self->data, self->dataSize, 1, flags);
209 }
210 #else
211 static Py_ssize_t BufferWithSegments_getreadbuffer(ZstdBufferWithSegments* self, Py_ssize_t segment, void **ptrptr) {
212 if (segment != 0) {
213 PyErr_SetString(PyExc_ValueError, "segment number must be 0");
214 return -1;
215 }
216
217 *ptrptr = self->data;
218 return self->dataSize;
219 }
220
221 static Py_ssize_t BufferWithSegments_getsegcount(ZstdBufferWithSegments* self, Py_ssize_t* len) {
222 if (len) {
223 *len = 1;
224 }
225
226 return 1;
227 }
228 #endif
229
230 PyDoc_STRVAR(BufferWithSegments_tobytes__doc__,
231 "Obtain a bytes instance for this buffer.\n"
232 );
233
234 static PyObject* BufferWithSegments_tobytes(ZstdBufferWithSegments* self) {
235 return PyBytes_FromStringAndSize(self->data, self->dataSize);
236 }
237
238 PyDoc_STRVAR(BufferWithSegments_segments__doc__,
239 "Obtain a BufferSegments describing segments in this sintance.\n"
240 );
241
242 static ZstdBufferSegments* BufferWithSegments_segments(ZstdBufferWithSegments* self) {
243 ZstdBufferSegments* result = (ZstdBufferSegments*)PyObject_CallObject((PyObject*)&ZstdBufferSegmentsType, NULL);
244 if (NULL == result) {
245 return NULL;
246 }
247
248 result->parent = (PyObject*)self;
249 Py_INCREF(self);
250 result->segments = self->segments;
251 result->segmentCount = self->segmentCount;
252
253 return result;
254 }
255
256 static PySequenceMethods BufferWithSegments_sq = {
257 (lenfunc)BufferWithSegments_length, /* sq_length */
258 0, /* sq_concat */
259 0, /* sq_repeat */
260 (ssizeargfunc)BufferWithSegments_item, /* sq_item */
261 0, /* sq_ass_item */
262 0, /* sq_contains */
263 0, /* sq_inplace_concat */
264 0 /* sq_inplace_repeat */
265 };
266
267 static PyBufferProcs BufferWithSegments_as_buffer = {
268 #if PY_MAJOR_VERSION >= 3
269 (getbufferproc)BufferWithSegments_getbuffer, /* bf_getbuffer */
270 0 /* bf_releasebuffer */
271 #else
272 (readbufferproc)BufferWithSegments_getreadbuffer, /* bf_getreadbuffer */
273 0, /* bf_getwritebuffer */
274 (segcountproc)BufferWithSegments_getsegcount, /* bf_getsegcount */
275 0 /* bf_getcharbuffer */
276 #endif
277 };
278
279 static PyMethodDef BufferWithSegments_methods[] = {
280 { "segments", (PyCFunction)BufferWithSegments_segments,
281 METH_NOARGS, BufferWithSegments_segments__doc__ },
282 { "tobytes", (PyCFunction)BufferWithSegments_tobytes,
283 METH_NOARGS, BufferWithSegments_tobytes__doc__ },
284 { NULL, NULL }
285 };
286
287 static PyMemberDef BufferWithSegments_members[] = {
288 { "size", T_ULONGLONG, offsetof(ZstdBufferWithSegments, dataSize),
289 READONLY, "total size of the buffer in bytes" },
290 { NULL }
291 };
292
293 PyTypeObject ZstdBufferWithSegmentsType = {
294 PyVarObject_HEAD_INIT(NULL, 0)
295 "zstd.BufferWithSegments", /* tp_name */
296 sizeof(ZstdBufferWithSegments),/* tp_basicsize */
297 0, /* tp_itemsize */
298 (destructor)BufferWithSegments_dealloc, /* tp_dealloc */
299 0, /* tp_print */
300 0, /* tp_getattr */
301 0, /* tp_setattr */
302 0, /* tp_compare */
303 0, /* tp_repr */
304 0, /* tp_as_number */
305 &BufferWithSegments_sq, /* tp_as_sequence */
306 0, /* tp_as_mapping */
307 0, /* tp_hash */
308 0, /* tp_call */
309 0, /* tp_str */
310 0, /* tp_getattro */
311 0, /* tp_setattro */
312 &BufferWithSegments_as_buffer, /* tp_as_buffer */
313 Py_TPFLAGS_DEFAULT, /* tp_flags */
314 BufferWithSegments__doc__, /* tp_doc */
315 0, /* tp_traverse */
316 0, /* tp_clear */
317 0, /* tp_richcompare */
318 0, /* tp_weaklistoffset */
319 0, /* tp_iter */
320 0, /* tp_iternext */
321 BufferWithSegments_methods, /* tp_methods */
322 BufferWithSegments_members, /* tp_members */
323 0, /* tp_getset */
324 0, /* tp_base */
325 0, /* tp_dict */
326 0, /* tp_descr_get */
327 0, /* tp_descr_set */
328 0, /* tp_dictoffset */
329 (initproc)BufferWithSegments_init, /* tp_init */
330 0, /* tp_alloc */
331 PyType_GenericNew, /* tp_new */
332 };
333
334 PyDoc_STRVAR(BufferSegments__doc__,
335 "BufferSegments - Represents segments/offsets within a BufferWithSegments\n"
336 );
337
338 static void BufferSegments_dealloc(ZstdBufferSegments* self) {
339 Py_CLEAR(self->parent);
340 PyObject_Del(self);
341 }
342
343 #if PY_MAJOR_VERSION >= 3
344 static int BufferSegments_getbuffer(ZstdBufferSegments* self, Py_buffer* view, int flags) {
345 return PyBuffer_FillInfo(view, (PyObject*)self,
346 (void*)self->segments, self->segmentCount * sizeof(BufferSegment),
347 1, flags);
348 }
349 #else
350 static Py_ssize_t BufferSegments_getreadbuffer(ZstdBufferSegments* self, Py_ssize_t segment, void **ptrptr) {
351 if (segment != 0) {
352 PyErr_SetString(PyExc_ValueError, "segment number must be 0");
353 return -1;
354 }
355
356 *ptrptr = (void*)self->segments;
357 return self->segmentCount * sizeof(BufferSegment);
358 }
359
360 static Py_ssize_t BufferSegments_getsegcount(ZstdBufferSegments* self, Py_ssize_t* len) {
361 if (len) {
362 *len = 1;
363 }
364
365 return 1;
366 }
367 #endif
368
369 static PyBufferProcs BufferSegments_as_buffer = {
370 #if PY_MAJOR_VERSION >= 3
371 (getbufferproc)BufferSegments_getbuffer,
372 0
373 #else
374 (readbufferproc)BufferSegments_getreadbuffer,
375 0,
376 (segcountproc)BufferSegments_getsegcount,
377 0
378 #endif
379 };
380
381 PyTypeObject ZstdBufferSegmentsType = {
382 PyVarObject_HEAD_INIT(NULL, 0)
383 "zstd.BufferSegments", /* tp_name */
384 sizeof(ZstdBufferSegments),/* tp_basicsize */
385 0, /* tp_itemsize */
386 (destructor)BufferSegments_dealloc, /* tp_dealloc */
387 0, /* tp_print */
388 0, /* tp_getattr */
389 0, /* tp_setattr */
390 0, /* tp_compare */
391 0, /* tp_repr */
392 0, /* tp_as_number */
393 0, /* tp_as_sequence */
394 0, /* tp_as_mapping */
395 0, /* tp_hash */
396 0, /* tp_call */
397 0, /* tp_str */
398 0, /* tp_getattro */
399 0, /* tp_setattro */
400 &BufferSegments_as_buffer, /* tp_as_buffer */
401 Py_TPFLAGS_DEFAULT, /* tp_flags */
402 BufferSegments__doc__, /* tp_doc */
403 0, /* tp_traverse */
404 0, /* tp_clear */
405 0, /* tp_richcompare */
406 0, /* tp_weaklistoffset */
407 0, /* tp_iter */
408 0, /* tp_iternext */
409 0, /* tp_methods */
410 0, /* tp_members */
411 0, /* tp_getset */
412 0, /* tp_base */
413 0, /* tp_dict */
414 0, /* tp_descr_get */
415 0, /* tp_descr_set */
416 0, /* tp_dictoffset */
417 0, /* tp_init */
418 0, /* tp_alloc */
419 PyType_GenericNew, /* tp_new */
420 };
421
422 PyDoc_STRVAR(BufferSegment__doc__,
423 "BufferSegment - Represents a segment within a BufferWithSegments\n"
424 );
425
426 static void BufferSegment_dealloc(ZstdBufferSegment* self) {
427 Py_CLEAR(self->parent);
428 PyObject_Del(self);
429 }
430
431 static Py_ssize_t BufferSegment_length(ZstdBufferSegment* self) {
432 return self->dataSize;
433 }
434
435 #if PY_MAJOR_VERSION >= 3
436 static int BufferSegment_getbuffer(ZstdBufferSegment* self, Py_buffer* view, int flags) {
437 return PyBuffer_FillInfo(view, (PyObject*)self,
438 self->data, self->dataSize, 1, flags);
439 }
440 #else
441 static Py_ssize_t BufferSegment_getreadbuffer(ZstdBufferSegment* self, Py_ssize_t segment, void **ptrptr) {
442 if (segment != 0) {
443 PyErr_SetString(PyExc_ValueError, "segment number must be 0");
444 return -1;
445 }
446
447 *ptrptr = self->data;
448 return self->dataSize;
449 }
450
451 static Py_ssize_t BufferSegment_getsegcount(ZstdBufferSegment* self, Py_ssize_t* len) {
452 if (len) {
453 *len = 1;
454 }
455
456 return 1;
457 }
458 #endif
459
460 PyDoc_STRVAR(BufferSegment_tobytes__doc__,
461 "Obtain a bytes instance for this segment.\n"
462 );
463
464 static PyObject* BufferSegment_tobytes(ZstdBufferSegment* self) {
465 return PyBytes_FromStringAndSize(self->data, self->dataSize);
466 }
467
468 static PySequenceMethods BufferSegment_sq = {
469 (lenfunc)BufferSegment_length, /* sq_length */
470 0, /* sq_concat */
471 0, /* sq_repeat */
472 0, /* sq_item */
473 0, /* sq_ass_item */
474 0, /* sq_contains */
475 0, /* sq_inplace_concat */
476 0 /* sq_inplace_repeat */
477 };
478
479 static PyBufferProcs BufferSegment_as_buffer = {
480 #if PY_MAJOR_VERSION >= 3
481 (getbufferproc)BufferSegment_getbuffer,
482 0
483 #else
484 (readbufferproc)BufferSegment_getreadbuffer,
485 0,
486 (segcountproc)BufferSegment_getsegcount,
487 0
488 #endif
489 };
490
491 static PyMethodDef BufferSegment_methods[] = {
492 { "tobytes", (PyCFunction)BufferSegment_tobytes,
493 METH_NOARGS, BufferSegment_tobytes__doc__ },
494 { NULL, NULL }
495 };
496
497 static PyMemberDef BufferSegment_members[] = {
498 { "offset", T_ULONGLONG, offsetof(ZstdBufferSegment, offset), READONLY,
499 "offset of segment within parent buffer" },
500 { NULL }
501 };
502
503 PyTypeObject ZstdBufferSegmentType = {
504 PyVarObject_HEAD_INIT(NULL, 0)
505 "zstd.BufferSegment", /* tp_name */
506 sizeof(ZstdBufferSegment),/* tp_basicsize */
507 0, /* tp_itemsize */
508 (destructor)BufferSegment_dealloc, /* tp_dealloc */
509 0, /* tp_print */
510 0, /* tp_getattr */
511 0, /* tp_setattr */
512 0, /* tp_compare */
513 0, /* tp_repr */
514 0, /* tp_as_number */
515 &BufferSegment_sq, /* tp_as_sequence */
516 0, /* tp_as_mapping */
517 0, /* tp_hash */
518 0, /* tp_call */
519 0, /* tp_str */
520 0, /* tp_getattro */
521 0, /* tp_setattro */
522 &BufferSegment_as_buffer, /* tp_as_buffer */
523 Py_TPFLAGS_DEFAULT, /* tp_flags */
524 BufferSegment__doc__, /* tp_doc */
525 0, /* tp_traverse */
526 0, /* tp_clear */
527 0, /* tp_richcompare */
528 0, /* tp_weaklistoffset */
529 0, /* tp_iter */
530 0, /* tp_iternext */
531 BufferSegment_methods, /* tp_methods */
532 BufferSegment_members, /* tp_members */
533 0, /* tp_getset */
534 0, /* tp_base */
535 0, /* tp_dict */
536 0, /* tp_descr_get */
537 0, /* tp_descr_set */
538 0, /* tp_dictoffset */
539 0, /* tp_init */
540 0, /* tp_alloc */
541 PyType_GenericNew, /* tp_new */
542 };
543
544 PyDoc_STRVAR(BufferWithSegmentsCollection__doc__,
545 "Represents a collection of BufferWithSegments.\n"
546 );
547
548 static void BufferWithSegmentsCollection_dealloc(ZstdBufferWithSegmentsCollection* self) {
549 Py_ssize_t i;
550
551 if (self->firstElements) {
552 PyMem_Free(self->firstElements);
553 self->firstElements = NULL;
554 }
555
556 if (self->buffers) {
557 for (i = 0; i < self->bufferCount; i++) {
558 Py_CLEAR(self->buffers[i]);
559 }
560
561 PyMem_Free(self->buffers);
562 self->buffers = NULL;
563 }
564
565 PyObject_Del(self);
566 }
567
568 static int BufferWithSegmentsCollection_init(ZstdBufferWithSegmentsCollection* self, PyObject* args) {
569 Py_ssize_t size;
570 Py_ssize_t i;
571 Py_ssize_t offset = 0;
572
573 size = PyTuple_Size(args);
574 if (-1 == size) {
575 return -1;
576 }
577
578 if (0 == size) {
579 PyErr_SetString(PyExc_ValueError, "must pass at least 1 argument");
580 return -1;
581 }
582
583 for (i = 0; i < size; i++) {
584 PyObject* item = PyTuple_GET_ITEM(args, i);
585 if (!PyObject_TypeCheck(item, &ZstdBufferWithSegmentsType)) {
586 PyErr_SetString(PyExc_TypeError, "arguments must be BufferWithSegments instances");
587 return -1;
588 }
589
590 if (0 == ((ZstdBufferWithSegments*)item)->segmentCount ||
591 0 == ((ZstdBufferWithSegments*)item)->dataSize) {
592 PyErr_SetString(PyExc_ValueError, "ZstdBufferWithSegments cannot be empty");
593 return -1;
594 }
595 }
596
597 self->buffers = PyMem_Malloc(size * sizeof(ZstdBufferWithSegments*));
598 if (NULL == self->buffers) {
599 PyErr_NoMemory();
600 return -1;
601 }
602
603 self->firstElements = PyMem_Malloc(size * sizeof(Py_ssize_t));
604 if (NULL == self->firstElements) {
605 PyMem_Free(self->buffers);
606 self->buffers = NULL;
607 PyErr_NoMemory();
608 return -1;
609 }
610
611 self->bufferCount = size;
612
613 for (i = 0; i < size; i++) {
614 ZstdBufferWithSegments* item = (ZstdBufferWithSegments*)PyTuple_GET_ITEM(args, i);
615
616 self->buffers[i] = item;
617 Py_INCREF(item);
618
619 if (i > 0) {
620 self->firstElements[i - 1] = offset;
621 }
622
623 offset += item->segmentCount;
624 }
625
626 self->firstElements[size - 1] = offset;
627
628 return 0;
629 }
630
631 static PyObject* BufferWithSegmentsCollection_size(ZstdBufferWithSegmentsCollection* self) {
632 Py_ssize_t i;
633 Py_ssize_t j;
634 unsigned long long size = 0;
635
636 for (i = 0; i < self->bufferCount; i++) {
637 for (j = 0; j < self->buffers[i]->segmentCount; j++) {
638 size += self->buffers[i]->segments[j].length;
639 }
640 }
641
642 return PyLong_FromUnsignedLongLong(size);
643 }
644
645 Py_ssize_t BufferWithSegmentsCollection_length(ZstdBufferWithSegmentsCollection* self) {
646 return self->firstElements[self->bufferCount - 1];
647 }
648
649 static ZstdBufferSegment* BufferWithSegmentsCollection_item(ZstdBufferWithSegmentsCollection* self, Py_ssize_t i) {
650 Py_ssize_t bufferOffset;
651
652 if (i < 0) {
653 PyErr_SetString(PyExc_IndexError, "offset must be non-negative");
654 return NULL;
655 }
656
657 if (i >= BufferWithSegmentsCollection_length(self)) {
658 PyErr_Format(PyExc_IndexError, "offset must be less than %zd",
659 BufferWithSegmentsCollection_length(self));
660 return NULL;
661 }
662
663 for (bufferOffset = 0; bufferOffset < self->bufferCount; bufferOffset++) {
664 Py_ssize_t offset = 0;
665
666 if (i < self->firstElements[bufferOffset]) {
667 if (bufferOffset > 0) {
668 offset = self->firstElements[bufferOffset - 1];
669 }
670
671 return BufferWithSegments_item(self->buffers[bufferOffset], i - offset);
672 }
673 }
674
675 PyErr_SetString(ZstdError, "error resolving segment; this should not happen");
676 return NULL;
677 }
678
679 static PySequenceMethods BufferWithSegmentsCollection_sq = {
680 (lenfunc)BufferWithSegmentsCollection_length, /* sq_length */
681 0, /* sq_concat */
682 0, /* sq_repeat */
683 (ssizeargfunc)BufferWithSegmentsCollection_item, /* sq_item */
684 0, /* sq_ass_item */
685 0, /* sq_contains */
686 0, /* sq_inplace_concat */
687 0 /* sq_inplace_repeat */
688 };
689
690 static PyMethodDef BufferWithSegmentsCollection_methods[] = {
691 { "size", (PyCFunction)BufferWithSegmentsCollection_size,
692 METH_NOARGS, PyDoc_STR("total size in bytes of all segments") },
693 { NULL, NULL }
694 };
695
696 PyTypeObject ZstdBufferWithSegmentsCollectionType = {
697 PyVarObject_HEAD_INIT(NULL, 0)
698 "zstd.BufferWithSegmentsCollection", /* tp_name */
699 sizeof(ZstdBufferWithSegmentsCollection),/* tp_basicsize */
700 0, /* tp_itemsize */
701 (destructor)BufferWithSegmentsCollection_dealloc, /* tp_dealloc */
702 0, /* tp_print */
703 0, /* tp_getattr */
704 0, /* tp_setattr */
705 0, /* tp_compare */
706 0, /* tp_repr */
707 0, /* tp_as_number */
708 &BufferWithSegmentsCollection_sq, /* tp_as_sequence */
709 0, /* tp_as_mapping */
710 0, /* tp_hash */
711 0, /* tp_call */
712 0, /* tp_str */
713 0, /* tp_getattro */
714 0, /* tp_setattro */
715 0, /* tp_as_buffer */
716 Py_TPFLAGS_DEFAULT, /* tp_flags */
717 BufferWithSegmentsCollection__doc__, /* tp_doc */
718 0, /* tp_traverse */
719 0, /* tp_clear */
720 0, /* tp_richcompare */
721 0, /* tp_weaklistoffset */
722 /* TODO implement iterator for performance. */
723 0, /* tp_iter */
724 0, /* tp_iternext */
725 BufferWithSegmentsCollection_methods, /* tp_methods */
726 0, /* tp_members */
727 0, /* tp_getset */
728 0, /* tp_base */
729 0, /* tp_dict */
730 0, /* tp_descr_get */
731 0, /* tp_descr_set */
732 0, /* tp_dictoffset */
733 (initproc)BufferWithSegmentsCollection_init, /* tp_init */
734 0, /* tp_alloc */
735 PyType_GenericNew, /* tp_new */
736 };
737
738 void bufferutil_module_init(PyObject* mod) {
739 Py_TYPE(&ZstdBufferWithSegmentsType) = &PyType_Type;
740 if (PyType_Ready(&ZstdBufferWithSegmentsType) < 0) {
741 return;
742 }
743
744 Py_INCREF(&ZstdBufferWithSegmentsType);
745 PyModule_AddObject(mod, "BufferWithSegments", (PyObject*)&ZstdBufferWithSegmentsType);
746
747 Py_TYPE(&ZstdBufferSegmentsType) = &PyType_Type;
748 if (PyType_Ready(&ZstdBufferSegmentsType) < 0) {
749 return;
750 }
751
752 Py_INCREF(&ZstdBufferSegmentsType);
753 PyModule_AddObject(mod, "BufferSegments", (PyObject*)&ZstdBufferSegmentsType);
754
755 Py_TYPE(&ZstdBufferSegmentType) = &PyType_Type;
756 if (PyType_Ready(&ZstdBufferSegmentType) < 0) {
757 return;
758 }
759
760 Py_INCREF(&ZstdBufferSegmentType);
761 PyModule_AddObject(mod, "BufferSegment", (PyObject*)&ZstdBufferSegmentType);
762
763 Py_TYPE(&ZstdBufferWithSegmentsCollectionType) = &PyType_Type;
764 if (PyType_Ready(&ZstdBufferWithSegmentsCollectionType) < 0) {
765 return;
766 }
767
768 Py_INCREF(&ZstdBufferWithSegmentsCollectionType);
769 PyModule_AddObject(mod, "BufferWithSegmentsCollection", (PyObject*)&ZstdBufferWithSegmentsCollectionType);
770 }
@@ -0,0 +1,112
1 import struct
2
3 try:
4 import unittest2 as unittest
5 except ImportError:
6 import unittest
7
8 import zstd
9
10 ss = struct.Struct('=QQ')
11
12
class TestBufferWithSegments(unittest.TestCase):
    """Tests for zstd.BufferWithSegments."""

    def test_arguments(self):
        # Both ``data`` and ``segments`` arguments are required.
        with self.assertRaises(TypeError):
            zstd.BufferWithSegments()

        with self.assertRaises(TypeError):
            zstd.BufferWithSegments(b'foo')

        # Segments data should be a multiple of 16.
        with self.assertRaisesRegexp(ValueError, 'segments array size is not a multiple of 16'):
            zstd.BufferWithSegments(b'foo', b'\x00\x00')

    def test_invalid_offset(self):
        # Segment of length 4 does not fit in a 3-byte buffer.
        with self.assertRaisesRegexp(ValueError, 'offset within segments array references memory'):
            zstd.BufferWithSegments(b'foo', ss.pack(0, 4))

    def test_invalid_getitem(self):
        b = zstd.BufferWithSegments(b'foo', ss.pack(0, 3))

        with self.assertRaisesRegexp(IndexError, 'offset must be non-negative'):
            b[-10]

        # Any index at or beyond the segment count is rejected.
        for bad_index in (1, 2):
            with self.assertRaisesRegexp(IndexError, 'offset must be less than 1'):
                b[bad_index]

    def test_single(self):
        b = zstd.BufferWithSegments(b'foo', ss.pack(0, 3))

        self.assertEqual(len(b), 1)
        self.assertEqual(b.size, 3)
        self.assertEqual(b.tobytes(), b'foo')

        segment = b[0]
        self.assertEqual(len(segment), 3)
        self.assertEqual(segment.offset, 0)
        self.assertEqual(segment.tobytes(), b'foo')

    def test_multiple(self):
        segments = b''.join([ss.pack(0, 3), ss.pack(3, 4), ss.pack(7, 5)])
        b = zstd.BufferWithSegments(b'foofooxfooxy', segments)

        self.assertEqual(len(b), 3)
        self.assertEqual(b.size, 12)
        self.assertEqual(b.tobytes(), b'foofooxfooxy')

        for i, expected in enumerate((b'foo', b'foox', b'fooxy')):
            self.assertEqual(b[i].tobytes(), expected)
63
class TestBufferWithSegmentsCollection(unittest.TestCase):
    """Tests for zstd.BufferWithSegmentsCollection."""

    def _make_buffers(self):
        # One single-segment buffer and one two-segment buffer.
        b1 = zstd.BufferWithSegments(b'foo', ss.pack(0, 3))
        b2 = zstd.BufferWithSegments(b'barbaz',
                                     ss.pack(0, 3) + ss.pack(3, 3))
        return b1, b2

    def test_empty_constructor(self):
        with self.assertRaisesRegexp(ValueError, 'must pass at least 1 argument'):
            zstd.BufferWithSegmentsCollection()

    def test_argument_validation(self):
        with self.assertRaisesRegexp(TypeError, 'arguments must be BufferWithSegments'):
            zstd.BufferWithSegmentsCollection(None)

        with self.assertRaisesRegexp(TypeError, 'arguments must be BufferWithSegments'):
            zstd.BufferWithSegmentsCollection(zstd.BufferWithSegments(b'foo', ss.pack(0, 3)),
                                              None)

        with self.assertRaisesRegexp(ValueError, 'ZstdBufferWithSegments cannot be empty'):
            zstd.BufferWithSegmentsCollection(zstd.BufferWithSegments(b'', b''))

    def test_length(self):
        b1, b2 = self._make_buffers()

        # (constructor args, expected len(), expected total size)
        for args, want_len, want_size in (((b1,), 1, 3),
                                          ((b2,), 2, 6),
                                          ((b1, b2), 3, 9)):
            c = zstd.BufferWithSegmentsCollection(*args)
            self.assertEqual(len(c), want_len)
            self.assertEqual(c.size(), want_size)

    def test_getitem(self):
        b1, b2 = self._make_buffers()
        c = zstd.BufferWithSegmentsCollection(b1, b2)

        for bad_index in (3, 4):
            with self.assertRaisesRegexp(IndexError, 'offset must be less than 3'):
                c[bad_index]

        for i, expected in enumerate((b'foo', b'bar', b'baz')):
            self.assertEqual(c[i].tobytes(), expected)
@@ -0,0 +1,143
1 import io
2 import os
3
4 try:
5 import unittest2 as unittest
6 except ImportError:
7 import unittest
8
9 try:
10 import hypothesis
11 import hypothesis.strategies as strategies
12 except ImportError:
13 raise unittest.SkipTest('hypothesis not available')
14
15 import zstd
16
17 from . common import (
18 make_cffi,
19 random_input_data,
20 )
21
22
23 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
24 @make_cffi
25 class TestCompressor_write_to_fuzzing(unittest.TestCase):
26 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
27 level=strategies.integers(min_value=1, max_value=5),
28 write_size=strategies.integers(min_value=1, max_value=1048576))
29 def test_write_size_variance(self, original, level, write_size):
30 refctx = zstd.ZstdCompressor(level=level)
31 ref_frame = refctx.compress(original)
32
33 cctx = zstd.ZstdCompressor(level=level)
34 b = io.BytesIO()
35 with cctx.write_to(b, size=len(original), write_size=write_size) as compressor:
36 compressor.write(original)
37
38 self.assertEqual(b.getvalue(), ref_frame)
39
40
41 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
42 @make_cffi
43 class TestCompressor_copy_stream_fuzzing(unittest.TestCase):
44 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
45 level=strategies.integers(min_value=1, max_value=5),
46 read_size=strategies.integers(min_value=1, max_value=1048576),
47 write_size=strategies.integers(min_value=1, max_value=1048576))
48 def test_read_write_size_variance(self, original, level, read_size, write_size):
49 refctx = zstd.ZstdCompressor(level=level)
50 ref_frame = refctx.compress(original)
51
52 cctx = zstd.ZstdCompressor(level=level)
53 source = io.BytesIO(original)
54 dest = io.BytesIO()
55
56 cctx.copy_stream(source, dest, size=len(original), read_size=read_size,
57 write_size=write_size)
58
59 self.assertEqual(dest.getvalue(), ref_frame)
60
61
62 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
63 @make_cffi
64 class TestCompressor_compressobj_fuzzing(unittest.TestCase):
65 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
66 level=strategies.integers(min_value=1, max_value=5),
67 chunk_sizes=strategies.streaming(
68 strategies.integers(min_value=1, max_value=4096)))
69 def test_random_input_sizes(self, original, level, chunk_sizes):
70 chunk_sizes = iter(chunk_sizes)
71
72 refctx = zstd.ZstdCompressor(level=level)
73 ref_frame = refctx.compress(original)
74
75 cctx = zstd.ZstdCompressor(level=level)
76 cobj = cctx.compressobj(size=len(original))
77
78 chunks = []
79 i = 0
80 while True:
81 chunk_size = next(chunk_sizes)
82 source = original[i:i + chunk_size]
83 if not source:
84 break
85
86 chunks.append(cobj.compress(source))
87 i += chunk_size
88
89 chunks.append(cobj.flush())
90
91 self.assertEqual(b''.join(chunks), ref_frame)
92
93
94 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
95 @make_cffi
96 class TestCompressor_read_from_fuzzing(unittest.TestCase):
97 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
98 level=strategies.integers(min_value=1, max_value=5),
99 read_size=strategies.integers(min_value=1, max_value=4096),
100 write_size=strategies.integers(min_value=1, max_value=4096))
101 def test_read_write_size_variance(self, original, level, read_size, write_size):
102 refcctx = zstd.ZstdCompressor(level=level)
103 ref_frame = refcctx.compress(original)
104
105 source = io.BytesIO(original)
106
107 cctx = zstd.ZstdCompressor(level=level)
108 chunks = list(cctx.read_from(source, size=len(original), read_size=read_size,
109 write_size=write_size))
110
111 self.assertEqual(b''.join(chunks), ref_frame)
112
113
@unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
class TestCompressor_multi_compress_to_buffer_fuzzing(unittest.TestCase):
    @hypothesis.given(original=strategies.lists(strategies.sampled_from(random_input_data()),
                                                min_size=1, max_size=1024),
                      threads=strategies.integers(min_value=1, max_value=8),
                      use_dict=strategies.booleans())
    def test_data_equivalence(self, original, threads, use_dict):
        """multi_compress_to_buffer() output round-trips through decompress()."""
        kwargs = {}

        # Use a content dictionary because it is cheap to create.
        if use_dict:
            kwargs['dict_data'] = zstd.ZstdCompressionDict(original[0])

        cctx = zstd.ZstdCompressor(level=1,
                                   write_content_size=True,
                                   write_checksum=True,
                                   **kwargs)

        # Bug fix: the hypothesis-drawn ``threads`` value was previously
        # ignored in favor of a hard-coded -1, so thread-count variation
        # was never actually fuzzed.
        result = cctx.multi_compress_to_buffer(original, threads=threads)

        self.assertEqual(len(result), len(original))

        # The frame produced via the batch APIs may not be bit identical to that
        # produced by compress() because compression parameters are adjusted
        # from the first input in batch mode. So the only thing we can do is
        # verify the decompressed data matches the input.
        dctx = zstd.ZstdDecompressor(**kwargs)

        for i, frame in enumerate(result):
            self.assertEqual(dctx.decompress(frame), original[i])
@@ -0,0 +1,79
1 import io
2 import os
3
4 try:
5 import unittest2 as unittest
6 except ImportError:
7 import unittest
8
9 try:
10 import hypothesis
11 import hypothesis.strategies as strategies
12 except ImportError:
13 raise unittest.SkipTest('hypothesis not available')
14
15 import zstd
16
17 from .common import (
18 make_cffi,
19 )
20
21
22 s_windowlog = strategies.integers(min_value=zstd.WINDOWLOG_MIN,
23 max_value=zstd.WINDOWLOG_MAX)
24 s_chainlog = strategies.integers(min_value=zstd.CHAINLOG_MIN,
25 max_value=zstd.CHAINLOG_MAX)
26 s_hashlog = strategies.integers(min_value=zstd.HASHLOG_MIN,
27 max_value=zstd.HASHLOG_MAX)
28 s_searchlog = strategies.integers(min_value=zstd.SEARCHLOG_MIN,
29 max_value=zstd.SEARCHLOG_MAX)
30 s_searchlength = strategies.integers(min_value=zstd.SEARCHLENGTH_MIN,
31 max_value=zstd.SEARCHLENGTH_MAX)
32 s_targetlength = strategies.integers(min_value=zstd.TARGETLENGTH_MIN,
33 max_value=zstd.TARGETLENGTH_MAX)
34 s_strategy = strategies.sampled_from((zstd.STRATEGY_FAST,
35 zstd.STRATEGY_DFAST,
36 zstd.STRATEGY_GREEDY,
37 zstd.STRATEGY_LAZY,
38 zstd.STRATEGY_LAZY2,
39 zstd.STRATEGY_BTLAZY2,
40 zstd.STRATEGY_BTOPT))
41
42
43 @make_cffi
44 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
45 class TestCompressionParametersHypothesis(unittest.TestCase):
46 @hypothesis.given(s_windowlog, s_chainlog, s_hashlog, s_searchlog,
47 s_searchlength, s_targetlength, s_strategy)
48 def test_valid_init(self, windowlog, chainlog, hashlog, searchlog,
49 searchlength, targetlength, strategy):
50 # ZSTD_checkCParams moves the goal posts on us from what's advertised
51 # in the constants. So move along with them.
52 if searchlength == zstd.SEARCHLENGTH_MIN and strategy in (zstd.STRATEGY_FAST, zstd.STRATEGY_GREEDY):
53 searchlength += 1
54 elif searchlength == zstd.SEARCHLENGTH_MAX and strategy != zstd.STRATEGY_FAST:
55 searchlength -= 1
56
57 p = zstd.CompressionParameters(windowlog, chainlog, hashlog,
58 searchlog, searchlength,
59 targetlength, strategy)
60
61 cctx = zstd.ZstdCompressor(compression_params=p)
62 with cctx.write_to(io.BytesIO()):
63 pass
64
65 @hypothesis.given(s_windowlog, s_chainlog, s_hashlog, s_searchlog,
66 s_searchlength, s_targetlength, s_strategy)
67 def test_estimate_compression_context_size(self, windowlog, chainlog,
68 hashlog, searchlog,
69 searchlength, targetlength,
70 strategy):
71 if searchlength == zstd.SEARCHLENGTH_MIN and strategy in (zstd.STRATEGY_FAST, zstd.STRATEGY_GREEDY):
72 searchlength += 1
73 elif searchlength == zstd.SEARCHLENGTH_MAX and strategy != zstd.STRATEGY_FAST:
74 searchlength -= 1
75
76 p = zstd.CompressionParameters(windowlog, chainlog, hashlog,
77 searchlog, searchlength,
78 targetlength, strategy)
79 size = zstd.estimate_compression_context_size(p)
@@ -0,0 +1,151
1 import io
2 import os
3
4 try:
5 import unittest2 as unittest
6 except ImportError:
7 import unittest
8
9 try:
10 import hypothesis
11 import hypothesis.strategies as strategies
12 except ImportError:
13 raise unittest.SkipTest('hypothesis not available')
14
15 import zstd
16
17 from . common import (
18 make_cffi,
19 random_input_data,
20 )
21
22
23 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
24 @make_cffi
25 class TestDecompressor_write_to_fuzzing(unittest.TestCase):
26 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
27 level=strategies.integers(min_value=1, max_value=5),
28 write_size=strategies.integers(min_value=1, max_value=8192),
29 input_sizes=strategies.streaming(
30 strategies.integers(min_value=1, max_value=4096)))
31 def test_write_size_variance(self, original, level, write_size, input_sizes):
32 input_sizes = iter(input_sizes)
33
34 cctx = zstd.ZstdCompressor(level=level)
35 frame = cctx.compress(original)
36
37 dctx = zstd.ZstdDecompressor()
38 source = io.BytesIO(frame)
39 dest = io.BytesIO()
40
41 with dctx.write_to(dest, write_size=write_size) as decompressor:
42 while True:
43 chunk = source.read(next(input_sizes))
44 if not chunk:
45 break
46
47 decompressor.write(chunk)
48
49 self.assertEqual(dest.getvalue(), original)
50
51
52 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
53 @make_cffi
54 class TestDecompressor_copy_stream_fuzzing(unittest.TestCase):
55 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
56 level=strategies.integers(min_value=1, max_value=5),
57 read_size=strategies.integers(min_value=1, max_value=8192),
58 write_size=strategies.integers(min_value=1, max_value=8192))
59 def test_read_write_size_variance(self, original, level, read_size, write_size):
60 cctx = zstd.ZstdCompressor(level=level)
61 frame = cctx.compress(original)
62
63 source = io.BytesIO(frame)
64 dest = io.BytesIO()
65
66 dctx = zstd.ZstdDecompressor()
67 dctx.copy_stream(source, dest, read_size=read_size, write_size=write_size)
68
69 self.assertEqual(dest.getvalue(), original)
70
71
72 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
73 @make_cffi
74 class TestDecompressor_decompressobj_fuzzing(unittest.TestCase):
75 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
76 level=strategies.integers(min_value=1, max_value=5),
77 chunk_sizes=strategies.streaming(
78 strategies.integers(min_value=1, max_value=4096)))
79 def test_random_input_sizes(self, original, level, chunk_sizes):
80 chunk_sizes = iter(chunk_sizes)
81
82 cctx = zstd.ZstdCompressor(level=level)
83 frame = cctx.compress(original)
84
85 source = io.BytesIO(frame)
86
87 dctx = zstd.ZstdDecompressor()
88 dobj = dctx.decompressobj()
89
90 chunks = []
91 while True:
92 chunk = source.read(next(chunk_sizes))
93 if not chunk:
94 break
95
96 chunks.append(dobj.decompress(chunk))
97
98 self.assertEqual(b''.join(chunks), original)
99
100
101 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
102 @make_cffi
103 class TestDecompressor_read_from_fuzzing(unittest.TestCase):
104 @hypothesis.given(original=strategies.sampled_from(random_input_data()),
105 level=strategies.integers(min_value=1, max_value=5),
106 read_size=strategies.integers(min_value=1, max_value=4096),
107 write_size=strategies.integers(min_value=1, max_value=4096))
108 def test_read_write_size_variance(self, original, level, read_size, write_size):
109 cctx = zstd.ZstdCompressor(level=level)
110 frame = cctx.compress(original)
111
112 source = io.BytesIO(frame)
113
114 dctx = zstd.ZstdDecompressor()
115 chunks = list(dctx.read_from(source, read_size=read_size, write_size=write_size))
116
117 self.assertEqual(b''.join(chunks), original)
118
119
120 @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set')
121 class TestDecompressor_multi_decompress_to_buffer_fuzzing(unittest.TestCase):
122 @hypothesis.given(original=strategies.lists(strategies.sampled_from(random_input_data()),
123 min_size=1, max_size=1024),
124 threads=strategies.integers(min_value=1, max_value=8),
125 use_dict=strategies.booleans())
126 def test_data_equivalence(self, original, threads, use_dict):
127 kwargs = {}
128 if use_dict:
129 kwargs['dict_data'] = zstd.ZstdCompressionDict(original[0])
130
131 cctx = zstd.ZstdCompressor(level=1,
132 write_content_size=True,
133 write_checksum=True,
134 **kwargs)
135
136 frames_buffer = cctx.multi_compress_to_buffer(original, threads=-1)
137
138 dctx = zstd.ZstdDecompressor(**kwargs)
139
140 result = dctx.multi_decompress_to_buffer(frames_buffer)
141
142 self.assertEqual(len(result), len(original))
143 for i, frame in enumerate(result):
144 self.assertEqual(frame.tobytes(), original[i])
145
146 frames_list = [f.tobytes() for f in frames_buffer]
147 result = dctx.multi_decompress_to_buffer(frames_list)
148
149 self.assertEqual(len(result), len(original))
150 for i, frame in enumerate(result):
151 self.assertEqual(frame.tobytes(), original[i])
@@ -1,117 +1,145
1 1 Version History
2 2 ===============
3 3
4 0.8.0 (released 2017-03-08)
5 ---------------------------
6
7 * CompressionParameters now has a estimated_compression_context_size() method.
8 zstd.estimate_compression_context_size() is now deprecated and slated for
9 removal.
10 * Implemented a lot of fuzzing tests.
11 * CompressionParameters instances now perform extra validation by calling
12 ZSTD_checkCParams() at construction time.
13 * multi_compress_to_buffer() API for compressing multiple inputs as a
14 single operation, as efficiently as possible.
15 * ZSTD_CStream instances are now used across multiple operations on
16 ZstdCompressor instances, resulting in much better performance for
17 APIs that do streaming.
18 * ZSTD_DStream instances are now used across multiple operations on
19 ZstdDecompressor instances, resulting in much better performance for
20 APIs that do streaming.
21 * train_dictionary() now releases the GIL.
22 * Support for training dictionaries using the COVER algorithm.
23 * multi_decompress_to_buffer() API for decompressing multiple frames as a
24 single operation, as efficiently as possible.
25 * Support for multi-threaded compression.
26 * Disable deprecation warnings when compiling CFFI module.
27 * Fixed memory leak in train_dictionary().
28 * Removed DictParameters type.
29 * train_dictionary() now accepts keyword arguments instead of a
30 DictParameters instance to control dictionary generation.
31
4 32 0.7.0 (released 2017-02-07)
5 33 ---------------------------
6 34
7 35 * Added zstd.get_frame_parameters() to obtain info about a zstd frame.
8 36 * Added ZstdDecompressor.decompress_content_dict_chain() for efficient
9 37 decompression of *content-only dictionary chains*.
10 38 * CFFI module fully implemented; all tests run against both C extension and
11 39 CFFI implementation.
12 40 * Vendored version of zstd updated to 1.1.3.
13 41 * Use ZstdDecompressor.decompress() now uses ZSTD_createDDict_byReference()
14 42 to avoid extra memory allocation of dict data.
15 43 * Add function names to error messages (by using ":name" in PyArg_Parse*
16 44 functions).
17 45 * Reuse decompression context across operations. Previously, we created a
18 46 new ZSTD_DCtx for each decompress(). This was measured to slow down
19 47 decompression by 40-200MB/s. The API guarantees say ZstdDecompressor
20 48 is not thread safe. So we reuse the ZSTD_DCtx across operations and make
21 49 things faster in the process.
22 50 * ZstdCompressor.write_to()'s compress() and flush() methods now return number
23 51 of bytes written.
24 52 * ZstdDecompressor.write_to()'s write() method now returns the number of bytes
25 53 written to the underlying output object.
26 54 * CompressionParameters instances now expose their values as attributes.
27 55 * CompressionParameters instances no longer are subscriptable nor behave
28 56 as tuples (backwards incompatible). Use attributes to obtain values.
29 57 * DictParameters instances now expose their values as attributes.
30 58
31 59 0.6.0 (released 2017-01-14)
32 60 ---------------------------
33 61
34 62 * Support for legacy zstd protocols (build time opt in feature).
35 63 * Automation improvements to test against Python 3.6, latest versions
36 64 of Tox, more deterministic AppVeyor behavior.
37 65 * CFFI "parser" improved to use a compiler preprocessor instead of rewriting
38 66 source code manually.
39 67 * Vendored version of zstd updated to 1.1.2.
40 68 * Documentation improvements.
41 69 * Introduce a bench.py script for performing (crude) benchmarks.
42 70 * ZSTD_CCtx instances are now reused across multiple compress() operations.
43 71 * ZstdCompressor.write_to() now has a flush() method.
44 72 * ZstdCompressor.compressobj()'s flush() method now accepts an argument to
45 73 flush a block (as opposed to ending the stream).
46 74 * Disallow compress(b'') when writing content sizes by default (issue #11).
47 75
48 76 0.5.2 (released 2016-11-12)
49 77 ---------------------------
50 78
51 79 * more packaging fixes for source distribution
52 80
53 81 0.5.1 (released 2016-11-12)
54 82 ---------------------------
55 83
56 84 * setup_zstd.py is included in the source distribution
57 85
58 86 0.5.0 (released 2016-11-10)
59 87 ---------------------------
60 88
61 89 * Vendored version of zstd updated to 1.1.1.
62 90 * Continuous integration for Python 3.6 and 3.7
63 91 * Continuous integration for Conda
64 92 * Added compression and decompression APIs providing similar interfaces
65 93 to the standard library ``zlib`` and ``bz2`` modules. This allows
66 94 coding to a common interface.
67 95 * ``zstd.__version__` is now defined.
68 96 * ``read_from()`` on various APIs now accepts objects implementing the buffer
69 97 protocol.
70 98 * ``read_from()`` has gained a ``skip_bytes`` argument. This allows callers
71 99 to pass in an existing buffer with a header without having to create a
72 100 slice or a new object.
73 101 * Implemented ``ZstdCompressionDict.as_bytes()``.
74 102 * Python's memory allocator is now used instead of ``malloc()``.
75 103 * Low-level zstd data structures are reused in more instances, cutting down
76 104 on overhead for certain operations.
77 105 * ``distutils`` boilerplate for obtaining an ``Extension`` instance
78 106 has now been refactored into a standalone ``setup_zstd.py`` file. This
79 107 allows other projects with ``setup.py`` files to reuse the
80 108 ``distutils`` code for this project without copying code.
81 109 * The monolithic ``zstd.c`` file has been split into a header file defining
82 110 types and separate ``.c`` source files for the implementation.
83 111
84 112 History of the Project
85 113 ======================
86 114
87 115 2016-08-31 - Zstandard 1.0.0 is released and Gregory starts hacking on a
88 116 Python extension for use by the Mercurial project. A very hacky prototype
89 117 is sent to the mercurial-devel list for RFC.
90 118
91 119 2016-09-03 - Most functionality from Zstandard C API implemented. Source
92 120 code published on https://github.com/indygreg/python-zstandard. Travis-CI
93 121 automation configured. 0.0.1 release on PyPI.
94 122
95 123 2016-09-05 - After the API was rounded out a bit and support for Python
96 124 2.6 and 2.7 was added, version 0.1 was released to PyPI.
97 125
98 126 2016-09-05 - After the compressor and decompressor APIs were changed, 0.2
99 127 was released to PyPI.
100 128
101 129 2016-09-10 - 0.3 is released with a bunch of new features. ZstdCompressor
102 130 now accepts arguments controlling frame parameters. The source size can now
103 131 be declared when performing streaming compression. ZstdDecompressor.decompress()
104 132 is implemented. Compression dictionaries are now cached when using the simple
105 133 compression and decompression APIs. Memory size APIs added.
106 134 ZstdCompressor.read_from() and ZstdDecompressor.read_from() have been
107 135 implemented. This rounds out the major compression/decompression APIs planned
108 136 by the author.
109 137
110 138 2016-10-02 - 0.3.3 is released with a bug fix for read_from not fully
111 139 decoding a zstd frame (issue #2).
112 140
113 141 2016-10-02 - 0.4.0 is released with zstd 1.1.0, support for custom read and
114 142 write buffer sizes, and a few bug fixes involving failure to read/write
115 143 all data when buffer sizes were too small to hold remaining data.
116 144
117 145 2016-11-10 - 0.5.0 is released with zstd 1.1.1 and other enhancements.
This diff has been collapsed as it changes many lines, (580 lines changed) Show them Hide them
@@ -1,943 +1,1393
1 1 ================
2 2 python-zstandard
3 3 ================
4 4
5 5 This project provides Python bindings for interfacing with the
6 6 `Zstandard <http://www.zstd.net>`_ compression library. A C extension
7 7 and CFFI interface are provided.
8 8
9 9 The primary goal of the project is to provide a rich interface to the
10 10 underlying C API through a Pythonic interface while not sacrificing
11 11 performance. This means exposing most of the features and flexibility
12 12 of the C API while not sacrificing usability or safety that Python provides.
13 13
14 14 The canonical home for this project is
15 15 https://github.com/indygreg/python-zstandard.
16 16
17 17 | |ci-status| |win-ci-status|
18 18
19 19 State of Project
20 20 ================
21 21
22 22 The project is officially in beta state. The author is reasonably satisfied
23 with the current API and that functionality works as advertised. There
24 may be some backwards incompatible changes before 1.0. Though the author
25 does not intend to make any major changes to the Python API.
23 that functionality works as advertised. **There will be some backwards
24 incompatible changes before 1.0, probably in the 0.9 release.** This may
25 involve renaming the main module from *zstd* to *zstandard* and renaming
26 various types and methods. Pin the package version to prevent unwanted
27 breakage when this change occurs!
26 28
27 29 This project is vendored and distributed with Mercurial 4.1, where it is
28 30 used in a production capacity.
29 31
30 32 There is continuous integration for Python versions 2.6, 2.7, and 3.3+
31 33 on Linux x86_x64 and Windows x86 and x86_64. The author is reasonably
32 34 confident the extension is stable and works as advertised on these
33 35 platforms.
34 36
37 The CFFI bindings are mostly feature complete. Where a feature is implemented
38 in CFFI, unit tests run against both C extension and CFFI implementation to
39 ensure behavior parity.
40
35 41 Expected Changes
36 42 ----------------
37 43
38 44 The author is reasonably confident in the current state of what's
39 45 implemented on the ``ZstdCompressor`` and ``ZstdDecompressor`` types.
40 46 Those APIs likely won't change significantly. Some low-level behavior
41 47 (such as naming and types expected by arguments) may change.
42 48
43 49 There will likely be arguments added to control the input and output
44 50 buffer sizes (currently, certain operations read and write in chunk
45 51 sizes using zstd's preferred defaults).
46 52
47 53 There should be an API that accepts an object that conforms to the buffer
48 54 interface and returns an iterator over compressed or decompressed output.
49 55
56 There should be an API that exposes an ``io.RawIOBase`` interface to
57 compressor and decompressor streams, like how ``gzip.GzipFile`` from
58 the standard library works (issue 13).
59
50 60 The author is on the fence as to whether to support the extremely
51 61 low level compression and decompression APIs. It could be useful to
52 62 support compression without the framing headers. But the author doesn't
53 63 believe it a high priority at this time.
54 64
55 The CFFI bindings are feature complete and all tests run against both
56 the C extension and CFFI bindings to ensure behavior parity.
65 There will likely be a refactoring of the module names. Currently,
66 ``zstd`` is a C extension and ``zstd_cffi`` is the CFFI interface.
67 This means that all code for the C extension must be implemented in
68 C. ``zstd`` may be converted to a Python module so code can be reused
69 between CFFI and C and so not all code in the C extension has to be C.
57 70
58 71 Requirements
59 72 ============
60 73
61 74 This extension is designed to run with Python 2.6, 2.7, 3.3, 3.4, 3.5, and
62 75 3.6 on common platforms (Linux, Windows, and OS X). Only x86_64 is
63 76 currently well-tested as an architecture.
64 77
65 78 Installing
66 79 ==========
67 80
68 81 This package is uploaded to PyPI at https://pypi.python.org/pypi/zstandard.
69 82 So, to install this package::
70 83
71 84 $ pip install zstandard
72 85
73 86 Binary wheels are made available for some platforms. If you need to
74 87 install from a source distribution, all you should need is a working C
75 88 compiler and the Python development headers/libraries. On many Linux
76 89 distributions, you can install a ``python-dev`` or ``python-devel``
77 90 package to provide these dependencies.
78 91
79 92 Packages are also uploaded to Anaconda Cloud at
80 93 https://anaconda.org/indygreg/zstandard. See that URL for how to install
81 94 this package with ``conda``.
82 95
83 96 Performance
84 97 ===========
85 98
86 99 Very crude and non-scientific benchmarking (most benchmarks fall in this
87 100 category because proper benchmarking is hard) show that the Python bindings
88 101 perform within 10% of the native C implementation.
89 102
90 103 The following table compares the performance of compressing and decompressing
91 104 a 1.1 GB tar file comprised of the files in a Firefox source checkout. Values
92 105 obtained with the ``zstd`` program are on the left. The remaining columns detail
93 106 performance of various compression APIs in the Python bindings.
94 107
95 108 +-------+-----------------+-----------------+-----------------+---------------+
96 109 | Level | Native | Simple | Stream In | Stream Out |
97 110 | | Comp / Decomp | Comp / Decomp | Comp / Decomp | Comp |
98 111 +=======+=================+=================+=================+===============+
99 112 | 1 | 490 / 1338 MB/s | 458 / 1266 MB/s | 407 / 1156 MB/s | 405 MB/s |
100 113 +-------+-----------------+-----------------+-----------------+---------------+
101 114 | 2 | 412 / 1288 MB/s | 381 / 1203 MB/s | 345 / 1128 MB/s | 349 MB/s |
102 115 +-------+-----------------+-----------------+-----------------+---------------+
103 116 | 3 | 342 / 1312 MB/s | 319 / 1182 MB/s | 285 / 1165 MB/s | 287 MB/s |
104 117 +-------+-----------------+-----------------+-----------------+---------------+
105 118 | 11 | 64 / 1506 MB/s | 66 / 1436 MB/s | 56 / 1342 MB/s | 57 MB/s |
106 119 +-------+-----------------+-----------------+-----------------+---------------+
107 120
108 121 Again, these are very unscientific. But it shows that Python is capable of
109 122 compressing at several hundred MB/s and decompressing at over 1 GB/s.
110 123
111 124 Comparison to Other Python Bindings
112 125 ===================================
113 126
114 127 https://pypi.python.org/pypi/zstd is an alternate Python binding to
115 128 Zstandard. At the time this was written, the latest release of that
116 129 package (1.1.2) only exposed the simple APIs for compression and decompression.
117 130 This package exposes much more of the zstd API, including streaming and
118 131 dictionary compression. This package also has CFFI support.
119 132
120 133 Bundling of Zstandard Source Code
121 134 =================================
122 135
123 136 The source repository for this project contains a vendored copy of the
124 137 Zstandard source code. This is done for a few reasons.
125 138
126 139 First, Zstandard is relatively new and not yet widely available as a system
127 140 package. Providing a copy of the source code enables the Python C extension
128 141 to be compiled without requiring the user to obtain the Zstandard source code
129 142 separately.
130 143
131 144 Second, Zstandard has both a stable *public* API and an *experimental* API.
132 145 The *experimental* API is actually quite useful (contains functionality for
133 146 training dictionaries for example), so it is something we wish to expose to
134 147 Python. However, the *experimental* API is only available via static linking.
135 148 Furthermore, the *experimental* API can change at any time. So, control over
136 149 the exact version of the Zstandard library linked against is important to
137 150 ensure known behavior.
138 151
139 152 Instructions for Building and Testing
140 153 =====================================
141 154
142 155 Once you have the source code, the extension can be built via setup.py::
143 156
144 157 $ python setup.py build_ext
145 158
146 159 We recommend testing with ``nose``::
147 160
148 161 $ nosetests
149 162
150 163 A Tox configuration is present to test against multiple Python versions::
151 164
152 165 $ tox
153 166
154 167 Tests use the ``hypothesis`` Python package to perform fuzzing. If you
155 don't have it, those tests won't run.
168 don't have it, those tests won't run. Since the fuzzing tests take longer
169 to execute than normal tests, you'll need to opt in to running them by
170 setting the ``ZSTD_SLOW_TESTS`` environment variable. This is set
171 automatically when using ``tox``.
156 172
157 There is also an experimental CFFI module. You need the ``cffi`` Python
158 package installed to build and test that.
173 The ``cffi`` Python package needs to be installed in order to build the CFFI
174 bindings. If it isn't present, the CFFI bindings won't be built.
159 175
160 176 To create a virtualenv with all development dependencies, do something
161 177 like the following::
162 178
163 179 # Python 2
164 180 $ virtualenv venv
165 181
166 182 # Python 3
167 183 $ python3 -m venv venv
168 184
169 185 $ source venv/bin/activate
170 186 $ pip install cffi hypothesis nose tox
171 187
172 188 API
173 189 ===
174 190
175 The compiled C extension provides a ``zstd`` Python module. This module
176 exposes the following interfaces.
191 The compiled C extension provides a ``zstd`` Python module. The CFFI
192 bindings provide a ``zstd_cffi`` module. Both provide an identical API
193 interface. The types, functions, and attributes exposed by these modules
194 are documented in the sections below.
195
196 .. note::
197
198 The documentation in this section makes references to various zstd
199 concepts and functionality. The ``Concepts`` section below explains
200 these concepts in more detail.
177 201
178 202 ZstdCompressor
179 203 --------------
180 204
181 205 The ``ZstdCompressor`` class provides an interface for performing
182 206 compression operations.
183 207
184 208 Each instance is associated with parameters that control compression
185 209 behavior. These come from the following named arguments (all optional):
186 210
187 211 level
188 212 Integer compression level. Valid values are between 1 and 22.
189 213 dict_data
190 214 Compression dictionary to use.
191 215
192 216 Note: When using dictionary data and ``compress()`` is called multiple
193 217 times, the ``CompressionParameters`` derived from an integer compression
194 218 ``level`` and the first compressed data's size will be reused for all
195 219 subsequent operations. This may not be desirable if source data size
196 220 varies significantly.
197 221 compression_params
198 222 A ``CompressionParameters`` instance (overrides the ``level`` value).
199 223 write_checksum
200 224 Whether a 4 byte checksum should be written with the compressed data.
201 225 Defaults to False. If True, the decompressor can verify that decompressed
202 226 data matches the original input data.
203 227 write_content_size
204 228 Whether the size of the uncompressed data will be written into the
205 229 header of compressed data. Defaults to False. The data will only be
206 230 written if the compressor knows the size of the input data. This is
207 231 likely not true for streaming compression.
208 232 write_dict_id
209 233 Whether to write the dictionary ID into the compressed data.
210 234 Defaults to True. The dictionary ID is only written if a dictionary
211 235 is being used.
236 threads
237 Enables and sets the number of threads to use for multi-threaded compression
238 operations. Defaults to 0, which means to use single-threaded compression.
239 Negative values will resolve to the number of logical CPUs in the system.
240 Read below for more info on multi-threaded compression. This argument only
241 controls thread count for operations that operate on individual pieces of
242 data. APIs that spawn multiple threads for working on multiple pieces of
243 data have their own ``threads`` argument.
212 244
213 245 Unless specified otherwise, assume that no two methods of ``ZstdCompressor``
214 246 instances can be called from multiple Python threads simultaneously. In other
215 247 words, assume instances are not thread safe unless stated otherwise.
216 248
217 249 Simple API
218 250 ^^^^^^^^^^
219 251
220 252 ``compress(data)`` compresses and returns data as a one-shot operation.::
221 253
222 254 cctx = zstd.ZstdCompressor()
223 255 compressed = cctx.compress(b'data to compress')
224 256
257 The ``data`` argument can be any object that implements the *buffer protocol*.
258
225 259 Unless ``compression_params`` or ``dict_data`` are passed to the
226 260 ``ZstdCompressor``, each invocation of ``compress()`` will calculate the
227 261 optimal compression parameters for the configured compression ``level`` and
228 262 input data size (some parameters are fine-tuned for small input sizes).
229 263
230 264 If a compression dictionary is being used, the compression parameters
231 265 determined from the first input's size will be reused for subsequent
232 266 operations.
233 267
234 268 There is currently a deficiency in zstd's C APIs that makes it difficult
235 269 to round trip empty inputs when ``write_content_size=True``. Attempting
236 270 this will raise a ``ValueError`` unless ``allow_empty=True`` is passed
237 271 to ``compress()``.
238 272
239 273 Streaming Input API
240 274 ^^^^^^^^^^^^^^^^^^^
241 275
242 276 ``write_to(fh)`` (which behaves as a context manager) allows you to *stream*
243 277 data into a compressor.::
244 278
245 279 cctx = zstd.ZstdCompressor(level=10)
246 280 with cctx.write_to(fh) as compressor:
247 281 compressor.write(b'chunk 0')
248 282 compressor.write(b'chunk 1')
249 283 ...
250 284
251 285 The argument to ``write_to()`` must have a ``write(data)`` method. As
252 286 compressed data is available, ``write()`` will be called with the compressed
253 287 data as its argument. Many common Python types implement ``write()``, including
254 288 open file handles and ``io.BytesIO``.
255 289
256 290 ``write_to()`` returns an object representing a streaming compressor instance.
257 291 It **must** be used as a context manager. That object's ``write(data)`` method
258 292 is used to feed data into the compressor.
259 293
260 294 A ``flush()`` method can be called to evict whatever data remains within the
261 295 compressor's internal state into the output object. This may result in 0 or
262 296 more ``write()`` calls to the output object.
263 297
264 298 Both ``write()`` and ``flush()`` return the number of bytes written to the
265 299 object's ``write()``. In many cases, small inputs do not accumulate enough
266 300 data to cause a write and ``write()`` will return ``0``.
267 301
268 302 If the size of the data being fed to this streaming compressor is known,
269 303 you can declare it before compression begins::
270 304
271 305 cctx = zstd.ZstdCompressor()
272 306 with cctx.write_to(fh, size=data_len) as compressor:
273 307 compressor.write(chunk0)
274 308 compressor.write(chunk1)
275 309 ...
276 310
277 311 Declaring the size of the source data allows compression parameters to
278 312 be tuned. And if ``write_content_size`` is used, it also results in the
279 313 content size being written into the frame header of the output data.
280 314
281 315 The size of chunks being ``write()`` to the destination can be specified::
282 316
283 317 cctx = zstd.ZstdCompressor()
284 318 with cctx.write_to(fh, write_size=32768) as compressor:
285 319 ...
286 320
287 321 To see how much memory is being used by the streaming compressor::
288 322
289 323 cctx = zstd.ZstdCompressor()
290 324 with cctx.write_to(fh) as compressor:
291 325 ...
292 326 byte_size = compressor.memory_size()
293 327
294 328 Streaming Output API
295 329 ^^^^^^^^^^^^^^^^^^^^
296 330
297 331 ``read_from(reader)`` provides a mechanism to stream data out of a compressor
298 332 as an iterator of data chunks.::
299 333
300 334 cctx = zstd.ZstdCompressor()
301 335 for chunk in cctx.read_from(fh):
302 336 # Do something with emitted data.
303 337
304 338 ``read_from()`` accepts an object that has a ``read(size)`` method or conforms
305 339 to the buffer protocol. (``bytes`` and ``memoryview`` are 2 common types that
306 340 provide the buffer protocol.)
307 341
308 342 Uncompressed data is fetched from the source either by calling ``read(size)``
309 343 or by fetching a slice of data from the object directly (in the case where
310 344 the buffer protocol is being used). The returned iterator consists of chunks
311 345 of compressed data.
312 346
313 347 If reading from the source via ``read()``, ``read()`` will be called until
314 348 it raises or returns an empty bytes (``b''``). It is perfectly valid for
315 349 the source to deliver fewer bytes than were requested by ``read(size)``.
316 350
317 351 Like ``write_to()``, ``read_from()`` also accepts a ``size`` argument
318 352 declaring the size of the input stream::
319 353
320 354 cctx = zstd.ZstdCompressor()
321 355 for chunk in cctx.read_from(fh, size=some_int):
322 356 pass
323 357
324 358 You can also control the size that data is ``read()`` from the source and
325 359 the ideal size of output chunks::
326 360
327 361 cctx = zstd.ZstdCompressor()
328 362 for chunk in cctx.read_from(fh, read_size=16384, write_size=8192):
329 363 pass
330 364
331 365 Unlike ``write_to()``, ``read_from()`` does not give direct control over the
332 366 sizes of chunks fed into the compressor. Instead, chunk sizes will be whatever
333 367 the object being read from delivers. These will often be of a uniform size.
334 368
335 369 Stream Copying API
336 370 ^^^^^^^^^^^^^^^^^^
337 371
338 372 ``copy_stream(ifh, ofh)`` can be used to copy data between 2 streams while
339 373 compressing it.::
340 374
341 375 cctx = zstd.ZstdCompressor()
342 376 cctx.copy_stream(ifh, ofh)
343 377
344 378 For example, say you wish to compress a file::
345 379
346 380 cctx = zstd.ZstdCompressor()
347 381 with open(input_path, 'rb') as ifh, open(output_path, 'wb') as ofh:
348 382 cctx.copy_stream(ifh, ofh)
349 383
350 384 It is also possible to declare the size of the source stream::
351 385
352 386 cctx = zstd.ZstdCompressor()
353 387 cctx.copy_stream(ifh, ofh, size=len_of_input)
354 388
355 389 You can also specify how large the chunks that are ``read()`` and ``write()``
356 390 from and to the streams::
357 391
358 392 cctx = zstd.ZstdCompressor()
359 393 cctx.copy_stream(ifh, ofh, read_size=32768, write_size=16384)
360 394
361 395 The stream copier returns a 2-tuple of bytes read and written::
362 396
363 397 cctx = zstd.ZstdCompressor()
364 398 read_count, write_count = cctx.copy_stream(ifh, ofh)
365 399
366 400 Compressor API
367 401 ^^^^^^^^^^^^^^
368 402
369 403 ``compressobj()`` returns an object that exposes ``compress(data)`` and
370 404 ``flush()`` methods. Each returns compressed data or an empty bytes.
371 405
372 406 The purpose of ``compressobj()`` is to provide an API-compatible interface
373 407 with ``zlib.compressobj`` and ``bz2.BZ2Compressor``. This allows callers to
374 408 swap in different compressor objects while using the same API.
375 409
376 410 ``flush()`` accepts an optional argument indicating how to end the stream.
377 411 ``zstd.COMPRESSOBJ_FLUSH_FINISH`` (the default) ends the compression stream.
378 412 Once this type of flush is performed, ``compress()`` and ``flush()`` can
379 413 no longer be called. This type of flush **must** be called to end the
380 414 compression context. If not called, returned data may be incomplete.
381 415
382 416 A ``zstd.COMPRESSOBJ_FLUSH_BLOCK`` argument to ``flush()`` will flush a
383 417 zstd block. Flushes of this type can be performed multiple times. The next
384 418 call to ``compress()`` will begin a new zstd block.
385 419
386 420 Here is how this API should be used::
387 421
388 422 cctx = zstd.ZstdCompressor()
389 423 cobj = cctx.compressobj()
390 424 data = cobj.compress(b'raw input 0')
391 425 data = cobj.compress(b'raw input 1')
392 426 data = cobj.flush()
393 427
394 428 Or to flush blocks::
395 429
396 430     cctx = zstd.ZstdCompressor()
397 431 cobj = cctx.compressobj()
398 432 data = cobj.compress(b'chunk in first block')
399 433 data = cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK)
400 434 data = cobj.compress(b'chunk in second block')
401 435 data = cobj.flush()
402 436
403 437 For best performance results, keep input chunks under 256KB. This avoids
404 438 extra allocations for a large output object.
405 439
406 440 It is possible to declare the input size of the data that will be fed into
407 441 the compressor::
408 442
409 443 cctx = zstd.ZstdCompressor()
410 444 cobj = cctx.compressobj(size=6)
411 445 data = cobj.compress(b'foobar')
412 446 data = cobj.flush()
413 447
448 Batch Compression API
449 ^^^^^^^^^^^^^^^^^^^^^
450
451 (Experimental. Not yet supported in CFFI bindings.)
452
453 ``multi_compress_to_buffer(data, [threads=0])`` performs compression of multiple
454 inputs as a single operation.
455
456 Data to be compressed can be passed as a ``BufferWithSegmentsCollection``, a
457 ``BufferWithSegments``, or a list containing byte like objects. Each element of
458 the container will be compressed individually using the configured parameters
459 on the ``ZstdCompressor`` instance.
460
461 The ``threads`` argument controls how many threads to use for compression. The
462 default is ``0`` which means to use a single thread. Negative values use the
463 number of logical CPUs in the machine.
464
465 The function returns a ``BufferWithSegmentsCollection``. This type represents
466 N discrete memory allocations, each holding 1 or more compressed frames.
467
468 Output data is written to shared memory buffers. This means that unlike
469 regular Python objects, a reference to *any* object within the collection
470 keeps the shared buffer and therefore memory backing it alive. This can have
471 undesirable effects on process memory usage.
472
473 The API and behavior of this function is experimental and will likely change.
474 Known deficiencies include:
475
476 * If asked to use multiple threads, it will always spawn that many threads,
477 even if the input is too small to use them. It should automatically lower
478 the thread count when the extra threads would just add overhead.
479 * The buffer allocation strategy is fixed. There is room to make it dynamic,
480 perhaps even to allow one output buffer per input, facilitating a variation
481 of the API to return a list without the adverse effects of shared memory
482 buffers.
483
414 484 ZstdDecompressor
415 485 ----------------
416 486
417 487 The ``ZstdDecompressor`` class provides an interface for performing
418 488 decompression.
419 489
420 490 Each instance is associated with parameters that control decompression. These
421 491 come from the following named arguments (all optional):
422 492
423 493 dict_data
424 494 Compression dictionary to use.
425 495
426 496 The interface of this class is very similar to ``ZstdCompressor`` (by design).
427 497
428 498 Unless specified otherwise, assume that no two methods of ``ZstdDecompressor``
429 499 instances can be called from multiple Python threads simultaneously. In other
430 500 words, assume instances are not thread safe unless stated otherwise.
431 501
432 502 Simple API
433 503 ^^^^^^^^^^
434 504
435 505 ``decompress(data)`` can be used to decompress an entire compressed zstd
436 506 frame in a single operation.::
437 507
438 508 dctx = zstd.ZstdDecompressor()
439 509 decompressed = dctx.decompress(data)
440 510
441 511 By default, ``decompress(data)`` will only work on data written with the content
442 512 size encoded in its header. This can be achieved by creating a
443 513 ``ZstdCompressor`` with ``write_content_size=True``. If compressed data without
444 514 an embedded content size is seen, ``zstd.ZstdError`` will be raised.
445 515
446 516 If the compressed data doesn't have its content size embedded within it,
447 517 decompression can be attempted by specifying the ``max_output_size``
448 518 argument.::
449 519
450 520 dctx = zstd.ZstdDecompressor()
451 521 uncompressed = dctx.decompress(data, max_output_size=1048576)
452 522
453 523 Ideally, ``max_output_size`` will be identical to the decompressed output
454 524 size.
455 525
456 526 If ``max_output_size`` is too small to hold the decompressed data,
457 527 ``zstd.ZstdError`` will be raised.
458 528
459 529 If ``max_output_size`` is larger than the decompressed data, the allocated
460 530 output buffer will be resized to only use the space required.
461 531
462 532 Please note that an allocation of the requested ``max_output_size`` will be
463 533 performed every time the method is called. Setting to a very large value could
464 534 result in a lot of work for the memory allocator and may result in
465 535 ``MemoryError`` being raised if the allocation fails.
466 536
467 537 If the exact size of decompressed data is unknown, it is **strongly**
468 538 recommended to use a streaming API.
469 539
470 540 Streaming Input API
471 541 ^^^^^^^^^^^^^^^^^^^
472 542
473 543 ``write_to(fh)`` can be used to incrementally send compressed data to a
474 544 decompressor.::
475 545
476 546 dctx = zstd.ZstdDecompressor()
477 547 with dctx.write_to(fh) as decompressor:
478 548 decompressor.write(compressed_data)
479 549
480 550 This behaves similarly to ``zstd.ZstdCompressor``: compressed data is written to
481 551 the decompressor by calling ``write(data)`` and decompressed output is written
482 552 to the output object by calling its ``write(data)`` method.
483 553
484 554 Calls to ``write()`` will return the number of bytes written to the output
485 555 object. Not all inputs will result in bytes being written, so return values
486 556 of ``0`` are possible.
487 557
488 558 The size of chunks being ``write()`` to the destination can be specified::
489 559
490 560 dctx = zstd.ZstdDecompressor()
491 561 with dctx.write_to(fh, write_size=16384) as decompressor:
492 562 pass
493 563
494 564 You can see how much memory is being used by the decompressor::
495 565
496 566 dctx = zstd.ZstdDecompressor()
497 567 with dctx.write_to(fh) as decompressor:
498 568 byte_size = decompressor.memory_size()
499 569
500 570 Streaming Output API
501 571 ^^^^^^^^^^^^^^^^^^^^
502 572
503 573 ``read_from(fh)`` provides a mechanism to stream decompressed data out of a
504 574 compressed source as an iterator of data chunks.::
505 575
506 576 dctx = zstd.ZstdDecompressor()
507 577 for chunk in dctx.read_from(fh):
508 578 # Do something with original data.
509 579
510 580 ``read_from()`` accepts a) an object with a ``read(size)`` method that will
511 581 return compressed bytes b) an object conforming to the buffer protocol that
512 582 can expose its data as a contiguous range of bytes. The ``bytes`` and
513 583 ``memoryview`` types expose this buffer protocol.
514 584
515 585 ``read_from()`` returns an iterator whose elements are chunks of the
516 586 decompressed data.
517 587
518 588 The size of requested ``read()`` from the source can be specified::
519 589
520 590 dctx = zstd.ZstdDecompressor()
521 591 for chunk in dctx.read_from(fh, read_size=16384):
522 592 pass
523 593
524 594 It is also possible to skip leading bytes in the input data::
525 595
526 596 dctx = zstd.ZstdDecompressor()
527 597 for chunk in dctx.read_from(fh, skip_bytes=1):
528 598 pass
529 599
530 600 Skipping leading bytes is useful if the source data contains extra
531 601 *header* data but you want to avoid the overhead of making a buffer copy
532 602 or allocating a new ``memoryview`` object in order to decompress the data.
533 603
534 604 Similarly to ``ZstdCompressor.read_from()``, the consumer of the iterator
535 605 controls when data is decompressed. If the iterator isn't consumed,
536 606 decompression is put on hold.
537 607
538 608 When ``read_from()`` is passed an object conforming to the buffer protocol,
539 609 the behavior may seem similar to what occurs when the simple decompression
540 610 API is used. However, this API works when the decompressed size is unknown.
541 611 Furthermore, if feeding large inputs, the decompressor will work in chunks
542 612 instead of performing a single operation.
543 613
544 614 Stream Copying API
545 615 ^^^^^^^^^^^^^^^^^^
546 616
547 617 ``copy_stream(ifh, ofh)`` can be used to copy data across 2 streams while
548 618 performing decompression.::
549 619
550 620 dctx = zstd.ZstdDecompressor()
551 621 dctx.copy_stream(ifh, ofh)
552 622
553 623 e.g. to decompress a file to another file::
554 624
555 625 dctx = zstd.ZstdDecompressor()
556 626 with open(input_path, 'rb') as ifh, open(output_path, 'wb') as ofh:
557 627 dctx.copy_stream(ifh, ofh)
558 628
559 629 The size of chunks being ``read()`` and ``write()`` from and to the streams
560 630 can be specified::
561 631
562 632 dctx = zstd.ZstdDecompressor()
563 633 dctx.copy_stream(ifh, ofh, read_size=8192, write_size=16384)
564 634
565 635 Decompressor API
566 636 ^^^^^^^^^^^^^^^^
567 637
568 638 ``decompressobj()`` returns an object that exposes a ``decompress(data)``
569 639 methods. Compressed data chunks are fed into ``decompress(data)`` and
570 640 uncompressed output (or an empty bytes) is returned. Output from subsequent
571 641 calls needs to be concatenated to reassemble the full decompressed byte
572 642 sequence.
573 643
574 644 The purpose of ``decompressobj()`` is to provide an API-compatible interface
575 645 with ``zlib.decompressobj`` and ``bz2.BZ2Decompressor``. This allows callers
576 646 to swap in different decompressor objects while using the same API.
577 647
578 648 Each object is single use: once an input frame is decoded, ``decompress()``
579 649 can no longer be called.
580 650
581 651 Here is how this API should be used::
582 652
583 653     dctx = zstd.ZstdDecompressor()
584 654     dobj = dctx.decompressobj()
585 655 data = dobj.decompress(compressed_chunk_0)
586 656 data = dobj.decompress(compressed_chunk_1)
587 657
658 Batch Decompression API
659 ^^^^^^^^^^^^^^^^^^^^^^^
660
661 (Experimental. Not yet supported in CFFI bindings.)
662
663 ``multi_decompress_to_buffer()`` performs decompression of multiple
664 frames as a single operation and returns a ``BufferWithSegmentsCollection``
665 containing decompressed data for all inputs.
666
667 Compressed frames can be passed to the function as a ``BufferWithSegments``,
668 a ``BufferWithSegmentsCollection``, or as a list containing objects that
669 conform to the buffer protocol. For best performance, pass a
670 ``BufferWithSegmentsCollection`` or a ``BufferWithSegments``, as
671 minimal input validation will be done for that type. If calling from
672 Python (as opposed to C), constructing one of these instances may add
673 overhead cancelling out the performance overhead of validation for list
674 inputs.
675
676 The decompressed size of each frame must be discoverable. It can either be
677 embedded within the zstd frame (``write_content_size=True`` argument to
678 ``ZstdCompressor``) or passed in via the ``decompressed_sizes`` argument.
679
680 The ``decompressed_sizes`` argument is an object conforming to the buffer
681 protocol which holds an array of 64-bit unsigned integers in the machine's
682 native format defining the decompressed sizes of each frame. If this argument
683 is passed, it avoids having to scan each frame for its decompressed size.
684 This frame scanning can add noticeable overhead in some scenarios.
685
686 The ``threads`` argument controls the number of threads to use to perform
687 decompression operations. The default (``0``) or the value ``1`` means to
688 use a single thread. Negative values use the number of logical CPUs in the
689 machine.
690
691 .. note::
692
693 It is possible to pass a ``mmap.mmap()`` instance into this function by
694 wrapping it with a ``BufferWithSegments`` instance (which will define the
695 offsets of frames within the memory mapped region).
696
697 This function is logically equivalent to performing ``dctx.decompress()``
698 on each input frame and returning the result.
699
700 This function exists to perform decompression on multiple frames as fast
701 as possible by having as little overhead as possible. Since decompression is
702 performed as a single operation and since the decompressed output is stored in
703 a single buffer, extra memory allocations, Python objects, and Python function
704 calls are avoided. This is ideal for scenarios where callers need to access
705 decompressed data for multiple frames.
706
707 Currently, the implementation always spawns multiple threads when requested,
708 even if the amount of work to do is small. In the future, it will be smarter
709 about avoiding threads and their associated overhead when the amount of
710 work to do is small.
711
588 712 Content-Only Dictionary Chain Decompression
589 713 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
590 714
591 715 ``decompress_content_dict_chain(frames)`` performs decompression of a list of
592 716 zstd frames produced using chained *content-only* dictionary compression. Such
593 717 a list of frames is produced by compressing discrete inputs where each
594 718 non-initial input is compressed with a *content-only* dictionary consisting
595 719 of the content of the previous input.
596 720
597 721 For example, say you have the following inputs::
598 722
599 723 inputs = [b'input 1', b'input 2', b'input 3']
600 724
601 725 The zstd frame chain consists of:
602 726
603 727 1. ``b'input 1'`` compressed in standalone/discrete mode
604 728 2. ``b'input 2'`` compressed using ``b'input 1'`` as a *content-only* dictionary
605 729 3. ``b'input 3'`` compressed using ``b'input 2'`` as a *content-only* dictionary
606 730
607 731 Each zstd frame **must** have the content size written.
608 732
609 733 The following Python code can be used to produce a *content-only dictionary
610 734 chain*::
611 735
612 def make_chain(inputs):
613 frames = []
736 def make_chain(inputs):
737 frames = []
614 738
615 # First frame is compressed in standalone/discrete mode.
616 zctx = zstd.ZstdCompressor(write_content_size=True)
617 frames.append(zctx.compress(inputs[0]))
739 # First frame is compressed in standalone/discrete mode.
740 zctx = zstd.ZstdCompressor(write_content_size=True)
741 frames.append(zctx.compress(inputs[0]))
618 742
619 # Subsequent frames use the previous fulltext as a content-only dictionary
620 for i, raw in enumerate(inputs[1:]):
621 dict_data = zstd.ZstdCompressionDict(inputs[i])
622 zctx = zstd.ZstdCompressor(write_content_size=True, dict_data=dict_data)
623 frames.append(zctx.compress(raw))
743 # Subsequent frames use the previous fulltext as a content-only dictionary
744 for i, raw in enumerate(inputs[1:]):
745 dict_data = zstd.ZstdCompressionDict(inputs[i])
746 zctx = zstd.ZstdCompressor(write_content_size=True, dict_data=dict_data)
747 frames.append(zctx.compress(raw))
624 748
625 return frames
749 return frames
626 750
627 751 ``decompress_content_dict_chain()`` returns the uncompressed data of the last
628 752 element in the input chain.
629 753
630 754 It is possible to implement *content-only dictionary chain* decompression
631 755 on top of other Python APIs. However, this function will likely be significantly
632 756 faster, especially for long input chains, as it avoids the overhead of
633 757 instantiating and passing around intermediate objects between C and Python.
634 758
635 Choosing an API
636 ---------------
637
638 Various forms of compression and decompression APIs are provided because each
639 are suitable for different use cases.
759 Multi-Threaded Compression
760 --------------------------
640 761
641 The simple/one-shot APIs are useful for small data, when the decompressed
642 data size is known (either recorded in the zstd frame header via
643 ``write_content_size`` or known via an out-of-band mechanism, such as a file
644 size).
762 ``ZstdCompressor`` accepts a ``threads`` argument that controls the number
763 of threads to use for compression. The way this works is that input is split
764 into segments and each segment is fed into a worker pool for compression. Once
765 a segment is compressed, it is flushed/appended to the output.
645 766
646 A limitation of the simple APIs is that input or output data must fit in memory.
647 And unless using advanced tricks with Python *buffer objects*, both input and
648 output must fit in memory simultaneously.
649
650 Another limitation is that compression or decompression is performed as a single
651 operation. So if you feed large input, it could take a long time for the
652 function to return.
767 The segment size for multi-threaded compression is chosen from the window size
768 of the compressor. This is derived from the ``window_log`` attribute of a
769 ``CompressionParameters`` instance. By default, segment sizes are in the 1+MB
770 range.
653 771
654 The streaming APIs do not have the limitations of the simple API. The cost to
655 this is they are more complex to use than a single function call.
656
657 The streaming APIs put the caller in control of compression and decompression
658 behavior by allowing them to directly control either the input or output side
659 of the operation.
660
661 With the streaming input APIs, the caller feeds data into the compressor or
662 decompressor as they see fit. Output data will only be written after the caller
663 has explicitly written data.
772 If multi-threaded compression is requested and the input is smaller than the
773 configured segment size, only a single compression thread will be used. If the
774 input is smaller than the segment size multiplied by the thread pool size or
775 if data cannot be delivered to the compressor fast enough, not all requested
776 compressor threads may be active simultaneously.
664 777
665 With the streaming output APIs, the caller consumes output from the compressor
666 or decompressor as they see fit. The compressor or decompressor will only
667 consume data from the source when the caller is ready to receive it.
778 Compared to non-multi-threaded compression, multi-threaded compression has
779 higher per-operation overhead. This includes extra memory operations,
780 thread creation, lock acquisition, etc.
668 781
669 One end of the streaming APIs involves a file-like object that must
670 ``write()`` output data or ``read()`` input data. Depending on what the
671 backing storage for these objects is, those operations may not complete quickly.
672 For example, when streaming compressed data to a file, the ``write()`` into
673 a streaming compressor could result in a ``write()`` to the filesystem, which
674 may take a long time to finish due to slow I/O on the filesystem. So, there
675 may be overhead in streaming APIs beyond the compression and decompression
676 operations.
782 Due to the nature of multi-threaded compression using *N* compression
783 *states*, the output from multi-threaded compression will likely be larger
784 than non-multi-threaded compression. The difference is usually small. But
785 there is a CPU/wall time versus size trade off that may warrant investigation.
786
787 Output from multi-threaded compression does not require any special handling
788 on the decompression side. In other words, any zstd decompressor should be able
789 to consume data produced with multi-threaded compression.
677 790
678 791 Dictionary Creation and Management
679 792 ----------------------------------
680 793
681 Zstandard allows *dictionaries* to be used when compressing and
682 decompressing data. The idea is that if you are compressing a lot of similar
683 data, you can precompute common properties of that data (such as recurring
684 byte sequences) to achieve better compression ratios.
685
686 In Python, compression dictionaries are represented as the
687 ``ZstdCompressionDict`` type.
794 Compression dictionaries are represented as the ``ZstdCompressionDict`` type.
688 795
689 796 Instances can be constructed from bytes::
690 797
691 798 dict_data = zstd.ZstdCompressionDict(data)
692 799
693 800 It is possible to construct a dictionary from *any* data. Unless the
694 801 data begins with a magic header, the dictionary will be treated as
695 802 *content-only*. *Content-only* dictionaries allow compression operations
696 803 that follow to reference raw data within the content. For one use of
697 804 *content-only* dictionaries, see
698 805 ``ZstdDecompressor.decompress_content_dict_chain()``.
699 806
700 807 More interestingly, instances can be created by *training* on sample data::
701 808
702 809 dict_data = zstd.train_dictionary(size, samples)
703 810
704 811 This takes a list of bytes instances and creates and returns a
705 812 ``ZstdCompressionDict``.
706 813
707 814 You can see how many bytes are in the dictionary by calling ``len()``::
708 815
709 816 dict_data = zstd.train_dictionary(size, samples)
710 817 dict_size = len(dict_data) # will not be larger than ``size``
711 818
712 819 Once you have a dictionary, you can pass it to the objects performing
713 820 compression and decompression::
714 821
715 822 dict_data = zstd.train_dictionary(16384, samples)
716 823
717 824 cctx = zstd.ZstdCompressor(dict_data=dict_data)
718 825 for source_data in input_data:
719 826 compressed = cctx.compress(source_data)
720 827 # Do something with compressed data.
721 828
722 829 dctx = zstd.ZstdDecompressor(dict_data=dict_data)
723 830 for compressed_data in input_data:
724 831 buffer = io.BytesIO()
725 832 with dctx.write_to(buffer) as decompressor:
726 833 decompressor.write(compressed_data)
727 834 # Do something with raw data in ``buffer``.
728 835
729 836 Dictionaries have unique integer IDs. You can retrieve this ID via::
730 837
731 838 dict_id = zstd.dictionary_id(dict_data)
732 839
733 840 You can obtain the raw data in the dict (useful for persisting and constructing
734 841 a ``ZstdCompressionDict`` later) via ``as_bytes()``::
735 842
736 843 dict_data = zstd.train_dictionary(size, samples)
737 844 raw_data = dict_data.as_bytes()
738 845
846 The following named arguments to ``train_dictionary`` can also be used
847 to further control dictionary generation.
848
849 selectivity
850 Integer selectivity level. Default is 9. Larger values yield more data in
851 dictionary.
852 level
853 Integer compression level. Default is 6.
854 dict_id
855 Integer dictionary ID for the produced dictionary. Default is 0, which
856 means to use a random value.
857 notifications
858 Controls writing of informational messages to ``stderr``. ``0`` (the
859 default) means to write nothing. ``1`` writes errors. ``2`` writes
860 progression info. ``3`` writes more details. And ``4`` writes all info.
861
862 Cover Dictionaries
863 ^^^^^^^^^^^^^^^^^^
864
865 An alternate dictionary training mechanism named *cover* is also available.
866 More details about this training mechanism are available in the paper
867 *Effective Construction of Relative Lempel-Ziv Dictionaries* (authors:
868 Liao, Petri, Moffat, Wirth).
869
870 To use this mechanism, use ``zstd.train_cover_dictionary()`` instead of
871 ``zstd.train_dictionary()``. The function behaves nearly the same except
872 its arguments are different and the returned dictionary will contain ``k``
873 and ``d`` attributes reflecting the parameters to the cover algorithm.
874
875 .. note::
876
877 The ``k`` and ``d`` attributes are only populated on dictionary
878 instances created by this function. If a ``ZstdCompressionDict`` is
879 constructed from raw bytes data, the ``k`` and ``d`` attributes will
880 be ``0``.
881
882 The segment and dmer size parameters to the cover algorithm can either be
883 specified manually or you can ask ``train_cover_dictionary()`` to try
884 multiple values and pick the best one, where *best* means the smallest
885 compressed data size.
886
887 In manual mode, the ``k`` and ``d`` arguments must be specified or a
888 ``ZstdError`` will be raised.
889
890 In automatic mode (triggered by specifying ``optimize=True``), ``k``
891 and ``d`` are optional. If a value isn't specified, then default values for
892 both are tested. The ``steps`` argument can control the number of steps
893 through ``k`` values. The ``level`` argument defines the compression level
894 that will be used when testing the compressed size. And ``threads`` can
895 specify the number of threads to use for concurrent operation.
896
897 This function takes the following arguments:
898
899 dict_size
900 Target size in bytes of the dictionary to generate.
901 samples
902 A list of bytes holding samples the dictionary will be trained from.
903 k
904 Parameter to cover algorithm defining the segment size. A reasonable range
905 is [16, 2048+].
906 d
907 Parameter to cover algorithm defining the dmer size. A reasonable range is
908 [6, 16]. ``d`` must be less than or equal to ``k``.
909 dict_id
910 Integer dictionary ID for the produced dictionary. Default is 0, which uses
911 a random value.
912 optimize
913 When true, test dictionary generation with multiple parameters.
914 level
915 Integer target compression level when testing compression with
916 ``optimize=True``. Default is 1.
917 steps
918 Number of steps through ``k`` values to perform when ``optimize=True``.
919 Default is 32.
920 threads
921 Number of threads to use when ``optimize=True``. Default is 0, which means
922 to use a single thread. A negative value can be specified to use as many
923 threads as there are detected logical CPUs.
924 notifications
925 Controls writing of informational messages to ``stderr``. See the
926 documentation for ``train_dictionary()`` for more.
927
739 928 Explicit Compression Parameters
740 929 -------------------------------
741 930
742 931 Zstandard's integer compression levels along with the input size and dictionary
743 932 size are converted into a data structure defining multiple parameters to tune
744 933 behavior of the compression algorithm. It is possible to define this
745 934 data structure explicitly to have lower-level control over compression behavior.
746 935
747 936 The ``zstd.CompressionParameters`` type represents this data structure.
748 937 You can see how Zstandard converts compression levels to this data structure
749 938 by calling ``zstd.get_compression_parameters()``. e.g.::
750 939
751 940 params = zstd.get_compression_parameters(5)
752 941
753 942 This function also accepts the uncompressed data size and dictionary size
754 943 to adjust parameters::
755 944
756 945 params = zstd.get_compression_parameters(3, source_size=len(data), dict_size=len(dict_data))
757 946
758 947 You can also construct compression parameters from their low-level components::
759 948
760 949 params = zstd.CompressionParameters(20, 6, 12, 5, 4, 10, zstd.STRATEGY_FAST)
761 950
762 951 You can then configure a compressor to use the custom parameters::
763 952
764 953 cctx = zstd.ZstdCompressor(compression_params=params)
765 954
766 955 The members/attributes of ``CompressionParameters`` instances are as follows::
767 956
768 957 * window_log
769 958 * chain_log
770 959 * hash_log
771 960 * search_log
772 961 * search_length
773 962 * target_length
774 963 * strategy
775 964
776 965 This is the order the arguments are passed to the constructor if not using
777 966 named arguments.
778 967
779 968 You'll need to read the Zstandard documentation for what these parameters
780 969 do.
781 970
782 971 Frame Inspection
783 972 ----------------
784 973
785 974 Data emitted from zstd compression is encapsulated in a *frame*. This frame
786 975 begins with a 4 byte *magic number* header followed by 2 to 14 bytes describing
787 976 the frame in more detail. For more info, see
788 977 https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md.
789 978
790 979 ``zstd.get_frame_parameters(data)`` parses a zstd *frame* header from a bytes
791 980 instance and returns a ``FrameParameters`` object describing the frame.
792 981
793 982 Depending on which fields are present in the frame and their values, the
794 983 length of the frame parameters varies. If insufficient bytes are passed
795 984 in to fully parse the frame parameters, ``ZstdError`` is raised. To ensure
796 985 frame parameters can be parsed, pass in at least 18 bytes.
797 986
798 987 ``FrameParameters`` instances have the following attributes:
799 988
800 989 content_size
801 990 Integer size of original, uncompressed content. This will be ``0`` if the
802 991 original content size isn't written to the frame (controlled with the
803 992 ``write_content_size`` argument to ``ZstdCompressor``) or if the input
804 993 content size was ``0``.
805 994
806 995 window_size
807 996 Integer size of maximum back-reference distance in compressed data.
808 997
809 998 dict_id
810 999 Integer of dictionary ID used for compression. ``0`` if no dictionary
811 1000 ID was used or if the dictionary ID was ``0``.
812 1001
813 1002 has_checksum
814 1003 Bool indicating whether a 4 byte content checksum is stored at the end
815 1004 of the frame.
816 1005
817 1006 Misc Functionality
818 1007 ------------------
819 1008
820 1009 estimate_compression_context_size(CompressionParameters)
821 1010 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
822 1011
823 1012 Given a ``CompressionParameters`` struct, estimate the memory size required
824 1013 to perform compression.
825 1014
826 1015 estimate_decompression_context_size()
827 1016 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
828 1017
829 1018 Estimate the memory size requirements for a decompressor instance.
830 1019
831 1020 Constants
832 1021 ---------
833 1022
834 1023 The following module constants/attributes are exposed:
835 1024
836 1025 ZSTD_VERSION
837 1026 This module attribute exposes a 3-tuple of the Zstandard version. e.g.
838 1027 ``(1, 0, 0)``
839 1028 MAX_COMPRESSION_LEVEL
840 1029 Integer max compression level accepted by compression functions
841 1030 COMPRESSION_RECOMMENDED_INPUT_SIZE
842 1031 Recommended chunk size to feed to compressor functions
843 1032 COMPRESSION_RECOMMENDED_OUTPUT_SIZE
844 1033 Recommended chunk size for compression output
845 1034 DECOMPRESSION_RECOMMENDED_INPUT_SIZE
846 1035    Recommended chunk size to feed into decompressor functions
847 1036 DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE
848 1037 Recommended chunk size for decompression output
849 1038
850 1039 FRAME_HEADER
851 1040 bytes containing header of the Zstandard frame
852 1041 MAGIC_NUMBER
853 1042 Frame header as an integer
854 1043
855 1044 WINDOWLOG_MIN
856 1045 Minimum value for compression parameter
857 1046 WINDOWLOG_MAX
858 1047 Maximum value for compression parameter
859 1048 CHAINLOG_MIN
860 1049 Minimum value for compression parameter
861 1050 CHAINLOG_MAX
862 1051 Maximum value for compression parameter
863 1052 HASHLOG_MIN
864 1053 Minimum value for compression parameter
865 1054 HASHLOG_MAX
866 1055 Maximum value for compression parameter
867 1056 SEARCHLOG_MIN
868 1057 Minimum value for compression parameter
869 1058 SEARCHLOG_MAX
870 1059 Maximum value for compression parameter
871 1060 SEARCHLENGTH_MIN
872 1061 Minimum value for compression parameter
873 1062 SEARCHLENGTH_MAX
874 1063 Maximum value for compression parameter
875 1064 TARGETLENGTH_MIN
876 1065 Minimum value for compression parameter
877 1066 TARGETLENGTH_MAX
878 1067 Maximum value for compression parameter
879 1068 STRATEGY_FAST
880 1069 Compression strategy
881 1070 STRATEGY_DFAST
882 1071 Compression strategy
883 1072 STRATEGY_GREEDY
884 1073 Compression strategy
885 1074 STRATEGY_LAZY
886 1075 Compression strategy
887 1076 STRATEGY_LAZY2
888 1077 Compression strategy
889 1078 STRATEGY_BTLAZY2
890 1079 Compression strategy
891 1080 STRATEGY_BTOPT
892 1081 Compression strategy
893 1082
894 1083 Performance Considerations
895 1084 --------------------------
896 1085
897 1086 The ``ZstdCompressor`` and ``ZstdDecompressor`` types maintain state to a
898 1087 persistent compression or decompression *context*. Reusing a ``ZstdCompressor``
899 1088 or ``ZstdDecompressor`` instance for multiple operations is faster than
900 1089 instantiating a new ``ZstdCompressor`` or ``ZstdDecompressor`` for each
901 1090 operation. The differences are magnified as the size of data decreases. For
902 1091 example, the difference between *context* reuse and non-reuse for 100,000
903 1092 100 byte inputs will be significant (possibly over 10x faster to reuse contexts)
904 1093 whereas 10 1,000,000 byte inputs will be more similar in speed (because the
905 1094 time spent doing compression dwarfs time spent creating new *contexts*).
906 1095
1096 Buffer Types
1097 ------------
1098
1099 The API exposes a handful of custom types for interfacing with memory buffers.
1100 The primary goal of these types is to facilitate efficient multi-object
1101 operations.
1102
1103 The essential idea is to have a single memory allocation provide backing
1104 storage for multiple logical objects. This has 2 main advantages: fewer
1105 allocations and optimal memory access patterns. This avoids having to allocate
1106 a Python object for each logical object and furthermore ensures that access of
1107 data for objects can be sequential (read: fast) in memory.
1108
1109 BufferWithSegments
1110 ^^^^^^^^^^^^^^^^^^
1111
1112 The ``BufferWithSegments`` type represents a memory buffer containing N
1113 discrete items of known lengths (segments). It is essentially a fixed size
1114 memory address and an array of 2-tuples of ``(offset, length)`` 64-bit
1115 unsigned native endian integers defining the byte offset and length of each
1116 segment within the buffer.
1117
1118 Instances behave like containers.
1119
1120 ``len()`` returns the number of segments within the instance.
1121
1122 ``o[index]`` or ``__getitem__`` obtains a ``BufferSegment`` representing an
1123 individual segment within the backing buffer. That returned object references
1124 (not copies) memory. This means that iterating all objects doesn't copy
1125 data within the buffer.
1126
1127 The ``.size`` attribute contains the total size in bytes of the backing
1128 buffer.
1129
1130 Instances conform to the buffer protocol. So a reference to the backing bytes
1131 can be obtained via ``memoryview(o)``. A *copy* of the backing bytes can also
1132 be obtained via ``.tobytes()``.
1133
1134 The ``.segments`` attribute exposes the array of ``(offset, length)`` for
1135 segments within the buffer. It is a ``BufferSegments`` type.
1136
1137 BufferSegment
1138 ^^^^^^^^^^^^^
1139
1140 The ``BufferSegment`` type represents a segment within a ``BufferWithSegments``.
1141 It is essentially a reference to N bytes within a ``BufferWithSegments``.
1142
1143 ``len()`` returns the length of the segment in bytes.
1144
1145 ``.offset`` contains the byte offset of this segment within its parent
1146 ``BufferWithSegments`` instance.
1147
1148 The object conforms to the buffer protocol. ``.tobytes()`` can be called to
1149 obtain a ``bytes`` instance with a copy of the backing bytes.
1150
1151 BufferSegments
1152 ^^^^^^^^^^^^^^
1153
1154 This type represents an array of ``(offset, length)`` integers defining segments
1155 within a ``BufferWithSegments``.
1156
1157 The array members are 64-bit unsigned integers using host/native bit order.
1158
1159 Instances conform to the buffer protocol.
1160
1161 BufferWithSegmentsCollection
1162 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1163
1164 The ``BufferWithSegmentsCollection`` type represents a virtual spanning view
1165 of multiple ``BufferWithSegments`` instances.
1166
1167 Instances are constructed from 1 or more ``BufferWithSegments`` instances. The
1168 resulting object behaves like an ordered sequence whose members are the
1169 segments within each ``BufferWithSegments``.
1170
1171 ``len()`` returns the number of segments within all ``BufferWithSegments``
1172 instances.
1173
1174 ``o[index]`` and ``__getitem__(index)`` return the ``BufferSegment`` at
1175 that offset as if all ``BufferWithSegments`` instances were a single
1176 entity.
1177
1178 If the object is composed of 2 ``BufferWithSegments`` instances with the
1179 first having 2 segments and the second having 3 segments, then ``b[0]``
1180 and ``b[1]`` access segments in the first object and ``b[2]``, ``b[3]``,
1181 and ``b[4]`` access segments from the second.
1182
1183 Choosing an API
1184 ===============
1185
1186 There are multiple APIs for performing compression and decompression. This is
1187 because different applications have different needs and the library wants to
1188 facilitate optimal use in as many use cases as possible.
1189
1190 From a high-level, APIs are divided into *one-shot* and *streaming*. See
1191 the ``Concepts`` section for a description of how these are different at
1192 the C layer.
1193
1194 The *one-shot* APIs are useful for small data, where the input or output
1195 size is known. (The size can come from a buffer length, file size, or
1196 stored in the zstd frame header.) A limitation of the *one-shot* APIs is that
1197 input and output must fit in memory simultaneously. For say a 4 GB input,
1198 this is often not feasible.
1199
1200 The *one-shot* APIs also perform all work as a single operation. So, if you
1201 feed it large input, it could take a long time for the function to return.
1202
1203 The streaming APIs do not have the limitations of the simple API. But the
1204 price you pay for this flexibility is that they are more complex than a
1205 single function call.
1206
1207 The streaming APIs put the caller in control of compression and decompression
1208 behavior by allowing them to directly control either the input or output side
1209 of the operation.
1210
1211 With the *streaming input*, *compressor*, and *decompressor* APIs, the caller
1212 has full control over the input to the compression or decompression stream.
1213 They can directly choose when new data is operated on.
1214
1215 With the *streaming output* APIs, the caller has full control over the output
1216 of the compression or decompression stream. It can choose when to receive
1217 new data.
1218
1219 When using the *streaming* APIs that operate on file-like or stream objects,
1220 it is important to consider what happens in that object when I/O is requested.
1221 There is potential for long pauses as data is read or written from the
1222 underlying stream (say from interacting with a filesystem or network). This
1223 could add considerable overhead.
1224
1225 Concepts
1226 ========
1227
1228 It is important to have a basic understanding of how Zstandard works in order
1229 to optimally use this library. In addition, there are some low-level Python
1230 concepts that are worth explaining to aid understanding. This section aims to
1231 provide that knowledge.
1232
1233 Zstandard Frames and Compression Format
1234 ---------------------------------------
1235
1236 Compressed zstandard data almost always exists within a container called a
1237 *frame*. (For the technically curious, see the
1238 `specification <https://github.com/facebook/zstd/blob/3bee41a70eaf343fbcae3637b3f6edbe52f35ed8/doc/zstd_compression_format.md>`_.)
1239
1240 The frame contains a header and optional trailer. The header contains a
1241 magic number to self-identify as a zstd frame and a description of the
1242 compressed data that follows.
1243
1244 Among other things, the frame *optionally* contains the size of the
1245 decompressed data the frame represents, a 32-bit checksum of the
1246 decompressed data (to facilitate verification during decompression),
1247 and the ID of the dictionary used to compress the data.
1248
1249 Storing the original content size in the frame (``write_content_size=True``
1250 to ``ZstdCompressor``) is important for performance in some scenarios. Having
1251 the decompressed size stored there (or storing it elsewhere) allows
1252 decompression to perform a single memory allocation that is exactly sized to
1253 the output. This is faster than continuously growing a memory buffer to hold
1254 output.
1255
1256 Compression and Decompression Contexts
1257 --------------------------------------
1258
1259 In order to perform a compression or decompression operation with the zstd
1260 C API, you need what's called a *context*. A context essentially holds
1261 configuration and state for a compression or decompression operation. For
1262 example, a compression context holds the configured compression level.
1263
1264 Contexts can be reused for multiple operations. Since creating and
1265 destroying contexts is not free, there are performance advantages to
1266 reusing contexts.
1267
1268 The ``ZstdCompressor`` and ``ZstdDecompressor`` types are essentially
1269 wrappers around these contexts in the zstd C API.
1270
1271 One-shot And Streaming Operations
1272 ---------------------------------
1273
1274 A compression or decompression operation can either be performed as a
1275 single *one-shot* operation or as a continuous *streaming* operation.
1276
1277 In one-shot mode (the *simple* APIs provided by the Python interface),
1278 **all** input is handed to the compressor or decompressor as a single buffer
1279 and **all** output is returned as a single buffer.
1280
1281 In streaming mode, input is delivered to the compressor or decompressor as
1282 a series of chunks via multiple function calls. Likewise, output is
1283 obtained in chunks as well.
1284
1285 Streaming operations require an additional *stream* object to be created
1286 to track the operation. These are logical extensions of *context*
1287 instances.
1288
1289 There are advantages and disadvantages to each mode of operation. There
1290 are scenarios where certain modes can't be used. See the
1291 ``Choosing an API`` section for more.
1292
1293 Dictionaries
1294 ------------
1295
1296 A compression *dictionary* is essentially data used to seed the compressor
1297 state so it can achieve better compression. The idea is that if you are
1298 compressing a lot of similar pieces of data (e.g. JSON documents or anything
1299 sharing similar structure), then you can find common patterns across multiple
1300 objects then leverage those common patterns during compression and
1301 decompression operations to achieve better compression ratios.
1302
1303 Dictionary compression is generally only useful for small inputs - data no
1304 larger than a few kilobytes. The upper bound on this range is highly dependent
1305 on the input data and the dictionary.
1306
1307 Python Buffer Protocol
1308 ----------------------
1309
1310 Many functions in the library operate on objects that implement Python's
1311 `buffer protocol <https://docs.python.org/3.6/c-api/buffer.html>`_.
1312
1313 The *buffer protocol* is an internal implementation detail of a Python
1314 type that allows instances of that type (objects) to be exposed as a raw
1315 pointer (or buffer) in the C API. In other words, it allows objects to be
1316 exposed as an array of bytes.
1317
1318 From the perspective of the C API, objects implementing the *buffer protocol*
1319 all look the same: they are just a pointer to a memory address of a defined
1320 length. This allows the C API to be largely type agnostic when accessing their
1321 data. This allows custom types to be passed in without first converting them
1322 to a specific type.
1323
1324 Many Python types implement the buffer protocol. These include ``bytes``
1325 (``str`` on Python 2), ``bytearray``, ``array.array``, ``io.BytesIO``,
1326 ``mmap.mmap``, and ``memoryview``.
1327
1328 ``python-zstandard`` APIs that accept objects conforming to the buffer
1329 protocol require that the buffer is *C contiguous* and has a single
1330 dimension (``ndim==1``). This is usually the case. An example of where it
1331 is not is a Numpy matrix type.
1332
1333 Requiring Output Sizes for Non-Streaming Decompression APIs
1334 -----------------------------------------------------------
1335
1336 Non-streaming decompression APIs require that either the output size is
1337 explicitly defined (either in the zstd frame header or passed into the
1338 function) or that a max output size is specified. This restriction is for
1339 your safety.
1340
1341 The *one-shot* decompression APIs store the decompressed result in a
1342 single buffer. This means that a buffer needs to be pre-allocated to hold
1343 the result. If the decompressed size is not known, then there is no universal
1344 good default size to use. Any default will fail or will be highly sub-optimal
1345 in some scenarios (it will either be too small or will put stress on the
1346 memory allocator to allocate a too large block).
1347
1348 A *helpful* API may retry decompression with buffers of increasing size.
1349 While useful, there are obvious performance disadvantages, namely redoing
1350 decompression N times until it works. In addition, there is a security
1351 concern. Say the input came from highly compressible data, like 1 GB of the
1352 same byte value. The output size could be several magnitudes larger than the
1353 input size. An input of <100KB could decompress to >1GB. Without a bounds
1354 restriction on the decompressed size, certain inputs could exhaust all system
1355 memory. That's not good and is why the maximum output size is limited.
1356
907 1357 Note on Zstandard's *Experimental* API
908 1358 ======================================
909 1359
910 1360 Many of the Zstandard APIs used by this module are marked as *experimental*
911 1361 within the Zstandard project. This includes a large number of useful
912 1362 features, such as compression and frame parameters and parts of dictionary
913 1363 compression.
914 1364
915 1365 It is unclear how Zstandard's C API will evolve over time, especially with
916 1366 regards to this *experimental* functionality. We will try to maintain
917 1367 backwards compatibility at the Python API level. However, we cannot
918 1368 guarantee this for things not under our control.
919 1369
920 1370 Since a copy of the Zstandard source code is distributed with this
921 1371 module and since we compile against it, the behavior of a specific
922 1372 version of this module should be constant for all of time. So if you
923 1373 pin the version of this module used in your projects (which is a Python
924 1374 best practice), you should be buffered from unwanted future changes.
925 1375
926 1376 Donate
927 1377 ======
928 1378
929 1379 A lot of time has been invested into this project by the author.
930 1380
931 1381 If you find this project useful and would like to thank the author for
932 1382 their work, consider donating some money. Any amount is appreciated.
933 1383
934 1384 .. image:: https://www.paypalobjects.com/en_US/i/btn/btn_donate_LG.gif
935 1385 :target: https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=gregory%2eszorc%40gmail%2ecom&lc=US&item_name=python%2dzstandard&currency_code=USD&bn=PP%2dDonationsBF%3abtn_donate_LG%2egif%3aNonHosted
936 1386 :alt: Donate via PayPal
937 1387
938 1388 .. |ci-status| image:: https://travis-ci.org/indygreg/python-zstandard.svg?branch=master
939 1389 :target: https://travis-ci.org/indygreg/python-zstandard
940 1390
941 1391 .. |win-ci-status| image:: https://ci.appveyor.com/api/projects/status/github/indygreg/python-zstandard?svg=true
942 1392 :target: https://ci.appveyor.com/project/indygreg/python-zstandard
943 1393 :alt: Windows build status
@@ -1,248 +1,392
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
14 static char *kwlist[] = { "dict_size", "samples", "parameters", NULL };
14 static char* kwlist[] = {
15 "dict_size",
16 "samples",
17 "selectivity",
18 "level",
19 "notifications",
20 "dict_id",
21 NULL
22 };
15 23 size_t capacity;
16 24 PyObject* samples;
17 25 Py_ssize_t samplesLen;
18 PyObject* parameters = NULL;
26 unsigned selectivity = 0;
27 int level = 0;
28 unsigned notifications = 0;
29 unsigned dictID = 0;
19 30 ZDICT_params_t zparams;
20 31 Py_ssize_t sampleIndex;
21 32 Py_ssize_t sampleSize;
22 33 PyObject* sampleItem;
23 34 size_t zresult;
24 void* sampleBuffer;
35 void* sampleBuffer = NULL;
25 36 void* sampleOffset;
26 37 size_t samplesSize = 0;
27 size_t* sampleSizes;
28 void* dict;
29 ZstdCompressionDict* result;
38 size_t* sampleSizes = NULL;
39 void* dict = NULL;
40 ZstdCompressionDict* result = NULL;
30 41
31 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|O!:train_dictionary",
42 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IiII:train_dictionary",
32 43 kwlist,
33 44 &capacity,
34 45 &PyList_Type, &samples,
35 (PyObject*)&DictParametersType, &parameters)) {
46 &selectivity, &level, &notifications, &dictID)) {
36 47 return NULL;
37 48 }
38 49
39 /* Validate parameters first since it is easiest. */
40 zparams.selectivityLevel = 0;
41 zparams.compressionLevel = 0;
42 zparams.notificationLevel = 0;
43 zparams.dictID = 0;
44 zparams.reserved[0] = 0;
45 zparams.reserved[1] = 0;
50 memset(&zparams, 0, sizeof(zparams));
46 51
47 if (parameters) {
48 /* TODO validate data ranges */
49 zparams.selectivityLevel = PyLong_AsUnsignedLong(PyTuple_GetItem(parameters, 0));
50 zparams.compressionLevel = PyLong_AsLong(PyTuple_GetItem(parameters, 1));
51 zparams.notificationLevel = PyLong_AsUnsignedLong(PyTuple_GetItem(parameters, 2));
52 zparams.dictID = PyLong_AsUnsignedLong(PyTuple_GetItem(parameters, 3));
53 }
52 zparams.selectivityLevel = selectivity;
53 zparams.compressionLevel = level;
54 zparams.notificationLevel = notifications;
55 zparams.dictID = dictID;
54 56
55 57 /* Figure out the size of the raw samples */
56 58 samplesLen = PyList_Size(samples);
57 59 for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
58 60 sampleItem = PyList_GetItem(samples, sampleIndex);
59 61 if (!PyBytes_Check(sampleItem)) {
60 62 PyErr_SetString(PyExc_ValueError, "samples must be bytes");
61 63 return NULL;
62 64 }
63 65 samplesSize += PyBytes_GET_SIZE(sampleItem);
64 66 }
65 67
66 68 /* Now that we know the total size of the raw simples, we can allocate
67 69 a buffer for the raw data */
68 70 sampleBuffer = PyMem_Malloc(samplesSize);
69 71 if (!sampleBuffer) {
70 72 PyErr_NoMemory();
71 return NULL;
73 goto finally;
72 74 }
73 75 sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t));
74 76 if (!sampleSizes) {
75 PyMem_Free(sampleBuffer);
76 77 PyErr_NoMemory();
77 return NULL;
78 goto finally;
78 79 }
79 80
80 81 sampleOffset = sampleBuffer;
81 82 /* Now iterate again and assemble the samples in the buffer */
82 83 for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
83 84 sampleItem = PyList_GetItem(samples, sampleIndex);
84 85 sampleSize = PyBytes_GET_SIZE(sampleItem);
85 86 sampleSizes[sampleIndex] = sampleSize;
86 87 memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize);
87 88 sampleOffset = (char*)sampleOffset + sampleSize;
88 89 }
89 90
90 91 dict = PyMem_Malloc(capacity);
91 92 if (!dict) {
92 PyMem_Free(sampleSizes);
93 PyMem_Free(sampleBuffer);
94 93 PyErr_NoMemory();
95 return NULL;
94 goto finally;
96 95 }
97 96
97 /* TODO consider using dup2() to redirect zstd's stderr writing to a buffer */
98 Py_BEGIN_ALLOW_THREADS
98 99 zresult = ZDICT_trainFromBuffer_advanced(dict, capacity,
99 100 sampleBuffer, sampleSizes, (unsigned int)samplesLen,
100 101 zparams);
102 Py_END_ALLOW_THREADS
101 103 if (ZDICT_isError(zresult)) {
102 104 PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult));
103 105 PyMem_Free(dict);
104 PyMem_Free(sampleSizes);
105 PyMem_Free(sampleBuffer);
106 return NULL;
106 goto finally;
107 107 }
108 108
109 109 result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType);
110 110 if (!result) {
111 return NULL;
111 goto finally;
112 112 }
113 113
114 114 result->dictData = dict;
115 115 result->dictSize = zresult;
116 result->d = 0;
117 result->k = 0;
118
119 finally:
120 PyMem_Free(sampleBuffer);
121 PyMem_Free(sampleSizes);
122
116 123 return result;
117 124 }
118 125
126 ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
127 static char* kwlist[] = {
128 "dict_size",
129 "samples",
130 "k",
131 "d",
132 "notifications",
133 "dict_id",
134 "level",
135 "optimize",
136 "steps",
137 "threads",
138 NULL
139 };
140
141 size_t capacity;
142 PyObject* samples;
143 unsigned k = 0;
144 unsigned d = 0;
145 unsigned notifications = 0;
146 unsigned dictID = 0;
147 int level = 0;
148 PyObject* optimize = NULL;
149 unsigned steps = 0;
150 int threads = 0;
151 COVER_params_t params;
152 Py_ssize_t samplesLen;
153 Py_ssize_t i;
154 size_t samplesSize = 0;
155 void* sampleBuffer = NULL;
156 size_t* sampleSizes = NULL;
157 void* sampleOffset;
158 Py_ssize_t sampleSize;
159 void* dict = NULL;
160 size_t zresult;
161 ZstdCompressionDict* result = NULL;
162
163 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiOIi:train_cover_dictionary",
164 kwlist, &capacity, &PyList_Type, &samples,
165 &k, &d, &notifications, &dictID, &level, &optimize, &steps, &threads)) {
166 return NULL;
167 }
168
169 if (threads < 0) {
170 threads = cpu_count();
171 }
172
173 memset(&params, 0, sizeof(params));
174 params.k = k;
175 params.d = d;
176 params.steps = steps;
177 params.nbThreads = threads;
178 params.notificationLevel = notifications;
179 params.dictID = dictID;
180 params.compressionLevel = level;
181
182 /* Figure out total size of input samples. */
183 samplesLen = PyList_Size(samples);
184 for (i = 0; i < samplesLen; i++) {
185 PyObject* sampleItem = PyList_GET_ITEM(samples, i);
186
187 if (!PyBytes_Check(sampleItem)) {
188 PyErr_SetString(PyExc_ValueError, "samples must be bytes");
189 return NULL;
190 }
191 samplesSize += PyBytes_GET_SIZE(sampleItem);
192 }
193
194 sampleBuffer = PyMem_Malloc(samplesSize);
195 if (!sampleBuffer) {
196 PyErr_NoMemory();
197 goto finally;
198 }
199
200 sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t));
201 if (!sampleSizes) {
202 PyErr_NoMemory();
203 goto finally;
204 }
205
206 sampleOffset = sampleBuffer;
207 for (i = 0; i < samplesLen; i++) {
208 PyObject* sampleItem = PyList_GET_ITEM(samples, i);
209 sampleSize = PyBytes_GET_SIZE(sampleItem);
210 sampleSizes[i] = sampleSize;
211 memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize);
212 sampleOffset = (char*)sampleOffset + sampleSize;
213 }
214
215 dict = PyMem_Malloc(capacity);
216 if (!dict) {
217 PyErr_NoMemory();
218 goto finally;
219 }
220
221 Py_BEGIN_ALLOW_THREADS
222 if (optimize && PyObject_IsTrue(optimize)) {
223 zresult = COVER_optimizeTrainFromBuffer(dict, capacity,
224 sampleBuffer, sampleSizes, (unsigned)samplesLen, &params);
225 }
226 else {
227 zresult = COVER_trainFromBuffer(dict, capacity,
228 sampleBuffer, sampleSizes, (unsigned)samplesLen, params);
229 }
230 Py_END_ALLOW_THREADS
231
232 if (ZDICT_isError(zresult)) {
233 PyMem_Free(dict);
234 PyErr_Format(ZstdError, "cannot train dict: %s", ZDICT_getErrorName(zresult));
235 goto finally;
236 }
237
238 result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType);
239 if (!result) {
240 PyMem_Free(dict);
241 goto finally;
242 }
243
244 result->dictData = dict;
245 result->dictSize = zresult;
246 result->d = params.d;
247 result->k = params.k;
248
249 finally:
250 PyMem_Free(sampleBuffer);
251 PyMem_Free(sampleSizes);
252
253 return result;
254 }
119 255
120 256 PyDoc_STRVAR(ZstdCompressionDict__doc__,
121 257 "ZstdCompressionDict(data) - Represents a computed compression dictionary\n"
122 258 "\n"
123 259 "This type holds the results of a computed Zstandard compression dictionary.\n"
124 260 "Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n"
125 261 "obtained from another source into the constructor.\n"
126 262 );
127 263
128 264 static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) {
129 265 const char* source;
130 266 Py_ssize_t sourceSize;
131 267
132 268 self->dictData = NULL;
133 269 self->dictSize = 0;
134 270
135 271 #if PY_MAJOR_VERSION >= 3
136 272 if (!PyArg_ParseTuple(args, "y#:ZstdCompressionDict",
137 273 #else
138 274 if (!PyArg_ParseTuple(args, "s#:ZstdCompressionDict",
139 275 #endif
140 276 &source, &sourceSize)) {
141 277 return -1;
142 278 }
143 279
144 280 self->dictData = PyMem_Malloc(sourceSize);
145 281 if (!self->dictData) {
146 282 PyErr_NoMemory();
147 283 return -1;
148 284 }
149 285
150 286 memcpy(self->dictData, source, sourceSize);
151 287 self->dictSize = sourceSize;
152 288
153 289 return 0;
154 290 }
155 291
156 292 static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
157 293 if (self->dictData) {
158 294 PyMem_Free(self->dictData);
159 295 self->dictData = NULL;
160 296 }
161 297
162 298 PyObject_Del(self);
163 299 }
164 300
165 301 static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) {
166 302 unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize);
167 303
168 304 return PyLong_FromLong(dictID);
169 305 }
170 306
171 307 static PyObject* ZstdCompressionDict_as_bytes(ZstdCompressionDict* self) {
172 308 return PyBytes_FromStringAndSize(self->dictData, self->dictSize);
173 309 }
174 310
175 311 static PyMethodDef ZstdCompressionDict_methods[] = {
176 312 { "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS,
177 313 PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") },
178 314 { "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS,
179 315 PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") },
180 316 { NULL, NULL }
181 317 };
182 318
319 static PyMemberDef ZstdCompressionDict_members[] = {
320 { "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY,
321 "segment size" },
322 { "d", T_UINT, offsetof(ZstdCompressionDict, d), READONLY,
323 "dmer size" },
324 { NULL }
325 };
326
183 327 static Py_ssize_t ZstdCompressionDict_length(ZstdCompressionDict* self) {
184 328 return self->dictSize;
185 329 }
186 330
187 331 static PySequenceMethods ZstdCompressionDict_sq = {
188 332 (lenfunc)ZstdCompressionDict_length, /* sq_length */
189 333 0, /* sq_concat */
190 334 0, /* sq_repeat */
191 335 0, /* sq_item */
192 336 0, /* sq_ass_item */
193 337 0, /* sq_contains */
194 338 0, /* sq_inplace_concat */
195 339 0 /* sq_inplace_repeat */
196 340 };
197 341
198 342 PyTypeObject ZstdCompressionDictType = {
199 343 PyVarObject_HEAD_INIT(NULL, 0)
200 344 "zstd.ZstdCompressionDict", /* tp_name */
201 345 sizeof(ZstdCompressionDict), /* tp_basicsize */
202 346 0, /* tp_itemsize */
203 347 (destructor)ZstdCompressionDict_dealloc, /* tp_dealloc */
204 348 0, /* tp_print */
205 349 0, /* tp_getattr */
206 350 0, /* tp_setattr */
207 351 0, /* tp_compare */
208 352 0, /* tp_repr */
209 353 0, /* tp_as_number */
210 354 &ZstdCompressionDict_sq, /* tp_as_sequence */
211 355 0, /* tp_as_mapping */
212 356 0, /* tp_hash */
213 357 0, /* tp_call */
214 358 0, /* tp_str */
215 359 0, /* tp_getattro */
216 360 0, /* tp_setattro */
217 361 0, /* tp_as_buffer */
218 362 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
219 363 ZstdCompressionDict__doc__, /* tp_doc */
220 364 0, /* tp_traverse */
221 365 0, /* tp_clear */
222 366 0, /* tp_richcompare */
223 367 0, /* tp_weaklistoffset */
224 368 0, /* tp_iter */
225 369 0, /* tp_iternext */
226 370 ZstdCompressionDict_methods, /* tp_methods */
227 0, /* tp_members */
371 ZstdCompressionDict_members, /* tp_members */
228 372 0, /* tp_getset */
229 373 0, /* tp_base */
230 374 0, /* tp_dict */
231 375 0, /* tp_descr_get */
232 376 0, /* tp_descr_set */
233 377 0, /* tp_dictoffset */
234 378 (initproc)ZstdCompressionDict_init, /* tp_init */
235 379 0, /* tp_alloc */
236 380 PyType_GenericNew, /* tp_new */
237 381 };
238 382
239 383 void compressiondict_module_init(PyObject* mod) {
240 384 Py_TYPE(&ZstdCompressionDictType) = &PyType_Type;
241 385 if (PyType_Ready(&ZstdCompressionDictType) < 0) {
242 386 return;
243 387 }
244 388
245 389 Py_INCREF((PyObject*)&ZstdCompressionDictType);
246 390 PyModule_AddObject(mod, "ZstdCompressionDict",
247 391 (PyObject*)&ZstdCompressionDictType);
248 392 }
@@ -1,220 +1,253
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 void ztopy_compression_parameters(CompressionParametersObject* params, ZSTD_compressionParameters* zparams) {
12 12 zparams->windowLog = params->windowLog;
13 13 zparams->chainLog = params->chainLog;
14 14 zparams->hashLog = params->hashLog;
15 15 zparams->searchLog = params->searchLog;
16 16 zparams->searchLength = params->searchLength;
17 17 zparams->targetLength = params->targetLength;
18 18 zparams->strategy = params->strategy;
19 19 }
20 20
21 21 CompressionParametersObject* get_compression_parameters(PyObject* self, PyObject* args) {
22 22 int compressionLevel;
23 23 unsigned PY_LONG_LONG sourceSize = 0;
24 24 Py_ssize_t dictSize = 0;
25 25 ZSTD_compressionParameters params;
26 26 CompressionParametersObject* result;
27 27
28 28 if (!PyArg_ParseTuple(args, "i|Kn:get_compression_parameters",
29 29 &compressionLevel, &sourceSize, &dictSize)) {
30 30 return NULL;
31 31 }
32 32
33 33 params = ZSTD_getCParams(compressionLevel, sourceSize, dictSize);
34 34
35 35 result = PyObject_New(CompressionParametersObject, &CompressionParametersType);
36 36 if (!result) {
37 37 return NULL;
38 38 }
39 39
40 40 result->windowLog = params.windowLog;
41 41 result->chainLog = params.chainLog;
42 42 result->hashLog = params.hashLog;
43 43 result->searchLog = params.searchLog;
44 44 result->searchLength = params.searchLength;
45 45 result->targetLength = params.targetLength;
46 46 result->strategy = params.strategy;
47 47
48 48 return result;
49 49 }
50 50
51 51 static int CompressionParameters_init(CompressionParametersObject* self, PyObject* args, PyObject* kwargs) {
52 52 static char* kwlist[] = {
53 53 "window_log",
54 54 "chain_log",
55 55 "hash_log",
56 56 "search_log",
57 57 "search_length",
58 58 "target_length",
59 59 "strategy",
60 60 NULL
61 61 };
62 62
63 63 unsigned windowLog;
64 64 unsigned chainLog;
65 65 unsigned hashLog;
66 66 unsigned searchLog;
67 67 unsigned searchLength;
68 68 unsigned targetLength;
69 69 unsigned strategy;
70 ZSTD_compressionParameters params;
71 size_t zresult;
70 72
71 73 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "IIIIIII:CompressionParameters",
72 74 kwlist, &windowLog, &chainLog, &hashLog, &searchLog, &searchLength,
73 75 &targetLength, &strategy)) {
74 76 return -1;
75 77 }
76 78
77 79 if (windowLog < ZSTD_WINDOWLOG_MIN || windowLog > ZSTD_WINDOWLOG_MAX) {
78 80 PyErr_SetString(PyExc_ValueError, "invalid window log value");
79 81 return -1;
80 82 }
81 83
82 84 if (chainLog < ZSTD_CHAINLOG_MIN || chainLog > ZSTD_CHAINLOG_MAX) {
83 85 PyErr_SetString(PyExc_ValueError, "invalid chain log value");
84 86 return -1;
85 87 }
86 88
87 89 if (hashLog < ZSTD_HASHLOG_MIN || hashLog > ZSTD_HASHLOG_MAX) {
88 90 PyErr_SetString(PyExc_ValueError, "invalid hash log value");
89 91 return -1;
90 92 }
91 93
92 94 if (searchLog < ZSTD_SEARCHLOG_MIN || searchLog > ZSTD_SEARCHLOG_MAX) {
93 95 PyErr_SetString(PyExc_ValueError, "invalid search log value");
94 96 return -1;
95 97 }
96 98
97 99 if (searchLength < ZSTD_SEARCHLENGTH_MIN || searchLength > ZSTD_SEARCHLENGTH_MAX) {
98 100 PyErr_SetString(PyExc_ValueError, "invalid search length value");
99 101 return -1;
100 102 }
101 103
102 104 if (targetLength < ZSTD_TARGETLENGTH_MIN || targetLength > ZSTD_TARGETLENGTH_MAX) {
103 105 PyErr_SetString(PyExc_ValueError, "invalid target length value");
104 106 return -1;
105 107 }
106 108
107 109 if (strategy < ZSTD_fast || strategy > ZSTD_btopt) {
108 110 PyErr_SetString(PyExc_ValueError, "invalid strategy value");
109 111 return -1;
110 112 }
111 113
112 114 self->windowLog = windowLog;
113 115 self->chainLog = chainLog;
114 116 self->hashLog = hashLog;
115 117 self->searchLog = searchLog;
116 118 self->searchLength = searchLength;
117 119 self->targetLength = targetLength;
118 120 self->strategy = strategy;
119 121
122 ztopy_compression_parameters(self, &params);
123 zresult = ZSTD_checkCParams(params);
124
125 if (ZSTD_isError(zresult)) {
126 PyErr_Format(PyExc_ValueError, "invalid compression parameters: %s",
127 ZSTD_getErrorName(zresult));
128 return -1;
129 }
130
120 131 return 0;
121 132 }
122 133
134 PyDoc_STRVAR(CompressionParameters_estimated_compression_context_size__doc__,
135 "Estimate the size in bytes of a compression context for compression parameters\n"
136 );
137
138 PyObject* CompressionParameters_estimated_compression_context_size(CompressionParametersObject* self) {
139 ZSTD_compressionParameters params;
140
141 ztopy_compression_parameters(self, &params);
142
143 return PyLong_FromSize_t(ZSTD_estimateCCtxSize(params));
144 }
145
123 146 PyObject* estimate_compression_context_size(PyObject* self, PyObject* args) {
124 147 CompressionParametersObject* params;
125 148 ZSTD_compressionParameters zparams;
126 149 PyObject* result;
127 150
128 151 if (!PyArg_ParseTuple(args, "O!:estimate_compression_context_size",
129 152 &CompressionParametersType, &params)) {
130 153 return NULL;
131 154 }
132 155
133 156 ztopy_compression_parameters(params, &zparams);
134 157 result = PyLong_FromSize_t(ZSTD_estimateCCtxSize(zparams));
135 158 return result;
136 159 }
137 160
138 161 PyDoc_STRVAR(CompressionParameters__doc__,
139 162 "CompressionParameters: low-level control over zstd compression");
140 163
141 164 static void CompressionParameters_dealloc(PyObject* self) {
142 165 PyObject_Del(self);
143 166 }
144 167
168 static PyMethodDef CompressionParameters_methods[] = {
169 {
170 "estimated_compression_context_size",
171 (PyCFunction)CompressionParameters_estimated_compression_context_size,
172 METH_NOARGS,
173 CompressionParameters_estimated_compression_context_size__doc__
174 },
175 { NULL, NULL }
176 };
177
145 178 static PyMemberDef CompressionParameters_members[] = {
146 179 { "window_log", T_UINT,
147 180 offsetof(CompressionParametersObject, windowLog), READONLY,
148 181 "window log" },
149 182 { "chain_log", T_UINT,
150 183 offsetof(CompressionParametersObject, chainLog), READONLY,
151 184 "chain log" },
152 185 { "hash_log", T_UINT,
153 186 offsetof(CompressionParametersObject, hashLog), READONLY,
154 187 "hash log" },
155 188 { "search_log", T_UINT,
156 189 offsetof(CompressionParametersObject, searchLog), READONLY,
157 190 "search log" },
158 191 { "search_length", T_UINT,
159 192 offsetof(CompressionParametersObject, searchLength), READONLY,
160 193 "search length" },
161 194 { "target_length", T_UINT,
162 195 offsetof(CompressionParametersObject, targetLength), READONLY,
163 196 "target length" },
164 197 { "strategy", T_INT,
165 198 offsetof(CompressionParametersObject, strategy), READONLY,
166 199 "strategy" },
167 200 { NULL }
168 201 };
169 202
170 203 PyTypeObject CompressionParametersType = {
171 204 PyVarObject_HEAD_INIT(NULL, 0)
172 205 "CompressionParameters", /* tp_name */
173 206 sizeof(CompressionParametersObject), /* tp_basicsize */
174 207 0, /* tp_itemsize */
175 208 (destructor)CompressionParameters_dealloc, /* tp_dealloc */
176 209 0, /* tp_print */
177 210 0, /* tp_getattr */
178 211 0, /* tp_setattr */
179 212 0, /* tp_compare */
180 213 0, /* tp_repr */
181 214 0, /* tp_as_number */
182 215 0, /* tp_as_sequence */
183 216 0, /* tp_as_mapping */
184 217 0, /* tp_hash */
185 218 0, /* tp_call */
186 219 0, /* tp_str */
187 220 0, /* tp_getattro */
188 221 0, /* tp_setattro */
189 222 0, /* tp_as_buffer */
190 223 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
191 224 CompressionParameters__doc__, /* tp_doc */
192 225 0, /* tp_traverse */
193 226 0, /* tp_clear */
194 227 0, /* tp_richcompare */
195 228 0, /* tp_weaklistoffset */
196 229 0, /* tp_iter */
197 230 0, /* tp_iternext */
198 0, /* tp_methods */
231 CompressionParameters_methods, /* tp_methods */
199 232 CompressionParameters_members, /* tp_members */
200 233 0, /* tp_getset */
201 234 0, /* tp_base */
202 235 0, /* tp_dict */
203 236 0, /* tp_descr_get */
204 237 0, /* tp_descr_set */
205 238 0, /* tp_dictoffset */
206 239 (initproc)CompressionParameters_init, /* tp_init */
207 240 0, /* tp_alloc */
208 241 PyType_GenericNew, /* tp_new */
209 242 };
210 243
211 244 void compressionparams_module_init(PyObject* mod) {
212 245 Py_TYPE(&CompressionParametersType) = &PyType_Type;
213 246 if (PyType_Ready(&CompressionParametersType) < 0) {
214 247 return;
215 248 }
216 249
217 Py_IncRef((PyObject*)&CompressionParametersType);
250 Py_INCREF(&CompressionParametersType);
218 251 PyModule_AddObject(mod, "CompressionParameters",
219 252 (PyObject*)&CompressionParametersType);
220 253 }
@@ -1,290 +1,305
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 PyDoc_STRVAR(ZstdCompresssionWriter__doc__,
14 14 """A context manager used for writing compressed output to a writer.\n"
15 15 );
16 16
17 17 static void ZstdCompressionWriter_dealloc(ZstdCompressionWriter* self) {
18 18 Py_XDECREF(self->compressor);
19 19 Py_XDECREF(self->writer);
20 20
21 if (self->cstream) {
22 ZSTD_freeCStream(self->cstream);
23 self->cstream = NULL;
24 }
25
26 21 PyObject_Del(self);
27 22 }
28 23
29 24 static PyObject* ZstdCompressionWriter_enter(ZstdCompressionWriter* self) {
30 25 if (self->entered) {
31 26 PyErr_SetString(ZstdError, "cannot __enter__ multiple times");
32 27 return NULL;
33 28 }
34 29
35 self->cstream = CStream_from_ZstdCompressor(self->compressor, self->sourceSize);
36 if (!self->cstream) {
37 return NULL;
30 if (self->compressor->mtcctx) {
31 if (init_mtcstream(self->compressor, self->sourceSize)) {
32 return NULL;
33 }
34 }
35 else {
36 if (0 != init_cstream(self->compressor, self->sourceSize)) {
37 return NULL;
38 }
38 39 }
39 40
40 41 self->entered = 1;
41 42
42 43 Py_INCREF(self);
43 44 return (PyObject*)self;
44 45 }
45 46
46 47 static PyObject* ZstdCompressionWriter_exit(ZstdCompressionWriter* self, PyObject* args) {
47 48 PyObject* exc_type;
48 49 PyObject* exc_value;
49 50 PyObject* exc_tb;
50 51 size_t zresult;
51 52
52 53 ZSTD_outBuffer output;
53 54 PyObject* res;
54 55
55 56 if (!PyArg_ParseTuple(args, "OOO:__exit__", &exc_type, &exc_value, &exc_tb)) {
56 57 return NULL;
57 58 }
58 59
59 60 self->entered = 0;
60 61
61 if (self->cstream && exc_type == Py_None && exc_value == Py_None &&
62 exc_tb == Py_None) {
62 if ((self->compressor->cstream || self->compressor->mtcctx) && exc_type == Py_None
63 && exc_value == Py_None && exc_tb == Py_None) {
63 64
64 65 output.dst = PyMem_Malloc(self->outSize);
65 66 if (!output.dst) {
66 67 return PyErr_NoMemory();
67 68 }
68 69 output.size = self->outSize;
69 70 output.pos = 0;
70 71
71 72 while (1) {
72 zresult = ZSTD_endStream(self->cstream, &output);
73 if (self->compressor->mtcctx) {
74 zresult = ZSTDMT_endStream(self->compressor->mtcctx, &output);
75 }
76 else {
77 zresult = ZSTD_endStream(self->compressor->cstream, &output);
78 }
73 79 if (ZSTD_isError(zresult)) {
74 80 PyErr_Format(ZstdError, "error ending compression stream: %s",
75 81 ZSTD_getErrorName(zresult));
76 82 PyMem_Free(output.dst);
77 83 return NULL;
78 84 }
79 85
80 86 if (output.pos) {
81 87 #if PY_MAJOR_VERSION >= 3
82 88 res = PyObject_CallMethod(self->writer, "write", "y#",
83 89 #else
84 90 res = PyObject_CallMethod(self->writer, "write", "s#",
85 91 #endif
86 92 output.dst, output.pos);
87 93 Py_XDECREF(res);
88 94 }
89 95
90 96 if (!zresult) {
91 97 break;
92 98 }
93 99
94 100 output.pos = 0;
95 101 }
96 102
97 103 PyMem_Free(output.dst);
98 ZSTD_freeCStream(self->cstream);
99 self->cstream = NULL;
100 104 }
101 105
102 106 Py_RETURN_FALSE;
103 107 }
104 108
105 109 static PyObject* ZstdCompressionWriter_memory_size(ZstdCompressionWriter* self) {
106 if (!self->cstream) {
110 if (!self->compressor->cstream) {
107 111 PyErr_SetString(ZstdError, "cannot determine size of an inactive compressor; "
108 112 "call when a context manager is active");
109 113 return NULL;
110 114 }
111 115
112 return PyLong_FromSize_t(ZSTD_sizeof_CStream(self->cstream));
116 return PyLong_FromSize_t(ZSTD_sizeof_CStream(self->compressor->cstream));
113 117 }
114 118
115 119 static PyObject* ZstdCompressionWriter_write(ZstdCompressionWriter* self, PyObject* args) {
116 120 const char* source;
117 121 Py_ssize_t sourceSize;
118 122 size_t zresult;
119 123 ZSTD_inBuffer input;
120 124 ZSTD_outBuffer output;
121 125 PyObject* res;
122 126 Py_ssize_t totalWrite = 0;
123 127
124 128 #if PY_MAJOR_VERSION >= 3
125 129 if (!PyArg_ParseTuple(args, "y#:write", &source, &sourceSize)) {
126 130 #else
127 131 if (!PyArg_ParseTuple(args, "s#:write", &source, &sourceSize)) {
128 132 #endif
129 133 return NULL;
130 134 }
131 135
132 136 if (!self->entered) {
133 137 PyErr_SetString(ZstdError, "compress must be called from an active context manager");
134 138 return NULL;
135 139 }
136 140
137 141 output.dst = PyMem_Malloc(self->outSize);
138 142 if (!output.dst) {
139 143 return PyErr_NoMemory();
140 144 }
141 145 output.size = self->outSize;
142 146 output.pos = 0;
143 147
144 148 input.src = source;
145 149 input.size = sourceSize;
146 150 input.pos = 0;
147 151
148 152 while ((ssize_t)input.pos < sourceSize) {
149 153 Py_BEGIN_ALLOW_THREADS
150 zresult = ZSTD_compressStream(self->cstream, &output, &input);
154 if (self->compressor->mtcctx) {
155 zresult = ZSTDMT_compressStream(self->compressor->mtcctx,
156 &output, &input);
157 }
158 else {
159 zresult = ZSTD_compressStream(self->compressor->cstream, &output, &input);
160 }
151 161 Py_END_ALLOW_THREADS
152 162
153 163 if (ZSTD_isError(zresult)) {
154 164 PyMem_Free(output.dst);
155 165 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
156 166 return NULL;
157 167 }
158 168
159 169 /* Copy data from output buffer to writer. */
160 170 if (output.pos) {
161 171 #if PY_MAJOR_VERSION >= 3
162 172 res = PyObject_CallMethod(self->writer, "write", "y#",
163 173 #else
164 174 res = PyObject_CallMethod(self->writer, "write", "s#",
165 175 #endif
166 176 output.dst, output.pos);
167 177 Py_XDECREF(res);
168 178 totalWrite += output.pos;
169 179 }
170 180 output.pos = 0;
171 181 }
172 182
173 183 PyMem_Free(output.dst);
174 184
175 185 return PyLong_FromSsize_t(totalWrite);
176 186 }
177 187
178 188 static PyObject* ZstdCompressionWriter_flush(ZstdCompressionWriter* self, PyObject* args) {
179 189 size_t zresult;
180 190 ZSTD_outBuffer output;
181 191 PyObject* res;
182 192 Py_ssize_t totalWrite = 0;
183 193
184 194 if (!self->entered) {
185 195 PyErr_SetString(ZstdError, "flush must be called from an active context manager");
186 196 return NULL;
187 197 }
188 198
189 199 output.dst = PyMem_Malloc(self->outSize);
190 200 if (!output.dst) {
191 201 return PyErr_NoMemory();
192 202 }
193 203 output.size = self->outSize;
194 204 output.pos = 0;
195 205
196 206 while (1) {
197 207 Py_BEGIN_ALLOW_THREADS
198 zresult = ZSTD_flushStream(self->cstream, &output);
208 if (self->compressor->mtcctx) {
209 zresult = ZSTDMT_flushStream(self->compressor->mtcctx, &output);
210 }
211 else {
212 zresult = ZSTD_flushStream(self->compressor->cstream, &output);
213 }
199 214 Py_END_ALLOW_THREADS
200 215
201 216 if (ZSTD_isError(zresult)) {
202 217 PyMem_Free(output.dst);
203 218 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
204 219 return NULL;
205 220 }
206 221
207 222 if (!output.pos) {
208 223 break;
209 224 }
210 225
211 226 /* Copy data from output buffer to writer. */
212 227 if (output.pos) {
213 228 #if PY_MAJOR_VERSION >= 3
214 229 res = PyObject_CallMethod(self->writer, "write", "y#",
215 230 #else
216 231 res = PyObject_CallMethod(self->writer, "write", "s#",
217 232 #endif
218 233 output.dst, output.pos);
219 234 Py_XDECREF(res);
220 235 totalWrite += output.pos;
221 236 }
222 237 output.pos = 0;
223 238 }
224 239
225 240 PyMem_Free(output.dst);
226 241
227 242 return PyLong_FromSsize_t(totalWrite);
228 243 }
229 244
230 245 static PyMethodDef ZstdCompressionWriter_methods[] = {
231 246 { "__enter__", (PyCFunction)ZstdCompressionWriter_enter, METH_NOARGS,
232 247 PyDoc_STR("Enter a compression context.") },
233 248 { "__exit__", (PyCFunction)ZstdCompressionWriter_exit, METH_VARARGS,
234 249 PyDoc_STR("Exit a compression context.") },
235 250 { "memory_size", (PyCFunction)ZstdCompressionWriter_memory_size, METH_NOARGS,
236 251 PyDoc_STR("Obtain the memory size of the underlying compressor") },
237 252 { "write", (PyCFunction)ZstdCompressionWriter_write, METH_VARARGS,
238 253 PyDoc_STR("Compress data") },
239 254 { "flush", (PyCFunction)ZstdCompressionWriter_flush, METH_NOARGS,
240 255 PyDoc_STR("Flush data and finish a zstd frame") },
241 256 { NULL, NULL }
242 257 };
243 258
244 259 PyTypeObject ZstdCompressionWriterType = {
245 260 PyVarObject_HEAD_INIT(NULL, 0)
246 261 "zstd.ZstdCompressionWriter", /* tp_name */
247 262 sizeof(ZstdCompressionWriter), /* tp_basicsize */
248 263 0, /* tp_itemsize */
249 264 (destructor)ZstdCompressionWriter_dealloc, /* tp_dealloc */
250 265 0, /* tp_print */
251 266 0, /* tp_getattr */
252 267 0, /* tp_setattr */
253 268 0, /* tp_compare */
254 269 0, /* tp_repr */
255 270 0, /* tp_as_number */
256 271 0, /* tp_as_sequence */
257 272 0, /* tp_as_mapping */
258 273 0, /* tp_hash */
259 274 0, /* tp_call */
260 275 0, /* tp_str */
261 276 0, /* tp_getattro */
262 277 0, /* tp_setattro */
263 278 0, /* tp_as_buffer */
264 279 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
265 280 ZstdCompresssionWriter__doc__, /* tp_doc */
266 281 0, /* tp_traverse */
267 282 0, /* tp_clear */
268 283 0, /* tp_richcompare */
269 284 0, /* tp_weaklistoffset */
270 285 0, /* tp_iter */
271 286 0, /* tp_iternext */
272 287 ZstdCompressionWriter_methods, /* tp_methods */
273 288 0, /* tp_members */
274 289 0, /* tp_getset */
275 290 0, /* tp_base */
276 291 0, /* tp_dict */
277 292 0, /* tp_descr_get */
278 293 0, /* tp_descr_set */
279 294 0, /* tp_dictoffset */
280 295 0, /* tp_init */
281 296 0, /* tp_alloc */
282 297 PyType_GenericNew, /* tp_new */
283 298 };
284 299
285 300 void compressionwriter_module_init(PyObject* mod) {
286 301 Py_TYPE(&ZstdCompressionWriterType) = &PyType_Type;
287 302 if (PyType_Ready(&ZstdCompressionWriterType) < 0) {
288 303 return;
289 304 }
290 305 }
@@ -1,250 +1,258
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 PyDoc_STRVAR(ZstdCompressionObj__doc__,
14 14 "Perform compression using a standard library compatible API.\n"
15 15 );
16 16
17 17 static void ZstdCompressionObj_dealloc(ZstdCompressionObj* self) {
18 18 PyMem_Free(self->output.dst);
19 19 self->output.dst = NULL;
20 20
21 if (self->cstream) {
22 ZSTD_freeCStream(self->cstream);
23 self->cstream = NULL;
24 }
25
26 21 Py_XDECREF(self->compressor);
27 22
28 23 PyObject_Del(self);
29 24 }
30 25
31 26 static PyObject* ZstdCompressionObj_compress(ZstdCompressionObj* self, PyObject* args) {
32 27 const char* source;
33 28 Py_ssize_t sourceSize;
34 29 ZSTD_inBuffer input;
35 30 size_t zresult;
36 31 PyObject* result = NULL;
37 32 Py_ssize_t resultSize = 0;
38 33
39 34 if (self->finished) {
40 35 PyErr_SetString(ZstdError, "cannot call compress() after compressor finished");
41 36 return NULL;
42 37 }
43 38
44 39 #if PY_MAJOR_VERSION >= 3
45 40 if (!PyArg_ParseTuple(args, "y#:compress", &source, &sourceSize)) {
46 41 #else
47 42 if (!PyArg_ParseTuple(args, "s#:compress", &source, &sourceSize)) {
48 43 #endif
49 44 return NULL;
50 45 }
51 46
52 47 input.src = source;
53 48 input.size = sourceSize;
54 49 input.pos = 0;
55 50
56 51 while ((ssize_t)input.pos < sourceSize) {
57 52 Py_BEGIN_ALLOW_THREADS
58 zresult = ZSTD_compressStream(self->cstream, &self->output, &input);
53 if (self->compressor->mtcctx) {
54 zresult = ZSTDMT_compressStream(self->compressor->mtcctx,
55 &self->output, &input);
56 }
57 else {
58 zresult = ZSTD_compressStream(self->compressor->cstream, &self->output, &input);
59 }
59 60 Py_END_ALLOW_THREADS
60 61
61 62 if (ZSTD_isError(zresult)) {
62 63 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
63 64 return NULL;
64 65 }
65 66
66 67 if (self->output.pos) {
67 68 if (result) {
68 69 resultSize = PyBytes_GET_SIZE(result);
69 70 if (-1 == _PyBytes_Resize(&result, resultSize + self->output.pos)) {
70 71 return NULL;
71 72 }
72 73
73 74 memcpy(PyBytes_AS_STRING(result) + resultSize,
74 75 self->output.dst, self->output.pos);
75 76 }
76 77 else {
77 78 result = PyBytes_FromStringAndSize(self->output.dst, self->output.pos);
78 79 if (!result) {
79 80 return NULL;
80 81 }
81 82 }
82 83
83 84 self->output.pos = 0;
84 85 }
85 86 }
86 87
87 88 if (result) {
88 89 return result;
89 90 }
90 91 else {
91 92 return PyBytes_FromString("");
92 93 }
93 94 }
94 95
95 96 static PyObject* ZstdCompressionObj_flush(ZstdCompressionObj* self, PyObject* args) {
96 97 int flushMode = compressorobj_flush_finish;
97 98 size_t zresult;
98 99 PyObject* result = NULL;
99 100 Py_ssize_t resultSize = 0;
100 101
101 102 if (!PyArg_ParseTuple(args, "|i:flush", &flushMode)) {
102 103 return NULL;
103 104 }
104 105
105 106 if (flushMode != compressorobj_flush_finish && flushMode != compressorobj_flush_block) {
106 107 PyErr_SetString(PyExc_ValueError, "flush mode not recognized");
107 108 return NULL;
108 109 }
109 110
110 111 if (self->finished) {
111 112 PyErr_SetString(ZstdError, "compressor object already finished");
112 113 return NULL;
113 114 }
114 115
115 116 assert(self->output.pos == 0);
116 117
117 118 if (flushMode == compressorobj_flush_block) {
118 119 /* The output buffer is of size ZSTD_CStreamOutSize(), which is
119 120 guaranteed to hold a full block. */
120 121 Py_BEGIN_ALLOW_THREADS
121 zresult = ZSTD_flushStream(self->cstream, &self->output);
122 if (self->compressor->mtcctx) {
123 zresult = ZSTDMT_flushStream(self->compressor->mtcctx, &self->output);
124 }
125 else {
126 zresult = ZSTD_flushStream(self->compressor->cstream, &self->output);
127 }
122 128 Py_END_ALLOW_THREADS
123 129
124 130 if (ZSTD_isError(zresult)) {
125 131 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
126 132 return NULL;
127 133 }
128 134
129 135 /* Output buffer is guaranteed to hold full block. */
130 136 assert(zresult == 0);
131 137
132 138 if (self->output.pos) {
133 139 result = PyBytes_FromStringAndSize(self->output.dst, self->output.pos);
134 140 if (!result) {
135 141 return NULL;
136 142 }
137 143 }
138 144
139 145 self->output.pos = 0;
140 146
141 147 if (result) {
142 148 return result;
143 149 }
144 150 else {
145 151 return PyBytes_FromString("");
146 152 }
147 153 }
148 154
149 155 assert(flushMode == compressorobj_flush_finish);
150 156 self->finished = 1;
151 157
152 158 while (1) {
153 zresult = ZSTD_endStream(self->cstream, &self->output);
159 if (self->compressor->mtcctx) {
160 zresult = ZSTDMT_endStream(self->compressor->mtcctx, &self->output);
161 }
162 else {
163 zresult = ZSTD_endStream(self->compressor->cstream, &self->output);
164 }
154 165 if (ZSTD_isError(zresult)) {
155 166 PyErr_Format(ZstdError, "error ending compression stream: %s",
156 167 ZSTD_getErrorName(zresult));
157 168 return NULL;
158 169 }
159 170
160 171 if (self->output.pos) {
161 172 if (result) {
162 173 resultSize = PyBytes_GET_SIZE(result);
163 174 if (-1 == _PyBytes_Resize(&result, resultSize + self->output.pos)) {
164 175 return NULL;
165 176 }
166 177
167 178 memcpy(PyBytes_AS_STRING(result) + resultSize,
168 179 self->output.dst, self->output.pos);
169 180 }
170 181 else {
171 182 result = PyBytes_FromStringAndSize(self->output.dst, self->output.pos);
172 183 if (!result) {
173 184 return NULL;
174 185 }
175 186 }
176 187
177 188 self->output.pos = 0;
178 189 }
179 190
180 191 if (!zresult) {
181 192 break;
182 193 }
183 194 }
184 195
185 ZSTD_freeCStream(self->cstream);
186 self->cstream = NULL;
187
188 196 if (result) {
189 197 return result;
190 198 }
191 199 else {
192 200 return PyBytes_FromString("");
193 201 }
194 202 }
195 203
196 204 static PyMethodDef ZstdCompressionObj_methods[] = {
197 205 { "compress", (PyCFunction)ZstdCompressionObj_compress, METH_VARARGS,
198 206 PyDoc_STR("compress data") },
199 207 { "flush", (PyCFunction)ZstdCompressionObj_flush, METH_VARARGS,
200 208 PyDoc_STR("finish compression operation") },
201 209 { NULL, NULL }
202 210 };
203 211
204 212 PyTypeObject ZstdCompressionObjType = {
205 213 PyVarObject_HEAD_INIT(NULL, 0)
206 214 "zstd.ZstdCompressionObj", /* tp_name */
207 215 sizeof(ZstdCompressionObj), /* tp_basicsize */
208 216 0, /* tp_itemsize */
209 217 (destructor)ZstdCompressionObj_dealloc, /* tp_dealloc */
210 218 0, /* tp_print */
211 219 0, /* tp_getattr */
212 220 0, /* tp_setattr */
213 221 0, /* tp_compare */
214 222 0, /* tp_repr */
215 223 0, /* tp_as_number */
216 224 0, /* tp_as_sequence */
217 225 0, /* tp_as_mapping */
218 226 0, /* tp_hash */
219 227 0, /* tp_call */
220 228 0, /* tp_str */
221 229 0, /* tp_getattro */
222 230 0, /* tp_setattro */
223 231 0, /* tp_as_buffer */
224 232 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
225 233 ZstdCompressionObj__doc__, /* tp_doc */
226 234 0, /* tp_traverse */
227 235 0, /* tp_clear */
228 236 0, /* tp_richcompare */
229 237 0, /* tp_weaklistoffset */
230 238 0, /* tp_iter */
231 239 0, /* tp_iternext */
232 240 ZstdCompressionObj_methods, /* tp_methods */
233 241 0, /* tp_members */
234 242 0, /* tp_getset */
235 243 0, /* tp_base */
236 244 0, /* tp_dict */
237 245 0, /* tp_descr_get */
238 246 0, /* tp_descr_set */
239 247 0, /* tp_dictoffset */
240 248 0, /* tp_init */
241 249 0, /* tp_alloc */
242 250 PyType_GenericNew, /* tp_new */
243 251 };
244 252
245 253 void compressobj_module_init(PyObject* module) {
246 254 Py_TYPE(&ZstdCompressionObjType) = &PyType_Type;
247 255 if (PyType_Ready(&ZstdCompressionObjType) < 0) {
248 256 return;
249 257 }
250 258 }
This diff has been collapsed as it changes many lines, (957 lines changed) Show them Hide them
@@ -1,791 +1,1544
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 #include "pool.h"
10 11
11 12 extern PyObject* ZstdError;
12 13
13 int populate_cdict(ZstdCompressor* compressor, void* dictData, size_t dictSize, ZSTD_parameters* zparams) {
14 int populate_cdict(ZstdCompressor* compressor, ZSTD_parameters* zparams) {
14 15 ZSTD_customMem zmem;
15 assert(!compressor->cdict);
16
17 if (compressor->cdict || !compressor->dict || !compressor->dict->dictData) {
18 return 0;
19 }
20
16 21 Py_BEGIN_ALLOW_THREADS
17 22 memset(&zmem, 0, sizeof(zmem));
18 23 compressor->cdict = ZSTD_createCDict_advanced(compressor->dict->dictData,
19 24 compressor->dict->dictSize, 1, *zparams, zmem);
20 25 Py_END_ALLOW_THREADS
21 26
22 27 if (!compressor->cdict) {
23 28 PyErr_SetString(ZstdError, "could not create compression dictionary");
24 29 return 1;
25 30 }
26 31
27 32 return 0;
28 33 }
29 34
30 35 /**
31 * Initialize a zstd CStream from a ZstdCompressor instance.
32 *
33 * Returns a ZSTD_CStream on success or NULL on failure. If NULL, a Python
34 * exception will be set.
35 */
36 ZSTD_CStream* CStream_from_ZstdCompressor(ZstdCompressor* compressor, Py_ssize_t sourceSize) {
37 ZSTD_CStream* cstream;
36 * Ensure the ZSTD_CStream on a ZstdCompressor instance is initialized.
37 *
38 * Returns 0 on success. Other value on failure. Will set a Python exception
39 * on failure.
40 */
41 int init_cstream(ZstdCompressor* compressor, unsigned long long sourceSize) {
38 42 ZSTD_parameters zparams;
39 43 void* dictData = NULL;
40 44 size_t dictSize = 0;
41 45 size_t zresult;
42 46
43 cstream = ZSTD_createCStream();
44 if (!cstream) {
45 PyErr_SetString(ZstdError, "cannot create CStream");
46 return NULL;
47 if (compressor->cstream) {
48 zresult = ZSTD_resetCStream(compressor->cstream, sourceSize);
49 if (ZSTD_isError(zresult)) {
50 PyErr_Format(ZstdError, "could not reset CStream: %s",
51 ZSTD_getErrorName(zresult));
52 return -1;
53 }
54
55 return 0;
56 }
57
58 compressor->cstream = ZSTD_createCStream();
59 if (!compressor->cstream) {
60 PyErr_SetString(ZstdError, "could not create CStream");
61 return -1;
47 62 }
48 63
49 64 if (compressor->dict) {
50 65 dictData = compressor->dict->dictData;
51 66 dictSize = compressor->dict->dictSize;
52 67 }
53 68
54 69 memset(&zparams, 0, sizeof(zparams));
55 70 if (compressor->cparams) {
56 71 ztopy_compression_parameters(compressor->cparams, &zparams.cParams);
57 72 /* Do NOT call ZSTD_adjustCParams() here because the compression params
58 73 come from the user. */
59 74 }
60 75 else {
61 76 zparams.cParams = ZSTD_getCParams(compressor->compressionLevel, sourceSize, dictSize);
62 77 }
63 78
64 79 zparams.fParams = compressor->fparams;
65 80
66 zresult = ZSTD_initCStream_advanced(cstream, dictData, dictSize, zparams, sourceSize);
81 zresult = ZSTD_initCStream_advanced(compressor->cstream, dictData, dictSize,
82 zparams, sourceSize);
67 83
68 84 if (ZSTD_isError(zresult)) {
69 ZSTD_freeCStream(cstream);
85 ZSTD_freeCStream(compressor->cstream);
86 compressor->cstream = NULL;
70 87 PyErr_Format(ZstdError, "cannot init CStream: %s", ZSTD_getErrorName(zresult));
71 return NULL;
88 return -1;
72 89 }
73 90
74 return cstream;
91 return 0;;
92 }
93
94 int init_mtcstream(ZstdCompressor* compressor, Py_ssize_t sourceSize) {
95 size_t zresult;
96 void* dictData = NULL;
97 size_t dictSize = 0;
98 ZSTD_parameters zparams;
99
100 assert(compressor->mtcctx);
101
102 if (compressor->dict) {
103 dictData = compressor->dict->dictData;
104 dictSize = compressor->dict->dictSize;
105 }
106
107 memset(&zparams, 0, sizeof(zparams));
108 if (compressor->cparams) {
109 ztopy_compression_parameters(compressor->cparams, &zparams.cParams);
110 }
111 else {
112 zparams.cParams = ZSTD_getCParams(compressor->compressionLevel, sourceSize, dictSize);
113 }
114
115 zparams.fParams = compressor->fparams;
116
117 zresult = ZSTDMT_initCStream_advanced(compressor->mtcctx, dictData, dictSize,
118 zparams, sourceSize);
119
120 if (ZSTD_isError(zresult)) {
121 PyErr_Format(ZstdError, "cannot init CStream: %s", ZSTD_getErrorName(zresult));
122 return -1;
123 }
124
125 return 0;
75 126 }
76 127
77 128 PyDoc_STRVAR(ZstdCompressor__doc__,
78 129 "ZstdCompressor(level=None, dict_data=None, compression_params=None)\n"
79 130 "\n"
80 131 "Create an object used to perform Zstandard compression.\n"
81 132 "\n"
82 133 "An instance can compress data various ways. Instances can be used multiple\n"
83 134 "times. Each compression operation will use the compression parameters\n"
84 135 "defined at construction time.\n"
85 136 "\n"
86 137 "Compression can be configured via the following names arguments:\n"
87 138 "\n"
88 139 "level\n"
89 140 " Integer compression level.\n"
90 141 "dict_data\n"
91 142 " A ``ZstdCompressionDict`` to be used to compress with dictionary data.\n"
92 143 "compression_params\n"
93 144 " A ``CompressionParameters`` instance defining low-level compression"
94 145 " parameters. If defined, this will overwrite the ``level`` argument.\n"
95 146 "write_checksum\n"
96 147 " If True, a 4 byte content checksum will be written with the compressed\n"
97 148 " data, allowing the decompressor to perform content verification.\n"
98 149 "write_content_size\n"
99 150 " If True, the decompressed content size will be included in the header of\n"
100 151 " the compressed data. This data will only be written if the compressor\n"
101 152 " knows the size of the input data.\n"
102 153 "write_dict_id\n"
103 154 " Determines whether the dictionary ID will be written into the compressed\n"
104 155 " data. Defaults to True. Only adds content to the compressed data if\n"
105 156 " a dictionary is being used.\n"
157 "threads\n"
158 " Number of threads to use to compress data concurrently. When set,\n"
159 " compression operations are performed on multiple threads. The default\n"
160 " value (0) disables multi-threaded compression. A value of ``-1`` means to\n"
161 " set the number of threads to the number of detected logical CPUs.\n"
106 162 );
107 163
108 164 static int ZstdCompressor_init(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
109 165 static char* kwlist[] = {
110 166 "level",
111 167 "dict_data",
112 168 "compression_params",
113 169 "write_checksum",
114 170 "write_content_size",
115 171 "write_dict_id",
172 "threads",
116 173 NULL
117 174 };
118 175
119 176 int level = 3;
120 177 ZstdCompressionDict* dict = NULL;
121 178 CompressionParametersObject* params = NULL;
122 179 PyObject* writeChecksum = NULL;
123 180 PyObject* writeContentSize = NULL;
124 181 PyObject* writeDictID = NULL;
182 int threads = 0;
125 183
126 self->cctx = NULL;
127 self->dict = NULL;
128 self->cparams = NULL;
129 self->cdict = NULL;
130
131 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!O!OOO:ZstdCompressor",
184 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!O!OOOi:ZstdCompressor",
132 185 kwlist, &level, &ZstdCompressionDictType, &dict,
133 186 &CompressionParametersType, &params,
134 &writeChecksum, &writeContentSize, &writeDictID)) {
187 &writeChecksum, &writeContentSize, &writeDictID, &threads)) {
135 188 return -1;
136 189 }
137 190
138 191 if (level < 1) {
139 192 PyErr_SetString(PyExc_ValueError, "level must be greater than 0");
140 193 return -1;
141 194 }
142 195
143 196 if (level > ZSTD_maxCLevel()) {
144 197 PyErr_Format(PyExc_ValueError, "level must be less than %d",
145 198 ZSTD_maxCLevel() + 1);
146 199 return -1;
147 200 }
148 201
202 if (threads < 0) {
203 threads = cpu_count();
204 }
205
206 self->threads = threads;
207
149 208 /* We create a ZSTD_CCtx for reuse among multiple operations to reduce the
150 209 overhead of each compression operation. */
151 self->cctx = ZSTD_createCCtx();
152 if (!self->cctx) {
153 PyErr_NoMemory();
154 return -1;
210 if (threads) {
211 self->mtcctx = ZSTDMT_createCCtx(threads);
212 if (!self->mtcctx) {
213 PyErr_NoMemory();
214 return -1;
215 }
216 }
217 else {
218 self->cctx = ZSTD_createCCtx();
219 if (!self->cctx) {
220 PyErr_NoMemory();
221 return -1;
222 }
155 223 }
156 224
157 225 self->compressionLevel = level;
158 226
159 227 if (dict) {
160 228 self->dict = dict;
161 229 Py_INCREF(dict);
162 230 }
163 231
164 232 if (params) {
165 233 self->cparams = params;
166 234 Py_INCREF(params);
167 235 }
168 236
169 237 memset(&self->fparams, 0, sizeof(self->fparams));
170 238
171 239 if (writeChecksum && PyObject_IsTrue(writeChecksum)) {
172 240 self->fparams.checksumFlag = 1;
173 241 }
174 242 if (writeContentSize && PyObject_IsTrue(writeContentSize)) {
175 243 self->fparams.contentSizeFlag = 1;
176 244 }
177 245 if (writeDictID && PyObject_Not(writeDictID)) {
178 246 self->fparams.noDictIDFlag = 1;
179 247 }
180 248
181 249 return 0;
182 250 }
183 251
184 252 static void ZstdCompressor_dealloc(ZstdCompressor* self) {
253 if (self->cstream) {
254 ZSTD_freeCStream(self->cstream);
255 self->cstream = NULL;
256 }
257
185 258 Py_XDECREF(self->cparams);
186 259 Py_XDECREF(self->dict);
187 260
188 261 if (self->cdict) {
189 262 ZSTD_freeCDict(self->cdict);
190 263 self->cdict = NULL;
191 264 }
192 265
193 266 if (self->cctx) {
194 267 ZSTD_freeCCtx(self->cctx);
195 268 self->cctx = NULL;
196 269 }
197 270
271 if (self->mtcctx) {
272 ZSTDMT_freeCCtx(self->mtcctx);
273 self->mtcctx = NULL;
274 }
275
198 276 PyObject_Del(self);
199 277 }
200 278
201 279 PyDoc_STRVAR(ZstdCompressor_copy_stream__doc__,
202 280 "copy_stream(ifh, ofh[, size=0, read_size=default, write_size=default])\n"
203 281 "compress data between streams\n"
204 282 "\n"
205 283 "Data will be read from ``ifh``, compressed, and written to ``ofh``.\n"
206 284 "``ifh`` must have a ``read(size)`` method. ``ofh`` must have a ``write(data)``\n"
207 285 "method.\n"
208 286 "\n"
209 287 "An optional ``size`` argument specifies the size of the source stream.\n"
210 288 "If defined, compression parameters will be tuned based on the size.\n"
211 289 "\n"
212 290 "Optional arguments ``read_size`` and ``write_size`` define the chunk sizes\n"
213 291 "of ``read()`` and ``write()`` operations, respectively. By default, they use\n"
214 292 "the default compression stream input and output sizes, respectively.\n"
215 293 );
216 294
217 295 static PyObject* ZstdCompressor_copy_stream(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
218 296 static char* kwlist[] = {
219 297 "ifh",
220 298 "ofh",
221 299 "size",
222 300 "read_size",
223 301 "write_size",
224 302 NULL
225 303 };
226 304
227 305 PyObject* source;
228 306 PyObject* dest;
229 307 Py_ssize_t sourceSize = 0;
230 308 size_t inSize = ZSTD_CStreamInSize();
231 309 size_t outSize = ZSTD_CStreamOutSize();
232 ZSTD_CStream* cstream;
233 310 ZSTD_inBuffer input;
234 311 ZSTD_outBuffer output;
235 312 Py_ssize_t totalRead = 0;
236 313 Py_ssize_t totalWrite = 0;
237 314 char* readBuffer;
238 315 Py_ssize_t readSize;
239 316 PyObject* readResult;
240 317 PyObject* res = NULL;
241 318 size_t zresult;
242 319 PyObject* writeResult;
243 320 PyObject* totalReadPy;
244 321 PyObject* totalWritePy;
245 322
246 323 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nkk:copy_stream", kwlist,
247 324 &source, &dest, &sourceSize, &inSize, &outSize)) {
248 325 return NULL;
249 326 }
250 327
251 328 if (!PyObject_HasAttrString(source, "read")) {
252 329 PyErr_SetString(PyExc_ValueError, "first argument must have a read() method");
253 330 return NULL;
254 331 }
255 332
256 333 if (!PyObject_HasAttrString(dest, "write")) {
257 334 PyErr_SetString(PyExc_ValueError, "second argument must have a write() method");
258 335 return NULL;
259 336 }
260 337
261 338 /* Prevent free on uninitialized memory in finally. */
262 339 output.dst = NULL;
263 340
264 cstream = CStream_from_ZstdCompressor(self, sourceSize);
265 if (!cstream) {
266 res = NULL;
267 goto finally;
341 if (self->mtcctx) {
342 if (init_mtcstream(self, sourceSize)) {
343 res = NULL;
344 goto finally;
345 }
346 }
347 else {
348 if (0 != init_cstream(self, sourceSize)) {
349 res = NULL;
350 goto finally;
351 }
268 352 }
269 353
270 354 output.dst = PyMem_Malloc(outSize);
271 355 if (!output.dst) {
272 356 PyErr_NoMemory();
273 357 res = NULL;
274 358 goto finally;
275 359 }
276 360 output.size = outSize;
277 361 output.pos = 0;
278 362
279 363 while (1) {
280 364 /* Try to read from source stream. */
281 365 readResult = PyObject_CallMethod(source, "read", "n", inSize);
282 366 if (!readResult) {
283 367 PyErr_SetString(ZstdError, "could not read() from source");
284 368 goto finally;
285 369 }
286 370
287 371 PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize);
288 372
289 373 /* If no data was read, we're at EOF. */
290 374 if (0 == readSize) {
291 375 break;
292 376 }
293 377
294 378 totalRead += readSize;
295 379
296 380 /* Send data to compressor */
297 381 input.src = readBuffer;
298 382 input.size = readSize;
299 383 input.pos = 0;
300 384
301 385 while (input.pos < input.size) {
302 386 Py_BEGIN_ALLOW_THREADS
303 zresult = ZSTD_compressStream(cstream, &output, &input);
387 if (self->mtcctx) {
388 zresult = ZSTDMT_compressStream(self->mtcctx, &output, &input);
389 }
390 else {
391 zresult = ZSTD_compressStream(self->cstream, &output, &input);
392 }
304 393 Py_END_ALLOW_THREADS
305 394
306 395 if (ZSTD_isError(zresult)) {
307 396 res = NULL;
308 397 PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
309 398 goto finally;
310 399 }
311 400
312 401 if (output.pos) {
313 402 #if PY_MAJOR_VERSION >= 3
314 403 writeResult = PyObject_CallMethod(dest, "write", "y#",
315 404 #else
316 405 writeResult = PyObject_CallMethod(dest, "write", "s#",
317 406 #endif
318 407 output.dst, output.pos);
319 408 Py_XDECREF(writeResult);
320 409 totalWrite += output.pos;
321 410 output.pos = 0;
322 411 }
323 412 }
324 413 }
325 414
326 415 /* We've finished reading. Now flush the compressor stream. */
327 416 while (1) {
328 zresult = ZSTD_endStream(cstream, &output);
417 if (self->mtcctx) {
418 zresult = ZSTDMT_endStream(self->mtcctx, &output);
419 }
420 else {
421 zresult = ZSTD_endStream(self->cstream, &output);
422 }
329 423 if (ZSTD_isError(zresult)) {
330 424 PyErr_Format(ZstdError, "error ending compression stream: %s",
331 425 ZSTD_getErrorName(zresult));
332 426 res = NULL;
333 427 goto finally;
334 428 }
335 429
336 430 if (output.pos) {
337 431 #if PY_MAJOR_VERSION >= 3
338 432 writeResult = PyObject_CallMethod(dest, "write", "y#",
339 433 #else
340 434 writeResult = PyObject_CallMethod(dest, "write", "s#",
341 435 #endif
342 436 output.dst, output.pos);
343 437 totalWrite += output.pos;
344 438 Py_XDECREF(writeResult);
345 439 output.pos = 0;
346 440 }
347 441
348 442 if (!zresult) {
349 443 break;
350 444 }
351 445 }
352 446
353 ZSTD_freeCStream(cstream);
354 cstream = NULL;
355
356 447 totalReadPy = PyLong_FromSsize_t(totalRead);
357 448 totalWritePy = PyLong_FromSsize_t(totalWrite);
358 449 res = PyTuple_Pack(2, totalReadPy, totalWritePy);
359 Py_DecRef(totalReadPy);
360 Py_DecRef(totalWritePy);
450 Py_DECREF(totalReadPy);
451 Py_DECREF(totalWritePy);
361 452
362 453 finally:
363 454 if (output.dst) {
364 455 PyMem_Free(output.dst);
365 456 }
366 457
367 if (cstream) {
368 ZSTD_freeCStream(cstream);
369 }
370
371 458 return res;
372 459 }
373 460
374 461 PyDoc_STRVAR(ZstdCompressor_compress__doc__,
375 462 "compress(data, allow_empty=False)\n"
376 463 "\n"
377 464 "Compress data in a single operation.\n"
378 465 "\n"
379 466 "This is the simplest mechanism to perform compression: simply pass in a\n"
380 467 "value and get a compressed value back. It is almost the most prone to abuse.\n"
381 468 "The input and output values must fit in memory, so passing in very large\n"
382 469 "values can result in excessive memory usage. For this reason, one of the\n"
383 470 "streaming based APIs is preferred for larger values.\n"
384 471 );
385 472
386 473 static PyObject* ZstdCompressor_compress(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
387 474 static char* kwlist[] = {
388 475 "data",
389 476 "allow_empty",
390 477 NULL
391 478 };
392 479
393 480 const char* source;
394 481 Py_ssize_t sourceSize;
395 482 PyObject* allowEmpty = NULL;
396 483 size_t destSize;
397 484 PyObject* output;
398 485 char* dest;
399 486 void* dictData = NULL;
400 487 size_t dictSize = 0;
401 488 size_t zresult;
402 489 ZSTD_parameters zparams;
403 490
404 491 #if PY_MAJOR_VERSION >= 3
405 492 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|O:compress",
406 493 #else
407 494 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|O:compress",
408 495 #endif
409 496 kwlist, &source, &sourceSize, &allowEmpty)) {
410 497 return NULL;
411 498 }
412 499
500 if (self->threads && self->dict) {
501 PyErr_SetString(ZstdError,
502 "compress() cannot be used with both dictionaries and multi-threaded compression");
503 return NULL;
504 }
505
506 if (self->threads && self->cparams) {
507 PyErr_SetString(ZstdError,
508 "compress() cannot be used with both compression parameters and multi-threaded compression");
509 return NULL;
510 }
511
413 512 /* Limitation in zstd C API doesn't let decompression side distinguish
414 513 between content size of 0 and unknown content size. This can make round
415 514 tripping via Python difficult. Until this is fixed, require a flag
416 515 to fire the footgun.
417 516 https://github.com/indygreg/python-zstandard/issues/11 */
418 517 if (0 == sourceSize && self->fparams.contentSizeFlag
419 518 && (!allowEmpty || PyObject_Not(allowEmpty))) {
420 519 PyErr_SetString(PyExc_ValueError, "cannot write empty inputs when writing content sizes");
421 520 return NULL;
422 521 }
423 522
424 523 destSize = ZSTD_compressBound(sourceSize);
425 524 output = PyBytes_FromStringAndSize(NULL, destSize);
426 525 if (!output) {
427 526 return NULL;
428 527 }
429 528
430 529 dest = PyBytes_AsString(output);
431 530
432 531 if (self->dict) {
433 532 dictData = self->dict->dictData;
434 533 dictSize = self->dict->dictSize;
435 534 }
436 535
437 536 memset(&zparams, 0, sizeof(zparams));
438 537 if (!self->cparams) {
439 538 zparams.cParams = ZSTD_getCParams(self->compressionLevel, sourceSize, dictSize);
440 539 }
441 540 else {
442 541 ztopy_compression_parameters(self->cparams, &zparams.cParams);
443 542 /* Do NOT call ZSTD_adjustCParams() here because the compression params
444 543 come from the user. */
445 544 }
446 545
447 546 zparams.fParams = self->fparams;
448 547
449 548 /* The raw dict data has to be processed before it can be used. Since this
450 549 adds overhead - especially if multiple dictionary compression operations
451 550 are performed on the same ZstdCompressor instance - we create a
452 551 ZSTD_CDict once and reuse it for all operations.
453 552
454 553 Note: the compression parameters used for the first invocation (possibly
455 554 derived from the source size) will be reused on all subsequent invocations.
456 555 https://github.com/facebook/zstd/issues/358 contains more info. We could
457 556 potentially add an argument somewhere to control this behavior.
458 557 */
459 if (dictData && !self->cdict) {
460 if (populate_cdict(self, dictData, dictSize, &zparams)) {
461 Py_DECREF(output);
462 return NULL;
463 }
558 if (0 != populate_cdict(self, &zparams)) {
559 Py_DECREF(output);
560 return NULL;
464 561 }
465 562
466 563 Py_BEGIN_ALLOW_THREADS
467 /* By avoiding ZSTD_compress(), we don't necessarily write out content
468 size. This means the argument to ZstdCompressor to control frame
469 parameters is honored. */
470 if (self->cdict) {
471 zresult = ZSTD_compress_usingCDict(self->cctx, dest, destSize,
472 source, sourceSize, self->cdict);
564 if (self->mtcctx) {
565 zresult = ZSTDMT_compressCCtx(self->mtcctx, dest, destSize,
566 source, sourceSize, self->compressionLevel);
473 567 }
474 568 else {
475 zresult = ZSTD_compress_advanced(self->cctx, dest, destSize,
476 source, sourceSize, dictData, dictSize, zparams);
569 /* By avoiding ZSTD_compress(), we don't necessarily write out content
570 size. This means the argument to ZstdCompressor to control frame
571 parameters is honored. */
572 if (self->cdict) {
573 zresult = ZSTD_compress_usingCDict(self->cctx, dest, destSize,
574 source, sourceSize, self->cdict);
575 }
576 else {
577 zresult = ZSTD_compress_advanced(self->cctx, dest, destSize,
578 source, sourceSize, dictData, dictSize, zparams);
579 }
477 580 }
478 581 Py_END_ALLOW_THREADS
479 582
480 583 if (ZSTD_isError(zresult)) {
481 584 PyErr_Format(ZstdError, "cannot compress: %s", ZSTD_getErrorName(zresult));
482 585 Py_CLEAR(output);
483 586 return NULL;
484 587 }
485 588 else {
486 589 Py_SIZE(output) = zresult;
487 590 }
488 591
489 592 return output;
490 593 }
491 594
492 595 PyDoc_STRVAR(ZstdCompressionObj__doc__,
493 596 "compressobj()\n"
494 597 "\n"
495 598 "Return an object exposing ``compress(data)`` and ``flush()`` methods.\n"
496 599 "\n"
497 600 "The returned object exposes an API similar to ``zlib.compressobj`` and\n"
498 601 "``bz2.BZ2Compressor`` so that callers can swap in the zstd compressor\n"
499 602 "without changing how compression is performed.\n"
500 603 );
501 604
502 605 static ZstdCompressionObj* ZstdCompressor_compressobj(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
503 606 static char* kwlist[] = {
504 607 "size",
505 608 NULL
506 609 };
507 610
508 611 Py_ssize_t inSize = 0;
509 612 size_t outSize = ZSTD_CStreamOutSize();
510 ZstdCompressionObj* result = PyObject_New(ZstdCompressionObj, &ZstdCompressionObjType);
511 if (!result) {
512 return NULL;
513 }
613 ZstdCompressionObj* result = NULL;
514 614
515 615 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n:compressobj", kwlist, &inSize)) {
516 616 return NULL;
517 617 }
518 618
519 result->cstream = CStream_from_ZstdCompressor(self, inSize);
520 if (!result->cstream) {
521 Py_DECREF(result);
619 result = (ZstdCompressionObj*)PyObject_CallObject((PyObject*)&ZstdCompressionObjType, NULL);
620 if (!result) {
522 621 return NULL;
523 622 }
524 623
624 if (self->mtcctx) {
625 if (init_mtcstream(self, inSize)) {
626 Py_DECREF(result);
627 return NULL;
628 }
629 }
630 else {
631 if (0 != init_cstream(self, inSize)) {
632 Py_DECREF(result);
633 return NULL;
634 }
635 }
636
525 637 result->output.dst = PyMem_Malloc(outSize);
526 638 if (!result->output.dst) {
527 639 PyErr_NoMemory();
528 640 Py_DECREF(result);
529 641 return NULL;
530 642 }
531 643 result->output.size = outSize;
532 result->output.pos = 0;
533
534 644 result->compressor = self;
535 645 Py_INCREF(result->compressor);
536 646
537 result->finished = 0;
538
539 647 return result;
540 648 }
541 649
542 650 PyDoc_STRVAR(ZstdCompressor_read_from__doc__,
543 651 "read_from(reader, [size=0, read_size=default, write_size=default])\n"
544 652 "Read uncompress data from a reader and return an iterator\n"
545 653 "\n"
546 654 "Returns an iterator of compressed data produced from reading from ``reader``.\n"
547 655 "\n"
548 656 "Uncompressed data will be obtained from ``reader`` by calling the\n"
549 657 "``read(size)`` method of it. The source data will be streamed into a\n"
550 658 "compressor. As compressed data is available, it will be exposed to the\n"
551 659 "iterator.\n"
552 660 "\n"
553 661 "Data is read from the source in chunks of ``read_size``. Compressed chunks\n"
554 662 "are at most ``write_size`` bytes. Both values default to the zstd input and\n"
555 663 "and output defaults, respectively.\n"
556 664 "\n"
557 665 "The caller is partially in control of how fast data is fed into the\n"
558 666 "compressor by how it consumes the returned iterator. The compressor will\n"
559 667 "not consume from the reader unless the caller consumes from the iterator.\n"
560 668 );
561 669
562 670 static ZstdCompressorIterator* ZstdCompressor_read_from(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
563 671 static char* kwlist[] = {
564 672 "reader",
565 673 "size",
566 674 "read_size",
567 675 "write_size",
568 676 NULL
569 677 };
570 678
571 679 PyObject* reader;
572 680 Py_ssize_t sourceSize = 0;
573 681 size_t inSize = ZSTD_CStreamInSize();
574 682 size_t outSize = ZSTD_CStreamOutSize();
575 683 ZstdCompressorIterator* result;
576 684
577 685 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nkk:read_from", kwlist,
578 686 &reader, &sourceSize, &inSize, &outSize)) {
579 687 return NULL;
580 688 }
581 689
582 result = PyObject_New(ZstdCompressorIterator, &ZstdCompressorIteratorType);
690 result = (ZstdCompressorIterator*)PyObject_CallObject((PyObject*)&ZstdCompressorIteratorType, NULL);
583 691 if (!result) {
584 692 return NULL;
585 693 }
586
587 result->compressor = NULL;
588 result->reader = NULL;
589 result->buffer = NULL;
590 result->cstream = NULL;
591 result->input.src = NULL;
592 result->output.dst = NULL;
593 result->readResult = NULL;
594
595 694 if (PyObject_HasAttrString(reader, "read")) {
596 695 result->reader = reader;
597 696 Py_INCREF(result->reader);
598 697 }
599 698 else if (1 == PyObject_CheckBuffer(reader)) {
600 699 result->buffer = PyMem_Malloc(sizeof(Py_buffer));
601 700 if (!result->buffer) {
602 701 goto except;
603 702 }
604 703
605 704 memset(result->buffer, 0, sizeof(Py_buffer));
606 705
607 706 if (0 != PyObject_GetBuffer(reader, result->buffer, PyBUF_CONTIG_RO)) {
608 707 goto except;
609 708 }
610 709
611 result->bufferOffset = 0;
612 710 sourceSize = result->buffer->len;
613 711 }
614 712 else {
615 713 PyErr_SetString(PyExc_ValueError,
616 714 "must pass an object with a read() method or conforms to buffer protocol");
617 715 goto except;
618 716 }
619 717
620 718 result->compressor = self;
621 719 Py_INCREF(result->compressor);
622 720
623 721 result->sourceSize = sourceSize;
624 result->cstream = CStream_from_ZstdCompressor(self, sourceSize);
625 if (!result->cstream) {
626 goto except;
722
723 if (self->mtcctx) {
724 if (init_mtcstream(self, sourceSize)) {
725 goto except;
726 }
727 }
728 else {
729 if (0 != init_cstream(self, sourceSize)) {
730 goto except;
731 }
627 732 }
628 733
629 734 result->inSize = inSize;
630 735 result->outSize = outSize;
631 736
632 737 result->output.dst = PyMem_Malloc(outSize);
633 738 if (!result->output.dst) {
634 739 PyErr_NoMemory();
635 740 goto except;
636 741 }
637 742 result->output.size = outSize;
638 result->output.pos = 0;
639
640 result->input.src = NULL;
641 result->input.size = 0;
642 result->input.pos = 0;
643
644 result->finishedInput = 0;
645 result->finishedOutput = 0;
646 743
647 744 goto finally;
648 745
649 746 except:
650 if (result->cstream) {
651 ZSTD_freeCStream(result->cstream);
652 result->cstream = NULL;
653 }
654
655 Py_DecRef((PyObject*)result->compressor);
656 Py_DecRef(result->reader);
657
747 Py_XDECREF(result->compressor);
748 Py_XDECREF(result->reader);
658 749 Py_DECREF(result);
659 750 result = NULL;
660 751
661 752 finally:
662 753 return result;
663 754 }
664 755
665 756 PyDoc_STRVAR(ZstdCompressor_write_to___doc__,
666 757 "Create a context manager to write compressed data to an object.\n"
667 758 "\n"
668 759 "The passed object must have a ``write()`` method.\n"
669 760 "\n"
670 761 "The caller feeds input data to the object by calling ``compress(data)``.\n"
671 762 "Compressed data is written to the argument given to this function.\n"
672 763 "\n"
673 764 "The function takes an optional ``size`` argument indicating the total size\n"
674 765 "of the eventual input. If specified, the size will influence compression\n"
675 766 "parameter tuning and could result in the size being written into the\n"
676 767 "header of the compressed data.\n"
677 768 "\n"
678 769 "An optional ``write_size`` argument is also accepted. It defines the maximum\n"
679 770 "byte size of chunks fed to ``write()``. By default, it uses the zstd default\n"
680 771 "for a compressor output stream.\n"
681 772 );
682 773
683 774 static ZstdCompressionWriter* ZstdCompressor_write_to(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
684 775 static char* kwlist[] = {
685 776 "writer",
686 777 "size",
687 778 "write_size",
688 779 NULL
689 780 };
690 781
691 782 PyObject* writer;
692 783 ZstdCompressionWriter* result;
693 784 Py_ssize_t sourceSize = 0;
694 785 size_t outSize = ZSTD_CStreamOutSize();
695 786
696 787 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nk:write_to", kwlist,
697 788 &writer, &sourceSize, &outSize)) {
698 789 return NULL;
699 790 }
700 791
701 792 if (!PyObject_HasAttrString(writer, "write")) {
702 793 PyErr_SetString(PyExc_ValueError, "must pass an object with a write() method");
703 794 return NULL;
704 795 }
705 796
706 result = PyObject_New(ZstdCompressionWriter, &ZstdCompressionWriterType);
797 result = (ZstdCompressionWriter*)PyObject_CallObject((PyObject*)&ZstdCompressionWriterType, NULL);
707 798 if (!result) {
708 799 return NULL;
709 800 }
710 801
711 802 result->compressor = self;
712 803 Py_INCREF(result->compressor);
713 804
714 805 result->writer = writer;
715 806 Py_INCREF(result->writer);
716 807
717 808 result->sourceSize = sourceSize;
718
719 809 result->outSize = outSize;
720 810
721 result->entered = 0;
722 result->cstream = NULL;
811 return result;
812 }
813
814 typedef struct {
815 void* sourceData;
816 size_t sourceSize;
817 } DataSource;
818
819 typedef struct {
820 DataSource* sources;
821 Py_ssize_t sourcesSize;
822 unsigned long long totalSourceSize;
823 } DataSources;
824
825 typedef struct {
826 void* dest;
827 Py_ssize_t destSize;
828 BufferSegment* segments;
829 Py_ssize_t segmentsSize;
830 } DestBuffer;
831
832 typedef enum {
833 WorkerError_none = 0,
834 WorkerError_zstd = 1,
835 WorkerError_no_memory = 2,
836 } WorkerError;
837
838 /**
839 * Holds state for an individual worker performing multi_compress_to_buffer work.
840 */
841 typedef struct {
842 /* Used for compression. */
843 ZSTD_CCtx* cctx;
844 ZSTD_CDict* cdict;
845 int cLevel;
846 CompressionParametersObject* cParams;
847 ZSTD_frameParameters fParams;
848
849 /* What to compress. */
850 DataSource* sources;
851 Py_ssize_t sourcesSize;
852 Py_ssize_t startOffset;
853 Py_ssize_t endOffset;
854 unsigned long long totalSourceSize;
855
856 /* Result storage. */
857 DestBuffer* destBuffers;
858 Py_ssize_t destCount;
859
860 /* Error tracking. */
861 WorkerError error;
862 size_t zresult;
863 Py_ssize_t errorOffset;
864 } WorkerState;
865
866 static void compress_worker(WorkerState* state) {
867 Py_ssize_t inputOffset = state->startOffset;
868 Py_ssize_t remainingItems = state->endOffset - state->startOffset + 1;
869 Py_ssize_t currentBufferStartOffset = state->startOffset;
870 size_t zresult;
871 ZSTD_parameters zparams;
872 void* newDest;
873 size_t allocationSize;
874 size_t boundSize;
875 Py_ssize_t destOffset = 0;
876 DataSource* sources = state->sources;
877 DestBuffer* destBuffer;
878
879 assert(!state->destBuffers);
880 assert(0 == state->destCount);
881
882 if (state->cParams) {
883 ztopy_compression_parameters(state->cParams, &zparams.cParams);
884 }
885
886 zparams.fParams = state->fParams;
887
888 /*
889 * The total size of the compressed data is unknown until we actually
890 * compress data. That means we can't pre-allocate the exact size we need.
891 *
892 * There is a cost to every allocation and reallocation. So, it is in our
893 * interest to minimize the number of allocations.
894 *
895 * There is also a cost to too few allocations. If allocations are too
896 * large they may fail. If buffers are shared and all inputs become
897 * irrelevant at different lifetimes, then a reference to one segment
898 * in the buffer will keep the entire buffer alive. This leads to excessive
899 * memory usage.
900 *
901 * Our current strategy is to assume a compression ratio of 16:1 and
902 * allocate buffers of that size, rounded up to the nearest power of 2
903 * (because computers like round numbers). That ratio is greater than what
904 * most inputs achieve. This is by design: we don't want to over-allocate.
905 * But we don't want to under-allocate and lead to too many buffers either.
906 */
907
908 state->destCount = 1;
909
910 state->destBuffers = calloc(1, sizeof(DestBuffer));
911 if (NULL == state->destBuffers) {
912 state->error = WorkerError_no_memory;
913 return;
914 }
915
916 destBuffer = &state->destBuffers[state->destCount - 1];
917
918 /*
919 * Rather than track bounds and grow the segments buffer, allocate space
920 * to hold remaining items then truncate when we're done with it.
921 */
922 destBuffer->segments = calloc(remainingItems, sizeof(BufferSegment));
923 if (NULL == destBuffer->segments) {
924 state->error = WorkerError_no_memory;
925 return;
926 }
927
928 destBuffer->segmentsSize = remainingItems;
929
930 allocationSize = roundpow2(state->totalSourceSize >> 4);
931
932 /* If the maximum size of the output is larger than that, round up. */
933 boundSize = ZSTD_compressBound(sources[inputOffset].sourceSize);
934
935 if (boundSize > allocationSize) {
936 allocationSize = roundpow2(boundSize);
937 }
938
939 destBuffer->dest = malloc(allocationSize);
940 if (NULL == destBuffer->dest) {
941 state->error = WorkerError_no_memory;
942 return;
943 }
944
945 destBuffer->destSize = allocationSize;
946
947 for (inputOffset = state->startOffset; inputOffset <= state->endOffset; inputOffset++) {
948 void* source = sources[inputOffset].sourceData;
949 size_t sourceSize = sources[inputOffset].sourceSize;
950 size_t destAvailable;
951 void* dest;
952
953 destAvailable = destBuffer->destSize - destOffset;
954 boundSize = ZSTD_compressBound(sourceSize);
955
956 /*
957 * Not enough space in current buffer to hold largest compressed output.
958 * So allocate and switch to a new output buffer.
959 */
960 if (boundSize > destAvailable) {
961 /*
962 * The downsizing of the existing buffer is optional. It should be cheap
963 * (unlike growing). So we just do it.
964 */
965 if (destAvailable) {
966 newDest = realloc(destBuffer->dest, destOffset);
967 if (NULL == newDest) {
968 state->error = WorkerError_no_memory;
969 return;
970 }
971
972 destBuffer->dest = newDest;
973 destBuffer->destSize = destOffset;
974 }
975
976 /* Truncate segments buffer. */
977 newDest = realloc(destBuffer->segments,
978 (inputOffset - currentBufferStartOffset + 1) * sizeof(BufferSegment));
979 if (NULL == newDest) {
980 state->error = WorkerError_no_memory;
981 return;
982 }
983
984 destBuffer->segments = newDest;
985 destBuffer->segmentsSize = inputOffset - currentBufferStartOffset;
986
987 /* Grow space for new struct. */
988 /* TODO consider over-allocating so we don't do this every time. */
989 newDest = realloc(state->destBuffers, (state->destCount + 1) * sizeof(DestBuffer));
990 if (NULL == newDest) {
991 state->error = WorkerError_no_memory;
992 return;
993 }
994
995 state->destBuffers = newDest;
996 state->destCount++;
997
998 destBuffer = &state->destBuffers[state->destCount - 1];
999
1000 /* Don't take any chances with non-NULL pointers. */
1001 memset(destBuffer, 0, sizeof(DestBuffer));
1002
1003 /**
1004 * We could dynamically update allocation size based on work done so far.
1005 * For now, keep is simple.
1006 */
1007 allocationSize = roundpow2(state->totalSourceSize >> 4);
1008
1009 if (boundSize > allocationSize) {
1010 allocationSize = roundpow2(boundSize);
1011 }
1012
1013 destBuffer->dest = malloc(allocationSize);
1014 if (NULL == destBuffer->dest) {
1015 state->error = WorkerError_no_memory;
1016 return;
1017 }
1018
1019 destBuffer->destSize = allocationSize;
1020 destAvailable = allocationSize;
1021 destOffset = 0;
1022
1023 destBuffer->segments = calloc(remainingItems, sizeof(BufferSegment));
1024 if (NULL == destBuffer->segments) {
1025 state->error = WorkerError_no_memory;
1026 return;
1027 }
1028
1029 destBuffer->segmentsSize = remainingItems;
1030 currentBufferStartOffset = inputOffset;
1031 }
1032
1033 dest = (char*)destBuffer->dest + destOffset;
1034
1035 if (state->cdict) {
1036 zresult = ZSTD_compress_usingCDict(state->cctx, dest, destAvailable,
1037 source, sourceSize, state->cdict);
1038 }
1039 else {
1040 if (!state->cParams) {
1041 zparams.cParams = ZSTD_getCParams(state->cLevel, sourceSize, 0);
1042 }
1043
1044 zresult = ZSTD_compress_advanced(state->cctx, dest, destAvailable,
1045 source, sourceSize, NULL, 0, zparams);
1046 }
1047
1048 if (ZSTD_isError(zresult)) {
1049 state->error = WorkerError_zstd;
1050 state->zresult = zresult;
1051 state->errorOffset = inputOffset;
1052 break;
1053 }
1054
1055 destBuffer->segments[inputOffset - currentBufferStartOffset].offset = destOffset;
1056 destBuffer->segments[inputOffset - currentBufferStartOffset].length = zresult;
1057
1058 destOffset += zresult;
1059 remainingItems--;
1060 }
1061
1062 if (destBuffer->destSize > destOffset) {
1063 newDest = realloc(destBuffer->dest, destOffset);
1064 if (NULL == newDest) {
1065 state->error = WorkerError_no_memory;
1066 return;
1067 }
1068
1069 destBuffer->dest = newDest;
1070 destBuffer->destSize = destOffset;
1071 }
1072 }
1073
/*
 * Compress every entry in `sources` across up to `threadCount` worker
 * threads and aggregate all worker output buffers into a single
 * ZstdBufferWithSegmentsCollection.
 *
 * Returns a new reference on success or NULL with a Python exception set on
 * failure. Ownership of the compressed backing memory is transferred to the
 * resulting BufferWithSegments instances; everything else is cleaned up in
 * the `finally` block.
 */
ZstdBufferWithSegmentsCollection* compress_from_datasources(ZstdCompressor* compressor,
	DataSources* sources, unsigned int threadCount) {
	ZSTD_parameters zparams;
	unsigned long long bytesPerWorker;
	POOL_ctx* pool = NULL;
	WorkerState* workerStates = NULL;
	Py_ssize_t i;
	unsigned long long workerBytes = 0;
	Py_ssize_t workerStartOffset = 0;
	size_t currentThread = 0;
	int errored = 0;
	Py_ssize_t segmentsCount = 0;
	Py_ssize_t segmentIndex;
	PyObject* segmentsArg = NULL;
	ZstdBufferWithSegments* buffer;
	ZstdBufferWithSegmentsCollection* result = NULL;

	assert(sources->sourcesSize > 0);
	assert(sources->totalSourceSize > 0);
	assert(threadCount >= 1);

	/* More threads than inputs makes no sense. */
	threadCount = sources->sourcesSize < threadCount ? (unsigned int)sources->sourcesSize
													 : threadCount;

	/* TODO lower thread count when input size is too small and threads would add
	overhead. */

	/*
	 * When dictionaries are used, parameters are derived from the size of the
	 * first element.
	 *
	 * TODO come up with a better mechanism.
	 */
	memset(&zparams, 0, sizeof(zparams));
	if (compressor->cparams) {
		ztopy_compression_parameters(compressor->cparams, &zparams.cParams);
	}
	else {
		zparams.cParams = ZSTD_getCParams(compressor->compressionLevel,
			sources->sources[0].sourceSize,
			compressor->dict ? compressor->dict->dictSize : 0);
	}

	zparams.fParams = compressor->fparams;

	if (0 != populate_cdict(compressor, &zparams)) {
		return NULL;
	}

	workerStates = PyMem_Malloc(threadCount * sizeof(WorkerState));
	if (NULL == workerStates) {
		PyErr_NoMemory();
		goto finally;
	}

	/* Zeroing also resets each worker's error/destBuffers bookkeeping. */
	memset(workerStates, 0, threadCount * sizeof(WorkerState));

	if (threadCount > 1) {
		pool = POOL_create(threadCount, 1);
		if (NULL == pool) {
			PyErr_SetString(ZstdError, "could not initialize zstd thread pool");
			goto finally;
		}
	}

	bytesPerWorker = sources->totalSourceSize / threadCount;

	for (i = 0; i < threadCount; i++) {
		workerStates[i].cctx = ZSTD_createCCtx();
		if (!workerStates[i].cctx) {
			PyErr_NoMemory();
			goto finally;
		}

		workerStates[i].cdict = compressor->cdict;
		workerStates[i].cLevel = compressor->compressionLevel;
		workerStates[i].cParams = compressor->cparams;
		workerStates[i].fParams = compressor->fparams;

		workerStates[i].sources = sources->sources;
		workerStates[i].sourcesSize = sources->sourcesSize;
	}

	/* Workers run without the GIL; compress_worker calls no Python APIs. */
	Py_BEGIN_ALLOW_THREADS
	for (i = 0; i < sources->sourcesSize; i++) {
		workerBytes += sources->sources[i].sourceSize;

		/*
		 * The last worker/thread needs to handle all remaining work. Don't
		 * trigger it prematurely. Defer to the block outside of the loop
		 * to run the last worker/thread. But do still process this loop
		 * so workerBytes is correct.
		 */
		if (currentThread == threadCount - 1) {
			continue;
		}

		if (workerBytes >= bytesPerWorker) {
			assert(currentThread < threadCount);
			workerStates[currentThread].totalSourceSize = workerBytes;
			workerStates[currentThread].startOffset = workerStartOffset;
			workerStates[currentThread].endOffset = i;

			if (threadCount > 1) {
				POOL_add(pool, (POOL_function)compress_worker, &workerStates[currentThread]);
			}
			else {
				compress_worker(&workerStates[currentThread]);
			}

			currentThread++;
			workerStartOffset = i + 1;
			workerBytes = 0;
		}
	}

	/* Dispatch the remainder (always the final worker's slice). */
	if (workerBytes) {
		assert(currentThread < threadCount);
		workerStates[currentThread].totalSourceSize = workerBytes;
		workerStates[currentThread].startOffset = workerStartOffset;
		workerStates[currentThread].endOffset = sources->sourcesSize - 1;

		if (threadCount > 1) {
			POOL_add(pool, (POOL_function)compress_worker, &workerStates[currentThread]);
		}
		else {
			compress_worker(&workerStates[currentThread]);
		}
	}

	/* POOL_free waits for queued work to drain before returning. */
	if (threadCount > 1) {
		POOL_free(pool);
		pool = NULL;
	}

	Py_END_ALLOW_THREADS

	/* Surface the first worker error, if any, as a Python exception. */
	for (i = 0; i < threadCount; i++) {
		switch (workerStates[i].error) {
		case WorkerError_no_memory:
			PyErr_NoMemory();
			errored = 1;
			break;

		case WorkerError_zstd:
			PyErr_Format(ZstdError, "error compressing item %zd: %s",
				workerStates[i].errorOffset, ZSTD_getErrorName(workerStates[i].zresult));
			errored = 1;
			break;
		default:
			;
		}

		if (errored) {
			break;
		}

	}

	if (errored) {
		goto finally;
	}

	segmentsCount = 0;
	for (i = 0; i < threadCount; i++) {
		WorkerState* state = &workerStates[i];
		segmentsCount += state->destCount;
	}

	segmentsArg = PyTuple_New(segmentsCount);
	if (NULL == segmentsArg) {
		goto finally;
	}

	segmentIndex = 0;

	/* Wrap each worker output buffer in a BufferWithSegments. */
	for (i = 0; i < threadCount; i++) {
		Py_ssize_t j;
		WorkerState* state = &workerStates[i];

		for (j = 0; j < state->destCount; j++) {
			DestBuffer* destBuffer = &state->destBuffers[j];
			buffer = BufferWithSegments_FromMemory(destBuffer->dest, destBuffer->destSize,
				destBuffer->segments, destBuffer->segmentsSize);

			if (NULL == buffer) {
				goto finally;
			}

			/* Tell instance to use free() instead of PyMem_Free(). */
			buffer->useFree = 1;

			/*
			 * BufferWithSegments_FromMemory takes ownership of the backing memory.
			 * Unset it here so it doesn't get freed below.
			 */
			destBuffer->dest = NULL;
			destBuffer->segments = NULL;

			PyTuple_SET_ITEM(segmentsArg, segmentIndex++, (PyObject*)buffer);
		}
	}

	result = (ZstdBufferWithSegmentsCollection*)PyObject_CallObject(
		(PyObject*)&ZstdBufferWithSegmentsCollectionType, segmentsArg);

finally:
	Py_CLEAR(segmentsArg);

	if (pool) {
		POOL_free(pool);
	}

	if (workerStates) {
		Py_ssize_t j;

		for (i = 0; i < threadCount; i++) {
			WorkerState state = workerStates[i];

			if (state.cctx) {
				ZSTD_freeCCtx(state.cctx);
			}

			/* malloc() is used in worker thread. */

			for (j = 0; j < state.destCount; j++) {
				if (state.destBuffers) {
					free(state.destBuffers[j].dest);
					free(state.destBuffers[j].segments);
				}
			}


			free(state.destBuffers);
		}

		PyMem_Free(workerStates);
	}

	return result;
}
1316
1317 PyDoc_STRVAR(ZstdCompressor_multi_compress_to_buffer__doc__,
1318 "Compress multiple pieces of data as a single operation\n"
1319 "\n"
1320 "Receives a ``BufferWithSegmentsCollection``, a ``BufferWithSegments``, or\n"
1321 "a list of bytes like objects holding data to compress.\n"
1322 "\n"
1323 "Returns a ``BufferWithSegmentsCollection`` holding compressed data.\n"
1324 "\n"
1325 "This function is optimized to perform multiple compression operations as\n"
1326 "as possible with as little overhead as possbile.\n"
1327 );
1328
/*
 * Implementation of ZstdCompressor.multi_compress_to_buffer().
 *
 * Normalizes the accepted input types (BufferWithSegments, a collection of
 * them, or a list of bytes-like objects) into a flat DataSources array, then
 * delegates the actual work to compress_from_datasources().
 *
 * Returns a new ZstdBufferWithSegmentsCollection or NULL with an exception
 * set.
 */
static ZstdBufferWithSegmentsCollection* ZstdCompressor_multi_compress_to_buffer(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
	static char* kwlist[] = {
		"data",
		"threads",
		NULL
	};

	PyObject* data;
	int threads = 0;
	Py_buffer* dataBuffers = NULL;
	DataSources sources;
	Py_ssize_t i;
	Py_ssize_t sourceCount = 0;
	ZstdBufferWithSegmentsCollection* result = NULL;

	if (self->mtcctx) {
		PyErr_SetString(ZstdError,
			"function cannot be called on ZstdCompressor configured for multi-threaded compression");
		return NULL;
	}

	memset(&sources, 0, sizeof(sources));

	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|i:multi_compress_to_buffer", kwlist,
		&data, &threads)) {
		return NULL;
	}

	/* threads < 0 means "use all CPUs"; 0 or 1 means single-threaded. */
	if (threads < 0) {
		threads = cpu_count();
	}

	if (threads < 2) {
		threads = 1;
	}

	if (PyObject_TypeCheck(data, &ZstdBufferWithSegmentsType)) {
		ZstdBufferWithSegments* buffer = (ZstdBufferWithSegments*)data;

		sources.sources = PyMem_Malloc(buffer->segmentCount * sizeof(DataSource));
		if (NULL == sources.sources) {
			PyErr_NoMemory();
			goto finally;
		}

		/* Each segment becomes one independent compression input. */
		for (i = 0; i < buffer->segmentCount; i++) {
			sources.sources[i].sourceData = (char*)buffer->data + buffer->segments[i].offset;
			sources.sources[i].sourceSize = buffer->segments[i].length;
			sources.totalSourceSize += buffer->segments[i].length;
		}

		sources.sourcesSize = buffer->segmentCount;
	}
	else if (PyObject_TypeCheck(data, &ZstdBufferWithSegmentsCollectionType)) {
		Py_ssize_t j;
		Py_ssize_t offset = 0;
		ZstdBufferWithSegments* buffer;
		ZstdBufferWithSegmentsCollection* collection = (ZstdBufferWithSegmentsCollection*)data;

		sourceCount = BufferWithSegmentsCollection_length(collection);

		sources.sources = PyMem_Malloc(sourceCount * sizeof(DataSource));
		if (NULL == sources.sources) {
			PyErr_NoMemory();
			goto finally;
		}

		/* Flatten every segment of every buffer into one input array. */
		for (i = 0; i < collection->bufferCount; i++) {
			buffer = collection->buffers[i];

			for (j = 0; j < buffer->segmentCount; j++) {
				sources.sources[offset].sourceData = (char*)buffer->data + buffer->segments[j].offset;
				sources.sources[offset].sourceSize = buffer->segments[j].length;
				sources.totalSourceSize += buffer->segments[j].length;

				offset++;
			}
		}

		sources.sourcesSize = sourceCount;
	}
	else if (PyList_Check(data)) {
		sourceCount = PyList_GET_SIZE(data);

		sources.sources = PyMem_Malloc(sourceCount * sizeof(DataSource));
		if (NULL == sources.sources) {
			PyErr_NoMemory();
			goto finally;
		}

		/*
		 * It isn't clear whether the address referred to by Py_buffer.buf
		 * is still valid after PyBuffer_Release. So we hold a reference to all
		 * Py_buffer instances for the duration of the operation.
		 */
		dataBuffers = PyMem_Malloc(sourceCount * sizeof(Py_buffer));
		if (NULL == dataBuffers) {
			PyErr_NoMemory();
			goto finally;
		}

		memset(dataBuffers, 0, sourceCount * sizeof(Py_buffer));

		for (i = 0; i < sourceCount; i++) {
			if (0 != PyObject_GetBuffer(PyList_GET_ITEM(data, i),
				&dataBuffers[i], PyBUF_CONTIG_RO)) {
				PyErr_Clear();
				PyErr_Format(PyExc_TypeError, "item %zd not a bytes like object", i);
				goto finally;
			}

			sources.sources[i].sourceData = dataBuffers[i].buf;
			sources.sources[i].sourceSize = dataBuffers[i].len;
			sources.totalSourceSize += dataBuffers[i].len;
		}

		sources.sourcesSize = sourceCount;
	}
	else {
		/* NOTE(review): message understates accepted types — BufferWithSegments
		   and collections are also handled above. */
		PyErr_SetString(PyExc_TypeError, "argument must be list of BufferWithSegments");
		goto finally;
	}

	if (0 == sources.sourcesSize) {
		PyErr_SetString(PyExc_ValueError, "no source elements found");
		goto finally;
	}

	if (0 == sources.totalSourceSize) {
		PyErr_SetString(PyExc_ValueError, "source elements are empty");
		goto finally;
	}

	result = compress_from_datasources(self, &sources, threads);

finally:
	PyMem_Free(sources.sources);

	/* Release buffer views only after compression is fully done. */
	if (dataBuffers) {
		for (i = 0; i < sourceCount; i++) {
			PyBuffer_Release(&dataBuffers[i]);
		}

		PyMem_Free(dataBuffers);
	}

	return result;
}
726 1477
/* Method table for ZstdCompressor instances. */
static PyMethodDef ZstdCompressor_methods[] = {
	{ "compress", (PyCFunction)ZstdCompressor_compress,
	METH_VARARGS | METH_KEYWORDS, ZstdCompressor_compress__doc__ },
	{ "compressobj", (PyCFunction)ZstdCompressor_compressobj,
	METH_VARARGS | METH_KEYWORDS, ZstdCompressionObj__doc__ },
	{ "copy_stream", (PyCFunction)ZstdCompressor_copy_stream,
	METH_VARARGS | METH_KEYWORDS, ZstdCompressor_copy_stream__doc__ },
	{ "read_from", (PyCFunction)ZstdCompressor_read_from,
	METH_VARARGS | METH_KEYWORDS, ZstdCompressor_read_from__doc__ },
	{ "write_to", (PyCFunction)ZstdCompressor_write_to,
	METH_VARARGS | METH_KEYWORDS, ZstdCompressor_write_to___doc__ },
	{ "multi_compress_to_buffer", (PyCFunction)ZstdCompressor_multi_compress_to_buffer,
	METH_VARARGS | METH_KEYWORDS, ZstdCompressor_multi_compress_to_buffer__doc__ },
	{ NULL, NULL }
};
740 1493
/* Type object for zstd.ZstdCompressor. Instances are created by Python code
 * via tp_new/tp_init (PyType_GenericNew + ZstdCompressor_init). */
PyTypeObject ZstdCompressorType = {
	PyVarObject_HEAD_INIT(NULL, 0)
	"zstd.ZstdCompressor",         /* tp_name */
	sizeof(ZstdCompressor),        /* tp_basicsize */
	0,                              /* tp_itemsize */
	(destructor)ZstdCompressor_dealloc, /* tp_dealloc */
	0,                              /* tp_print */
	0,                              /* tp_getattr */
	0,                              /* tp_setattr */
	0,                              /* tp_compare */
	0,                              /* tp_repr */
	0,                              /* tp_as_number */
	0,                              /* tp_as_sequence */
	0,                              /* tp_as_mapping */
	0,                              /* tp_hash */
	0,                              /* tp_call */
	0,                              /* tp_str */
	0,                              /* tp_getattro */
	0,                              /* tp_setattro */
	0,                              /* tp_as_buffer */
	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
	ZstdCompressor__doc__,          /* tp_doc */
	0,                              /* tp_traverse */
	0,                              /* tp_clear */
	0,                              /* tp_richcompare */
	0,                              /* tp_weaklistoffset */
	0,                              /* tp_iter */
	0,                              /* tp_iternext */
	ZstdCompressor_methods,         /* tp_methods */
	0,                              /* tp_members */
	0,                              /* tp_getset */
	0,                              /* tp_base */
	0,                              /* tp_dict */
	0,                              /* tp_descr_get */
	0,                              /* tp_descr_set */
	0,                              /* tp_dictoffset */
	(initproc)ZstdCompressor_init,  /* tp_init */
	0,                              /* tp_alloc */
	PyType_GenericNew,              /* tp_new */
};
781 1534
782 1535 void compressor_module_init(PyObject* mod) {
783 1536 Py_TYPE(&ZstdCompressorType) = &PyType_Type;
784 1537 if (PyType_Ready(&ZstdCompressorType) < 0) {
785 1538 return;
786 1539 }
787 1540
788 1541 Py_INCREF((PyObject*)&ZstdCompressorType);
789 1542 PyModule_AddObject(mod, "ZstdCompressor",
790 1543 (PyObject*)&ZstdCompressorType);
791 1544 }
@@ -1,234 +1,247
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 #define min(a, b) (((a) < (b)) ? (a) : (b))
12 12
13 13 extern PyObject* ZstdError;
14 14
15 15 PyDoc_STRVAR(ZstdCompressorIterator__doc__,
16 16 "Represents an iterator of compressed data.\n"
17 17 );
18 18
/* Destructor: release held Python references and owned buffers. The
 * compression stream itself lives on the compressor object, not here. */
static void ZstdCompressorIterator_dealloc(ZstdCompressorIterator* self) {
	Py_XDECREF(self->readResult);
	Py_XDECREF(self->compressor);
	Py_XDECREF(self->reader);

	/* Buffer-source mode: release the view and the Py_buffer allocation. */
	if (self->buffer) {
		PyBuffer_Release(self->buffer);
		PyMem_FREE(self->buffer);
		self->buffer = NULL;
	}

	if (self->output.dst) {
		PyMem_Free(self->output.dst);
		self->output.dst = NULL;
	}

	PyObject_Del(self);
}
42 37
/* __iter__: an iterator returns itself (new reference for the caller). */
static PyObject* ZstdCompressorIterator_iter(PyObject* self) {
	Py_INCREF(self);
	return self;
}
47 42
/*
 * __next__: produce the next chunk of compressed data as a bytes object.
 *
 * Input comes either from a file-like object (self->reader, via read()) or
 * from an in-memory buffer (self->buffer). Compression is routed to the
 * multi-threaded context (mtcctx) when the compressor has one, otherwise to
 * the regular stream. Raises StopIteration once the stream has been ended
 * and fully flushed.
 */
static PyObject* ZstdCompressorIterator_iternext(ZstdCompressorIterator* self) {
	size_t zresult;
	PyObject* readResult = NULL;
	PyObject* chunk;
	char* readBuffer;
	Py_ssize_t readSize = 0;
	Py_ssize_t bufferRemaining;

	if (self->finishedOutput) {
		PyErr_SetString(PyExc_StopIteration, "output flushed");
		return NULL;
	}

feedcompressor:

	/* If we have data left in the input, consume it. */
	if (self->input.pos < self->input.size) {
		Py_BEGIN_ALLOW_THREADS
		if (self->compressor->mtcctx) {
			zresult = ZSTDMT_compressStream(self->compressor->mtcctx,
				&self->output, &self->input);
		}
		else {
			zresult = ZSTD_compressStream(self->compressor->cstream, &self->output,
				&self->input);
		}
		Py_END_ALLOW_THREADS

		/* Release the Python object holding the input buffer. */
		if (self->input.pos == self->input.size) {
			self->input.src = NULL;
			self->input.pos = 0;
			self->input.size = 0;
			/* NOTE(review): in buffer-source mode self->readResult may be
			   NULL here; Py_DECREF(NULL) would crash — verify this path is
			   unreachable for buffer inputs (the later path uses XDECREF). */
			Py_DECREF(self->readResult);
			self->readResult = NULL;
		}

		if (ZSTD_isError(zresult)) {
			PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
			return NULL;
		}

		/* If it produced output data, emit it. */
		if (self->output.pos) {
			chunk = PyBytes_FromStringAndSize(self->output.dst, self->output.pos);
			self->output.pos = 0;
			return chunk;
		}
	}

	/* We should never have output data sitting around after a previous call. */
	assert(self->output.pos == 0);

	/* The code above should have either emitted a chunk and returned or consumed
	the entire input buffer. So the state of the input buffer is not
	relevant. */
	if (!self->finishedInput) {
		if (self->reader) {
			readResult = PyObject_CallMethod(self->reader, "read", "I", self->inSize);
			if (!readResult) {
				PyErr_SetString(ZstdError, "could not read() from source");
				return NULL;
			}

			PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize);
		}
		else {
			assert(self->buffer && self->buffer->buf);

			/* Only support contiguous C arrays. */
			assert(self->buffer->strides == NULL && self->buffer->suboffsets == NULL);
			assert(self->buffer->itemsize == 1);

			readBuffer = (char*)self->buffer->buf + self->bufferOffset;
			bufferRemaining = self->buffer->len - self->bufferOffset;
			readSize = min(bufferRemaining, (Py_ssize_t)self->inSize);
			self->bufferOffset += readSize;
		}

		if (0 == readSize) {
			Py_XDECREF(readResult);
			self->finishedInput = 1;
		}
		else {
			self->readResult = readResult;
		}
	}

	/* EOF: end the frame and flush remaining output. */
	if (0 == readSize) {
		if (self->compressor->mtcctx) {
			zresult = ZSTDMT_endStream(self->compressor->mtcctx, &self->output);
		}
		else {
			zresult = ZSTD_endStream(self->compressor->cstream, &self->output);
		}
		if (ZSTD_isError(zresult)) {
			PyErr_Format(ZstdError, "error ending compression stream: %s",
				ZSTD_getErrorName(zresult));
			return NULL;
		}

		assert(self->output.pos);

		/* zresult == 0 means the epilogue is fully flushed. */
		if (0 == zresult) {
			self->finishedOutput = 1;
		}

		chunk = PyBytes_FromStringAndSize(self->output.dst, self->output.pos);
		self->output.pos = 0;
		return chunk;
	}

	/* New data from reader. Feed into compressor. */
	self->input.src = readBuffer;
	self->input.size = readSize;
	self->input.pos = 0;

	Py_BEGIN_ALLOW_THREADS
	if (self->compressor->mtcctx) {
		zresult = ZSTDMT_compressStream(self->compressor->mtcctx, &self->output,
			&self->input);
	}
	else {
		zresult = ZSTD_compressStream(self->compressor->cstream, &self->output, &self->input);
	}
	Py_END_ALLOW_THREADS

	/* The input buffer currently points to memory managed by Python
	(readBuffer). This object was allocated by this function. If it wasn't
	fully consumed, we need to release it in a subsequent function call.
	If it is fully consumed, do that now.
	*/
	if (self->input.pos == self->input.size) {
		self->input.src = NULL;
		self->input.pos = 0;
		self->input.size = 0;
		Py_XDECREF(self->readResult);
		self->readResult = NULL;
	}

	if (ZSTD_isError(zresult)) {
		PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
		return NULL;
	}

	assert(self->input.pos <= self->input.size);

	/* If we didn't write anything, start the process over. */
	if (0 == self->output.pos) {
		goto feedcompressor;
	}

	chunk = PyBytes_FromStringAndSize(self->output.dst, self->output.pos);
	self->output.pos = 0;
	return chunk;
}
187 200
/* Type object for the internal compressed-data iterator. It is not created
 * directly by user code (tp_init is 0); instances come from compressor APIs. */
PyTypeObject ZstdCompressorIteratorType = {
	PyVarObject_HEAD_INIT(NULL, 0)
	"zstd.ZstdCompressorIterator",  /* tp_name */
	sizeof(ZstdCompressorIterator), /* tp_basicsize */
	0,                              /* tp_itemsize */
	(destructor)ZstdCompressorIterator_dealloc, /* tp_dealloc */
	0,                              /* tp_print */
	0,                              /* tp_getattr */
	0,                              /* tp_setattr */
	0,                              /* tp_compare */
	0,                              /* tp_repr */
	0,                              /* tp_as_number */
	0,                              /* tp_as_sequence */
	0,                              /* tp_as_mapping */
	0,                              /* tp_hash */
	0,                              /* tp_call */
	0,                              /* tp_str */
	0,                              /* tp_getattro */
	0,                              /* tp_setattro */
	0,                              /* tp_as_buffer */
	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
	ZstdCompressorIterator__doc__,  /* tp_doc */
	0,                              /* tp_traverse */
	0,                              /* tp_clear */
	0,                              /* tp_richcompare */
	0,                              /* tp_weaklistoffset */
	ZstdCompressorIterator_iter,    /* tp_iter */
	(iternextfunc)ZstdCompressorIterator_iternext, /* tp_iternext */
	0,                              /* tp_methods */
	0,                              /* tp_members */
	0,                              /* tp_getset */
	0,                              /* tp_base */
	0,                              /* tp_dict */
	0,                              /* tp_descr_get */
	0,                              /* tp_descr_set */
	0,                              /* tp_dictoffset */
	0,                              /* tp_init */
	0,                              /* tp_alloc */
	PyType_GenericNew,              /* tp_new */
};
228 241
/* Finalize the iterator type. Unlike ZstdCompressor, it is not exposed as a
 * module attribute here; instances are created internally. */
void compressoriterator_module_init(PyObject* mod) {
	Py_TYPE(&ZstdCompressorIteratorType) = &PyType_Type;
	if (PyType_Ready(&ZstdCompressorIteratorType) < 0) {
		return;
	}
}
@@ -1,87 +1,87
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
/* The first four bytes of a zstd frame: the frame magic number in
 * little-endian byte order (exposed below as MAGIC_NUMBER /
 * ZSTD_MAGICNUMBER; these bytes are exported as FRAME_HEADER). */
static char frame_header[] = {
	'\x28',
	'\xb5',
	'\x2f',
	'\xfd',
};
19 19
/* Populate module-level constants: version info, the ZstdError exception,
 * the frame header bytes, and the zstd parameter/strategy limits. */
void constants_module_init(PyObject* mod) {
	PyObject* version;
	PyObject* zstdVersion;
	PyObject* frameHeader;

#if PY_MAJOR_VERSION >= 3
	version = PyUnicode_FromString(PYTHON_ZSTANDARD_VERSION);
#else
	version = PyString_FromString(PYTHON_ZSTANDARD_VERSION);
#endif
	Py_INCREF(version);
	PyModule_AddObject(mod, "__version__", version);

	ZstdError = PyErr_NewException("zstd.ZstdError", NULL, NULL);
	PyModule_AddObject(mod, "ZstdError", ZstdError);

	PyModule_AddIntConstant(mod, "COMPRESSOBJ_FLUSH_FINISH", compressorobj_flush_finish);
	PyModule_AddIntConstant(mod, "COMPRESSOBJ_FLUSH_BLOCK", compressorobj_flush_block);

	/* For now, the version is a simple tuple instead of a dedicated type. */
	zstdVersion = PyTuple_New(3);
	PyTuple_SetItem(zstdVersion, 0, PyLong_FromLong(ZSTD_VERSION_MAJOR));
	PyTuple_SetItem(zstdVersion, 1, PyLong_FromLong(ZSTD_VERSION_MINOR));
	PyTuple_SetItem(zstdVersion, 2, PyLong_FromLong(ZSTD_VERSION_RELEASE));
	/* PyModule_AddObject steals a reference; keep one for the module. */
	Py_INCREF(zstdVersion);
	PyModule_AddObject(mod, "ZSTD_VERSION", zstdVersion);

	frameHeader = PyBytes_FromStringAndSize(frame_header, sizeof(frame_header));
	if (frameHeader) {
		PyModule_AddObject(mod, "FRAME_HEADER", frameHeader);
	}
	else {
		PyErr_Format(PyExc_ValueError, "could not create frame header object");
	}

	PyModule_AddIntConstant(mod, "MAX_COMPRESSION_LEVEL", ZSTD_maxCLevel());
	PyModule_AddIntConstant(mod, "COMPRESSION_RECOMMENDED_INPUT_SIZE",
		(long)ZSTD_CStreamInSize());
	PyModule_AddIntConstant(mod, "COMPRESSION_RECOMMENDED_OUTPUT_SIZE",
		(long)ZSTD_CStreamOutSize());
	PyModule_AddIntConstant(mod, "DECOMPRESSION_RECOMMENDED_INPUT_SIZE",
		(long)ZSTD_DStreamInSize());
	PyModule_AddIntConstant(mod, "DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE",
		(long)ZSTD_DStreamOutSize());

	PyModule_AddIntConstant(mod, "MAGIC_NUMBER", ZSTD_MAGICNUMBER);
	PyModule_AddIntConstant(mod, "WINDOWLOG_MIN", ZSTD_WINDOWLOG_MIN);
	PyModule_AddIntConstant(mod, "WINDOWLOG_MAX", ZSTD_WINDOWLOG_MAX);
	PyModule_AddIntConstant(mod, "CHAINLOG_MIN", ZSTD_CHAINLOG_MIN);
	PyModule_AddIntConstant(mod, "CHAINLOG_MAX", ZSTD_CHAINLOG_MAX);
	PyModule_AddIntConstant(mod, "HASHLOG_MIN", ZSTD_HASHLOG_MIN);
	PyModule_AddIntConstant(mod, "HASHLOG_MAX", ZSTD_HASHLOG_MAX);
	PyModule_AddIntConstant(mod, "HASHLOG3_MAX", ZSTD_HASHLOG3_MAX);
	PyModule_AddIntConstant(mod, "SEARCHLOG_MIN", ZSTD_SEARCHLOG_MIN);
	PyModule_AddIntConstant(mod, "SEARCHLOG_MAX", ZSTD_SEARCHLOG_MAX);
	PyModule_AddIntConstant(mod, "SEARCHLENGTH_MIN", ZSTD_SEARCHLENGTH_MIN);
	PyModule_AddIntConstant(mod, "SEARCHLENGTH_MAX", ZSTD_SEARCHLENGTH_MAX);
	PyModule_AddIntConstant(mod, "TARGETLENGTH_MIN", ZSTD_TARGETLENGTH_MIN);
	PyModule_AddIntConstant(mod, "TARGETLENGTH_MAX", ZSTD_TARGETLENGTH_MAX);

	PyModule_AddIntConstant(mod, "STRATEGY_FAST", ZSTD_fast);
	PyModule_AddIntConstant(mod, "STRATEGY_DFAST", ZSTD_dfast);
	PyModule_AddIntConstant(mod, "STRATEGY_GREEDY", ZSTD_greedy);
	PyModule_AddIntConstant(mod, "STRATEGY_LAZY", ZSTD_lazy);
	PyModule_AddIntConstant(mod, "STRATEGY_LAZY2", ZSTD_lazy2);
	PyModule_AddIntConstant(mod, "STRATEGY_BTLAZY2", ZSTD_btlazy2);
	PyModule_AddIntConstant(mod, "STRATEGY_BTOPT", ZSTD_btopt);
}
@@ -1,188 +1,179
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 PyDoc_STRVAR(ZstdDecompressionWriter__doc,
14 14 """A context manager used for writing decompressed output.\n"
15 15 );
16 16
17 17 static void ZstdDecompressionWriter_dealloc(ZstdDecompressionWriter* self) {
18 18 Py_XDECREF(self->decompressor);
19 19 Py_XDECREF(self->writer);
20 20
21 if (self->dstream) {
22 ZSTD_freeDStream(self->dstream);
23 self->dstream = NULL;
24 }
25
26 21 PyObject_Del(self);
27 22 }
28 23
29 24 static PyObject* ZstdDecompressionWriter_enter(ZstdDecompressionWriter* self) {
30 25 if (self->entered) {
31 26 PyErr_SetString(ZstdError, "cannot __enter__ multiple times");
32 27 return NULL;
33 28 }
34 29
35 self->dstream = DStream_from_ZstdDecompressor(self->decompressor);
36 if (!self->dstream) {
30 if (0 != init_dstream(self->decompressor)) {
37 31 return NULL;
38 32 }
39 33
40 34 self->entered = 1;
41 35
42 36 Py_INCREF(self);
43 37 return (PyObject*)self;
44 38 }
45 39
46 40 static PyObject* ZstdDecompressionWriter_exit(ZstdDecompressionWriter* self, PyObject* args) {
47 41 self->entered = 0;
48 42
49 if (self->dstream) {
50 ZSTD_freeDStream(self->dstream);
51 self->dstream = NULL;
52 }
53
54 43 Py_RETURN_FALSE;
55 44 }
56 45
57 46 static PyObject* ZstdDecompressionWriter_memory_size(ZstdDecompressionWriter* self) {
58 if (!self->dstream) {
47 if (!self->decompressor->dstream) {
59 48 PyErr_SetString(ZstdError, "cannot determine size of inactive decompressor; "
60 49 "call when context manager is active");
61 50 return NULL;
62 51 }
63 52
64 return PyLong_FromSize_t(ZSTD_sizeof_DStream(self->dstream));
53 return PyLong_FromSize_t(ZSTD_sizeof_DStream(self->decompressor->dstream));
65 54 }
66 55
67 56 static PyObject* ZstdDecompressionWriter_write(ZstdDecompressionWriter* self, PyObject* args) {
68 57 const char* source;
69 58 Py_ssize_t sourceSize;
70 59 size_t zresult = 0;
71 60 ZSTD_inBuffer input;
72 61 ZSTD_outBuffer output;
73 62 PyObject* res;
74 63 Py_ssize_t totalWrite = 0;
75 64
76 65 #if PY_MAJOR_VERSION >= 3
77 66 if (!PyArg_ParseTuple(args, "y#:write", &source, &sourceSize)) {
78 67 #else
79 68 if (!PyArg_ParseTuple(args, "s#:write", &source, &sourceSize)) {
80 69 #endif
81 70 return NULL;
82 71 }
83 72
84 73 if (!self->entered) {
85 74 PyErr_SetString(ZstdError, "write must be called from an active context manager");
86 75 return NULL;
87 76 }
88 77
78 assert(self->decompressor->dstream);
79
89 80 output.dst = PyMem_Malloc(self->outSize);
90 81 if (!output.dst) {
91 82 return PyErr_NoMemory();
92 83 }
93 84 output.size = self->outSize;
94 85 output.pos = 0;
95 86
96 87 input.src = source;
97 88 input.size = sourceSize;
98 89 input.pos = 0;
99 90
100 91 while ((ssize_t)input.pos < sourceSize) {
101 92 Py_BEGIN_ALLOW_THREADS
102 zresult = ZSTD_decompressStream(self->dstream, &output, &input);
93 zresult = ZSTD_decompressStream(self->decompressor->dstream, &output, &input);
103 94 Py_END_ALLOW_THREADS
104 95
105 96 if (ZSTD_isError(zresult)) {
106 97 PyMem_Free(output.dst);
107 98 PyErr_Format(ZstdError, "zstd decompress error: %s",
108 99 ZSTD_getErrorName(zresult));
109 100 return NULL;
110 101 }
111 102
112 103 if (output.pos) {
113 104 #if PY_MAJOR_VERSION >= 3
114 105 res = PyObject_CallMethod(self->writer, "write", "y#",
115 106 #else
116 107 res = PyObject_CallMethod(self->writer, "write", "s#",
117 108 #endif
118 109 output.dst, output.pos);
119 110 Py_XDECREF(res);
120 111 totalWrite += output.pos;
121 112 output.pos = 0;
122 113 }
123 114 }
124 115
125 116 PyMem_Free(output.dst);
126 117
127 118 return PyLong_FromSsize_t(totalWrite);
128 119 }
129 120
130 121 static PyMethodDef ZstdDecompressionWriter_methods[] = {
131 122 { "__enter__", (PyCFunction)ZstdDecompressionWriter_enter, METH_NOARGS,
132 123 PyDoc_STR("Enter a decompression context.") },
133 124 { "__exit__", (PyCFunction)ZstdDecompressionWriter_exit, METH_VARARGS,
134 125 PyDoc_STR("Exit a decompression context.") },
135 126 { "memory_size", (PyCFunction)ZstdDecompressionWriter_memory_size, METH_NOARGS,
136 127 PyDoc_STR("Obtain the memory size in bytes of the underlying decompressor.") },
137 128 { "write", (PyCFunction)ZstdDecompressionWriter_write, METH_VARARGS,
138 129 PyDoc_STR("Compress data") },
139 130 { NULL, NULL }
140 131 };
141 132
142 133 PyTypeObject ZstdDecompressionWriterType = {
143 134 PyVarObject_HEAD_INIT(NULL, 0)
144 135 "zstd.ZstdDecompressionWriter", /* tp_name */
145 136 sizeof(ZstdDecompressionWriter),/* tp_basicsize */
146 137 0, /* tp_itemsize */
147 138 (destructor)ZstdDecompressionWriter_dealloc, /* tp_dealloc */
148 139 0, /* tp_print */
149 140 0, /* tp_getattr */
150 141 0, /* tp_setattr */
151 142 0, /* tp_compare */
152 143 0, /* tp_repr */
153 144 0, /* tp_as_number */
154 145 0, /* tp_as_sequence */
155 146 0, /* tp_as_mapping */
156 147 0, /* tp_hash */
157 148 0, /* tp_call */
158 149 0, /* tp_str */
159 150 0, /* tp_getattro */
160 151 0, /* tp_setattro */
161 152 0, /* tp_as_buffer */
162 153 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
163 154 ZstdDecompressionWriter__doc, /* tp_doc */
164 155 0, /* tp_traverse */
165 156 0, /* tp_clear */
166 157 0, /* tp_richcompare */
167 158 0, /* tp_weaklistoffset */
168 159 0, /* tp_iter */
169 160 0, /* tp_iternext */
170 161 ZstdDecompressionWriter_methods,/* tp_methods */
171 162 0, /* tp_members */
172 163 0, /* tp_getset */
173 164 0, /* tp_base */
174 165 0, /* tp_dict */
175 166 0, /* tp_descr_get */
176 167 0, /* tp_descr_set */
177 168 0, /* tp_dictoffset */
178 169 0, /* tp_init */
179 170 0, /* tp_alloc */
180 171 PyType_GenericNew, /* tp_new */
181 172 };
182 173
183 174 void decompressionwriter_module_init(PyObject* mod) {
184 175 Py_TYPE(&ZstdDecompressionWriterType) = &PyType_Type;
185 176 if (PyType_Ready(&ZstdDecompressionWriterType) < 0) {
186 177 return;
187 178 }
188 179 }
@@ -1,170 +1,167
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 PyDoc_STRVAR(DecompressionObj__doc__,
14 14 "Perform decompression using a standard library compatible API.\n"
15 15 );
16 16
17 17 static void DecompressionObj_dealloc(ZstdDecompressionObj* self) {
18 if (self->dstream) {
19 ZSTD_freeDStream(self->dstream);
20 self->dstream = NULL;
21 }
22
23 18 Py_XDECREF(self->decompressor);
24 19
25 20 PyObject_Del(self);
26 21 }
27 22
28 23 static PyObject* DecompressionObj_decompress(ZstdDecompressionObj* self, PyObject* args) {
29 24 const char* source;
30 25 Py_ssize_t sourceSize;
31 26 size_t zresult;
32 27 ZSTD_inBuffer input;
33 28 ZSTD_outBuffer output;
34 29 size_t outSize = ZSTD_DStreamOutSize();
35 30 PyObject* result = NULL;
36 31 Py_ssize_t resultSize = 0;
37 32
33 /* Constructor should ensure stream is populated. */
34 assert(self->decompressor->dstream);
35
38 36 if (self->finished) {
39 37 PyErr_SetString(ZstdError, "cannot use a decompressobj multiple times");
40 38 return NULL;
41 39 }
42 40
43 41 #if PY_MAJOR_VERSION >= 3
44 42 if (!PyArg_ParseTuple(args, "y#:decompress",
45 43 #else
46 44 if (!PyArg_ParseTuple(args, "s#:decompress",
47 45 #endif
48 46 &source, &sourceSize)) {
49 47 return NULL;
50 48 }
51 49
52 50 input.src = source;
53 51 input.size = sourceSize;
54 52 input.pos = 0;
55 53
56 54 output.dst = PyMem_Malloc(outSize);
57 55 if (!output.dst) {
58 56 PyErr_NoMemory();
59 57 return NULL;
60 58 }
61 59 output.size = outSize;
62 60 output.pos = 0;
63 61
64 62 /* Read input until exhausted. */
65 63 while (input.pos < input.size) {
66 64 Py_BEGIN_ALLOW_THREADS
67 zresult = ZSTD_decompressStream(self->dstream, &output, &input);
65 zresult = ZSTD_decompressStream(self->decompressor->dstream, &output, &input);
68 66 Py_END_ALLOW_THREADS
69 67
70 68 if (ZSTD_isError(zresult)) {
71 69 PyErr_Format(ZstdError, "zstd decompressor error: %s",
72 70 ZSTD_getErrorName(zresult));
73 71 result = NULL;
74 72 goto finally;
75 73 }
76 74
77 75 if (0 == zresult) {
78 76 self->finished = 1;
79 77 }
80 78
81 79 if (output.pos) {
82 80 if (result) {
83 81 resultSize = PyBytes_GET_SIZE(result);
84 82 if (-1 == _PyBytes_Resize(&result, resultSize + output.pos)) {
85 83 goto except;
86 84 }
87 85
88 86 memcpy(PyBytes_AS_STRING(result) + resultSize,
89 87 output.dst, output.pos);
90 88 }
91 89 else {
92 90 result = PyBytes_FromStringAndSize(output.dst, output.pos);
93 91 if (!result) {
94 92 goto except;
95 93 }
96 94 }
97 95
98 96 output.pos = 0;
99 97 }
100 98 }
101 99
102 100 if (!result) {
103 101 result = PyBytes_FromString("");
104 102 }
105 103
106 104 goto finally;
107 105
108 106 except:
109 Py_DecRef(result);
110 result = NULL;
107 Py_CLEAR(result);
111 108
112 109 finally:
113 110 PyMem_Free(output.dst);
114 111
115 112 return result;
116 113 }
117 114
118 115 static PyMethodDef DecompressionObj_methods[] = {
119 116 { "decompress", (PyCFunction)DecompressionObj_decompress,
120 117 METH_VARARGS, PyDoc_STR("decompress data") },
121 118 { NULL, NULL }
122 119 };
123 120
124 121 PyTypeObject ZstdDecompressionObjType = {
125 122 PyVarObject_HEAD_INIT(NULL, 0)
126 123 "zstd.ZstdDecompressionObj", /* tp_name */
127 124 sizeof(ZstdDecompressionObj), /* tp_basicsize */
128 125 0, /* tp_itemsize */
129 126 (destructor)DecompressionObj_dealloc, /* tp_dealloc */
130 127 0, /* tp_print */
131 128 0, /* tp_getattr */
132 129 0, /* tp_setattr */
133 130 0, /* tp_compare */
134 131 0, /* tp_repr */
135 132 0, /* tp_as_number */
136 133 0, /* tp_as_sequence */
137 134 0, /* tp_as_mapping */
138 135 0, /* tp_hash */
139 136 0, /* tp_call */
140 137 0, /* tp_str */
141 138 0, /* tp_getattro */
142 139 0, /* tp_setattro */
143 140 0, /* tp_as_buffer */
144 141 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
145 142 DecompressionObj__doc__, /* tp_doc */
146 143 0, /* tp_traverse */
147 144 0, /* tp_clear */
148 145 0, /* tp_richcompare */
149 146 0, /* tp_weaklistoffset */
150 147 0, /* tp_iter */
151 148 0, /* tp_iternext */
152 149 DecompressionObj_methods, /* tp_methods */
153 150 0, /* tp_members */
154 151 0, /* tp_getset */
155 152 0, /* tp_base */
156 153 0, /* tp_dict */
157 154 0, /* tp_descr_get */
158 155 0, /* tp_descr_set */
159 156 0, /* tp_dictoffset */
160 157 0, /* tp_init */
161 158 0, /* tp_alloc */
162 159 PyType_GenericNew, /* tp_new */
163 160 };
164 161
165 162 void decompressobj_module_init(PyObject* module) {
166 163 Py_TYPE(&ZstdDecompressionObjType) = &PyType_Type;
167 164 if (PyType_Ready(&ZstdDecompressionObjType) < 0) {
168 165 return;
169 166 }
170 167 }
This diff has been collapsed as it changes many lines, (859 lines changed) Show them Hide them
@@ -1,845 +1,1580
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 #include "pool.h"
10 11
11 12 extern PyObject* ZstdError;
12 13
13 ZSTD_DStream* DStream_from_ZstdDecompressor(ZstdDecompressor* decompressor) {
14 ZSTD_DStream* dstream;
14 /**
15 * Ensure the ZSTD_DStream on a ZstdDecompressor is initialized and reset.
16 *
17 * This should be called before starting a decompression operation with a
18 * ZSTD_DStream on a ZstdDecompressor.
19 */
20 int init_dstream(ZstdDecompressor* decompressor) {
15 21 void* dictData = NULL;
16 22 size_t dictSize = 0;
17 23 size_t zresult;
18 24
19 dstream = ZSTD_createDStream();
20 if (!dstream) {
25 /* Simple case of dstream already exists. Just reset it. */
26 if (decompressor->dstream) {
27 zresult = ZSTD_resetDStream(decompressor->dstream);
28 if (ZSTD_isError(zresult)) {
29 PyErr_Format(ZstdError, "could not reset DStream: %s",
30 ZSTD_getErrorName(zresult));
31 return -1;
32 }
33
34 return 0;
35 }
36
37 decompressor->dstream = ZSTD_createDStream();
38 if (!decompressor->dstream) {
21 39 PyErr_SetString(ZstdError, "could not create DStream");
22 return NULL;
40 return -1;
23 41 }
24 42
25 43 if (decompressor->dict) {
26 44 dictData = decompressor->dict->dictData;
27 45 dictSize = decompressor->dict->dictSize;
28 46 }
29 47
30 48 if (dictData) {
31 zresult = ZSTD_initDStream_usingDict(dstream, dictData, dictSize);
49 zresult = ZSTD_initDStream_usingDict(decompressor->dstream, dictData, dictSize);
32 50 }
33 51 else {
34 zresult = ZSTD_initDStream(dstream);
52 zresult = ZSTD_initDStream(decompressor->dstream);
35 53 }
36 54
37 55 if (ZSTD_isError(zresult)) {
56 /* Don't leave a reference to an invalid object. */
57 ZSTD_freeDStream(decompressor->dstream);
58 decompressor->dstream = NULL;
59
38 60 PyErr_Format(ZstdError, "could not initialize DStream: %s",
39 61 ZSTD_getErrorName(zresult));
40 return NULL;
62 return -1;
41 63 }
42 64
43 return dstream;
65 return 0;
44 66 }
45 67
46 68 PyDoc_STRVAR(Decompressor__doc__,
47 69 "ZstdDecompressor(dict_data=None)\n"
48 70 "\n"
49 71 "Create an object used to perform Zstandard decompression.\n"
50 72 "\n"
51 73 "An instance can perform multiple decompression operations."
52 74 );
53 75
54 76 static int Decompressor_init(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
55 77 static char* kwlist[] = {
56 78 "dict_data",
57 79 NULL
58 80 };
59 81
60 82 ZstdCompressionDict* dict = NULL;
61 83
62 84 self->dctx = NULL;
63 85 self->dict = NULL;
64 86 self->ddict = NULL;
65 87
66 88 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!:ZstdDecompressor", kwlist,
67 89 &ZstdCompressionDictType, &dict)) {
68 90 return -1;
69 91 }
70 92
71 93 /* TODO lazily initialize the reference ZSTD_DCtx on first use since
72 94 not instances of ZstdDecompressor will use a ZSTD_DCtx. */
73 95 self->dctx = ZSTD_createDCtx();
74 96 if (!self->dctx) {
75 97 PyErr_NoMemory();
76 98 goto except;
77 99 }
78 100
79 101 if (dict) {
80 102 self->dict = dict;
81 103 Py_INCREF(dict);
82 104 }
83 105
84 106 return 0;
85 107
86 108 except:
87 109 if (self->dctx) {
88 110 ZSTD_freeDCtx(self->dctx);
89 111 self->dctx = NULL;
90 112 }
91 113
92 114 return -1;
93 115 }
94 116
95 117 static void Decompressor_dealloc(ZstdDecompressor* self) {
96 if (self->dctx) {
97 ZSTD_freeDCtx(self->dctx);
98 }
99
100 Py_XDECREF(self->dict);
118 Py_CLEAR(self->dict);
101 119
102 120 if (self->ddict) {
103 121 ZSTD_freeDDict(self->ddict);
104 122 self->ddict = NULL;
105 123 }
106 124
125 if (self->dstream) {
126 ZSTD_freeDStream(self->dstream);
127 self->dstream = NULL;
128 }
129
130 if (self->dctx) {
131 ZSTD_freeDCtx(self->dctx);
132 self->dctx = NULL;
133 }
134
107 135 PyObject_Del(self);
108 136 }
109 137
110 138 PyDoc_STRVAR(Decompressor_copy_stream__doc__,
111 139 "copy_stream(ifh, ofh[, read_size=default, write_size=default]) -- decompress data between streams\n"
112 140 "\n"
113 141 "Compressed data will be read from ``ifh``, decompressed, and written to\n"
114 142 "``ofh``. ``ifh`` must have a ``read(size)`` method. ``ofh`` must have a\n"
115 143 "``write(data)`` method.\n"
116 144 "\n"
117 145 "The optional ``read_size`` and ``write_size`` arguments control the chunk\n"
118 146 "size of data that is ``read()`` and ``write()`` between streams. They default\n"
119 147 "to the default input and output sizes of zstd decompressor streams.\n"
120 148 );
121 149
122 150 static PyObject* Decompressor_copy_stream(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
123 151 static char* kwlist[] = {
124 152 "ifh",
125 153 "ofh",
126 154 "read_size",
127 155 "write_size",
128 156 NULL
129 157 };
130 158
131 159 PyObject* source;
132 160 PyObject* dest;
133 161 size_t inSize = ZSTD_DStreamInSize();
134 162 size_t outSize = ZSTD_DStreamOutSize();
135 ZSTD_DStream* dstream;
136 163 ZSTD_inBuffer input;
137 164 ZSTD_outBuffer output;
138 165 Py_ssize_t totalRead = 0;
139 166 Py_ssize_t totalWrite = 0;
140 167 char* readBuffer;
141 168 Py_ssize_t readSize;
142 169 PyObject* readResult;
143 170 PyObject* res = NULL;
144 171 size_t zresult = 0;
145 172 PyObject* writeResult;
146 173 PyObject* totalReadPy;
147 174 PyObject* totalWritePy;
148 175
149 176 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|kk:copy_stream", kwlist,
150 177 &source, &dest, &inSize, &outSize)) {
151 178 return NULL;
152 179 }
153 180
154 181 if (!PyObject_HasAttrString(source, "read")) {
155 182 PyErr_SetString(PyExc_ValueError, "first argument must have a read() method");
156 183 return NULL;
157 184 }
158 185
159 186 if (!PyObject_HasAttrString(dest, "write")) {
160 187 PyErr_SetString(PyExc_ValueError, "second argument must have a write() method");
161 188 return NULL;
162 189 }
163 190
164 191 /* Prevent free on uninitialized memory in finally. */
165 192 output.dst = NULL;
166 193
167 dstream = DStream_from_ZstdDecompressor(self);
168 if (!dstream) {
194 if (0 != init_dstream(self)) {
169 195 res = NULL;
170 196 goto finally;
171 197 }
172 198
173 199 output.dst = PyMem_Malloc(outSize);
174 200 if (!output.dst) {
175 201 PyErr_NoMemory();
176 202 res = NULL;
177 203 goto finally;
178 204 }
179 205 output.size = outSize;
180 206 output.pos = 0;
181 207
182 208 /* Read source stream until EOF */
183 209 while (1) {
184 210 readResult = PyObject_CallMethod(source, "read", "n", inSize);
185 211 if (!readResult) {
186 212 PyErr_SetString(ZstdError, "could not read() from source");
187 213 goto finally;
188 214 }
189 215
190 216 PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize);
191 217
192 218 /* If no data was read, we're at EOF. */
193 219 if (0 == readSize) {
194 220 break;
195 221 }
196 222
197 223 totalRead += readSize;
198 224
199 225 /* Send data to decompressor */
200 226 input.src = readBuffer;
201 227 input.size = readSize;
202 228 input.pos = 0;
203 229
204 230 while (input.pos < input.size) {
205 231 Py_BEGIN_ALLOW_THREADS
206 zresult = ZSTD_decompressStream(dstream, &output, &input);
232 zresult = ZSTD_decompressStream(self->dstream, &output, &input);
207 233 Py_END_ALLOW_THREADS
208 234
209 235 if (ZSTD_isError(zresult)) {
210 236 PyErr_Format(ZstdError, "zstd decompressor error: %s",
211 237 ZSTD_getErrorName(zresult));
212 238 res = NULL;
213 239 goto finally;
214 240 }
215 241
216 242 if (output.pos) {
217 243 #if PY_MAJOR_VERSION >= 3
218 244 writeResult = PyObject_CallMethod(dest, "write", "y#",
219 245 #else
220 246 writeResult = PyObject_CallMethod(dest, "write", "s#",
221 247 #endif
222 248 output.dst, output.pos);
223 249
224 250 Py_XDECREF(writeResult);
225 251 totalWrite += output.pos;
226 252 output.pos = 0;
227 253 }
228 254 }
229 255 }
230 256
231 257 /* Source stream is exhausted. Finish up. */
232 258
233 ZSTD_freeDStream(dstream);
234 dstream = NULL;
235
236 259 totalReadPy = PyLong_FromSsize_t(totalRead);
237 260 totalWritePy = PyLong_FromSsize_t(totalWrite);
238 261 res = PyTuple_Pack(2, totalReadPy, totalWritePy);
239 Py_DecRef(totalReadPy);
240 Py_DecRef(totalWritePy);
262 Py_DECREF(totalReadPy);
263 Py_DECREF(totalWritePy);
241 264
242 265 finally:
243 266 if (output.dst) {
244 267 PyMem_Free(output.dst);
245 268 }
246 269
247 if (dstream) {
248 ZSTD_freeDStream(dstream);
249 }
250
251 270 return res;
252 271 }
253 272
254 273 PyDoc_STRVAR(Decompressor_decompress__doc__,
255 274 "decompress(data[, max_output_size=None]) -- Decompress data in its entirety\n"
256 275 "\n"
257 276 "This method will decompress the entirety of the argument and return the\n"
258 277 "result.\n"
259 278 "\n"
260 279 "The input bytes are expected to contain a full Zstandard frame (something\n"
261 280 "compressed with ``ZstdCompressor.compress()`` or similar). If the input does\n"
262 281 "not contain a full frame, an exception will be raised.\n"
263 282 "\n"
264 283 "If the frame header of the compressed data does not contain the content size\n"
265 284 "``max_output_size`` must be specified or ``ZstdError`` will be raised. An\n"
266 285 "allocation of size ``max_output_size`` will be performed and an attempt will\n"
267 286 "be made to perform decompression into that buffer. If the buffer is too\n"
268 287 "small or cannot be allocated, ``ZstdError`` will be raised. The buffer will\n"
269 288 "be resized if it is too large.\n"
270 289 "\n"
271 290 "Uncompressed data could be much larger than compressed data. As a result,\n"
272 291 "calling this function could result in a very large memory allocation being\n"
273 292 "performed to hold the uncompressed data. Therefore it is **highly**\n"
274 293 "recommended to use a streaming decompression method instead of this one.\n"
275 294 );
276 295
277 296 PyObject* Decompressor_decompress(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
278 297 static char* kwlist[] = {
279 298 "data",
280 299 "max_output_size",
281 300 NULL
282 301 };
283 302
284 303 const char* source;
285 304 Py_ssize_t sourceSize;
286 305 Py_ssize_t maxOutputSize = 0;
287 306 unsigned long long decompressedSize;
288 307 size_t destCapacity;
289 308 PyObject* result = NULL;
290 309 void* dictData = NULL;
291 310 size_t dictSize = 0;
292 311 size_t zresult;
293 312
294 313 #if PY_MAJOR_VERSION >= 3
295 314 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|n:decompress",
296 315 #else
297 316 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|n:decompress",
298 317 #endif
299 318 kwlist, &source, &sourceSize, &maxOutputSize)) {
300 319 return NULL;
301 320 }
302 321
303 322 if (self->dict) {
304 323 dictData = self->dict->dictData;
305 324 dictSize = self->dict->dictSize;
306 325 }
307 326
308 327 if (dictData && !self->ddict) {
309 328 Py_BEGIN_ALLOW_THREADS
310 329 self->ddict = ZSTD_createDDict_byReference(dictData, dictSize);
311 330 Py_END_ALLOW_THREADS
312 331
313 332 if (!self->ddict) {
314 333 PyErr_SetString(ZstdError, "could not create decompression dict");
315 334 return NULL;
316 335 }
317 336 }
318 337
319 338 decompressedSize = ZSTD_getDecompressedSize(source, sourceSize);
320 339 /* 0 returned if content size not in the zstd frame header */
321 340 if (0 == decompressedSize) {
322 341 if (0 == maxOutputSize) {
323 342 PyErr_SetString(ZstdError, "input data invalid or missing content size "
324 343 "in frame header");
325 344 return NULL;
326 345 }
327 346 else {
328 347 result = PyBytes_FromStringAndSize(NULL, maxOutputSize);
329 348 destCapacity = maxOutputSize;
330 349 }
331 350 }
332 351 else {
333 352 result = PyBytes_FromStringAndSize(NULL, decompressedSize);
334 353 destCapacity = decompressedSize;
335 354 }
336 355
337 356 if (!result) {
338 357 return NULL;
339 358 }
340 359
341 360 Py_BEGIN_ALLOW_THREADS
342 361 if (self->ddict) {
343 362 zresult = ZSTD_decompress_usingDDict(self->dctx,
344 363 PyBytes_AsString(result), destCapacity,
345 364 source, sourceSize, self->ddict);
346 365 }
347 366 else {
348 367 zresult = ZSTD_decompressDCtx(self->dctx,
349 368 PyBytes_AsString(result), destCapacity, source, sourceSize);
350 369 }
351 370 Py_END_ALLOW_THREADS
352 371
353 372 if (ZSTD_isError(zresult)) {
354 373 PyErr_Format(ZstdError, "decompression error: %s", ZSTD_getErrorName(zresult));
355 Py_DecRef(result);
374 Py_DECREF(result);
356 375 return NULL;
357 376 }
358 377 else if (decompressedSize && zresult != decompressedSize) {
359 378 PyErr_Format(ZstdError, "decompression error: decompressed %zu bytes; expected %llu",
360 379 zresult, decompressedSize);
361 Py_DecRef(result);
380 Py_DECREF(result);
362 381 return NULL;
363 382 }
364 383 else if (zresult < destCapacity) {
365 384 if (_PyBytes_Resize(&result, zresult)) {
366 Py_DecRef(result);
385 Py_DECREF(result);
367 386 return NULL;
368 387 }
369 388 }
370 389
371 390 return result;
372 391 }
373 392
374 393 PyDoc_STRVAR(Decompressor_decompressobj__doc__,
375 394 "decompressobj()\n"
376 395 "\n"
377 396 "Incrementally feed data into a decompressor.\n"
378 397 "\n"
379 398 "The returned object exposes a ``decompress(data)`` method. This makes it\n"
380 399 "compatible with ``zlib.decompressobj`` and ``bz2.BZ2Decompressor`` so that\n"
381 400 "callers can swap in the zstd decompressor while using the same API.\n"
382 401 );
383 402
384 403 static ZstdDecompressionObj* Decompressor_decompressobj(ZstdDecompressor* self) {
385 ZstdDecompressionObj* result = PyObject_New(ZstdDecompressionObj, &ZstdDecompressionObjType);
404 ZstdDecompressionObj* result = (ZstdDecompressionObj*)PyObject_CallObject((PyObject*)&ZstdDecompressionObjType, NULL);
386 405 if (!result) {
387 406 return NULL;
388 407 }
389 408
390 result->dstream = DStream_from_ZstdDecompressor(self);
391 if (!result->dstream) {
392 Py_DecRef((PyObject*)result);
409 if (0 != init_dstream(self)) {
410 Py_DECREF(result);
393 411 return NULL;
394 412 }
395 413
396 414 result->decompressor = self;
397 415 Py_INCREF(result->decompressor);
398 416
399 result->finished = 0;
400
401 417 return result;
402 418 }
403 419
404 420 PyDoc_STRVAR(Decompressor_read_from__doc__,
405 421 "read_from(reader[, read_size=default, write_size=default, skip_bytes=0])\n"
406 422 "Read compressed data and return an iterator\n"
407 423 "\n"
408 424 "Returns an iterator of decompressed data chunks produced from reading from\n"
409 425 "the ``reader``.\n"
410 426 "\n"
411 427 "Compressed data will be obtained from ``reader`` by calling the\n"
412 428 "``read(size)`` method of it. The source data will be streamed into a\n"
413 429 "decompressor. As decompressed data is available, it will be exposed to the\n"
414 430 "returned iterator.\n"
415 431 "\n"
416 432 "Data is ``read()`` in chunks of size ``read_size`` and exposed to the\n"
417 433 "iterator in chunks of size ``write_size``. The default values are the input\n"
418 434 "and output sizes for a zstd streaming decompressor.\n"
419 435 "\n"
420 436 "There is also support for skipping the first ``skip_bytes`` of data from\n"
421 437 "the source.\n"
422 438 );
423 439
424 440 static ZstdDecompressorIterator* Decompressor_read_from(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
425 441 static char* kwlist[] = {
426 442 "reader",
427 443 "read_size",
428 444 "write_size",
429 445 "skip_bytes",
430 446 NULL
431 447 };
432 448
433 449 PyObject* reader;
434 450 size_t inSize = ZSTD_DStreamInSize();
435 451 size_t outSize = ZSTD_DStreamOutSize();
436 452 ZstdDecompressorIterator* result;
437 453 size_t skipBytes = 0;
438 454
439 455 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk:read_from", kwlist,
440 456 &reader, &inSize, &outSize, &skipBytes)) {
441 457 return NULL;
442 458 }
443 459
444 460 if (skipBytes >= inSize) {
445 461 PyErr_SetString(PyExc_ValueError,
446 462 "skip_bytes must be smaller than read_size");
447 463 return NULL;
448 464 }
449 465
450 result = PyObject_New(ZstdDecompressorIterator, &ZstdDecompressorIteratorType);
466 result = (ZstdDecompressorIterator*)PyObject_CallObject((PyObject*)&ZstdDecompressorIteratorType, NULL);
451 467 if (!result) {
452 468 return NULL;
453 469 }
454 470
455 result->decompressor = NULL;
456 result->reader = NULL;
457 result->buffer = NULL;
458 result->dstream = NULL;
459 result->input.src = NULL;
460 result->output.dst = NULL;
461
462 471 if (PyObject_HasAttrString(reader, "read")) {
463 472 result->reader = reader;
464 473 Py_INCREF(result->reader);
465 474 }
466 475 else if (1 == PyObject_CheckBuffer(reader)) {
467 476 /* Object claims it is a buffer. Try to get a handle to it. */
468 477 result->buffer = PyMem_Malloc(sizeof(Py_buffer));
469 478 if (!result->buffer) {
470 479 goto except;
471 480 }
472 481
473 482 memset(result->buffer, 0, sizeof(Py_buffer));
474 483
475 484 if (0 != PyObject_GetBuffer(reader, result->buffer, PyBUF_CONTIG_RO)) {
476 485 goto except;
477 486 }
478
479 result->bufferOffset = 0;
480 487 }
481 488 else {
482 489 PyErr_SetString(PyExc_ValueError,
483 490 "must pass an object with a read() method or conforms to buffer protocol");
484 491 goto except;
485 492 }
486 493
487 494 result->decompressor = self;
488 495 Py_INCREF(result->decompressor);
489 496
490 497 result->inSize = inSize;
491 498 result->outSize = outSize;
492 499 result->skipBytes = skipBytes;
493 500
494 result->dstream = DStream_from_ZstdDecompressor(self);
495 if (!result->dstream) {
501 if (0 != init_dstream(self)) {
496 502 goto except;
497 503 }
498 504
499 505 result->input.src = PyMem_Malloc(inSize);
500 506 if (!result->input.src) {
501 507 PyErr_NoMemory();
502 508 goto except;
503 509 }
504 result->input.size = 0;
505 result->input.pos = 0;
506
507 result->output.dst = NULL;
508 result->output.size = 0;
509 result->output.pos = 0;
510
511 result->readCount = 0;
512 result->finishedInput = 0;
513 result->finishedOutput = 0;
514 510
515 511 goto finally;
516 512
517 513 except:
518 514 Py_CLEAR(result->reader);
519 515
520 516 if (result->buffer) {
521 517 PyBuffer_Release(result->buffer);
522 518 Py_CLEAR(result->buffer);
523 519 }
524 520
525 521 Py_CLEAR(result);
526 522
527 523 finally:
528 524
529 525 return result;
530 526 }
531 527
532 528 PyDoc_STRVAR(Decompressor_write_to__doc__,
533 529 "Create a context manager to write decompressed data to an object.\n"
534 530 "\n"
535 531 "The passed object must have a ``write()`` method.\n"
536 532 "\n"
537 533 "The caller feeds intput data to the object by calling ``write(data)``.\n"
538 534 "Decompressed data is written to the argument given as it is decompressed.\n"
539 535 "\n"
540 536 "An optional ``write_size`` argument defines the size of chunks to\n"
541 537 "``write()`` to the writer. It defaults to the default output size for a zstd\n"
542 538 "streaming decompressor.\n"
543 539 );
544 540
545 541 static ZstdDecompressionWriter* Decompressor_write_to(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
546 542 static char* kwlist[] = {
547 543 "writer",
548 544 "write_size",
549 545 NULL
550 546 };
551 547
552 548 PyObject* writer;
553 549 size_t outSize = ZSTD_DStreamOutSize();
554 550 ZstdDecompressionWriter* result;
555 551
556 552 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k:write_to", kwlist,
557 553 &writer, &outSize)) {
558 554 return NULL;
559 555 }
560 556
561 557 if (!PyObject_HasAttrString(writer, "write")) {
562 558 PyErr_SetString(PyExc_ValueError, "must pass an object with a write() method");
563 559 return NULL;
564 560 }
565 561
566 result = PyObject_New(ZstdDecompressionWriter, &ZstdDecompressionWriterType);
562 result = (ZstdDecompressionWriter*)PyObject_CallObject((PyObject*)&ZstdDecompressionWriterType, NULL);
567 563 if (!result) {
568 564 return NULL;
569 565 }
570 566
571 567 result->decompressor = self;
572 568 Py_INCREF(result->decompressor);
573 569
574 570 result->writer = writer;
575 571 Py_INCREF(result->writer);
576 572
577 573 result->outSize = outSize;
578 574
579 result->entered = 0;
580 result->dstream = NULL;
581
582 575 return result;
583 576 }
584 577
585 578 PyDoc_STRVAR(Decompressor_decompress_content_dict_chain__doc__,
586 579 "Decompress a series of chunks using the content dictionary chaining technique\n"
587 580 );
588 581
589 582 static PyObject* Decompressor_decompress_content_dict_chain(PyObject* self, PyObject* args, PyObject* kwargs) {
590 583 static char* kwlist[] = {
591 584 "frames",
592 585 NULL
593 586 };
594 587
595 588 PyObject* chunks;
596 589 Py_ssize_t chunksLen;
597 590 Py_ssize_t chunkIndex;
598 591 char parity = 0;
599 592 PyObject* chunk;
600 593 char* chunkData;
601 594 Py_ssize_t chunkSize;
602 595 ZSTD_DCtx* dctx = NULL;
603 596 size_t zresult;
604 597 ZSTD_frameParams frameParams;
605 598 void* buffer1 = NULL;
606 599 size_t buffer1Size = 0;
607 600 size_t buffer1ContentSize = 0;
608 601 void* buffer2 = NULL;
609 602 size_t buffer2Size = 0;
610 603 size_t buffer2ContentSize = 0;
611 604 void* destBuffer = NULL;
612 605 PyObject* result = NULL;
613 606
614 607 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!:decompress_content_dict_chain",
615 608 kwlist, &PyList_Type, &chunks)) {
616 609 return NULL;
617 610 }
618 611
619 612 chunksLen = PyList_Size(chunks);
620 613 if (!chunksLen) {
621 614 PyErr_SetString(PyExc_ValueError, "empty input chain");
622 615 return NULL;
623 616 }
624 617
625 618 /* The first chunk should not be using a dictionary. We handle it specially. */
626 619 chunk = PyList_GetItem(chunks, 0);
627 620 if (!PyBytes_Check(chunk)) {
628 621 PyErr_SetString(PyExc_ValueError, "chunk 0 must be bytes");
629 622 return NULL;
630 623 }
631 624
632 625 /* We require that all chunks be zstd frames and that they have content size set. */
633 626 PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize);
634 627 zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize);
635 628 if (ZSTD_isError(zresult)) {
636 629 PyErr_SetString(PyExc_ValueError, "chunk 0 is not a valid zstd frame");
637 630 return NULL;
638 631 }
639 632 else if (zresult) {
640 633 PyErr_SetString(PyExc_ValueError, "chunk 0 is too small to contain a zstd frame");
641 634 return NULL;
642 635 }
643 636
644 637 if (0 == frameParams.frameContentSize) {
645 638 PyErr_SetString(PyExc_ValueError, "chunk 0 missing content size in frame");
646 639 return NULL;
647 640 }
648 641
649 642 dctx = ZSTD_createDCtx();
650 643 if (!dctx) {
651 644 PyErr_NoMemory();
652 645 goto finally;
653 646 }
654 647
655 648 buffer1Size = frameParams.frameContentSize;
656 649 buffer1 = PyMem_Malloc(buffer1Size);
657 650 if (!buffer1) {
658 651 goto finally;
659 652 }
660 653
661 654 Py_BEGIN_ALLOW_THREADS
662 655 zresult = ZSTD_decompressDCtx(dctx, buffer1, buffer1Size, chunkData, chunkSize);
663 656 Py_END_ALLOW_THREADS
664 657 if (ZSTD_isError(zresult)) {
665 658 PyErr_Format(ZstdError, "could not decompress chunk 0: %s", ZSTD_getErrorName(zresult));
666 659 goto finally;
667 660 }
668 661
669 662 buffer1ContentSize = zresult;
670 663
671 664 /* Special case of a simple chain. */
672 665 if (1 == chunksLen) {
673 666 result = PyBytes_FromStringAndSize(buffer1, buffer1Size);
674 667 goto finally;
675 668 }
676 669
677 670 /* This should ideally look at next chunk. But this is slightly simpler. */
678 671 buffer2Size = frameParams.frameContentSize;
679 672 buffer2 = PyMem_Malloc(buffer2Size);
680 673 if (!buffer2) {
681 674 goto finally;
682 675 }
683 676
684 677 /* For each subsequent chunk, use the previous fulltext as a content dictionary.
685 678 Our strategy is to have 2 buffers. One holds the previous fulltext (to be
686 679 used as a content dictionary) and the other holds the new fulltext. The
687 680 buffers grow when needed but never decrease in size. This limits the
688 681 memory allocator overhead.
689 682 */
690 683 for (chunkIndex = 1; chunkIndex < chunksLen; chunkIndex++) {
691 684 chunk = PyList_GetItem(chunks, chunkIndex);
692 685 if (!PyBytes_Check(chunk)) {
693 686 PyErr_Format(PyExc_ValueError, "chunk %zd must be bytes", chunkIndex);
694 687 goto finally;
695 688 }
696 689
697 690 PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize);
698 691 zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize);
699 692 if (ZSTD_isError(zresult)) {
700 693 PyErr_Format(PyExc_ValueError, "chunk %zd is not a valid zstd frame", chunkIndex);
701 694 goto finally;
702 695 }
703 696 else if (zresult) {
704 697 PyErr_Format(PyExc_ValueError, "chunk %zd is too small to contain a zstd frame", chunkIndex);
705 698 goto finally;
706 699 }
707 700
708 701 if (0 == frameParams.frameContentSize) {
709 702 PyErr_Format(PyExc_ValueError, "chunk %zd missing content size in frame", chunkIndex);
710 703 goto finally;
711 704 }
712 705
713 706 parity = chunkIndex % 2;
714 707
715 708 /* This could definitely be abstracted to reduce code duplication. */
716 709 if (parity) {
717 710 /* Resize destination buffer to hold larger content. */
718 711 if (buffer2Size < frameParams.frameContentSize) {
719 712 buffer2Size = frameParams.frameContentSize;
720 713 destBuffer = PyMem_Realloc(buffer2, buffer2Size);
721 714 if (!destBuffer) {
722 715 goto finally;
723 716 }
724 717 buffer2 = destBuffer;
725 718 }
726 719
727 720 Py_BEGIN_ALLOW_THREADS
728 721 zresult = ZSTD_decompress_usingDict(dctx, buffer2, buffer2Size,
729 722 chunkData, chunkSize, buffer1, buffer1ContentSize);
730 723 Py_END_ALLOW_THREADS
731 724 if (ZSTD_isError(zresult)) {
732 725 PyErr_Format(ZstdError, "could not decompress chunk %zd: %s",
733 726 chunkIndex, ZSTD_getErrorName(zresult));
734 727 goto finally;
735 728 }
736 729 buffer2ContentSize = zresult;
737 730 }
738 731 else {
739 732 if (buffer1Size < frameParams.frameContentSize) {
740 733 buffer1Size = frameParams.frameContentSize;
741 734 destBuffer = PyMem_Realloc(buffer1, buffer1Size);
742 735 if (!destBuffer) {
743 736 goto finally;
744 737 }
745 738 buffer1 = destBuffer;
746 739 }
747 740
748 741 Py_BEGIN_ALLOW_THREADS
749 742 zresult = ZSTD_decompress_usingDict(dctx, buffer1, buffer1Size,
750 743 chunkData, chunkSize, buffer2, buffer2ContentSize);
751 744 Py_END_ALLOW_THREADS
752 745 if (ZSTD_isError(zresult)) {
753 746 PyErr_Format(ZstdError, "could not decompress chunk %zd: %s",
754 747 chunkIndex, ZSTD_getErrorName(zresult));
755 748 goto finally;
756 749 }
757 750 buffer1ContentSize = zresult;
758 751 }
759 752 }
760 753
761 754 result = PyBytes_FromStringAndSize(parity ? buffer2 : buffer1,
762 755 parity ? buffer2ContentSize : buffer1ContentSize);
763 756
764 757 finally:
765 758 if (buffer2) {
766 759 PyMem_Free(buffer2);
767 760 }
768 761 if (buffer1) {
769 762 PyMem_Free(buffer1);
770 763 }
771 764
772 765 if (dctx) {
773 766 ZSTD_freeDCtx(dctx);
774 767 }
775 768
776 769 return result;
777 770 }
778 771
/* Describes one input zstd frame and the (possibly known) size of its output.
   destSize == 0 means the size must be resolved from the frame header. */
typedef struct {
	void* sourceData;
	size_t sourceSize;
	unsigned long long destSize;
} FramePointer;

/* A set of input frames plus their total compressed size. */
typedef struct {
	FramePointer* frames;
	Py_ssize_t framesSize;
	unsigned long long compressedSize;
} FrameSources;

/* One contiguous output allocation and the segments decompressed into it. */
typedef struct {
	void* dest;
	Py_ssize_t destSize;
	BufferSegment* segments;
	Py_ssize_t segmentsSize;
} DestBuffer;

/* Error conditions a decompression worker can report back to the caller. */
typedef enum {
	WorkerError_none = 0,
	WorkerError_zstd = 1,
	WorkerError_memory = 2,
	WorkerError_sizeMismatch = 3,
	WorkerError_unknownSize = 4,
} WorkerError;

/* Per-worker state: assigned input range, decompression context, output
   buffers, and error reporting fields (workers run without the GIL). */
typedef struct {
	/* Source records and length */
	FramePointer* framePointers;
	/* Which records to process (inclusive range). */
	Py_ssize_t startOffset;
	Py_ssize_t endOffset;
	unsigned long long totalSourceSize;

	/* Decompression state and settings. */
	ZSTD_DCtx* dctx;
	ZSTD_DDict* ddict;
	int requireOutputSizes;

	/* Output storage. */
	DestBuffer* destBuffers;
	Py_ssize_t destCount;

	/* Item that error occurred on. */
	Py_ssize_t errorOffset;
	/* If an error occurred. */
	WorkerError error;
	/* result from zstd decompression operation */
	size_t zresult;
} WorkerState;
823
/*
 * Worker routine: decompress frames framePointers[startOffset..endOffset]
 * (inclusive) into one or more malloc()'d DestBuffers recorded on ``state``.
 *
 * Runs with the GIL released, so it must not touch Python APIs. Failures are
 * reported via state->error / state->errorOffset / state->zresult; on error
 * the caller frees any partially-populated destBuffers.
 */
static void decompress_worker(WorkerState* state) {
	size_t allocationSize;
	DestBuffer* destBuffer;
	Py_ssize_t frameIndex;
	Py_ssize_t localOffset = 0;
	Py_ssize_t currentBufferStartIndex = state->startOffset;
	Py_ssize_t remainingItems = state->endOffset - state->startOffset + 1;
	void* tmpBuf;
	Py_ssize_t destOffset = 0;
	FramePointer* framePointers = state->framePointers;
	size_t zresult;
	/* NOTE(review): totalOutputSize is accumulated below but never read. */
	unsigned long long totalOutputSize = 0;

	assert(NULL == state->destBuffers);
	assert(0 == state->destCount);
	assert(state->endOffset - state->startOffset >= 0);

	/*
	 * We need to allocate a buffer to hold decompressed data. How we do this
	 * depends on what we know about the output. The following scenarios are
	 * possible:
	 *
	 * 1. All structs defining frames declare the output size.
	 * 2. The decompressed size is embedded within the zstd frame.
	 * 3. The decompressed size is not stored anywhere.
	 *
	 * For now, we only support #1 and #2.
	 */

	/* Resolve output segments. */
	for (frameIndex = state->startOffset; frameIndex <= state->endOffset; frameIndex++) {
		FramePointer* fp = &framePointers[frameIndex];

		if (0 == fp->destSize) {
			/* Caller did not supply a size; try the frame header. */
			fp->destSize = ZSTD_getDecompressedSize(fp->sourceData, fp->sourceSize);
			if (0 == fp->destSize && state->requireOutputSizes) {
				state->error = WorkerError_unknownSize;
				state->errorOffset = frameIndex;
				return;
			}
		}

		totalOutputSize += fp->destSize;
	}

	state->destBuffers = calloc(1, sizeof(DestBuffer));
	if (NULL == state->destBuffers) {
		state->error = WorkerError_memory;
		return;
	}

	state->destCount = 1;

	destBuffer = &state->destBuffers[state->destCount - 1];

	assert(framePointers[state->startOffset].destSize > 0); /* For now. */

	/* Size the first buffer from the total compressed input rounded up to a
	   power of 2, but never smaller than the first frame's output size. */
	allocationSize = roundpow2(state->totalSourceSize);

	if (framePointers[state->startOffset].destSize > allocationSize) {
		allocationSize = roundpow2(framePointers[state->startOffset].destSize);
	}

	destBuffer->dest = malloc(allocationSize);
	if (NULL == destBuffer->dest) {
		state->error = WorkerError_memory;
		return;
	}

	destBuffer->destSize = allocationSize;

	destBuffer->segments = calloc(remainingItems, sizeof(BufferSegment));
	if (NULL == destBuffer->segments) {
		/* Caller will free state->dest as part of cleanup. */
		state->error = WorkerError_memory;
		return;
	}

	destBuffer->segmentsSize = remainingItems;

	for (frameIndex = state->startOffset; frameIndex <= state->endOffset; frameIndex++) {
		const void* source = framePointers[frameIndex].sourceData;
		const size_t sourceSize = framePointers[frameIndex].sourceSize;
		void* dest;
		const size_t decompressedSize = framePointers[frameIndex].destSize;
		size_t destAvailable = destBuffer->destSize - destOffset;

		assert(decompressedSize > 0); /* For now. */

		/*
		 * Not enough space in current buffer. Finish current before and allocate and
		 * switch to a new one.
		 */
		if (decompressedSize > destAvailable) {
			/*
			 * Shrinking the destination buffer is optional. But it should be cheap,
			 * so we just do it.
			 */
			if (destAvailable) {
				tmpBuf = realloc(destBuffer->dest, destOffset);
				if (NULL == tmpBuf) {
					state->error = WorkerError_memory;
					return;
				}

				destBuffer->dest = tmpBuf;
				destBuffer->destSize = destOffset;
			}

			/* Truncate segments buffer to the entries actually used. */
			tmpBuf = realloc(destBuffer->segments,
				(frameIndex - currentBufferStartIndex) * sizeof(BufferSegment));
			if (NULL == tmpBuf) {
				state->error = WorkerError_memory;
				return;
			}

			destBuffer->segments = tmpBuf;
			destBuffer->segmentsSize = frameIndex - currentBufferStartIndex;

			/* Grow space for new DestBuffer. */
			tmpBuf = realloc(state->destBuffers, (state->destCount + 1) * sizeof(DestBuffer));
			if (NULL == tmpBuf) {
				state->error = WorkerError_memory;
				return;
			}

			state->destBuffers = tmpBuf;
			state->destCount++;

			destBuffer = &state->destBuffers[state->destCount - 1];

			/* Don't take any chances with non-NULL pointers. */
			memset(destBuffer, 0, sizeof(DestBuffer));

			allocationSize = roundpow2(state->totalSourceSize);

			if (decompressedSize > allocationSize) {
				allocationSize = roundpow2(decompressedSize);
			}

			destBuffer->dest = malloc(allocationSize);
			if (NULL == destBuffer->dest) {
				state->error = WorkerError_memory;
				return;
			}

			destBuffer->destSize = allocationSize;
			destAvailable = allocationSize;
			destOffset = 0;
			localOffset = 0;

			destBuffer->segments = calloc(remainingItems, sizeof(BufferSegment));
			if (NULL == destBuffer->segments) {
				state->error = WorkerError_memory;
				return;
			}

			destBuffer->segmentsSize = remainingItems;
			currentBufferStartIndex = frameIndex;
		}

		dest = (char*)destBuffer->dest + destOffset;

		if (state->ddict) {
			zresult = ZSTD_decompress_usingDDict(state->dctx, dest, decompressedSize,
				source, sourceSize, state->ddict);
		}
		else {
			zresult = ZSTD_decompressDCtx(state->dctx, dest, decompressedSize,
				source, sourceSize);
		}

		if (ZSTD_isError(zresult)) {
			state->error = WorkerError_zstd;
			state->zresult = zresult;
			state->errorOffset = frameIndex;
			return;
		}
		else if (zresult != decompressedSize) {
			/* Frame produced a different byte count than promised. */
			state->error = WorkerError_sizeMismatch;
			state->zresult = zresult;
			state->errorOffset = frameIndex;
			return;
		}

		destBuffer->segments[localOffset].offset = destOffset;
		destBuffer->segments[localOffset].length = decompressedSize;
		destOffset += zresult;
		localOffset++;
		remainingItems--;
	}

	/* Give back any unused tail of the final buffer. */
	if (destBuffer->destSize > destOffset) {
		tmpBuf = realloc(destBuffer->dest, destOffset);
		if (NULL == tmpBuf) {
			state->error = WorkerError_memory;
			return;
		}

		destBuffer->dest = tmpBuf;
		destBuffer->destSize = destOffset;
	}
}
1028
/*
 * Decompress a set of frames, optionally across multiple threads, returning a
 * new BufferWithSegmentsCollection holding the decompressed data.
 *
 * ``threadCount`` must be >= 1 (caller normalizes; it is further clamped to
 * the number of frames). Returns NULL with a Python exception set on error.
 */
ZstdBufferWithSegmentsCollection* decompress_from_framesources(ZstdDecompressor* decompressor, FrameSources* frames,
	unsigned int threadCount) {
	void* dictData = NULL;
	size_t dictSize = 0;
	Py_ssize_t i = 0;
	int errored = 0;
	Py_ssize_t segmentsCount;
	ZstdBufferWithSegments* bws = NULL;
	PyObject* resultArg = NULL;
	Py_ssize_t resultIndex;
	ZstdBufferWithSegmentsCollection* result = NULL;
	FramePointer* framePointers = frames->frames;
	unsigned long long workerBytes = 0;
	int currentThread = 0;
	Py_ssize_t workerStartOffset = 0;
	POOL_ctx* pool = NULL;
	WorkerState* workerStates = NULL;
	unsigned long long bytesPerWorker;

	/* Caller should normalize 0 and negative values to 1 or larger. */
	assert(threadCount >= 1);

	/* More threads than inputs makes no sense under any conditions. */
	threadCount = frames->framesSize < threadCount ? (unsigned int)frames->framesSize
		: threadCount;

	/* TODO lower thread count if input size is too small and threads would just
	   add overhead. */

	if (decompressor->dict) {
		dictData = decompressor->dict->dictData;
		dictSize = decompressor->dict->dictSize;
	}

	/* Lazily create the digested dictionary shared by all workers. It is
	   created by reference, so the dict's raw data must outlive it. */
	if (dictData && !decompressor->ddict) {
		Py_BEGIN_ALLOW_THREADS
		decompressor->ddict = ZSTD_createDDict_byReference(dictData, dictSize);
		Py_END_ALLOW_THREADS

		if (!decompressor->ddict) {
			PyErr_SetString(ZstdError, "could not create decompression dict");
			return NULL;
		}
	}

	/* If threadCount==1, we don't start a thread pool. But we do leverage the
	   same API for dispatching work. */
	workerStates = PyMem_Malloc(threadCount * sizeof(WorkerState));
	if (NULL == workerStates) {
		PyErr_NoMemory();
		goto finally;
	}

	memset(workerStates, 0, threadCount * sizeof(WorkerState));

	if (threadCount > 1) {
		pool = POOL_create(threadCount, 1);
		if (NULL == pool) {
			PyErr_SetString(ZstdError, "could not initialize zstd thread pool");
			goto finally;
		}
	}

	bytesPerWorker = frames->compressedSize / threadCount;

	/* Each worker gets its own DCtx seeded from the decompressor's, but all
	   share the (immutable) ddict and frame pointer array. */
	for (i = 0; i < threadCount; i++) {
		workerStates[i].dctx = ZSTD_createDCtx();
		if (NULL == workerStates[i].dctx) {
			PyErr_NoMemory();
			goto finally;
		}

		ZSTD_copyDCtx(workerStates[i].dctx, decompressor->dctx);

		workerStates[i].ddict = decompressor->ddict;
		workerStates[i].framePointers = framePointers;
		workerStates[i].requireOutputSizes = 1;
	}

	Py_BEGIN_ALLOW_THREADS
	/* There are many ways to split work among workers.

	   For now, we take a simple approach of splitting work so each worker
	   gets roughly the same number of input bytes. This will result in more
	   starvation than running N>threadCount jobs. But it avoids complications
	   around state tracking, which could involve extra locking.
	*/
	for (i = 0; i < frames->framesSize; i++) {
		workerBytes += frames->frames[i].sourceSize;

		/*
		 * The last worker/thread needs to handle all remaining work. Don't
		 * trigger it prematurely. Defer to the block outside of the loop.
		 * (But still process this loop so workerBytes is correct.)
		 */
		if (currentThread == threadCount - 1) {
			continue;
		}

		if (workerBytes >= bytesPerWorker) {
			workerStates[currentThread].startOffset = workerStartOffset;
			workerStates[currentThread].endOffset = i;
			workerStates[currentThread].totalSourceSize = workerBytes;

			if (threadCount > 1) {
				POOL_add(pool, (POOL_function)decompress_worker, &workerStates[currentThread]);
			}
			else {
				decompress_worker(&workerStates[currentThread]);
			}
			currentThread++;
			workerStartOffset = i + 1;
			workerBytes = 0;
		}
	}

	/* Dispatch the remainder (everything since the last cut) to the last worker. */
	if (workerBytes) {
		workerStates[currentThread].startOffset = workerStartOffset;
		workerStates[currentThread].endOffset = frames->framesSize - 1;
		workerStates[currentThread].totalSourceSize = workerBytes;

		if (threadCount > 1) {
			POOL_add(pool, (POOL_function)decompress_worker, &workerStates[currentThread]);
		}
		else {
			decompress_worker(&workerStates[currentThread]);
		}
	}

	/* POOL_free() blocks until all queued jobs have completed. */
	if (threadCount > 1) {
		POOL_free(pool);
		pool = NULL;
	}
	Py_END_ALLOW_THREADS

	/* Translate the first worker error (if any) into a Python exception. */
	for (i = 0; i < threadCount; i++) {
		switch (workerStates[i].error) {
		case WorkerError_none:
			break;

		case WorkerError_zstd:
			PyErr_Format(ZstdError, "error decompressing item %zd: %s",
				workerStates[i].errorOffset, ZSTD_getErrorName(workerStates[i].zresult));
			errored = 1;
			break;

		case WorkerError_memory:
			PyErr_NoMemory();
			errored = 1;
			break;

		case WorkerError_sizeMismatch:
			PyErr_Format(ZstdError, "error decompressing item %zd: decompressed %zu bytes; expected %llu",
				workerStates[i].errorOffset, workerStates[i].zresult,
				framePointers[workerStates[i].errorOffset].destSize);
			errored = 1;
			break;

		case WorkerError_unknownSize:
			PyErr_Format(PyExc_ValueError, "could not determine decompressed size of item %zd",
				workerStates[i].errorOffset);
			errored = 1;
			break;

		default:
			PyErr_Format(ZstdError, "unhandled error type: %d; this is a bug",
				workerStates[i].error);
			errored = 1;
			break;
		}

		if (errored) {
			break;
		}
	}

	if (errored) {
		goto finally;
	}

	segmentsCount = 0;
	for (i = 0; i < threadCount; i++) {
		segmentsCount += workerStates[i].destCount;
	}

	resultArg = PyTuple_New(segmentsCount);
	if (NULL == resultArg) {
		goto finally;
	}

	resultIndex = 0;

	for (i = 0; i < threadCount; i++) {
		Py_ssize_t bufferIndex;
		WorkerState* state = &workerStates[i];

		for (bufferIndex = 0; bufferIndex < state->destCount; bufferIndex++) {
			DestBuffer* destBuffer = &state->destBuffers[bufferIndex];

			bws = BufferWithSegments_FromMemory(destBuffer->dest, destBuffer->destSize,
				destBuffer->segments, destBuffer->segmentsSize);
			if (NULL == bws) {
				goto finally;
			}

			/*
			 * Memory for buffer and segments was allocated using malloc() in worker
			 * and the memory is transferred to the BufferWithSegments instance. So
			 * tell instance to use free() and NULL the reference in the state struct
			 * so it isn't freed below.
			 */
			bws->useFree = 1;
			destBuffer->dest = NULL;
			destBuffer->segments = NULL;

			PyTuple_SET_ITEM(resultArg, resultIndex++, (PyObject*)bws);
		}
	}

	result = (ZstdBufferWithSegmentsCollection*)PyObject_CallObject(
		(PyObject*)&ZstdBufferWithSegmentsCollectionType, resultArg);

finally:
	Py_CLEAR(resultArg);

	if (workerStates) {
		for (i = 0; i < threadCount; i++) {
			Py_ssize_t bufferIndex;
			WorkerState* state = &workerStates[i];

			if (state->dctx) {
				ZSTD_freeDCtx(state->dctx);
			}

			for (bufferIndex = 0; bufferIndex < state->destCount; bufferIndex++) {
				if (state->destBuffers) {
					/*
					 * Will be NULL if memory transferred to a BufferWithSegments.
					 * Otherwise it is left over after an error occurred.
					 */
					free(state->destBuffers[bufferIndex].dest);
					free(state->destBuffers[bufferIndex].segments);
				}
			}

			free(state->destBuffers);
		}

		PyMem_Free(workerStates);
	}

	/* pool is NULL on the success path; POOL_free(NULL) is a no-op here. */
	POOL_free(pool);

	return result;
}
1284
/* Docstring for Decompressor.multi_decompress_to_buffer(). */
PyDoc_STRVAR(Decompressor_multi_decompress_to_buffer__doc__,
"Decompress multiple frames to output buffers\n"
"\n"
"Receives a ``BufferWithSegments``, a ``BufferWithSegmentsCollection`` or a\n"
"list of bytes-like objects. Each item in the passed collection should be a\n"
"compressed zstd frame.\n"
"\n"
"Unless ``decompressed_sizes`` is specified, the content size *must* be\n"
"written into the zstd frame header. If ``decompressed_sizes`` is specified,\n"
"it is an object conforming to the buffer protocol that represents an array\n"
"of 64-bit unsigned integers in the machine's native format. Specifying\n"
"``decompressed_sizes`` avoids a pre-scan of each frame to determine its\n"
"output size.\n"
"\n"
"Returns a ``BufferWithSegmentsCollection`` containing the decompressed\n"
"data. All decompressed data is allocated in a single memory buffer. The\n"
"``BufferWithSegments`` instance tracks which objects are at which offsets\n"
"and their respective lengths.\n"
"\n"
"The ``threads`` argument controls how many threads to use for operations.\n"
"Negative values will use the same number of threads as logical CPUs on the\n"
"machine.\n"
);
1308
1309 static ZstdBufferWithSegmentsCollection* Decompressor_multi_decompress_to_buffer(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
1310 static char* kwlist[] = {
1311 "frames",
1312 "decompressed_sizes",
1313 "threads",
1314 NULL
1315 };
1316
1317 PyObject* frames;
1318 Py_buffer frameSizes;
1319 int threads = 0;
1320 Py_ssize_t frameCount;
1321 Py_buffer* frameBuffers = NULL;
1322 FramePointer* framePointers = NULL;
1323 unsigned long long* frameSizesP = NULL;
1324 unsigned long long totalInputSize = 0;
1325 FrameSources frameSources;
1326 ZstdBufferWithSegmentsCollection* result = NULL;
1327 Py_ssize_t i;
1328
1329 memset(&frameSizes, 0, sizeof(frameSizes));
1330
1331 #if PY_MAJOR_VERSION >= 3
1332 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|y*i:multi_decompress_to_buffer",
1333 #else
1334 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s*i:multi_decompress_to_buffer",
1335 #endif
1336 kwlist, &frames, &frameSizes, &threads)) {
1337 return NULL;
1338 }
1339
1340 if (frameSizes.buf) {
1341 if (!PyBuffer_IsContiguous(&frameSizes, 'C') || frameSizes.ndim > 1) {
1342 PyErr_SetString(PyExc_ValueError, "decompressed_sizes buffer should be contiguous and have a single dimension");
1343 goto finally;
1344 }
1345
1346 frameSizesP = (unsigned long long*)frameSizes.buf;
1347 }
1348
1349 if (threads < 0) {
1350 threads = cpu_count();
1351 }
1352
1353 if (threads < 2) {
1354 threads = 1;
1355 }
1356
1357 if (PyObject_TypeCheck(frames, &ZstdBufferWithSegmentsType)) {
1358 ZstdBufferWithSegments* buffer = (ZstdBufferWithSegments*)frames;
1359 frameCount = buffer->segmentCount;
1360
1361 if (frameSizes.buf && frameSizes.len != frameCount * (Py_ssize_t)sizeof(unsigned long long)) {
1362 PyErr_Format(PyExc_ValueError, "decompressed_sizes size mismatch; expected %zd, got %zd",
1363 frameCount * sizeof(unsigned long long), frameSizes.len);
1364 goto finally;
1365 }
1366
1367 framePointers = PyMem_Malloc(frameCount * sizeof(FramePointer));
1368 if (!framePointers) {
1369 PyErr_NoMemory();
1370 goto finally;
1371 }
1372
1373 for (i = 0; i < frameCount; i++) {
1374 void* sourceData;
1375 unsigned long long sourceSize;
1376 unsigned long long decompressedSize = 0;
1377
1378 if (buffer->segments[i].offset + buffer->segments[i].length > buffer->dataSize) {
1379 PyErr_Format(PyExc_ValueError, "item %zd has offset outside memory area", i);
1380 goto finally;
1381 }
1382
1383 sourceData = (char*)buffer->data + buffer->segments[i].offset;
1384 sourceSize = buffer->segments[i].length;
1385 totalInputSize += sourceSize;
1386
1387 if (frameSizesP) {
1388 decompressedSize = frameSizesP[i];
1389 }
1390
1391 framePointers[i].sourceData = sourceData;
1392 framePointers[i].sourceSize = sourceSize;
1393 framePointers[i].destSize = decompressedSize;
1394 }
1395 }
1396 else if (PyObject_TypeCheck(frames, &ZstdBufferWithSegmentsCollectionType)) {
1397 Py_ssize_t offset = 0;
1398 ZstdBufferWithSegments* buffer;
1399 ZstdBufferWithSegmentsCollection* collection = (ZstdBufferWithSegmentsCollection*)frames;
1400
1401 frameCount = BufferWithSegmentsCollection_length(collection);
1402
1403 if (frameSizes.buf && frameSizes.len != frameCount) {
1404 PyErr_Format(PyExc_ValueError,
1405 "decompressed_sizes size mismatch; expected %zd; got %zd",
1406 frameCount * sizeof(unsigned long long), frameSizes.len);
1407 goto finally;
1408 }
1409
1410 framePointers = PyMem_Malloc(frameCount * sizeof(FramePointer));
1411 if (NULL == framePointers) {
1412 PyErr_NoMemory();
1413 goto finally;
1414 }
1415
1416 /* Iterate the data structure directly because it is faster. */
1417 for (i = 0; i < collection->bufferCount; i++) {
1418 Py_ssize_t segmentIndex;
1419 buffer = collection->buffers[i];
1420
1421 for (segmentIndex = 0; segmentIndex < buffer->segmentCount; segmentIndex++) {
1422 if (buffer->segments[segmentIndex].offset + buffer->segments[segmentIndex].length > buffer->dataSize) {
1423 PyErr_Format(PyExc_ValueError, "item %zd has offset outside memory area",
1424 offset);
1425 goto finally;
1426 }
1427
1428 totalInputSize += buffer->segments[segmentIndex].length;
1429
1430 framePointers[offset].sourceData = (char*)buffer->data + buffer->segments[segmentIndex].offset;
1431 framePointers[offset].sourceSize = buffer->segments[segmentIndex].length;
1432 framePointers[offset].destSize = frameSizesP ? frameSizesP[offset] : 0;
1433
1434 offset++;
1435 }
1436 }
1437 }
1438 else if (PyList_Check(frames)) {
1439 frameCount = PyList_GET_SIZE(frames);
1440
1441 if (frameSizes.buf && frameSizes.len != frameCount * (Py_ssize_t)sizeof(unsigned long long)) {
1442 PyErr_Format(PyExc_ValueError, "decompressed_sizes size mismatch; expected %zd, got %zd",
1443 frameCount * sizeof(unsigned long long), frameSizes.len);
1444 goto finally;
1445 }
1446
1447 framePointers = PyMem_Malloc(frameCount * sizeof(FramePointer));
1448 if (!framePointers) {
1449 PyErr_NoMemory();
1450 goto finally;
1451 }
1452
1453 /*
1454 * It is not clear whether Py_buffer.buf is still valid after
1455 * PyBuffer_Release. So, we hold a reference to all Py_buffer instances
1456 * for the duration of the operation.
1457 */
1458 frameBuffers = PyMem_Malloc(frameCount * sizeof(Py_buffer));
1459 if (NULL == frameBuffers) {
1460 PyErr_NoMemory();
1461 goto finally;
1462 }
1463
1464 memset(frameBuffers, 0, frameCount * sizeof(Py_buffer));
1465
1466 /* Do a pass to assemble info about our input buffers and output sizes. */
1467 for (i = 0; i < frameCount; i++) {
1468 if (0 != PyObject_GetBuffer(PyList_GET_ITEM(frames, i),
1469 &frameBuffers[i], PyBUF_CONTIG_RO)) {
1470 PyErr_Clear();
1471 PyErr_Format(PyExc_TypeError, "item %zd not a bytes like object", i);
1472 goto finally;
1473 }
1474
1475 totalInputSize += frameBuffers[i].len;
1476
1477 framePointers[i].sourceData = frameBuffers[i].buf;
1478 framePointers[i].sourceSize = frameBuffers[i].len;
1479 framePointers[i].destSize = frameSizesP ? frameSizesP[i] : 0;
1480 }
1481 }
1482 else {
1483 PyErr_SetString(PyExc_TypeError, "argument must be list or BufferWithSegments");
1484 goto finally;
1485 }
1486
1487 /* We now have an array with info about our inputs and outputs. Feed it into
1488 our generic decompression function. */
1489 frameSources.frames = framePointers;
1490 frameSources.framesSize = frameCount;
1491 frameSources.compressedSize = totalInputSize;
1492
1493 result = decompress_from_framesources(self, &frameSources, threads);
1494
1495 finally:
1496 if (frameSizes.buf) {
1497 PyBuffer_Release(&frameSizes);
1498 }
1499 PyMem_Free(framePointers);
1500
1501 if (frameBuffers) {
1502 for (i = 0; i < frameCount; i++) {
1503 PyBuffer_Release(&frameBuffers[i]);
1504 }
1505
1506 PyMem_Free(frameBuffers);
1507 }
1508
1509 return result;
1510 }
1511
/* Method table for zstd.ZstdDecompressor. */
static PyMethodDef Decompressor_methods[] = {
	{ "copy_stream", (PyCFunction)Decompressor_copy_stream, METH_VARARGS | METH_KEYWORDS,
	Decompressor_copy_stream__doc__ },
	{ "decompress", (PyCFunction)Decompressor_decompress, METH_VARARGS | METH_KEYWORDS,
	Decompressor_decompress__doc__ },
	{ "decompressobj", (PyCFunction)Decompressor_decompressobj, METH_NOARGS,
	Decompressor_decompressobj__doc__ },
	{ "read_from", (PyCFunction)Decompressor_read_from, METH_VARARGS | METH_KEYWORDS,
	Decompressor_read_from__doc__ },
	{ "write_to", (PyCFunction)Decompressor_write_to, METH_VARARGS | METH_KEYWORDS,
	Decompressor_write_to__doc__ },
	{ "decompress_content_dict_chain", (PyCFunction)Decompressor_decompress_content_dict_chain,
	METH_VARARGS | METH_KEYWORDS, Decompressor_decompress_content_dict_chain__doc__ },
	{ "multi_decompress_to_buffer", (PyCFunction)Decompressor_multi_decompress_to_buffer,
	METH_VARARGS | METH_KEYWORDS, Decompressor_multi_decompress_to_buffer__doc__ },
	{ NULL, NULL } /* sentinel */
};
794 1529
/* Type object for zstd.ZstdDecompressor. Slot order is positional and must
   match PyTypeObject's layout; do not reorder. */
PyTypeObject ZstdDecompressorType = {
	PyVarObject_HEAD_INIT(NULL, 0)
	"zstd.ZstdDecompressor",        /* tp_name */
	sizeof(ZstdDecompressor),       /* tp_basicsize */
	0,                              /* tp_itemsize */
	(destructor)Decompressor_dealloc, /* tp_dealloc */
	0,                              /* tp_print */
	0,                              /* tp_getattr */
	0,                              /* tp_setattr */
	0,                              /* tp_compare */
	0,                              /* tp_repr */
	0,                              /* tp_as_number */
	0,                              /* tp_as_sequence */
	0,                              /* tp_as_mapping */
	0,                              /* tp_hash */
	0,                              /* tp_call */
	0,                              /* tp_str */
	0,                              /* tp_getattro */
	0,                              /* tp_setattro */
	0,                              /* tp_as_buffer */
	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
	Decompressor__doc__,            /* tp_doc */
	0,                              /* tp_traverse */
	0,                              /* tp_clear */
	0,                              /* tp_richcompare */
	0,                              /* tp_weaklistoffset */
	0,                              /* tp_iter */
	0,                              /* tp_iternext */
	Decompressor_methods,           /* tp_methods */
	0,                              /* tp_members */
	0,                              /* tp_getset */
	0,                              /* tp_base */
	0,                              /* tp_dict */
	0,                              /* tp_descr_get */
	0,                              /* tp_descr_set */
	0,                              /* tp_dictoffset */
	(initproc)Decompressor_init,    /* tp_init */
	0,                              /* tp_alloc */
	PyType_GenericNew,              /* tp_new */
};
835 1570
836 1571 void decompressor_module_init(PyObject* mod) {
837 1572 Py_TYPE(&ZstdDecompressorType) = &PyType_Type;
838 1573 if (PyType_Ready(&ZstdDecompressorType) < 0) {
839 1574 return;
840 1575 }
841 1576
842 1577 Py_INCREF((PyObject*)&ZstdDecompressorType);
843 1578 PyModule_AddObject(mod, "ZstdDecompressor",
844 1579 (PyObject*)&ZstdDecompressorType);
845 1580 }
@@ -1,254 +1,251
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 #define min(a, b) (((a) < (b)) ? (a) : (b))
12 12
13 13 extern PyObject* ZstdError;
14 14
15 15 PyDoc_STRVAR(ZstdDecompressorIterator__doc__,
16 16 "Represents an iterator of decompressed data.\n"
17 17 );
18 18
19 19 static void ZstdDecompressorIterator_dealloc(ZstdDecompressorIterator* self) {
20 20 Py_XDECREF(self->decompressor);
21 21 Py_XDECREF(self->reader);
22 22
23 23 if (self->buffer) {
24 24 PyBuffer_Release(self->buffer);
25 25 PyMem_FREE(self->buffer);
26 26 self->buffer = NULL;
27 27 }
28 28
29 if (self->dstream) {
30 ZSTD_freeDStream(self->dstream);
31 self->dstream = NULL;
32 }
33
34 29 if (self->input.src) {
35 30 PyMem_Free((void*)self->input.src);
36 31 self->input.src = NULL;
37 32 }
38 33
39 34 PyObject_Del(self);
40 35 }
41 36
42 37 static PyObject* ZstdDecompressorIterator_iter(PyObject* self) {
43 38 Py_INCREF(self);
44 39 return self;
45 40 }
46 41
47 42 static DecompressorIteratorResult read_decompressor_iterator(ZstdDecompressorIterator* self) {
48 43 size_t zresult;
49 44 PyObject* chunk;
50 45 DecompressorIteratorResult result;
51 46 size_t oldInputPos = self->input.pos;
52 47
48 assert(self->decompressor->dstream);
49
53 50 result.chunk = NULL;
54 51
55 52 chunk = PyBytes_FromStringAndSize(NULL, self->outSize);
56 53 if (!chunk) {
57 54 result.errored = 1;
58 55 return result;
59 56 }
60 57
61 58 self->output.dst = PyBytes_AsString(chunk);
62 59 self->output.size = self->outSize;
63 60 self->output.pos = 0;
64 61
65 62 Py_BEGIN_ALLOW_THREADS
66 zresult = ZSTD_decompressStream(self->dstream, &self->output, &self->input);
63 zresult = ZSTD_decompressStream(self->decompressor->dstream, &self->output, &self->input);
67 64 Py_END_ALLOW_THREADS
68 65
69 66 /* We're done with the pointer. Nullify to prevent anyone from getting a
70 67 handle on a Python object. */
71 68 self->output.dst = NULL;
72 69
73 70 if (ZSTD_isError(zresult)) {
74 71 Py_DECREF(chunk);
75 72 PyErr_Format(ZstdError, "zstd decompress error: %s",
76 73 ZSTD_getErrorName(zresult));
77 74 result.errored = 1;
78 75 return result;
79 76 }
80 77
81 78 self->readCount += self->input.pos - oldInputPos;
82 79
83 80 /* Frame is fully decoded. Input exhausted and output sitting in buffer. */
84 81 if (0 == zresult) {
85 82 self->finishedInput = 1;
86 83 self->finishedOutput = 1;
87 84 }
88 85
89 86 /* If it produced output data, return it. */
90 87 if (self->output.pos) {
91 88 if (self->output.pos < self->outSize) {
92 89 if (_PyBytes_Resize(&chunk, self->output.pos)) {
93 90 result.errored = 1;
94 91 return result;
95 92 }
96 93 }
97 94 }
98 95 else {
99 96 Py_DECREF(chunk);
100 97 chunk = NULL;
101 98 }
102 99
103 100 result.errored = 0;
104 101 result.chunk = chunk;
105 102
106 103 return result;
107 104 }
108 105
109 106 static PyObject* ZstdDecompressorIterator_iternext(ZstdDecompressorIterator* self) {
110 107 PyObject* readResult = NULL;
111 108 char* readBuffer;
112 109 Py_ssize_t readSize;
113 110 Py_ssize_t bufferRemaining;
114 111 DecompressorIteratorResult result;
115 112
116 113 if (self->finishedOutput) {
117 114 PyErr_SetString(PyExc_StopIteration, "output flushed");
118 115 return NULL;
119 116 }
120 117
121 118 /* If we have data left in the input, consume it. */
122 119 if (self->input.pos < self->input.size) {
123 120 result = read_decompressor_iterator(self);
124 121 if (result.chunk || result.errored) {
125 122 return result.chunk;
126 123 }
127 124
128 125 /* Else fall through to get more data from input. */
129 126 }
130 127
131 128 read_from_source:
132 129
133 130 if (!self->finishedInput) {
134 131 if (self->reader) {
135 132 readResult = PyObject_CallMethod(self->reader, "read", "I", self->inSize);
136 133 if (!readResult) {
137 134 return NULL;
138 135 }
139 136
140 137 PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize);
141 138 }
142 139 else {
143 140 assert(self->buffer && self->buffer->buf);
144 141
145 142 /* Only support contiguous C arrays for now */
146 143 assert(self->buffer->strides == NULL && self->buffer->suboffsets == NULL);
147 144 assert(self->buffer->itemsize == 1);
148 145
149 146 /* TODO avoid memcpy() below */
150 147 readBuffer = (char *)self->buffer->buf + self->bufferOffset;
151 148 bufferRemaining = self->buffer->len - self->bufferOffset;
152 149 readSize = min(bufferRemaining, (Py_ssize_t)self->inSize);
153 150 self->bufferOffset += readSize;
154 151 }
155 152
156 153 if (readSize) {
157 154 if (!self->readCount && self->skipBytes) {
158 155 assert(self->skipBytes < self->inSize);
159 156 if ((Py_ssize_t)self->skipBytes >= readSize) {
160 157 PyErr_SetString(PyExc_ValueError,
161 158 "skip_bytes larger than first input chunk; "
162 159 "this scenario is currently unsupported");
163 Py_DecRef(readResult);
160 Py_XDECREF(readResult);
164 161 return NULL;
165 162 }
166 163
167 164 readBuffer = readBuffer + self->skipBytes;
168 165 readSize -= self->skipBytes;
169 166 }
170 167
171 168 /* Copy input into previously allocated buffer because it can live longer
172 169 than a single function call and we don't want to keep a ref to a Python
173 170 object around. This could be changed... */
174 171 memcpy((void*)self->input.src, readBuffer, readSize);
175 172 self->input.size = readSize;
176 173 self->input.pos = 0;
177 174 }
178 175 /* No bytes on first read must mean an empty input stream. */
179 176 else if (!self->readCount) {
180 177 self->finishedInput = 1;
181 178 self->finishedOutput = 1;
182 Py_DecRef(readResult);
179 Py_XDECREF(readResult);
183 180 PyErr_SetString(PyExc_StopIteration, "empty input");
184 181 return NULL;
185 182 }
186 183 else {
187 184 self->finishedInput = 1;
188 185 }
189 186
190 187 /* We've copied the data managed by memory. Discard the Python object. */
191 Py_DecRef(readResult);
188 Py_XDECREF(readResult);
192 189 }
193 190
194 191 result = read_decompressor_iterator(self);
195 192 if (result.errored || result.chunk) {
196 193 return result.chunk;
197 194 }
198 195
199 196 /* No new output data. Try again unless we know there is no more data. */
200 197 if (!self->finishedInput) {
201 198 goto read_from_source;
202 199 }
203 200
204 201 PyErr_SetString(PyExc_StopIteration, "input exhausted");
205 202 return NULL;
206 203 }
207 204
208 205 PyTypeObject ZstdDecompressorIteratorType = {
209 206 PyVarObject_HEAD_INIT(NULL, 0)
210 207 "zstd.ZstdDecompressorIterator", /* tp_name */
211 208 sizeof(ZstdDecompressorIterator), /* tp_basicsize */
212 209 0, /* tp_itemsize */
213 210 (destructor)ZstdDecompressorIterator_dealloc, /* tp_dealloc */
214 211 0, /* tp_print */
215 212 0, /* tp_getattr */
216 213 0, /* tp_setattr */
217 214 0, /* tp_compare */
218 215 0, /* tp_repr */
219 216 0, /* tp_as_number */
220 217 0, /* tp_as_sequence */
221 218 0, /* tp_as_mapping */
222 219 0, /* tp_hash */
223 220 0, /* tp_call */
224 221 0, /* tp_str */
225 222 0, /* tp_getattro */
226 223 0, /* tp_setattro */
227 224 0, /* tp_as_buffer */
228 225 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
229 226 ZstdDecompressorIterator__doc__, /* tp_doc */
230 227 0, /* tp_traverse */
231 228 0, /* tp_clear */
232 229 0, /* tp_richcompare */
233 230 0, /* tp_weaklistoffset */
234 231 ZstdDecompressorIterator_iter, /* tp_iter */
235 232 (iternextfunc)ZstdDecompressorIterator_iternext, /* tp_iternext */
236 233 0, /* tp_methods */
237 234 0, /* tp_members */
238 235 0, /* tp_getset */
239 236 0, /* tp_base */
240 237 0, /* tp_dict */
241 238 0, /* tp_descr_get */
242 239 0, /* tp_descr_set */
243 240 0, /* tp_dictoffset */
244 241 0, /* tp_init */
245 242 0, /* tp_alloc */
246 243 PyType_GenericNew, /* tp_new */
247 244 };
248 245
249 246 void decompressoriterator_module_init(PyObject* mod) {
250 247 Py_TYPE(&ZstdDecompressorIteratorType) = &PyType_Type;
251 248 if (PyType_Ready(&ZstdDecompressorIteratorType) < 0) {
252 249 return;
253 250 }
254 251 }
@@ -1,132 +1,132
1 1 /**
2 2 * Copyright (c) 2017-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #include "python-zstandard.h"
10 10
11 11 extern PyObject* ZstdError;
12 12
13 13 PyDoc_STRVAR(FrameParameters__doc__,
14 14 "FrameParameters: information about a zstd frame");
15 15
16 16 FrameParametersObject* get_frame_parameters(PyObject* self, PyObject* args) {
17 17 const char* source;
18 18 Py_ssize_t sourceSize;
19 19 ZSTD_frameParams params;
20 20 FrameParametersObject* result = NULL;
21 21 size_t zresult;
22 22
23 23 #if PY_MAJOR_VERSION >= 3
24 24 if (!PyArg_ParseTuple(args, "y#:get_frame_parameters",
25 25 #else
26 26 if (!PyArg_ParseTuple(args, "s#:get_frame_parameters",
27 27 #endif
28 28 &source, &sourceSize)) {
29 29 return NULL;
30 30 }
31 31
32 32 /* Needed for Python 2 to reject unicode */
33 33 if (!PyBytes_Check(PyTuple_GET_ITEM(args, 0))) {
34 34 PyErr_SetString(PyExc_TypeError, "argument must be bytes");
35 35 return NULL;
36 36 }
37 37
38 38 zresult = ZSTD_getFrameParams(&params, (void*)source, sourceSize);
39 39
40 40 if (ZSTD_isError(zresult)) {
41 41 PyErr_Format(ZstdError, "cannot get frame parameters: %s", ZSTD_getErrorName(zresult));
42 42 return NULL;
43 43 }
44 44
45 45 if (zresult) {
46 46 PyErr_Format(ZstdError, "not enough data for frame parameters; need %zu bytes", zresult);
47 47 return NULL;
48 48 }
49 49
50 50 result = PyObject_New(FrameParametersObject, &FrameParametersType);
51 51 if (!result) {
52 52 return NULL;
53 53 }
54 54
55 55 result->frameContentSize = params.frameContentSize;
56 56 result->windowSize = params.windowSize;
57 57 result->dictID = params.dictID;
58 58 result->checksumFlag = params.checksumFlag ? 1 : 0;
59 59
60 60 return result;
61 61 }
62 62
63 63 static void FrameParameters_dealloc(PyObject* self) {
64 64 PyObject_Del(self);
65 65 }
66 66
67 67 static PyMemberDef FrameParameters_members[] = {
68 68 { "content_size", T_ULONGLONG,
69 69 offsetof(FrameParametersObject, frameContentSize), READONLY,
70 70 "frame content size" },
71 71 { "window_size", T_UINT,
72 72 offsetof(FrameParametersObject, windowSize), READONLY,
73 73 "window size" },
74 74 { "dict_id", T_UINT,
75 75 offsetof(FrameParametersObject, dictID), READONLY,
76 76 "dictionary ID" },
77 77 { "has_checksum", T_BOOL,
78 78 offsetof(FrameParametersObject, checksumFlag), READONLY,
79 79 "checksum flag" },
80 80 { NULL }
81 81 };
82 82
83 83 PyTypeObject FrameParametersType = {
84 84 PyVarObject_HEAD_INIT(NULL, 0)
85 85 "FrameParameters", /* tp_name */
86 86 sizeof(FrameParametersObject), /* tp_basicsize */
87 87 0, /* tp_itemsize */
88 88 (destructor)FrameParameters_dealloc, /* tp_dealloc */
89 89 0, /* tp_print */
90 90 0, /* tp_getattr */
91 91 0, /* tp_setattr */
92 92 0, /* tp_compare */
93 93 0, /* tp_repr */
94 94 0, /* tp_as_number */
95 95 0, /* tp_as_sequence */
96 96 0, /* tp_as_mapping */
97 97 0, /* tp_hash */
98 98 0, /* tp_call */
99 99 0, /* tp_str */
100 100 0, /* tp_getattro */
101 101 0, /* tp_setattro */
102 102 0, /* tp_as_buffer */
103 103 Py_TPFLAGS_DEFAULT, /* tp_flags */
104 104 FrameParameters__doc__, /* tp_doc */
105 105 0, /* tp_traverse */
106 106 0, /* tp_clear */
107 107 0, /* tp_richcompare */
108 108 0, /* tp_weaklistoffset */
109 109 0, /* tp_iter */
110 110 0, /* tp_iternext */
111 111 0, /* tp_methods */
112 112 FrameParameters_members, /* tp_members */
113 113 0, /* tp_getset */
114 114 0, /* tp_base */
115 115 0, /* tp_dict */
116 116 0, /* tp_descr_get */
117 117 0, /* tp_descr_set */
118 118 0, /* tp_dictoffset */
119 119 0, /* tp_init */
120 120 0, /* tp_alloc */
121 121 0, /* tp_new */
122 122 };
123 123
124 124 void frameparams_module_init(PyObject* mod) {
125 125 Py_TYPE(&FrameParametersType) = &PyType_Type;
126 126 if (PyType_Ready(&FrameParametersType) < 0) {
127 127 return;
128 128 }
129 129
130 Py_IncRef((PyObject*)&FrameParametersType);
130 Py_INCREF(&FrameParametersType);
131 131 PyModule_AddObject(mod, "FrameParameters", (PyObject*)&FrameParametersType);
132 132 }
@@ -1,190 +1,285
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 #define PY_SSIZE_T_CLEAN
10 10 #include <Python.h>
11 11 #include "structmember.h"
12 12
13 13 #define ZSTD_STATIC_LINKING_ONLY
14 14 #define ZDICT_STATIC_LINKING_ONLY
15 15 #include "mem.h"
16 16 #include "zstd.h"
17 17 #include "zdict.h"
18 #include "zstdmt_compress.h"
18 19
19 #define PYTHON_ZSTANDARD_VERSION "0.7.0"
20 #define PYTHON_ZSTANDARD_VERSION "0.8.0"
20 21
21 22 typedef enum {
22 23 compressorobj_flush_finish,
23 24 compressorobj_flush_block,
24 25 } CompressorObj_Flush;
25 26
27 /*
28 Represents a CompressionParameters type.
29
30 This type is basically a wrapper around ZSTD_compressionParameters.
31 */
26 32 typedef struct {
27 33 PyObject_HEAD
28 34 unsigned windowLog;
29 35 unsigned chainLog;
30 36 unsigned hashLog;
31 37 unsigned searchLog;
32 38 unsigned searchLength;
33 39 unsigned targetLength;
34 40 ZSTD_strategy strategy;
35 41 } CompressionParametersObject;
36 42
37 43 extern PyTypeObject CompressionParametersType;
38 44
45 /*
46 Represents a FrameParameters type.
47
48 This type is basically a wrapper around ZSTD_frameParams.
49 */
39 50 typedef struct {
40 51 PyObject_HEAD
41 52 unsigned long long frameContentSize;
42 53 unsigned windowSize;
43 54 unsigned dictID;
44 55 char checksumFlag;
45 56 } FrameParametersObject;
46 57
47 58 extern PyTypeObject FrameParametersType;
48 59
49 typedef struct {
50 PyObject_HEAD
51 unsigned selectivityLevel;
52 int compressionLevel;
53 unsigned notificationLevel;
54 unsigned dictID;
55 } DictParametersObject;
60 /*
61 Represents a ZstdCompressionDict type.
56 62
57 extern PyTypeObject DictParametersType;
58
63 Instances hold data used for a zstd compression dictionary.
64 */
59 65 typedef struct {
60 66 PyObject_HEAD
61 67
68 /* Pointer to dictionary data. Owned by self. */
62 69 void* dictData;
70 /* Size of dictionary data. */
63 71 size_t dictSize;
72 /* k parameter for cover dictionaries. Only populated by train_cover_dict(). */
73 unsigned k;
74 /* d parameter for cover dictionaries. Only populated by train_cover_dict(). */
75 unsigned d;
64 76 } ZstdCompressionDict;
65 77
66 78 extern PyTypeObject ZstdCompressionDictType;
67 79
80 /*
81 Represents a ZstdCompressor type.
82 */
68 83 typedef struct {
69 84 PyObject_HEAD
70 85
86 /* Configured compression level. Should be always set. */
71 87 int compressionLevel;
88 /* Number of threads to use for operations. */
89 unsigned int threads;
90 /* Pointer to compression dictionary to use. NULL if not using dictionary
91 compression. */
72 92 ZstdCompressionDict* dict;
93 /* Compression context to use. Populated during object construction. NULL
94 if using multi-threaded compression. */
73 95 ZSTD_CCtx* cctx;
96 /* Multi-threaded compression context to use. Populated during object
97 construction. NULL if not using multi-threaded compression. */
98 ZSTDMT_CCtx* mtcctx;
99 /* Digest compression dictionary. NULL initially. Populated on first use. */
74 100 ZSTD_CDict* cdict;
101 /* Low-level compression parameter control. NULL unless passed to
102 constructor. Takes precedence over `compressionLevel` if defined. */
75 103 CompressionParametersObject* cparams;
104 /* Controls zstd frame options. */
76 105 ZSTD_frameParameters fparams;
106 /* Holds state for streaming compression. Shared across all invocation.
107 Populated on first use. */
108 ZSTD_CStream* cstream;
77 109 } ZstdCompressor;
78 110
79 111 extern PyTypeObject ZstdCompressorType;
80 112
81 113 typedef struct {
82 114 PyObject_HEAD
83 115
84 116 ZstdCompressor* compressor;
85 ZSTD_CStream* cstream;
86 117 ZSTD_outBuffer output;
87 118 int finished;
88 119 } ZstdCompressionObj;
89 120
90 121 extern PyTypeObject ZstdCompressionObjType;
91 122
92 123 typedef struct {
93 124 PyObject_HEAD
94 125
95 126 ZstdCompressor* compressor;
96 127 PyObject* writer;
97 128 Py_ssize_t sourceSize;
98 129 size_t outSize;
99 ZSTD_CStream* cstream;
100 130 int entered;
101 131 } ZstdCompressionWriter;
102 132
103 133 extern PyTypeObject ZstdCompressionWriterType;
104 134
105 135 typedef struct {
106 136 PyObject_HEAD
107 137
108 138 ZstdCompressor* compressor;
109 139 PyObject* reader;
110 140 Py_buffer* buffer;
111 141 Py_ssize_t bufferOffset;
112 142 Py_ssize_t sourceSize;
113 143 size_t inSize;
114 144 size_t outSize;
115 145
116 ZSTD_CStream* cstream;
117 146 ZSTD_inBuffer input;
118 147 ZSTD_outBuffer output;
119 148 int finishedOutput;
120 149 int finishedInput;
121 150 PyObject* readResult;
122 151 } ZstdCompressorIterator;
123 152
124 153 extern PyTypeObject ZstdCompressorIteratorType;
125 154
126 155 typedef struct {
127 156 PyObject_HEAD
128 157
129 158 ZSTD_DCtx* dctx;
130 159
131 160 ZstdCompressionDict* dict;
132 161 ZSTD_DDict* ddict;
162 ZSTD_DStream* dstream;
133 163 } ZstdDecompressor;
134 164
135 165 extern PyTypeObject ZstdDecompressorType;
136 166
137 167 typedef struct {
138 168 PyObject_HEAD
139 169
140 170 ZstdDecompressor* decompressor;
141 ZSTD_DStream* dstream;
142 171 int finished;
143 172 } ZstdDecompressionObj;
144 173
145 174 extern PyTypeObject ZstdDecompressionObjType;
146 175
147 176 typedef struct {
148 177 PyObject_HEAD
149 178
150 179 ZstdDecompressor* decompressor;
151 180 PyObject* writer;
152 181 size_t outSize;
153 ZSTD_DStream* dstream;
154 182 int entered;
155 183 } ZstdDecompressionWriter;
156 184
157 185 extern PyTypeObject ZstdDecompressionWriterType;
158 186
159 187 typedef struct {
160 188 PyObject_HEAD
161 189
162 190 ZstdDecompressor* decompressor;
163 191 PyObject* reader;
164 192 Py_buffer* buffer;
165 193 Py_ssize_t bufferOffset;
166 194 size_t inSize;
167 195 size_t outSize;
168 196 size_t skipBytes;
169 ZSTD_DStream* dstream;
170 197 ZSTD_inBuffer input;
171 198 ZSTD_outBuffer output;
172 199 Py_ssize_t readCount;
173 200 int finishedInput;
174 201 int finishedOutput;
175 202 } ZstdDecompressorIterator;
176 203
177 204 extern PyTypeObject ZstdDecompressorIteratorType;
178 205
179 206 typedef struct {
180 207 int errored;
181 208 PyObject* chunk;
182 209 } DecompressorIteratorResult;
183 210
211 typedef struct {
212 unsigned long long offset;
213 unsigned long long length;
214 } BufferSegment;
215
216 typedef struct {
217 PyObject_HEAD
218
219 PyObject* parent;
220 BufferSegment* segments;
221 Py_ssize_t segmentCount;
222 } ZstdBufferSegments;
223
224 extern PyTypeObject ZstdBufferSegmentsType;
225
226 typedef struct {
227 PyObject_HEAD
228
229 PyObject* parent;
230 void* data;
231 Py_ssize_t dataSize;
232 unsigned long long offset;
233 } ZstdBufferSegment;
234
235 extern PyTypeObject ZstdBufferSegmentType;
236
237 typedef struct {
238 PyObject_HEAD
239
240 Py_buffer parent;
241 void* data;
242 unsigned long long dataSize;
243 BufferSegment* segments;
244 Py_ssize_t segmentCount;
245 int useFree;
246 } ZstdBufferWithSegments;
247
248 extern PyTypeObject ZstdBufferWithSegmentsType;
249
250 /**
251 * An ordered collection of BufferWithSegments exposed as a squashed collection.
252 *
253 * This type provides a virtual view spanning multiple BufferWithSegments
254 * instances. It allows multiple instances to be "chained" together and
255 * exposed as a single collection. e.g. if there are 2 buffers holding
256 * 10 segments each, then o[14] will access the 5th segment in the 2nd buffer.
257 */
258 typedef struct {
259 PyObject_HEAD
260
261 /* An array of buffers that should be exposed through this instance. */
262 ZstdBufferWithSegments** buffers;
263 /* Number of elements in buffers array. */
264 Py_ssize_t bufferCount;
265 /* Array of first offset in each buffer instance. 0th entry corresponds
266 to number of elements in the 0th buffer. 1st entry corresponds to the
267 sum of elements in 0th and 1st buffers. */
268 Py_ssize_t* firstElements;
269 } ZstdBufferWithSegmentsCollection;
270
271 extern PyTypeObject ZstdBufferWithSegmentsCollectionType;
272
184 273 void ztopy_compression_parameters(CompressionParametersObject* params, ZSTD_compressionParameters* zparams);
185 274 CompressionParametersObject* get_compression_parameters(PyObject* self, PyObject* args);
186 275 FrameParametersObject* get_frame_parameters(PyObject* self, PyObject* args);
187 276 PyObject* estimate_compression_context_size(PyObject* self, PyObject* args);
188 ZSTD_CStream* CStream_from_ZstdCompressor(ZstdCompressor* compressor, Py_ssize_t sourceSize);
189 ZSTD_DStream* DStream_from_ZstdDecompressor(ZstdDecompressor* decompressor);
277 int init_cstream(ZstdCompressor* compressor, unsigned long long sourceSize);
278 int init_mtcstream(ZstdCompressor* compressor, Py_ssize_t sourceSize);
279 int init_dstream(ZstdDecompressor* decompressor);
190 280 ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs);
281 ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs);
282 ZstdBufferWithSegments* BufferWithSegments_FromMemory(void* data, unsigned long long dataSize, BufferSegment* segments, Py_ssize_t segmentsSize);
283 Py_ssize_t BufferWithSegmentsCollection_length(ZstdBufferWithSegmentsCollection*);
284 int cpu_count(void);
285 size_t roundpow2(size_t);
@@ -1,154 +1,187
1 1 # Copyright (c) 2016-present, Gregory Szorc
2 2 # All rights reserved.
3 3 #
4 4 # This software may be modified and distributed under the terms
5 5 # of the BSD license. See the LICENSE file for details.
6 6
7 7 from __future__ import absolute_import
8 8
9 9 import cffi
10 10 import distutils.ccompiler
11 11 import os
12 12 import re
13 13 import subprocess
14 14 import tempfile
15 15
16 16
17 17 HERE = os.path.abspath(os.path.dirname(__file__))
18 18
19 19 SOURCES = ['zstd/%s' % p for p in (
20 20 'common/entropy_common.c',
21 21 'common/error_private.c',
22 22 'common/fse_decompress.c',
23 23 'common/pool.c',
24 24 'common/threading.c',
25 25 'common/xxhash.c',
26 26 'common/zstd_common.c',
27 27 'compress/fse_compress.c',
28 28 'compress/huf_compress.c',
29 29 'compress/zstd_compress.c',
30 'compress/zstdmt_compress.c',
30 31 'decompress/huf_decompress.c',
31 32 'decompress/zstd_decompress.c',
32 33 'dictBuilder/cover.c',
33 34 'dictBuilder/divsufsort.c',
34 35 'dictBuilder/zdict.c',
35 36 )]
36 37
38 # Headers whose preprocessed output will be fed into cdef().
37 39 HEADERS = [os.path.join(HERE, 'zstd', *p) for p in (
38 40 ('zstd.h',),
39 ('common', 'pool.h'),
41 ('compress', 'zstdmt_compress.h'),
40 42 ('dictBuilder', 'zdict.h'),
41 43 )]
42 44
43 45 INCLUDE_DIRS = [os.path.join(HERE, d) for d in (
44 46 'zstd',
45 47 'zstd/common',
46 48 'zstd/compress',
47 49 'zstd/decompress',
48 50 'zstd/dictBuilder',
49 51 )]
50 52
51 53 # cffi can't parse some of the primitives in zstd.h. So we invoke the
52 54 # preprocessor and feed its output into cffi.
53 55 compiler = distutils.ccompiler.new_compiler()
54 56
55 57 # Needed for MSVC.
56 58 if hasattr(compiler, 'initialize'):
57 59 compiler.initialize()
58 60
59 61 # Distutils doesn't set compiler.preprocessor, so invoke the preprocessor
60 62 # manually.
61 63 if compiler.compiler_type == 'unix':
62 64 args = list(compiler.executables['compiler'])
63 65 args.extend([
64 66 '-E',
65 67 '-DZSTD_STATIC_LINKING_ONLY',
66 68 '-DZDICT_STATIC_LINKING_ONLY',
67 69 ])
68 70 elif compiler.compiler_type == 'msvc':
69 71 args = [compiler.cc]
70 72 args.extend([
71 73 '/EP',
72 74 '/DZSTD_STATIC_LINKING_ONLY',
73 75 '/DZDICT_STATIC_LINKING_ONLY',
74 76 ])
75 77 else:
76 78 raise Exception('unsupported compiler type: %s' % compiler.compiler_type)
77 79
78 80 def preprocess(path):
79 # zstd.h includes <stddef.h>, which is also included by cffi's boilerplate.
80 # This can lead to duplicate declarations. So we strip this include from the
81 # preprocessor invocation.
82 81 with open(path, 'rb') as fh:
83 lines = [l for l in fh if not l.startswith(b'#include <stddef.h>')]
82 lines = []
83 for l in fh:
84 # zstd.h includes <stddef.h>, which is also included by cffi's
85 # boilerplate. This can lead to duplicate declarations. So we strip
86 # this include from the preprocessor invocation.
87 #
88 # The same things happens for including zstd.h, so give it the same
89 # treatment.
90 #
91 # We define ZSTD_STATIC_LINKING_ONLY, which is redundant with the inline
92 # #define in zstdmt_compress.h and results in a compiler warning. So drop
93 # the inline #define.
94 if l.startswith((b'#include <stddef.h>',
95 b'#include "zstd.h"',
96 b'#define ZSTD_STATIC_LINKING_ONLY')):
97 continue
98
99 # ZSTDLIB_API may not be defined if we dropped zstd.h. It isn't
100 # important so just filter it out.
101 if l.startswith(b'ZSTDLIB_API'):
102 l = l[len(b'ZSTDLIB_API '):]
103
104 lines.append(l)
84 105
85 106 fd, input_file = tempfile.mkstemp(suffix='.h')
86 107 os.write(fd, b''.join(lines))
87 108 os.close(fd)
88 109
89 110 try:
90 111 process = subprocess.Popen(args + [input_file], stdout=subprocess.PIPE)
91 112 output = process.communicate()[0]
92 113 ret = process.poll()
93 114 if ret:
94 115 raise Exception('preprocessor exited with error')
95 116
96 117 return output
97 118 finally:
98 119 os.unlink(input_file)
99 120
100 121
101 122 def normalize_output(output):
102 123 lines = []
103 124 for line in output.splitlines():
104 125 # CFFI's parser doesn't like __attribute__ on UNIX compilers.
105 126 if line.startswith(b'__attribute__ ((visibility ("default"))) '):
106 127 line = line[len(b'__attribute__ ((visibility ("default"))) '):]
107 128
108 129 if line.startswith(b'__attribute__((deprecated('):
109 130 continue
110 131 elif b'__declspec(deprecated(' in line:
111 132 continue
112 133
113 134 lines.append(line)
114 135
115 136 return b'\n'.join(lines)
116 137
117 138
118 139 ffi = cffi.FFI()
140 # *_DISABLE_DEPRECATE_WARNINGS prevents the compiler from emitting a warning
141 # when cffi uses the function. Since we statically link against zstd, even
142 # if we use the deprecated functions it shouldn't be a huge problem.
119 143 ffi.set_source('_zstd_cffi', '''
120 144 #include "mem.h"
121 145 #define ZSTD_STATIC_LINKING_ONLY
122 146 #include "zstd.h"
123 147 #define ZDICT_STATIC_LINKING_ONLY
124 #include "pool.h"
148 #define ZDICT_DISABLE_DEPRECATE_WARNINGS
125 149 #include "zdict.h"
150 #include "zstdmt_compress.h"
126 151 ''', sources=SOURCES, include_dirs=INCLUDE_DIRS)
127 152
128 153 DEFINE = re.compile(b'^\\#define ([a-zA-Z0-9_]+) ')
129 154
130 155 sources = []
131 156
157 # Feed normalized preprocessor output for headers into the cdef parser.
132 158 for header in HEADERS:
133 159 preprocessed = preprocess(header)
134 160 sources.append(normalize_output(preprocessed))
135 161
136 # Do another pass over source and find constants that were preprocessed
137 # away.
162 # #define's are effectively erased as part of going through preprocessor.
163 # So perform a manual pass to re-add those to the cdef source.
138 164 with open(header, 'rb') as fh:
139 165 for line in fh:
140 166 line = line.strip()
141 167 m = DEFINE.match(line)
142 168 if not m:
143 169 continue
144 170
171 if m.group(1) == b'ZSTD_STATIC_LINKING_ONLY':
172 continue
173
145 174 # The parser doesn't like some constants with complex values.
146 175 if m.group(1) in (b'ZSTD_LIB_VERSION', b'ZSTD_VERSION_STRING'):
147 176 continue
148 177
178 # The ... is magic syntax by the cdef parser to resolve the
179 # value at compile time.
149 180 sources.append(m.group(0) + b' ...')
150 181
151 ffi.cdef(u'\n'.join(s.decode('latin1') for s in sources))
182 cdeflines = b'\n'.join(sources).splitlines()
183 cdeflines = [l for l in cdeflines if l.strip()]
184 ffi.cdef(b'\n'.join(cdeflines).decode('latin1'))
152 185
153 186 if __name__ == '__main__':
154 187 ffi.compile()
@@ -1,70 +1,76
1 1 #!/usr/bin/env python
2 2 # Copyright (c) 2016-present, Gregory Szorc
3 3 # All rights reserved.
4 4 #
5 5 # This software may be modified and distributed under the terms
6 6 # of the BSD license. See the LICENSE file for details.
7 7
8 8 import sys
9 9 from setuptools import setup
10 10
11 11 try:
12 12 import cffi
13 13 except ImportError:
14 14 cffi = None
15 15
16 16 import setup_zstd
17 17
18 18 SUPPORT_LEGACY = False
19 19
20 20 if "--legacy" in sys.argv:
21 21 SUPPORT_LEGACY = True
22 22 sys.argv.remove("--legacy")
23 23
24 24 # Code for obtaining the Extension instance is in its own module to
25 25 # facilitate reuse in other projects.
26 26 extensions = [setup_zstd.get_c_extension(SUPPORT_LEGACY, 'zstd')]
27 27
28 install_requires = []
29
28 30 if cffi:
29 31 import make_cffi
30 32 extensions.append(make_cffi.ffi.distutils_extension())
31 33
34 # Need change in 1.8 for ffi.from_buffer() behavior.
35 install_requires.append('cffi>=1.8')
36
32 37 version = None
33 38
34 39 with open('c-ext/python-zstandard.h', 'r') as fh:
35 40 for line in fh:
36 41 if not line.startswith('#define PYTHON_ZSTANDARD_VERSION'):
37 42 continue
38 43
39 44 version = line.split()[2][1:-1]
40 45 break
41 46
42 47 if not version:
43 48 raise Exception('could not resolve package version; '
44 49 'this should never happen')
45 50
46 51 setup(
47 52 name='zstandard',
48 53 version=version,
49 54 description='Zstandard bindings for Python',
50 55 long_description=open('README.rst', 'r').read(),
51 56 url='https://github.com/indygreg/python-zstandard',
52 57 author='Gregory Szorc',
53 58 author_email='gregory.szorc@gmail.com',
54 59 license='BSD',
55 60 classifiers=[
56 61 'Development Status :: 4 - Beta',
57 62 'Intended Audience :: Developers',
58 63 'License :: OSI Approved :: BSD License',
59 64 'Programming Language :: C',
60 65 'Programming Language :: Python :: 2.6',
61 66 'Programming Language :: Python :: 2.7',
62 67 'Programming Language :: Python :: 3.3',
63 68 'Programming Language :: Python :: 3.4',
64 69 'Programming Language :: Python :: 3.5',
65 70 'Programming Language :: Python :: 3.6',
66 71 ],
67 72 keywords='zstandard zstd compression',
68 73 ext_modules=extensions,
69 74 test_suite='tests',
75 install_requires=install_requires,
70 76 )
@@ -1,96 +1,102
1 1 # Copyright (c) 2016-present, Gregory Szorc
2 2 # All rights reserved.
3 3 #
4 4 # This software may be modified and distributed under the terms
5 5 # of the BSD license. See the LICENSE file for details.
6 6
7 7 import os
8 8 from distutils.extension import Extension
9 9
10 10
11 11 zstd_sources = ['zstd/%s' % p for p in (
12 12 'common/entropy_common.c',
13 13 'common/error_private.c',
14 14 'common/fse_decompress.c',
15 15 'common/pool.c',
16 16 'common/threading.c',
17 17 'common/xxhash.c',
18 18 'common/zstd_common.c',
19 19 'compress/fse_compress.c',
20 20 'compress/huf_compress.c',
21 21 'compress/zstd_compress.c',
22 'compress/zstdmt_compress.c',
22 23 'decompress/huf_decompress.c',
23 24 'decompress/zstd_decompress.c',
24 25 'dictBuilder/cover.c',
25 26 'dictBuilder/divsufsort.c',
26 27 'dictBuilder/zdict.c',
27 28 )]
28 29
29 30 zstd_sources_legacy = ['zstd/%s' % p for p in (
30 31 'deprecated/zbuff_common.c',
31 32 'deprecated/zbuff_compress.c',
32 33 'deprecated/zbuff_decompress.c',
33 34 'legacy/zstd_v01.c',
34 35 'legacy/zstd_v02.c',
35 36 'legacy/zstd_v03.c',
36 37 'legacy/zstd_v04.c',
37 38 'legacy/zstd_v05.c',
38 39 'legacy/zstd_v06.c',
39 40 'legacy/zstd_v07.c'
40 41 )]
41 42
42 43 zstd_includes = [
43 44 'c-ext',
44 45 'zstd',
45 46 'zstd/common',
46 47 'zstd/compress',
47 48 'zstd/decompress',
48 49 'zstd/dictBuilder',
49 50 ]
50 51
51 52 zstd_includes_legacy = [
52 53 'zstd/deprecated',
53 54 'zstd/legacy',
54 55 ]
55 56
56 57 ext_sources = [
57 58 'zstd.c',
59 'c-ext/bufferutil.c',
58 60 'c-ext/compressiondict.c',
59 61 'c-ext/compressobj.c',
60 62 'c-ext/compressor.c',
61 63 'c-ext/compressoriterator.c',
62 64 'c-ext/compressionparams.c',
63 65 'c-ext/compressionwriter.c',
64 66 'c-ext/constants.c',
65 67 'c-ext/decompressobj.c',
66 68 'c-ext/decompressor.c',
67 69 'c-ext/decompressoriterator.c',
68 70 'c-ext/decompressionwriter.c',
69 'c-ext/dictparams.c',
70 71 'c-ext/frameparams.c',
71 72 ]
72 73
73 74 zstd_depends = [
74 75 'c-ext/python-zstandard.h',
75 76 ]
76 77
77 78
78 79 def get_c_extension(support_legacy=False, name='zstd'):
79 80 """Obtain a distutils.extension.Extension for the C extension."""
80 81 root = os.path.abspath(os.path.dirname(__file__))
81 82
82 83 sources = [os.path.join(root, p) for p in zstd_sources + ext_sources]
83 84 if support_legacy:
84 85 sources.extend([os.path.join(root, p) for p in zstd_sources_legacy])
85 86
86 87 include_dirs = [os.path.join(root, d) for d in zstd_includes]
87 88 if support_legacy:
88 89 include_dirs.extend([os.path.join(root, d) for d in zstd_includes_legacy])
89 90
90 91 depends = [os.path.join(root, p) for p in zstd_depends]
91 92
93 extra_args = ['-DZSTD_MULTITHREAD']
94
95 if support_legacy:
96 extra_args.append('-DZSTD_LEGACY_SUPPORT=1')
97
92 98 # TODO compile with optimizations.
93 99 return Extension(name, sources,
94 100 include_dirs=include_dirs,
95 101 depends=depends,
96 extra_compile_args=["-DZSTD_LEGACY_SUPPORT=1"] if support_legacy else [])
102 extra_compile_args=extra_args)
@@ -1,61 +1,88
1 1 import inspect
2 2 import io
3 import os
3 4 import types
4 5
5 6
6 7 def make_cffi(cls):
7 8 """Decorator to add CFFI versions of each test method."""
8 9
9 10 try:
10 11 import zstd_cffi
11 12 except ImportError:
12 13 return cls
13 14
14 15 # If CFFI version is available, dynamically construct test methods
15 16 # that use it.
16 17
17 18 for attr in dir(cls):
18 19 fn = getattr(cls, attr)
19 20 if not inspect.ismethod(fn) and not inspect.isfunction(fn):
20 21 continue
21 22
22 23 if not fn.__name__.startswith('test_'):
23 24 continue
24 25
25 26 name = '%s_cffi' % fn.__name__
26 27
27 28 # Replace the "zstd" symbol with the CFFI module instance. Then copy
28 29 # the function object and install it in a new attribute.
29 30 if isinstance(fn, types.FunctionType):
30 31 globs = dict(fn.__globals__)
31 32 globs['zstd'] = zstd_cffi
32 33 new_fn = types.FunctionType(fn.__code__, globs, name,
33 34 fn.__defaults__, fn.__closure__)
34 35 new_method = new_fn
35 36 else:
36 37 globs = dict(fn.__func__.func_globals)
37 38 globs['zstd'] = zstd_cffi
38 39 new_fn = types.FunctionType(fn.__func__.func_code, globs, name,
39 40 fn.__func__.func_defaults,
40 41 fn.__func__.func_closure)
41 42 new_method = types.UnboundMethodType(new_fn, fn.im_self,
42 43 fn.im_class)
43 44
44 45 setattr(cls, name, new_method)
45 46
46 47 return cls
47 48
48 49
49 50 class OpCountingBytesIO(io.BytesIO):
50 51 def __init__(self, *args, **kwargs):
51 52 self._read_count = 0
52 53 self._write_count = 0
53 54 return super(OpCountingBytesIO, self).__init__(*args, **kwargs)
54 55
55 56 def read(self, *args):
56 57 self._read_count += 1
57 58 return super(OpCountingBytesIO, self).read(*args)
58 59
59 60 def write(self, data):
60 61 self._write_count += 1
61 62 return super(OpCountingBytesIO, self).write(data)
63
64
65 _source_files = []
66
67
68 def random_input_data():
69 """Obtain the raw content of source files.
70
71 This is used for generating "random" data to feed into fuzzing, since it is
72 faster than random content generation.
73 """
74 if _source_files:
75 return _source_files
76
77 for root, dirs, files in os.walk(os.path.dirname(__file__)):
78 dirs[:] = list(sorted(dirs))
79 for f in sorted(files):
80 try:
81 with open(os.path.join(root, f), 'rb') as fh:
82 data = fh.read()
83 if data:
84 _source_files.append(data)
85 except OSError:
86 pass
87
88 return _source_files
@@ -1,675 +1,905
1 1 import hashlib
2 2 import io
3 3 import struct
4 4 import sys
5 5
6 6 try:
7 7 import unittest2 as unittest
8 8 except ImportError:
9 9 import unittest
10 10
11 11 import zstd
12 12
13 13 from .common import (
14 14 make_cffi,
15 15 OpCountingBytesIO,
16 16 )
17 17
18 18
19 19 if sys.version_info[0] >= 3:
20 20 next = lambda it: it.__next__()
21 21 else:
22 22 next = lambda it: it.next()
23 23
24 24
25 def multithreaded_chunk_size(level, source_size=0):
26 params = zstd.get_compression_parameters(level, source_size)
27
28 return 1 << (params.window_log + 2)
29
30
25 31 @make_cffi
26 32 class TestCompressor(unittest.TestCase):
27 33 def test_level_bounds(self):
28 34 with self.assertRaises(ValueError):
29 35 zstd.ZstdCompressor(level=0)
30 36
31 37 with self.assertRaises(ValueError):
32 38 zstd.ZstdCompressor(level=23)
33 39
34 40
35 41 @make_cffi
36 42 class TestCompressor_compress(unittest.TestCase):
43 def test_multithreaded_unsupported(self):
44 samples = []
45 for i in range(128):
46 samples.append(b'foo' * 64)
47 samples.append(b'bar' * 64)
48
49 d = zstd.train_dictionary(8192, samples)
50
51 cctx = zstd.ZstdCompressor(dict_data=d, threads=2)
52
53 with self.assertRaisesRegexp(zstd.ZstdError, 'compress\(\) cannot be used with both dictionaries and multi-threaded compression'):
54 cctx.compress(b'foo')
55
56 params = zstd.get_compression_parameters(3)
57 cctx = zstd.ZstdCompressor(compression_params=params, threads=2)
58 with self.assertRaisesRegexp(zstd.ZstdError, 'compress\(\) cannot be used with both compression parameters and multi-threaded compression'):
59 cctx.compress(b'foo')
60
37 61 def test_compress_empty(self):
38 62 cctx = zstd.ZstdCompressor(level=1)
39 63 result = cctx.compress(b'')
40 64 self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
41 65 params = zstd.get_frame_parameters(result)
42 66 self.assertEqual(params.content_size, 0)
43 67 self.assertEqual(params.window_size, 524288)
44 68 self.assertEqual(params.dict_id, 0)
45 69 self.assertFalse(params.has_checksum, 0)
46 70
47 71 # TODO should be temporary until https://github.com/facebook/zstd/issues/506
48 72 # is fixed.
49 73 cctx = zstd.ZstdCompressor(write_content_size=True)
50 74 with self.assertRaises(ValueError):
51 75 cctx.compress(b'')
52 76
53 77 cctx.compress(b'', allow_empty=True)
54 78
55 79 def test_compress_large(self):
56 80 chunks = []
57 81 for i in range(255):
58 82 chunks.append(struct.Struct('>B').pack(i) * 16384)
59 83
60 84 cctx = zstd.ZstdCompressor(level=3)
61 85 result = cctx.compress(b''.join(chunks))
62 86 self.assertEqual(len(result), 999)
63 87 self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd')
64 88
65 89 # This matches the test for read_from() below.
66 90 cctx = zstd.ZstdCompressor(level=1)
67 91 result = cctx.compress(b'f' * zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE + b'o')
68 92 self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x40\x54\x00\x00'
69 93 b'\x10\x66\x66\x01\x00\xfb\xff\x39\xc0'
70 94 b'\x02\x09\x00\x00\x6f')
71 95
72 96 def test_write_checksum(self):
73 97 cctx = zstd.ZstdCompressor(level=1)
74 98 no_checksum = cctx.compress(b'foobar')
75 99 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
76 100 with_checksum = cctx.compress(b'foobar')
77 101
78 102 self.assertEqual(len(with_checksum), len(no_checksum) + 4)
79 103
80 104 no_params = zstd.get_frame_parameters(no_checksum)
81 105 with_params = zstd.get_frame_parameters(with_checksum)
82 106
83 107 self.assertFalse(no_params.has_checksum)
84 108 self.assertTrue(with_params.has_checksum)
85 109
86 110 def test_write_content_size(self):
87 111 cctx = zstd.ZstdCompressor(level=1)
88 112 no_size = cctx.compress(b'foobar' * 256)
89 113 cctx = zstd.ZstdCompressor(level=1, write_content_size=True)
90 114 with_size = cctx.compress(b'foobar' * 256)
91 115
92 116 self.assertEqual(len(with_size), len(no_size) + 1)
93 117
94 118 no_params = zstd.get_frame_parameters(no_size)
95 119 with_params = zstd.get_frame_parameters(with_size)
96 120 self.assertEqual(no_params.content_size, 0)
97 121 self.assertEqual(with_params.content_size, 1536)
98 122
99 123 def test_no_dict_id(self):
100 124 samples = []
101 125 for i in range(128):
102 126 samples.append(b'foo' * 64)
103 127 samples.append(b'bar' * 64)
104 128 samples.append(b'foobar' * 64)
105 129
106 130 d = zstd.train_dictionary(1024, samples)
107 131
108 132 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
109 133 with_dict_id = cctx.compress(b'foobarfoobar')
110 134
111 135 cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False)
112 136 no_dict_id = cctx.compress(b'foobarfoobar')
113 137
114 138 self.assertEqual(len(with_dict_id), len(no_dict_id) + 4)
115 139
116 140 no_params = zstd.get_frame_parameters(no_dict_id)
117 141 with_params = zstd.get_frame_parameters(with_dict_id)
118 142 self.assertEqual(no_params.dict_id, 0)
119 143 self.assertEqual(with_params.dict_id, 1584102229)
120 144
121 145 def test_compress_dict_multiple(self):
122 146 samples = []
123 147 for i in range(128):
124 148 samples.append(b'foo' * 64)
125 149 samples.append(b'bar' * 64)
126 150 samples.append(b'foobar' * 64)
127 151
128 152 d = zstd.train_dictionary(8192, samples)
129 153
130 154 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
131 155
132 156 for i in range(32):
133 157 cctx.compress(b'foo bar foobar foo bar foobar')
134 158
159 def test_multithreaded(self):
160 chunk_size = multithreaded_chunk_size(1)
161 source = b''.join([b'x' * chunk_size, b'y' * chunk_size])
162
163 cctx = zstd.ZstdCompressor(level=1, threads=2)
164 compressed = cctx.compress(source)
165
166 params = zstd.get_frame_parameters(compressed)
167 self.assertEqual(params.content_size, chunk_size * 2)
168 self.assertEqual(params.dict_id, 0)
169 self.assertFalse(params.has_checksum)
170
171 dctx = zstd.ZstdDecompressor()
172 self.assertEqual(dctx.decompress(compressed), source)
173
135 174
136 175 @make_cffi
137 176 class TestCompressor_compressobj(unittest.TestCase):
138 177 def test_compressobj_empty(self):
139 178 cctx = zstd.ZstdCompressor(level=1)
140 179 cobj = cctx.compressobj()
141 180 self.assertEqual(cobj.compress(b''), b'')
142 181 self.assertEqual(cobj.flush(),
143 182 b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
144 183
145 184 def test_compressobj_large(self):
146 185 chunks = []
147 186 for i in range(255):
148 187 chunks.append(struct.Struct('>B').pack(i) * 16384)
149 188
150 189 cctx = zstd.ZstdCompressor(level=3)
151 190 cobj = cctx.compressobj()
152 191
153 192 result = cobj.compress(b''.join(chunks)) + cobj.flush()
154 193 self.assertEqual(len(result), 999)
155 194 self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd')
156 195
157 196 params = zstd.get_frame_parameters(result)
158 197 self.assertEqual(params.content_size, 0)
159 198 self.assertEqual(params.window_size, 1048576)
160 199 self.assertEqual(params.dict_id, 0)
161 200 self.assertFalse(params.has_checksum)
162 201
163 202 def test_write_checksum(self):
164 203 cctx = zstd.ZstdCompressor(level=1)
165 204 cobj = cctx.compressobj()
166 205 no_checksum = cobj.compress(b'foobar') + cobj.flush()
167 206 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
168 207 cobj = cctx.compressobj()
169 208 with_checksum = cobj.compress(b'foobar') + cobj.flush()
170 209
171 210 no_params = zstd.get_frame_parameters(no_checksum)
172 211 with_params = zstd.get_frame_parameters(with_checksum)
173 212 self.assertEqual(no_params.content_size, 0)
174 213 self.assertEqual(with_params.content_size, 0)
175 214 self.assertEqual(no_params.dict_id, 0)
176 215 self.assertEqual(with_params.dict_id, 0)
177 216 self.assertFalse(no_params.has_checksum)
178 217 self.assertTrue(with_params.has_checksum)
179 218
180 219 self.assertEqual(len(with_checksum), len(no_checksum) + 4)
181 220
182 221 def test_write_content_size(self):
183 222 cctx = zstd.ZstdCompressor(level=1)
184 223 cobj = cctx.compressobj(size=len(b'foobar' * 256))
185 224 no_size = cobj.compress(b'foobar' * 256) + cobj.flush()
186 225 cctx = zstd.ZstdCompressor(level=1, write_content_size=True)
187 226 cobj = cctx.compressobj(size=len(b'foobar' * 256))
188 227 with_size = cobj.compress(b'foobar' * 256) + cobj.flush()
189 228
190 229 no_params = zstd.get_frame_parameters(no_size)
191 230 with_params = zstd.get_frame_parameters(with_size)
192 231 self.assertEqual(no_params.content_size, 0)
193 232 self.assertEqual(with_params.content_size, 1536)
194 233 self.assertEqual(no_params.dict_id, 0)
195 234 self.assertEqual(with_params.dict_id, 0)
196 235 self.assertFalse(no_params.has_checksum)
197 236 self.assertFalse(with_params.has_checksum)
198 237
199 238 self.assertEqual(len(with_size), len(no_size) + 1)
200 239
201 240 def test_compress_after_finished(self):
202 241 cctx = zstd.ZstdCompressor()
203 242 cobj = cctx.compressobj()
204 243
205 244 cobj.compress(b'foo')
206 245 cobj.flush()
207 246
208 247 with self.assertRaisesRegexp(zstd.ZstdError, 'cannot call compress\(\) after compressor'):
209 248 cobj.compress(b'foo')
210 249
211 250 with self.assertRaisesRegexp(zstd.ZstdError, 'compressor object already finished'):
212 251 cobj.flush()
213 252
214 253 def test_flush_block_repeated(self):
215 254 cctx = zstd.ZstdCompressor(level=1)
216 255 cobj = cctx.compressobj()
217 256
218 257 self.assertEqual(cobj.compress(b'foo'), b'')
219 258 self.assertEqual(cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK),
220 259 b'\x28\xb5\x2f\xfd\x00\x48\x18\x00\x00foo')
221 260 self.assertEqual(cobj.compress(b'bar'), b'')
222 261 # 3 byte header plus content.
223 262 self.assertEqual(cobj.flush(), b'\x19\x00\x00bar')
224 263
225 264 def test_flush_empty_block(self):
226 265 cctx = zstd.ZstdCompressor(write_checksum=True)
227 266 cobj = cctx.compressobj()
228 267
229 268 cobj.compress(b'foobar')
230 269 cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK)
231 270 # No-op if no block is active (this is internal to zstd).
232 271 self.assertEqual(cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK), b'')
233 272
234 273 trailing = cobj.flush()
235 274 # 3 bytes block header + 4 bytes frame checksum
236 275 self.assertEqual(len(trailing), 7)
237 276 header = trailing[0:3]
238 277 self.assertEqual(header, b'\x01\x00\x00')
239 278
279 def test_multithreaded(self):
280 source = io.BytesIO()
281 source.write(b'a' * 1048576)
282 source.write(b'b' * 1048576)
283 source.write(b'c' * 1048576)
284 source.seek(0)
285
286 cctx = zstd.ZstdCompressor(level=1, threads=2)
287 cobj = cctx.compressobj()
288
289 chunks = []
290 while True:
291 d = source.read(8192)
292 if not d:
293 break
294
295 chunks.append(cobj.compress(d))
296
297 chunks.append(cobj.flush())
298
299 compressed = b''.join(chunks)
300
301 self.assertEqual(len(compressed), 295)
302
240 303
241 304 @make_cffi
242 305 class TestCompressor_copy_stream(unittest.TestCase):
243 306 def test_no_read(self):
244 307 source = object()
245 308 dest = io.BytesIO()
246 309
247 310 cctx = zstd.ZstdCompressor()
248 311 with self.assertRaises(ValueError):
249 312 cctx.copy_stream(source, dest)
250 313
251 314 def test_no_write(self):
252 315 source = io.BytesIO()
253 316 dest = object()
254 317
255 318 cctx = zstd.ZstdCompressor()
256 319 with self.assertRaises(ValueError):
257 320 cctx.copy_stream(source, dest)
258 321
259 322 def test_empty(self):
260 323 source = io.BytesIO()
261 324 dest = io.BytesIO()
262 325
263 326 cctx = zstd.ZstdCompressor(level=1)
264 327 r, w = cctx.copy_stream(source, dest)
265 328 self.assertEqual(int(r), 0)
266 329 self.assertEqual(w, 9)
267 330
268 331 self.assertEqual(dest.getvalue(),
269 332 b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
270 333
271 334 def test_large_data(self):
272 335 source = io.BytesIO()
273 336 for i in range(255):
274 337 source.write(struct.Struct('>B').pack(i) * 16384)
275 338 source.seek(0)
276 339
277 340 dest = io.BytesIO()
278 341 cctx = zstd.ZstdCompressor()
279 342 r, w = cctx.copy_stream(source, dest)
280 343
281 344 self.assertEqual(r, 255 * 16384)
282 345 self.assertEqual(w, 999)
283 346
284 347 params = zstd.get_frame_parameters(dest.getvalue())
285 348 self.assertEqual(params.content_size, 0)
286 349 self.assertEqual(params.window_size, 1048576)
287 350 self.assertEqual(params.dict_id, 0)
288 351 self.assertFalse(params.has_checksum)
289 352
290 353 def test_write_checksum(self):
291 354 source = io.BytesIO(b'foobar')
292 355 no_checksum = io.BytesIO()
293 356
294 357 cctx = zstd.ZstdCompressor(level=1)
295 358 cctx.copy_stream(source, no_checksum)
296 359
297 360 source.seek(0)
298 361 with_checksum = io.BytesIO()
299 362 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
300 363 cctx.copy_stream(source, with_checksum)
301 364
302 365 self.assertEqual(len(with_checksum.getvalue()),
303 366 len(no_checksum.getvalue()) + 4)
304 367
305 368 no_params = zstd.get_frame_parameters(no_checksum.getvalue())
306 369 with_params = zstd.get_frame_parameters(with_checksum.getvalue())
307 370 self.assertEqual(no_params.content_size, 0)
308 371 self.assertEqual(with_params.content_size, 0)
309 372 self.assertEqual(no_params.dict_id, 0)
310 373 self.assertEqual(with_params.dict_id, 0)
311 374 self.assertFalse(no_params.has_checksum)
312 375 self.assertTrue(with_params.has_checksum)
313 376
314 377 def test_write_content_size(self):
315 378 source = io.BytesIO(b'foobar' * 256)
316 379 no_size = io.BytesIO()
317 380
318 381 cctx = zstd.ZstdCompressor(level=1)
319 382 cctx.copy_stream(source, no_size)
320 383
321 384 source.seek(0)
322 385 with_size = io.BytesIO()
323 386 cctx = zstd.ZstdCompressor(level=1, write_content_size=True)
324 387 cctx.copy_stream(source, with_size)
325 388
326 389 # Source content size is unknown, so no content size written.
327 390 self.assertEqual(len(with_size.getvalue()),
328 391 len(no_size.getvalue()))
329 392
330 393 source.seek(0)
331 394 with_size = io.BytesIO()
332 395 cctx.copy_stream(source, with_size, size=len(source.getvalue()))
333 396
334 397 # We specified source size, so content size header is present.
335 398 self.assertEqual(len(with_size.getvalue()),
336 399 len(no_size.getvalue()) + 1)
337 400
338 401 no_params = zstd.get_frame_parameters(no_size.getvalue())
339 402 with_params = zstd.get_frame_parameters(with_size.getvalue())
340 403 self.assertEqual(no_params.content_size, 0)
341 404 self.assertEqual(with_params.content_size, 1536)
342 405 self.assertEqual(no_params.dict_id, 0)
343 406 self.assertEqual(with_params.dict_id, 0)
344 407 self.assertFalse(no_params.has_checksum)
345 408 self.assertFalse(with_params.has_checksum)
346 409
347 410 def test_read_write_size(self):
348 411 source = OpCountingBytesIO(b'foobarfoobar')
349 412 dest = OpCountingBytesIO()
350 413 cctx = zstd.ZstdCompressor()
351 414 r, w = cctx.copy_stream(source, dest, read_size=1, write_size=1)
352 415
353 416 self.assertEqual(r, len(source.getvalue()))
354 417 self.assertEqual(w, 21)
355 418 self.assertEqual(source._read_count, len(source.getvalue()) + 1)
356 419 self.assertEqual(dest._write_count, len(dest.getvalue()))
357 420
421 def test_multithreaded(self):
422 source = io.BytesIO()
423 source.write(b'a' * 1048576)
424 source.write(b'b' * 1048576)
425 source.write(b'c' * 1048576)
426 source.seek(0)
427
428 dest = io.BytesIO()
429 cctx = zstd.ZstdCompressor(threads=2)
430 r, w = cctx.copy_stream(source, dest)
431 self.assertEqual(r, 3145728)
432 self.assertEqual(w, 295)
433
434 params = zstd.get_frame_parameters(dest.getvalue())
435 self.assertEqual(params.content_size, 0)
436 self.assertEqual(params.dict_id, 0)
437 self.assertFalse(params.has_checksum)
438
439 # Writing content size and checksum works.
440 cctx = zstd.ZstdCompressor(threads=2, write_content_size=True,
441 write_checksum=True)
442 dest = io.BytesIO()
443 source.seek(0)
444 cctx.copy_stream(source, dest, size=len(source.getvalue()))
445
446 params = zstd.get_frame_parameters(dest.getvalue())
447 self.assertEqual(params.content_size, 3145728)
448 self.assertEqual(params.dict_id, 0)
449 self.assertTrue(params.has_checksum)
450
358 451
359 452 def compress(data, level):
360 453 buffer = io.BytesIO()
361 454 cctx = zstd.ZstdCompressor(level=level)
362 455 with cctx.write_to(buffer) as compressor:
363 456 compressor.write(data)
364 457 return buffer.getvalue()
365 458
366 459
367 460 @make_cffi
368 461 class TestCompressor_write_to(unittest.TestCase):
369 462 def test_empty(self):
370 463 result = compress(b'', 1)
371 464 self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
372 465
373 466 params = zstd.get_frame_parameters(result)
374 467 self.assertEqual(params.content_size, 0)
375 468 self.assertEqual(params.window_size, 524288)
376 469 self.assertEqual(params.dict_id, 0)
377 470 self.assertFalse(params.has_checksum)
378 471
379 472 def test_multiple_compress(self):
380 473 buffer = io.BytesIO()
381 474 cctx = zstd.ZstdCompressor(level=5)
382 475 with cctx.write_to(buffer) as compressor:
383 476 self.assertEqual(compressor.write(b'foo'), 0)
384 477 self.assertEqual(compressor.write(b'bar'), 0)
385 478 self.assertEqual(compressor.write(b'x' * 8192), 0)
386 479
387 480 result = buffer.getvalue()
388 481 self.assertEqual(result,
389 482 b'\x28\xb5\x2f\xfd\x00\x50\x75\x00\x00\x38\x66\x6f'
390 483 b'\x6f\x62\x61\x72\x78\x01\x00\xfc\xdf\x03\x23')
391 484
392 485 def test_dictionary(self):
393 486 samples = []
394 487 for i in range(128):
395 488 samples.append(b'foo' * 64)
396 489 samples.append(b'bar' * 64)
397 490 samples.append(b'foobar' * 64)
398 491
399 492 d = zstd.train_dictionary(8192, samples)
400 493
401 494 buffer = io.BytesIO()
402 495 cctx = zstd.ZstdCompressor(level=9, dict_data=d)
403 496 with cctx.write_to(buffer) as compressor:
404 497 self.assertEqual(compressor.write(b'foo'), 0)
405 498 self.assertEqual(compressor.write(b'bar'), 0)
406 499 self.assertEqual(compressor.write(b'foo' * 16384), 634)
407 500
408 501 compressed = buffer.getvalue()
409 502
410 503 params = zstd.get_frame_parameters(compressed)
411 504 self.assertEqual(params.content_size, 0)
412 505 self.assertEqual(params.window_size, 1024)
413 506 self.assertEqual(params.dict_id, d.dict_id())
414 507 self.assertFalse(params.has_checksum)
415 508
416 509 self.assertEqual(compressed[0:32],
417 510 b'\x28\xb5\x2f\xfd\x03\x00\x55\x7b\x6b\x5e\x54\x00'
418 511 b'\x00\x00\x02\xfc\xf4\xa5\xba\x23\x3f\x85\xb3\x54'
419 512 b'\x00\x00\x18\x6f\x6f\x66\x01\x00')
420 513
421 514 h = hashlib.sha1(compressed).hexdigest()
422 515 self.assertEqual(h, '1c5bcd25181bcd8c1a73ea8773323e0056129f92')
423 516
424 517 def test_compression_params(self):
425 518 params = zstd.CompressionParameters(20, 6, 12, 5, 4, 10, zstd.STRATEGY_FAST)
426 519
427 520 buffer = io.BytesIO()
428 521 cctx = zstd.ZstdCompressor(compression_params=params)
429 522 with cctx.write_to(buffer) as compressor:
430 523 self.assertEqual(compressor.write(b'foo'), 0)
431 524 self.assertEqual(compressor.write(b'bar'), 0)
432 525 self.assertEqual(compressor.write(b'foobar' * 16384), 0)
433 526
434 527 compressed = buffer.getvalue()
435 528
436 529 params = zstd.get_frame_parameters(compressed)
437 530 self.assertEqual(params.content_size, 0)
438 531 self.assertEqual(params.window_size, 1048576)
439 532 self.assertEqual(params.dict_id, 0)
440 533 self.assertFalse(params.has_checksum)
441 534
442 535 h = hashlib.sha1(compressed).hexdigest()
443 536 self.assertEqual(h, '1ae31f270ed7de14235221a604b31ecd517ebd99')
444 537
445 538 def test_write_checksum(self):
446 539 no_checksum = io.BytesIO()
447 540 cctx = zstd.ZstdCompressor(level=1)
448 541 with cctx.write_to(no_checksum) as compressor:
449 542 self.assertEqual(compressor.write(b'foobar'), 0)
450 543
451 544 with_checksum = io.BytesIO()
452 545 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
453 546 with cctx.write_to(with_checksum) as compressor:
454 547 self.assertEqual(compressor.write(b'foobar'), 0)
455 548
456 549 no_params = zstd.get_frame_parameters(no_checksum.getvalue())
457 550 with_params = zstd.get_frame_parameters(with_checksum.getvalue())
458 551 self.assertEqual(no_params.content_size, 0)
459 552 self.assertEqual(with_params.content_size, 0)
460 553 self.assertEqual(no_params.dict_id, 0)
461 554 self.assertEqual(with_params.dict_id, 0)
462 555 self.assertFalse(no_params.has_checksum)
463 556 self.assertTrue(with_params.has_checksum)
464 557
465 558 self.assertEqual(len(with_checksum.getvalue()),
466 559 len(no_checksum.getvalue()) + 4)
467 560
468 561 def test_write_content_size(self):
469 562 no_size = io.BytesIO()
470 563 cctx = zstd.ZstdCompressor(level=1)
471 564 with cctx.write_to(no_size) as compressor:
472 565 self.assertEqual(compressor.write(b'foobar' * 256), 0)
473 566
474 567 with_size = io.BytesIO()
475 568 cctx = zstd.ZstdCompressor(level=1, write_content_size=True)
476 569 with cctx.write_to(with_size) as compressor:
477 570 self.assertEqual(compressor.write(b'foobar' * 256), 0)
478 571
479 572 # Source size is not known in streaming mode, so header not
480 573 # written.
481 574 self.assertEqual(len(with_size.getvalue()),
482 575 len(no_size.getvalue()))
483 576
484 577 # Declaring size will write the header.
485 578 with_size = io.BytesIO()
486 579 with cctx.write_to(with_size, size=len(b'foobar' * 256)) as compressor:
487 580 self.assertEqual(compressor.write(b'foobar' * 256), 0)
488 581
489 582 no_params = zstd.get_frame_parameters(no_size.getvalue())
490 583 with_params = zstd.get_frame_parameters(with_size.getvalue())
491 584 self.assertEqual(no_params.content_size, 0)
492 585 self.assertEqual(with_params.content_size, 1536)
493 586 self.assertEqual(no_params.dict_id, 0)
494 587 self.assertEqual(with_params.dict_id, 0)
495 588 self.assertFalse(no_params.has_checksum)
496 589 self.assertFalse(with_params.has_checksum)
497 590
498 591 self.assertEqual(len(with_size.getvalue()),
499 592 len(no_size.getvalue()) + 1)
500 593
501 594 def test_no_dict_id(self):
502 595 samples = []
503 596 for i in range(128):
504 597 samples.append(b'foo' * 64)
505 598 samples.append(b'bar' * 64)
506 599 samples.append(b'foobar' * 64)
507 600
508 601 d = zstd.train_dictionary(1024, samples)
509 602
510 603 with_dict_id = io.BytesIO()
511 604 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
512 605 with cctx.write_to(with_dict_id) as compressor:
513 606 self.assertEqual(compressor.write(b'foobarfoobar'), 0)
514 607
515 608 cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False)
516 609 no_dict_id = io.BytesIO()
517 610 with cctx.write_to(no_dict_id) as compressor:
518 611 self.assertEqual(compressor.write(b'foobarfoobar'), 0)
519 612
520 613 no_params = zstd.get_frame_parameters(no_dict_id.getvalue())
521 614 with_params = zstd.get_frame_parameters(with_dict_id.getvalue())
522 615 self.assertEqual(no_params.content_size, 0)
523 616 self.assertEqual(with_params.content_size, 0)
524 617 self.assertEqual(no_params.dict_id, 0)
525 618 self.assertEqual(with_params.dict_id, d.dict_id())
526 619 self.assertFalse(no_params.has_checksum)
527 620 self.assertFalse(with_params.has_checksum)
528 621
529 622 self.assertEqual(len(with_dict_id.getvalue()),
530 623 len(no_dict_id.getvalue()) + 4)
531 624
532 625 def test_memory_size(self):
533 626 cctx = zstd.ZstdCompressor(level=3)
534 627 buffer = io.BytesIO()
535 628 with cctx.write_to(buffer) as compressor:
536 629 size = compressor.memory_size()
537 630
538 631 self.assertGreater(size, 100000)
539 632
540 633 def test_write_size(self):
541 634 cctx = zstd.ZstdCompressor(level=3)
542 635 dest = OpCountingBytesIO()
543 636 with cctx.write_to(dest, write_size=1) as compressor:
544 637 self.assertEqual(compressor.write(b'foo'), 0)
545 638 self.assertEqual(compressor.write(b'bar'), 0)
546 639 self.assertEqual(compressor.write(b'foobar'), 0)
547 640
548 641 self.assertEqual(len(dest.getvalue()), dest._write_count)
549 642
550 643 def test_flush_repeated(self):
551 644 cctx = zstd.ZstdCompressor(level=3)
552 645 dest = OpCountingBytesIO()
553 646 with cctx.write_to(dest) as compressor:
554 647 self.assertEqual(compressor.write(b'foo'), 0)
555 648 self.assertEqual(dest._write_count, 0)
556 649 self.assertEqual(compressor.flush(), 12)
557 650 self.assertEqual(dest._write_count, 1)
558 651 self.assertEqual(compressor.write(b'bar'), 0)
559 652 self.assertEqual(dest._write_count, 1)
560 653 self.assertEqual(compressor.flush(), 6)
561 654 self.assertEqual(dest._write_count, 2)
562 655 self.assertEqual(compressor.write(b'baz'), 0)
563 656
564 657 self.assertEqual(dest._write_count, 3)
565 658
566 659 def test_flush_empty_block(self):
567 660 cctx = zstd.ZstdCompressor(level=3, write_checksum=True)
568 661 dest = OpCountingBytesIO()
569 662 with cctx.write_to(dest) as compressor:
570 663 self.assertEqual(compressor.write(b'foobar' * 8192), 0)
571 664 count = dest._write_count
572 665 offset = dest.tell()
573 666 self.assertEqual(compressor.flush(), 23)
574 667 self.assertGreater(dest._write_count, count)
575 668 self.assertGreater(dest.tell(), offset)
576 669 offset = dest.tell()
577 670 # Ending the write here should cause an empty block to be written
578 671 # to denote end of frame.
579 672
580 673 trailing = dest.getvalue()[offset:]
581 674 # 3 bytes block header + 4 bytes frame checksum
582 675 self.assertEqual(len(trailing), 7)
583 676
584 677 header = trailing[0:3]
585 678 self.assertEqual(header, b'\x01\x00\x00')
586 679
680 def test_multithreaded(self):
681 dest = io.BytesIO()
682 cctx = zstd.ZstdCompressor(threads=2)
683 with cctx.write_to(dest) as compressor:
684 compressor.write(b'a' * 1048576)
685 compressor.write(b'b' * 1048576)
686 compressor.write(b'c' * 1048576)
687
688 self.assertEqual(len(dest.getvalue()), 295)
689
587 690
588 691 @make_cffi
589 692 class TestCompressor_read_from(unittest.TestCase):
590 693 def test_type_validation(self):
591 694 cctx = zstd.ZstdCompressor()
592 695
593 696 # Object with read() works.
594 697 for chunk in cctx.read_from(io.BytesIO()):
595 698 pass
596 699
597 700 # Buffer protocol works.
598 701 for chunk in cctx.read_from(b'foobar'):
599 702 pass
600 703
601 704 with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'):
602 705 for chunk in cctx.read_from(True):
603 706 pass
604 707
605 708 def test_read_empty(self):
606 709 cctx = zstd.ZstdCompressor(level=1)
607 710
608 711 source = io.BytesIO()
609 712 it = cctx.read_from(source)
610 713 chunks = list(it)
611 714 self.assertEqual(len(chunks), 1)
612 715 compressed = b''.join(chunks)
613 716 self.assertEqual(compressed, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
614 717
615 718 # And again with the buffer protocol.
616 719 it = cctx.read_from(b'')
617 720 chunks = list(it)
618 721 self.assertEqual(len(chunks), 1)
619 722 compressed2 = b''.join(chunks)
620 723 self.assertEqual(compressed2, compressed)
621 724
622 725 def test_read_large(self):
623 726 cctx = zstd.ZstdCompressor(level=1)
624 727
625 728 source = io.BytesIO()
626 729 source.write(b'f' * zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE)
627 730 source.write(b'o')
628 731 source.seek(0)
629 732
630 733 # Creating an iterator should not perform any compression until
631 734 # first read.
632 735 it = cctx.read_from(source, size=len(source.getvalue()))
633 736 self.assertEqual(source.tell(), 0)
634 737
635 738 # We should have exactly 2 output chunks.
636 739 chunks = []
637 740 chunk = next(it)
638 741 self.assertIsNotNone(chunk)
639 742 self.assertEqual(source.tell(), zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE)
640 743 chunks.append(chunk)
641 744 chunk = next(it)
642 745 self.assertIsNotNone(chunk)
643 746 chunks.append(chunk)
644 747
645 748 self.assertEqual(source.tell(), len(source.getvalue()))
646 749
647 750 with self.assertRaises(StopIteration):
648 751 next(it)
649 752
650 753 # And again for good measure.
651 754 with self.assertRaises(StopIteration):
652 755 next(it)
653 756
654 757 # We should get the same output as the one-shot compression mechanism.
655 758 self.assertEqual(b''.join(chunks), cctx.compress(source.getvalue()))
656 759
657 760 params = zstd.get_frame_parameters(b''.join(chunks))
658 761 self.assertEqual(params.content_size, 0)
659 762 self.assertEqual(params.window_size, 262144)
660 763 self.assertEqual(params.dict_id, 0)
661 764 self.assertFalse(params.has_checksum)
662 765
663 766 # Now check the buffer protocol.
664 767 it = cctx.read_from(source.getvalue())
665 768 chunks = list(it)
666 769 self.assertEqual(len(chunks), 2)
667 770 self.assertEqual(b''.join(chunks), cctx.compress(source.getvalue()))
668 771
669 772 def test_read_write_size(self):
670 773 source = OpCountingBytesIO(b'foobarfoobar')
671 774 cctx = zstd.ZstdCompressor(level=3)
672 775 for chunk in cctx.read_from(source, read_size=1, write_size=1):
673 776 self.assertEqual(len(chunk), 1)
674 777
675 778 self.assertEqual(source._read_count, len(source.getvalue()) + 1)
779
780 def test_multithreaded(self):
781 source = io.BytesIO()
782 source.write(b'a' * 1048576)
783 source.write(b'b' * 1048576)
784 source.write(b'c' * 1048576)
785 source.seek(0)
786
787 cctx = zstd.ZstdCompressor(threads=2)
788
789 compressed = b''.join(cctx.read_from(source))
790 self.assertEqual(len(compressed), 295)
791
792
793 class TestCompressor_multi_compress_to_buffer(unittest.TestCase):
794 def test_multithreaded_unsupported(self):
795 cctx = zstd.ZstdCompressor(threads=2)
796
797 with self.assertRaisesRegexp(zstd.ZstdError, 'function cannot be called on ZstdCompressor configured for multi-threaded compression'):
798 cctx.multi_compress_to_buffer([b'foo'])
799
800 def test_invalid_inputs(self):
801 cctx = zstd.ZstdCompressor()
802
803 with self.assertRaises(TypeError):
804 cctx.multi_compress_to_buffer(True)
805
806 with self.assertRaises(TypeError):
807 cctx.multi_compress_to_buffer((1, 2))
808
809 with self.assertRaisesRegexp(TypeError, 'item 0 not a bytes like object'):
810 cctx.multi_compress_to_buffer([u'foo'])
811
812 def test_empty_input(self):
813 cctx = zstd.ZstdCompressor()
814
815 with self.assertRaisesRegexp(ValueError, 'no source elements found'):
816 cctx.multi_compress_to_buffer([])
817
818 with self.assertRaisesRegexp(ValueError, 'source elements are empty'):
819 cctx.multi_compress_to_buffer([b'', b'', b''])
820
821 def test_list_input(self):
822 cctx = zstd.ZstdCompressor(write_content_size=True, write_checksum=True)
823
824 original = [b'foo' * 12, b'bar' * 6]
825 frames = [cctx.compress(c) for c in original]
826 b = cctx.multi_compress_to_buffer(original)
827
828 self.assertIsInstance(b, zstd.BufferWithSegmentsCollection)
829
830 self.assertEqual(len(b), 2)
831 self.assertEqual(b.size(), 44)
832
833 self.assertEqual(b[0].tobytes(), frames[0])
834 self.assertEqual(b[1].tobytes(), frames[1])
835
836 def test_buffer_with_segments_input(self):
837 cctx = zstd.ZstdCompressor(write_content_size=True, write_checksum=True)
838
839 original = [b'foo' * 4, b'bar' * 6]
840 frames = [cctx.compress(c) for c in original]
841
842 offsets = struct.pack('=QQQQ', 0, len(original[0]),
843 len(original[0]), len(original[1]))
844 segments = zstd.BufferWithSegments(b''.join(original), offsets)
845
846 result = cctx.multi_compress_to_buffer(segments)
847
848 self.assertEqual(len(result), 2)
849 self.assertEqual(result.size(), 47)
850
851 self.assertEqual(result[0].tobytes(), frames[0])
852 self.assertEqual(result[1].tobytes(), frames[1])
853
854 def test_buffer_with_segments_collection_input(self):
855 cctx = zstd.ZstdCompressor(write_content_size=True, write_checksum=True)
856
857 original = [
858 b'foo1',
859 b'foo2' * 2,
860 b'foo3' * 3,
861 b'foo4' * 4,
862 b'foo5' * 5,
863 ]
864
865 frames = [cctx.compress(c) for c in original]
866
867 b = b''.join([original[0], original[1]])
868 b1 = zstd.BufferWithSegments(b, struct.pack('=QQQQ',
869 0, len(original[0]),
870 len(original[0]), len(original[1])))
871 b = b''.join([original[2], original[3], original[4]])
872 b2 = zstd.BufferWithSegments(b, struct.pack('=QQQQQQ',
873 0, len(original[2]),
874 len(original[2]), len(original[3]),
875 len(original[2]) + len(original[3]), len(original[4])))
876
877 c = zstd.BufferWithSegmentsCollection(b1, b2)
878
879 result = cctx.multi_compress_to_buffer(c)
880
881 self.assertEqual(len(result), len(frames))
882
883 for i, frame in enumerate(frames):
884 self.assertEqual(result[i].tobytes(), frame)
885
886 def test_multiple_threads(self):
887 # threads argument will cause multi-threaded ZSTD APIs to be used, which will
888 # make output different.
889 refcctx = zstd.ZstdCompressor(write_content_size=True, write_checksum=True)
890 reference = [refcctx.compress(b'x' * 64), refcctx.compress(b'y' * 64)]
891
892 cctx = zstd.ZstdCompressor(write_content_size=True, write_checksum=True)
893
894 frames = []
895 frames.extend(b'x' * 64 for i in range(256))
896 frames.extend(b'y' * 64 for i in range(256))
897
898 result = cctx.multi_compress_to_buffer(frames, threads=-1)
899
900 self.assertEqual(len(result), 512)
901 for i in range(512):
902 if i < 256:
903 self.assertEqual(result[i].tobytes(), reference[0])
904 else:
905 self.assertEqual(result[i].tobytes(), reference[1])
@@ -1,186 +1,123
1 import io
2
3 1 try:
4 2 import unittest2 as unittest
5 3 except ImportError:
6 4 import unittest
7 5
8 try:
9 import hypothesis
10 import hypothesis.strategies as strategies
11 except ImportError:
12 hypothesis = None
13
14 6 import zstd
15 7
16 8 from . common import (
17 9 make_cffi,
18 10 )
19 11
20 12
21 13 @make_cffi
22 14 class TestCompressionParameters(unittest.TestCase):
23 15 def test_init_bad_arg_type(self):
24 16 with self.assertRaises(TypeError):
25 17 zstd.CompressionParameters()
26 18
27 19 with self.assertRaises(TypeError):
28 20 zstd.CompressionParameters(0, 1)
29 21
30 22 def test_bounds(self):
31 23 zstd.CompressionParameters(zstd.WINDOWLOG_MIN,
32 24 zstd.CHAINLOG_MIN,
33 25 zstd.HASHLOG_MIN,
34 26 zstd.SEARCHLOG_MIN,
35 zstd.SEARCHLENGTH_MIN,
27 zstd.SEARCHLENGTH_MIN + 1,
36 28 zstd.TARGETLENGTH_MIN,
37 29 zstd.STRATEGY_FAST)
38 30
39 31 zstd.CompressionParameters(zstd.WINDOWLOG_MAX,
40 32 zstd.CHAINLOG_MAX,
41 33 zstd.HASHLOG_MAX,
42 34 zstd.SEARCHLOG_MAX,
43 zstd.SEARCHLENGTH_MAX,
35 zstd.SEARCHLENGTH_MAX - 1,
44 36 zstd.TARGETLENGTH_MAX,
45 37 zstd.STRATEGY_BTOPT)
46 38
47 39 def test_get_compression_parameters(self):
48 40 p = zstd.get_compression_parameters(1)
49 41 self.assertIsInstance(p, zstd.CompressionParameters)
50 42
51 43 self.assertEqual(p.window_log, 19)
52 44
53 45 def test_members(self):
54 46 p = zstd.CompressionParameters(10, 6, 7, 4, 5, 8, 1)
55 47 self.assertEqual(p.window_log, 10)
56 48 self.assertEqual(p.chain_log, 6)
57 49 self.assertEqual(p.hash_log, 7)
58 50 self.assertEqual(p.search_log, 4)
59 51 self.assertEqual(p.search_length, 5)
60 52 self.assertEqual(p.target_length, 8)
61 53 self.assertEqual(p.strategy, 1)
62 54
55 def test_estimated_compression_context_size(self):
56 p = zstd.CompressionParameters(20, 16, 17, 1, 5, 16, zstd.STRATEGY_DFAST)
57
58 # 32-bit has slightly different values from 64-bit.
59 self.assertAlmostEqual(p.estimated_compression_context_size(), 1287076,
60 delta=110)
61
63 62
64 63 @make_cffi
65 64 class TestFrameParameters(unittest.TestCase):
66 65 def test_invalid_type(self):
67 66 with self.assertRaises(TypeError):
68 67 zstd.get_frame_parameters(None)
69 68
70 69 with self.assertRaises(TypeError):
71 70 zstd.get_frame_parameters(u'foobarbaz')
72 71
73 72 def test_invalid_input_sizes(self):
74 73 with self.assertRaisesRegexp(zstd.ZstdError, 'not enough data for frame'):
75 74 zstd.get_frame_parameters(b'')
76 75
77 76 with self.assertRaisesRegexp(zstd.ZstdError, 'not enough data for frame'):
78 77 zstd.get_frame_parameters(zstd.FRAME_HEADER)
79 78
80 79 def test_invalid_frame(self):
81 80 with self.assertRaisesRegexp(zstd.ZstdError, 'Unknown frame descriptor'):
82 81 zstd.get_frame_parameters(b'foobarbaz')
83 82
84 83 def test_attributes(self):
85 84 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x00\x00')
86 85 self.assertEqual(params.content_size, 0)
87 86 self.assertEqual(params.window_size, 1024)
88 87 self.assertEqual(params.dict_id, 0)
89 88 self.assertFalse(params.has_checksum)
90 89
91 90 # Lowest 2 bits indicate a dictionary and length. Here, the dict id is 1 byte.
92 91 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x01\x00\xff')
93 92 self.assertEqual(params.content_size, 0)
94 93 self.assertEqual(params.window_size, 1024)
95 94 self.assertEqual(params.dict_id, 255)
96 95 self.assertFalse(params.has_checksum)
97 96
98 97 # Lowest 3rd bit indicates if checksum is present.
99 98 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x04\x00')
100 99 self.assertEqual(params.content_size, 0)
101 100 self.assertEqual(params.window_size, 1024)
102 101 self.assertEqual(params.dict_id, 0)
103 102 self.assertTrue(params.has_checksum)
104 103
105 104 # Upper 2 bits indicate content size.
106 105 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x40\x00\xff\x00')
107 106 self.assertEqual(params.content_size, 511)
108 107 self.assertEqual(params.window_size, 1024)
109 108 self.assertEqual(params.dict_id, 0)
110 109 self.assertFalse(params.has_checksum)
111 110
112 111 # Window descriptor is 2nd byte after frame header.
113 112 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x00\x40')
114 113 self.assertEqual(params.content_size, 0)
115 114 self.assertEqual(params.window_size, 262144)
116 115 self.assertEqual(params.dict_id, 0)
117 116 self.assertFalse(params.has_checksum)
118 117
119 118 # Set multiple things.
120 119 params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x45\x40\x0f\x10\x00')
121 120 self.assertEqual(params.content_size, 272)
122 121 self.assertEqual(params.window_size, 262144)
123 122 self.assertEqual(params.dict_id, 15)
124 123 self.assertTrue(params.has_checksum)
125
126
127 if hypothesis:
128 s_windowlog = strategies.integers(min_value=zstd.WINDOWLOG_MIN,
129 max_value=zstd.WINDOWLOG_MAX)
130 s_chainlog = strategies.integers(min_value=zstd.CHAINLOG_MIN,
131 max_value=zstd.CHAINLOG_MAX)
132 s_hashlog = strategies.integers(min_value=zstd.HASHLOG_MIN,
133 max_value=zstd.HASHLOG_MAX)
134 s_searchlog = strategies.integers(min_value=zstd.SEARCHLOG_MIN,
135 max_value=zstd.SEARCHLOG_MAX)
136 s_searchlength = strategies.integers(min_value=zstd.SEARCHLENGTH_MIN,
137 max_value=zstd.SEARCHLENGTH_MAX)
138 s_targetlength = strategies.integers(min_value=zstd.TARGETLENGTH_MIN,
139 max_value=zstd.TARGETLENGTH_MAX)
140 s_strategy = strategies.sampled_from((zstd.STRATEGY_FAST,
141 zstd.STRATEGY_DFAST,
142 zstd.STRATEGY_GREEDY,
143 zstd.STRATEGY_LAZY,
144 zstd.STRATEGY_LAZY2,
145 zstd.STRATEGY_BTLAZY2,
146 zstd.STRATEGY_BTOPT))
147
148
149 @make_cffi
150 class TestCompressionParametersHypothesis(unittest.TestCase):
151 @hypothesis.given(s_windowlog, s_chainlog, s_hashlog, s_searchlog,
152 s_searchlength, s_targetlength, s_strategy)
153 def test_valid_init(self, windowlog, chainlog, hashlog, searchlog,
154 searchlength, targetlength, strategy):
155 p = zstd.CompressionParameters(windowlog, chainlog, hashlog,
156 searchlog, searchlength,
157 targetlength, strategy)
158
159 # Verify we can instantiate a compressor with the supplied values.
160 # ZSTD_checkCParams moves the goal posts on us from what's advertised
161 # in the constants. So move along with them.
162 if searchlength == zstd.SEARCHLENGTH_MIN and strategy in (zstd.STRATEGY_FAST, zstd.STRATEGY_GREEDY):
163 searchlength += 1
164 p = zstd.CompressionParameters(windowlog, chainlog, hashlog,
165 searchlog, searchlength,
166 targetlength, strategy)
167 elif searchlength == zstd.SEARCHLENGTH_MAX and strategy != zstd.STRATEGY_FAST:
168 searchlength -= 1
169 p = zstd.CompressionParameters(windowlog, chainlog, hashlog,
170 searchlog, searchlength,
171 targetlength, strategy)
172
173 cctx = zstd.ZstdCompressor(compression_params=p)
174 with cctx.write_to(io.BytesIO()):
175 pass
176
177 @hypothesis.given(s_windowlog, s_chainlog, s_hashlog, s_searchlog,
178 s_searchlength, s_targetlength, s_strategy)
179 def test_estimate_compression_context_size(self, windowlog, chainlog,
180 hashlog, searchlog,
181 searchlength, targetlength,
182 strategy):
183 p = zstd.CompressionParameters(windowlog, chainlog, hashlog,
184 searchlog, searchlength,
185 targetlength, strategy)
186 size = zstd.estimate_compression_context_size(p)
@@ -1,577 +1,741
1 1 import io
2 2 import random
3 3 import struct
4 4 import sys
5 5
6 6 try:
7 7 import unittest2 as unittest
8 8 except ImportError:
9 9 import unittest
10 10
11 11 import zstd
12 12
13 13 from .common import (
14 14 make_cffi,
15 15 OpCountingBytesIO,
16 16 )
17 17
18 18
19 19 if sys.version_info[0] >= 3:
20 20 next = lambda it: it.__next__()
21 21 else:
22 22 next = lambda it: it.next()
23 23
24 24
25 25 @make_cffi
26 26 class TestDecompressor_decompress(unittest.TestCase):
27 27 def test_empty_input(self):
28 28 dctx = zstd.ZstdDecompressor()
29 29
30 30 with self.assertRaisesRegexp(zstd.ZstdError, 'input data invalid'):
31 31 dctx.decompress(b'')
32 32
33 33 def test_invalid_input(self):
34 34 dctx = zstd.ZstdDecompressor()
35 35
36 36 with self.assertRaisesRegexp(zstd.ZstdError, 'input data invalid'):
37 37 dctx.decompress(b'foobar')
38 38
39 39 def test_no_content_size_in_frame(self):
40 40 cctx = zstd.ZstdCompressor(write_content_size=False)
41 41 compressed = cctx.compress(b'foobar')
42 42
43 43 dctx = zstd.ZstdDecompressor()
44 44 with self.assertRaisesRegexp(zstd.ZstdError, 'input data invalid'):
45 45 dctx.decompress(compressed)
46 46
47 47 def test_content_size_present(self):
48 48 cctx = zstd.ZstdCompressor(write_content_size=True)
49 49 compressed = cctx.compress(b'foobar')
50 50
51 51 dctx = zstd.ZstdDecompressor()
52 decompressed = dctx.decompress(compressed)
52 decompressed = dctx.decompress(compressed)
53 53 self.assertEqual(decompressed, b'foobar')
54 54
55 55 def test_max_output_size(self):
56 56 cctx = zstd.ZstdCompressor(write_content_size=False)
57 57 source = b'foobar' * 256
58 58 compressed = cctx.compress(source)
59 59
60 60 dctx = zstd.ZstdDecompressor()
61 61 # Will fit into buffer exactly the size of input.
62 62 decompressed = dctx.decompress(compressed, max_output_size=len(source))
63 63 self.assertEqual(decompressed, source)
64 64
65 65 # Input size - 1 fails
66 66 with self.assertRaisesRegexp(zstd.ZstdError, 'Destination buffer is too small'):
67 67 dctx.decompress(compressed, max_output_size=len(source) - 1)
68 68
69 69 # Input size + 1 works
70 70 decompressed = dctx.decompress(compressed, max_output_size=len(source) + 1)
71 71 self.assertEqual(decompressed, source)
72 72
73 73 # A much larger buffer works.
74 74 decompressed = dctx.decompress(compressed, max_output_size=len(source) * 64)
75 75 self.assertEqual(decompressed, source)
76 76
77 77 def test_stupidly_large_output_buffer(self):
78 78 cctx = zstd.ZstdCompressor(write_content_size=False)
79 79 compressed = cctx.compress(b'foobar' * 256)
80 80 dctx = zstd.ZstdDecompressor()
81 81
82 82 # Will get OverflowError on some Python distributions that can't
83 83 # handle really large integers.
84 84 with self.assertRaises((MemoryError, OverflowError)):
85 85 dctx.decompress(compressed, max_output_size=2**62)
86 86
87 87 def test_dictionary(self):
88 88 samples = []
89 89 for i in range(128):
90 90 samples.append(b'foo' * 64)
91 91 samples.append(b'bar' * 64)
92 92 samples.append(b'foobar' * 64)
93 93
94 94 d = zstd.train_dictionary(8192, samples)
95 95
96 96 orig = b'foobar' * 16384
97 97 cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_content_size=True)
98 98 compressed = cctx.compress(orig)
99 99
100 100 dctx = zstd.ZstdDecompressor(dict_data=d)
101 101 decompressed = dctx.decompress(compressed)
102 102
103 103 self.assertEqual(decompressed, orig)
104 104
105 105 def test_dictionary_multiple(self):
106 106 samples = []
107 107 for i in range(128):
108 108 samples.append(b'foo' * 64)
109 109 samples.append(b'bar' * 64)
110 110 samples.append(b'foobar' * 64)
111 111
112 112 d = zstd.train_dictionary(8192, samples)
113 113
114 114 sources = (b'foobar' * 8192, b'foo' * 8192, b'bar' * 8192)
115 115 compressed = []
116 116 cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_content_size=True)
117 117 for source in sources:
118 118 compressed.append(cctx.compress(source))
119 119
120 120 dctx = zstd.ZstdDecompressor(dict_data=d)
121 121 for i in range(len(sources)):
122 122 decompressed = dctx.decompress(compressed[i])
123 123 self.assertEqual(decompressed, sources[i])
124 124
125 125
126 126 @make_cffi
127 127 class TestDecompressor_copy_stream(unittest.TestCase):
128 128 def test_no_read(self):
129 129 source = object()
130 130 dest = io.BytesIO()
131 131
132 132 dctx = zstd.ZstdDecompressor()
133 133 with self.assertRaises(ValueError):
134 134 dctx.copy_stream(source, dest)
135 135
136 136 def test_no_write(self):
137 137 source = io.BytesIO()
138 138 dest = object()
139 139
140 140 dctx = zstd.ZstdDecompressor()
141 141 with self.assertRaises(ValueError):
142 142 dctx.copy_stream(source, dest)
143 143
144 144 def test_empty(self):
145 145 source = io.BytesIO()
146 146 dest = io.BytesIO()
147 147
148 148 dctx = zstd.ZstdDecompressor()
149 149 # TODO should this raise an error?
150 150 r, w = dctx.copy_stream(source, dest)
151 151
152 152 self.assertEqual(r, 0)
153 153 self.assertEqual(w, 0)
154 154 self.assertEqual(dest.getvalue(), b'')
155 155
156 156 def test_large_data(self):
157 157 source = io.BytesIO()
158 158 for i in range(255):
159 159 source.write(struct.Struct('>B').pack(i) * 16384)
160 160 source.seek(0)
161 161
162 162 compressed = io.BytesIO()
163 163 cctx = zstd.ZstdCompressor()
164 164 cctx.copy_stream(source, compressed)
165 165
166 166 compressed.seek(0)
167 167 dest = io.BytesIO()
168 168 dctx = zstd.ZstdDecompressor()
169 169 r, w = dctx.copy_stream(compressed, dest)
170 170
171 171 self.assertEqual(r, len(compressed.getvalue()))
172 172 self.assertEqual(w, len(source.getvalue()))
173 173
174 174 def test_read_write_size(self):
175 175 source = OpCountingBytesIO(zstd.ZstdCompressor().compress(
176 176 b'foobarfoobar'))
177 177
178 178 dest = OpCountingBytesIO()
179 179 dctx = zstd.ZstdDecompressor()
180 180 r, w = dctx.copy_stream(source, dest, read_size=1, write_size=1)
181 181
182 182 self.assertEqual(r, len(source.getvalue()))
183 183 self.assertEqual(w, len(b'foobarfoobar'))
184 184 self.assertEqual(source._read_count, len(source.getvalue()) + 1)
185 185 self.assertEqual(dest._write_count, len(dest.getvalue()))
186 186
187 187
188 188 @make_cffi
189 189 class TestDecompressor_decompressobj(unittest.TestCase):
190 190 def test_simple(self):
191 191 data = zstd.ZstdCompressor(level=1).compress(b'foobar')
192 192
193 193 dctx = zstd.ZstdDecompressor()
194 194 dobj = dctx.decompressobj()
195 195 self.assertEqual(dobj.decompress(data), b'foobar')
196 196
197 197 def test_reuse(self):
198 198 data = zstd.ZstdCompressor(level=1).compress(b'foobar')
199 199
200 200 dctx = zstd.ZstdDecompressor()
201 201 dobj = dctx.decompressobj()
202 202 dobj.decompress(data)
203 203
204 204 with self.assertRaisesRegexp(zstd.ZstdError, 'cannot use a decompressobj'):
205 205 dobj.decompress(data)
206 206
207 207
208 208 def decompress_via_writer(data):
209 209 buffer = io.BytesIO()
210 210 dctx = zstd.ZstdDecompressor()
211 211 with dctx.write_to(buffer) as decompressor:
212 212 decompressor.write(data)
213 213 return buffer.getvalue()
214 214
215 215
216 216 @make_cffi
217 217 class TestDecompressor_write_to(unittest.TestCase):
218 218 def test_empty_roundtrip(self):
219 219 cctx = zstd.ZstdCompressor()
220 220 empty = cctx.compress(b'')
221 221 self.assertEqual(decompress_via_writer(empty), b'')
222 222
223 223 def test_large_roundtrip(self):
224 224 chunks = []
225 225 for i in range(255):
226 226 chunks.append(struct.Struct('>B').pack(i) * 16384)
227 227 orig = b''.join(chunks)
228 228 cctx = zstd.ZstdCompressor()
229 229 compressed = cctx.compress(orig)
230 230
231 231 self.assertEqual(decompress_via_writer(compressed), orig)
232 232
233 233 def test_multiple_calls(self):
234 234 chunks = []
235 235 for i in range(255):
236 236 for j in range(255):
237 237 chunks.append(struct.Struct('>B').pack(j) * i)
238 238
239 239 orig = b''.join(chunks)
240 240 cctx = zstd.ZstdCompressor()
241 241 compressed = cctx.compress(orig)
242 242
243 243 buffer = io.BytesIO()
244 244 dctx = zstd.ZstdDecompressor()
245 245 with dctx.write_to(buffer) as decompressor:
246 246 pos = 0
247 247 while pos < len(compressed):
248 248 pos2 = pos + 8192
249 249 decompressor.write(compressed[pos:pos2])
250 250 pos += 8192
251 251 self.assertEqual(buffer.getvalue(), orig)
252 252
253 253 def test_dictionary(self):
254 254 samples = []
255 255 for i in range(128):
256 256 samples.append(b'foo' * 64)
257 257 samples.append(b'bar' * 64)
258 258 samples.append(b'foobar' * 64)
259 259
260 260 d = zstd.train_dictionary(8192, samples)
261 261
262 262 orig = b'foobar' * 16384
263 263 buffer = io.BytesIO()
264 264 cctx = zstd.ZstdCompressor(dict_data=d)
265 265 with cctx.write_to(buffer) as compressor:
266 266 self.assertEqual(compressor.write(orig), 1544)
267 267
268 268 compressed = buffer.getvalue()
269 269 buffer = io.BytesIO()
270 270
271 271 dctx = zstd.ZstdDecompressor(dict_data=d)
272 272 with dctx.write_to(buffer) as decompressor:
273 273 self.assertEqual(decompressor.write(compressed), len(orig))
274 274
275 275 self.assertEqual(buffer.getvalue(), orig)
276 276
277 277 def test_memory_size(self):
278 278 dctx = zstd.ZstdDecompressor()
279 279 buffer = io.BytesIO()
280 280 with dctx.write_to(buffer) as decompressor:
281 281 size = decompressor.memory_size()
282 282
283 283 self.assertGreater(size, 100000)
284 284
285 285 def test_write_size(self):
286 286 source = zstd.ZstdCompressor().compress(b'foobarfoobar')
287 287 dest = OpCountingBytesIO()
288 288 dctx = zstd.ZstdDecompressor()
289 289 with dctx.write_to(dest, write_size=1) as decompressor:
290 290 s = struct.Struct('>B')
291 291 for c in source:
292 292 if not isinstance(c, str):
293 293 c = s.pack(c)
294 294 decompressor.write(c)
295 295
296
297 296 self.assertEqual(dest.getvalue(), b'foobarfoobar')
298 297 self.assertEqual(dest._write_count, len(dest.getvalue()))
299 298
300 299
301 300 @make_cffi
302 301 class TestDecompressor_read_from(unittest.TestCase):
303 302 def test_type_validation(self):
304 303 dctx = zstd.ZstdDecompressor()
305 304
306 305 # Object with read() works.
307 306 dctx.read_from(io.BytesIO())
308 307
309 308 # Buffer protocol works.
310 309 dctx.read_from(b'foobar')
311 310
312 311 with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'):
313 312 b''.join(dctx.read_from(True))
314 313
315 314 def test_empty_input(self):
316 315 dctx = zstd.ZstdDecompressor()
317 316
318 317 source = io.BytesIO()
319 318 it = dctx.read_from(source)
320 319 # TODO this is arguably wrong. Should get an error about missing frame foo.
321 320 with self.assertRaises(StopIteration):
322 321 next(it)
323 322
324 323 it = dctx.read_from(b'')
325 324 with self.assertRaises(StopIteration):
326 325 next(it)
327 326
328 327 def test_invalid_input(self):
329 328 dctx = zstd.ZstdDecompressor()
330 329
331 330 source = io.BytesIO(b'foobar')
332 331 it = dctx.read_from(source)
333 332 with self.assertRaisesRegexp(zstd.ZstdError, 'Unknown frame descriptor'):
334 333 next(it)
335 334
336 335 it = dctx.read_from(b'foobar')
337 336 with self.assertRaisesRegexp(zstd.ZstdError, 'Unknown frame descriptor'):
338 337 next(it)
339 338
340 339 def test_empty_roundtrip(self):
341 340 cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
342 341 empty = cctx.compress(b'')
343 342
344 343 source = io.BytesIO(empty)
345 344 source.seek(0)
346 345
347 346 dctx = zstd.ZstdDecompressor()
348 347 it = dctx.read_from(source)
349 348
350 349 # No chunks should be emitted since there is no data.
351 350 with self.assertRaises(StopIteration):
352 351 next(it)
353 352
354 353 # Again for good measure.
355 354 with self.assertRaises(StopIteration):
356 355 next(it)
357 356
358 357 def test_skip_bytes_too_large(self):
359 358 dctx = zstd.ZstdDecompressor()
360 359
361 360 with self.assertRaisesRegexp(ValueError, 'skip_bytes must be smaller than read_size'):
362 361 b''.join(dctx.read_from(b'', skip_bytes=1, read_size=1))
363 362
364 363 with self.assertRaisesRegexp(ValueError, 'skip_bytes larger than first input chunk'):
365 364 b''.join(dctx.read_from(b'foobar', skip_bytes=10))
366 365
367 366 def test_skip_bytes(self):
368 367 cctx = zstd.ZstdCompressor(write_content_size=False)
369 368 compressed = cctx.compress(b'foobar')
370 369
371 370 dctx = zstd.ZstdDecompressor()
372 371 output = b''.join(dctx.read_from(b'hdr' + compressed, skip_bytes=3))
373 372 self.assertEqual(output, b'foobar')
374 373
375 374 def test_large_output(self):
376 375 source = io.BytesIO()
377 376 source.write(b'f' * zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE)
378 377 source.write(b'o')
379 378 source.seek(0)
380 379
381 380 cctx = zstd.ZstdCompressor(level=1)
382 381 compressed = io.BytesIO(cctx.compress(source.getvalue()))
383 382 compressed.seek(0)
384 383
385 384 dctx = zstd.ZstdDecompressor()
386 385 it = dctx.read_from(compressed)
387 386
388 387 chunks = []
389 388 chunks.append(next(it))
390 389 chunks.append(next(it))
391 390
392 391 with self.assertRaises(StopIteration):
393 392 next(it)
394 393
395 394 decompressed = b''.join(chunks)
396 395 self.assertEqual(decompressed, source.getvalue())
397 396
398 397 # And again with buffer protocol.
399 398 it = dctx.read_from(compressed.getvalue())
400 399 chunks = []
401 400 chunks.append(next(it))
402 401 chunks.append(next(it))
403 402
404 403 with self.assertRaises(StopIteration):
405 404 next(it)
406 405
407 406 decompressed = b''.join(chunks)
408 407 self.assertEqual(decompressed, source.getvalue())
409 408
410 409 def test_large_input(self):
411 410 bytes = list(struct.Struct('>B').pack(i) for i in range(256))
412 411 compressed = io.BytesIO()
413 412 input_size = 0
414 413 cctx = zstd.ZstdCompressor(level=1)
415 414 with cctx.write_to(compressed) as compressor:
416 415 while True:
417 416 compressor.write(random.choice(bytes))
418 417 input_size += 1
419 418
420 419 have_compressed = len(compressed.getvalue()) > zstd.DECOMPRESSION_RECOMMENDED_INPUT_SIZE
421 420 have_raw = input_size > zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE * 2
422 421 if have_compressed and have_raw:
423 422 break
424 423
425 424 compressed.seek(0)
426 425 self.assertGreater(len(compressed.getvalue()),
427 426 zstd.DECOMPRESSION_RECOMMENDED_INPUT_SIZE)
428 427
429 428 dctx = zstd.ZstdDecompressor()
430 429 it = dctx.read_from(compressed)
431 430
432 431 chunks = []
433 432 chunks.append(next(it))
434 433 chunks.append(next(it))
435 434 chunks.append(next(it))
436 435
437 436 with self.assertRaises(StopIteration):
438 437 next(it)
439 438
440 439 decompressed = b''.join(chunks)
441 440 self.assertEqual(len(decompressed), input_size)
442 441
443 442 # And again with buffer protocol.
444 443 it = dctx.read_from(compressed.getvalue())
445 444
446 445 chunks = []
447 446 chunks.append(next(it))
448 447 chunks.append(next(it))
449 448 chunks.append(next(it))
450 449
451 450 with self.assertRaises(StopIteration):
452 451 next(it)
453 452
454 453 decompressed = b''.join(chunks)
455 454 self.assertEqual(len(decompressed), input_size)
456 455
457 456 def test_interesting(self):
458 457 # Found this edge case via fuzzing.
459 458 cctx = zstd.ZstdCompressor(level=1)
460 459
461 460 source = io.BytesIO()
462 461
463 462 compressed = io.BytesIO()
464 463 with cctx.write_to(compressed) as compressor:
465 464 for i in range(256):
466 465 chunk = b'\0' * 1024
467 466 compressor.write(chunk)
468 467 source.write(chunk)
469 468
470 469 dctx = zstd.ZstdDecompressor()
471 470
472 471 simple = dctx.decompress(compressed.getvalue(),
473 472 max_output_size=len(source.getvalue()))
474 473 self.assertEqual(simple, source.getvalue())
475 474
476 475 compressed.seek(0)
477 476 streamed = b''.join(dctx.read_from(compressed))
478 477 self.assertEqual(streamed, source.getvalue())
479 478
480 479 def test_read_write_size(self):
481 480 source = OpCountingBytesIO(zstd.ZstdCompressor().compress(b'foobarfoobar'))
482 481 dctx = zstd.ZstdDecompressor()
483 482 for chunk in dctx.read_from(source, read_size=1, write_size=1):
484 483 self.assertEqual(len(chunk), 1)
485 484
486 485 self.assertEqual(source._read_count, len(source.getvalue()))
487 486
488 487
489 488 @make_cffi
490 489 class TestDecompressor_content_dict_chain(unittest.TestCase):
491 490 def test_bad_inputs_simple(self):
492 491 dctx = zstd.ZstdDecompressor()
493 492
494 493 with self.assertRaises(TypeError):
495 494 dctx.decompress_content_dict_chain(b'foo')
496 495
497 496 with self.assertRaises(TypeError):
498 497 dctx.decompress_content_dict_chain((b'foo', b'bar'))
499 498
500 499 with self.assertRaisesRegexp(ValueError, 'empty input chain'):
501 500 dctx.decompress_content_dict_chain([])
502 501
503 502 with self.assertRaisesRegexp(ValueError, 'chunk 0 must be bytes'):
504 503 dctx.decompress_content_dict_chain([u'foo'])
505 504
506 505 with self.assertRaisesRegexp(ValueError, 'chunk 0 must be bytes'):
507 506 dctx.decompress_content_dict_chain([True])
508 507
509 508 with self.assertRaisesRegexp(ValueError, 'chunk 0 is too small to contain a zstd frame'):
510 509 dctx.decompress_content_dict_chain([zstd.FRAME_HEADER])
511 510
512 511 with self.assertRaisesRegexp(ValueError, 'chunk 0 is not a valid zstd frame'):
513 512 dctx.decompress_content_dict_chain([b'foo' * 8])
514 513
515 514 no_size = zstd.ZstdCompressor().compress(b'foo' * 64)
516 515
517 516 with self.assertRaisesRegexp(ValueError, 'chunk 0 missing content size in frame'):
518 517 dctx.decompress_content_dict_chain([no_size])
519 518
520 519 # Corrupt first frame.
521 520 frame = zstd.ZstdCompressor(write_content_size=True).compress(b'foo' * 64)
522 521 frame = frame[0:12] + frame[15:]
523 522 with self.assertRaisesRegexp(zstd.ZstdError, 'could not decompress chunk 0'):
524 523 dctx.decompress_content_dict_chain([frame])
525 524
526 525 def test_bad_subsequent_input(self):
527 526 initial = zstd.ZstdCompressor(write_content_size=True).compress(b'foo' * 64)
528 527
529 528 dctx = zstd.ZstdDecompressor()
530 529
531 530 with self.assertRaisesRegexp(ValueError, 'chunk 1 must be bytes'):
532 531 dctx.decompress_content_dict_chain([initial, u'foo'])
533 532
534 533 with self.assertRaisesRegexp(ValueError, 'chunk 1 must be bytes'):
535 534 dctx.decompress_content_dict_chain([initial, None])
536 535
537 536 with self.assertRaisesRegexp(ValueError, 'chunk 1 is too small to contain a zstd frame'):
538 537 dctx.decompress_content_dict_chain([initial, zstd.FRAME_HEADER])
539 538
540 539 with self.assertRaisesRegexp(ValueError, 'chunk 1 is not a valid zstd frame'):
541 540 dctx.decompress_content_dict_chain([initial, b'foo' * 8])
542 541
543 542 no_size = zstd.ZstdCompressor().compress(b'foo' * 64)
544 543
545 544 with self.assertRaisesRegexp(ValueError, 'chunk 1 missing content size in frame'):
546 545 dctx.decompress_content_dict_chain([initial, no_size])
547 546
548 547 # Corrupt second frame.
549 548 cctx = zstd.ZstdCompressor(write_content_size=True, dict_data=zstd.ZstdCompressionDict(b'foo' * 64))
550 549 frame = cctx.compress(b'bar' * 64)
551 550 frame = frame[0:12] + frame[15:]
552 551
553 552 with self.assertRaisesRegexp(zstd.ZstdError, 'could not decompress chunk 1'):
554 553 dctx.decompress_content_dict_chain([initial, frame])
555 554
556 555 def test_simple(self):
557 556 original = [
558 557 b'foo' * 64,
559 558 b'foobar' * 64,
560 559 b'baz' * 64,
561 560 b'foobaz' * 64,
562 561 b'foobarbaz' * 64,
563 562 ]
564 563
565 564 chunks = []
566 565 chunks.append(zstd.ZstdCompressor(write_content_size=True).compress(original[0]))
567 566 for i, chunk in enumerate(original[1:]):
568 567 d = zstd.ZstdCompressionDict(original[i])
569 568 cctx = zstd.ZstdCompressor(dict_data=d, write_content_size=True)
570 569 chunks.append(cctx.compress(chunk))
571 570
572 571 for i in range(1, len(original)):
573 572 chain = chunks[0:i]
574 573 expected = original[i - 1]
575 574 dctx = zstd.ZstdDecompressor()
576 575 decompressed = dctx.decompress_content_dict_chain(chain)
577 576 self.assertEqual(decompressed, expected)
577
578
579 # TODO enable for CFFI
580 class TestDecompressor_multi_decompress_to_buffer(unittest.TestCase):
581 def test_invalid_inputs(self):
582 dctx = zstd.ZstdDecompressor()
583
584 with self.assertRaises(TypeError):
585 dctx.multi_decompress_to_buffer(True)
586
587 with self.assertRaises(TypeError):
588 dctx.multi_decompress_to_buffer((1, 2))
589
590 with self.assertRaisesRegexp(TypeError, 'item 0 not a bytes like object'):
591 dctx.multi_decompress_to_buffer([u'foo'])
592
593 with self.assertRaisesRegexp(ValueError, 'could not determine decompressed size of item 0'):
594 dctx.multi_decompress_to_buffer([b'foobarbaz'])
595
596 def test_list_input(self):
597 cctx = zstd.ZstdCompressor(write_content_size=True)
598
599 original = [b'foo' * 4, b'bar' * 6]
600 frames = [cctx.compress(d) for d in original]
601
602 dctx = zstd.ZstdDecompressor()
603 result = dctx.multi_decompress_to_buffer(frames)
604
605 self.assertEqual(len(result), len(frames))
606 self.assertEqual(result.size(), sum(map(len, original)))
607
608 for i, data in enumerate(original):
609 self.assertEqual(result[i].tobytes(), data)
610
611 self.assertEqual(result[0].offset, 0)
612 self.assertEqual(len(result[0]), 12)
613 self.assertEqual(result[1].offset, 12)
614 self.assertEqual(len(result[1]), 18)
615
616 def test_list_input_frame_sizes(self):
617 cctx = zstd.ZstdCompressor(write_content_size=False)
618
619 original = [b'foo' * 4, b'bar' * 6, b'baz' * 8]
620 frames = [cctx.compress(d) for d in original]
621 sizes = struct.pack('=' + 'Q' * len(original), *map(len, original))
622
623 dctx = zstd.ZstdDecompressor()
624 result = dctx.multi_decompress_to_buffer(frames, decompressed_sizes=sizes)
625
626 self.assertEqual(len(result), len(frames))
627 self.assertEqual(result.size(), sum(map(len, original)))
628
629 for i, data in enumerate(original):
630 self.assertEqual(result[i].tobytes(), data)
631
632 def test_buffer_with_segments_input(self):
633 cctx = zstd.ZstdCompressor(write_content_size=True)
634
635 original = [b'foo' * 4, b'bar' * 6]
636 frames = [cctx.compress(d) for d in original]
637
638 dctx = zstd.ZstdDecompressor()
639
640 segments = struct.pack('=QQQQ', 0, len(frames[0]), len(frames[0]), len(frames[1]))
641 b = zstd.BufferWithSegments(b''.join(frames), segments)
642
643 result = dctx.multi_decompress_to_buffer(b)
644
645 self.assertEqual(len(result), len(frames))
646 self.assertEqual(result[0].offset, 0)
647 self.assertEqual(len(result[0]), 12)
648 self.assertEqual(result[1].offset, 12)
649 self.assertEqual(len(result[1]), 18)
650
651 def test_buffer_with_segments_sizes(self):
652 cctx = zstd.ZstdCompressor(write_content_size=False)
653 original = [b'foo' * 4, b'bar' * 6, b'baz' * 8]
654 frames = [cctx.compress(d) for d in original]
655 sizes = struct.pack('=' + 'Q' * len(original), *map(len, original))
656
657 segments = struct.pack('=QQQQQQ', 0, len(frames[0]),
658 len(frames[0]), len(frames[1]),
659 len(frames[0]) + len(frames[1]), len(frames[2]))
660 b = zstd.BufferWithSegments(b''.join(frames), segments)
661
662 dctx = zstd.ZstdDecompressor()
663 result = dctx.multi_decompress_to_buffer(b, decompressed_sizes=sizes)
664
665 self.assertEqual(len(result), len(frames))
666 self.assertEqual(result.size(), sum(map(len, original)))
667
668 for i, data in enumerate(original):
669 self.assertEqual(result[i].tobytes(), data)
670
671 def test_buffer_with_segments_collection_input(self):
672 cctx = zstd.ZstdCompressor(write_content_size=True)
673
674 original = [
675 b'foo0' * 2,
676 b'foo1' * 3,
677 b'foo2' * 4,
678 b'foo3' * 5,
679 b'foo4' * 6,
680 ]
681
682 frames = cctx.multi_compress_to_buffer(original)
683
684 # Check round trip.
685 dctx = zstd.ZstdDecompressor()
686 decompressed = dctx.multi_decompress_to_buffer(frames, threads=3)
687
688 self.assertEqual(len(decompressed), len(original))
689
690 for i, data in enumerate(original):
691 self.assertEqual(data, decompressed[i].tobytes())
692
693 # And a manual mode.
694 b = b''.join([frames[0].tobytes(), frames[1].tobytes()])
695 b1 = zstd.BufferWithSegments(b, struct.pack('=QQQQ',
696 0, len(frames[0]),
697 len(frames[0]), len(frames[1])))
698
699 b = b''.join([frames[2].tobytes(), frames[3].tobytes(), frames[4].tobytes()])
700 b2 = zstd.BufferWithSegments(b, struct.pack('=QQQQQQ',
701 0, len(frames[2]),
702 len(frames[2]), len(frames[3]),
703 len(frames[2]) + len(frames[3]), len(frames[4])))
704
705 c = zstd.BufferWithSegmentsCollection(b1, b2)
706
707 dctx = zstd.ZstdDecompressor()
708 decompressed = dctx.multi_decompress_to_buffer(c)
709
710 self.assertEqual(len(decompressed), 5)
711 for i in range(5):
712 self.assertEqual(decompressed[i].tobytes(), original[i])
713
714 def test_multiple_threads(self):
715 cctx = zstd.ZstdCompressor(write_content_size=True)
716
717 frames = []
718 frames.extend(cctx.compress(b'x' * 64) for i in range(256))
719 frames.extend(cctx.compress(b'y' * 64) for i in range(256))
720
721 dctx = zstd.ZstdDecompressor()
722 result = dctx.multi_decompress_to_buffer(frames, threads=-1)
723
724 self.assertEqual(len(result), len(frames))
725 self.assertEqual(result.size(), 2 * 64 * 256)
726 self.assertEqual(result[0].tobytes(), b'x' * 64)
727 self.assertEqual(result[256].tobytes(), b'y' * 64)
728
729 def test_item_failure(self):
730 cctx = zstd.ZstdCompressor(write_content_size=True)
731 frames = [cctx.compress(b'x' * 128), cctx.compress(b'y' * 128)]
732
733 frames[1] = frames[1] + b'extra'
734
735 dctx = zstd.ZstdDecompressor()
736
737 with self.assertRaisesRegexp(zstd.ZstdError, 'error decompressing item 1: Src size incorrect'):
738 dctx.multi_decompress_to_buffer(frames)
739
740 with self.assertRaisesRegexp(zstd.ZstdError, 'error decompressing item 1: Src size incorrect'):
741 dctx.multi_decompress_to_buffer(frames, threads=2)
@@ -1,50 +1,110
1 1 import sys
2 2
3 3 try:
4 4 import unittest2 as unittest
5 5 except ImportError:
6 6 import unittest
7 7
8 8 import zstd
9 9
10 10 from . common import (
11 11 make_cffi,
12 12 )
13 13
14 14 if sys.version_info[0] >= 3:
15 15 int_type = int
16 16 else:
17 17 int_type = long
18 18
19 19
20 20 @make_cffi
21 21 class TestTrainDictionary(unittest.TestCase):
22 22 def test_no_args(self):
23 23 with self.assertRaises(TypeError):
24 24 zstd.train_dictionary()
25 25
26 26 def test_bad_args(self):
27 27 with self.assertRaises(TypeError):
28 28 zstd.train_dictionary(8192, u'foo')
29 29
30 30 with self.assertRaises(ValueError):
31 31 zstd.train_dictionary(8192, [u'foo'])
32 32
33 33 def test_basic(self):
34 34 samples = []
35 35 for i in range(128):
36 36 samples.append(b'foo' * 64)
37 37 samples.append(b'bar' * 64)
38 38 samples.append(b'foobar' * 64)
39 39 samples.append(b'baz' * 64)
40 40 samples.append(b'foobaz' * 64)
41 41 samples.append(b'bazfoo' * 64)
42 42
43 43 d = zstd.train_dictionary(8192, samples)
44 44 self.assertLessEqual(len(d), 8192)
45 45
46 46 dict_id = d.dict_id()
47 47 self.assertIsInstance(dict_id, int_type)
48 48
49 49 data = d.as_bytes()
50 50 self.assertEqual(data[0:4], b'\x37\xa4\x30\xec')
51
52 def test_set_dict_id(self):
53 samples = []
54 for i in range(128):
55 samples.append(b'foo' * 64)
56 samples.append(b'foobar' * 64)
57
58 d = zstd.train_dictionary(8192, samples, dict_id=42)
59 self.assertEqual(d.dict_id(), 42)
60
61
62 @make_cffi
63 class TestTrainCoverDictionary(unittest.TestCase):
64 def test_no_args(self):
65 with self.assertRaises(TypeError):
66 zstd.train_cover_dictionary()
67
68 def test_bad_args(self):
69 with self.assertRaises(TypeError):
70 zstd.train_cover_dictionary(8192, u'foo')
71
72 with self.assertRaises(ValueError):
73 zstd.train_cover_dictionary(8192, [u'foo'])
74
75 def test_basic(self):
76 samples = []
77 for i in range(128):
78 samples.append(b'foo' * 64)
79 samples.append(b'foobar' * 64)
80
81 d = zstd.train_cover_dictionary(8192, samples, k=64, d=16)
82 self.assertIsInstance(d.dict_id(), int_type)
83
84 data = d.as_bytes()
85 self.assertEqual(data[0:4], b'\x37\xa4\x30\xec')
86
87 self.assertEqual(d.k, 64)
88 self.assertEqual(d.d, 16)
89
90 def test_set_dict_id(self):
91 samples = []
92 for i in range(128):
93 samples.append(b'foo' * 64)
94 samples.append(b'foobar' * 64)
95
96 d = zstd.train_cover_dictionary(8192, samples, k=64, d=16,
97 dict_id=42)
98 self.assertEqual(d.dict_id(), 42)
99
100 def test_optimize(self):
101 samples = []
102 for i in range(128):
103 samples.append(b'foo' * 64)
104 samples.append(b'foobar' * 64)
105
106 d = zstd.train_cover_dictionary(8192, samples, optimize=True,
107 threads=-1, steps=1, d=16)
108
109 self.assertEqual(d.k, 16)
110 self.assertEqual(d.d, 16)
@@ -1,145 +1,210
1 1 /**
2 2 * Copyright (c) 2016-present, Gregory Szorc
3 3 * All rights reserved.
4 4 *
5 5 * This software may be modified and distributed under the terms
6 6 * of the BSD license. See the LICENSE file for details.
7 7 */
8 8
9 9 /* A Python C extension for Zstandard. */
10 10
11 #if defined(_WIN32)
12 #define WIN32_LEAN_AND_MEAN
13 #include <Windows.h>
14 #endif
15
11 16 #include "python-zstandard.h"
12 17
13 18 PyObject *ZstdError;
14 19
15 20 PyDoc_STRVAR(estimate_compression_context_size__doc__,
16 21 "estimate_compression_context_size(compression_parameters)\n"
17 22 "\n"
18 23 "Give the amount of memory allocated for a compression context given a\n"
19 24 "CompressionParameters instance");
20 25
21 26 PyDoc_STRVAR(estimate_decompression_context_size__doc__,
22 27 "estimate_decompression_context_size()\n"
23 28 "\n"
24 29 "Estimate the amount of memory allocated to a decompression context.\n"
25 30 );
26 31
27 32 static PyObject* estimate_decompression_context_size(PyObject* self) {
28 33 return PyLong_FromSize_t(ZSTD_estimateDCtxSize());
29 34 }
30 35
31 36 PyDoc_STRVAR(get_compression_parameters__doc__,
32 37 "get_compression_parameters(compression_level[, source_size[, dict_size]])\n"
33 38 "\n"
34 39 "Obtains a ``CompressionParameters`` instance from a compression level and\n"
35 40 "optional input size and dictionary size");
36 41
37 42 PyDoc_STRVAR(get_frame_parameters__doc__,
38 43 "get_frame_parameters(data)\n"
39 44 "\n"
40 45 "Obtains a ``FrameParameters`` instance by parsing data.\n");
41 46
42 47 PyDoc_STRVAR(train_dictionary__doc__,
43 48 "train_dictionary(dict_size, samples)\n"
44 49 "\n"
45 50 "Train a dictionary from sample data.\n"
46 51 "\n"
47 52 "A compression dictionary of size ``dict_size`` will be created from the\n"
48 53 "iterable of samples provided by ``samples``.\n"
49 54 "\n"
50 55 "The raw dictionary content will be returned\n");
51 56
57 PyDoc_STRVAR(train_cover_dictionary__doc__,
58 "train_cover_dictionary(dict_size, samples, k=None, d=None, notifications=0, dict_id=0, level=0)\n"
59 "\n"
60 "Train a dictionary from sample data using the COVER algorithm.\n"
61 "\n"
62 "This behaves like ``train_dictionary()`` except a different algorithm is\n"
63 "used to create the dictionary. The algorithm has 2 parameters: ``k`` and\n"
64 "``d``. These control the *segment size* and *dmer size*. A reasonable range\n"
65 "for ``k`` is ``[16, 2048+]``. A reasonable range for ``d`` is ``[6, 16]``.\n"
66 "``d`` must be less than or equal to ``k``.\n"
67 );
68
52 69 static char zstd_doc[] = "Interface to zstandard";
53 70
54 71 static PyMethodDef zstd_methods[] = {
72 /* TODO remove since it is a method on CompressionParameters. */
55 73 { "estimate_compression_context_size", (PyCFunction)estimate_compression_context_size,
56 74 METH_VARARGS, estimate_compression_context_size__doc__ },
57 75 { "estimate_decompression_context_size", (PyCFunction)estimate_decompression_context_size,
58 76 METH_NOARGS, estimate_decompression_context_size__doc__ },
59 77 { "get_compression_parameters", (PyCFunction)get_compression_parameters,
60 78 METH_VARARGS, get_compression_parameters__doc__ },
61 79 { "get_frame_parameters", (PyCFunction)get_frame_parameters,
62 80 METH_VARARGS, get_frame_parameters__doc__ },
63 81 { "train_dictionary", (PyCFunction)train_dictionary,
64 82 METH_VARARGS | METH_KEYWORDS, train_dictionary__doc__ },
83 { "train_cover_dictionary", (PyCFunction)train_cover_dictionary,
84 METH_VARARGS | METH_KEYWORDS, train_cover_dictionary__doc__ },
65 85 { NULL, NULL }
66 86 };
67 87
88 void bufferutil_module_init(PyObject* mod);
68 89 void compressobj_module_init(PyObject* mod);
69 90 void compressor_module_init(PyObject* mod);
70 91 void compressionparams_module_init(PyObject* mod);
71 92 void constants_module_init(PyObject* mod);
72 void dictparams_module_init(PyObject* mod);
73 93 void compressiondict_module_init(PyObject* mod);
74 94 void compressionwriter_module_init(PyObject* mod);
75 95 void compressoriterator_module_init(PyObject* mod);
76 96 void decompressor_module_init(PyObject* mod);
77 97 void decompressobj_module_init(PyObject* mod);
78 98 void decompressionwriter_module_init(PyObject* mod);
79 99 void decompressoriterator_module_init(PyObject* mod);
80 100 void frameparams_module_init(PyObject* mod);
81 101
82 102 void zstd_module_init(PyObject* m) {
83 103 /* python-zstandard relies on unstable zstd C API features. This means
84 104 that changes in zstd may break expectations in python-zstandard.
85 105
86 106 python-zstandard is distributed with a copy of the zstd sources.
87 107 python-zstandard is only guaranteed to work with the bundled version
88 108 of zstd.
89 109
90 110 However, downstream redistributors or packagers may unbundle zstd
91 111 from python-zstandard. This can result in a mismatch between zstd
92 112 versions and API semantics. This essentially "voids the warranty"
93 113 of python-zstandard and may cause undefined behavior.
94 114
95 115 We detect this mismatch here and refuse to load the module if this
96 116 scenario is detected.
97 117 */
98 118 if (ZSTD_VERSION_NUMBER != 10103 || ZSTD_versionNumber() != 10103) {
99 119 PyErr_SetString(PyExc_ImportError, "zstd C API mismatch; Python bindings not compiled against expected zstd version");
100 120 return;
101 121 }
102 122
123 bufferutil_module_init(m);
103 124 compressionparams_module_init(m);
104 dictparams_module_init(m);
105 125 compressiondict_module_init(m);
106 126 compressobj_module_init(m);
107 127 compressor_module_init(m);
108 128 compressionwriter_module_init(m);
109 129 compressoriterator_module_init(m);
110 130 constants_module_init(m);
111 131 decompressor_module_init(m);
112 132 decompressobj_module_init(m);
113 133 decompressionwriter_module_init(m);
114 134 decompressoriterator_module_init(m);
115 135 frameparams_module_init(m);
116 136 }
117 137
118 138 #if PY_MAJOR_VERSION >= 3
119 139 static struct PyModuleDef zstd_module = {
120 140 PyModuleDef_HEAD_INIT,
121 141 "zstd",
122 142 zstd_doc,
123 143 -1,
124 144 zstd_methods
125 145 };
126 146
127 147 PyMODINIT_FUNC PyInit_zstd(void) {
128 148 PyObject *m = PyModule_Create(&zstd_module);
129 149 if (m) {
130 150 zstd_module_init(m);
131 151 if (PyErr_Occurred()) {
132 152 Py_DECREF(m);
133 153 m = NULL;
134 154 }
135 155 }
136 156 return m;
137 157 }
138 158 #else
139 159 PyMODINIT_FUNC initzstd(void) {
140 160 PyObject *m = Py_InitModule3("zstd", zstd_methods, zstd_doc);
141 161 if (m) {
142 162 zstd_module_init(m);
143 163 }
144 164 }
145 165 #endif
166
167 /* Attempt to resolve the number of CPUs in the system. */
168 int cpu_count() {
169 int count = 0;
170
171 #if defined(_WIN32)
172 SYSTEM_INFO si;
173 si.dwNumberOfProcessors = 0;
174 GetSystemInfo(&si);
175 count = si.dwNumberOfProcessors;
176 #elif defined(__APPLE__)
177 int num;
178 size_t size = sizeof(int);
179
180 if (0 == sysctlbyname("hw.logicalcpu", &num, &size, NULL, 0)) {
181 count = num;
182 }
183 #elif defined(__linux__)
184 count = sysconf(_SC_NPROCESSORS_ONLN);
185 #elif defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__DragonFly__)
186 int mib[2];
187 size_t len = sizeof(count);
188 mib[0] = CTL_HW;
189 mib[1] = HW_NCPU;
190 if (0 != sysctl(mib, 2, &count, &len, NULL, 0)) {
191 count = 0;
192 }
193 #elif defined(__hpux)
194 count = mpctl(MPC_GETNUMSPUS, NULL, NULL);
195 #endif
196
197 return count;
198 }
199
200 size_t roundpow2(size_t i) {
201 i--;
202 i |= i >> 1;
203 i |= i >> 2;
204 i |= i >> 4;
205 i |= i >> 8;
206 i |= i >> 16;
207 i++;
208
209 return i;
210 }
@@ -1,1042 +1,1257
1 1 # Copyright (c) 2016-present, Gregory Szorc
2 2 # All rights reserved.
3 3 #
4 4 # This software may be modified and distributed under the terms
5 5 # of the BSD license. See the LICENSE file for details.
6 6
7 7 """Python interface to the Zstandard (zstd) compression library."""
8 8
9 9 from __future__ import absolute_import, unicode_literals
10 10
11 import os
11 12 import sys
12 13
13 14 from _zstd_cffi import (
14 15 ffi,
15 16 lib,
16 17 )
17 18
18 19 if sys.version_info[0] == 2:
19 20 bytes_type = str
20 21 int_type = long
21 22 else:
22 23 bytes_type = bytes
23 24 int_type = int
24 25
25 26
26 27 COMPRESSION_RECOMMENDED_INPUT_SIZE = lib.ZSTD_CStreamInSize()
27 28 COMPRESSION_RECOMMENDED_OUTPUT_SIZE = lib.ZSTD_CStreamOutSize()
28 29 DECOMPRESSION_RECOMMENDED_INPUT_SIZE = lib.ZSTD_DStreamInSize()
29 30 DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE = lib.ZSTD_DStreamOutSize()
30 31
31 32 new_nonzero = ffi.new_allocator(should_clear_after_alloc=False)
32 33
33 34
34 35 MAX_COMPRESSION_LEVEL = lib.ZSTD_maxCLevel()
35 36 MAGIC_NUMBER = lib.ZSTD_MAGICNUMBER
36 37 FRAME_HEADER = b'\x28\xb5\x2f\xfd'
37 38 ZSTD_VERSION = (lib.ZSTD_VERSION_MAJOR, lib.ZSTD_VERSION_MINOR, lib.ZSTD_VERSION_RELEASE)
38 39
39 40 WINDOWLOG_MIN = lib.ZSTD_WINDOWLOG_MIN
40 41 WINDOWLOG_MAX = lib.ZSTD_WINDOWLOG_MAX
41 42 CHAINLOG_MIN = lib.ZSTD_CHAINLOG_MIN
42 43 CHAINLOG_MAX = lib.ZSTD_CHAINLOG_MAX
43 44 HASHLOG_MIN = lib.ZSTD_HASHLOG_MIN
44 45 HASHLOG_MAX = lib.ZSTD_HASHLOG_MAX
45 46 HASHLOG3_MAX = lib.ZSTD_HASHLOG3_MAX
46 47 SEARCHLOG_MIN = lib.ZSTD_SEARCHLOG_MIN
47 48 SEARCHLOG_MAX = lib.ZSTD_SEARCHLOG_MAX
48 49 SEARCHLENGTH_MIN = lib.ZSTD_SEARCHLENGTH_MIN
49 50 SEARCHLENGTH_MAX = lib.ZSTD_SEARCHLENGTH_MAX
50 51 TARGETLENGTH_MIN = lib.ZSTD_TARGETLENGTH_MIN
51 52 TARGETLENGTH_MAX = lib.ZSTD_TARGETLENGTH_MAX
52 53
53 54 STRATEGY_FAST = lib.ZSTD_fast
54 55 STRATEGY_DFAST = lib.ZSTD_dfast
55 56 STRATEGY_GREEDY = lib.ZSTD_greedy
56 57 STRATEGY_LAZY = lib.ZSTD_lazy
57 58 STRATEGY_LAZY2 = lib.ZSTD_lazy2
58 59 STRATEGY_BTLAZY2 = lib.ZSTD_btlazy2
59 60 STRATEGY_BTOPT = lib.ZSTD_btopt
60 61
61 62 COMPRESSOBJ_FLUSH_FINISH = 0
62 63 COMPRESSOBJ_FLUSH_BLOCK = 1
63 64
64 65
66 def _cpu_count():
67 # os.cpu_count() was introducd in Python 3.4.
68 try:
69 return os.cpu_count() or 0
70 except AttributeError:
71 pass
72
73 # Linux.
74 try:
75 if sys.version_info[0] == 2:
76 return os.sysconf(b'SC_NPROCESSORS_ONLN')
77 else:
78 return os.sysconf(u'SC_NPROCESSORS_ONLN')
79 except (AttributeError, ValueError):
80 pass
81
82 # TODO implement on other platforms.
83 return 0
84
85
65 86 class ZstdError(Exception):
66 87 pass
67 88
68 89
69 90 class CompressionParameters(object):
70 91 def __init__(self, window_log, chain_log, hash_log, search_log,
71 92 search_length, target_length, strategy):
72 93 if window_log < WINDOWLOG_MIN or window_log > WINDOWLOG_MAX:
73 94 raise ValueError('invalid window log value')
74 95
75 96 if chain_log < CHAINLOG_MIN or chain_log > CHAINLOG_MAX:
76 97 raise ValueError('invalid chain log value')
77 98
78 99 if hash_log < HASHLOG_MIN or hash_log > HASHLOG_MAX:
79 100 raise ValueError('invalid hash log value')
80 101
81 102 if search_log < SEARCHLOG_MIN or search_log > SEARCHLOG_MAX:
82 103 raise ValueError('invalid search log value')
83 104
84 105 if search_length < SEARCHLENGTH_MIN or search_length > SEARCHLENGTH_MAX:
85 106 raise ValueError('invalid search length value')
86 107
87 108 if target_length < TARGETLENGTH_MIN or target_length > TARGETLENGTH_MAX:
88 109 raise ValueError('invalid target length value')
89 110
90 111 if strategy < STRATEGY_FAST or strategy > STRATEGY_BTOPT:
91 112 raise ValueError('invalid strategy value')
92 113
93 114 self.window_log = window_log
94 115 self.chain_log = chain_log
95 116 self.hash_log = hash_log
96 117 self.search_log = search_log
97 118 self.search_length = search_length
98 119 self.target_length = target_length
99 120 self.strategy = strategy
100 121
122 zresult = lib.ZSTD_checkCParams(self.as_compression_parameters())
123 if lib.ZSTD_isError(zresult):
124 raise ValueError('invalid compression parameters: %s',
125 ffi.string(lib.ZSTD_getErrorName(zresult)))
126
127 def estimated_compression_context_size(self):
128 return lib.ZSTD_estimateCCtxSize(self.as_compression_parameters())
129
101 130 def as_compression_parameters(self):
102 131 p = ffi.new('ZSTD_compressionParameters *')[0]
103 132 p.windowLog = self.window_log
104 133 p.chainLog = self.chain_log
105 134 p.hashLog = self.hash_log
106 135 p.searchLog = self.search_log
107 136 p.searchLength = self.search_length
108 137 p.targetLength = self.target_length
109 138 p.strategy = self.strategy
110 139
111 140 return p
112 141
113 142 def get_compression_parameters(level, source_size=0, dict_size=0):
114 143 params = lib.ZSTD_getCParams(level, source_size, dict_size)
115 144 return CompressionParameters(window_log=params.windowLog,
116 145 chain_log=params.chainLog,
117 146 hash_log=params.hashLog,
118 147 search_log=params.searchLog,
119 148 search_length=params.searchLength,
120 149 target_length=params.targetLength,
121 150 strategy=params.strategy)
122 151
123 152
124 153 def estimate_compression_context_size(params):
125 154 if not isinstance(params, CompressionParameters):
126 155 raise ValueError('argument must be a CompressionParameters')
127 156
128 157 cparams = params.as_compression_parameters()
129 158 return lib.ZSTD_estimateCCtxSize(cparams)
130 159
131 160
132 161 def estimate_decompression_context_size():
133 162 return lib.ZSTD_estimateDCtxSize()
134 163
135 164
136 165 class ZstdCompressionWriter(object):
137 166 def __init__(self, compressor, writer, source_size, write_size):
138 167 self._compressor = compressor
139 168 self._writer = writer
140 169 self._source_size = source_size
141 170 self._write_size = write_size
142 171 self._entered = False
172 self._mtcctx = compressor._cctx if compressor._multithreaded else None
143 173
144 174 def __enter__(self):
145 175 if self._entered:
146 176 raise ZstdError('cannot __enter__ multiple times')
147 177
148 self._cstream = self._compressor._get_cstream(self._source_size)
178 if self._mtcctx:
179 self._compressor._init_mtcstream(self._source_size)
180 else:
181 self._compressor._ensure_cstream(self._source_size)
149 182 self._entered = True
150 183 return self
151 184
152 185 def __exit__(self, exc_type, exc_value, exc_tb):
153 186 self._entered = False
154 187
155 188 if not exc_type and not exc_value and not exc_tb:
156 189 out_buffer = ffi.new('ZSTD_outBuffer *')
157 190 dst_buffer = ffi.new('char[]', self._write_size)
158 191 out_buffer.dst = dst_buffer
159 192 out_buffer.size = self._write_size
160 193 out_buffer.pos = 0
161 194
162 195 while True:
163 zresult = lib.ZSTD_endStream(self._cstream, out_buffer)
196 if self._mtcctx:
197 zresult = lib.ZSTDMT_endStream(self._mtcctx, out_buffer)
198 else:
199 zresult = lib.ZSTD_endStream(self._compressor._cstream, out_buffer)
164 200 if lib.ZSTD_isError(zresult):
165 201 raise ZstdError('error ending compression stream: %s' %
166 202 ffi.string(lib.ZSTD_getErrorName(zresult)))
167 203
168 204 if out_buffer.pos:
169 205 self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:])
170 206 out_buffer.pos = 0
171 207
172 208 if zresult == 0:
173 209 break
174 210
175 self._cstream = None
176 211 self._compressor = None
177 212
178 213 return False
179 214
180 215 def memory_size(self):
181 216 if not self._entered:
182 217 raise ZstdError('cannot determine size of an inactive compressor; '
183 218 'call when a context manager is active')
184 219
185 return lib.ZSTD_sizeof_CStream(self._cstream)
220 return lib.ZSTD_sizeof_CStream(self._compressor._cstream)
186 221
187 222 def write(self, data):
188 223 if not self._entered:
189 224 raise ZstdError('write() must be called from an active context '
190 225 'manager')
191 226
192 227 total_write = 0
193 228
194 229 data_buffer = ffi.from_buffer(data)
195 230
196 231 in_buffer = ffi.new('ZSTD_inBuffer *')
197 232 in_buffer.src = data_buffer
198 233 in_buffer.size = len(data_buffer)
199 234 in_buffer.pos = 0
200 235
201 236 out_buffer = ffi.new('ZSTD_outBuffer *')
202 237 dst_buffer = ffi.new('char[]', self._write_size)
203 238 out_buffer.dst = dst_buffer
204 239 out_buffer.size = self._write_size
205 240 out_buffer.pos = 0
206 241
207 242 while in_buffer.pos < in_buffer.size:
208 zresult = lib.ZSTD_compressStream(self._cstream, out_buffer, in_buffer)
243 if self._mtcctx:
244 zresult = lib.ZSTDMT_compressStream(self._mtcctx, out_buffer,
245 in_buffer)
246 else:
247 zresult = lib.ZSTD_compressStream(self._compressor._cstream, out_buffer,
248 in_buffer)
209 249 if lib.ZSTD_isError(zresult):
210 250 raise ZstdError('zstd compress error: %s' %
211 251 ffi.string(lib.ZSTD_getErrorName(zresult)))
212 252
213 253 if out_buffer.pos:
214 254 self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:])
215 255 total_write += out_buffer.pos
216 256 out_buffer.pos = 0
217 257
218 258 return total_write
219 259
220 260 def flush(self):
221 261 if not self._entered:
222 262 raise ZstdError('flush must be called from an active context manager')
223 263
224 264 total_write = 0
225 265
226 266 out_buffer = ffi.new('ZSTD_outBuffer *')
227 267 dst_buffer = ffi.new('char[]', self._write_size)
228 268 out_buffer.dst = dst_buffer
229 269 out_buffer.size = self._write_size
230 270 out_buffer.pos = 0
231 271
232 272 while True:
233 zresult = lib.ZSTD_flushStream(self._cstream, out_buffer)
273 if self._mtcctx:
274 zresult = lib.ZSTDMT_flushStream(self._mtcctx, out_buffer)
275 else:
276 zresult = lib.ZSTD_flushStream(self._compressor._cstream, out_buffer)
234 277 if lib.ZSTD_isError(zresult):
235 278 raise ZstdError('zstd compress error: %s' %
236 279 ffi.string(lib.ZSTD_getErrorName(zresult)))
237 280
238 281 if not out_buffer.pos:
239 282 break
240 283
241 284 self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:])
242 285 total_write += out_buffer.pos
243 286 out_buffer.pos = 0
244 287
245 288 return total_write
246 289
247 290
248 291 class ZstdCompressionObj(object):
249 292 def compress(self, data):
250 293 if self._finished:
251 294 raise ZstdError('cannot call compress() after compressor finished')
252 295
253 296 data_buffer = ffi.from_buffer(data)
254 297 source = ffi.new('ZSTD_inBuffer *')
255 298 source.src = data_buffer
256 299 source.size = len(data_buffer)
257 300 source.pos = 0
258 301
259 302 chunks = []
260 303
261 304 while source.pos < len(data):
262 zresult = lib.ZSTD_compressStream(self._cstream, self._out, source)
305 if self._mtcctx:
306 zresult = lib.ZSTDMT_compressStream(self._mtcctx,
307 self._out, source)
308 else:
309 zresult = lib.ZSTD_compressStream(self._compressor._cstream, self._out,
310 source)
263 311 if lib.ZSTD_isError(zresult):
264 312 raise ZstdError('zstd compress error: %s' %
265 313 ffi.string(lib.ZSTD_getErrorName(zresult)))
266 314
267 315 if self._out.pos:
268 316 chunks.append(ffi.buffer(self._out.dst, self._out.pos)[:])
269 317 self._out.pos = 0
270 318
271 319 return b''.join(chunks)
272 320
273 321 def flush(self, flush_mode=COMPRESSOBJ_FLUSH_FINISH):
274 322 if flush_mode not in (COMPRESSOBJ_FLUSH_FINISH, COMPRESSOBJ_FLUSH_BLOCK):
275 323 raise ValueError('flush mode not recognized')
276 324
277 325 if self._finished:
278 326 raise ZstdError('compressor object already finished')
279 327
280 328 assert self._out.pos == 0
281 329
282 330 if flush_mode == COMPRESSOBJ_FLUSH_BLOCK:
283 zresult = lib.ZSTD_flushStream(self._cstream, self._out)
331 if self._mtcctx:
332 zresult = lib.ZSTDMT_flushStream(self._mtcctx, self._out)
333 else:
334 zresult = lib.ZSTD_flushStream(self._compressor._cstream, self._out)
284 335 if lib.ZSTD_isError(zresult):
285 336 raise ZstdError('zstd compress error: %s' %
286 337 ffi.string(lib.ZSTD_getErrorName(zresult)))
287 338
288 339 # Output buffer is guaranteed to hold full block.
289 340 assert zresult == 0
290 341
291 342 if self._out.pos:
292 343 result = ffi.buffer(self._out.dst, self._out.pos)[:]
293 344 self._out.pos = 0
294 345 return result
295 346 else:
296 347 return b''
297 348
298 349 assert flush_mode == COMPRESSOBJ_FLUSH_FINISH
299 350 self._finished = True
300 351
301 352 chunks = []
302 353
303 354 while True:
304 zresult = lib.ZSTD_endStream(self._cstream, self._out)
355 if self._mtcctx:
356 zresult = lib.ZSTDMT_endStream(self._mtcctx, self._out)
357 else:
358 zresult = lib.ZSTD_endStream(self._compressor._cstream, self._out)
305 359 if lib.ZSTD_isError(zresult):
306 360 raise ZstdError('error ending compression stream: %s' %
307 361 ffi.string(lib.ZSTD_getErroName(zresult)))
308 362
309 363 if self._out.pos:
310 364 chunks.append(ffi.buffer(self._out.dst, self._out.pos)[:])
311 365 self._out.pos = 0
312 366
313 367 if not zresult:
314 368 break
315 369
316 # GC compression stream immediately.
317 self._cstream = None
318
319 370 return b''.join(chunks)
320 371
321 372
322 373 class ZstdCompressor(object):
323 374 def __init__(self, level=3, dict_data=None, compression_params=None,
324 375 write_checksum=False, write_content_size=False,
325 write_dict_id=True):
376 write_dict_id=True, threads=0):
326 377 if level < 1:
327 378 raise ValueError('level must be greater than 0')
328 379 elif level > lib.ZSTD_maxCLevel():
329 380 raise ValueError('level must be less than %d' % lib.ZSTD_maxCLevel())
330 381
382 if threads < 0:
383 threads = _cpu_count()
384
331 385 self._compression_level = level
332 386 self._dict_data = dict_data
333 387 self._cparams = compression_params
334 388 self._fparams = ffi.new('ZSTD_frameParameters *')[0]
335 389 self._fparams.checksumFlag = write_checksum
336 390 self._fparams.contentSizeFlag = write_content_size
337 391 self._fparams.noDictIDFlag = not write_dict_id
338 392
339 cctx = lib.ZSTD_createCCtx()
340 if cctx == ffi.NULL:
341 raise MemoryError()
393 if threads:
394 cctx = lib.ZSTDMT_createCCtx(threads)
395 if cctx == ffi.NULL:
396 raise MemoryError()
342 397
343 self._cctx = ffi.gc(cctx, lib.ZSTD_freeCCtx)
398 self._cctx = ffi.gc(cctx, lib.ZSTDMT_freeCCtx)
399 self._multithreaded = True
400 else:
401 cctx = lib.ZSTD_createCCtx()
402 if cctx == ffi.NULL:
403 raise MemoryError()
404
405 self._cctx = ffi.gc(cctx, lib.ZSTD_freeCCtx)
406 self._multithreaded = False
407
408 self._cstream = None
344 409
345 410 def compress(self, data, allow_empty=False):
346 411 if len(data) == 0 and self._fparams.contentSizeFlag and not allow_empty:
347 412 raise ValueError('cannot write empty inputs when writing content sizes')
348 413
414 if self._multithreaded and self._dict_data:
415 raise ZstdError('compress() cannot be used with both dictionaries and multi-threaded compression')
416
417 if self._multithreaded and self._cparams:
418 raise ZstdError('compress() cannot be used with both compression parameters and multi-threaded compression')
419
349 420 # TODO use a CDict for performance.
350 421 dict_data = ffi.NULL
351 422 dict_size = 0
352 423
353 424 if self._dict_data:
354 425 dict_data = self._dict_data.as_bytes()
355 426 dict_size = len(self._dict_data)
356 427
357 428 params = ffi.new('ZSTD_parameters *')[0]
358 429 if self._cparams:
359 430 params.cParams = self._cparams.as_compression_parameters()
360 431 else:
361 432 params.cParams = lib.ZSTD_getCParams(self._compression_level, len(data),
362 433 dict_size)
363 434 params.fParams = self._fparams
364 435
365 436 dest_size = lib.ZSTD_compressBound(len(data))
366 437 out = new_nonzero('char[]', dest_size)
367 438
368 zresult = lib.ZSTD_compress_advanced(self._cctx,
369 ffi.addressof(out), dest_size,
370 data, len(data),
371 dict_data, dict_size,
372 params)
439 if self._multithreaded:
440 zresult = lib.ZSTDMT_compressCCtx(self._cctx,
441 ffi.addressof(out), dest_size,
442 data, len(data),
443 self._compression_level)
444 else:
445 zresult = lib.ZSTD_compress_advanced(self._cctx,
446 ffi.addressof(out), dest_size,
447 data, len(data),
448 dict_data, dict_size,
449 params)
373 450
374 451 if lib.ZSTD_isError(zresult):
375 452 raise ZstdError('cannot compress: %s' %
376 453 ffi.string(lib.ZSTD_getErrorName(zresult)))
377 454
378 455 return ffi.buffer(out, zresult)[:]
379 456
380 457 def compressobj(self, size=0):
381 cstream = self._get_cstream(size)
458 if self._multithreaded:
459 self._init_mtcstream(size)
460 else:
461 self._ensure_cstream(size)
462
382 463 cobj = ZstdCompressionObj()
383 cobj._cstream = cstream
384 464 cobj._out = ffi.new('ZSTD_outBuffer *')
385 465 cobj._dst_buffer = ffi.new('char[]', COMPRESSION_RECOMMENDED_OUTPUT_SIZE)
386 466 cobj._out.dst = cobj._dst_buffer
387 467 cobj._out.size = COMPRESSION_RECOMMENDED_OUTPUT_SIZE
388 468 cobj._out.pos = 0
389 469 cobj._compressor = self
390 470 cobj._finished = False
391 471
472 if self._multithreaded:
473 cobj._mtcctx = self._cctx
474 else:
475 cobj._mtcctx = None
476
392 477 return cobj
393 478
394 479 def copy_stream(self, ifh, ofh, size=0,
395 480 read_size=COMPRESSION_RECOMMENDED_INPUT_SIZE,
396 481 write_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE):
397 482
398 483 if not hasattr(ifh, 'read'):
399 484 raise ValueError('first argument must have a read() method')
400 485 if not hasattr(ofh, 'write'):
401 486 raise ValueError('second argument must have a write() method')
402 487
403 cstream = self._get_cstream(size)
488 mt = self._multithreaded
489 if mt:
490 self._init_mtcstream(size)
491 else:
492 self._ensure_cstream(size)
404 493
405 494 in_buffer = ffi.new('ZSTD_inBuffer *')
406 495 out_buffer = ffi.new('ZSTD_outBuffer *')
407 496
408 497 dst_buffer = ffi.new('char[]', write_size)
409 498 out_buffer.dst = dst_buffer
410 499 out_buffer.size = write_size
411 500 out_buffer.pos = 0
412 501
413 502 total_read, total_write = 0, 0
414 503
415 504 while True:
416 505 data = ifh.read(read_size)
417 506 if not data:
418 507 break
419 508
420 509 data_buffer = ffi.from_buffer(data)
421 510 total_read += len(data_buffer)
422 511 in_buffer.src = data_buffer
423 512 in_buffer.size = len(data_buffer)
424 513 in_buffer.pos = 0
425 514
426 515 while in_buffer.pos < in_buffer.size:
427 zresult = lib.ZSTD_compressStream(cstream, out_buffer, in_buffer)
516 if mt:
517 zresult = lib.ZSTDMT_compressStream(self._cctx, out_buffer, in_buffer)
518 else:
519 zresult = lib.ZSTD_compressStream(self._cstream,
520 out_buffer, in_buffer)
428 521 if lib.ZSTD_isError(zresult):
429 522 raise ZstdError('zstd compress error: %s' %
430 523 ffi.string(lib.ZSTD_getErrorName(zresult)))
431 524
432 525 if out_buffer.pos:
433 526 ofh.write(ffi.buffer(out_buffer.dst, out_buffer.pos))
434 527 total_write += out_buffer.pos
435 528 out_buffer.pos = 0
436 529
437 530 # We've finished reading. Flush the compressor.
438 531 while True:
439 zresult = lib.ZSTD_endStream(cstream, out_buffer)
532 if mt:
533 zresult = lib.ZSTDMT_endStream(self._cctx, out_buffer)
534 else:
535 zresult = lib.ZSTD_endStream(self._cstream, out_buffer)
440 536 if lib.ZSTD_isError(zresult):
441 537 raise ZstdError('error ending compression stream: %s' %
442 538 ffi.string(lib.ZSTD_getErrorName(zresult)))
443 539
444 540 if out_buffer.pos:
445 541 ofh.write(ffi.buffer(out_buffer.dst, out_buffer.pos))
446 542 total_write += out_buffer.pos
447 543 out_buffer.pos = 0
448 544
449 545 if zresult == 0:
450 546 break
451 547
452 548 return total_read, total_write
453 549
454 550 def write_to(self, writer, size=0,
455 551 write_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE):
456 552
457 553 if not hasattr(writer, 'write'):
458 554 raise ValueError('must pass an object with a write() method')
459 555
460 556 return ZstdCompressionWriter(self, writer, size, write_size)
461 557
462 558 def read_from(self, reader, size=0,
463 559 read_size=COMPRESSION_RECOMMENDED_INPUT_SIZE,
464 560 write_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE):
465 561 if hasattr(reader, 'read'):
466 562 have_read = True
467 563 elif hasattr(reader, '__getitem__'):
468 564 have_read = False
469 565 buffer_offset = 0
470 566 size = len(reader)
471 567 else:
472 568 raise ValueError('must pass an object with a read() method or '
473 569 'conforms to buffer protocol')
474 570
475 cstream = self._get_cstream(size)
571 if self._multithreaded:
572 self._init_mtcstream(size)
573 else:
574 self._ensure_cstream(size)
476 575
477 576 in_buffer = ffi.new('ZSTD_inBuffer *')
478 577 out_buffer = ffi.new('ZSTD_outBuffer *')
479 578
480 579 in_buffer.src = ffi.NULL
481 580 in_buffer.size = 0
482 581 in_buffer.pos = 0
483 582
484 583 dst_buffer = ffi.new('char[]', write_size)
485 584 out_buffer.dst = dst_buffer
486 585 out_buffer.size = write_size
487 586 out_buffer.pos = 0
488 587
489 588 while True:
490 589 # We should never have output data sitting around after a previous
491 590 # iteration.
492 591 assert out_buffer.pos == 0
493 592
494 593 # Collect input data.
495 594 if have_read:
496 595 read_result = reader.read(read_size)
497 596 else:
498 597 remaining = len(reader) - buffer_offset
499 598 slice_size = min(remaining, read_size)
500 599 read_result = reader[buffer_offset:buffer_offset + slice_size]
501 600 buffer_offset += slice_size
502 601
503 602 # No new input data. Break out of the read loop.
504 603 if not read_result:
505 604 break
506 605
507 606 # Feed all read data into the compressor and emit output until
508 607 # exhausted.
509 608 read_buffer = ffi.from_buffer(read_result)
510 609 in_buffer.src = read_buffer
511 610 in_buffer.size = len(read_buffer)
512 611 in_buffer.pos = 0
513 612
514 613 while in_buffer.pos < in_buffer.size:
515 zresult = lib.ZSTD_compressStream(cstream, out_buffer, in_buffer)
614 if self._multithreaded:
615 zresult = lib.ZSTDMT_compressStream(self._cctx, out_buffer, in_buffer)
616 else:
617 zresult = lib.ZSTD_compressStream(self._cstream, out_buffer, in_buffer)
516 618 if lib.ZSTD_isError(zresult):
517 619 raise ZstdError('zstd compress error: %s' %
518 620 ffi.string(lib.ZSTD_getErrorName(zresult)))
519 621
520 622 if out_buffer.pos:
521 623 data = ffi.buffer(out_buffer.dst, out_buffer.pos)[:]
522 624 out_buffer.pos = 0
523 625 yield data
524 626
525 627 assert out_buffer.pos == 0
526 628
527 629 # And repeat the loop to collect more data.
528 630 continue
529 631
530 632 # If we get here, input is exhausted. End the stream and emit what
531 633 # remains.
532 634 while True:
533 635 assert out_buffer.pos == 0
534 zresult = lib.ZSTD_endStream(cstream, out_buffer)
636 if self._multithreaded:
637 zresult = lib.ZSTDMT_endStream(self._cctx, out_buffer)
638 else:
639 zresult = lib.ZSTD_endStream(self._cstream, out_buffer)
535 640 if lib.ZSTD_isError(zresult):
536 641 raise ZstdError('error ending compression stream: %s' %
537 642 ffi.string(lib.ZSTD_getErrorName(zresult)))
538 643
539 644 if out_buffer.pos:
540 645 data = ffi.buffer(out_buffer.dst, out_buffer.pos)[:]
541 646 out_buffer.pos = 0
542 647 yield data
543 648
544 649 if zresult == 0:
545 650 break
546 651
547 def _get_cstream(self, size):
652 def _ensure_cstream(self, size):
653 if self._cstream:
654 zresult = lib.ZSTD_resetCStream(self._cstream, size)
655 if lib.ZSTD_isError(zresult):
656 raise ZstdError('could not reset CStream: %s' %
657 ffi.string(lib.ZSTD_getErrorName(zresult)))
658
659 return
660
548 661 cstream = lib.ZSTD_createCStream()
549 662 if cstream == ffi.NULL:
550 663 raise MemoryError()
551 664
552 665 cstream = ffi.gc(cstream, lib.ZSTD_freeCStream)
553 666
554 667 dict_data = ffi.NULL
555 668 dict_size = 0
556 669 if self._dict_data:
557 670 dict_data = self._dict_data.as_bytes()
558 671 dict_size = len(self._dict_data)
559 672
560 673 zparams = ffi.new('ZSTD_parameters *')[0]
561 674 if self._cparams:
562 675 zparams.cParams = self._cparams.as_compression_parameters()
563 676 else:
564 677 zparams.cParams = lib.ZSTD_getCParams(self._compression_level,
565 678 size, dict_size)
566 679 zparams.fParams = self._fparams
567 680
568 681 zresult = lib.ZSTD_initCStream_advanced(cstream, dict_data, dict_size,
569 682 zparams, size)
570 683 if lib.ZSTD_isError(zresult):
571 684 raise Exception('cannot init CStream: %s' %
572 685 ffi.string(lib.ZSTD_getErrorName(zresult)))
573 686
574 return cstream
687 self._cstream = cstream
688
689 def _init_mtcstream(self, size):
690 assert self._multithreaded
691
692 dict_data = ffi.NULL
693 dict_size = 0
694 if self._dict_data:
695 dict_data = self._dict_data.as_bytes()
696 dict_size = len(self._dict_data)
697
698 zparams = ffi.new('ZSTD_parameters *')[0]
699 if self._cparams:
700 zparams.cParams = self._cparams.as_compression_parameters()
701 else:
702 zparams.cParams = lib.ZSTD_getCParams(self._compression_level,
703 size, dict_size)
704
705 zparams.fParams = self._fparams
706
707 zresult = lib.ZSTDMT_initCStream_advanced(self._cctx, dict_data, dict_size,
708 zparams, size)
709
710 if lib.ZSTD_isError(zresult):
711 raise ZstdError('cannot init CStream: %s' %
712 ffi.string(lib.ZSTD_getErrorName(zresult)))
575 713
576 714
577 715 class FrameParameters(object):
578 716 def __init__(self, fparams):
579 717 self.content_size = fparams.frameContentSize
580 718 self.window_size = fparams.windowSize
581 719 self.dict_id = fparams.dictID
582 720 self.has_checksum = bool(fparams.checksumFlag)
583 721
584 722
585 723 def get_frame_parameters(data):
586 724 if not isinstance(data, bytes_type):
587 725 raise TypeError('argument must be bytes')
588 726
589 727 params = ffi.new('ZSTD_frameParams *')
590 728
591 729 zresult = lib.ZSTD_getFrameParams(params, data, len(data))
592 730 if lib.ZSTD_isError(zresult):
593 731 raise ZstdError('cannot get frame parameters: %s' %
594 732 ffi.string(lib.ZSTD_getErrorName(zresult)))
595 733
596 734 if zresult:
597 735 raise ZstdError('not enough data for frame parameters; need %d bytes' %
598 736 zresult)
599 737
600 738 return FrameParameters(params[0])
601 739
602 740
603 741 class ZstdCompressionDict(object):
604 def __init__(self, data):
742 def __init__(self, data, k=0, d=0):
605 743 assert isinstance(data, bytes_type)
606 744 self._data = data
745 self.k = k
746 self.d = d
607 747
608 748 def __len__(self):
609 749 return len(self._data)
610 750
611 751 def dict_id(self):
612 752 return int_type(lib.ZDICT_getDictID(self._data, len(self._data)))
613 753
614 754 def as_bytes(self):
615 755 return self._data
616 756
617 757
618 def train_dictionary(dict_size, samples, parameters=None):
758 def train_dictionary(dict_size, samples, selectivity=0, level=0,
759 notifications=0, dict_id=0):
619 760 if not isinstance(samples, list):
620 761 raise TypeError('samples must be a list')
621 762
622 763 total_size = sum(map(len, samples))
623 764
624 765 samples_buffer = new_nonzero('char[]', total_size)
625 766 sample_sizes = new_nonzero('size_t[]', len(samples))
626 767
627 768 offset = 0
628 769 for i, sample in enumerate(samples):
629 770 if not isinstance(sample, bytes_type):
630 771 raise ValueError('samples must be bytes')
631 772
632 773 l = len(sample)
633 774 ffi.memmove(samples_buffer + offset, sample, l)
634 775 offset += l
635 776 sample_sizes[i] = l
636 777
637 778 dict_data = new_nonzero('char[]', dict_size)
638 779
639 zresult = lib.ZDICT_trainFromBuffer(ffi.addressof(dict_data), dict_size,
640 ffi.addressof(samples_buffer),
641 ffi.addressof(sample_sizes, 0),
642 len(samples))
780 dparams = ffi.new('ZDICT_params_t *')[0]
781 dparams.selectivityLevel = selectivity
782 dparams.compressionLevel = level
783 dparams.notificationLevel = notifications
784 dparams.dictID = dict_id
785
786 zresult = lib.ZDICT_trainFromBuffer_advanced(
787 ffi.addressof(dict_data), dict_size,
788 ffi.addressof(samples_buffer),
789 ffi.addressof(sample_sizes, 0), len(samples),
790 dparams)
791
643 792 if lib.ZDICT_isError(zresult):
644 793 raise ZstdError('Cannot train dict: %s' %
645 794 ffi.string(lib.ZDICT_getErrorName(zresult)))
646 795
647 796 return ZstdCompressionDict(ffi.buffer(dict_data, zresult)[:])
648 797
649 798
799 def train_cover_dictionary(dict_size, samples, k=0, d=0,
800 notifications=0, dict_id=0, level=0, optimize=False,
801 steps=0, threads=0):
802 if not isinstance(samples, list):
803 raise TypeError('samples must be a list')
804
805 if threads < 0:
806 threads = _cpu_count()
807
808 total_size = sum(map(len, samples))
809
810 samples_buffer = new_nonzero('char[]', total_size)
811 sample_sizes = new_nonzero('size_t[]', len(samples))
812
813 offset = 0
814 for i, sample in enumerate(samples):
815 if not isinstance(sample, bytes_type):
816 raise ValueError('samples must be bytes')
817
818 l = len(sample)
819 ffi.memmove(samples_buffer + offset, sample, l)
820 offset += l
821 sample_sizes[i] = l
822
823 dict_data = new_nonzero('char[]', dict_size)
824
825 dparams = ffi.new('COVER_params_t *')[0]
826 dparams.k = k
827 dparams.d = d
828 dparams.steps = steps
829 dparams.nbThreads = threads
830 dparams.notificationLevel = notifications
831 dparams.dictID = dict_id
832 dparams.compressionLevel = level
833
834 if optimize:
835 zresult = lib.COVER_optimizeTrainFromBuffer(
836 ffi.addressof(dict_data), dict_size,
837 ffi.addressof(samples_buffer),
838 ffi.addressof(sample_sizes, 0), len(samples),
839 ffi.addressof(dparams))
840 else:
841 zresult = lib.COVER_trainFromBuffer(
842 ffi.addressof(dict_data), dict_size,
843 ffi.addressof(samples_buffer),
844 ffi.addressof(sample_sizes, 0), len(samples),
845 dparams)
846
847 if lib.ZDICT_isError(zresult):
848 raise ZstdError('cannot train dict: %s' %
849 ffi.string(lib.ZDICT_getErrorName(zresult)))
850
851 return ZstdCompressionDict(ffi.buffer(dict_data, zresult)[:],
852 k=dparams.k, d=dparams.d)
853
854
650 855 class ZstdDecompressionObj(object):
651 856 def __init__(self, decompressor):
652 857 self._decompressor = decompressor
653 self._dstream = self._decompressor._get_dstream()
654 858 self._finished = False
655 859
656 860 def decompress(self, data):
657 861 if self._finished:
658 862 raise ZstdError('cannot use a decompressobj multiple times')
659 863
864 assert(self._decompressor._dstream)
865
660 866 in_buffer = ffi.new('ZSTD_inBuffer *')
661 867 out_buffer = ffi.new('ZSTD_outBuffer *')
662 868
663 869 data_buffer = ffi.from_buffer(data)
664 870 in_buffer.src = data_buffer
665 871 in_buffer.size = len(data_buffer)
666 872 in_buffer.pos = 0
667 873
668 874 dst_buffer = ffi.new('char[]', DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE)
669 875 out_buffer.dst = dst_buffer
670 876 out_buffer.size = len(dst_buffer)
671 877 out_buffer.pos = 0
672 878
673 879 chunks = []
674 880
675 881 while in_buffer.pos < in_buffer.size:
676 zresult = lib.ZSTD_decompressStream(self._dstream, out_buffer, in_buffer)
882 zresult = lib.ZSTD_decompressStream(self._decompressor._dstream,
883 out_buffer, in_buffer)
677 884 if lib.ZSTD_isError(zresult):
678 885 raise ZstdError('zstd decompressor error: %s' %
679 886 ffi.string(lib.ZSTD_getErrorName(zresult)))
680 887
681 888 if zresult == 0:
682 889 self._finished = True
683 self._dstream = None
684 890 self._decompressor = None
685 891
686 892 if out_buffer.pos:
687 893 chunks.append(ffi.buffer(out_buffer.dst, out_buffer.pos)[:])
688 894 out_buffer.pos = 0
689 895
690 896 return b''.join(chunks)
691 897
692 898
693 899 class ZstdDecompressionWriter(object):
694 900 def __init__(self, decompressor, writer, write_size):
695 901 self._decompressor = decompressor
696 902 self._writer = writer
697 903 self._write_size = write_size
698 self._dstream = None
699 904 self._entered = False
700 905
701 906 def __enter__(self):
702 907 if self._entered:
703 908 raise ZstdError('cannot __enter__ multiple times')
704 909
705 self._dstream = self._decompressor._get_dstream()
910 self._decompressor._ensure_dstream()
706 911 self._entered = True
707 912
708 913 return self
709 914
710 915 def __exit__(self, exc_type, exc_value, exc_tb):
711 916 self._entered = False
712 self._dstream = None
713 917
714 918 def memory_size(self):
715 if not self._dstream:
919 if not self._decompressor._dstream:
716 920 raise ZstdError('cannot determine size of inactive decompressor '
717 921 'call when context manager is active')
718 922
719 return lib.ZSTD_sizeof_DStream(self._dstream)
923 return lib.ZSTD_sizeof_DStream(self._decompressor._dstream)
720 924
721 925 def write(self, data):
722 926 if not self._entered:
723 927 raise ZstdError('write must be called from an active context manager')
724 928
725 929 total_write = 0
726 930
727 931 in_buffer = ffi.new('ZSTD_inBuffer *')
728 932 out_buffer = ffi.new('ZSTD_outBuffer *')
729 933
730 934 data_buffer = ffi.from_buffer(data)
731 935 in_buffer.src = data_buffer
732 936 in_buffer.size = len(data_buffer)
733 937 in_buffer.pos = 0
734 938
735 939 dst_buffer = ffi.new('char[]', self._write_size)
736 940 out_buffer.dst = dst_buffer
737 941 out_buffer.size = len(dst_buffer)
738 942 out_buffer.pos = 0
739 943
944 dstream = self._decompressor._dstream
945
740 946 while in_buffer.pos < in_buffer.size:
741 zresult = lib.ZSTD_decompressStream(self._dstream, out_buffer, in_buffer)
947 zresult = lib.ZSTD_decompressStream(dstream, out_buffer, in_buffer)
742 948 if lib.ZSTD_isError(zresult):
743 949 raise ZstdError('zstd decompress error: %s' %
744 950 ffi.string(lib.ZSTD_getErrorName(zresult)))
745 951
746 952 if out_buffer.pos:
747 953 self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:])
748 954 total_write += out_buffer.pos
749 955 out_buffer.pos = 0
750 956
751 957 return total_write
752 958
753 959
754 960 class ZstdDecompressor(object):
755 961 def __init__(self, dict_data=None):
756 962 self._dict_data = dict_data
757 963
758 964 dctx = lib.ZSTD_createDCtx()
759 965 if dctx == ffi.NULL:
760 966 raise MemoryError()
761 967
762 968 self._refdctx = ffi.gc(dctx, lib.ZSTD_freeDCtx)
969 self._dstream = None
763 970
764 971 @property
765 972 def _ddict(self):
766 973 if self._dict_data:
767 974 dict_data = self._dict_data.as_bytes()
768 975 dict_size = len(self._dict_data)
769 976
770 977 ddict = lib.ZSTD_createDDict(dict_data, dict_size)
771 978 if ddict == ffi.NULL:
772 979 raise ZstdError('could not create decompression dict')
773 980 else:
774 981 ddict = None
775 982
776 983 self.__dict__['_ddict'] = ddict
777 984 return ddict
778 985
779 986 def decompress(self, data, max_output_size=0):
780 987 data_buffer = ffi.from_buffer(data)
781 988
782 989 orig_dctx = new_nonzero('char[]', lib.ZSTD_sizeof_DCtx(self._refdctx))
783 990 dctx = ffi.cast('ZSTD_DCtx *', orig_dctx)
784 991 lib.ZSTD_copyDCtx(dctx, self._refdctx)
785 992
786 993 ddict = self._ddict
787 994
788 995 output_size = lib.ZSTD_getDecompressedSize(data_buffer, len(data_buffer))
789 996 if output_size:
790 997 result_buffer = ffi.new('char[]', output_size)
791 998 result_size = output_size
792 999 else:
793 1000 if not max_output_size:
794 1001 raise ZstdError('input data invalid or missing content size '
795 1002 'in frame header')
796 1003
797 1004 result_buffer = ffi.new('char[]', max_output_size)
798 1005 result_size = max_output_size
799 1006
800 1007 if ddict:
801 1008 zresult = lib.ZSTD_decompress_usingDDict(dctx,
802 1009 result_buffer, result_size,
803 1010 data_buffer, len(data_buffer),
804 1011 ddict)
805 1012 else:
806 1013 zresult = lib.ZSTD_decompressDCtx(dctx,
807 1014 result_buffer, result_size,
808 1015 data_buffer, len(data_buffer))
809 1016 if lib.ZSTD_isError(zresult):
810 1017 raise ZstdError('decompression error: %s' %
811 1018 ffi.string(lib.ZSTD_getErrorName(zresult)))
812 1019 elif output_size and zresult != output_size:
813 1020 raise ZstdError('decompression error: decompressed %d bytes; expected %d' %
814 1021 (zresult, output_size))
815 1022
816 1023 return ffi.buffer(result_buffer, zresult)[:]
817 1024
818 1025 def decompressobj(self):
1026 self._ensure_dstream()
819 1027 return ZstdDecompressionObj(self)
820 1028
821 1029 def read_from(self, reader, read_size=DECOMPRESSION_RECOMMENDED_INPUT_SIZE,
822 1030 write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE,
823 1031 skip_bytes=0):
824 1032 if skip_bytes >= read_size:
825 1033 raise ValueError('skip_bytes must be smaller than read_size')
826 1034
827 1035 if hasattr(reader, 'read'):
828 1036 have_read = True
829 1037 elif hasattr(reader, '__getitem__'):
830 1038 have_read = False
831 1039 buffer_offset = 0
832 1040 size = len(reader)
833 1041 else:
834 1042 raise ValueError('must pass an object with a read() method or '
835 1043 'conforms to buffer protocol')
836 1044
837 1045 if skip_bytes:
838 1046 if have_read:
839 1047 reader.read(skip_bytes)
840 1048 else:
841 1049 if skip_bytes > size:
842 1050 raise ValueError('skip_bytes larger than first input chunk')
843 1051
844 1052 buffer_offset = skip_bytes
845 1053
846 dstream = self._get_dstream()
1054 self._ensure_dstream()
847 1055
848 1056 in_buffer = ffi.new('ZSTD_inBuffer *')
849 1057 out_buffer = ffi.new('ZSTD_outBuffer *')
850 1058
851 1059 dst_buffer = ffi.new('char[]', write_size)
852 1060 out_buffer.dst = dst_buffer
853 1061 out_buffer.size = len(dst_buffer)
854 1062 out_buffer.pos = 0
855 1063
856 1064 while True:
857 1065 assert out_buffer.pos == 0
858 1066
859 1067 if have_read:
860 1068 read_result = reader.read(read_size)
861 1069 else:
862 1070 remaining = size - buffer_offset
863 1071 slice_size = min(remaining, read_size)
864 1072 read_result = reader[buffer_offset:buffer_offset + slice_size]
865 1073 buffer_offset += slice_size
866 1074
867 1075 # No new input. Break out of read loop.
868 1076 if not read_result:
869 1077 break
870 1078
871 1079 # Feed all read data into decompressor and emit output until
872 1080 # exhausted.
873 1081 read_buffer = ffi.from_buffer(read_result)
874 1082 in_buffer.src = read_buffer
875 1083 in_buffer.size = len(read_buffer)
876 1084 in_buffer.pos = 0
877 1085
878 1086 while in_buffer.pos < in_buffer.size:
879 1087 assert out_buffer.pos == 0
880 1088
881 zresult = lib.ZSTD_decompressStream(dstream, out_buffer, in_buffer)
1089 zresult = lib.ZSTD_decompressStream(self._dstream, out_buffer, in_buffer)
882 1090 if lib.ZSTD_isError(zresult):
883 1091 raise ZstdError('zstd decompress error: %s' %
884 1092 ffi.string(lib.ZSTD_getErrorName(zresult)))
885 1093
886 1094 if out_buffer.pos:
887 1095 data = ffi.buffer(out_buffer.dst, out_buffer.pos)[:]
888 1096 out_buffer.pos = 0
889 1097 yield data
890 1098
891 1099 if zresult == 0:
892 1100 return
893 1101
894 1102 # Repeat loop to collect more input data.
895 1103 continue
896 1104
897 1105 # If we get here, input is exhausted.
898 1106
899 1107 def write_to(self, writer, write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE):
900 1108 if not hasattr(writer, 'write'):
901 1109 raise ValueError('must pass an object with a write() method')
902 1110
903 1111 return ZstdDecompressionWriter(self, writer, write_size)
904 1112
905 1113 def copy_stream(self, ifh, ofh,
906 1114 read_size=DECOMPRESSION_RECOMMENDED_INPUT_SIZE,
907 1115 write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE):
908 1116 if not hasattr(ifh, 'read'):
909 1117 raise ValueError('first argument must have a read() method')
910 1118 if not hasattr(ofh, 'write'):
911 1119 raise ValueError('second argument must have a write() method')
912 1120
913 dstream = self._get_dstream()
1121 self._ensure_dstream()
914 1122
915 1123 in_buffer = ffi.new('ZSTD_inBuffer *')
916 1124 out_buffer = ffi.new('ZSTD_outBuffer *')
917 1125
918 1126 dst_buffer = ffi.new('char[]', write_size)
919 1127 out_buffer.dst = dst_buffer
920 1128 out_buffer.size = write_size
921 1129 out_buffer.pos = 0
922 1130
923 1131 total_read, total_write = 0, 0
924 1132
925 1133 # Read all available input.
926 1134 while True:
927 1135 data = ifh.read(read_size)
928 1136 if not data:
929 1137 break
930 1138
931 1139 data_buffer = ffi.from_buffer(data)
932 1140 total_read += len(data_buffer)
933 1141 in_buffer.src = data_buffer
934 1142 in_buffer.size = len(data_buffer)
935 1143 in_buffer.pos = 0
936 1144
937 1145 # Flush all read data to output.
938 1146 while in_buffer.pos < in_buffer.size:
939 zresult = lib.ZSTD_decompressStream(dstream, out_buffer, in_buffer)
1147 zresult = lib.ZSTD_decompressStream(self._dstream, out_buffer, in_buffer)
940 1148 if lib.ZSTD_isError(zresult):
941 1149 raise ZstdError('zstd decompressor error: %s' %
942 1150 ffi.string(lib.ZSTD_getErrorName(zresult)))
943 1151
944 1152 if out_buffer.pos:
945 1153 ofh.write(ffi.buffer(out_buffer.dst, out_buffer.pos))
946 1154 total_write += out_buffer.pos
947 1155 out_buffer.pos = 0
948 1156
949 1157 # Continue loop to keep reading.
950 1158
951 1159 return total_read, total_write
952 1160
953 1161 def decompress_content_dict_chain(self, frames):
954 1162 if not isinstance(frames, list):
955 1163 raise TypeError('argument must be a list')
956 1164
957 1165 if not frames:
958 1166 raise ValueError('empty input chain')
959 1167
960 1168 # First chunk should not be using a dictionary. We handle it specially.
961 1169 chunk = frames[0]
962 1170 if not isinstance(chunk, bytes_type):
963 1171 raise ValueError('chunk 0 must be bytes')
964 1172
965 1173 # All chunks should be zstd frames and should have content size set.
966 1174 chunk_buffer = ffi.from_buffer(chunk)
967 1175 params = ffi.new('ZSTD_frameParams *')
968 1176 zresult = lib.ZSTD_getFrameParams(params, chunk_buffer, len(chunk_buffer))
969 1177 if lib.ZSTD_isError(zresult):
970 1178 raise ValueError('chunk 0 is not a valid zstd frame')
971 1179 elif zresult:
972 1180 raise ValueError('chunk 0 is too small to contain a zstd frame')
973 1181
974 1182 if not params.frameContentSize:
975 1183 raise ValueError('chunk 0 missing content size in frame')
976 1184
977 1185 dctx = lib.ZSTD_createDCtx()
978 1186 if dctx == ffi.NULL:
979 1187 raise MemoryError()
980 1188
981 1189 dctx = ffi.gc(dctx, lib.ZSTD_freeDCtx)
982 1190
983 1191 last_buffer = ffi.new('char[]', params.frameContentSize)
984 1192
985 1193 zresult = lib.ZSTD_decompressDCtx(dctx, last_buffer, len(last_buffer),
986 1194 chunk_buffer, len(chunk_buffer))
987 1195 if lib.ZSTD_isError(zresult):
988 1196 raise ZstdError('could not decompress chunk 0: %s' %
989 1197 ffi.string(lib.ZSTD_getErrorName(zresult)))
990 1198
991 1199 # Special case of chain length of 1
992 1200 if len(frames) == 1:
993 1201 return ffi.buffer(last_buffer, len(last_buffer))[:]
994 1202
995 1203 i = 1
996 1204 while i < len(frames):
997 1205 chunk = frames[i]
998 1206 if not isinstance(chunk, bytes_type):
999 1207 raise ValueError('chunk %d must be bytes' % i)
1000 1208
1001 1209 chunk_buffer = ffi.from_buffer(chunk)
1002 1210 zresult = lib.ZSTD_getFrameParams(params, chunk_buffer, len(chunk_buffer))
1003 1211 if lib.ZSTD_isError(zresult):
1004 1212 raise ValueError('chunk %d is not a valid zstd frame' % i)
1005 1213 elif zresult:
1006 1214 raise ValueError('chunk %d is too small to contain a zstd frame' % i)
1007 1215
1008 1216 if not params.frameContentSize:
1009 1217 raise ValueError('chunk %d missing content size in frame' % i)
1010 1218
1011 1219 dest_buffer = ffi.new('char[]', params.frameContentSize)
1012 1220
1013 1221 zresult = lib.ZSTD_decompress_usingDict(dctx, dest_buffer, len(dest_buffer),
1014 1222 chunk_buffer, len(chunk_buffer),
1015 1223 last_buffer, len(last_buffer))
1016 1224 if lib.ZSTD_isError(zresult):
1017 1225 raise ZstdError('could not decompress chunk %d' % i)
1018 1226
1019 1227 last_buffer = dest_buffer
1020 1228 i += 1
1021 1229
1022 1230 return ffi.buffer(last_buffer, len(last_buffer))[:]
1023 1231
1024 def _get_dstream(self):
1025 dstream = lib.ZSTD_createDStream()
1026 if dstream == ffi.NULL:
1232 def _ensure_dstream(self):
1233 if self._dstream:
1234 zresult = lib.ZSTD_resetDStream(self._dstream)
1235 if lib.ZSTD_isError(zresult):
1236 raise ZstdError('could not reset DStream: %s' %
1237 ffi.string(lib.ZSTD_getErrorName(zresult)))
1238
1239 return
1240
1241 self._dstream = lib.ZSTD_createDStream()
1242 if self._dstream == ffi.NULL:
1027 1243 raise MemoryError()
1028 1244
1029 dstream = ffi.gc(dstream, lib.ZSTD_freeDStream)
1245 self._dstream = ffi.gc(self._dstream, lib.ZSTD_freeDStream)
1030 1246
1031 1247 if self._dict_data:
1032 zresult = lib.ZSTD_initDStream_usingDict(dstream,
1248 zresult = lib.ZSTD_initDStream_usingDict(self._dstream,
1033 1249 self._dict_data.as_bytes(),
1034 1250 len(self._dict_data))
1035 1251 else:
1036 zresult = lib.ZSTD_initDStream(dstream)
1252 zresult = lib.ZSTD_initDStream(self._dstream)
1037 1253
1038 1254 if lib.ZSTD_isError(zresult):
1255 self._dstream = None
1039 1256 raise ZstdError('could not initialize DStream: %s' %
1040 1257 ffi.string(lib.ZSTD_getErrorName(zresult)))
1041
1042 return dstream
@@ -1,41 +1,44
1 1 #require test-repo
2 2
3 3 $ . "$TESTDIR/helpers-testrepo.sh"
4 4 $ cd "$TESTDIR"/..
5 5
6 6 $ hg files 'set:(**.py)' | sed 's|\\|/|g' | xargs python contrib/check-py3-compat.py
7 7 contrib/python-zstandard/setup.py not using absolute_import
8 8 contrib/python-zstandard/setup_zstd.py not using absolute_import
9 9 contrib/python-zstandard/tests/common.py not using absolute_import
10 contrib/python-zstandard/tests/test_buffer_util.py not using absolute_import
10 11 contrib/python-zstandard/tests/test_compressor.py not using absolute_import
12 contrib/python-zstandard/tests/test_compressor_fuzzing.py not using absolute_import
11 13 contrib/python-zstandard/tests/test_data_structures.py not using absolute_import
14 contrib/python-zstandard/tests/test_data_structures_fuzzing.py not using absolute_import
12 15 contrib/python-zstandard/tests/test_decompressor.py not using absolute_import
16 contrib/python-zstandard/tests/test_decompressor_fuzzing.py not using absolute_import
13 17 contrib/python-zstandard/tests/test_estimate_sizes.py not using absolute_import
14 18 contrib/python-zstandard/tests/test_module_attributes.py not using absolute_import
15 contrib/python-zstandard/tests/test_roundtrip.py not using absolute_import
16 19 contrib/python-zstandard/tests/test_train_dictionary.py not using absolute_import
17 20 i18n/check-translation.py not using absolute_import
18 21 setup.py not using absolute_import
19 22 tests/test-demandimport.py not using absolute_import
20 23
21 24 #if py3exe
22 25 $ hg files 'set:(**.py) - grep(pygments)' -X hgext/fsmonitor/pywatchman \
23 26 > | sed 's|\\|/|g' | xargs $PYTHON3 contrib/check-py3-compat.py \
24 27 > | sed 's/[0-9][0-9]*)$/*)/'
25 28 hgext/convert/transport.py: error importing: <*Error> No module named 'svn.client' (error at transport.py:*) (glob)
26 29 hgext/fsmonitor/state.py: error importing: <SyntaxError> from __future__ imports must occur at the beginning of the file (__init__.py, line 30) (error at watchmanclient.py:*)
27 30 hgext/fsmonitor/watchmanclient.py: error importing: <SyntaxError> from __future__ imports must occur at the beginning of the file (__init__.py, line 30) (error at watchmanclient.py:*)
28 31 mercurial/cffi/bdiff.py: error importing: <*Error> No module named 'mercurial.cffi' (error at check-py3-compat.py:*) (glob)
29 32 mercurial/cffi/mpatch.py: error importing: <*Error> No module named 'mercurial.cffi' (error at check-py3-compat.py:*) (glob)
30 33 mercurial/cffi/osutil.py: error importing: <*Error> No module named 'mercurial.cffi' (error at check-py3-compat.py:*) (glob)
31 34 mercurial/scmwindows.py: error importing: <*Error> No module named 'msvcrt' (error at win32.py:*) (glob)
32 35 mercurial/win32.py: error importing: <*Error> No module named 'msvcrt' (error at win32.py:*) (glob)
33 36 mercurial/windows.py: error importing: <*Error> No module named 'msvcrt' (error at windows.py:*) (glob)
34 37
35 38 #endif
36 39
37 40 #if py3exe py3pygments
38 41 $ hg files 'set:(**.py) and grep(pygments)' | sed 's|\\|/|g' \
39 42 > | xargs $PYTHON3 contrib/check-py3-compat.py \
40 43 > | sed 's/[0-9][0-9]*)$/*)/'
41 44 #endif
1 NO CONTENT: file was removed
1 NO CONTENT: file was removed
General Comments 0
You need to be logged in to leave comments. Login now