diff --git a/contrib/python-zstandard/LICENSE b/contrib/python-zstandard/LICENSE new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2016, Gregory Szorc +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/contrib/python-zstandard/MANIFEST.in b/contrib/python-zstandard/MANIFEST.in new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/MANIFEST.in @@ -0,0 +1,2 @@ +graft zstd +include make_cffi.py diff --git a/contrib/python-zstandard/NEWS.rst b/contrib/python-zstandard/NEWS.rst new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/NEWS.rst @@ -0,0 +1,63 @@ +Version History +=============== + +0.5.0 (released 2016-11-10) +--------------------------- + +* Vendored version of zstd updated to 1.1.1. +* Continuous integration for Python 3.6 and 3.7 +* Continuous integration for Conda +* Added compression and decompression APIs providing similar interfaces + to the standard library ``zlib`` and ``bz2`` modules. This allows + coding to a common interface. +* ``zstd.__version__`` is now defined. +* ``read_from()`` on various APIs now accepts objects implementing the buffer + protocol. +* ``read_from()`` has gained a ``skip_bytes`` argument. This allows callers + to pass in an existing buffer with a header without having to create a + slice or a new object. +* Implemented ``ZstdCompressionDict.as_bytes()``. +* Python's memory allocator is now used instead of ``malloc()``. +* Low-level zstd data structures are reused in more instances, cutting down + on overhead for certain operations. +* ``distutils`` boilerplate for obtaining an ``Extension`` instance + has now been refactored into a standalone ``setup_zstd.py`` file. This + allows other projects with ``setup.py`` files to reuse the + ``distutils`` code for this project without copying code. +* The monolithic ``zstd.c`` file has been split into a header file defining + types and separate ``.c`` source files for the implementation. + +History of the Project +====================== + +2016-08-31 - Zstandard 1.0.0 is released and Gregory starts hacking on a +Python extension for use by the Mercurial project. A very hacky prototype +is sent to the mercurial-devel list for RFC. 
+ +2016-09-03 - Most functionality from Zstandard C API implemented. Source +code published on https://github.com/indygreg/python-zstandard. Travis-CI +automation configured. 0.0.1 release on PyPI. + +2016-09-05 - After the API was rounded out a bit and support for Python +2.6 and 2.7 was added, version 0.1 was released to PyPI. + +2016-09-05 - After the compressor and decompressor APIs were changed, 0.2 +was released to PyPI. + +2016-09-10 - 0.3 is released with a bunch of new features. ZstdCompressor +now accepts arguments controlling frame parameters. The source size can now +be declared when performing streaming compression. ZstdDecompressor.decompress() +is implemented. Compression dictionaries are now cached when using the simple +compression and decompression APIs. Memory size APIs added. +ZstdCompressor.read_from() and ZstdDecompressor.read_from() have been +implemented. This rounds out the major compression/decompression APIs planned +by the author. + +2016-10-02 - 0.3.3 is released with a bug fix for read_from not fully +decoding a zstd frame (issue #2). + +2016-10-02 - 0.4.0 is released with zstd 1.1.0, support for custom read and +write buffer sizes, and a few bug fixes involving failure to read/write +all data when buffer sizes were too small to hold remaining data. + +2016-11-10 - 0.5.0 is released with zstd 1.1.1 and other enhancements. diff --git a/contrib/python-zstandard/README.rst b/contrib/python-zstandard/README.rst new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/README.rst @@ -0,0 +1,776 @@ +================ +python-zstandard +================ + +This project provides a Python C extension for interfacing with the +`Zstandard <http://www.zstd.net>`_ compression library. + +The primary goal of the extension is to provide a Pythonic interface to +the underlying C API. This means exposing most of the features and flexibility +of the C API while not sacrificing usability or safety that Python provides. 
+ +| |ci-status| |win-ci-status| + +State of Project +================ + +The project is officially in beta state. The author is reasonably satisfied +with the current API and that functionality works as advertised. There +may be some backwards incompatible changes before 1.0. Though the author +does not intend to make any major changes to the Python API. + +There is continuous integration for Python versions 2.6, 2.7, and 3.3+ +on Linux x86_x64 and Windows x86 and x86_64. The author is reasonably +confident the extension is stable and works as advertised on these +platforms. + +Expected Changes +---------------- + +The author is reasonably confident in the current state of what's +implemented on the ``ZstdCompressor`` and ``ZstdDecompressor`` types. +Those APIs likely won't change significantly. Some low-level behavior +(such as naming and types expected by arguments) may change. + +There will likely be arguments added to control the input and output +buffer sizes (currently, certain operations read and write in chunk +sizes using zstd's preferred defaults). + +There should be an API that accepts an object that conforms to the buffer +interface and returns an iterator over compressed or decompressed output. + +The author is on the fence as to whether to support the extremely +low level compression and decompression APIs. It could be useful to +support compression without the framing headers. But the author doesn't +believe it a high priority at this time. + +The CFFI bindings are half-baked and need to be finished. + +Requirements +============ + +This extension is designed to run with Python 2.6, 2.7, 3.3, 3.4, and 3.5 +on common platforms (Linux, Windows, and OS X). Only x86_64 is currently +well-tested as an architecture. + +Installing +========== + +This package is uploaded to PyPI at https://pypi.python.org/pypi/zstandard. +So, to install this package:: + + $ pip install zstandard + +Binary wheels are made available for some platforms. 
If you need to +install from a source distribution, all you should need is a working C +compiler and the Python development headers/libraries. On many Linux +distributions, you can install a ``python-dev`` or ``python-devel`` +package to provide these dependencies. + +Packages are also uploaded to Anaconda Cloud at +https://anaconda.org/indygreg/zstandard. See that URL for how to install +this package with ``conda``. + +Performance +=========== + +Very crude and non-scientific benchmarking (most benchmarks fall in this +category because proper benchmarking is hard) show that the Python bindings +perform within 10% of the native C implementation. + +The following table compares the performance of compressing and decompressing +a 1.1 GB tar file comprised of the files in a Firefox source checkout. Values +obtained with the ``zstd`` program are on the left. The remaining columns detail +performance of various compression APIs in the Python bindings. + ++-------+-----------------+-----------------+-----------------+---------------+ +| Level | Native | Simple | Stream In | Stream Out | +| | Comp / Decomp | Comp / Decomp | Comp / Decomp | Comp | ++=======+=================+=================+=================+===============+ +| 1 | 490 / 1338 MB/s | 458 / 1266 MB/s | 407 / 1156 MB/s | 405 MB/s | ++-------+-----------------+-----------------+-----------------+---------------+ +| 2 | 412 / 1288 MB/s | 381 / 1203 MB/s | 345 / 1128 MB/s | 349 MB/s | ++-------+-----------------+-----------------+-----------------+---------------+ +| 3 | 342 / 1312 MB/s | 319 / 1182 MB/s | 285 / 1165 MB/s | 287 MB/s | ++-------+-----------------+-----------------+-----------------+---------------+ +| 11 | 64 / 1506 MB/s | 66 / 1436 MB/s | 56 / 1342 MB/s | 57 MB/s | ++-------+-----------------+-----------------+-----------------+---------------+ + +Again, these are very unscientific. But it shows that Python is capable of +compressing at several hundred MB/s and decompressing at over 1 GB/s. 
+ +Comparison to Other Python Bindings +=================================== + +https://pypi.python.org/pypi/zstd is an alternative Python binding to +Zstandard. At the time this was written, the latest release of that +package (1.0.0.2) had the following significant differences from this package: + +* It only exposes the simple API for compression and decompression operations. + This extension exposes the streaming API, dictionary training, and more. +* It adds a custom framing header to compressed data and there is no way to + disable it. This means that data produced with that module cannot be used by + other Zstandard implementations. + +Bundling of Zstandard Source Code +================================= + +The source repository for this project contains a vendored copy of the +Zstandard source code. This is done for a few reasons. + +First, Zstandard is relatively new and not yet widely available as a system +package. Providing a copy of the source code enables the Python C extension +to be compiled without requiring the user to obtain the Zstandard source code +separately. + +Second, Zstandard has both a stable *public* API and an *experimental* API. +The *experimental* API is actually quite useful (contains functionality for +training dictionaries for example), so it is something we wish to expose to +Python. However, the *experimental* API is only available via static linking. +Furthermore, the *experimental* API can change at any time. So, control over +the exact version of the Zstandard library linked against is important to +ensure known behavior. + +Instructions for Building and Testing +===================================== + +Once you have the source code, the extension can be built via setup.py:: + + $ python setup.py build_ext + +We recommend testing with ``nose``:: + + $ nosetests + +A Tox configuration is present to test against multiple Python versions:: + + $ tox + +Tests use the ``hypothesis`` Python package to perform fuzzing. 
If you +don't have it, those tests won't run. + +There is also an experimental CFFI module. You need the ``cffi`` Python +package installed to build and test that. + +To create a virtualenv with all development dependencies, do something +like the following:: + + # Python 2 + $ virtualenv venv + + # Python 3 + $ python3 -m venv venv + + $ source venv/bin/activate + $ pip install cffi hypothesis nose tox + +API +=== + +The compiled C extension provides a ``zstd`` Python module. This module +exposes the following interfaces. + +ZstdCompressor +-------------- + +The ``ZstdCompressor`` class provides an interface for performing +compression operations. + +Each instance is associated with parameters that control compression +behavior. These come from the following named arguments (all optional): + +level + Integer compression level. Valid values are between 1 and 22. +dict_data + Compression dictionary to use. + + Note: When using dictionary data and ``compress()`` is called multiple + times, the ``CompressionParameters`` derived from an integer compression + ``level`` and the first compressed data's size will be reused for all + subsequent operations. This may not be desirable if source data size + varies significantly. +compression_params + A ``CompressionParameters`` instance (overrides the ``level`` value). +write_checksum + Whether a 4 byte checksum should be written with the compressed data. + Defaults to False. If True, the decompressor can verify that decompressed + data matches the original input data. +write_content_size + Whether the size of the uncompressed data will be written into the + header of compressed data. Defaults to False. The data will only be + written if the compressor knows the size of the input data. This is + likely not true for streaming compression. +write_dict_id + Whether to write the dictionary ID into the compressed data. + Defaults to True. The dictionary ID is only written if a dictionary + is being used. 
+ +Simple API +^^^^^^^^^^ + +``compress(data)`` compresses and returns data as a one-shot operation.:: + + cctx = zstd.ZstdCompressor() + compressed = cctx.compress(b'data to compress') + +Streaming Input API +^^^^^^^^^^^^^^^^^^^ + +``write_to(fh)`` (which behaves as a context manager) allows you to *stream* +data into a compressor.:: + + cctx = zstd.ZstdCompressor(level=10) + with cctx.write_to(fh) as compressor: + compressor.write(b'chunk 0') + compressor.write(b'chunk 1') + ... + +The argument to ``write_to()`` must have a ``write(data)`` method. As +compressed data is available, ``write()`` will be called with the compressed +data as its argument. Many common Python types implement ``write()``, including +open file handles and ``io.BytesIO``. + +``write_to()`` returns an object representing a streaming compressor instance. +It **must** be used as a context manager. That object's ``write(data)`` method +is used to feed data into the compressor. + +If the size of the data being fed to this streaming compressor is known, +you can declare it before compression begins:: + + cctx = zstd.ZstdCompressor() + with cctx.write_to(fh, size=data_len) as compressor: + compressor.write(chunk0) + compressor.write(chunk1) + ... + +Declaring the size of the source data allows compression parameters to +be tuned. And if ``write_content_size`` is used, it also results in the +content size being written into the frame header of the output data. + +The size of chunks being ``write()`` to the destination can be specified:: + + cctx = zstd.ZstdCompressor() + with cctx.write_to(fh, write_size=32768) as compressor: + ... + +To see how much memory is being used by the streaming compressor:: + + cctx = zstd.ZstdCompressor() + with cctx.write_to(fh) as compressor: + ... 
+ byte_size = compressor.memory_size() + +Streaming Output API +^^^^^^^^^^^^^^^^^^^^ + +``read_from(reader)`` provides a mechanism to stream data out of a compressor +as an iterator of data chunks.:: + + cctx = zstd.ZstdCompressor() + for chunk in cctx.read_from(fh): + # Do something with emitted data. + +``read_from()`` accepts an object that has a ``read(size)`` method or conforms +to the buffer protocol. (``bytes`` and ``memoryview`` are 2 common types that +provide the buffer protocol.) + +Uncompressed data is fetched from the source either by calling ``read(size)`` +or by fetching a slice of data from the object directly (in the case where +the buffer protocol is being used). The returned iterator consists of chunks +of compressed data. + +Like ``write_to()``, ``read_from()`` also accepts a ``size`` argument +declaring the size of the input stream:: + + cctx = zstd.ZstdCompressor() + for chunk in cctx.read_from(fh, size=some_int): + pass + +You can also control the size that data is ``read()`` from the source and +the ideal size of output chunks:: + + cctx = zstd.ZstdCompressor() + for chunk in cctx.read_from(fh, read_size=16384, write_size=8192): + pass + +Stream Copying API +^^^^^^^^^^^^^^^^^^ + +``copy_stream(ifh, ofh)`` can be used to copy data between 2 streams while +compressing it.:: + + cctx = zstd.ZstdCompressor() + cctx.copy_stream(ifh, ofh) + +For example, say you wish to compress a file:: + + cctx = zstd.ZstdCompressor() + with open(input_path, 'rb') as ifh, open(output_path, 'wb') as ofh: + cctx.copy_stream(ifh, ofh) + +It is also possible to declare the size of the source stream:: + + cctx = zstd.ZstdCompressor() + cctx.copy_stream(ifh, ofh, size=len_of_input) + +You can also specify how large the chunks that are ``read()`` and ``write()`` +from and to the streams:: + + cctx = zstd.ZstdCompressor() + cctx.copy_stream(ifh, ofh, read_size=32768, write_size=16384) + +The stream copier returns a 2-tuple of bytes read and written:: + + cctx = 
zstd.ZstdCompressor() + read_count, write_count = cctx.copy_stream(ifh, ofh) + +Compressor API +^^^^^^^^^^^^^^ + +``compressobj()`` returns an object that exposes ``compress(data)`` and +``flush()`` methods. Each returns compressed data or an empty bytes. + +The purpose of ``compressobj()`` is to provide an API-compatible interface +with ``zlib.compressobj`` and ``bz2.BZ2Compressor``. This allows callers to +swap in different compressor objects while using the same API. + +Once ``flush()`` is called, the compressor will no longer accept new data +to ``compress()``. ``flush()`` **must** be called to end the compression +context. If not called, the returned data may be incomplete. + +Here is how this API should be used:: + + cctx = zstd.ZstdCompressor() + cobj = cctx.compressobj() + data = cobj.compress(b'raw input 0') + data = cobj.compress(b'raw input 1') + data = cobj.flush() + +For best performance results, keep input chunks under 256KB. This avoids +extra allocations for a large output object. + +It is possible to declare the input size of the data that will be fed into +the compressor:: + + cctx = zstd.ZstdCompressor() + cobj = cctx.compressobj(size=6) + data = cobj.compress(b'foobar') + data = cobj.flush() + +ZstdDecompressor +---------------- + +The ``ZstdDecompressor`` class provides an interface for performing +decompression. + +Each instance is associated with parameters that control decompression. These +come from the following named arguments (all optional): + +dict_data + Compression dictionary to use. + +The interface of this class is very similar to ``ZstdCompressor`` (by design). + +Simple API +^^^^^^^^^^ + +``decompress(data)`` can be used to decompress an entire compressed zstd +frame in a single operation.:: + + dctx = zstd.ZstdDecompressor() + decompressed = dctx.decompress(data) + +By default, ``decompress(data)`` will only work on data written with the content +size encoded in its header. 
This can be achieved by creating a +``ZstdCompressor`` with ``write_content_size=True``. If compressed data without +an embedded content size is seen, ``zstd.ZstdError`` will be raised. + +If the compressed data doesn't have its content size embedded within it, +decompression can be attempted by specifying the ``max_output_size`` +argument.:: + + dctx = zstd.ZstdDecompressor() + uncompressed = dctx.decompress(data, max_output_size=1048576) + +Ideally, ``max_output_size`` will be identical to the decompressed output +size. + +If ``max_output_size`` is too small to hold the decompressed data, +``zstd.ZstdError`` will be raised. + +If ``max_output_size`` is larger than the decompressed data, the allocated +output buffer will be resized to only use the space required. + +Please note that an allocation of the requested ``max_output_size`` will be +performed every time the method is called. Setting to a very large value could +result in a lot of work for the memory allocator and may result in +``MemoryError`` being raised if the allocation fails. + +If the exact size of decompressed data is unknown, it is **strongly** +recommended to use a streaming API. + +Streaming Input API +^^^^^^^^^^^^^^^^^^^ + +``write_to(fh)`` can be used to incrementally send compressed data to a +decompressor.:: + + dctx = zstd.ZstdDecompressor() + with dctx.write_to(fh) as decompressor: + decompressor.write(compressed_data) + +This behaves similarly to ``zstd.ZstdCompressor``: compressed data is written to +the decompressor by calling ``write(data)`` and decompressed output is written +to the output object by calling its ``write(data)`` method. 
+ +The size of chunks being ``write()`` to the destination can be specified:: + + dctx = zstd.ZstdDecompressor() + with dctx.write_to(fh, write_size=16384) as decompressor: + pass + +You can see how much memory is being used by the decompressor:: + + dctx = zstd.ZstdDecompressor() + with dctx.write_to(fh) as decompressor: + byte_size = decompressor.memory_size() + +Streaming Output API +^^^^^^^^^^^^^^^^^^^^ + +``read_from(fh)`` provides a mechanism to stream decompressed data out of a +compressed source as an iterator of data chunks.:: + + dctx = zstd.ZstdDecompressor() + for chunk in dctx.read_from(fh): + # Do something with original data. + +``read_from()`` accepts a) an object with a ``read(size)`` method that will +return compressed bytes b) an object conforming to the buffer protocol that +can expose its data as a contiguous range of bytes. The ``bytes`` and +``memoryview`` types expose this buffer protocol. + +``read_from()`` returns an iterator whose elements are chunks of the +decompressed data. + +The size of requested ``read()`` from the source can be specified:: + + dctx = zstd.ZstdDecompressor() + for chunk in dctx.read_from(fh, read_size=16384): + pass + +It is also possible to skip leading bytes in the input data:: + + dctx = zstd.ZstdDecompressor() + for chunk in dctx.read_from(fh, skip_bytes=1): + pass + +Skipping leading bytes is useful if the source data contains extra +*header* data but you want to avoid the overhead of making a buffer copy +or allocating a new ``memoryview`` object in order to decompress the data. + +Similarly to ``ZstdCompressor.read_from()``, the consumer of the iterator +controls when data is decompressed. If the iterator isn't consumed, +decompression is put on hold. + +When ``read_from()`` is passed an object conforming to the buffer protocol, +the behavior may seem similar to what occurs when the simple decompression +API is used. However, this API works when the decompressed size is unknown. 
+ +Furthermore, if feeding large inputs, the decompressor will work in chunks +instead of performing a single operation. + +Stream Copying API +^^^^^^^^^^^^^^^^^^ + +``copy_stream(ifh, ofh)`` can be used to copy data across 2 streams while +performing decompression.:: + + dctx = zstd.ZstdDecompressor() + dctx.copy_stream(ifh, ofh) + +e.g. to decompress a file to another file:: + + dctx = zstd.ZstdDecompressor() + with open(input_path, 'rb') as ifh, open(output_path, 'wb') as ofh: + dctx.copy_stream(ifh, ofh) + +The size of chunks being ``read()`` and ``write()`` from and to the streams +can be specified:: + + dctx = zstd.ZstdDecompressor() + dctx.copy_stream(ifh, ofh, read_size=8192, write_size=16384) + +Decompressor API +^^^^^^^^^^^^^^^^ + +``decompressobj()`` returns an object that exposes a ``decompress(data)`` +method. Compressed data chunks are fed into ``decompress(data)`` and +uncompressed output (or an empty bytes) is returned. Output from subsequent +calls needs to be concatenated to reassemble the full decompressed byte +sequence. + +The purpose of ``decompressobj()`` is to provide an API-compatible interface +with ``zlib.decompressobj`` and ``bz2.BZ2Decompressor``. This allows callers +to swap in different decompressor objects while using the same API. + +Each object is single use: once an input frame is decoded, ``decompress()`` +can no longer be called. + +Here is how this API should be used:: + + dctx = zstd.ZstdDecompressor() + dobj = dctx.decompressobj() + data = dobj.decompress(compressed_chunk_0) + data = dobj.decompress(compressed_chunk_1) + +Choosing an API +--------------- + +Various forms of compression and decompression APIs are provided because each +are suitable for different use cases. + +The simple/one-shot APIs are useful for small data, when the decompressed +data size is known (either recorded in the zstd frame header via +``write_content_size`` or known via an out-of-band mechanism, such as a file +size). 
+ +A limitation of the simple APIs is that input or output data must fit in memory. +And unless using advanced tricks with Python *buffer objects*, both input and +output must fit in memory simultaneously. + +Another limitation is that compression or decompression is performed as a single +operation. So if you feed large input, it could take a long time for the +function to return. + +The streaming APIs do not have the limitations of the simple API. The cost to +this is they are more complex to use than a single function call. + +The streaming APIs put the caller in control of compression and decompression +behavior by allowing them to directly control either the input or output side +of the operation. + +With the streaming input APIs, the caller feeds data into the compressor or +decompressor as they see fit. Output data will only be written after the caller +has explicitly written data. + +With the streaming output APIs, the caller consumes output from the compressor +or decompressor as they see fit. The compressor or decompressor will only +consume data from the source when the caller is ready to receive it. + +One end of the streaming APIs involves a file-like object that must +``write()`` output data or ``read()`` input data. Depending on what the +backing storage for these objects is, those operations may not complete quickly. +For example, when streaming compressed data to a file, the ``write()`` into +a streaming compressor could result in a ``write()`` to the filesystem, which +may take a long time to finish due to slow I/O on the filesystem. So, there +may be overhead in streaming APIs beyond the compression and decompression +operations. + +Dictionary Creation and Management +---------------------------------- + +Zstandard allows *dictionaries* to be used when compressing and +decompressing data. 
The idea is that if you are compressing a lot of similar +data, you can precompute common properties of that data (such as recurring +byte sequences) to achieve better compression ratios. + +In Python, compression dictionaries are represented as the +``ZstdCompressionDict`` type. + +Instances can be constructed from bytes:: + + dict_data = zstd.ZstdCompressionDict(data) + +More interestingly, instances can be created by *training* on sample data:: + + dict_data = zstd.train_dictionary(size, samples) + +This takes a list of bytes instances and creates and returns a +``ZstdCompressionDict``. + +You can see how many bytes are in the dictionary by calling ``len()``:: + + dict_data = zstd.train_dictionary(size, samples) + dict_size = len(dict_data) # will not be larger than ``size`` + +Once you have a dictionary, you can pass it to the objects performing +compression and decompression:: + + dict_data = zstd.train_dictionary(16384, samples) + + cctx = zstd.ZstdCompressor(dict_data=dict_data) + for source_data in input_data: + compressed = cctx.compress(source_data) + # Do something with compressed data. + + dctx = zstd.ZstdDecompressor(dict_data=dict_data) + for compressed_data in input_data: + buffer = io.BytesIO() + with dctx.write_to(buffer) as decompressor: + decompressor.write(compressed_data) + # Do something with raw data in ``buffer``. + +Dictionaries have unique integer IDs. You can retrieve this ID via:: + + dict_id = zstd.dictionary_id(dict_data) + +You can obtain the raw data in the dict (useful for persisting and constructing +a ``ZstdCompressionDict`` later) via ``as_bytes()``:: + + dict_data = zstd.train_dictionary(size, samples) + raw_data = dict_data.as_bytes() + +Explicit Compression Parameters +------------------------------- + +Zstandard's integer compression levels along with the input size and dictionary +size are converted into a data structure defining multiple parameters to tune +behavior of the compression algorithm. 
It is possible to define this +data structure explicitly to have lower-level control over compression behavior. + +The ``zstd.CompressionParameters`` type represents this data structure. +You can see how Zstandard converts compression levels to this data structure +by calling ``zstd.get_compression_parameters()``. e.g.:: + + params = zstd.get_compression_parameters(5) + +This function also accepts the uncompressed data size and dictionary size +to adjust parameters:: + + params = zstd.get_compression_parameters(3, source_size=len(data), dict_size=len(dict_data)) + +You can also construct compression parameters from their low-level components:: + + params = zstd.CompressionParameters(20, 6, 12, 5, 4, 10, zstd.STRATEGY_FAST) + +You can then configure a compressor to use the custom parameters:: + + cctx = zstd.ZstdCompressor(compression_params=params) + +The members of the ``CompressionParameters`` tuple are as follows: + +* 0 - Window log +* 1 - Chain log +* 2 - Hash log +* 3 - Search log +* 4 - Search length +* 5 - Target length +* 6 - Strategy (one of the ``zstd.STRATEGY_`` constants) + +You'll need to read the Zstandard documentation for what these parameters +do. + +Misc Functionality +------------------ + +estimate_compression_context_size(CompressionParameters) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Given a ``CompressionParameters`` struct, estimate the memory size required +to perform compression. + +estimate_decompression_context_size() +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Estimate the memory size requirements for a decompressor instance. + +Constants +--------- + +The following module constants/attributes are exposed: + +ZSTD_VERSION + This module attribute exposes a 3-tuple of the Zstandard version. e.g. 
+ ``(1, 0, 0)`` +MAX_COMPRESSION_LEVEL + Integer max compression level accepted by compression functions +COMPRESSION_RECOMMENDED_INPUT_SIZE + Recommended chunk size to feed to compressor functions +COMPRESSION_RECOMMENDED_OUTPUT_SIZE + Recommended chunk size for compression output +DECOMPRESSION_RECOMMENDED_INPUT_SIZE + Recommended chunk size to feed into decompressor functions +DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE + Recommended chunk size for decompression output + +FRAME_HEADER + bytes containing header of the Zstandard frame +MAGIC_NUMBER + Frame header as an integer + +WINDOWLOG_MIN + Minimum value for compression parameter +WINDOWLOG_MAX + Maximum value for compression parameter +CHAINLOG_MIN + Minimum value for compression parameter +CHAINLOG_MAX + Maximum value for compression parameter +HASHLOG_MIN + Minimum value for compression parameter +HASHLOG_MAX + Maximum value for compression parameter +SEARCHLOG_MIN + Minimum value for compression parameter +SEARCHLOG_MAX + Maximum value for compression parameter +SEARCHLENGTH_MIN + Minimum value for compression parameter +SEARCHLENGTH_MAX + Maximum value for compression parameter +TARGETLENGTH_MIN + Minimum value for compression parameter +TARGETLENGTH_MAX + Maximum value for compression parameter +STRATEGY_FAST + Compression strategy +STRATEGY_DFAST + Compression strategy +STRATEGY_GREEDY + Compression strategy +STRATEGY_LAZY + Compression strategy +STRATEGY_LAZY2 + Compression strategy +STRATEGY_BTLAZY2 + Compression strategy +STRATEGY_BTOPT + Compression strategy + +Note on Zstandard's *Experimental* API +====================================== + +Many of the Zstandard APIs used by this module are marked as *experimental* +within the Zstandard project. This includes a large number of useful +features, such as compression and frame parameters and parts of dictionary +compression. 
+ +It is unclear how Zstandard's C API will evolve over time, especially with +regards to this *experimental* functionality. We will try to maintain +backwards compatibility at the Python API level. However, we cannot +guarantee this for things not under our control. + +Since a copy of the Zstandard source code is distributed with this +module and since we compile against it, the behavior of a specific +version of this module should be constant for all of time. So if you +pin the version of this module used in your projects (which is a Python +best practice), you should be buffered from unwanted future changes. + +Donate +====== + +A lot of time has been invested into this project by the author. + +If you find this project useful and would like to thank the author for +their work, consider donating some money. Any amount is appreciated. + +.. image:: https://www.paypalobjects.com/en_US/i/btn/btn_donate_LG.gif + :target: https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=gregory%2eszorc%40gmail%2ecom&lc=US&item_name=python%2dzstandard¤cy_code=USD&bn=PP%2dDonationsBF%3abtn_donate_LG%2egif%3aNonHosted + :alt: Donate via PayPal + +.. |ci-status| image:: https://travis-ci.org/indygreg/python-zstandard.svg?branch=master + :target: https://travis-ci.org/indygreg/python-zstandard + +.. |win-ci-status| image:: https://ci.appveyor.com/api/projects/status/github/indygreg/python-zstandard?svg=true + :target: https://ci.appveyor.com/project/indygreg/python-zstandard + :alt: Windows build status diff --git a/contrib/python-zstandard/c-ext/compressiondict.c b/contrib/python-zstandard/c-ext/compressiondict.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/compressiondict.c @@ -0,0 +1,247 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. 
+*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) { + static char *kwlist[] = { "dict_size", "samples", "parameters", NULL }; + size_t capacity; + PyObject* samples; + Py_ssize_t samplesLen; + PyObject* parameters = NULL; + ZDICT_params_t zparams; + Py_ssize_t sampleIndex; + Py_ssize_t sampleSize; + PyObject* sampleItem; + size_t zresult; + void* sampleBuffer; + void* sampleOffset; + size_t samplesSize = 0; + size_t* sampleSizes; + void* dict; + ZstdCompressionDict* result; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|O!", kwlist, + &capacity, + &PyList_Type, &samples, + (PyObject*)&DictParametersType, ¶meters)) { + return NULL; + } + + /* Validate parameters first since it is easiest. */ + zparams.selectivityLevel = 0; + zparams.compressionLevel = 0; + zparams.notificationLevel = 0; + zparams.dictID = 0; + zparams.reserved[0] = 0; + zparams.reserved[1] = 0; + + if (parameters) { + /* TODO validate data ranges */ + zparams.selectivityLevel = PyLong_AsUnsignedLong(PyTuple_GetItem(parameters, 0)); + zparams.compressionLevel = PyLong_AsLong(PyTuple_GetItem(parameters, 1)); + zparams.notificationLevel = PyLong_AsUnsignedLong(PyTuple_GetItem(parameters, 2)); + zparams.dictID = PyLong_AsUnsignedLong(PyTuple_GetItem(parameters, 3)); + } + + /* Figure out the size of the raw samples */ + samplesLen = PyList_Size(samples); + for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) { + sampleItem = PyList_GetItem(samples, sampleIndex); + if (!PyBytes_Check(sampleItem)) { + PyErr_SetString(PyExc_ValueError, "samples must be bytes"); + /* TODO probably need to perform DECREF here */ + return NULL; + } + samplesSize += PyBytes_GET_SIZE(sampleItem); + } + + /* Now that we know the total size of the raw simples, we can allocate + a buffer for the raw data */ + sampleBuffer = malloc(samplesSize); + if (!sampleBuffer) { + PyErr_NoMemory(); + return NULL; + } 
+ sampleSizes = malloc(samplesLen * sizeof(size_t)); + if (!sampleSizes) { + free(sampleBuffer); + PyErr_NoMemory(); + return NULL; + } + + sampleOffset = sampleBuffer; + /* Now iterate again and assemble the samples in the buffer */ + for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) { + sampleItem = PyList_GetItem(samples, sampleIndex); + sampleSize = PyBytes_GET_SIZE(sampleItem); + sampleSizes[sampleIndex] = sampleSize; + memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize); + sampleOffset = (char*)sampleOffset + sampleSize; + } + + dict = malloc(capacity); + if (!dict) { + free(sampleSizes); + free(sampleBuffer); + PyErr_NoMemory(); + return NULL; + } + + zresult = ZDICT_trainFromBuffer_advanced(dict, capacity, + sampleBuffer, sampleSizes, (unsigned int)samplesLen, + zparams); + if (ZDICT_isError(zresult)) { + PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult)); + free(dict); + free(sampleSizes); + free(sampleBuffer); + return NULL; + } + + result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType); + if (!result) { + return NULL; + } + + result->dictData = dict; + result->dictSize = zresult; + return result; +} + + +PyDoc_STRVAR(ZstdCompressionDict__doc__, +"ZstdCompressionDict(data) - Represents a computed compression dictionary\n" +"\n" +"This type holds the results of a computed Zstandard compression dictionary.\n" +"Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n" +"obtained from another source into the constructor.\n" +); + +static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) { + const char* source; + Py_ssize_t sourceSize; + + self->dictData = NULL; + self->dictSize = 0; + +#if PY_MAJOR_VERSION >= 3 + if (!PyArg_ParseTuple(args, "y#", &source, &sourceSize)) { +#else + if (!PyArg_ParseTuple(args, "s#", &source, &sourceSize)) { +#endif + return -1; + } + + self->dictData = malloc(sourceSize); + if (!self->dictData) { + PyErr_NoMemory(); + 
return -1; + } + + memcpy(self->dictData, source, sourceSize); + self->dictSize = sourceSize; + + return 0; + } + +static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) { + if (self->dictData) { + free(self->dictData); + self->dictData = NULL; + } + + PyObject_Del(self); +} + +static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) { + unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize); + + return PyLong_FromLong(dictID); +} + +static PyObject* ZstdCompressionDict_as_bytes(ZstdCompressionDict* self) { + return PyBytes_FromStringAndSize(self->dictData, self->dictSize); +} + +static PyMethodDef ZstdCompressionDict_methods[] = { + { "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS, + PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") }, + { "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS, + PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") }, + { NULL, NULL } +}; + +static Py_ssize_t ZstdCompressionDict_length(ZstdCompressionDict* self) { + return self->dictSize; +} + +static PySequenceMethods ZstdCompressionDict_sq = { + (lenfunc)ZstdCompressionDict_length, /* sq_length */ + 0, /* sq_concat */ + 0, /* sq_repeat */ + 0, /* sq_item */ + 0, /* sq_ass_item */ + 0, /* sq_contains */ + 0, /* sq_inplace_concat */ + 0 /* sq_inplace_repeat */ +}; + +PyTypeObject ZstdCompressionDictType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdCompressionDict", /* tp_name */ + sizeof(ZstdCompressionDict), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)ZstdCompressionDict_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + &ZstdCompressionDict_sq, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, 
/* tp_flags */ + ZstdCompressionDict__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + ZstdCompressionDict_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)ZstdCompressionDict_init, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +void compressiondict_module_init(PyObject* mod) { + Py_TYPE(&ZstdCompressionDictType) = &PyType_Type; + if (PyType_Ready(&ZstdCompressionDictType) < 0) { + return; + } + + Py_INCREF((PyObject*)&ZstdCompressionDictType); + PyModule_AddObject(mod, "ZstdCompressionDict", + (PyObject*)&ZstdCompressionDictType); +} diff --git a/contrib/python-zstandard/c-ext/compressionparams.c b/contrib/python-zstandard/c-ext/compressionparams.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/compressionparams.c @@ -0,0 +1,226 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. 
+*/ + +#include "python-zstandard.h" + +void ztopy_compression_parameters(CompressionParametersObject* params, ZSTD_compressionParameters* zparams) { + zparams->windowLog = params->windowLog; + zparams->chainLog = params->chainLog; + zparams->hashLog = params->hashLog; + zparams->searchLog = params->searchLog; + zparams->searchLength = params->searchLength; + zparams->targetLength = params->targetLength; + zparams->strategy = params->strategy; +} + +CompressionParametersObject* get_compression_parameters(PyObject* self, PyObject* args) { + int compressionLevel; + unsigned PY_LONG_LONG sourceSize = 0; + Py_ssize_t dictSize = 0; + ZSTD_compressionParameters params; + CompressionParametersObject* result; + + if (!PyArg_ParseTuple(args, "i|Kn", &compressionLevel, &sourceSize, &dictSize)) { + return NULL; + } + + params = ZSTD_getCParams(compressionLevel, sourceSize, dictSize); + + result = PyObject_New(CompressionParametersObject, &CompressionParametersType); + if (!result) { + return NULL; + } + + result->windowLog = params.windowLog; + result->chainLog = params.chainLog; + result->hashLog = params.hashLog; + result->searchLog = params.searchLog; + result->searchLength = params.searchLength; + result->targetLength = params.targetLength; + result->strategy = params.strategy; + + return result; +} + +PyObject* estimate_compression_context_size(PyObject* self, PyObject* args) { + CompressionParametersObject* params; + ZSTD_compressionParameters zparams; + PyObject* result; + + if (!PyArg_ParseTuple(args, "O!", &CompressionParametersType, ¶ms)) { + return NULL; + } + + ztopy_compression_parameters(params, &zparams); + result = PyLong_FromSize_t(ZSTD_estimateCCtxSize(zparams)); + return result; +} + +PyDoc_STRVAR(CompressionParameters__doc__, +"CompressionParameters: low-level control over zstd compression"); + +static PyObject* CompressionParameters_new(PyTypeObject* subtype, PyObject* args, PyObject* kwargs) { + CompressionParametersObject* self; + unsigned windowLog; + 
unsigned chainLog; + unsigned hashLog; + unsigned searchLog; + unsigned searchLength; + unsigned targetLength; + unsigned strategy; + + if (!PyArg_ParseTuple(args, "IIIIIII", &windowLog, &chainLog, &hashLog, &searchLog, + &searchLength, &targetLength, &strategy)) { + return NULL; + } + + if (windowLog < ZSTD_WINDOWLOG_MIN || windowLog > ZSTD_WINDOWLOG_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid window log value"); + return NULL; + } + + if (chainLog < ZSTD_CHAINLOG_MIN || chainLog > ZSTD_CHAINLOG_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid chain log value"); + return NULL; + } + + if (hashLog < ZSTD_HASHLOG_MIN || hashLog > ZSTD_HASHLOG_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid hash log value"); + return NULL; + } + + if (searchLog < ZSTD_SEARCHLOG_MIN || searchLog > ZSTD_SEARCHLOG_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid search log value"); + return NULL; + } + + if (searchLength < ZSTD_SEARCHLENGTH_MIN || searchLength > ZSTD_SEARCHLENGTH_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid search length value"); + return NULL; + } + + if (targetLength < ZSTD_TARGETLENGTH_MIN || targetLength > ZSTD_TARGETLENGTH_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid target length value"); + return NULL; + } + + if (strategy < ZSTD_fast || strategy > ZSTD_btopt) { + PyErr_SetString(PyExc_ValueError, "invalid strategy value"); + return NULL; + } + + self = (CompressionParametersObject*)subtype->tp_alloc(subtype, 1); + if (!self) { + return NULL; + } + + self->windowLog = windowLog; + self->chainLog = chainLog; + self->hashLog = hashLog; + self->searchLog = searchLog; + self->searchLength = searchLength; + self->targetLength = targetLength; + self->strategy = strategy; + + return (PyObject*)self; +} + +static void CompressionParameters_dealloc(PyObject* self) { + PyObject_Del(self); +} + +static Py_ssize_t CompressionParameters_length(PyObject* self) { + return 7; +}; + +static PyObject* CompressionParameters_item(PyObject* o, 
Py_ssize_t i) { + CompressionParametersObject* self = (CompressionParametersObject*)o; + + switch (i) { + case 0: + return PyLong_FromLong(self->windowLog); + case 1: + return PyLong_FromLong(self->chainLog); + case 2: + return PyLong_FromLong(self->hashLog); + case 3: + return PyLong_FromLong(self->searchLog); + case 4: + return PyLong_FromLong(self->searchLength); + case 5: + return PyLong_FromLong(self->targetLength); + case 6: + return PyLong_FromLong(self->strategy); + default: + PyErr_SetString(PyExc_IndexError, "index out of range"); + return NULL; + } +} + +static PySequenceMethods CompressionParameters_sq = { + CompressionParameters_length, /* sq_length */ + 0, /* sq_concat */ + 0, /* sq_repeat */ + CompressionParameters_item, /* sq_item */ + 0, /* sq_ass_item */ + 0, /* sq_contains */ + 0, /* sq_inplace_concat */ + 0 /* sq_inplace_repeat */ +}; + +PyTypeObject CompressionParametersType = { + PyVarObject_HEAD_INIT(NULL, 0) + "CompressionParameters", /* tp_name */ + sizeof(CompressionParametersObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)CompressionParameters_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + &CompressionParameters_sq, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + CompressionParameters__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + CompressionParameters_new, /* tp_new */ +}; + +void compressionparams_module_init(PyObject* mod) { + 
Py_TYPE(&CompressionParametersType) = &PyType_Type; + if (PyType_Ready(&CompressionParametersType) < 0) { + return; + } + + Py_IncRef((PyObject*)&CompressionParametersType); + PyModule_AddObject(mod, "CompressionParameters", + (PyObject*)&CompressionParametersType); +} diff --git a/contrib/python-zstandard/c-ext/compressionwriter.c b/contrib/python-zstandard/c-ext/compressionwriter.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/compressionwriter.c @@ -0,0 +1,235 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +PyDoc_STRVAR(ZstdCompresssionWriter__doc__, +"""A context manager used for writing compressed output to a writer.\n" +); + +static void ZstdCompressionWriter_dealloc(ZstdCompressionWriter* self) { + Py_XDECREF(self->compressor); + Py_XDECREF(self->writer); + + if (self->cstream) { + ZSTD_freeCStream(self->cstream); + self->cstream = NULL; + } + + PyObject_Del(self); +} + +static PyObject* ZstdCompressionWriter_enter(ZstdCompressionWriter* self) { + if (self->entered) { + PyErr_SetString(ZstdError, "cannot __enter__ multiple times"); + return NULL; + } + + self->cstream = CStream_from_ZstdCompressor(self->compressor, self->sourceSize); + if (!self->cstream) { + return NULL; + } + + self->entered = 1; + + Py_INCREF(self); + return (PyObject*)self; +} + +static PyObject* ZstdCompressionWriter_exit(ZstdCompressionWriter* self, PyObject* args) { + PyObject* exc_type; + PyObject* exc_value; + PyObject* exc_tb; + size_t zresult; + + ZSTD_outBuffer output; + PyObject* res; + + if (!PyArg_ParseTuple(args, "OOO", &exc_type, &exc_value, &exc_tb)) { + return NULL; + } + + self->entered = 0; + + if (self->cstream && exc_type == Py_None && exc_value == Py_None && + exc_tb == Py_None) { + + output.dst = malloc(self->outSize); + 
if (!output.dst) { + return PyErr_NoMemory(); + } + output.size = self->outSize; + output.pos = 0; + + while (1) { + zresult = ZSTD_endStream(self->cstream, &output); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error ending compression stream: %s", + ZSTD_getErrorName(zresult)); + free(output.dst); + return NULL; + } + + if (output.pos) { +#if PY_MAJOR_VERSION >= 3 + res = PyObject_CallMethod(self->writer, "write", "y#", +#else + res = PyObject_CallMethod(self->writer, "write", "s#", +#endif + output.dst, output.pos); + Py_XDECREF(res); + } + + if (!zresult) { + break; + } + + output.pos = 0; + } + + free(output.dst); + ZSTD_freeCStream(self->cstream); + self->cstream = NULL; + } + + Py_RETURN_FALSE; +} + +static PyObject* ZstdCompressionWriter_memory_size(ZstdCompressionWriter* self) { + if (!self->cstream) { + PyErr_SetString(ZstdError, "cannot determine size of an inactive compressor; " + "call when a context manager is active"); + return NULL; + } + + return PyLong_FromSize_t(ZSTD_sizeof_CStream(self->cstream)); +} + +static PyObject* ZstdCompressionWriter_write(ZstdCompressionWriter* self, PyObject* args) { + const char* source; + Py_ssize_t sourceSize; + size_t zresult; + ZSTD_inBuffer input; + ZSTD_outBuffer output; + PyObject* res; + +#if PY_MAJOR_VERSION >= 3 + if (!PyArg_ParseTuple(args, "y#", &source, &sourceSize)) { +#else + if (!PyArg_ParseTuple(args, "s#", &source, &sourceSize)) { +#endif + return NULL; + } + + if (!self->entered) { + PyErr_SetString(ZstdError, "compress must be called from an active context manager"); + return NULL; + } + + output.dst = malloc(self->outSize); + if (!output.dst) { + return PyErr_NoMemory(); + } + output.size = self->outSize; + output.pos = 0; + + input.src = source; + input.size = sourceSize; + input.pos = 0; + + while ((ssize_t)input.pos < sourceSize) { + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_compressStream(self->cstream, &output, &input); + Py_END_ALLOW_THREADS + + if (ZSTD_isError(zresult)) { + 
free(output.dst); + PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult)); + return NULL; + } + + /* Copy data from output buffer to writer. */ + if (output.pos) { +#if PY_MAJOR_VERSION >= 3 + res = PyObject_CallMethod(self->writer, "write", "y#", +#else + res = PyObject_CallMethod(self->writer, "write", "s#", +#endif + output.dst, output.pos); + Py_XDECREF(res); + } + output.pos = 0; + } + + free(output.dst); + + /* TODO return bytes written */ + Py_RETURN_NONE; + } + +static PyMethodDef ZstdCompressionWriter_methods[] = { + { "__enter__", (PyCFunction)ZstdCompressionWriter_enter, METH_NOARGS, + PyDoc_STR("Enter a compression context.") }, + { "__exit__", (PyCFunction)ZstdCompressionWriter_exit, METH_VARARGS, + PyDoc_STR("Exit a compression context.") }, + { "memory_size", (PyCFunction)ZstdCompressionWriter_memory_size, METH_NOARGS, + PyDoc_STR("Obtain the memory size of the underlying compressor") }, + { "write", (PyCFunction)ZstdCompressionWriter_write, METH_VARARGS, + PyDoc_STR("Compress data") }, + { NULL, NULL } +}; + +PyTypeObject ZstdCompressionWriterType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdCompressionWriter", /* tp_name */ + sizeof(ZstdCompressionWriter), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)ZstdCompressionWriter_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + ZstdCompresssionWriter__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + ZstdCompressionWriter_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* 
tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +void compressionwriter_module_init(PyObject* mod) { + Py_TYPE(&ZstdCompressionWriterType) = &PyType_Type; + if (PyType_Ready(&ZstdCompressionWriterType) < 0) { + return; + } +} diff --git a/contrib/python-zstandard/c-ext/compressobj.c b/contrib/python-zstandard/c-ext/compressobj.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/compressobj.c @@ -0,0 +1,205 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +PyDoc_STRVAR(ZstdCompressionObj__doc__, +"Perform compression using a standard library compatible API.\n" +); + +static void ZstdCompressionObj_dealloc(ZstdCompressionObj* self) { + PyMem_Free(self->output.dst); + self->output.dst = NULL; + + if (self->cstream) { + ZSTD_freeCStream(self->cstream); + self->cstream = NULL; + } + + Py_XDECREF(self->compressor); + + PyObject_Del(self); +} + +static PyObject* ZstdCompressionObj_compress(ZstdCompressionObj* self, PyObject* args) { + const char* source; + Py_ssize_t sourceSize; + ZSTD_inBuffer input; + size_t zresult; + PyObject* result = NULL; + Py_ssize_t resultSize = 0; + + if (self->flushed) { + PyErr_SetString(ZstdError, "cannot call compress() after flush() has been called"); + return NULL; + } + +#if PY_MAJOR_VERSION >= 3 + if (!PyArg_ParseTuple(args, "y#", &source, &sourceSize)) { +#else + if (!PyArg_ParseTuple(args, "s#", &source, &sourceSize)) { +#endif + return NULL; + } + + input.src = source; + input.size = sourceSize; + input.pos = 0; + + while ((ssize_t)input.pos < sourceSize) { + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_compressStream(self->cstream, &self->output, &input); + Py_END_ALLOW_THREADS + + if 
(ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult)); + return NULL; + } + + if (self->output.pos) { + if (result) { + resultSize = PyBytes_GET_SIZE(result); + if (-1 == _PyBytes_Resize(&result, resultSize + self->output.pos)) { + return NULL; + } + + memcpy(PyBytes_AS_STRING(result) + resultSize, + self->output.dst, self->output.pos); + } + else { + result = PyBytes_FromStringAndSize(self->output.dst, self->output.pos); + if (!result) { + return NULL; + } + } + + self->output.pos = 0; + } + } + + if (result) { + return result; + } + else { + return PyBytes_FromString(""); + } +} + +static PyObject* ZstdCompressionObj_flush(ZstdCompressionObj* self) { + size_t zresult; + PyObject* result = NULL; + Py_ssize_t resultSize = 0; + + if (self->flushed) { + PyErr_SetString(ZstdError, "flush() already called"); + return NULL; + } + + self->flushed = 1; + + while (1) { + zresult = ZSTD_endStream(self->cstream, &self->output); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error ending compression stream: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + + if (self->output.pos) { + if (result) { + resultSize = PyBytes_GET_SIZE(result); + if (-1 == _PyBytes_Resize(&result, resultSize + self->output.pos)) { + return NULL; + } + + memcpy(PyBytes_AS_STRING(result) + resultSize, + self->output.dst, self->output.pos); + } + else { + result = PyBytes_FromStringAndSize(self->output.dst, self->output.pos); + if (!result) { + return NULL; + } + } + + self->output.pos = 0; + } + + if (!zresult) { + break; + } + } + + ZSTD_freeCStream(self->cstream); + self->cstream = NULL; + + if (result) { + return result; + } + else { + return PyBytes_FromString(""); + } +} + +static PyMethodDef ZstdCompressionObj_methods[] = { + { "compress", (PyCFunction)ZstdCompressionObj_compress, METH_VARARGS, + PyDoc_STR("compress data") }, + { "flush", (PyCFunction)ZstdCompressionObj_flush, METH_NOARGS, + PyDoc_STR("finish compression 
operation") }, + { NULL, NULL } +}; + +PyTypeObject ZstdCompressionObjType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdCompressionObj", /* tp_name */ + sizeof(ZstdCompressionObj), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)ZstdCompressionObj_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + ZstdCompressionObj__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + ZstdCompressionObj_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +void compressobj_module_init(PyObject* module) { + Py_TYPE(&ZstdCompressionObjType) = &PyType_Type; + if (PyType_Ready(&ZstdCompressionObjType) < 0) { + return; + } +} diff --git a/contrib/python-zstandard/c-ext/compressor.c b/contrib/python-zstandard/c-ext/compressor.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/compressor.c @@ -0,0 +1,757 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +/** +* Initialize a zstd CStream from a ZstdCompressor instance. +* +* Returns a ZSTD_CStream on success or NULL on failure. If NULL, a Python +* exception will be set. 
+*/ +ZSTD_CStream* CStream_from_ZstdCompressor(ZstdCompressor* compressor, Py_ssize_t sourceSize) { + ZSTD_CStream* cstream; + ZSTD_parameters zparams; + void* dictData = NULL; + size_t dictSize = 0; + size_t zresult; + + cstream = ZSTD_createCStream(); + if (!cstream) { + PyErr_SetString(ZstdError, "cannot create CStream"); + return NULL; + } + + if (compressor->dict) { + dictData = compressor->dict->dictData; + dictSize = compressor->dict->dictSize; + } + + memset(&zparams, 0, sizeof(zparams)); + if (compressor->cparams) { + ztopy_compression_parameters(compressor->cparams, &zparams.cParams); + /* Do NOT call ZSTD_adjustCParams() here because the compression params + come from the user. */ + } + else { + zparams.cParams = ZSTD_getCParams(compressor->compressionLevel, sourceSize, dictSize); + } + + zparams.fParams = compressor->fparams; + + zresult = ZSTD_initCStream_advanced(cstream, dictData, dictSize, zparams, sourceSize); + + if (ZSTD_isError(zresult)) { + ZSTD_freeCStream(cstream); + PyErr_Format(ZstdError, "cannot init CStream: %s", ZSTD_getErrorName(zresult)); + return NULL; + } + + return cstream; +} + + +PyDoc_STRVAR(ZstdCompressor__doc__, +"ZstdCompressor(level=None, dict_data=None, compression_params=None)\n" +"\n" +"Create an object used to perform Zstandard compression.\n" +"\n" +"An instance can compress data various ways. Instances can be used multiple\n" +"times. Each compression operation will use the compression parameters\n" +"defined at construction time.\n" +"\n" +"Compression can be configured via the following names arguments:\n" +"\n" +"level\n" +" Integer compression level.\n" +"dict_data\n" +" A ``ZstdCompressionDict`` to be used to compress with dictionary data.\n" +"compression_params\n" +" A ``CompressionParameters`` instance defining low-level compression" +" parameters. 
If defined, this will overwrite the ``level`` argument.\n" +"write_checksum\n" +" If True, a 4 byte content checksum will be written with the compressed\n" +" data, allowing the decompressor to perform content verification.\n" +"write_content_size\n" +" If True, the decompressed content size will be included in the header of\n" +" the compressed data. This data will only be written if the compressor\n" +" knows the size of the input data.\n" +"write_dict_id\n" +" Determines whether the dictionary ID will be written into the compressed\n" +" data. Defaults to True. Only adds content to the compressed data if\n" +" a dictionary is being used.\n" +); + +static int ZstdCompressor_init(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "level", + "dict_data", + "compression_params", + "write_checksum", + "write_content_size", + "write_dict_id", + NULL + }; + + int level = 3; + ZstdCompressionDict* dict = NULL; + CompressionParametersObject* params = NULL; + PyObject* writeChecksum = NULL; + PyObject* writeContentSize = NULL; + PyObject* writeDictID = NULL; + + self->dict = NULL; + self->cparams = NULL; + self->cdict = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!O!OOO", kwlist, + &level, &ZstdCompressionDictType, &dict, + &CompressionParametersType, ¶ms, + &writeChecksum, &writeContentSize, &writeDictID)) { + return -1; + } + + if (level < 1) { + PyErr_SetString(PyExc_ValueError, "level must be greater than 0"); + return -1; + } + + if (level > ZSTD_maxCLevel()) { + PyErr_Format(PyExc_ValueError, "level must be less than %d", + ZSTD_maxCLevel() + 1); + return -1; + } + + self->compressionLevel = level; + + if (dict) { + self->dict = dict; + Py_INCREF(dict); + } + + if (params) { + self->cparams = params; + Py_INCREF(params); + } + + memset(&self->fparams, 0, sizeof(self->fparams)); + + if (writeChecksum && PyObject_IsTrue(writeChecksum)) { + self->fparams.checksumFlag = 1; + } + if (writeContentSize && 
PyObject_IsTrue(writeContentSize)) { + self->fparams.contentSizeFlag = 1; + } + if (writeDictID && PyObject_Not(writeDictID)) { + self->fparams.noDictIDFlag = 1; + } + + return 0; +} + +static void ZstdCompressor_dealloc(ZstdCompressor* self) { + Py_XDECREF(self->cparams); + Py_XDECREF(self->dict); + + if (self->cdict) { + ZSTD_freeCDict(self->cdict); + self->cdict = NULL; + } + + PyObject_Del(self); +} + +PyDoc_STRVAR(ZstdCompressor_copy_stream__doc__, +"copy_stream(ifh, ofh[, size=0, read_size=default, write_size=default])\n" +"compress data between streams\n" +"\n" +"Data will be read from ``ifh``, compressed, and written to ``ofh``.\n" +"``ifh`` must have a ``read(size)`` method. ``ofh`` must have a ``write(data)``\n" +"method.\n" +"\n" +"An optional ``size`` argument specifies the size of the source stream.\n" +"If defined, compression parameters will be tuned based on the size.\n" +"\n" +"Optional arguments ``read_size`` and ``write_size`` define the chunk sizes\n" +"of ``read()`` and ``write()`` operations, respectively. 
By default, they use\n" +"the default compression stream input and output sizes, respectively.\n" +); + +static PyObject* ZstdCompressor_copy_stream(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "ifh", + "ofh", + "size", + "read_size", + "write_size", + NULL + }; + + PyObject* source; + PyObject* dest; + Py_ssize_t sourceSize = 0; + size_t inSize = ZSTD_CStreamInSize(); + size_t outSize = ZSTD_CStreamOutSize(); + ZSTD_CStream* cstream; + ZSTD_inBuffer input; + ZSTD_outBuffer output; + Py_ssize_t totalRead = 0; + Py_ssize_t totalWrite = 0; + char* readBuffer; + Py_ssize_t readSize; + PyObject* readResult; + PyObject* res = NULL; + size_t zresult; + PyObject* writeResult; + PyObject* totalReadPy; + PyObject* totalWritePy; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nkk", kwlist, &source, &dest, &sourceSize, + &inSize, &outSize)) { + return NULL; + } + + if (!PyObject_HasAttrString(source, "read")) { + PyErr_SetString(PyExc_ValueError, "first argument must have a read() method"); + return NULL; + } + + if (!PyObject_HasAttrString(dest, "write")) { + PyErr_SetString(PyExc_ValueError, "second argument must have a write() method"); + return NULL; + } + + cstream = CStream_from_ZstdCompressor(self, sourceSize); + if (!cstream) { + res = NULL; + goto finally; + } + + output.dst = PyMem_Malloc(outSize); + if (!output.dst) { + PyErr_NoMemory(); + res = NULL; + goto finally; + } + output.size = outSize; + output.pos = 0; + + while (1) { + /* Try to read from source stream. */ + readResult = PyObject_CallMethod(source, "read", "n", inSize); + if (!readResult) { + PyErr_SetString(ZstdError, "could not read() from source"); + goto finally; + } + + PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize); + + /* If no data was read, we're at EOF. 
*/ + if (0 == readSize) { + break; + } + + totalRead += readSize; + + /* Send data to compressor */ + input.src = readBuffer; + input.size = readSize; + input.pos = 0; + + while (input.pos < input.size) { + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_compressStream(cstream, &output, &input); + Py_END_ALLOW_THREADS + + if (ZSTD_isError(zresult)) { + res = NULL; + PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult)); + goto finally; + } + + if (output.pos) { +#if PY_MAJOR_VERSION >= 3 + writeResult = PyObject_CallMethod(dest, "write", "y#", +#else + writeResult = PyObject_CallMethod(dest, "write", "s#", +#endif + output.dst, output.pos); + Py_XDECREF(writeResult); + totalWrite += output.pos; + output.pos = 0; + } + } + } + + /* We've finished reading. Now flush the compressor stream. */ + while (1) { + zresult = ZSTD_endStream(cstream, &output); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error ending compression stream: %s", + ZSTD_getErrorName(zresult)); + res = NULL; + goto finally; + } + + if (output.pos) { +#if PY_MAJOR_VERSION >= 3 + writeResult = PyObject_CallMethod(dest, "write", "y#", +#else + writeResult = PyObject_CallMethod(dest, "write", "s#", +#endif + output.dst, output.pos); + totalWrite += output.pos; + Py_XDECREF(writeResult); + output.pos = 0; + } + + if (!zresult) { + break; + } + } + + ZSTD_freeCStream(cstream); + cstream = NULL; + + totalReadPy = PyLong_FromSsize_t(totalRead); + totalWritePy = PyLong_FromSsize_t(totalWrite); + res = PyTuple_Pack(2, totalReadPy, totalWritePy); + Py_DecRef(totalReadPy); + Py_DecRef(totalWritePy); + +finally: + if (output.dst) { + PyMem_Free(output.dst); + } + + if (cstream) { + ZSTD_freeCStream(cstream); + } + + return res; +} + +PyDoc_STRVAR(ZstdCompressor_compress__doc__, +"compress(data)\n" +"\n" +"Compress data in a single operation.\n" +"\n" +"This is the simplest mechanism to perform compression: simply pass in a\n" +"value and get a compressed value back. 
It is almost the most prone to abuse.\n" +"The input and output values must fit in memory, so passing in very large\n" +"values can result in excessive memory usage. For this reason, one of the\n" +"streaming based APIs is preferred for larger values.\n" +); + +static PyObject* ZstdCompressor_compress(ZstdCompressor* self, PyObject* args) { + const char* source; + Py_ssize_t sourceSize; + size_t destSize; + ZSTD_CCtx* cctx; + PyObject* output; + char* dest; + void* dictData = NULL; + size_t dictSize = 0; + size_t zresult; + ZSTD_parameters zparams; + ZSTD_customMem zmem; + +#if PY_MAJOR_VERSION >= 3 + if (!PyArg_ParseTuple(args, "y#", &source, &sourceSize)) { +#else + if (!PyArg_ParseTuple(args, "s#", &source, &sourceSize)) { +#endif + return NULL; + } + + destSize = ZSTD_compressBound(sourceSize); + output = PyBytes_FromStringAndSize(NULL, destSize); + if (!output) { + return NULL; + } + + dest = PyBytes_AsString(output); + + cctx = ZSTD_createCCtx(); + if (!cctx) { + Py_DECREF(output); + PyErr_SetString(ZstdError, "could not create CCtx"); + return NULL; + } + + if (self->dict) { + dictData = self->dict->dictData; + dictSize = self->dict->dictSize; + } + + memset(&zparams, 0, sizeof(zparams)); + if (!self->cparams) { + zparams.cParams = ZSTD_getCParams(self->compressionLevel, sourceSize, dictSize); + } + else { + ztopy_compression_parameters(self->cparams, &zparams.cParams); + /* Do NOT call ZSTD_adjustCParams() here because the compression params + come from the user. */ + } + + zparams.fParams = self->fparams; + + /* The raw dict data has to be processed before it can be used. Since this + adds overhead - especially if multiple dictionary compression operations + are performed on the same ZstdCompressor instance - we create a + ZSTD_CDict once and reuse it for all operations. */ + + /* TODO the zparams (which can be derived from the source data size) used + on first invocation are effectively reused for subsequent operations. 
This + may not be appropriate if input sizes vary significantly and could affect + chosen compression parameters. + https://github.com/facebook/zstd/issues/358 tracks this issue. */ + if (dictData && !self->cdict) { + Py_BEGIN_ALLOW_THREADS + memset(&zmem, 0, sizeof(zmem)); + self->cdict = ZSTD_createCDict_advanced(dictData, dictSize, zparams, zmem); + Py_END_ALLOW_THREADS + + if (!self->cdict) { + Py_DECREF(output); + ZSTD_freeCCtx(cctx); + PyErr_SetString(ZstdError, "could not create compression dictionary"); + return NULL; + } + } + + Py_BEGIN_ALLOW_THREADS + /* By avoiding ZSTD_compress(), we don't necessarily write out content + size. This means the argument to ZstdCompressor to control frame + parameters is honored. */ + if (self->cdict) { + zresult = ZSTD_compress_usingCDict(cctx, dest, destSize, + source, sourceSize, self->cdict); + } + else { + zresult = ZSTD_compress_advanced(cctx, dest, destSize, + source, sourceSize, dictData, dictSize, zparams); + } + Py_END_ALLOW_THREADS + + ZSTD_freeCCtx(cctx); + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "cannot compress: %s", ZSTD_getErrorName(zresult)); + Py_CLEAR(output); + return NULL; + } + else { + Py_SIZE(output) = zresult; + } + + return output; +} + +PyDoc_STRVAR(ZstdCompressionObj__doc__, +"compressobj()\n" +"\n" +"Return an object exposing ``compress(data)`` and ``flush()`` methods.\n" +"\n" +"The returned object exposes an API similar to ``zlib.compressobj`` and\n" +"``bz2.BZ2Compressor`` so that callers can swap in the zstd compressor\n" +"without changing how compression is performed.\n" +); + +static ZstdCompressionObj* ZstdCompressor_compressobj(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "size", + NULL + }; + + Py_ssize_t inSize = 0; + size_t outSize = ZSTD_CStreamOutSize(); + ZstdCompressionObj* result = PyObject_New(ZstdCompressionObj, &ZstdCompressionObjType); + if (!result) { + return NULL; + } + + if (!PyArg_ParseTupleAndKeywords(args, 
kwargs, "|n", kwlist, &inSize)) { + return NULL; + } + + result->cstream = CStream_from_ZstdCompressor(self, inSize); + if (!result->cstream) { + Py_DECREF(result); + return NULL; + } + + result->output.dst = PyMem_Malloc(outSize); + if (!result->output.dst) { + PyErr_NoMemory(); + Py_DECREF(result); + return NULL; + } + result->output.size = outSize; + result->output.pos = 0; + + result->compressor = self; + Py_INCREF(result->compressor); + + result->flushed = 0; + + return result; +} + +PyDoc_STRVAR(ZstdCompressor_read_from__doc__, +"read_from(reader[, size=0, read_size=default, write_size=default])\n" +"Read uncompressed data from a reader and return an iterator\n" +"\n" +"Returns an iterator of compressed data produced from reading from ``reader``.\n" +"\n" +"Uncompressed data will be obtained from ``reader`` by calling the\n" +"``read(size)`` method of it. The source data will be streamed into a\n" +"compressor. As compressed data is available, it will be exposed to the\n" +"iterator.\n" +"\n" +"Data is read from the source in chunks of ``read_size``. Compressed chunks\n" +"are at most ``write_size`` bytes. Both values default to the zstd input and\n" +"output defaults, respectively.\n" +"\n" +"The caller is partially in control of how fast data is fed into the\n" +"compressor by how it consumes the returned iterator. 
The compressor will\n" +"not consume from the reader unless the caller consumes from the iterator.\n" +); + +static ZstdCompressorIterator* ZstdCompressor_read_from(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "reader", + "size", + "read_size", + "write_size", + NULL + }; + + PyObject* reader; + Py_ssize_t sourceSize = 0; + size_t inSize = ZSTD_CStreamInSize(); + size_t outSize = ZSTD_CStreamOutSize(); + ZstdCompressorIterator* result; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nkk", kwlist, &reader, &sourceSize, + &inSize, &outSize)) { + return NULL; + } + + result = PyObject_New(ZstdCompressorIterator, &ZstdCompressorIteratorType); + if (!result) { + return NULL; + } + + result->compressor = NULL; + result->reader = NULL; + result->buffer = NULL; + result->cstream = NULL; + result->input.src = NULL; + result->output.dst = NULL; + result->readResult = NULL; + + if (PyObject_HasAttrString(reader, "read")) { + result->reader = reader; + Py_INCREF(result->reader); + } + else if (1 == PyObject_CheckBuffer(reader)) { + result->buffer = PyMem_Malloc(sizeof(Py_buffer)); + if (!result->buffer) { + goto except; + } + + memset(result->buffer, 0, sizeof(Py_buffer)); + + if (0 != PyObject_GetBuffer(reader, result->buffer, PyBUF_CONTIG_RO)) { + goto except; + } + + result->bufferOffset = 0; + sourceSize = result->buffer->len; + } + else { + PyErr_SetString(PyExc_ValueError, + "must pass an object with a read() method or conforms to buffer protocol"); + goto except; + } + + result->compressor = self; + Py_INCREF(result->compressor); + + result->sourceSize = sourceSize; + result->cstream = CStream_from_ZstdCompressor(self, sourceSize); + if (!result->cstream) { + goto except; + } + + result->inSize = inSize; + result->outSize = outSize; + + result->output.dst = PyMem_Malloc(outSize); + if (!result->output.dst) { + PyErr_NoMemory(); + goto except; + } + result->output.size = outSize; + result->output.pos = 0; + + 
result->input.src = NULL; + result->input.size = 0; + result->input.pos = 0; + + result->finishedInput = 0; + result->finishedOutput = 0; + + goto finally; + +except: + if (result->cstream) { + ZSTD_freeCStream(result->cstream); + result->cstream = NULL; + } + + Py_DecRef((PyObject*)result->compressor); + Py_DecRef(result->reader); + + Py_DECREF(result); + result = NULL; + +finally: + return result; +} + +PyDoc_STRVAR(ZstdCompressor_write_to___doc__, +"Create a context manager to write compressed data to an object.\n" +"\n" +"The passed object must have a ``write()`` method.\n" +"\n" +"The caller feeds input data to the object by calling ``compress(data)``.\n" +"Compressed data is written to the argument given to this function.\n" +"\n" +"The function takes an optional ``size`` argument indicating the total size\n" +"of the eventual input. If specified, the size will influence compression\n" +"parameter tuning and could result in the size being written into the\n" +"header of the compressed data.\n" +"\n" +"An optional ``write_size`` argument is also accepted. It defines the maximum\n" +"byte size of chunks fed to ``write()``. 
By default, it uses the zstd default\n" +"for a compressor output stream.\n" +); + +static ZstdCompressionWriter* ZstdCompressor_write_to(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "writer", + "size", + "write_size", + NULL + }; + + PyObject* writer; + ZstdCompressionWriter* result; + Py_ssize_t sourceSize = 0; + size_t outSize = ZSTD_CStreamOutSize(); + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nk", kwlist, &writer, &sourceSize, + &outSize)) { + return NULL; + } + + if (!PyObject_HasAttrString(writer, "write")) { + PyErr_SetString(PyExc_ValueError, "must pass an object with a write() method"); + return NULL; + } + + result = PyObject_New(ZstdCompressionWriter, &ZstdCompressionWriterType); + if (!result) { + return NULL; + } + + result->compressor = self; + Py_INCREF(result->compressor); + + result->writer = writer; + Py_INCREF(result->writer); + + result->sourceSize = sourceSize; + + result->outSize = outSize; + + result->entered = 0; + result->cstream = NULL; + + return result; +} + +static PyMethodDef ZstdCompressor_methods[] = { + { "compress", (PyCFunction)ZstdCompressor_compress, METH_VARARGS, + ZstdCompressor_compress__doc__ }, + { "compressobj", (PyCFunction)ZstdCompressor_compressobj, + METH_VARARGS | METH_KEYWORDS, ZstdCompressionObj__doc__ }, + { "copy_stream", (PyCFunction)ZstdCompressor_copy_stream, + METH_VARARGS | METH_KEYWORDS, ZstdCompressor_copy_stream__doc__ }, + { "read_from", (PyCFunction)ZstdCompressor_read_from, + METH_VARARGS | METH_KEYWORDS, ZstdCompressor_read_from__doc__ }, + { "write_to", (PyCFunction)ZstdCompressor_write_to, + METH_VARARGS | METH_KEYWORDS, ZstdCompressor_write_to___doc__ }, + { NULL, NULL } +}; + +PyTypeObject ZstdCompressorType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdCompressor", /* tp_name */ + sizeof(ZstdCompressor), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)ZstdCompressor_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ 
+ 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + ZstdCompressor__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + ZstdCompressor_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)ZstdCompressor_init, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +void compressor_module_init(PyObject* mod) { + Py_TYPE(&ZstdCompressorType) = &PyType_Type; + if (PyType_Ready(&ZstdCompressorType) < 0) { + return; + } + + Py_INCREF((PyObject*)&ZstdCompressorType); + PyModule_AddObject(mod, "ZstdCompressor", + (PyObject*)&ZstdCompressorType); +} diff --git a/contrib/python-zstandard/c-ext/compressoriterator.c b/contrib/python-zstandard/c-ext/compressoriterator.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/compressoriterator.c @@ -0,0 +1,234 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +extern PyObject* ZstdError; + +PyDoc_STRVAR(ZstdCompressorIterator__doc__, +"Represents an iterator of compressed data.\n" +); + +static void ZstdCompressorIterator_dealloc(ZstdCompressorIterator* self) { + Py_XDECREF(self->readResult); + Py_XDECREF(self->compressor); + Py_XDECREF(self->reader); + + if (self->buffer) { + PyBuffer_Release(self->buffer); + PyMem_FREE(self->buffer); + self->buffer = NULL; + } + + if (self->cstream) { + ZSTD_freeCStream(self->cstream); + self->cstream = NULL; + } + + if (self->output.dst) { + PyMem_Free(self->output.dst); + self->output.dst = NULL; + } + + PyObject_Del(self); +} + +static PyObject* ZstdCompressorIterator_iter(PyObject* self) { + Py_INCREF(self); + return self; +} + +static PyObject* ZstdCompressorIterator_iternext(ZstdCompressorIterator* self) { + size_t zresult; + PyObject* readResult = NULL; + PyObject* chunk; + char* readBuffer; + Py_ssize_t readSize = 0; + Py_ssize_t bufferRemaining; + + if (self->finishedOutput) { + PyErr_SetString(PyExc_StopIteration, "output flushed"); + return NULL; + } + +feedcompressor: + + /* If we have data left in the input, consume it. */ + if (self->input.pos < self->input.size) { + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_compressStream(self->cstream, &self->output, &self->input); + Py_END_ALLOW_THREADS + + /* Release the Python object holding the input buffer. */ + if (self->input.pos == self->input.size) { + self->input.src = NULL; + self->input.pos = 0; + self->input.size = 0; + Py_DECREF(self->readResult); + self->readResult = NULL; + } + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult)); + return NULL; + } + + /* If it produced output data, emit it. */ + if (self->output.pos) { + chunk = PyBytes_FromStringAndSize(self->output.dst, self->output.pos); + self->output.pos = 0; + return chunk; + } + } + + /* We should never have output data sitting around after a previous call. 
*/ + assert(self->output.pos == 0); + + /* The code above should have either emitted a chunk and returned or consumed + the entire input buffer. So the state of the input buffer is not + relevant. */ + if (!self->finishedInput) { + if (self->reader) { + readResult = PyObject_CallMethod(self->reader, "read", "I", self->inSize); + if (!readResult) { + PyErr_SetString(ZstdError, "could not read() from source"); + return NULL; + } + + PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize); + } + else { + assert(self->buffer && self->buffer->buf); + + /* Only support contiguous C arrays. */ + assert(self->buffer->strides == NULL && self->buffer->suboffsets == NULL); + assert(self->buffer->itemsize == 1); + + readBuffer = (char*)self->buffer->buf + self->bufferOffset; + bufferRemaining = self->buffer->len - self->bufferOffset; + readSize = min(bufferRemaining, (Py_ssize_t)self->inSize); + self->bufferOffset += readSize; + } + + if (0 == readSize) { + Py_XDECREF(readResult); + self->finishedInput = 1; + } + else { + self->readResult = readResult; + } + } + + /* EOF */ + if (0 == readSize) { + zresult = ZSTD_endStream(self->cstream, &self->output); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error ending compression stream: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + + assert(self->output.pos); + + if (0 == zresult) { + self->finishedOutput = 1; + } + + chunk = PyBytes_FromStringAndSize(self->output.dst, self->output.pos); + self->output.pos = 0; + return chunk; + } + + /* New data from reader. Feed into compressor. */ + self->input.src = readBuffer; + self->input.size = readSize; + self->input.pos = 0; + + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_compressStream(self->cstream, &self->output, &self->input); + Py_END_ALLOW_THREADS + + /* The input buffer currently points to memory managed by Python + (readBuffer). This object was allocated by this function. If it wasn't + fully consumed, we need to release it in a subsequent function call. 
+ If it is fully consumed, do that now. + */ + if (self->input.pos == self->input.size) { + self->input.src = NULL; + self->input.pos = 0; + self->input.size = 0; + Py_XDECREF(self->readResult); + self->readResult = NULL; + } + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult)); + return NULL; + } + + assert(self->input.pos <= self->input.size); + + /* If we didn't write anything, start the process over. */ + if (0 == self->output.pos) { + goto feedcompressor; + } + + chunk = PyBytes_FromStringAndSize(self->output.dst, self->output.pos); + self->output.pos = 0; + return chunk; +} + +PyTypeObject ZstdCompressorIteratorType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdCompressorIterator", /* tp_name */ + sizeof(ZstdCompressorIterator), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)ZstdCompressorIterator_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + ZstdCompressorIterator__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + ZstdCompressorIterator_iter, /* tp_iter */ + (iternextfunc)ZstdCompressorIterator_iternext, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +void compressoriterator_module_init(PyObject* mod) { + Py_TYPE(&ZstdCompressorIteratorType) = &PyType_Type; + if (PyType_Ready(&ZstdCompressorIteratorType) < 0) { + return; + } +} diff --git 
a/contrib/python-zstandard/c-ext/constants.c b/contrib/python-zstandard/c-ext/constants.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/constants.c @@ -0,0 +1,84 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +static char frame_header[] = { + '\x28', + '\xb5', + '\x2f', + '\xfd', +}; + +void constants_module_init(PyObject* mod) { + PyObject* version; + PyObject* zstdVersion; + PyObject* frameHeader; + +#if PY_MAJOR_VERSION >= 3 + version = PyUnicode_FromString(PYTHON_ZSTANDARD_VERSION); +#else + version = PyString_FromString(PYTHON_ZSTANDARD_VERSION); +#endif + Py_INCREF(version); + PyModule_AddObject(mod, "__version__", version); + + ZstdError = PyErr_NewException("zstd.ZstdError", NULL, NULL); + PyModule_AddObject(mod, "ZstdError", ZstdError); + + /* For now, the version is a simple tuple instead of a dedicated type. 
*/ + zstdVersion = PyTuple_New(3); + PyTuple_SetItem(zstdVersion, 0, PyLong_FromLong(ZSTD_VERSION_MAJOR)); + PyTuple_SetItem(zstdVersion, 1, PyLong_FromLong(ZSTD_VERSION_MINOR)); + PyTuple_SetItem(zstdVersion, 2, PyLong_FromLong(ZSTD_VERSION_RELEASE)); + Py_IncRef(zstdVersion); + PyModule_AddObject(mod, "ZSTD_VERSION", zstdVersion); + + frameHeader = PyBytes_FromStringAndSize(frame_header, sizeof(frame_header)); + if (frameHeader) { + PyModule_AddObject(mod, "FRAME_HEADER", frameHeader); + } + else { + PyErr_Format(PyExc_ValueError, "could not create frame header object"); + } + + PyModule_AddIntConstant(mod, "MAX_COMPRESSION_LEVEL", ZSTD_maxCLevel()); + PyModule_AddIntConstant(mod, "COMPRESSION_RECOMMENDED_INPUT_SIZE", + (long)ZSTD_CStreamInSize()); + PyModule_AddIntConstant(mod, "COMPRESSION_RECOMMENDED_OUTPUT_SIZE", + (long)ZSTD_CStreamOutSize()); + PyModule_AddIntConstant(mod, "DECOMPRESSION_RECOMMENDED_INPUT_SIZE", + (long)ZSTD_DStreamInSize()); + PyModule_AddIntConstant(mod, "DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE", + (long)ZSTD_DStreamOutSize()); + + PyModule_AddIntConstant(mod, "MAGIC_NUMBER", ZSTD_MAGICNUMBER); + PyModule_AddIntConstant(mod, "WINDOWLOG_MIN", ZSTD_WINDOWLOG_MIN); + PyModule_AddIntConstant(mod, "WINDOWLOG_MAX", ZSTD_WINDOWLOG_MAX); + PyModule_AddIntConstant(mod, "CHAINLOG_MIN", ZSTD_CHAINLOG_MIN); + PyModule_AddIntConstant(mod, "CHAINLOG_MAX", ZSTD_CHAINLOG_MAX); + PyModule_AddIntConstant(mod, "HASHLOG_MIN", ZSTD_HASHLOG_MIN); + PyModule_AddIntConstant(mod, "HASHLOG_MAX", ZSTD_HASHLOG_MAX); + PyModule_AddIntConstant(mod, "HASHLOG3_MAX", ZSTD_HASHLOG3_MAX); + PyModule_AddIntConstant(mod, "SEARCHLOG_MIN", ZSTD_SEARCHLOG_MIN); + PyModule_AddIntConstant(mod, "SEARCHLOG_MAX", ZSTD_SEARCHLOG_MAX); + PyModule_AddIntConstant(mod, "SEARCHLENGTH_MIN", ZSTD_SEARCHLENGTH_MIN); + PyModule_AddIntConstant(mod, "SEARCHLENGTH_MAX", ZSTD_SEARCHLENGTH_MAX); + PyModule_AddIntConstant(mod, "TARGETLENGTH_MIN", ZSTD_TARGETLENGTH_MIN); + PyModule_AddIntConstant(mod, 
"TARGETLENGTH_MAX", ZSTD_TARGETLENGTH_MAX); + + PyModule_AddIntConstant(mod, "STRATEGY_FAST", ZSTD_fast); + PyModule_AddIntConstant(mod, "STRATEGY_DFAST", ZSTD_dfast); + PyModule_AddIntConstant(mod, "STRATEGY_GREEDY", ZSTD_greedy); + PyModule_AddIntConstant(mod, "STRATEGY_LAZY", ZSTD_lazy); + PyModule_AddIntConstant(mod, "STRATEGY_LAZY2", ZSTD_lazy2); + PyModule_AddIntConstant(mod, "STRATEGY_BTLAZY2", ZSTD_btlazy2); + PyModule_AddIntConstant(mod, "STRATEGY_BTOPT", ZSTD_btopt); +} diff --git a/contrib/python-zstandard/c-ext/decompressionwriter.c b/contrib/python-zstandard/c-ext/decompressionwriter.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/decompressionwriter.c @@ -0,0 +1,187 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +PyDoc_STRVAR(ZstdDecompressionWriter__doc, +"A context manager used for writing decompressed output.\n" +); + +static void ZstdDecompressionWriter_dealloc(ZstdDecompressionWriter* self) { + Py_XDECREF(self->decompressor); + Py_XDECREF(self->writer); + + if (self->dstream) { + ZSTD_freeDStream(self->dstream); + self->dstream = NULL; + } + + PyObject_Del(self); +} + +static PyObject* ZstdDecompressionWriter_enter(ZstdDecompressionWriter* self) { + if (self->entered) { + PyErr_SetString(ZstdError, "cannot __enter__ multiple times"); + return NULL; + } + + self->dstream = DStream_from_ZstdDecompressor(self->decompressor); + if (!self->dstream) { + return NULL; + } + + self->entered = 1; + + Py_INCREF(self); + return (PyObject*)self; +} + +static PyObject* ZstdDecompressionWriter_exit(ZstdDecompressionWriter* self, PyObject* args) { + self->entered = 0; + + if (self->dstream) { + ZSTD_freeDStream(self->dstream); + self->dstream = NULL; + } + + Py_RETURN_FALSE; +} + +static PyObject* 
ZstdDecompressionWriter_memory_size(ZstdDecompressionWriter* self) { + if (!self->dstream) { + PyErr_SetString(ZstdError, "cannot determine size of inactive decompressor; " + "call when context manager is active"); + return NULL; + } + + return PyLong_FromSize_t(ZSTD_sizeof_DStream(self->dstream)); +} + +static PyObject* ZstdDecompressionWriter_write(ZstdDecompressionWriter* self, PyObject* args) { + const char* source; + Py_ssize_t sourceSize; + size_t zresult = 0; + ZSTD_inBuffer input; + ZSTD_outBuffer output; + PyObject* res; + +#if PY_MAJOR_VERSION >= 3 + if (!PyArg_ParseTuple(args, "y#", &source, &sourceSize)) { +#else + if (!PyArg_ParseTuple(args, "s#", &source, &sourceSize)) { +#endif + return NULL; + } + + if (!self->entered) { + PyErr_SetString(ZstdError, "write must be called from an active context manager"); + return NULL; + } + + output.dst = malloc(self->outSize); + if (!output.dst) { + return PyErr_NoMemory(); + } + output.size = self->outSize; + output.pos = 0; + + input.src = source; + input.size = sourceSize; + input.pos = 0; + + while ((ssize_t)input.pos < sourceSize) { + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_decompressStream(self->dstream, &output, &input); + Py_END_ALLOW_THREADS + + if (ZSTD_isError(zresult)) { + free(output.dst); + PyErr_Format(ZstdError, "zstd decompress error: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + + if (output.pos) { +#if PY_MAJOR_VERSION >= 3 + res = PyObject_CallMethod(self->writer, "write", "y#", +#else + res = PyObject_CallMethod(self->writer, "write", "s#", +#endif + output.dst, output.pos); + Py_XDECREF(res); + output.pos = 0; + } + } + + free(output.dst); + + /* TODO return bytes written */ + Py_RETURN_NONE; + } + +static PyMethodDef ZstdDecompressionWriter_methods[] = { + { "__enter__", (PyCFunction)ZstdDecompressionWriter_enter, METH_NOARGS, + PyDoc_STR("Enter a decompression context.") }, + { "__exit__", (PyCFunction)ZstdDecompressionWriter_exit, METH_VARARGS, + PyDoc_STR("Exit a decompression 
context.") }, + { "memory_size", (PyCFunction)ZstdDecompressionWriter_memory_size, METH_NOARGS, + PyDoc_STR("Obtain the memory size in bytes of the underlying decompressor.") }, + { "write", (PyCFunction)ZstdDecompressionWriter_write, METH_VARARGS, + PyDoc_STR("Decompress data") }, + { NULL, NULL } +}; + +PyTypeObject ZstdDecompressionWriterType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdDecompressionWriter", /* tp_name */ + sizeof(ZstdDecompressionWriter),/* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)ZstdDecompressionWriter_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + ZstdDecompressionWriter__doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + ZstdDecompressionWriter_methods,/* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +void decompressionwriter_module_init(PyObject* mod) { + Py_TYPE(&ZstdDecompressionWriterType) = &PyType_Type; + if (PyType_Ready(&ZstdDecompressionWriterType) < 0) { + return; + } +} diff --git a/contrib/python-zstandard/c-ext/decompressobj.c b/contrib/python-zstandard/c-ext/decompressobj.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/decompressobj.c @@ -0,0 +1,170 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. 
See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +PyDoc_STRVAR(DecompressionObj__doc__, +"Perform decompression using a standard library compatible API.\n" +); + +static void DecompressionObj_dealloc(ZstdDecompressionObj* self) { + if (self->dstream) { + ZSTD_freeDStream(self->dstream); + self->dstream = NULL; + } + + Py_XDECREF(self->decompressor); + + PyObject_Del(self); +} + +static PyObject* DecompressionObj_decompress(ZstdDecompressionObj* self, PyObject* args) { + const char* source; + Py_ssize_t sourceSize; + size_t zresult; + ZSTD_inBuffer input; + ZSTD_outBuffer output; + size_t outSize = ZSTD_DStreamOutSize(); + PyObject* result = NULL; + Py_ssize_t resultSize = 0; + + if (self->finished) { + PyErr_SetString(ZstdError, "cannot use a decompressobj multiple times"); + return NULL; + } + +#if PY_MAJOR_VERSION >= 3 + if (!PyArg_ParseTuple(args, "y#", +#else + if (!PyArg_ParseTuple(args, "s#", +#endif + &source, &sourceSize)) { + return NULL; + } + + input.src = source; + input.size = sourceSize; + input.pos = 0; + + output.dst = PyMem_Malloc(outSize); + if (!output.dst) { + PyErr_NoMemory(); + return NULL; + } + output.size = outSize; + output.pos = 0; + + /* Read input until exhausted. 
*/ + while (input.pos < input.size) { + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_decompressStream(self->dstream, &output, &input); + Py_END_ALLOW_THREADS + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "zstd decompressor error: %s", + ZSTD_getErrorName(zresult)); + result = NULL; + goto finally; + } + + if (0 == zresult) { + self->finished = 1; + } + + if (output.pos) { + if (result) { + resultSize = PyBytes_GET_SIZE(result); + if (-1 == _PyBytes_Resize(&result, resultSize + output.pos)) { + goto except; + } + + memcpy(PyBytes_AS_STRING(result) + resultSize, + output.dst, output.pos); + } + else { + result = PyBytes_FromStringAndSize(output.dst, output.pos); + if (!result) { + goto except; + } + } + + output.pos = 0; + } + } + + if (!result) { + result = PyBytes_FromString(""); + } + + goto finally; + +except: + Py_DecRef(result); + result = NULL; + +finally: + PyMem_Free(output.dst); + + return result; +} + +static PyMethodDef DecompressionObj_methods[] = { + { "decompress", (PyCFunction)DecompressionObj_decompress, + METH_VARARGS, PyDoc_STR("decompress data") }, + { NULL, NULL } +}; + +PyTypeObject ZstdDecompressionObjType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdDecompressionObj", /* tp_name */ + sizeof(ZstdDecompressionObj), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)DecompressionObj_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + DecompressionObj__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + DecompressionObj_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ 
+ 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +void decompressobj_module_init(PyObject* module) { + Py_TYPE(&ZstdDecompressionObjType) = &PyType_Type; + if (PyType_Ready(&ZstdDecompressionObjType) < 0) { + return; + } +} diff --git a/contrib/python-zstandard/c-ext/decompressor.c b/contrib/python-zstandard/c-ext/decompressor.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/decompressor.c @@ -0,0 +1,669 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +ZSTD_DStream* DStream_from_ZstdDecompressor(ZstdDecompressor* decompressor) { + ZSTD_DStream* dstream; + void* dictData = NULL; + size_t dictSize = 0; + size_t zresult; + + dstream = ZSTD_createDStream(); + if (!dstream) { + PyErr_SetString(ZstdError, "could not create DStream"); + return NULL; + } + + if (decompressor->dict) { + dictData = decompressor->dict->dictData; + dictSize = decompressor->dict->dictSize; + } + + if (dictData) { + zresult = ZSTD_initDStream_usingDict(dstream, dictData, dictSize); + } + else { + zresult = ZSTD_initDStream(dstream); + } + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "could not initialize DStream: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + + return dstream; +} + +PyDoc_STRVAR(Decompressor__doc__, +"ZstdDecompressor(dict_data=None)\n" +"\n" +"Create an object used to perform Zstandard decompression.\n" +"\n" +"An instance can perform multiple decompression operations." 
+);
+
+static int Decompressor_init(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) {
+	static char* kwlist[] = {
+		"dict_data",
+		NULL
+	};
+
+	ZstdCompressionDict* dict = NULL;
+
+	self->refdctx = NULL;
+	self->dict = NULL;
+	self->ddict = NULL;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!", kwlist,
+		&ZstdCompressionDictType, &dict)) {
+		return -1;
+	}
+
+	/* Instead of creating a ZSTD_DCtx for every decompression operation,
+	   we create an instance at object creation time and recycle it via
+	   ZSTD_copyDCtx() on each use. This means each use is a malloc+memcpy
+	   instead of a malloc+init. */
+	/* TODO lazily initialize the reference ZSTD_DCtx on first use since
+	   not all instances of ZstdDecompressor will use a ZSTD_DCtx. */
+	self->refdctx = ZSTD_createDCtx();
+	if (!self->refdctx) {
+		PyErr_NoMemory();
+		goto except;
+	}
+
+	if (dict) {
+		self->dict = dict;
+		Py_INCREF(dict);
+	}
+
+	return 0;
+
+except:
+	if (self->refdctx) {
+		ZSTD_freeDCtx(self->refdctx);
+		self->refdctx = NULL;
+	}
+
+	return -1;
+}
+
+static void Decompressor_dealloc(ZstdDecompressor* self) {
+	if (self->refdctx) {
+		ZSTD_freeDCtx(self->refdctx);
+	}
+
+	Py_XDECREF(self->dict);
+
+	if (self->ddict) {
+		ZSTD_freeDDict(self->ddict);
+		self->ddict = NULL;
+	}
+
+	PyObject_Del(self);
+}
+
+PyDoc_STRVAR(Decompressor_copy_stream__doc__,
+	"copy_stream(ifh, ofh[, read_size=default, write_size=default]) -- decompress data between streams\n"
+	"\n"
+	"Compressed data will be read from ``ifh``, decompressed, and written to\n"
+	"``ofh``. ``ifh`` must have a ``read(size)`` method. ``ofh`` must have a\n"
+	"``write(data)`` method.\n"
+	"\n"
+	"The optional ``read_size`` and ``write_size`` arguments control the chunk\n"
+	"size of data that is ``read()`` and ``write()`` between streams.
They default\n" + "to the default input and output sizes of zstd decompressor streams.\n" +); + +static PyObject* Decompressor_copy_stream(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "ifh", + "ofh", + "read_size", + "write_size", + NULL + }; + + PyObject* source; + PyObject* dest; + size_t inSize = ZSTD_DStreamInSize(); + size_t outSize = ZSTD_DStreamOutSize(); + ZSTD_DStream* dstream; + ZSTD_inBuffer input; + ZSTD_outBuffer output; + Py_ssize_t totalRead = 0; + Py_ssize_t totalWrite = 0; + char* readBuffer; + Py_ssize_t readSize; + PyObject* readResult; + PyObject* res = NULL; + size_t zresult = 0; + PyObject* writeResult; + PyObject* totalReadPy; + PyObject* totalWritePy; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|kk", kwlist, &source, + &dest, &inSize, &outSize)) { + return NULL; + } + + if (!PyObject_HasAttrString(source, "read")) { + PyErr_SetString(PyExc_ValueError, "first argument must have a read() method"); + return NULL; + } + + if (!PyObject_HasAttrString(dest, "write")) { + PyErr_SetString(PyExc_ValueError, "second argument must have a write() method"); + return NULL; + } + + dstream = DStream_from_ZstdDecompressor(self); + if (!dstream) { + res = NULL; + goto finally; + } + + output.dst = PyMem_Malloc(outSize); + if (!output.dst) { + PyErr_NoMemory(); + res = NULL; + goto finally; + } + output.size = outSize; + output.pos = 0; + + /* Read source stream until EOF */ + while (1) { + readResult = PyObject_CallMethod(source, "read", "n", inSize); + if (!readResult) { + PyErr_SetString(ZstdError, "could not read() from source"); + goto finally; + } + + PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize); + + /* If no data was read, we're at EOF. 
*/ + if (0 == readSize) { + break; + } + + totalRead += readSize; + + /* Send data to decompressor */ + input.src = readBuffer; + input.size = readSize; + input.pos = 0; + + while (input.pos < input.size) { + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_decompressStream(dstream, &output, &input); + Py_END_ALLOW_THREADS + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "zstd decompressor error: %s", + ZSTD_getErrorName(zresult)); + res = NULL; + goto finally; + } + + if (output.pos) { +#if PY_MAJOR_VERSION >= 3 + writeResult = PyObject_CallMethod(dest, "write", "y#", +#else + writeResult = PyObject_CallMethod(dest, "write", "s#", +#endif + output.dst, output.pos); + + Py_XDECREF(writeResult); + totalWrite += output.pos; + output.pos = 0; + } + } + } + + /* Source stream is exhausted. Finish up. */ + + ZSTD_freeDStream(dstream); + dstream = NULL; + + totalReadPy = PyLong_FromSsize_t(totalRead); + totalWritePy = PyLong_FromSsize_t(totalWrite); + res = PyTuple_Pack(2, totalReadPy, totalWritePy); + Py_DecRef(totalReadPy); + Py_DecRef(totalWritePy); + + finally: + if (output.dst) { + PyMem_Free(output.dst); + } + + if (dstream) { + ZSTD_freeDStream(dstream); + } + + return res; +} + +PyDoc_STRVAR(Decompressor_decompress__doc__, +"decompress(data[, max_output_size=None]) -- Decompress data in its entirety\n" +"\n" +"This method will decompress the entirety of the argument and return the\n" +"result.\n" +"\n" +"The input bytes are expected to contain a full Zstandard frame (something\n" +"compressed with ``ZstdCompressor.compress()`` or similar). If the input does\n" +"not contain a full frame, an exception will be raised.\n" +"\n" +"If the frame header of the compressed data does not contain the content size\n" +"``max_output_size`` must be specified or ``ZstdError`` will be raised. An\n" +"allocation of size ``max_output_size`` will be performed and an attempt will\n" +"be made to perform decompression into that buffer. 
If the buffer is too\n" +"small or cannot be allocated, ``ZstdError`` will be raised. The buffer will\n" +"be resized if it is too large.\n" +"\n" +"Uncompressed data could be much larger than compressed data. As a result,\n" +"calling this function could result in a very large memory allocation being\n" +"performed to hold the uncompressed data. Therefore it is **highly**\n" +"recommended to use a streaming decompression method instead of this one.\n" +); + +PyObject* Decompressor_decompress(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "data", + "max_output_size", + NULL + }; + + const char* source; + Py_ssize_t sourceSize; + Py_ssize_t maxOutputSize = 0; + unsigned long long decompressedSize; + size_t destCapacity; + PyObject* result = NULL; + ZSTD_DCtx* dctx = NULL; + void* dictData = NULL; + size_t dictSize = 0; + size_t zresult; + +#if PY_MAJOR_VERSION >= 3 + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|n", kwlist, +#else + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|n", kwlist, +#endif + &source, &sourceSize, &maxOutputSize)) { + return NULL; + } + + dctx = PyMem_Malloc(ZSTD_sizeof_DCtx(self->refdctx)); + if (!dctx) { + PyErr_NoMemory(); + return NULL; + } + + ZSTD_copyDCtx(dctx, self->refdctx); + + if (self->dict) { + dictData = self->dict->dictData; + dictSize = self->dict->dictSize; + } + + if (dictData && !self->ddict) { + Py_BEGIN_ALLOW_THREADS + self->ddict = ZSTD_createDDict(dictData, dictSize); + Py_END_ALLOW_THREADS + + if (!self->ddict) { + PyErr_SetString(ZstdError, "could not create decompression dict"); + goto except; + } + } + + decompressedSize = ZSTD_getDecompressedSize(source, sourceSize); + /* 0 returned if content size not in the zstd frame header */ + if (0 == decompressedSize) { + if (0 == maxOutputSize) { + PyErr_SetString(ZstdError, "input data invalid or missing content size " + "in frame header"); + goto except; + } + else { + result = PyBytes_FromStringAndSize(NULL, 
maxOutputSize); + destCapacity = maxOutputSize; + } + } + else { + result = PyBytes_FromStringAndSize(NULL, decompressedSize); + destCapacity = decompressedSize; + } + + if (!result) { + goto except; + } + + Py_BEGIN_ALLOW_THREADS + if (self->ddict) { + zresult = ZSTD_decompress_usingDDict(dctx, PyBytes_AsString(result), destCapacity, + source, sourceSize, self->ddict); + } + else { + zresult = ZSTD_decompressDCtx(dctx, PyBytes_AsString(result), destCapacity, source, sourceSize); + } + Py_END_ALLOW_THREADS + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "decompression error: %s", ZSTD_getErrorName(zresult)); + goto except; + } + else if (decompressedSize && zresult != decompressedSize) { + PyErr_Format(ZstdError, "decompression error: decompressed %zu bytes; expected %llu", + zresult, decompressedSize); + goto except; + } + else if (zresult < destCapacity) { + if (_PyBytes_Resize(&result, zresult)) { + goto except; + } + } + + goto finally; + +except: + Py_DecRef(result); + result = NULL; + +finally: + if (dctx) { + PyMem_FREE(dctx); + } + + return result; +} + +PyDoc_STRVAR(Decompressor_decompressobj__doc__, +"decompressobj()\n" +"\n" +"Incrementally feed data into a decompressor.\n" +"\n" +"The returned object exposes a ``decompress(data)`` method. 
This makes it\n" +"compatible with ``zlib.decompressobj`` and ``bz2.BZ2Decompressor`` so that\n" +"callers can swap in the zstd decompressor while using the same API.\n" +); + +static ZstdDecompressionObj* Decompressor_decompressobj(ZstdDecompressor* self) { + ZstdDecompressionObj* result = PyObject_New(ZstdDecompressionObj, &ZstdDecompressionObjType); + if (!result) { + return NULL; + } + + result->dstream = DStream_from_ZstdDecompressor(self); + if (!result->dstream) { + Py_DecRef((PyObject*)result); + return NULL; + } + + result->decompressor = self; + Py_INCREF(result->decompressor); + + result->finished = 0; + + return result; +} + +PyDoc_STRVAR(Decompressor_read_from__doc__, +"read_from(reader[, read_size=default, write_size=default, skip_bytes=0])\n" +"Read compressed data and return an iterator\n" +"\n" +"Returns an iterator of decompressed data chunks produced from reading from\n" +"the ``reader``.\n" +"\n" +"Compressed data will be obtained from ``reader`` by calling the\n" +"``read(size)`` method of it. The source data will be streamed into a\n" +"decompressor. As decompressed data is available, it will be exposed to the\n" +"returned iterator.\n" +"\n" +"Data is ``read()`` in chunks of size ``read_size`` and exposed to the\n" +"iterator in chunks of size ``write_size``. 
The default values are the input\n" +"and output sizes for a zstd streaming decompressor.\n" +"\n" +"There is also support for skipping the first ``skip_bytes`` of data from\n" +"the source.\n" +); + +static ZstdDecompressorIterator* Decompressor_read_from(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "reader", + "read_size", + "write_size", + "skip_bytes", + NULL + }; + + PyObject* reader; + size_t inSize = ZSTD_DStreamInSize(); + size_t outSize = ZSTD_DStreamOutSize(); + ZstdDecompressorIterator* result; + size_t skipBytes = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk", kwlist, &reader, + &inSize, &outSize, &skipBytes)) { + return NULL; + } + + if (skipBytes >= inSize) { + PyErr_SetString(PyExc_ValueError, + "skip_bytes must be smaller than read_size"); + return NULL; + } + + result = PyObject_New(ZstdDecompressorIterator, &ZstdDecompressorIteratorType); + if (!result) { + return NULL; + } + + result->decompressor = NULL; + result->reader = NULL; + result->buffer = NULL; + result->dstream = NULL; + result->input.src = NULL; + result->output.dst = NULL; + + if (PyObject_HasAttrString(reader, "read")) { + result->reader = reader; + Py_INCREF(result->reader); + } + else if (1 == PyObject_CheckBuffer(reader)) { + /* Object claims it is a buffer. Try to get a handle to it. 
*/
+		result->buffer = PyMem_Malloc(sizeof(Py_buffer));
+		if (!result->buffer) {
+			PyErr_NoMemory(); goto except;
+		}
+
+		memset(result->buffer, 0, sizeof(Py_buffer));
+
+		if (0 != PyObject_GetBuffer(reader, result->buffer, PyBUF_CONTIG_RO)) {
+			goto except;
+		}
+
+		result->bufferOffset = 0;
+	}
+	else {
+		PyErr_SetString(PyExc_ValueError,
+			"must pass an object with a read() method or conforms to buffer protocol");
+		goto except;
+	}
+
+	result->decompressor = self;
+	Py_INCREF(result->decompressor);
+
+	result->inSize = inSize;
+	result->outSize = outSize;
+	result->skipBytes = skipBytes;
+
+	result->dstream = DStream_from_ZstdDecompressor(self);
+	if (!result->dstream) {
+		goto except;
+	}
+
+	result->input.src = PyMem_Malloc(inSize);
+	if (!result->input.src) {
+		PyErr_NoMemory();
+		goto except;
+	}
+	result->input.size = 0;
+	result->input.pos = 0;
+
+	result->output.dst = NULL;
+	result->output.size = 0;
+	result->output.pos = 0;
+
+	result->readCount = 0;
+	result->finishedInput = 0;
+	result->finishedOutput = 0;
+
+	goto finally;
+
+except:
+	if (result->reader) {
+		Py_DECREF(result->reader);
+		result->reader = NULL;
+	}
+
+	if (result->buffer) {
+		PyBuffer_Release(result->buffer);
+		PyMem_Free(result->buffer);
+		result->buffer = NULL;
+	}
+
+	Py_DECREF(result);
+	result = NULL;
+
+finally:
+
+	return result;
+}
+
+PyDoc_STRVAR(Decompressor_write_to__doc__,
+"Create a context manager to write decompressed data to an object.\n"
+"\n"
+"The passed object must have a ``write()`` method.\n"
+"\n"
+"The caller feeds input data to the object by calling ``write(data)``.\n"
+"Decompressed data is written to the argument given as it is decompressed.\n"
+"\n"
+"An optional ``write_size`` argument defines the size of chunks to\n"
+"``write()`` to the writer.
It defaults to the default output size for a zstd\n" +"streaming decompressor.\n" +); + +static ZstdDecompressionWriter* Decompressor_write_to(ZstdDecompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "writer", + "write_size", + NULL + }; + + PyObject* writer; + size_t outSize = ZSTD_DStreamOutSize(); + ZstdDecompressionWriter* result; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k", kwlist, &writer, &outSize)) { + return NULL; + } + + if (!PyObject_HasAttrString(writer, "write")) { + PyErr_SetString(PyExc_ValueError, "must pass an object with a write() method"); + return NULL; + } + + result = PyObject_New(ZstdDecompressionWriter, &ZstdDecompressionWriterType); + if (!result) { + return NULL; + } + + result->decompressor = self; + Py_INCREF(result->decompressor); + + result->writer = writer; + Py_INCREF(result->writer); + + result->outSize = outSize; + + result->entered = 0; + result->dstream = NULL; + + return result; +} + +static PyMethodDef Decompressor_methods[] = { + { "copy_stream", (PyCFunction)Decompressor_copy_stream, METH_VARARGS | METH_KEYWORDS, + Decompressor_copy_stream__doc__ }, + { "decompress", (PyCFunction)Decompressor_decompress, METH_VARARGS | METH_KEYWORDS, + Decompressor_decompress__doc__ }, + { "decompressobj", (PyCFunction)Decompressor_decompressobj, METH_NOARGS, + Decompressor_decompressobj__doc__ }, + { "read_from", (PyCFunction)Decompressor_read_from, METH_VARARGS | METH_KEYWORDS, + Decompressor_read_from__doc__ }, + { "write_to", (PyCFunction)Decompressor_write_to, METH_VARARGS | METH_KEYWORDS, + Decompressor_write_to__doc__ }, + { NULL, NULL } +}; + +PyTypeObject ZstdDecompressorType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdDecompressor", /* tp_name */ + sizeof(ZstdDecompressor), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)Decompressor_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* 
tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + Decompressor__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Decompressor_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)Decompressor_init, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +void decompressor_module_init(PyObject* mod) { + Py_TYPE(&ZstdDecompressorType) = &PyType_Type; + if (PyType_Ready(&ZstdDecompressorType) < 0) { + return; + } + + Py_INCREF((PyObject*)&ZstdDecompressorType); + PyModule_AddObject(mod, "ZstdDecompressor", + (PyObject*)&ZstdDecompressorType); +} diff --git a/contrib/python-zstandard/c-ext/decompressoriterator.c b/contrib/python-zstandard/c-ext/decompressoriterator.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/decompressoriterator.c @@ -0,0 +1,254 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +extern PyObject* ZstdError; + +PyDoc_STRVAR(ZstdDecompressorIterator__doc__, +"Represents an iterator of decompressed data.\n" +); + +static void ZstdDecompressorIterator_dealloc(ZstdDecompressorIterator* self) { + Py_XDECREF(self->decompressor); + Py_XDECREF(self->reader); + + if (self->buffer) { + PyBuffer_Release(self->buffer); + PyMem_FREE(self->buffer); + self->buffer = NULL; + } + + if (self->dstream) { + ZSTD_freeDStream(self->dstream); + self->dstream = NULL; + } + + if (self->input.src) { + PyMem_Free((void*)self->input.src); + self->input.src = NULL; + } + + PyObject_Del(self); +} + +static PyObject* ZstdDecompressorIterator_iter(PyObject* self) { + Py_INCREF(self); + return self; +} + +static DecompressorIteratorResult read_decompressor_iterator(ZstdDecompressorIterator* self) { + size_t zresult; + PyObject* chunk; + DecompressorIteratorResult result; + size_t oldInputPos = self->input.pos; + + result.chunk = NULL; + + chunk = PyBytes_FromStringAndSize(NULL, self->outSize); + if (!chunk) { + result.errored = 1; + return result; + } + + self->output.dst = PyBytes_AsString(chunk); + self->output.size = self->outSize; + self->output.pos = 0; + + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_decompressStream(self->dstream, &self->output, &self->input); + Py_END_ALLOW_THREADS + + /* We're done with the pointer. Nullify to prevent anyone from getting a + handle on a Python object. */ + self->output.dst = NULL; + + if (ZSTD_isError(zresult)) { + Py_DECREF(chunk); + PyErr_Format(ZstdError, "zstd decompress error: %s", + ZSTD_getErrorName(zresult)); + result.errored = 1; + return result; + } + + self->readCount += self->input.pos - oldInputPos; + + /* Frame is fully decoded. Input exhausted and output sitting in buffer. */ + if (0 == zresult) { + self->finishedInput = 1; + self->finishedOutput = 1; + } + + /* If it produced output data, return it. 
*/ + if (self->output.pos) { + if (self->output.pos < self->outSize) { + if (_PyBytes_Resize(&chunk, self->output.pos)) { + result.errored = 1; + return result; + } + } + } + else { + Py_DECREF(chunk); + chunk = NULL; + } + + result.errored = 0; + result.chunk = chunk; + + return result; +} + +static PyObject* ZstdDecompressorIterator_iternext(ZstdDecompressorIterator* self) { + PyObject* readResult = NULL; + char* readBuffer; + Py_ssize_t readSize; + Py_ssize_t bufferRemaining; + DecompressorIteratorResult result; + + if (self->finishedOutput) { + PyErr_SetString(PyExc_StopIteration, "output flushed"); + return NULL; + } + + /* If we have data left in the input, consume it. */ + if (self->input.pos < self->input.size) { + result = read_decompressor_iterator(self); + if (result.chunk || result.errored) { + return result.chunk; + } + + /* Else fall through to get more data from input. */ + } + +read_from_source: + + if (!self->finishedInput) { + if (self->reader) { + readResult = PyObject_CallMethod(self->reader, "read", "I", self->inSize); + if (!readResult) { + return NULL; + } + + PyBytes_AsStringAndSize(readResult, &readBuffer, &readSize); + } + else { + assert(self->buffer && self->buffer->buf); + + /* Only support contiguous C arrays for now */ + assert(self->buffer->strides == NULL && self->buffer->suboffsets == NULL); + assert(self->buffer->itemsize == 1); + + /* TODO avoid memcpy() below */ + readBuffer = (char *)self->buffer->buf + self->bufferOffset; + bufferRemaining = self->buffer->len - self->bufferOffset; + readSize = min(bufferRemaining, (Py_ssize_t)self->inSize); + self->bufferOffset += readSize; + } + + if (readSize) { + if (!self->readCount && self->skipBytes) { + assert(self->skipBytes < self->inSize); + if ((Py_ssize_t)self->skipBytes >= readSize) { + PyErr_SetString(PyExc_ValueError, + "skip_bytes larger than first input chunk; " + "this scenario is currently unsupported"); + Py_DecRef(readResult); + return NULL; + } + + readBuffer = readBuffer 
+ self->skipBytes; + readSize -= self->skipBytes; + } + + /* Copy input into previously allocated buffer because it can live longer + than a single function call and we don't want to keep a ref to a Python + object around. This could be changed... */ + memcpy((void*)self->input.src, readBuffer, readSize); + self->input.size = readSize; + self->input.pos = 0; + } + /* No bytes on first read must mean an empty input stream. */ + else if (!self->readCount) { + self->finishedInput = 1; + self->finishedOutput = 1; + Py_DecRef(readResult); + PyErr_SetString(PyExc_StopIteration, "empty input"); + return NULL; + } + else { + self->finishedInput = 1; + } + + /* We've copied the data managed by memory. Discard the Python object. */ + Py_DecRef(readResult); + } + + result = read_decompressor_iterator(self); + if (result.errored || result.chunk) { + return result.chunk; + } + + /* No new output data. Try again unless we know there is no more data. */ + if (!self->finishedInput) { + goto read_from_source; + } + + PyErr_SetString(PyExc_StopIteration, "input exhausted"); + return NULL; +} + +PyTypeObject ZstdDecompressorIteratorType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdDecompressorIterator", /* tp_name */ + sizeof(ZstdDecompressorIterator), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)ZstdDecompressorIterator_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + ZstdDecompressorIterator__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + ZstdDecompressorIterator_iter, /* tp_iter */ + (iternextfunc)ZstdDecompressorIterator_iternext, /* tp_iternext */ + 0, /* 
tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +void decompressoriterator_module_init(PyObject* mod) { + Py_TYPE(&ZstdDecompressorIteratorType) = &PyType_Type; + if (PyType_Ready(&ZstdDecompressorIteratorType) < 0) { + return; + } +} diff --git a/contrib/python-zstandard/c-ext/dictparams.c b/contrib/python-zstandard/c-ext/dictparams.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/dictparams.c @@ -0,0 +1,125 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +PyDoc_STRVAR(DictParameters__doc__, +"DictParameters: low-level control over dictionary generation"); + +static PyObject* DictParameters_new(PyTypeObject* subtype, PyObject* args, PyObject* kwargs) { + DictParametersObject* self; + unsigned selectivityLevel; + int compressionLevel; + unsigned notificationLevel; + unsigned dictID; + + if (!PyArg_ParseTuple(args, "IiII", &selectivityLevel, &compressionLevel, + ¬ificationLevel, &dictID)) { + return NULL; + } + + self = (DictParametersObject*)subtype->tp_alloc(subtype, 1); + if (!self) { + return NULL; + } + + self->selectivityLevel = selectivityLevel; + self->compressionLevel = compressionLevel; + self->notificationLevel = notificationLevel; + self->dictID = dictID; + + return (PyObject*)self; +} + +static void DictParameters_dealloc(PyObject* self) { + PyObject_Del(self); +} + +static Py_ssize_t DictParameters_length(PyObject* self) { + return 4; +}; + +static PyObject* DictParameters_item(PyObject* o, Py_ssize_t i) { + DictParametersObject* self = (DictParametersObject*)o; + + switch (i) { + case 0: + return PyLong_FromLong(self->selectivityLevel); + 
case 1: + return PyLong_FromLong(self->compressionLevel); + case 2: + return PyLong_FromLong(self->notificationLevel); + case 3: + return PyLong_FromLong(self->dictID); + default: + PyErr_SetString(PyExc_IndexError, "index out of range"); + return NULL; + } +} + +static PySequenceMethods DictParameters_sq = { + DictParameters_length, /* sq_length */ + 0, /* sq_concat */ + 0, /* sq_repeat */ + DictParameters_item, /* sq_item */ + 0, /* sq_ass_item */ + 0, /* sq_contains */ + 0, /* sq_inplace_concat */ + 0 /* sq_inplace_repeat */ +}; + +PyTypeObject DictParametersType = { + PyVarObject_HEAD_INIT(NULL, 0) + "DictParameters", /* tp_name */ + sizeof(DictParametersObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)DictParameters_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + &DictParameters_sq, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + DictParameters__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + DictParameters_new, /* tp_new */ +}; + +void dictparams_module_init(PyObject* mod) { + Py_TYPE(&DictParametersType) = &PyType_Type; + if (PyType_Ready(&DictParametersType) < 0) { + return; + } + + Py_IncRef((PyObject*)&DictParametersType); + PyModule_AddObject(mod, "DictParameters", (PyObject*)&DictParametersType); +} diff --git a/contrib/python-zstandard/c-ext/python-zstandard.h b/contrib/python-zstandard/c-ext/python-zstandard.h new file mode 100644 --- /dev/null 
+++ b/contrib/python-zstandard/c-ext/python-zstandard.h @@ -0,0 +1,172 @@ +/** +* Copyright (c) 2016-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#define PY_SSIZE_T_CLEAN +#include + +#define ZSTD_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#include "mem.h" +#include "zstd.h" +#include "zdict.h" + +#define PYTHON_ZSTANDARD_VERSION "0.5.0" + +typedef struct { + PyObject_HEAD + unsigned windowLog; + unsigned chainLog; + unsigned hashLog; + unsigned searchLog; + unsigned searchLength; + unsigned targetLength; + ZSTD_strategy strategy; +} CompressionParametersObject; + +extern PyTypeObject CompressionParametersType; + +typedef struct { + PyObject_HEAD + unsigned selectivityLevel; + int compressionLevel; + unsigned notificationLevel; + unsigned dictID; +} DictParametersObject; + +extern PyTypeObject DictParametersType; + +typedef struct { + PyObject_HEAD + + void* dictData; + size_t dictSize; +} ZstdCompressionDict; + +extern PyTypeObject ZstdCompressionDictType; + +typedef struct { + PyObject_HEAD + + int compressionLevel; + ZstdCompressionDict* dict; + ZSTD_CDict* cdict; + CompressionParametersObject* cparams; + ZSTD_frameParameters fparams; +} ZstdCompressor; + +extern PyTypeObject ZstdCompressorType; + +typedef struct { + PyObject_HEAD + + ZstdCompressor* compressor; + ZSTD_CStream* cstream; + ZSTD_outBuffer output; + int flushed; +} ZstdCompressionObj; + +extern PyTypeObject ZstdCompressionObjType; + +typedef struct { + PyObject_HEAD + + ZstdCompressor* compressor; + PyObject* writer; + Py_ssize_t sourceSize; + size_t outSize; + ZSTD_CStream* cstream; + int entered; +} ZstdCompressionWriter; + +extern PyTypeObject ZstdCompressionWriterType; + +typedef struct { + PyObject_HEAD + + ZstdCompressor* compressor; + PyObject* reader; + Py_buffer* buffer; + Py_ssize_t bufferOffset; + Py_ssize_t sourceSize; + size_t inSize; + 
size_t outSize; + + ZSTD_CStream* cstream; + ZSTD_inBuffer input; + ZSTD_outBuffer output; + int finishedOutput; + int finishedInput; + PyObject* readResult; +} ZstdCompressorIterator; + +extern PyTypeObject ZstdCompressorIteratorType; + +typedef struct { + PyObject_HEAD + + ZSTD_DCtx* refdctx; + + ZstdCompressionDict* dict; + ZSTD_DDict* ddict; +} ZstdDecompressor; + +extern PyTypeObject ZstdDecompressorType; + +typedef struct { + PyObject_HEAD + + ZstdDecompressor* decompressor; + ZSTD_DStream* dstream; + int finished; +} ZstdDecompressionObj; + +extern PyTypeObject ZstdDecompressionObjType; + +typedef struct { + PyObject_HEAD + + ZstdDecompressor* decompressor; + PyObject* writer; + size_t outSize; + ZSTD_DStream* dstream; + int entered; +} ZstdDecompressionWriter; + +extern PyTypeObject ZstdDecompressionWriterType; + +typedef struct { + PyObject_HEAD + + ZstdDecompressor* decompressor; + PyObject* reader; + Py_buffer* buffer; + Py_ssize_t bufferOffset; + size_t inSize; + size_t outSize; + size_t skipBytes; + ZSTD_DStream* dstream; + ZSTD_inBuffer input; + ZSTD_outBuffer output; + Py_ssize_t readCount; + int finishedInput; + int finishedOutput; +} ZstdDecompressorIterator; + +extern PyTypeObject ZstdDecompressorIteratorType; + +typedef struct { + int errored; + PyObject* chunk; +} DecompressorIteratorResult; + +void ztopy_compression_parameters(CompressionParametersObject* params, ZSTD_compressionParameters* zparams); +CompressionParametersObject* get_compression_parameters(PyObject* self, PyObject* args); +PyObject* estimate_compression_context_size(PyObject* self, PyObject* args); +ZSTD_CStream* CStream_from_ZstdCompressor(ZstdCompressor* compressor, Py_ssize_t sourceSize); +ZSTD_DStream* DStream_from_ZstdDecompressor(ZstdDecompressor* decompressor); +ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs); diff --git a/contrib/python-zstandard/make_cffi.py b/contrib/python-zstandard/make_cffi.py new file mode 100644 --- 
/dev/null +++ b/contrib/python-zstandard/make_cffi.py @@ -0,0 +1,110 @@ +# Copyright (c) 2016-present, Gregory Szorc +# All rights reserved. +# +# This software may be modified and distributed under the terms +# of the BSD license. See the LICENSE file for details. + +from __future__ import absolute_import + +import cffi +import os + + +HERE = os.path.abspath(os.path.dirname(__file__)) + +SOURCES = ['zstd/%s' % p for p in ( + 'common/entropy_common.c', + 'common/error_private.c', + 'common/fse_decompress.c', + 'common/xxhash.c', + 'common/zstd_common.c', + 'compress/fse_compress.c', + 'compress/huf_compress.c', + 'compress/zbuff_compress.c', + 'compress/zstd_compress.c', + 'decompress/huf_decompress.c', + 'decompress/zbuff_decompress.c', + 'decompress/zstd_decompress.c', + 'dictBuilder/divsufsort.c', + 'dictBuilder/zdict.c', +)] + +INCLUDE_DIRS = [os.path.join(HERE, d) for d in ( + 'zstd', + 'zstd/common', + 'zstd/compress', + 'zstd/decompress', + 'zstd/dictBuilder', +)] + +with open(os.path.join(HERE, 'zstd', 'zstd.h'), 'rb') as fh: + zstd_h = fh.read() + +ffi = cffi.FFI() +ffi.set_source('_zstd_cffi', ''' +/* needed for typedefs like U32 references in zstd.h */ +#include "mem.h" +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" +''', + sources=SOURCES, include_dirs=INCLUDE_DIRS) + +# Rather than define the API definitions from zstd.h inline, munge the +# source in a way that cdef() will accept. +lines = zstd_h.splitlines() +lines = [l.rstrip() for l in lines if l.strip()] + +# Strip preprocessor directives - they aren't important for our needs. +lines = [l for l in lines + if not l.startswith((b'#if', b'#else', b'#endif', b'#include'))] + +# Remove extern C block +lines = [l for l in lines if l not in (b'extern "C" {', b'}')] + +# The version #defines don't parse and aren't necessary. Strip them. 
+lines = [l for l in lines if not l.startswith(( + b'#define ZSTD_H_235446', + b'#define ZSTD_LIB_VERSION', + b'#define ZSTD_QUOTE', + b'#define ZSTD_EXPAND_AND_QUOTE', + b'#define ZSTD_VERSION_STRING', + b'#define ZSTD_VERSION_NUMBER'))] + +# The C parser also doesn't like some constant defines referencing +# other constants. +# TODO we pick the 64-bit constants here. We should assert somewhere +# we're compiling for 64-bit. +def fix_constants(l): + if l.startswith(b'#define ZSTD_WINDOWLOG_MAX '): + return b'#define ZSTD_WINDOWLOG_MAX 27' + elif l.startswith(b'#define ZSTD_CHAINLOG_MAX '): + return b'#define ZSTD_CHAINLOG_MAX 28' + elif l.startswith(b'#define ZSTD_HASHLOG_MAX '): + return b'#define ZSTD_HASHLOG_MAX 27' + elif l.startswith(b'#define ZSTD_CHAINLOG_MAX '): + return b'#define ZSTD_CHAINLOG_MAX 28' + elif l.startswith(b'#define ZSTD_CHAINLOG_MIN '): + return b'#define ZSTD_CHAINLOG_MIN 6' + elif l.startswith(b'#define ZSTD_SEARCHLOG_MAX '): + return b'#define ZSTD_SEARCHLOG_MAX 26' + elif l.startswith(b'#define ZSTD_BLOCKSIZE_ABSOLUTEMAX '): + return b'#define ZSTD_BLOCKSIZE_ABSOLUTEMAX 131072' + else: + return l +lines = map(fix_constants, lines) + +# ZSTDLIB_API isn't handled correctly. Strip it. +lines = [l for l in lines if not l.startswith(b'# define ZSTDLIB_API')] +def strip_api(l): + if l.startswith(b'ZSTDLIB_API '): + return l[len(b'ZSTDLIB_API '):] + else: + return l +lines = map(strip_api, lines) + +source = b'\n'.join(lines) +ffi.cdef(source.decode('latin1')) + + +if __name__ == '__main__': + ffi.compile() diff --git a/contrib/python-zstandard/setup.py b/contrib/python-zstandard/setup.py new file mode 100755 --- /dev/null +++ b/contrib/python-zstandard/setup.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# Copyright (c) 2016-present, Gregory Szorc +# All rights reserved. +# +# This software may be modified and distributed under the terms +# of the BSD license. See the LICENSE file for details. 
+ +from setuptools import setup + +try: + import cffi +except ImportError: + cffi = None + +import setup_zstd + +# Code for obtaining the Extension instance is in its own module to +# facilitate reuse in other projects. +extensions = [setup_zstd.get_c_extension()] + +if cffi: + import make_cffi + extensions.append(make_cffi.ffi.distutils_extension()) + +version = None + +with open('c-ext/python-zstandard.h', 'r') as fh: + for line in fh: + if not line.startswith('#define PYTHON_ZSTANDARD_VERSION'): + continue + + version = line.split()[2][1:-1] + break + +if not version: + raise Exception('could not resolve package version; ' + 'this should never happen') + +setup( + name='zstandard', + version=version, + description='Zstandard bindings for Python', + long_description=open('README.rst', 'r').read(), + url='https://github.com/indygreg/python-zstandard', + author='Gregory Szorc', + author_email='gregory.szorc@gmail.com', + license='BSD', + classifiers=[ + 'Development Status :: 4 - Beta', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: BSD License', + 'Programming Language :: C', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + ], + keywords='zstandard zstd compression', + ext_modules=extensions, + test_suite='tests', +) diff --git a/contrib/python-zstandard/setup_zstd.py b/contrib/python-zstandard/setup_zstd.py new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/setup_zstd.py @@ -0,0 +1,64 @@ +# Copyright (c) 2016-present, Gregory Szorc +# All rights reserved. +# +# This software may be modified and distributed under the terms +# of the BSD license. See the LICENSE file for details. 
+ +import os +from distutils.extension import Extension + + +zstd_sources = ['zstd/%s' % p for p in ( + 'common/entropy_common.c', + 'common/error_private.c', + 'common/fse_decompress.c', + 'common/xxhash.c', + 'common/zstd_common.c', + 'compress/fse_compress.c', + 'compress/huf_compress.c', + 'compress/zbuff_compress.c', + 'compress/zstd_compress.c', + 'decompress/huf_decompress.c', + 'decompress/zbuff_decompress.c', + 'decompress/zstd_decompress.c', + 'dictBuilder/divsufsort.c', + 'dictBuilder/zdict.c', +)] + + +zstd_includes = [ + 'c-ext', + 'zstd', + 'zstd/common', + 'zstd/compress', + 'zstd/decompress', + 'zstd/dictBuilder', +] + +ext_sources = [ + 'zstd.c', + 'c-ext/compressiondict.c', + 'c-ext/compressobj.c', + 'c-ext/compressor.c', + 'c-ext/compressoriterator.c', + 'c-ext/compressionparams.c', + 'c-ext/compressionwriter.c', + 'c-ext/constants.c', + 'c-ext/decompressobj.c', + 'c-ext/decompressor.c', + 'c-ext/decompressoriterator.c', + 'c-ext/decompressionwriter.c', + 'c-ext/dictparams.c', +] + + +def get_c_extension(name='zstd'): + """Obtain a distutils.extension.Extension for the C extension.""" + root = os.path.abspath(os.path.dirname(__file__)) + + sources = [os.path.join(root, p) for p in zstd_sources + ext_sources] + include_dirs = [os.path.join(root, d) for d in zstd_includes] + + # TODO compile with optimizations. 
+ return Extension(name, sources, + include_dirs=include_dirs) diff --git a/contrib/python-zstandard/tests/__init__.py b/contrib/python-zstandard/tests/__init__.py new file mode 100644 diff --git a/contrib/python-zstandard/tests/common.py b/contrib/python-zstandard/tests/common.py new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/tests/common.py @@ -0,0 +1,15 @@ +import io + +class OpCountingBytesIO(io.BytesIO): + def __init__(self, *args, **kwargs): + self._read_count = 0 + self._write_count = 0 + return super(OpCountingBytesIO, self).__init__(*args, **kwargs) + + def read(self, *args): + self._read_count += 1 + return super(OpCountingBytesIO, self).read(*args) + + def write(self, data): + self._write_count += 1 + return super(OpCountingBytesIO, self).write(data) diff --git a/contrib/python-zstandard/tests/test_cffi.py b/contrib/python-zstandard/tests/test_cffi.py new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/tests/test_cffi.py @@ -0,0 +1,35 @@ +import io + +try: + import unittest2 as unittest +except ImportError: + import unittest + +import zstd + +try: + import zstd_cffi +except ImportError: + raise unittest.SkipTest('cffi version of zstd not available') + + +class TestCFFIWriteToToCDecompressor(unittest.TestCase): + def test_simple(self): + orig = io.BytesIO() + orig.write(b'foo') + orig.write(b'bar') + orig.write(b'foobar' * 16384) + + dest = io.BytesIO() + cctx = zstd_cffi.ZstdCompressor() + with cctx.write_to(dest) as compressor: + compressor.write(orig.getvalue()) + + uncompressed = io.BytesIO() + dctx = zstd.ZstdDecompressor() + with dctx.write_to(uncompressed) as decompressor: + decompressor.write(dest.getvalue()) + + self.assertEqual(uncompressed.getvalue(), orig.getvalue()) + + diff --git a/contrib/python-zstandard/tests/test_compressor.py b/contrib/python-zstandard/tests/test_compressor.py new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/tests/test_compressor.py @@ -0,0 +1,465 @@ +import hashlib 
+import io +import struct +import sys + +try: + import unittest2 as unittest +except ImportError: + import unittest + +import zstd + +from .common import OpCountingBytesIO + + +if sys.version_info[0] >= 3: + next = lambda it: it.__next__() +else: + next = lambda it: it.next() + + +class TestCompressor(unittest.TestCase): + def test_level_bounds(self): + with self.assertRaises(ValueError): + zstd.ZstdCompressor(level=0) + + with self.assertRaises(ValueError): + zstd.ZstdCompressor(level=23) + + +class TestCompressor_compress(unittest.TestCase): + def test_compress_empty(self): + cctx = zstd.ZstdCompressor(level=1) + cctx.compress(b'') + + cctx = zstd.ZstdCompressor(level=22) + cctx.compress(b'') + + def test_compress_empty(self): + cctx = zstd.ZstdCompressor(level=1) + self.assertEqual(cctx.compress(b''), + b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') + + def test_compress_large(self): + chunks = [] + for i in range(255): + chunks.append(struct.Struct('>B').pack(i) * 16384) + + cctx = zstd.ZstdCompressor(level=3) + result = cctx.compress(b''.join(chunks)) + self.assertEqual(len(result), 999) + self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd') + + def test_write_checksum(self): + cctx = zstd.ZstdCompressor(level=1) + no_checksum = cctx.compress(b'foobar') + cctx = zstd.ZstdCompressor(level=1, write_checksum=True) + with_checksum = cctx.compress(b'foobar') + + self.assertEqual(len(with_checksum), len(no_checksum) + 4) + + def test_write_content_size(self): + cctx = zstd.ZstdCompressor(level=1) + no_size = cctx.compress(b'foobar' * 256) + cctx = zstd.ZstdCompressor(level=1, write_content_size=True) + with_size = cctx.compress(b'foobar' * 256) + + self.assertEqual(len(with_size), len(no_size) + 1) + + def test_no_dict_id(self): + samples = [] + for i in range(128): + samples.append(b'foo' * 64) + samples.append(b'bar' * 64) + samples.append(b'foobar' * 64) + + d = zstd.train_dictionary(1024, samples) + + cctx = zstd.ZstdCompressor(level=1, dict_data=d) + with_dict_id = 
cctx.compress(b'foobarfoobar') + + cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False) + no_dict_id = cctx.compress(b'foobarfoobar') + + self.assertEqual(len(with_dict_id), len(no_dict_id) + 4) + + def test_compress_dict_multiple(self): + samples = [] + for i in range(128): + samples.append(b'foo' * 64) + samples.append(b'bar' * 64) + samples.append(b'foobar' * 64) + + d = zstd.train_dictionary(8192, samples) + + cctx = zstd.ZstdCompressor(level=1, dict_data=d) + + for i in range(32): + cctx.compress(b'foo bar foobar foo bar foobar') + + +class TestCompressor_compressobj(unittest.TestCase): + def test_compressobj_empty(self): + cctx = zstd.ZstdCompressor(level=1) + cobj = cctx.compressobj() + self.assertEqual(cobj.compress(b''), b'') + self.assertEqual(cobj.flush(), + b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') + + def test_compressobj_large(self): + chunks = [] + for i in range(255): + chunks.append(struct.Struct('>B').pack(i) * 16384) + + cctx = zstd.ZstdCompressor(level=3) + cobj = cctx.compressobj() + + result = cobj.compress(b''.join(chunks)) + cobj.flush() + self.assertEqual(len(result), 999) + self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd') + + def test_write_checksum(self): + cctx = zstd.ZstdCompressor(level=1) + cobj = cctx.compressobj() + no_checksum = cobj.compress(b'foobar') + cobj.flush() + cctx = zstd.ZstdCompressor(level=1, write_checksum=True) + cobj = cctx.compressobj() + with_checksum = cobj.compress(b'foobar') + cobj.flush() + + self.assertEqual(len(with_checksum), len(no_checksum) + 4) + + def test_write_content_size(self): + cctx = zstd.ZstdCompressor(level=1) + cobj = cctx.compressobj(size=len(b'foobar' * 256)) + no_size = cobj.compress(b'foobar' * 256) + cobj.flush() + cctx = zstd.ZstdCompressor(level=1, write_content_size=True) + cobj = cctx.compressobj(size=len(b'foobar' * 256)) + with_size = cobj.compress(b'foobar' * 256) + cobj.flush() + + self.assertEqual(len(with_size), len(no_size) + 1) + + def 
test_compress_after_flush(self): + cctx = zstd.ZstdCompressor() + cobj = cctx.compressobj() + + cobj.compress(b'foo') + cobj.flush() + + with self.assertRaisesRegexp(zstd.ZstdError, 'cannot call compress\(\) after flush'): + cobj.compress(b'foo') + + with self.assertRaisesRegexp(zstd.ZstdError, 'flush\(\) already called'): + cobj.flush() + + +class TestCompressor_copy_stream(unittest.TestCase): + def test_no_read(self): + source = object() + dest = io.BytesIO() + + cctx = zstd.ZstdCompressor() + with self.assertRaises(ValueError): + cctx.copy_stream(source, dest) + + def test_no_write(self): + source = io.BytesIO() + dest = object() + + cctx = zstd.ZstdCompressor() + with self.assertRaises(ValueError): + cctx.copy_stream(source, dest) + + def test_empty(self): + source = io.BytesIO() + dest = io.BytesIO() + + cctx = zstd.ZstdCompressor(level=1) + r, w = cctx.copy_stream(source, dest) + self.assertEqual(int(r), 0) + self.assertEqual(w, 9) + + self.assertEqual(dest.getvalue(), + b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') + + def test_large_data(self): + source = io.BytesIO() + for i in range(255): + source.write(struct.Struct('>B').pack(i) * 16384) + source.seek(0) + + dest = io.BytesIO() + cctx = zstd.ZstdCompressor() + r, w = cctx.copy_stream(source, dest) + + self.assertEqual(r, 255 * 16384) + self.assertEqual(w, 999) + + def test_write_checksum(self): + source = io.BytesIO(b'foobar') + no_checksum = io.BytesIO() + + cctx = zstd.ZstdCompressor(level=1) + cctx.copy_stream(source, no_checksum) + + source.seek(0) + with_checksum = io.BytesIO() + cctx = zstd.ZstdCompressor(level=1, write_checksum=True) + cctx.copy_stream(source, with_checksum) + + self.assertEqual(len(with_checksum.getvalue()), + len(no_checksum.getvalue()) + 4) + + def test_write_content_size(self): + source = io.BytesIO(b'foobar' * 256) + no_size = io.BytesIO() + + cctx = zstd.ZstdCompressor(level=1) + cctx.copy_stream(source, no_size) + + source.seek(0) + with_size = io.BytesIO() + cctx = 
zstd.ZstdCompressor(level=1, write_content_size=True) + cctx.copy_stream(source, with_size) + + # Source content size is unknown, so no content size written. + self.assertEqual(len(with_size.getvalue()), + len(no_size.getvalue())) + + source.seek(0) + with_size = io.BytesIO() + cctx.copy_stream(source, with_size, size=len(source.getvalue())) + + # We specified source size, so content size header is present. + self.assertEqual(len(with_size.getvalue()), + len(no_size.getvalue()) + 1) + + def test_read_write_size(self): + source = OpCountingBytesIO(b'foobarfoobar') + dest = OpCountingBytesIO() + cctx = zstd.ZstdCompressor() + r, w = cctx.copy_stream(source, dest, read_size=1, write_size=1) + + self.assertEqual(r, len(source.getvalue())) + self.assertEqual(w, 21) + self.assertEqual(source._read_count, len(source.getvalue()) + 1) + self.assertEqual(dest._write_count, len(dest.getvalue())) + + +def compress(data, level): + buffer = io.BytesIO() + cctx = zstd.ZstdCompressor(level=level) + with cctx.write_to(buffer) as compressor: + compressor.write(data) + return buffer.getvalue() + + +class TestCompressor_write_to(unittest.TestCase): + def test_empty(self): + self.assertEqual(compress(b'', 1), + b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') + + def test_multiple_compress(self): + buffer = io.BytesIO() + cctx = zstd.ZstdCompressor(level=5) + with cctx.write_to(buffer) as compressor: + compressor.write(b'foo') + compressor.write(b'bar') + compressor.write(b'x' * 8192) + + result = buffer.getvalue() + self.assertEqual(result, + b'\x28\xb5\x2f\xfd\x00\x50\x75\x00\x00\x38\x66\x6f' + b'\x6f\x62\x61\x72\x78\x01\x00\xfc\xdf\x03\x23') + + def test_dictionary(self): + samples = [] + for i in range(128): + samples.append(b'foo' * 64) + samples.append(b'bar' * 64) + samples.append(b'foobar' * 64) + + d = zstd.train_dictionary(8192, samples) + + buffer = io.BytesIO() + cctx = zstd.ZstdCompressor(level=9, dict_data=d) + with cctx.write_to(buffer) as compressor: + compressor.write(b'foo') 
+ compressor.write(b'bar') + compressor.write(b'foo' * 16384) + + compressed = buffer.getvalue() + h = hashlib.sha1(compressed).hexdigest() + self.assertEqual(h, '1c5bcd25181bcd8c1a73ea8773323e0056129f92') + + def test_compression_params(self): + params = zstd.CompressionParameters(20, 6, 12, 5, 4, 10, zstd.STRATEGY_FAST) + + buffer = io.BytesIO() + cctx = zstd.ZstdCompressor(compression_params=params) + with cctx.write_to(buffer) as compressor: + compressor.write(b'foo') + compressor.write(b'bar') + compressor.write(b'foobar' * 16384) + + compressed = buffer.getvalue() + h = hashlib.sha1(compressed).hexdigest() + self.assertEqual(h, '1ae31f270ed7de14235221a604b31ecd517ebd99') + + def test_write_checksum(self): + no_checksum = io.BytesIO() + cctx = zstd.ZstdCompressor(level=1) + with cctx.write_to(no_checksum) as compressor: + compressor.write(b'foobar') + + with_checksum = io.BytesIO() + cctx = zstd.ZstdCompressor(level=1, write_checksum=True) + with cctx.write_to(with_checksum) as compressor: + compressor.write(b'foobar') + + self.assertEqual(len(with_checksum.getvalue()), + len(no_checksum.getvalue()) + 4) + + def test_write_content_size(self): + no_size = io.BytesIO() + cctx = zstd.ZstdCompressor(level=1) + with cctx.write_to(no_size) as compressor: + compressor.write(b'foobar' * 256) + + with_size = io.BytesIO() + cctx = zstd.ZstdCompressor(level=1, write_content_size=True) + with cctx.write_to(with_size) as compressor: + compressor.write(b'foobar' * 256) + + # Source size is not known in streaming mode, so header not + # written. + self.assertEqual(len(with_size.getvalue()), + len(no_size.getvalue())) + + # Declaring size will write the header. 
+ with_size = io.BytesIO() + with cctx.write_to(with_size, size=len(b'foobar' * 256)) as compressor: + compressor.write(b'foobar' * 256) + + self.assertEqual(len(with_size.getvalue()), + len(no_size.getvalue()) + 1) + + def test_no_dict_id(self): + samples = [] + for i in range(128): + samples.append(b'foo' * 64) + samples.append(b'bar' * 64) + samples.append(b'foobar' * 64) + + d = zstd.train_dictionary(1024, samples) + + with_dict_id = io.BytesIO() + cctx = zstd.ZstdCompressor(level=1, dict_data=d) + with cctx.write_to(with_dict_id) as compressor: + compressor.write(b'foobarfoobar') + + cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False) + no_dict_id = io.BytesIO() + with cctx.write_to(no_dict_id) as compressor: + compressor.write(b'foobarfoobar') + + self.assertEqual(len(with_dict_id.getvalue()), + len(no_dict_id.getvalue()) + 4) + + def test_memory_size(self): + cctx = zstd.ZstdCompressor(level=3) + buffer = io.BytesIO() + with cctx.write_to(buffer) as compressor: + size = compressor.memory_size() + + self.assertGreater(size, 100000) + + def test_write_size(self): + cctx = zstd.ZstdCompressor(level=3) + dest = OpCountingBytesIO() + with cctx.write_to(dest, write_size=1) as compressor: + compressor.write(b'foo') + compressor.write(b'bar') + compressor.write(b'foobar') + + self.assertEqual(len(dest.getvalue()), dest._write_count) + + +class TestCompressor_read_from(unittest.TestCase): + def test_type_validation(self): + cctx = zstd.ZstdCompressor() + + # Object with read() works. + cctx.read_from(io.BytesIO()) + + # Buffer protocol works. 
+ cctx.read_from(b'foobar') + + with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'): + cctx.read_from(True) + + def test_read_empty(self): + cctx = zstd.ZstdCompressor(level=1) + + source = io.BytesIO() + it = cctx.read_from(source) + chunks = list(it) + self.assertEqual(len(chunks), 1) + compressed = b''.join(chunks) + self.assertEqual(compressed, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') + + # And again with the buffer protocol. + it = cctx.read_from(b'') + chunks = list(it) + self.assertEqual(len(chunks), 1) + compressed2 = b''.join(chunks) + self.assertEqual(compressed2, compressed) + + def test_read_large(self): + cctx = zstd.ZstdCompressor(level=1) + + source = io.BytesIO() + source.write(b'f' * zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE) + source.write(b'o') + source.seek(0) + + # Creating an iterator should not perform any compression until + # first read. + it = cctx.read_from(source, size=len(source.getvalue())) + self.assertEqual(source.tell(), 0) + + # We should have exactly 2 output chunks. + chunks = [] + chunk = next(it) + self.assertIsNotNone(chunk) + self.assertEqual(source.tell(), zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE) + chunks.append(chunk) + chunk = next(it) + self.assertIsNotNone(chunk) + chunks.append(chunk) + + self.assertEqual(source.tell(), len(source.getvalue())) + + with self.assertRaises(StopIteration): + next(it) + + # And again for good measure. + with self.assertRaises(StopIteration): + next(it) + + # We should get the same output as the one-shot compression mechanism. + self.assertEqual(b''.join(chunks), cctx.compress(source.getvalue())) + + # Now check the buffer protocol. 
+ it = cctx.read_from(source.getvalue()) + chunks = list(it) + self.assertEqual(len(chunks), 2) + self.assertEqual(b''.join(chunks), cctx.compress(source.getvalue())) + + def test_read_write_size(self): + source = OpCountingBytesIO(b'foobarfoobar') + cctx = zstd.ZstdCompressor(level=3) + for chunk in cctx.read_from(source, read_size=1, write_size=1): + self.assertEqual(len(chunk), 1) + + self.assertEqual(source._read_count, len(source.getvalue()) + 1) diff --git a/contrib/python-zstandard/tests/test_data_structures.py b/contrib/python-zstandard/tests/test_data_structures.py new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/tests/test_data_structures.py @@ -0,0 +1,107 @@ +import io + +try: + import unittest2 as unittest +except ImportError: + import unittest + +try: + import hypothesis + import hypothesis.strategies as strategies +except ImportError: + hypothesis = None + +import zstd + +class TestCompressionParameters(unittest.TestCase): + def test_init_bad_arg_type(self): + with self.assertRaises(TypeError): + zstd.CompressionParameters() + + with self.assertRaises(TypeError): + zstd.CompressionParameters(0, 1) + + def test_bounds(self): + zstd.CompressionParameters(zstd.WINDOWLOG_MIN, + zstd.CHAINLOG_MIN, + zstd.HASHLOG_MIN, + zstd.SEARCHLOG_MIN, + zstd.SEARCHLENGTH_MIN, + zstd.TARGETLENGTH_MIN, + zstd.STRATEGY_FAST) + + zstd.CompressionParameters(zstd.WINDOWLOG_MAX, + zstd.CHAINLOG_MAX, + zstd.HASHLOG_MAX, + zstd.SEARCHLOG_MAX, + zstd.SEARCHLENGTH_MAX, + zstd.TARGETLENGTH_MAX, + zstd.STRATEGY_BTOPT) + + def test_get_compression_parameters(self): + p = zstd.get_compression_parameters(1) + self.assertIsInstance(p, zstd.CompressionParameters) + + self.assertEqual(p[0], 19) + +if hypothesis: + s_windowlog = strategies.integers(min_value=zstd.WINDOWLOG_MIN, + max_value=zstd.WINDOWLOG_MAX) + s_chainlog = strategies.integers(min_value=zstd.CHAINLOG_MIN, + max_value=zstd.CHAINLOG_MAX) + s_hashlog = strategies.integers(min_value=zstd.HASHLOG_MIN, + 
max_value=zstd.HASHLOG_MAX) + s_searchlog = strategies.integers(min_value=zstd.SEARCHLOG_MIN, + max_value=zstd.SEARCHLOG_MAX) + s_searchlength = strategies.integers(min_value=zstd.SEARCHLENGTH_MIN, + max_value=zstd.SEARCHLENGTH_MAX) + s_targetlength = strategies.integers(min_value=zstd.TARGETLENGTH_MIN, + max_value=zstd.TARGETLENGTH_MAX) + s_strategy = strategies.sampled_from((zstd.STRATEGY_FAST, + zstd.STRATEGY_DFAST, + zstd.STRATEGY_GREEDY, + zstd.STRATEGY_LAZY, + zstd.STRATEGY_LAZY2, + zstd.STRATEGY_BTLAZY2, + zstd.STRATEGY_BTOPT)) + + class TestCompressionParametersHypothesis(unittest.TestCase): + @hypothesis.given(s_windowlog, s_chainlog, s_hashlog, s_searchlog, + s_searchlength, s_targetlength, s_strategy) + def test_valid_init(self, windowlog, chainlog, hashlog, searchlog, + searchlength, targetlength, strategy): + p = zstd.CompressionParameters(windowlog, chainlog, hashlog, + searchlog, searchlength, + targetlength, strategy) + self.assertEqual(tuple(p), + (windowlog, chainlog, hashlog, searchlog, + searchlength, targetlength, strategy)) + + # Verify we can instantiate a compressor with the supplied values. + # ZSTD_checkCParams moves the goal posts on us from what's advertised + # in the constants. So move along with them. 
+ if searchlength == zstd.SEARCHLENGTH_MIN and strategy in (zstd.STRATEGY_FAST, zstd.STRATEGY_GREEDY): + searchlength += 1 + p = zstd.CompressionParameters(windowlog, chainlog, hashlog, + searchlog, searchlength, + targetlength, strategy) + elif searchlength == zstd.SEARCHLENGTH_MAX and strategy != zstd.STRATEGY_FAST: + searchlength -= 1 + p = zstd.CompressionParameters(windowlog, chainlog, hashlog, + searchlog, searchlength, + targetlength, strategy) + + cctx = zstd.ZstdCompressor(compression_params=p) + with cctx.write_to(io.BytesIO()): + pass + + @hypothesis.given(s_windowlog, s_chainlog, s_hashlog, s_searchlog, + s_searchlength, s_targetlength, s_strategy) + def test_estimate_compression_context_size(self, windowlog, chainlog, + hashlog, searchlog, + searchlength, targetlength, + strategy): + p = zstd.CompressionParameters(windowlog, chainlog, hashlog, + searchlog, searchlength, + targetlength, strategy) + size = zstd.estimate_compression_context_size(p) diff --git a/contrib/python-zstandard/tests/test_decompressor.py b/contrib/python-zstandard/tests/test_decompressor.py new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/tests/test_decompressor.py @@ -0,0 +1,478 @@ +import io +import random +import struct +import sys + +try: + import unittest2 as unittest +except ImportError: + import unittest + +import zstd + +from .common import OpCountingBytesIO + + +if sys.version_info[0] >= 3: + next = lambda it: it.__next__() +else: + next = lambda it: it.next() + + +class TestDecompressor_decompress(unittest.TestCase): + def test_empty_input(self): + dctx = zstd.ZstdDecompressor() + + with self.assertRaisesRegexp(zstd.ZstdError, 'input data invalid'): + dctx.decompress(b'') + + def test_invalid_input(self): + dctx = zstd.ZstdDecompressor() + + with self.assertRaisesRegexp(zstd.ZstdError, 'input data invalid'): + dctx.decompress(b'foobar') + + def test_no_content_size_in_frame(self): + cctx = zstd.ZstdCompressor(write_content_size=False) + compressed = 
cctx.compress(b'foobar') + + dctx = zstd.ZstdDecompressor() + with self.assertRaisesRegexp(zstd.ZstdError, 'input data invalid'): + dctx.decompress(compressed) + + def test_content_size_present(self): + cctx = zstd.ZstdCompressor(write_content_size=True) + compressed = cctx.compress(b'foobar') + + dctx = zstd.ZstdDecompressor() + decompressed = dctx.decompress(compressed) + self.assertEqual(decompressed, b'foobar') + + def test_max_output_size(self): + cctx = zstd.ZstdCompressor(write_content_size=False) + source = b'foobar' * 256 + compressed = cctx.compress(source) + + dctx = zstd.ZstdDecompressor() + # Will fit into buffer exactly the size of input. + decompressed = dctx.decompress(compressed, max_output_size=len(source)) + self.assertEqual(decompressed, source) + + # Input size - 1 fails + with self.assertRaisesRegexp(zstd.ZstdError, 'Destination buffer is too small'): + dctx.decompress(compressed, max_output_size=len(source) - 1) + + # Input size + 1 works + decompressed = dctx.decompress(compressed, max_output_size=len(source) + 1) + self.assertEqual(decompressed, source) + + # A much larger buffer works. + decompressed = dctx.decompress(compressed, max_output_size=len(source) * 64) + self.assertEqual(decompressed, source) + + def test_stupidly_large_output_buffer(self): + cctx = zstd.ZstdCompressor(write_content_size=False) + compressed = cctx.compress(b'foobar' * 256) + dctx = zstd.ZstdDecompressor() + + # Will get OverflowError on some Python distributions that can't + # handle really large integers. 
+ with self.assertRaises((MemoryError, OverflowError)): + dctx.decompress(compressed, max_output_size=2**62) + + def test_dictionary(self): + samples = [] + for i in range(128): + samples.append(b'foo' * 64) + samples.append(b'bar' * 64) + samples.append(b'foobar' * 64) + + d = zstd.train_dictionary(8192, samples) + + orig = b'foobar' * 16384 + cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_content_size=True) + compressed = cctx.compress(orig) + + dctx = zstd.ZstdDecompressor(dict_data=d) + decompressed = dctx.decompress(compressed) + + self.assertEqual(decompressed, orig) + + def test_dictionary_multiple(self): + samples = [] + for i in range(128): + samples.append(b'foo' * 64) + samples.append(b'bar' * 64) + samples.append(b'foobar' * 64) + + d = zstd.train_dictionary(8192, samples) + + sources = (b'foobar' * 8192, b'foo' * 8192, b'bar' * 8192) + compressed = [] + cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_content_size=True) + for source in sources: + compressed.append(cctx.compress(source)) + + dctx = zstd.ZstdDecompressor(dict_data=d) + for i in range(len(sources)): + decompressed = dctx.decompress(compressed[i]) + self.assertEqual(decompressed, sources[i]) + + +class TestDecompressor_copy_stream(unittest.TestCase): + def test_no_read(self): + source = object() + dest = io.BytesIO() + + dctx = zstd.ZstdDecompressor() + with self.assertRaises(ValueError): + dctx.copy_stream(source, dest) + + def test_no_write(self): + source = io.BytesIO() + dest = object() + + dctx = zstd.ZstdDecompressor() + with self.assertRaises(ValueError): + dctx.copy_stream(source, dest) + + def test_empty(self): + source = io.BytesIO() + dest = io.BytesIO() + + dctx = zstd.ZstdDecompressor() + # TODO should this raise an error? 
+ r, w = dctx.copy_stream(source, dest) + + self.assertEqual(r, 0) + self.assertEqual(w, 0) + self.assertEqual(dest.getvalue(), b'') + + def test_large_data(self): + source = io.BytesIO() + for i in range(255): + source.write(struct.Struct('>B').pack(i) * 16384) + source.seek(0) + + compressed = io.BytesIO() + cctx = zstd.ZstdCompressor() + cctx.copy_stream(source, compressed) + + compressed.seek(0) + dest = io.BytesIO() + dctx = zstd.ZstdDecompressor() + r, w = dctx.copy_stream(compressed, dest) + + self.assertEqual(r, len(compressed.getvalue())) + self.assertEqual(w, len(source.getvalue())) + + def test_read_write_size(self): + source = OpCountingBytesIO(zstd.ZstdCompressor().compress( + b'foobarfoobar')) + + dest = OpCountingBytesIO() + dctx = zstd.ZstdDecompressor() + r, w = dctx.copy_stream(source, dest, read_size=1, write_size=1) + + self.assertEqual(r, len(source.getvalue())) + self.assertEqual(w, len(b'foobarfoobar')) + self.assertEqual(source._read_count, len(source.getvalue()) + 1) + self.assertEqual(dest._write_count, len(dest.getvalue())) + + +class TestDecompressor_decompressobj(unittest.TestCase): + def test_simple(self): + data = zstd.ZstdCompressor(level=1).compress(b'foobar') + + dctx = zstd.ZstdDecompressor() + dobj = dctx.decompressobj() + self.assertEqual(dobj.decompress(data), b'foobar') + + def test_reuse(self): + data = zstd.ZstdCompressor(level=1).compress(b'foobar') + + dctx = zstd.ZstdDecompressor() + dobj = dctx.decompressobj() + dobj.decompress(data) + + with self.assertRaisesRegexp(zstd.ZstdError, 'cannot use a decompressobj'): + dobj.decompress(data) + + +def decompress_via_writer(data): + buffer = io.BytesIO() + dctx = zstd.ZstdDecompressor() + with dctx.write_to(buffer) as decompressor: + decompressor.write(data) + return buffer.getvalue() + + +class TestDecompressor_write_to(unittest.TestCase): + def test_empty_roundtrip(self): + cctx = zstd.ZstdCompressor() + empty = cctx.compress(b'') + 
self.assertEqual(decompress_via_writer(empty), b'') + + def test_large_roundtrip(self): + chunks = [] + for i in range(255): + chunks.append(struct.Struct('>B').pack(i) * 16384) + orig = b''.join(chunks) + cctx = zstd.ZstdCompressor() + compressed = cctx.compress(orig) + + self.assertEqual(decompress_via_writer(compressed), orig) + + def test_multiple_calls(self): + chunks = [] + for i in range(255): + for j in range(255): + chunks.append(struct.Struct('>B').pack(j) * i) + + orig = b''.join(chunks) + cctx = zstd.ZstdCompressor() + compressed = cctx.compress(orig) + + buffer = io.BytesIO() + dctx = zstd.ZstdDecompressor() + with dctx.write_to(buffer) as decompressor: + pos = 0 + while pos < len(compressed): + pos2 = pos + 8192 + decompressor.write(compressed[pos:pos2]) + pos += 8192 + self.assertEqual(buffer.getvalue(), orig) + + def test_dictionary(self): + samples = [] + for i in range(128): + samples.append(b'foo' * 64) + samples.append(b'bar' * 64) + samples.append(b'foobar' * 64) + + d = zstd.train_dictionary(8192, samples) + + orig = b'foobar' * 16384 + buffer = io.BytesIO() + cctx = zstd.ZstdCompressor(dict_data=d) + with cctx.write_to(buffer) as compressor: + compressor.write(orig) + + compressed = buffer.getvalue() + buffer = io.BytesIO() + + dctx = zstd.ZstdDecompressor(dict_data=d) + with dctx.write_to(buffer) as decompressor: + decompressor.write(compressed) + + self.assertEqual(buffer.getvalue(), orig) + + def test_memory_size(self): + dctx = zstd.ZstdDecompressor() + buffer = io.BytesIO() + with dctx.write_to(buffer) as decompressor: + size = decompressor.memory_size() + + self.assertGreater(size, 100000) + + def test_write_size(self): + source = zstd.ZstdCompressor().compress(b'foobarfoobar') + dest = OpCountingBytesIO() + dctx = zstd.ZstdDecompressor() + with dctx.write_to(dest, write_size=1) as decompressor: + s = struct.Struct('>B') + for c in source: + if not isinstance(c, str): + c = s.pack(c) + decompressor.write(c) + + + 
self.assertEqual(dest.getvalue(), b'foobarfoobar') + self.assertEqual(dest._write_count, len(dest.getvalue())) + + +class TestDecompressor_read_from(unittest.TestCase): + def test_type_validation(self): + dctx = zstd.ZstdDecompressor() + + # Object with read() works. + dctx.read_from(io.BytesIO()) + + # Buffer protocol works. + dctx.read_from(b'foobar') + + with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'): + dctx.read_from(True) + + def test_empty_input(self): + dctx = zstd.ZstdDecompressor() + + source = io.BytesIO() + it = dctx.read_from(source) + # TODO this is arguably wrong. Should get an error about missing frame foo. + with self.assertRaises(StopIteration): + next(it) + + it = dctx.read_from(b'') + with self.assertRaises(StopIteration): + next(it) + + def test_invalid_input(self): + dctx = zstd.ZstdDecompressor() + + source = io.BytesIO(b'foobar') + it = dctx.read_from(source) + with self.assertRaisesRegexp(zstd.ZstdError, 'Unknown frame descriptor'): + next(it) + + it = dctx.read_from(b'foobar') + with self.assertRaisesRegexp(zstd.ZstdError, 'Unknown frame descriptor'): + next(it) + + def test_empty_roundtrip(self): + cctx = zstd.ZstdCompressor(level=1, write_content_size=False) + empty = cctx.compress(b'') + + source = io.BytesIO(empty) + source.seek(0) + + dctx = zstd.ZstdDecompressor() + it = dctx.read_from(source) + + # No chunks should be emitted since there is no data. + with self.assertRaises(StopIteration): + next(it) + + # Again for good measure. 
+ with self.assertRaises(StopIteration): + next(it) + + def test_skip_bytes_too_large(self): + dctx = zstd.ZstdDecompressor() + + with self.assertRaisesRegexp(ValueError, 'skip_bytes must be smaller than read_size'): + dctx.read_from(b'', skip_bytes=1, read_size=1) + + with self.assertRaisesRegexp(ValueError, 'skip_bytes larger than first input chunk'): + b''.join(dctx.read_from(b'foobar', skip_bytes=10)) + + def test_skip_bytes(self): + cctx = zstd.ZstdCompressor(write_content_size=False) + compressed = cctx.compress(b'foobar') + + dctx = zstd.ZstdDecompressor() + output = b''.join(dctx.read_from(b'hdr' + compressed, skip_bytes=3)) + self.assertEqual(output, b'foobar') + + def test_large_output(self): + source = io.BytesIO() + source.write(b'f' * zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE) + source.write(b'o') + source.seek(0) + + cctx = zstd.ZstdCompressor(level=1) + compressed = io.BytesIO(cctx.compress(source.getvalue())) + compressed.seek(0) + + dctx = zstd.ZstdDecompressor() + it = dctx.read_from(compressed) + + chunks = [] + chunks.append(next(it)) + chunks.append(next(it)) + + with self.assertRaises(StopIteration): + next(it) + + decompressed = b''.join(chunks) + self.assertEqual(decompressed, source.getvalue()) + + # And again with buffer protocol. 
+ it = dctx.read_from(compressed.getvalue()) + chunks = [] + chunks.append(next(it)) + chunks.append(next(it)) + + with self.assertRaises(StopIteration): + next(it) + + decompressed = b''.join(chunks) + self.assertEqual(decompressed, source.getvalue()) + + def test_large_input(self): + bytes = list(struct.Struct('>B').pack(i) for i in range(256)) + compressed = io.BytesIO() + input_size = 0 + cctx = zstd.ZstdCompressor(level=1) + with cctx.write_to(compressed) as compressor: + while True: + compressor.write(random.choice(bytes)) + input_size += 1 + + have_compressed = len(compressed.getvalue()) > zstd.DECOMPRESSION_RECOMMENDED_INPUT_SIZE + have_raw = input_size > zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE * 2 + if have_compressed and have_raw: + break + + compressed.seek(0) + self.assertGreater(len(compressed.getvalue()), + zstd.DECOMPRESSION_RECOMMENDED_INPUT_SIZE) + + dctx = zstd.ZstdDecompressor() + it = dctx.read_from(compressed) + + chunks = [] + chunks.append(next(it)) + chunks.append(next(it)) + chunks.append(next(it)) + + with self.assertRaises(StopIteration): + next(it) + + decompressed = b''.join(chunks) + self.assertEqual(len(decompressed), input_size) + + # And again with buffer protocol. + it = dctx.read_from(compressed.getvalue()) + + chunks = [] + chunks.append(next(it)) + chunks.append(next(it)) + chunks.append(next(it)) + + with self.assertRaises(StopIteration): + next(it) + + decompressed = b''.join(chunks) + self.assertEqual(len(decompressed), input_size) + + def test_interesting(self): + # Found this edge case via fuzzing. 
+ cctx = zstd.ZstdCompressor(level=1) + + source = io.BytesIO() + + compressed = io.BytesIO() + with cctx.write_to(compressed) as compressor: + for i in range(256): + chunk = b'\0' * 1024 + compressor.write(chunk) + source.write(chunk) + + dctx = zstd.ZstdDecompressor() + + simple = dctx.decompress(compressed.getvalue(), + max_output_size=len(source.getvalue())) + self.assertEqual(simple, source.getvalue()) + + compressed.seek(0) + streamed = b''.join(dctx.read_from(compressed)) + self.assertEqual(streamed, source.getvalue()) + + def test_read_write_size(self): + source = OpCountingBytesIO(zstd.ZstdCompressor().compress(b'foobarfoobar')) + dctx = zstd.ZstdDecompressor() + for chunk in dctx.read_from(source, read_size=1, write_size=1): + self.assertEqual(len(chunk), 1) + + self.assertEqual(source._read_count, len(source.getvalue())) diff --git a/contrib/python-zstandard/tests/test_estimate_sizes.py b/contrib/python-zstandard/tests/test_estimate_sizes.py new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/tests/test_estimate_sizes.py @@ -0,0 +1,17 @@ +try: + import unittest2 as unittest +except ImportError: + import unittest + +import zstd + + +class TestSizes(unittest.TestCase): + def test_decompression_size(self): + size = zstd.estimate_decompression_context_size() + self.assertGreater(size, 100000) + + def test_compression_size(self): + params = zstd.get_compression_parameters(3) + size = zstd.estimate_compression_context_size(params) + self.assertGreater(size, 100000) diff --git a/contrib/python-zstandard/tests/test_module_attributes.py b/contrib/python-zstandard/tests/test_module_attributes.py new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/tests/test_module_attributes.py @@ -0,0 +1,48 @@ +from __future__ import unicode_literals + +try: + import unittest2 as unittest +except ImportError: + import unittest + +import zstd + +class TestModuleAttributes(unittest.TestCase): + def test_version(self): + 
self.assertEqual(zstd.ZSTD_VERSION, (1, 1, 1)) + + def test_constants(self): + self.assertEqual(zstd.MAX_COMPRESSION_LEVEL, 22) + self.assertEqual(zstd.FRAME_HEADER, b'\x28\xb5\x2f\xfd') + + def test_hasattr(self): + attrs = ( + 'COMPRESSION_RECOMMENDED_INPUT_SIZE', + 'COMPRESSION_RECOMMENDED_OUTPUT_SIZE', + 'DECOMPRESSION_RECOMMENDED_INPUT_SIZE', + 'DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE', + 'MAGIC_NUMBER', + 'WINDOWLOG_MIN', + 'WINDOWLOG_MAX', + 'CHAINLOG_MIN', + 'CHAINLOG_MAX', + 'HASHLOG_MIN', + 'HASHLOG_MAX', + 'HASHLOG3_MAX', + 'SEARCHLOG_MIN', + 'SEARCHLOG_MAX', + 'SEARCHLENGTH_MIN', + 'SEARCHLENGTH_MAX', + 'TARGETLENGTH_MIN', + 'TARGETLENGTH_MAX', + 'STRATEGY_FAST', + 'STRATEGY_DFAST', + 'STRATEGY_GREEDY', + 'STRATEGY_LAZY', + 'STRATEGY_LAZY2', + 'STRATEGY_BTLAZY2', + 'STRATEGY_BTOPT', + ) + + for a in attrs: + self.assertTrue(hasattr(zstd, a)) diff --git a/contrib/python-zstandard/tests/test_roundtrip.py b/contrib/python-zstandard/tests/test_roundtrip.py new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/tests/test_roundtrip.py @@ -0,0 +1,64 @@ +import io + +try: + import unittest2 as unittest +except ImportError: + import unittest + +try: + import hypothesis + import hypothesis.strategies as strategies +except ImportError: + raise unittest.SkipTest('hypothesis not available') + +import zstd + + +compression_levels = strategies.integers(min_value=1, max_value=22) + + +class TestRoundTrip(unittest.TestCase): + @hypothesis.given(strategies.binary(), compression_levels) + def test_compress_write_to(self, data, level): + """Random data from compress() roundtrips via write_to.""" + cctx = zstd.ZstdCompressor(level=level) + compressed = cctx.compress(data) + + buffer = io.BytesIO() + dctx = zstd.ZstdDecompressor() + with dctx.write_to(buffer) as decompressor: + decompressor.write(compressed) + + self.assertEqual(buffer.getvalue(), data) + + @hypothesis.given(strategies.binary(), compression_levels) + def 
test_compressor_write_to_decompressor_write_to(self, data, level): + """Random data from compressor write_to roundtrips via write_to.""" + compress_buffer = io.BytesIO() + decompressed_buffer = io.BytesIO() + + cctx = zstd.ZstdCompressor(level=level) + with cctx.write_to(compress_buffer) as compressor: + compressor.write(data) + + dctx = zstd.ZstdDecompressor() + with dctx.write_to(decompressed_buffer) as decompressor: + decompressor.write(compress_buffer.getvalue()) + + self.assertEqual(decompressed_buffer.getvalue(), data) + + @hypothesis.given(strategies.binary(average_size=1048576)) + @hypothesis.settings(perform_health_check=False) + def test_compressor_write_to_decompressor_write_to_larger(self, data): + compress_buffer = io.BytesIO() + decompressed_buffer = io.BytesIO() + + cctx = zstd.ZstdCompressor(level=5) + with cctx.write_to(compress_buffer) as compressor: + compressor.write(data) + + dctx = zstd.ZstdDecompressor() + with dctx.write_to(decompressed_buffer) as decompressor: + decompressor.write(compress_buffer.getvalue()) + + self.assertEqual(decompressed_buffer.getvalue(), data) diff --git a/contrib/python-zstandard/tests/test_train_dictionary.py b/contrib/python-zstandard/tests/test_train_dictionary.py new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/tests/test_train_dictionary.py @@ -0,0 +1,46 @@ +import sys + +try: + import unittest2 as unittest +except ImportError: + import unittest + +import zstd + + +if sys.version_info[0] >= 3: + int_type = int +else: + int_type = long + + +class TestTrainDictionary(unittest.TestCase): + def test_no_args(self): + with self.assertRaises(TypeError): + zstd.train_dictionary() + + def test_bad_args(self): + with self.assertRaises(TypeError): + zstd.train_dictionary(8192, u'foo') + + with self.assertRaises(ValueError): + zstd.train_dictionary(8192, [u'foo']) + + def test_basic(self): + samples = [] + for i in range(128): + samples.append(b'foo' * 64) + samples.append(b'bar' * 64) + 
samples.append(b'foobar' * 64) + samples.append(b'baz' * 64) + samples.append(b'foobaz' * 64) + samples.append(b'bazfoo' * 64) + + d = zstd.train_dictionary(8192, samples) + self.assertLessEqual(len(d), 8192) + + dict_id = d.dict_id() + self.assertIsInstance(dict_id, int_type) + + data = d.as_bytes() + self.assertEqual(data[0:4], b'\x37\xa4\x30\xec') diff --git a/contrib/python-zstandard/zstd.c b/contrib/python-zstandard/zstd.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/zstd.c @@ -0,0 +1,112 @@ +/** + * Copyright (c) 2016-present, Gregory Szorc + * All rights reserved. + * + * This software may be modified and distributed under the terms + * of the BSD license. See the LICENSE file for details. + */ + +/* A Python C extension for Zstandard. */ + +#include "python-zstandard.h" + +PyObject *ZstdError; + +PyDoc_STRVAR(estimate_compression_context_size__doc__, +"estimate_compression_context_size(compression_parameters)\n" +"\n" +"Give the amount of memory allocated for a compression context given a\n" +"CompressionParameters instance"); + +PyDoc_STRVAR(estimate_decompression_context_size__doc__, +"estimate_decompression_context_size()\n" +"\n" +"Estimate the amount of memory allocated to a decompression context.\n" +); + +static PyObject* estimate_decompression_context_size(PyObject* self) { + return PyLong_FromSize_t(ZSTD_estimateDCtxSize()); +} + +PyDoc_STRVAR(get_compression_parameters__doc__, +"get_compression_parameters(compression_level[, source_size[, dict_size]])\n" +"\n" +"Obtains a ``CompressionParameters`` instance from a compression level and\n" +"optional input size and dictionary size"); + +PyDoc_STRVAR(train_dictionary__doc__, +"train_dictionary(dict_size, samples)\n" +"\n" +"Train a dictionary from sample data.\n" +"\n" +"A compression dictionary of size ``dict_size`` will be created from the\n" +"iterable of samples provided by ``samples``.\n" +"\n" +"The raw dictionary content will be returned\n"); + +static char zstd_doc[] = 
"Interface to zstandard"; + +static PyMethodDef zstd_methods[] = { + { "estimate_compression_context_size", (PyCFunction)estimate_compression_context_size, + METH_VARARGS, estimate_compression_context_size__doc__ }, + { "estimate_decompression_context_size", (PyCFunction)estimate_decompression_context_size, + METH_NOARGS, estimate_decompression_context_size__doc__ }, + { "get_compression_parameters", (PyCFunction)get_compression_parameters, + METH_VARARGS, get_compression_parameters__doc__ }, + { "train_dictionary", (PyCFunction)train_dictionary, + METH_VARARGS | METH_KEYWORDS, train_dictionary__doc__ }, + { NULL, NULL } +}; + +void compressobj_module_init(PyObject* mod); +void compressor_module_init(PyObject* mod); +void compressionparams_module_init(PyObject* mod); +void constants_module_init(PyObject* mod); +void dictparams_module_init(PyObject* mod); +void compressiondict_module_init(PyObject* mod); +void compressionwriter_module_init(PyObject* mod); +void compressoriterator_module_init(PyObject* mod); +void decompressor_module_init(PyObject* mod); +void decompressobj_module_init(PyObject* mod); +void decompressionwriter_module_init(PyObject* mod); +void decompressoriterator_module_init(PyObject* mod); + +void zstd_module_init(PyObject* m) { + compressionparams_module_init(m); + dictparams_module_init(m); + compressiondict_module_init(m); + compressobj_module_init(m); + compressor_module_init(m); + compressionwriter_module_init(m); + compressoriterator_module_init(m); + constants_module_init(m); + decompressor_module_init(m); + decompressobj_module_init(m); + decompressionwriter_module_init(m); + decompressoriterator_module_init(m); +} + +#if PY_MAJOR_VERSION >= 3 +static struct PyModuleDef zstd_module = { + PyModuleDef_HEAD_INIT, + "zstd", + zstd_doc, + -1, + zstd_methods +}; + +PyMODINIT_FUNC PyInit_zstd(void) { + PyObject *m = PyModule_Create(&zstd_module); + if (m) { + zstd_module_init(m); + } + return m; +} +#else +PyMODINIT_FUNC initzstd(void) { + 
PyObject *m = Py_InitModule3("zstd", zstd_methods, zstd_doc);
+	if (m) {
+		zstd_module_init(m);
+	}
+}
+#endif
diff --git a/contrib/python-zstandard/zstd_cffi.py b/contrib/python-zstandard/zstd_cffi.py
new file mode 100644
--- /dev/null
+++ b/contrib/python-zstandard/zstd_cffi.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2016-present, Gregory Szorc
+# All rights reserved.
+#
+# This software may be modified and distributed under the terms
+# of the BSD license. See the LICENSE file for details.
+
+"""Python interface to the Zstandard (zstd) compression library."""
+
+from __future__ import absolute_import, unicode_literals
+
+import io
+
+from _zstd_cffi import (
+    ffi,
+    lib,
+)
+
+
+_CSTREAM_IN_SIZE = lib.ZSTD_CStreamInSize()
+_CSTREAM_OUT_SIZE = lib.ZSTD_CStreamOutSize()
+
+
+class _ZstdCompressionWriter(object):
+    def __init__(self, cstream, writer):
+        self._cstream = cstream
+        self._writer = writer
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        if not exc_type and not exc_value and not exc_tb:
+            out_buffer = ffi.new('ZSTD_outBuffer *')
+            out_buffer.dst = ffi.new('char[]', _CSTREAM_OUT_SIZE)
+            out_buffer.size = _CSTREAM_OUT_SIZE
+            out_buffer.pos = 0
+
+            while True:
+                res = lib.ZSTD_endStream(self._cstream, out_buffer)
+                if lib.ZSTD_isError(res):
+                    raise Exception('error ending compression stream: %s' % lib.ZSTD_getErrorName(res))
+
+                if out_buffer.pos:
+                    self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos))
+                    out_buffer.pos = 0
+
+                if res == 0:
+                    break
+
+        return False
+
+    def write(self, data):
+        out_buffer = ffi.new('ZSTD_outBuffer *')
+        out_buffer.dst = ffi.new('char[]', _CSTREAM_OUT_SIZE)
+        out_buffer.size = _CSTREAM_OUT_SIZE
+        out_buffer.pos = 0
+
+        # TODO can we reuse existing memory?
+        in_buffer = ffi.new('ZSTD_inBuffer *')
+        in_buffer.src = ffi.new('char[]', data)
+        in_buffer.size = len(data)
+        in_buffer.pos = 0
+        while in_buffer.pos < in_buffer.size:
+            res = lib.ZSTD_compressStream(self._cstream, out_buffer, in_buffer)
+            if lib.ZSTD_isError(res):
+                raise Exception('zstd compress error: %s' % lib.ZSTD_getErrorName(res))
+
+            if out_buffer.pos:
+                self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos))
+                out_buffer.pos = 0
+
+
+class ZstdCompressor(object):
+    def __init__(self, level=3, dict_data=None, compression_params=None):
+        if dict_data:
+            raise Exception('dict_data not yet supported')
+        if compression_params:
+            raise Exception('compression_params not yet supported')
+
+        self._compression_level = level
+
+    def compress(self, data):
+        # Just use the stream API for now.
+        output = io.BytesIO()
+        with self.write_to(output) as compressor:
+            compressor.write(data)
+        return output.getvalue()
+
+    def copy_stream(self, ifh, ofh):
+        cstream = self._get_cstream()
+
+        in_buffer = ffi.new('ZSTD_inBuffer *')
+        out_buffer = ffi.new('ZSTD_outBuffer *')
+
+        out_buffer.dst = ffi.new('char[]', _CSTREAM_OUT_SIZE)
+        out_buffer.size = _CSTREAM_OUT_SIZE
+        out_buffer.pos = 0
+
+        total_read, total_write = 0, 0
+
+        while True:
+            data = ifh.read(_CSTREAM_IN_SIZE)
+            if not data:
+                break
+
+            total_read += len(data)
+
+            in_buffer.src = ffi.new('char[]', data)
+            in_buffer.size = len(data)
+            in_buffer.pos = 0
+
+            while in_buffer.pos < in_buffer.size:
+                res = lib.ZSTD_compressStream(cstream, out_buffer, in_buffer)
+                if lib.ZSTD_isError(res):
+                    raise Exception('zstd compress error: %s' %
+                                    lib.ZSTD_getErrorName(res))
+
+                if out_buffer.pos:
+                    ofh.write(ffi.buffer(out_buffer.dst, out_buffer.pos))
+                    total_write += out_buffer.pos
+                    out_buffer.pos = 0
+
+        # We've finished reading. Flush the compressor.
+ while True: + res = lib.ZSTD_endStream(cstream, out_buffer) + if lib.ZSTD_isError(res): + raise Exception('error ending compression stream: %s' % + lib.ZSTD_getErrorName(res)) + + if out_buffer.pos: + ofh.write(ffi.buffer(out_buffer.dst, out_buffer.pos)) + total_write += out_buffer.pos + out_buffer.pos = 0 + + if res == 0: + break + + return total_read, total_write + + def write_to(self, writer): + return _ZstdCompressionWriter(self._get_cstream(), writer) + + def _get_cstream(self): + cstream = lib.ZSTD_createCStream() + cstream = ffi.gc(cstream, lib.ZSTD_freeCStream) + + res = lib.ZSTD_initCStream(cstream, self._compression_level) + if lib.ZSTD_isError(res): + raise Exception('cannot init CStream: %s' % + lib.ZSTD_getErrorName(res)) + + return cstream diff --git a/tests/test-check-code.t b/tests/test-check-code.t --- a/tests/test-check-code.t +++ b/tests/test-check-code.t @@ -7,7 +7,7 @@ New errors are not allowed. Warnings are strongly discouraged. (The writing "no-che?k-code" is for not skipping this file when checking.) - $ hg locate | sed 's-\\-/-g' | + $ hg locate -X contrib/python-zstandard | sed 's-\\-/-g' | > xargs "$check_code" --warnings --per-file=0 || false Skipping hgext/fsmonitor/pywatchman/__init__.py it has no-che?k-code (glob) Skipping hgext/fsmonitor/pywatchman/bser.c it has no-che?k-code (glob) diff --git a/tests/test-check-module-imports.t b/tests/test-check-module-imports.t --- a/tests/test-check-module-imports.t +++ b/tests/test-check-module-imports.t @@ -159,6 +159,7 @@ outputs, which should be fixed later. $ hg locate 'set:**.py or grep(r"^#!.*?python")' \ > 'tests/**.t' \ > -X contrib/debugshell.py \ + > -X contrib/python-zstandard/ \ > -X contrib/win32/hgwebdir_wsgi.py \ > -X doc/gendoc.py \ > -X doc/hgmanpage.py \ diff --git a/tests/test-check-py3-compat.t b/tests/test-check-py3-compat.t --- a/tests/test-check-py3-compat.t +++ b/tests/test-check-py3-compat.t @@ -4,6 +4,17 @@ $ cd "$TESTDIR"/.. 
$ hg files 'set:(**.py)' | sed 's|\\|/|g' | xargs python contrib/check-py3-compat.py + contrib/python-zstandard/setup.py not using absolute_import + contrib/python-zstandard/setup_zstd.py not using absolute_import + contrib/python-zstandard/tests/common.py not using absolute_import + contrib/python-zstandard/tests/test_cffi.py not using absolute_import + contrib/python-zstandard/tests/test_compressor.py not using absolute_import + contrib/python-zstandard/tests/test_data_structures.py not using absolute_import + contrib/python-zstandard/tests/test_decompressor.py not using absolute_import + contrib/python-zstandard/tests/test_estimate_sizes.py not using absolute_import + contrib/python-zstandard/tests/test_module_attributes.py not using absolute_import + contrib/python-zstandard/tests/test_roundtrip.py not using absolute_import + contrib/python-zstandard/tests/test_train_dictionary.py not using absolute_import hgext/fsmonitor/pywatchman/__init__.py not using absolute_import hgext/fsmonitor/pywatchman/__init__.py requires print_function hgext/fsmonitor/pywatchman/capabilities.py not using absolute_import diff --git a/tests/test-check-pyflakes.t b/tests/test-check-pyflakes.t --- a/tests/test-check-pyflakes.t +++ b/tests/test-check-pyflakes.t @@ -10,6 +10,6 @@ run pyflakes on all tracked files ending > -X mercurial/pycompat.py \ > 2>/dev/null \ > | xargs pyflakes 2>/dev/null | "$TESTDIR/filterpyflakes.py" + contrib/python-zstandard/tests/test_data_structures.py:107: local variable 'size' is assigned to but never used tests/filterpyflakes.py:39: undefined name 'undefinedname' -