# HG changeset patch # User Gregory Szorc # Date 2018-10-08 23:27:40 # Node ID 73fef626dae3049a6e5810f7f2f512e082131dba # Parent 89742f1fa6cbf176259d694377aae17a0f7cd88d zstandard: vendor python-zstandard 0.10.1 This was just released. The upstream source distribution from PyPI was extracted. Unwanted files were removed. The clang-format ignore list was updated to reflect the new source of files. setup.py was updated to pass a new argument to python-zstandard's function for returning an Extension instance. Upstream had to change to use relative paths because Python 3.7's packaging doesn't seem to like absolute paths when defining sources, includes, etc. The default relative path calculation is relative to setup_zstd.py, which is different from the directory of Mercurial's setup.py. The project contains a vendored copy of zstandard 1.3.6. The old version was 1.3.4. The API should be backwards compatible and nothing in core should need to be adjusted. However, there is a new "chunker" API that we may find useful in places where we want to emit compressed chunks of a fixed size. 0.10.0 contains a pair of bug fixes to compressobj() and decompressobj() when block flushing is used. I actually found these bugs when introducing these APIs in Mercurial! But existing Mercurial code is not affected because we don't perform block flushing. # no-check-commit because 3rd party code has different style guidelines Differential Revision: https://phab.mercurial-scm.org/D4911 diff --git a/contrib/clang-format-ignorelist b/contrib/clang-format-ignorelist --- a/contrib/clang-format-ignorelist +++ b/contrib/clang-format-ignorelist @@ -6,6 +6,7 @@ mercurial/cext/osutil.c mercurial/cext/revlog.c # Vendored code that we should never format: contrib/python-zstandard/c-ext/bufferutil.c +contrib/python-zstandard/c-ext/compressionchunker.c contrib/python-zstandard/c-ext/compressiondict.c contrib/python-zstandard/c-ext/compressionparams.c contrib/python-zstandard/c-ext/compressionreader.c @@ -25,6 +26,8 @@ contrib/python-zstandard/zstd.c contrib/python-zstandard/zstd/common/bitstream.h contrib/python-zstandard/zstd/common/compiler.h contrib/python-zstandard/zstd/common/cpu.h +contrib/python-zstandard/zstd/common/debug.c +contrib/python-zstandard/zstd/common/debug.h contrib/python-zstandard/zstd/common/entropy_common.c contrib/python-zstandard/zstd/common/error_private.c contrib/python-zstandard/zstd/common/error_private.h @@ -42,6 +45,8 @@ contrib/python-zstandard/zstd/common/zst contrib/python-zstandard/zstd/common/zstd_errors.h contrib/python-zstandard/zstd/common/zstd_internal.h contrib/python-zstandard/zstd/compress/fse_compress.c +contrib/python-zstandard/zstd/compress/hist.c +contrib/python-zstandard/zstd/compress/hist.h contrib/python-zstandard/zstd/compress/huf_compress.c contrib/python-zstandard/zstd/compress/zstd_compress.c contrib/python-zstandard/zstd/compress/zstd_compress_internal.h @@ -64,8 +69,10 @@ contrib/python-zstandard/zstd/deprecated contrib/python-zstandard/zstd/deprecated/zbuff_decompress.c contrib/python-zstandard/zstd/deprecated/zbuff.h contrib/python-zstandard/zstd/dictBuilder/cover.c +contrib/python-zstandard/zstd/dictBuilder/cover.h contrib/python-zstandard/zstd/dictBuilder/divsufsort.c contrib/python-zstandard/zstd/dictBuilder/divsufsort.h +contrib/python-zstandard/zstd/dictBuilder/fastcover.c contrib/python-zstandard/zstd/dictBuilder/zdict.c contrib/python-zstandard/zstd/dictBuilder/zdict.h contrib/python-zstandard/zstd/zstd.h diff --git a/contrib/python-zstandard/MANIFEST.in
b/contrib/python-zstandard/MANIFEST.in --- a/contrib/python-zstandard/MANIFEST.in +++ b/contrib/python-zstandard/MANIFEST.in @@ -1,7 +1,10 @@ graft c-ext +graft debian graft zstd graft tests include make_cffi.py include setup_zstd.py include zstd.c +include zstd_cffi.py include LICENSE +include NEWS.rst diff --git a/contrib/python-zstandard/NEWS.rst b/contrib/python-zstandard/NEWS.rst --- a/contrib/python-zstandard/NEWS.rst +++ b/contrib/python-zstandard/NEWS.rst @@ -30,6 +30,19 @@ Actions Blocking Release * Remove low-level compression parameters from ``ZstdCompressor.__init__`` and require use of ``CompressionParameters``. * Expose ``ZSTD_getFrameProgression()`` from more compressor types. +* Support modifying compression parameters mid-operation when supported by + the zstd API. +* Expose ``ZSTD_CLEVEL_DEFAULT`` constant. +* Support ``ZSTD_p_forceAttachDict`` compression parameter. +* Use ``ZSTD_CCtx_getParameter()``/``ZSTD_CCtxParam_getParameter()`` for retrieving + compression parameters. +* Consider exposing ``ZSTDMT_toFlushNow()``. +* Expose ``ZDICT_trainFromBuffer_fastCover()``, + ``ZDICT_optimizeTrainFromBuffer_fastCover()``. +* Expose and enforce ``ZSTD_minCLevel()`` for minimum compression level. +* Consider a ``chunker()`` API for decompression. +* Consider stats for ``chunker()`` API, including finding the last consumed + offset of input data. Other Actions Not Blocking Release --------------------------------------- @@ -38,6 +51,111 @@ Other Actions Not Blocking Release * API for ensuring max memory ceiling isn't exceeded. * Move off nose for testing. +0.10.1 (released 2018-10-08) +============================ + +Backwards Compatibility Notes +----------------------------- + +* ``ZstdCompressor.stream_reader().closed`` is now a property instead of a + method (#58). +* ``ZstdDecompressor.stream_reader().closed`` is now a property instead of a + method (#58). + +Changes +------- + +* Stop attempting to package Python 3.6 for Miniconda. The latest version of + Miniconda is using Python 3.7. The Python 3.6 Miniconda packages were a lie + since they were built against Python 3.7. +* ``ZstdCompressor.stream_reader()``'s and ``ZstdDecompressor.stream_reader()``'s + ``closed`` attribute is now a read-only property instead of a method. This now + properly matches the ``IOBase`` API and allows instances to be used in more + places that accept ``IOBase`` instances. + +0.10.0 (released 2018-10-08) +============================ + +Backwards Compatibility Notes +----------------------------- + +* ``ZstdDecompressor.stream_reader().read()`` now consistently requires an + argument in both the C and CFFI backends. Before, the CFFI implementation + would assume a default value of ``-1``, which was later rejected. +* The ``compress_literals`` argument and attribute have been removed from + ``zstd.ZstdCompressionParameters`` because it was removed by the zstd 1.3.5 + API. +* ``ZSTD_CCtx_setParametersUsingCCtxParams()`` is no longer called on every + operation performed against ``ZstdCompressor`` instances. The reason for this + change is that the zstd 1.3.5 API no longer allows this without calling + ``ZSTD_CCtx_resetParameters()`` first. But if we called + ``ZSTD_CCtx_resetParameters()`` on every operation, we'd have to redo + potentially expensive setup when using dictionaries. We now call + ``ZSTD_CCtx_reset()`` on every operation and don't attempt to change + compression parameters. +* Objects returned by ``ZstdCompressor.stream_reader()`` no longer need to be + used as a context manager.
The context manager interface still exists and its + behavior is unchanged. +* Objects returned by ``ZstdDecompressor.stream_reader()`` no longer need to be + used as a context manager. The context manager interface still exists and its + behavior is unchanged. + +Bug Fixes +--------- + +* ``ZstdDecompressor.decompressobj().decompress()`` should now return all data + from internal buffers in more scenarios. Before, it was possible for data to + remain in internal buffers. This data would be emitted on a subsequent call + to ``decompress()``. The overall output stream would still be valid. But if + callers were expecting input data to exactly map to output data (say the + producer had used ``flush(COMPRESSOBJ_FLUSH_BLOCK)`` and was attempting to + map input chunks to output chunks), then the previous behavior would be + wrong. The new behavior is such that output from + ``flush(COMPRESSOBJ_FLUSH_BLOCK)`` fed into ``decompressobj().decompress()`` + should produce all of the input data compressed so far. +* ``ZstdDecompressor.stream_reader().read()`` should no longer segfault after + a previous context manager resulted in error (#56). +* ``ZstdCompressor.compressobj().flush(COMPRESSOBJ_FLUSH_BLOCK)`` now returns + all data necessary to flush a block. Before, it was possible for the + ``flush()`` to not emit all data necessary to fully represent a block. This + would mean decompressors wouldn't be able to decompress all data that had been + fed into the compressor and ``flush()``ed (#55). + +New Features +------------ + +* New module constants ``BLOCKSIZELOG_MAX``, ``BLOCKSIZE_MAX``, + ``TARGETLENGTH_MAX`` that expose constants from libzstd. +* New ``ZstdCompressor.chunker()`` API for manually feeding data into a + compressor and emitting chunks of a fixed size. Like ``compressobj()``, the + API doesn't impose restrictions on the input or output types for the + data streams. Unlike ``compressobj()``, it ensures output chunks are of a + fixed size. This makes the API useful when the compressed output is being + fed into an I/O layer, where uniform write sizes are desirable. +* ``ZstdCompressor.stream_reader()`` no longer needs to be used as a context + manager (#34). +* ``ZstdDecompressor.stream_reader()`` no longer needs to be used as a context + manager (#34). +* Bundled zstandard library upgraded from 1.3.4 to 1.3.6. + +Changes +------- + +* Added ``zstd_cffi.py`` and ``NEWS.rst`` to ``MANIFEST.in``. +* ``zstandard.__version__`` is now defined (#50). +* Upgrade pip, setuptools, wheel, and cibuildwheel packages to latest versions. +* Upgrade various packages used in CI to latest versions. Notably tox (in + order to support Python 3.7). +* Use relative paths in setup.py to appease Python 3.7 (#51). +* Added CI for Python 3.7. + +0.9.1 (released 2018-06-04) +=========================== + +* Debian packaging support. +* Fix typo in setup.py (#44). +* Support building with mingw compiler (#46). + 0.9.0 (released 2018-04-08) =========================== @@ -90,7 +208,7 @@ Bug Fixes New Features ------------ -* Bundlded zstandard library upgraded from 1.1.3 to 1.3.4. This delivers various +* Bundled zstandard library upgraded from 1.1.3 to 1.3.4. This delivers various bug fixes and performance improvements. It also gives us access to newer features. * Support for negative compression levels.
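To illustrate the 0.10.0 flush-block fixes described in the NEWS entries above: a minimal sketch, assuming the module imports as ``zstd`` as it does in this package's test suite::

    import zstd

    cctx = zstd.ZstdCompressor()
    cobj = cctx.compressobj()

    dctx = zstd.ZstdDecompressor()
    dobj = dctx.decompressobj()

    # flush(COMPRESSOBJ_FLUSH_BLOCK) now emits all data needed to
    # represent the block, and decompress() drains internal buffers,
    # so the input compressed so far round-trips exactly.
    chunk = cobj.compress(b'foo' * 1024)
    chunk += cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK)
    assert dobj.decompress(chunk) == b'foo' * 1024

    # End the zstd frame.
    dobj.decompress(cobj.flush(zstd.COMPRESSOBJ_FLUSH_FINISH))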
diff --git a/contrib/python-zstandard/README.rst b/contrib/python-zstandard/README.rst --- a/contrib/python-zstandard/README.rst +++ b/contrib/python-zstandard/README.rst @@ -196,6 +196,17 @@ Stream Reader API with open(path, 'rb') as fh: cctx = zstd.ZstdCompressor() + reader = cctx.stream_reader(fh) + while True: + chunk = reader.read(16384) + if not chunk: + break + + # Do something with compressed chunk. + +Instances can also be used as context managers:: + + with open(path, 'rb') as fh: with cctx.stream_reader(fh) as reader: while True: chunk = reader.read(16384) @@ -204,9 +215,9 @@ Stream Reader API # Do something with compressed chunk. -The stream can only be read within a context manager. When the context -manager exits, the stream is closed and the underlying resource is -released and future operations against the compression stream stream will fail. +When the context manager exits or ``close()`` is called, the stream is closed, +underlying resources are released, and future operations against the compression +stream will fail. The ``source`` argument to ``stream_reader()`` can be any object with a ``read(size)`` method or any object implementing the *buffer protocol*. @@ -419,6 +430,64 @@ the compressor:: data = cobj.compress(b'foobar') data = cobj.flush() +Chunker API +^^^^^^^^^^^ + +``chunker(size=None, chunk_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE)`` returns +an object that can be used to iteratively feed chunks of data into a compressor +and produce output chunks of a uniform size. + +The object returned by ``chunker()`` exposes the following methods: + +``compress(data)`` + Feeds new input data into the compressor. + +``flush()`` + Flushes all data currently in the compressor. + +``finish()`` + Signals the end of input data. No new data can be compressed after this + method is called. + +``compress()``, ``flush()``, and ``finish()`` all return an iterator of +``bytes`` instances holding compressed data. The iterator may be empty. Callers +MUST iterate through all elements of the returned iterator before performing +another operation on the object. + +All chunks emitted by ``compress()`` will have a length of ``chunk_size``. + +``flush()`` and ``finish()`` may return a final chunk smaller than +``chunk_size``. + +Here is how the API should be used:: + + cctx = zstd.ZstdCompressor() + chunker = cctx.chunker(chunk_size=32768) + + with open(path, 'rb') as fh: + while True: + in_chunk = fh.read(32768) + if not in_chunk: + break + + for out_chunk in chunker.compress(in_chunk): + # Do something with output chunk of size 32768. + + for out_chunk in chunker.finish(): + # Do something with output chunks that finalize the zstd frame. + +The ``chunker()`` API is often a better alternative to ``compressobj()``. + +``compressobj()`` will emit output data as it is available. This results in a +*stream* of output chunks of varying sizes. The consistency of the output chunk +size with ``chunker()`` is more appropriate for many usages, such as sending +compressed data to a socket. + +``compressobj()`` may also perform extra memory reallocations in order to +dynamically adjust the sizes of the output chunks. Since ``chunker()`` output +chunks are all the same size (except for flushed or final chunks), there is +less memory allocation overhead.
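+For example, ``flush()`` can be interleaved with ``compress()`` when
+compressed data must be emitted before all input is available (an
+illustrative sketch; ``write()`` stands in for whatever output sink is
+in use)::
+
+    cctx = zstd.ZstdCompressor()
+    chunker = cctx.chunker(chunk_size=32768)
+
+    for out_chunk in chunker.compress(b'data' * 16384):
+        write(out_chunk)  # complete 32768-byte chunks, if any
+
+    # flush() drains data buffered in the compressor; the final chunk
+    # it emits may be smaller than chunk_size.
+    for out_chunk in chunker.flush():
+        write(out_chunk)
+
+    for out_chunk in chunker.finish():
+        write(out_chunk)
+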
+ Batch Compression API ^^^^^^^^^^^^^^^^^^^^^ @@ -542,17 +611,24 @@ Stream Reader API with open(path, 'rb') as fh: dctx = zstd.ZstdDecompressor() - with dctx.stream_reader(fh) as reader: - while True: - chunk = reader.read(16384) - if not chunk: - break + reader = dctx.stream_reader(fh) + while True: + chunk = reader.read(16384) + if not chunk: + break + + # Do something with decompressed chunk. - # Do something with decompressed chunk. +The stream can also be used as a context manager:: -The stream can only be read within a context manager. When the context -manager exits, the stream is closed and the underlying resource is -released and future operations against the stream will fail. + with open(path, 'rb') as fh: + dctx = zstd.ZstdDecompressor() + with dctx.stream_reader(fh) as reader: + ... + +When used as a context manager, the stream is closed and the underlying +resources are released when the context manager exits. Future operations against +the stream will fail. The ``source`` argument to ``stream_reader()`` can be any object with a ``read(size)`` method or any object implementing the *buffer protocol*. @@ -1077,7 +1153,6 @@ follows: * write_dict_id * job_size * overlap_size_log -* compress_literals * force_max_window * enable_ldm * ldm_hash_log diff --git a/contrib/python-zstandard/c-ext/compressionchunker.c b/contrib/python-zstandard/c-ext/compressionchunker.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/c-ext/compressionchunker.c @@ -0,0 +1,360 @@ +/** +* Copyright (c) 2018-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +PyDoc_STRVAR(ZstdCompressionChunkerIterator__doc__, + "Iterator of output chunks from ZstdCompressionChunker.\n" +); + +static void ZstdCompressionChunkerIterator_dealloc(ZstdCompressionChunkerIterator* self) { + Py_XDECREF(self->chunker); + + PyObject_Del(self); +} + +static PyObject* ZstdCompressionChunkerIterator_iter(PyObject* self) { + Py_INCREF(self); + return self; +} + +static PyObject* ZstdCompressionChunkerIterator_iternext(ZstdCompressionChunkerIterator* self) { + size_t zresult; + PyObject* chunk; + ZstdCompressionChunker* chunker = self->chunker; + ZSTD_EndDirective zFlushMode; + + if (self->mode != compressionchunker_mode_normal && chunker->input.pos != chunker->input.size) { + PyErr_SetString(ZstdError, "input should have been fully consumed before calling flush() or finish()"); + return NULL; + } + + if (chunker->finished) { + return NULL; + } + + /* If we have data left in the input, consume it. */ + while (chunker->input.pos < chunker->input.size) { + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_compress_generic(chunker->compressor->cctx, &chunker->output, + &chunker->input, ZSTD_e_continue); + Py_END_ALLOW_THREADS + + /* Input is fully consumed. */ + if (chunker->input.pos == chunker->input.size) { + chunker->input.src = NULL; + chunker->input.pos = 0; + chunker->input.size = 0; + PyBuffer_Release(&chunker->inBuffer); + } + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult)); + return NULL; + } + + /* If it produced a full output chunk, emit it. 
*/ + if (chunker->output.pos == chunker->output.size) { + chunk = PyBytes_FromStringAndSize(chunker->output.dst, chunker->output.pos); + if (!chunk) { + return NULL; + } + + chunker->output.pos = 0; + + return chunk; + } + + /* Else continue to compress available input data. */ + } + + /* We also need this here for the special case of an empty input buffer. */ + if (chunker->input.pos == chunker->input.size) { + chunker->input.src = NULL; + chunker->input.pos = 0; + chunker->input.size = 0; + PyBuffer_Release(&chunker->inBuffer); + } + + /* No more input data. A partial chunk may be in chunker->output. + * If we're in normal compression mode, we're done. Otherwise if we're in + * flush or finish mode, we need to emit what data remains. + */ + if (self->mode == compressionchunker_mode_normal) { + /* We don't need to set StopIteration. */ + return NULL; + } + + if (self->mode == compressionchunker_mode_flush) { + zFlushMode = ZSTD_e_flush; + } + else if (self->mode == compressionchunker_mode_finish) { + zFlushMode = ZSTD_e_end; + } + else { + PyErr_SetString(ZstdError, "unhandled compression mode; this should never happen"); + return NULL; + } + + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_compress_generic(chunker->compressor->cctx, &chunker->output, + &chunker->input, zFlushMode); + Py_END_ALLOW_THREADS + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "zstd compress error: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + + if (!zresult && chunker->output.pos == 0) { + return NULL; + } + + chunk = PyBytes_FromStringAndSize(chunker->output.dst, chunker->output.pos); + if (!chunk) { + return NULL; + } + + chunker->output.pos = 0; + + if (!zresult && self->mode == compressionchunker_mode_finish) { + chunker->finished = 1; + } + + return chunk; +} + +PyTypeObject ZstdCompressionChunkerIteratorType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdCompressionChunkerIterator", /* tp_name */ + sizeof(ZstdCompressionChunkerIterator), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)ZstdCompressionChunkerIterator_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + ZstdCompressionChunkerIterator__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + ZstdCompressionChunkerIterator_iter, /* tp_iter */ + (iternextfunc)ZstdCompressionChunkerIterator_iternext, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +PyDoc_STRVAR(ZstdCompressionChunker__doc__, + "Compress chunks iteratively into exact chunk sizes.\n" +); + +static void ZstdCompressionChunker_dealloc(ZstdCompressionChunker* self) { + PyBuffer_Release(&self->inBuffer); + self->input.src = NULL; + + PyMem_Free(self->output.dst); + self->output.dst = NULL; + + Py_XDECREF(self->compressor); + + PyObject_Del(self); +} + +static ZstdCompressionChunkerIterator* ZstdCompressionChunker_compress(ZstdCompressionChunker* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "data", + NULL + }; + + 
ZstdCompressionChunkerIterator* result; + + if (self->finished) { + PyErr_SetString(ZstdError, "cannot call compress() after compression finished"); + return NULL; + } + + if (self->inBuffer.obj) { + PyErr_SetString(ZstdError, + "cannot perform operation before consuming output from previous operation"); + return NULL; + } + +#if PY_MAJOR_VERSION >= 3 + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*:compress", +#else + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*:compress", +#endif + kwlist, &self->inBuffer)) { + return NULL; + } + + if (!PyBuffer_IsContiguous(&self->inBuffer, 'C') || self->inBuffer.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + PyBuffer_Release(&self->inBuffer); + return NULL; + } + + result = (ZstdCompressionChunkerIterator*)PyObject_CallObject((PyObject*)&ZstdCompressionChunkerIteratorType, NULL); + if (!result) { + PyBuffer_Release(&self->inBuffer); + return NULL; + } + + self->input.src = self->inBuffer.buf; + self->input.size = self->inBuffer.len; + self->input.pos = 0; + + result->chunker = self; + Py_INCREF(result->chunker); + + result->mode = compressionchunker_mode_normal; + + return result; +} + +static ZstdCompressionChunkerIterator* ZstdCompressionChunker_finish(ZstdCompressionChunker* self) { + ZstdCompressionChunkerIterator* result; + + if (self->finished) { + PyErr_SetString(ZstdError, "cannot call finish() after compression finished"); + return NULL; + } + + if (self->inBuffer.obj) { + PyErr_SetString(ZstdError, + "cannot call finish() before consuming output from previous operation"); + return NULL; + } + + result = (ZstdCompressionChunkerIterator*)PyObject_CallObject((PyObject*)&ZstdCompressionChunkerIteratorType, NULL); + if (!result) { + return NULL; + } + + result->chunker = self; + Py_INCREF(result->chunker); + + result->mode = compressionchunker_mode_finish; + + return result; +} + +static ZstdCompressionChunkerIterator* ZstdCompressionChunker_flush(ZstdCompressionChunker* self, PyObject* args, PyObject* kwargs) { + ZstdCompressionChunkerIterator* result; + + if (self->finished) { + PyErr_SetString(ZstdError, "cannot call flush() after compression finished"); + return NULL; + } + + if (self->inBuffer.obj) { + PyErr_SetString(ZstdError, + "cannot call flush() before consuming output from previous operation"); + return NULL; + } + + result = (ZstdCompressionChunkerIterator*)PyObject_CallObject((PyObject*)&ZstdCompressionChunkerIteratorType, NULL); + if (!result) { + return NULL; + } + + result->chunker = self; + Py_INCREF(result->chunker); + + result->mode = compressionchunker_mode_flush; + + return result; +} + +static PyMethodDef ZstdCompressionChunker_methods[] = { + { "compress", (PyCFunction)ZstdCompressionChunker_compress, METH_VARARGS | METH_KEYWORDS, + PyDoc_STR("compress data") }, + { "finish", (PyCFunction)ZstdCompressionChunker_finish, METH_NOARGS, + PyDoc_STR("finish compression operation") }, + { "flush", (PyCFunction)ZstdCompressionChunker_flush, METH_VARARGS | METH_KEYWORDS, + PyDoc_STR("flush compressed data") }, + { NULL, NULL } +}; + +PyTypeObject ZstdCompressionChunkerType = { + PyVarObject_HEAD_INIT(NULL, 0) + "zstd.ZstdCompressionChunkerType", /* tp_name */ + sizeof(ZstdCompressionChunker), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)ZstdCompressionChunker_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence
*/ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + ZstdCompressionChunker__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + ZstdCompressionChunker_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ +}; + +void compressionchunker_module_init(PyObject* module) { + Py_TYPE(&ZstdCompressionChunkerIteratorType) = &PyType_Type; + if (PyType_Ready(&ZstdCompressionChunkerIteratorType) < 0) { + return; + } + + Py_TYPE(&ZstdCompressionChunkerType) = &PyType_Type; + if (PyType_Ready(&ZstdCompressionChunkerType) < 0) { + return; + } +} diff --git a/contrib/python-zstandard/c-ext/compressionparams.c b/contrib/python-zstandard/c-ext/compressionparams.c --- a/contrib/python-zstandard/c-ext/compressionparams.c +++ b/contrib/python-zstandard/c-ext/compressionparams.c @@ -39,7 +39,6 @@ int set_parameters(ZSTD_CCtx_params* par TRY_SET_PARAMETER(params, ZSTD_p_nbWorkers, obj->threads); TRY_SET_PARAMETER(params, ZSTD_p_jobSize, obj->jobSize); TRY_SET_PARAMETER(params, ZSTD_p_overlapSizeLog, obj->overlapSizeLog); - TRY_SET_PARAMETER(params, ZSTD_p_compressLiterals, obj->compressLiterals); TRY_SET_PARAMETER(params, ZSTD_p_forceMaxWindow, obj->forceMaxWindow); TRY_SET_PARAMETER(params, ZSTD_p_enableLongDistanceMatching, obj->enableLongDistanceMatching); TRY_SET_PARAMETER(params, ZSTD_p_ldmHashLog, obj->ldmHashLog); @@ -88,7 +87,6 @@ static int ZstdCompressionParameters_ini "ldm_bucket_size_log", "ldm_hash_every_log", "threads", - "compress_literals", NULL }; @@ -114,18 +112,13 @@ static int ZstdCompressionParameters_ini unsigned ldmHashEveryLog = 0; int threads = 0; - /* Setting value 0 has the effect of disabling. So we use -1 as a default - * to detect whether to set. Then we automatically derive the expected value - * based on the level, just like zstandard does itself. 
*/ - int compressLiterals = -1; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, - "|IiIIIIIIIIIIIIIIIIIIii:CompressionParameters", + "|IiIIIIIIIIIIIIIIIIIIi:CompressionParameters", kwlist, &format, &compressionLevel, &windowLog, &hashLog, &chainLog, &searchLog, &minMatch, &targetLength, &compressionStrategy, &contentSizeFlag, &checksumFlag, &dictIDFlag, &jobSize, &overlapSizeLog, &forceMaxWindow, &enableLDM, &ldmHashLog, &ldmMinMatch, &ldmBucketSizeLog, - &ldmHashEveryLog, &threads, &compressLiterals)) { + &ldmHashEveryLog, &threads)) { return -1; } @@ -133,10 +126,6 @@ static int ZstdCompressionParameters_ini threads = cpu_count(); } - if (compressLiterals < 0) { - compressLiterals = compressionLevel >= 0; - } - self->format = format; self->compressionLevel = compressionLevel; self->windowLog = windowLog; @@ -152,7 +141,6 @@ static int ZstdCompressionParameters_ini self->threads = threads; self->jobSize = jobSize; self->overlapSizeLog = overlapSizeLog; - self->compressLiterals = compressLiterals; self->forceMaxWindow = forceMaxWindow; self->enableLongDistanceMatching = enableLDM; self->ldmHashLog = ldmHashLog; @@ -299,16 +287,6 @@ ZstdCompressionParametersObject* Compres Py_DECREF(val); } - val = PyDict_GetItemString(kwargs, "compress_literals"); - if (!val) { - val = PyLong_FromLong(level >= 0 ? 1 : 0); - if (!val) { - goto cleanup; - } - PyDict_SetItemString(kwargs, "compress_literals", val); - Py_DECREF(val); - } - result = PyObject_New(ZstdCompressionParametersObject, &ZstdCompressionParametersType); if (!result) { goto cleanup; @@ -420,9 +398,6 @@ static PyMemberDef ZstdCompressionParame { "overlap_size_log", T_UINT, offsetof(ZstdCompressionParametersObject, overlapSizeLog), READONLY, "Size of previous input reloaded at the beginning of each job" }, - { "compress_literals", T_UINT, - offsetof(ZstdCompressionParametersObject, compressLiterals), READONLY, - "whether Huffman compression of literals is in use" }, { "force_max_window", T_UINT, offsetof(ZstdCompressionParametersObject, forceMaxWindow), READONLY, "force back references to remain smaller than window size" }, diff --git a/contrib/python-zstandard/c-ext/compressionreader.c b/contrib/python-zstandard/c-ext/compressionreader.c --- a/contrib/python-zstandard/c-ext/compressionreader.c +++ b/contrib/python-zstandard/c-ext/compressionreader.c @@ -43,20 +43,11 @@ static void reader_dealloc(ZstdCompressi } static ZstdCompressionReader* reader_enter(ZstdCompressionReader* self) { - size_t zresult; - if (self->entered) { PyErr_SetString(PyExc_ValueError, "cannot __enter__ multiple times"); return NULL; } - zresult = ZSTD_CCtx_setPledgedSrcSize(self->compressor->cctx, self->sourceSize); - if (ZSTD_isError(zresult)) { - PyErr_Format(ZstdError, "error setting source size: %s", - ZSTD_getErrorName(zresult)); - return NULL; - } - self->entered = 1; Py_INCREF(self); @@ -132,15 +123,6 @@ static PyObject* reader_close(ZstdCompre Py_RETURN_NONE; } -static PyObject* reader_closed(ZstdCompressionReader* self) { - if (self->closed) { - Py_RETURN_TRUE; - } - else { - Py_RETURN_FALSE; - } -} - static PyObject* reader_tell(ZstdCompressionReader* self) { /* TODO should this raise OSError since stream isn't seekable? 
*/ return PyLong_FromUnsignedLongLong(self->bytesCompressed); @@ -159,11 +141,6 @@ static PyObject* reader_read(ZstdCompres size_t zresult; size_t oldPos; - if (!self->entered) { - PyErr_SetString(ZstdError, "read() must be called from an active context manager"); - return NULL; - } - if (self->closed) { PyErr_SetString(PyExc_ValueError, "stream is closed"); return NULL; @@ -333,8 +310,6 @@ static PyMethodDef reader_methods[] = { PyDoc_STR("Exit a compression context") }, { "close", (PyCFunction)reader_close, METH_NOARGS, PyDoc_STR("Close the stream so it cannot perform any more operations") }, - { "closed", (PyCFunction)reader_closed, METH_NOARGS, - PyDoc_STR("Whether stream is closed") }, { "flush", (PyCFunction)reader_flush, METH_NOARGS, PyDoc_STR("no-ops") }, { "isatty", (PyCFunction)reader_isatty, METH_NOARGS, PyDoc_STR("Returns False") }, { "readable", (PyCFunction)reader_readable, METH_NOARGS, @@ -354,6 +329,12 @@ static PyMethodDef reader_methods[] = { { NULL, NULL } }; +static PyMemberDef reader_members[] = { + { "closed", T_BOOL, offsetof(ZstdCompressionReader, closed), + READONLY, "whether stream is closed" }, + { NULL } +}; + PyTypeObject ZstdCompressionReaderType = { PyVarObject_HEAD_INIT(NULL, 0) "zstd.ZstdCompressionReader", /* tp_name */ @@ -383,7 +364,7 @@ PyTypeObject ZstdCompressionReaderType = reader_iter, /* tp_iter */ reader_iternext, /* tp_iternext */ reader_methods, /* tp_methods */ - 0, /* tp_members */ + reader_members, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ diff --git a/contrib/python-zstandard/c-ext/compressionwriter.c b/contrib/python-zstandard/c-ext/compressionwriter.c --- a/contrib/python-zstandard/c-ext/compressionwriter.c +++ b/contrib/python-zstandard/c-ext/compressionwriter.c @@ -222,10 +222,6 @@ static PyObject* ZstdCompressionWriter_f return NULL; } - if (!output.pos) { - break; - } - /* Copy data from output buffer to writer. */ if (output.pos) { #if PY_MAJOR_VERSION >= 3 @@ -238,7 +234,12 @@ static PyObject* ZstdCompressionWriter_f totalWrite += output.pos; self->bytesCompressed += output.pos; } + output.pos = 0; + + if (!zresult) { + break; + } } PyMem_Free(output.dst); diff --git a/contrib/python-zstandard/c-ext/compressobj.c b/contrib/python-zstandard/c-ext/compressobj.c --- a/contrib/python-zstandard/c-ext/compressobj.c +++ b/contrib/python-zstandard/c-ext/compressobj.c @@ -115,6 +115,7 @@ static PyObject* ZstdCompressionObj_flus PyObject* result = NULL; Py_ssize_t resultSize = 0; ZSTD_inBuffer input; + ZSTD_EndDirective zFlushMode; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i:flush", kwlist, &flushMode)) { return NULL; @@ -130,52 +131,34 @@ static PyObject* ZstdCompressionObj_flus return NULL; } + switch (flushMode) { + case compressorobj_flush_block: + zFlushMode = ZSTD_e_flush; + break; + + case compressorobj_flush_finish: + zFlushMode = ZSTD_e_end; + self->finished = 1; + break; + + default: + PyErr_SetString(ZstdError, "unhandled flush mode"); + return NULL; + } + assert(self->output.pos == 0); input.src = NULL; input.size = 0; input.pos = 0; - if (flushMode == compressorobj_flush_block) { - /* The output buffer is of size ZSTD_CStreamOutSize(), which is - guaranteed to hold a full block. 
*/ + while (1) { Py_BEGIN_ALLOW_THREADS - zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output, - &input, ZSTD_e_flush); + zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output, + &input, zFlushMode); Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { - PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult)); - return NULL; - } - - /* Output buffer is guaranteed to hold full block. */ - assert(zresult == 0); - - if (self->output.pos) { - result = PyBytes_FromStringAndSize(self->output.dst, self->output.pos); - if (!result) { - return NULL; - } - } - - self->output.pos = 0; - - if (result) { - return result; - } - else { - return PyBytes_FromString(""); - } - } - - assert(flushMode == compressorobj_flush_finish); - self->finished = 1; - - while (1) { - zresult = ZSTD_compress_generic(self->compressor->cctx, &self->output, - &input, ZSTD_e_end); - if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "error ending compression stream: %s", ZSTD_getErrorName(zresult)); return NULL; } diff --git a/contrib/python-zstandard/c-ext/compressor.c b/contrib/python-zstandard/c-ext/compressor.c --- a/contrib/python-zstandard/c-ext/compressor.c +++ b/contrib/python-zstandard/c-ext/compressor.c @@ -11,15 +11,13 @@ extern PyObject* ZstdError; -int ensure_cctx(ZstdCompressor* compressor) { +int setup_cctx(ZstdCompressor* compressor) { size_t zresult; assert(compressor); assert(compressor->cctx); assert(compressor->params); - ZSTD_CCtx_reset(compressor->cctx); - zresult = ZSTD_CCtx_setParametersUsingCCtxParams(compressor->cctx, compressor->params); if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "could not set compression parameters: %s", @@ -237,9 +235,9 @@ static int ZstdCompressor_init(ZstdCompr Py_INCREF(dict); } - if (ensure_cctx(self)) { - return -1; - } + if (setup_cctx(self)) { + return -1; + } return 0; } @@ -346,9 +344,7 @@ static PyObject* ZstdCompressor_copy_str return NULL; } - if (ensure_cctx(self)) { - return NULL; - } + ZSTD_CCtx_reset(self->cctx); zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, sourceSize); if (ZSTD_isError(zresult)) { @@ -489,6 +485,7 @@ static ZstdCompressionReader* ZstdCompre unsigned long long sourceSize = ZSTD_CONTENTSIZE_UNKNOWN; size_t readSize = ZSTD_CStreamInSize(); ZstdCompressionReader* result = NULL; + size_t zresult; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Kk:stream_reader", kwlist, &source, &sourceSize, &readSize)) { return NULL; } @@ -520,13 +517,17 @@ static ZstdCompressionReader* ZstdCompre goto except; } - if (ensure_cctx(self)) { + ZSTD_CCtx_reset(self->cctx); + + zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, sourceSize); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error setting source size: %s", + ZSTD_getErrorName(zresult)); goto except; } result->compressor = self; Py_INCREF(self); - result->sourceSize = sourceSize; return result; @@ -576,9 +577,7 @@ static PyObject* ZstdCompressor_compress goto finally; } - if (ensure_cctx(self)) { - goto finally; - } + ZSTD_CCtx_reset(self->cctx); destSize = ZSTD_compressBound(source.len); output = PyBytes_FromStringAndSize(NULL, destSize); @@ -652,9 +651,7 @@ static ZstdCompressionObj* ZstdCompresso return NULL; } - if (ensure_cctx(self)) { - return NULL; - } + ZSTD_CCtx_reset(self->cctx); zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, inSize); if (ZSTD_isError(zresult)) { @@ -743,9 +740,7 @@ static ZstdCompressorIterator* ZstdCompr goto except; } - if (ensure_cctx(self)) { - return NULL; - } + ZSTD_CCtx_reset(self->cctx); zresult =
ZSTD_CCtx_setPledgedSrcSize(self->cctx, sourceSize); if (ZSTD_isError(zresult)) { @@ -817,9 +812,7 @@ static ZstdCompressionWriter* ZstdCompre return NULL; } - if (ensure_cctx(self)) { - return NULL; - } + ZSTD_CCtx_reset(self->cctx); result = (ZstdCompressionWriter*)PyObject_CallObject((PyObject*)&ZstdCompressionWriterType, NULL); if (!result) { @@ -839,6 +832,58 @@ static ZstdCompressionWriter* ZstdCompre return result; } +PyDoc_STRVAR(ZstdCompressor_chunker__doc__, +"Create an object for iteratively compressing to same-sized chunks.\n" +); + +static ZstdCompressionChunker* ZstdCompressor_chunker(ZstdCompressor* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "size", + "chunk_size", + NULL + }; + + unsigned long long sourceSize = ZSTD_CONTENTSIZE_UNKNOWN; + size_t chunkSize = ZSTD_CStreamOutSize(); + ZstdCompressionChunker* chunker; + size_t zresult; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|Kk:chunker", kwlist, + &sourceSize, &chunkSize)) { + return NULL; + } + + ZSTD_CCtx_reset(self->cctx); + + zresult = ZSTD_CCtx_setPledgedSrcSize(self->cctx, sourceSize); + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "error setting source size: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + + chunker = (ZstdCompressionChunker*)PyObject_CallObject((PyObject*)&ZstdCompressionChunkerType, NULL); + if (!chunker) { + return NULL; + } + + chunker->output.dst = PyMem_Malloc(chunkSize); + if (!chunker->output.dst) { + PyErr_NoMemory(); + Py_DECREF(chunker); + return NULL; + } + chunker->output.size = chunkSize; + chunker->output.pos = 0; + + chunker->compressor = self; + Py_INCREF(chunker->compressor); + + chunker->chunkSize = chunkSize; + + return chunker; +} + typedef struct { void* sourceData; size_t sourceSize; @@ -1524,6 +1569,8 @@ finally: } static PyMethodDef ZstdCompressor_methods[] = { + { "chunker", (PyCFunction)ZstdCompressor_chunker, + METH_VARARGS | METH_KEYWORDS, ZstdCompressor_chunker__doc__ }, { "compress", (PyCFunction)ZstdCompressor_compress, METH_VARARGS | METH_KEYWORDS, ZstdCompressor_compress__doc__ }, { "compressobj", (PyCFunction)ZstdCompressor_compressobj, diff --git a/contrib/python-zstandard/c-ext/constants.c b/contrib/python-zstandard/c-ext/constants.c --- a/contrib/python-zstandard/c-ext/constants.c +++ b/contrib/python-zstandard/c-ext/constants.c @@ -27,7 +27,6 @@ void constants_module_init(PyObject* mod #else version = PyString_FromString(PYTHON_ZSTANDARD_VERSION); #endif - Py_INCREF(version); PyModule_AddObject(mod, "__version__", version); ZstdError = PyErr_NewException("zstd.ZstdError", NULL, NULL); @@ -41,7 +40,6 @@ void constants_module_init(PyObject* mod PyTuple_SetItem(zstdVersion, 0, PyLong_FromLong(ZSTD_VERSION_MAJOR)); PyTuple_SetItem(zstdVersion, 1, PyLong_FromLong(ZSTD_VERSION_MINOR)); PyTuple_SetItem(zstdVersion, 2, PyLong_FromLong(ZSTD_VERSION_RELEASE)); - Py_INCREF(zstdVersion); PyModule_AddObject(mod, "ZSTD_VERSION", zstdVersion); frameHeader = PyBytes_FromStringAndSize(frame_header, sizeof(frame_header)); @@ -68,6 +66,8 @@ void constants_module_init(PyObject* mod (long)ZSTD_DStreamOutSize()); PyModule_AddIntConstant(mod, "MAGIC_NUMBER", ZSTD_MAGICNUMBER); + PyModule_AddIntConstant(mod, "BLOCKSIZELOG_MAX", ZSTD_BLOCKSIZELOG_MAX); + PyModule_AddIntConstant(mod, "BLOCKSIZE_MAX", ZSTD_BLOCKSIZE_MAX); PyModule_AddIntConstant(mod, "WINDOWLOG_MIN", ZSTD_WINDOWLOG_MIN); PyModule_AddIntConstant(mod, "WINDOWLOG_MAX", ZSTD_WINDOWLOG_MAX); PyModule_AddIntConstant(mod, "CHAINLOG_MIN", ZSTD_CHAINLOG_MIN); @@ -80,6 +80,7 @@
void constants_module_init(PyObject* mod PyModule_AddIntConstant(mod, "SEARCHLENGTH_MIN", ZSTD_SEARCHLENGTH_MIN); PyModule_AddIntConstant(mod, "SEARCHLENGTH_MAX", ZSTD_SEARCHLENGTH_MAX); PyModule_AddIntConstant(mod, "TARGETLENGTH_MIN", ZSTD_TARGETLENGTH_MIN); + PyModule_AddIntConstant(mod, "TARGETLENGTH_MAX", ZSTD_TARGETLENGTH_MAX); PyModule_AddIntConstant(mod, "LDM_MINMATCH_MIN", ZSTD_LDM_MINMATCH_MIN); PyModule_AddIntConstant(mod, "LDM_MINMATCH_MAX", ZSTD_LDM_MINMATCH_MAX); PyModule_AddIntConstant(mod, "LDM_BUCKETSIZELOG_MAX", ZSTD_LDM_BUCKETSIZELOG_MAX); diff --git a/contrib/python-zstandard/c-ext/decompressionreader.c b/contrib/python-zstandard/c-ext/decompressionreader.c --- a/contrib/python-zstandard/c-ext/decompressionreader.c +++ b/contrib/python-zstandard/c-ext/decompressionreader.c @@ -47,10 +47,6 @@ static ZstdDecompressionReader* reader_e return NULL; } - if (ensure_dctx(self->decompressor, 1)) { - return NULL; - } - self->entered = 1; Py_INCREF(self); @@ -98,15 +94,6 @@ static PyObject* reader_close(ZstdDecomp Py_RETURN_NONE; } -static PyObject* reader_closed(ZstdDecompressionReader* self) { - if (self->closed) { - Py_RETURN_TRUE; - } - else { - Py_RETURN_FALSE; - } -} - static PyObject* reader_flush(PyObject* self) { Py_RETURN_NONE; } @@ -128,11 +115,6 @@ static PyObject* reader_read(ZstdDecompr ZSTD_outBuffer output; size_t zresult; - if (!self->entered) { - PyErr_SetString(ZstdError, "read() must be called from an active context manager"); - return NULL; - } - if (self->closed) { PyErr_SetString(PyExc_ValueError, "stream is closed"); return NULL; @@ -281,11 +263,6 @@ static PyObject* reader_seek(ZstdDecompr unsigned long long readAmount = 0; size_t defaultOutSize = ZSTD_DStreamOutSize(); - if (!self->entered) { - PyErr_SetString(ZstdError, "seek() must be called from an active context manager"); - return NULL; - } - if (self->closed) { PyErr_SetString(PyExc_ValueError, "stream is closed"); return NULL; @@ -384,8 +361,6 @@ static PyMethodDef reader_methods[] = { PyDoc_STR("Exit a compression context") }, { "close", (PyCFunction)reader_close, METH_NOARGS, PyDoc_STR("Close the stream so it cannot perform any more operations") }, - { "closed", (PyCFunction)reader_closed, METH_NOARGS, - PyDoc_STR("Whether stream is closed") }, { "flush", (PyCFunction)reader_flush, METH_NOARGS, PyDoc_STR("no-ops") }, { "isatty", (PyCFunction)reader_isatty, METH_NOARGS, PyDoc_STR("Returns False") }, { "readable", (PyCFunction)reader_readable, METH_NOARGS, @@ -407,6 +382,12 @@ static PyMethodDef reader_methods[] = { { NULL, NULL } }; +static PyMemberDef reader_members[] = { + { "closed", T_BOOL, offsetof(ZstdDecompressionReader, closed), + READONLY, "whether stream is closed" }, + { NULL } +}; + PyTypeObject ZstdDecompressionReaderType = { PyVarObject_HEAD_INIT(NULL, 0) "zstd.ZstdDecompressionReader", /* tp_name */ @@ -436,7 +417,7 @@ PyTypeObject ZstdDecompressionReaderType reader_iter, /* tp_iter */ reader_iternext, /* tp_iternext */ reader_methods, /* tp_methods */ - 0, /* tp_members */ + reader_members, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ diff --git a/contrib/python-zstandard/c-ext/decompressobj.c b/contrib/python-zstandard/c-ext/decompressobj.c --- a/contrib/python-zstandard/c-ext/decompressobj.c +++ b/contrib/python-zstandard/c-ext/decompressobj.c @@ -33,6 +33,8 @@ static PyObject* DecompressionObj_decomp PyObject* result = NULL; Py_ssize_t resultSize = 0; + output.dst = NULL; + if (self->finished) { PyErr_SetString(ZstdError, "cannot use a decompressobj 
multiple times"); return NULL; @@ -53,6 +55,12 @@ static PyObject* DecompressionObj_decomp goto finally; } + /* Special case of empty input. Output will always be empty. */ + if (source.len == 0) { + result = PyBytes_FromString(""); + goto finally; + } + input.src = source.buf; input.size = source.len; input.pos = 0; @@ -65,8 +73,7 @@ static PyObject* DecompressionObj_decomp output.size = self->outSize; output.pos = 0; - /* Read input until exhausted. */ - while (input.pos < input.size) { + while (1) { Py_BEGIN_ALLOW_THREADS zresult = ZSTD_decompress_generic(self->decompressor->dctx, &output, &input); Py_END_ALLOW_THREADS @@ -98,9 +105,13 @@ static PyObject* DecompressionObj_decomp goto except; } } + } - output.pos = 0; + if (zresult == 0 || (input.pos == input.size && output.pos == 0)) { + break; } + + output.pos = 0; } if (!result) { diff --git a/contrib/python-zstandard/c-ext/decompressor.c b/contrib/python-zstandard/c-ext/decompressor.c --- a/contrib/python-zstandard/c-ext/decompressor.c +++ b/contrib/python-zstandard/c-ext/decompressor.c @@ -575,6 +575,10 @@ static ZstdDecompressionReader* Decompre return NULL; } + if (ensure_dctx(self, 1)) { + return NULL; + } + result = (ZstdDecompressionReader*)PyObject_CallObject((PyObject*)&ZstdDecompressionReaderType, NULL); if (NULL == result) { return NULL; diff --git a/contrib/python-zstandard/c-ext/python-zstandard.h b/contrib/python-zstandard/c-ext/python-zstandard.h --- a/contrib/python-zstandard/c-ext/python-zstandard.h +++ b/contrib/python-zstandard/c-ext/python-zstandard.h @@ -15,7 +15,8 @@ #include #include -#define PYTHON_ZSTANDARD_VERSION "0.9.0" +/* Remember to change the string in zstandard/__init__ as well */ +#define PYTHON_ZSTANDARD_VERSION "0.10.1" typedef enum { compressorobj_flush_finish, @@ -45,7 +46,6 @@ typedef struct { unsigned threads; unsigned jobSize; unsigned overlapSizeLog; - unsigned compressLiterals; unsigned forceMaxWindow; unsigned enableLongDistanceMatching; unsigned ldmHashLog; @@ -162,7 +162,6 @@ typedef struct { ZstdCompressor* compressor; PyObject* reader; Py_buffer buffer; - unsigned long long sourceSize; size_t readSize; int entered; @@ -181,6 +180,34 @@ extern PyTypeObject ZstdCompressionReade typedef struct { PyObject_HEAD + ZstdCompressor* compressor; + ZSTD_inBuffer input; + ZSTD_outBuffer output; + Py_buffer inBuffer; + int finished; + size_t chunkSize; +} ZstdCompressionChunker; + +extern PyTypeObject ZstdCompressionChunkerType; + +typedef enum { + compressionchunker_mode_normal, + compressionchunker_mode_flush, + compressionchunker_mode_finish, +} CompressionChunkerMode; + +typedef struct { + PyObject_HEAD + + ZstdCompressionChunker* chunker; + CompressionChunkerMode mode; +} ZstdCompressionChunkerIterator; + +extern PyTypeObject ZstdCompressionChunkerIteratorType; + +typedef struct { + PyObject_HEAD + ZSTD_DCtx* dctx; ZstdCompressionDict* dict; size_t maxWindowSize; diff --git a/contrib/python-zstandard/make_cffi.py b/contrib/python-zstandard/make_cffi.py --- a/contrib/python-zstandard/make_cffi.py +++ b/contrib/python-zstandard/make_cffi.py @@ -17,6 +17,7 @@ import tempfile HERE = os.path.abspath(os.path.dirname(__file__)) SOURCES = ['zstd/%s' % p for p in ( + 'common/debug.c', 'common/entropy_common.c', 'common/error_private.c', 'common/fse_decompress.c', @@ -25,6 +26,7 @@ SOURCES = ['zstd/%s' % p for p in ( 'common/xxhash.c', 'common/zstd_common.c', 'compress/fse_compress.c', + 'compress/hist.c', 'compress/huf_compress.c', 'compress/zstd_compress.c', 'compress/zstd_double_fast.c', @@ -36,6 +38,7 
@@ SOURCES = ['zstd/%s' % p for p in ( 'decompress/huf_decompress.c', 'decompress/zstd_decompress.c', 'dictBuilder/cover.c', + 'dictBuilder/fastcover.c', 'dictBuilder/divsufsort.c', 'dictBuilder/zdict.c', )] diff --git a/contrib/python-zstandard/setup_zstd.py b/contrib/python-zstandard/setup_zstd.py --- a/contrib/python-zstandard/setup_zstd.py +++ b/contrib/python-zstandard/setup_zstd.py @@ -6,12 +6,12 @@ import distutils.ccompiler import os -import sys from distutils.extension import Extension zstd_sources = ['zstd/%s' % p for p in ( + 'common/debug.c', 'common/entropy_common.c', 'common/error_private.c', 'common/fse_decompress.c', @@ -20,6 +20,7 @@ zstd_sources = ['zstd/%s' % p for p in ( 'common/xxhash.c', 'common/zstd_common.c', 'compress/fse_compress.c', + 'compress/hist.c', 'compress/huf_compress.c', 'compress/zstd_compress.c', 'compress/zstd_double_fast.c', @@ -32,6 +33,7 @@ zstd_sources = ['zstd/%s' % p for p in ( 'decompress/zstd_decompress.c', 'dictBuilder/cover.c', 'dictBuilder/divsufsort.c', + 'dictBuilder/fastcover.c', 'dictBuilder/zdict.c', )] @@ -75,6 +77,7 @@ ext_sources = [ 'c-ext/compressobj.c', 'c-ext/compressor.c', 'c-ext/compressoriterator.c', + 'c-ext/compressionchunker.c', 'c-ext/compressionparams.c', 'c-ext/compressionreader.c', 'c-ext/compressionwriter.c', @@ -93,25 +96,45 @@ zstd_depends = [ def get_c_extension(support_legacy=False, system_zstd=False, name='zstd', - warnings_as_errors=False): - """Obtain a distutils.extension.Extension for the C extension.""" - root = os.path.abspath(os.path.dirname(__file__)) + warnings_as_errors=False, root=None): + """Obtain a distutils.extension.Extension for the C extension. + + ``support_legacy`` controls whether to compile in legacy zstd format support. + + ``system_zstd`` controls whether to compile against the system zstd library. + For this to work, the system zstd library and headers must match what + python-zstandard is coded against exactly. + + ``name`` is the module name of the C extension to produce. + + ``warnings_as_errors`` controls whether compiler warnings are turned into + compiler errors. - sources = set([os.path.join(root, p) for p in ext_sources]) + ``root`` defines a root path that sources should be computed as relative + to. This should be the directory with the main ``setup.py`` that is + being invoked. If not defined, paths will be relative to this file.
+ """ + actual_root = os.path.abspath(os.path.dirname(__file__)) + root = root or actual_root + + sources = set([os.path.join(actual_root, p) for p in ext_sources]) if not system_zstd: - sources.update([os.path.join(root, p) for p in zstd_sources]) + sources.update([os.path.join(actual_root, p) for p in zstd_sources]) if support_legacy: - sources.update([os.path.join(root, p) for p in zstd_sources_legacy]) + sources.update([os.path.join(actual_root, p) + for p in zstd_sources_legacy]) sources = list(sources) - include_dirs = set([os.path.join(root, d) for d in ext_includes]) + include_dirs = set([os.path.join(actual_root, d) for d in ext_includes]) if not system_zstd: - include_dirs.update([os.path.join(root, d) for d in zstd_includes]) + include_dirs.update([os.path.join(actual_root, d) + for d in zstd_includes]) if support_legacy: - include_dirs.update([os.path.join(root, d) for d in zstd_includes_legacy]) + include_dirs.update([os.path.join(actual_root, d) + for d in zstd_includes_legacy]) include_dirs = list(include_dirs) - depends = [os.path.join(root, p) for p in zstd_depends] + depends = [os.path.join(actual_root, p) for p in zstd_depends] compiler = distutils.ccompiler.new_compiler() @@ -152,6 +175,11 @@ def get_c_extension(support_legacy=False libraries = ['zstd'] if system_zstd else [] + # Python 3.7 doesn't like absolute paths. So normalize to relative. + sources = [os.path.relpath(p, root) for p in sources] + include_dirs = [os.path.relpath(p, root) for p in include_dirs] + depends = [os.path.relpath(p, root) for p in depends] + # TODO compile with optimizations. return Extension(name, sources, include_dirs=include_dirs, diff --git a/contrib/python-zstandard/tests/test_compressor.py b/contrib/python-zstandard/tests/test_compressor.py --- a/contrib/python-zstandard/tests/test_compressor.py +++ b/contrib/python-zstandard/tests/test_compressor.py @@ -153,7 +153,7 @@ class TestCompressor_compress(unittest.T no_params = zstd.get_frame_parameters(no_dict_id) with_params = zstd.get_frame_parameters(with_dict_id) self.assertEqual(no_params.dict_id, 0) - self.assertEqual(with_params.dict_id, 1387616518) + self.assertEqual(with_params.dict_id, 1880053135) def test_compress_dict_multiple(self): samples = [] @@ -216,7 +216,7 @@ class TestCompressor_compress(unittest.T self.assertEqual(params.dict_id, d.dict_id()) self.assertEqual(result, - b'\x28\xb5\x2f\xfd\x23\x06\x59\xb5\x52\x03\x19\x00\x00' + b'\x28\xb5\x2f\xfd\x23\x8f\x55\x0f\x70\x03\x19\x00\x00' b'\x66\x6f\x6f') def test_multithreaded_compression_params(self): @@ -336,7 +336,9 @@ class TestCompressor_compressobj(unittes b'\x28\xb5\x2f\xfd\x00\x48\x18\x00\x00foo') self.assertEqual(cobj.compress(b'bar'), b'') # 3 byte header plus content. 
- self.assertEqual(cobj.flush(), b'\x19\x00\x00bar') + self.assertEqual(cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK), + b'\x18\x00\x00bar') + self.assertEqual(cobj.flush(), b'\x01\x00\x00') def test_flush_empty_block(self): cctx = zstd.ZstdCompressor(write_checksum=True) @@ -576,15 +578,23 @@ class TestCompressor_stream_reader(unitt def test_context_manager(self): cctx = zstd.ZstdCompressor() - reader = cctx.stream_reader(b'foo' * 60) - with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'): - reader.read(10) - with cctx.stream_reader(b'foo') as reader: with self.assertRaisesRegexp(ValueError, 'cannot __enter__ multiple times'): with reader as reader2: pass + def test_no_context_manager(self): + cctx = zstd.ZstdCompressor() + + reader = cctx.stream_reader(b'foo') + reader.read(4) + self.assertFalse(reader.closed) + + reader.close() + self.assertTrue(reader.closed) + with self.assertRaisesRegexp(ValueError, 'stream is closed'): + reader.read(1) + def test_not_implemented(self): cctx = zstd.ZstdCompressor() @@ -619,13 +629,18 @@ class TestCompressor_stream_reader(unitt self.assertFalse(reader.writable()) self.assertFalse(reader.seekable()) self.assertFalse(reader.isatty()) + self.assertFalse(reader.closed) self.assertIsNone(reader.flush()) + self.assertFalse(reader.closed) + + self.assertTrue(reader.closed) def test_read_closed(self): cctx = zstd.ZstdCompressor() with cctx.stream_reader(b'foo' * 60) as reader: reader.close() + self.assertTrue(reader.closed) with self.assertRaisesRegexp(ValueError, 'stream is closed'): reader.read(10) @@ -715,7 +730,7 @@ class TestCompressor_stream_reader(unitt while reader.read(8192): pass - with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'): + with self.assertRaisesRegexp(ValueError, 'stream is closed'): reader.read(10) def test_bad_size(self): @@ -792,7 +807,7 @@ class TestCompressor_stream_writer(unitt d = zstd.train_dictionary(8192, samples) h = hashlib.sha1(d.as_bytes()).hexdigest() - self.assertEqual(h, '3040faa0ddc37d50e71a4dd28052cb8db5d9d027') + self.assertEqual(h, '2b3b6428da5bf2c9cc9d4bb58ba0bc5990dd0e79') buffer = io.BytesIO() cctx = zstd.ZstdCompressor(level=9, dict_data=d) @@ -808,9 +823,16 @@ class TestCompressor_stream_writer(unitt self.assertEqual(params.window_size, 2097152) self.assertEqual(params.dict_id, d.dict_id()) self.assertFalse(params.has_checksum) - self.assertEqual(compressed, - b'\x28\xb5\x2f\xfd\x03\x58\x06\x59\xb5\x52\x5d\x00' - b'\x00\x00\x02\xfc\x3d\x3f\xd9\xb0\x51\x03\x45\x89') + + h = hashlib.sha1(compressed).hexdigest() + self.assertEqual(h, '23f88344263678478f5f82298e0a5d1833125786') + + source = b'foo' + b'bar' + (b'foo' * 16384) + + dctx = zstd.ZstdDecompressor(dict_data=d) + + self.assertEqual(dctx.decompress(compressed, max_output_size=len(source)), + source) def test_compression_params(self): params = zstd.ZstdCompressionParameters( @@ -1157,6 +1179,181 @@ class TestCompressor_read_to_iter(unitte b''.join(cctx.read_to_iter(source)) +@make_cffi +class TestCompressor_chunker(unittest.TestCase): + def test_empty(self): + cctx = zstd.ZstdCompressor(write_content_size=False) + chunker = cctx.chunker() + + it = chunker.compress(b'') + + with self.assertRaises(StopIteration): + next(it) + + it = chunker.finish() + + self.assertEqual(next(it), b'\x28\xb5\x2f\xfd\x00\x50\x01\x00\x00') + + with self.assertRaises(StopIteration): + next(it) + + def test_simple_input(self): + cctx = zstd.ZstdCompressor() + chunker = cctx.chunker() + + it = 
chunker.compress(b'foobar') + + with self.assertRaises(StopIteration): + next(it) + + it = chunker.compress(b'baz' * 30) + + with self.assertRaises(StopIteration): + next(it) + + it = chunker.finish() + + self.assertEqual(next(it), + b'\x28\xb5\x2f\xfd\x00\x50\x7d\x00\x00\x48\x66\x6f' + b'\x6f\x62\x61\x72\x62\x61\x7a\x01\x00\xe4\xe4\x8e') + + with self.assertRaises(StopIteration): + next(it) + + def test_input_size(self): + cctx = zstd.ZstdCompressor() + chunker = cctx.chunker(size=1024) + + it = chunker.compress(b'x' * 1000) + + with self.assertRaises(StopIteration): + next(it) + + it = chunker.compress(b'y' * 24) + + with self.assertRaises(StopIteration): + next(it) + + chunks = list(chunker.finish()) + + self.assertEqual(chunks, [ + b'\x28\xb5\x2f\xfd\x60\x00\x03\x65\x00\x00\x18\x78\x78\x79\x02\x00' + b'\xa0\x16\xe3\x2b\x80\x05' + ]) + + dctx = zstd.ZstdDecompressor() + + self.assertEqual(dctx.decompress(b''.join(chunks)), + (b'x' * 1000) + (b'y' * 24)) + + def test_small_chunk_size(self): + cctx = zstd.ZstdCompressor() + chunker = cctx.chunker(chunk_size=1) + + chunks = list(chunker.compress(b'foo' * 1024)) + self.assertEqual(chunks, []) + + chunks = list(chunker.finish()) + self.assertTrue(all(len(chunk) == 1 for chunk in chunks)) + + self.assertEqual( + b''.join(chunks), + b'\x28\xb5\x2f\xfd\x00\x50\x55\x00\x00\x18\x66\x6f\x6f\x01\x00' + b'\xfa\xd3\x77\x43') + + dctx = zstd.ZstdDecompressor() + self.assertEqual(dctx.decompress(b''.join(chunks), + max_output_size=10000), + b'foo' * 1024) + + def test_input_types(self): + cctx = zstd.ZstdCompressor() + + mutable_array = bytearray(3) + mutable_array[:] = b'foo' + + sources = [ + memoryview(b'foo'), + bytearray(b'foo'), + mutable_array, + ] + + for source in sources: + chunker = cctx.chunker() + + self.assertEqual(list(chunker.compress(source)), []) + self.assertEqual(list(chunker.finish()), [ + b'\x28\xb5\x2f\xfd\x00\x50\x19\x00\x00\x66\x6f\x6f' + ]) + + def test_flush(self): + cctx = zstd.ZstdCompressor() + chunker = cctx.chunker() + + self.assertEqual(list(chunker.compress(b'foo' * 1024)), []) + self.assertEqual(list(chunker.compress(b'bar' * 1024)), []) + + chunks1 = list(chunker.flush()) + + self.assertEqual(chunks1, [ + b'\x28\xb5\x2f\xfd\x00\x50\x8c\x00\x00\x30\x66\x6f\x6f\x62\x61\x72' + b'\x02\x00\xfa\x03\xfe\xd0\x9f\xbe\x1b\x02' + ]) + + self.assertEqual(list(chunker.flush()), []) + self.assertEqual(list(chunker.flush()), []) + + self.assertEqual(list(chunker.compress(b'baz' * 1024)), []) + + chunks2 = list(chunker.flush()) + self.assertEqual(len(chunks2), 1) + + chunks3 = list(chunker.finish()) + self.assertEqual(len(chunks3), 1) + + dctx = zstd.ZstdDecompressor() + + self.assertEqual(dctx.decompress(b''.join(chunks1 + chunks2 + chunks3), + max_output_size=10000), + (b'foo' * 1024) + (b'bar' * 1024) + (b'baz' * 1024)) + + def test_compress_after_finish(self): + cctx = zstd.ZstdCompressor() + chunker = cctx.chunker() + + list(chunker.compress(b'foo')) + list(chunker.finish()) + + with self.assertRaisesRegexp( + zstd.ZstdError, + 'cannot call compress\(\) after compression finished'): + list(chunker.compress(b'foo')) + + def test_flush_after_finish(self): + cctx = zstd.ZstdCompressor() + chunker = cctx.chunker() + + list(chunker.compress(b'foo')) + list(chunker.finish()) + + with self.assertRaisesRegexp( + zstd.ZstdError, + 'cannot call flush\(\) after compression finished'): + list(chunker.flush()) + + def test_finish_after_finish(self): + cctx = zstd.ZstdCompressor() + chunker = cctx.chunker() + + list(chunker.compress(b'foo'))
+ list(chunker.finish()) + + with self.assertRaisesRegexp( + zstd.ZstdError, + 'cannot call finish\(\) after compression finished'): + list(chunker.finish()) + + class TestCompressor_multi_compress_to_buffer(unittest.TestCase): def test_invalid_inputs(self): cctx = zstd.ZstdCompressor() diff --git a/contrib/python-zstandard/tests/test_compressor_fuzzing.py b/contrib/python-zstandard/tests/test_compressor_fuzzing.py --- a/contrib/python-zstandard/tests/test_compressor_fuzzing.py +++ b/contrib/python-zstandard/tests/test_compressor_fuzzing.py @@ -135,6 +135,51 @@ class TestCompressor_compressobj_fuzzing self.assertEqual(b''.join(chunks), ref_frame) + @hypothesis.settings( + suppress_health_check=[hypothesis.HealthCheck.large_base_example]) + @hypothesis.given(original=strategies.sampled_from(random_input_data()), + level=strategies.integers(min_value=1, max_value=5), + chunk_sizes=strategies.data(), + flushes=strategies.data()) + def test_flush_block(self, original, level, chunk_sizes, flushes): + cctx = zstd.ZstdCompressor(level=level) + cobj = cctx.compressobj() + + dctx = zstd.ZstdDecompressor() + dobj = dctx.decompressobj() + + compressed_chunks = [] + decompressed_chunks = [] + i = 0 + while True: + input_size = chunk_sizes.draw(strategies.integers(1, 4096)) + source = original[i:i + input_size] + if not source: + break + + i += input_size + + chunk = cobj.compress(source) + compressed_chunks.append(chunk) + decompressed_chunks.append(dobj.decompress(chunk)) + + if not flushes.draw(strategies.booleans()): + continue + + chunk = cobj.flush(zstd.COMPRESSOBJ_FLUSH_BLOCK) + compressed_chunks.append(chunk) + decompressed_chunks.append(dobj.decompress(chunk)) + + self.assertEqual(b''.join(decompressed_chunks), original[0:i]) + + chunk = cobj.flush(zstd.COMPRESSOBJ_FLUSH_FINISH) + compressed_chunks.append(chunk) + decompressed_chunks.append(dobj.decompress(chunk)) + + self.assertEqual(dctx.decompress(b''.join(compressed_chunks), + max_output_size=len(original)), + original) + self.assertEqual(b''.join(decompressed_chunks), original) @unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set') @make_cffi @@ -186,3 +231,90 @@ class TestCompressor_multi_compress_to_b for i, frame in enumerate(result): self.assertEqual(dctx.decompress(frame), original[i]) + + +@unittest.skipUnless('ZSTD_SLOW_TESTS' in os.environ, 'ZSTD_SLOW_TESTS not set') +@make_cffi +class TestCompressor_chunker_fuzzing(unittest.TestCase): + @hypothesis.settings( + suppress_health_check=[hypothesis.HealthCheck.large_base_example]) + @hypothesis.given(original=strategies.sampled_from(random_input_data()), + level=strategies.integers(min_value=1, max_value=5), + chunk_size=strategies.integers( + min_value=1, + max_value=32 * 1048576), + input_sizes=strategies.data()) + def test_random_input_sizes(self, original, level, chunk_size, input_sizes): + cctx = zstd.ZstdCompressor(level=level) + chunker = cctx.chunker(chunk_size=chunk_size) + + chunks = [] + i = 0 + while True: + input_size = input_sizes.draw(strategies.integers(1, 4096)) + source = original[i:i + input_size] + if not source: + break + + chunks.extend(chunker.compress(source)) + i += input_size + + chunks.extend(chunker.finish()) + + dctx = zstd.ZstdDecompressor() + + self.assertEqual(dctx.decompress(b''.join(chunks), + max_output_size=len(original)), + original) + + self.assertTrue(all(len(chunk) == chunk_size for chunk in chunks[:-1])) + + @hypothesis.settings( + suppress_health_check=[hypothesis.HealthCheck.large_base_example]) + 
@hypothesis.given(original=strategies.sampled_from(random_input_data()), + level=strategies.integers(min_value=1, max_value=5), + chunk_size=strategies.integers( + min_value=1, + max_value=32 * 1048576), + input_sizes=strategies.data(), + flushes=strategies.data()) + def test_flush_block(self, original, level, chunk_size, input_sizes, + flushes): + cctx = zstd.ZstdCompressor(level=level) + chunker = cctx.chunker(chunk_size=chunk_size) + + dctx = zstd.ZstdDecompressor() + dobj = dctx.decompressobj() + + compressed_chunks = [] + decompressed_chunks = [] + i = 0 + while True: + input_size = input_sizes.draw(strategies.integers(1, 4096)) + source = original[i:i + input_size] + if not source: + break + + i += input_size + + chunks = list(chunker.compress(source)) + compressed_chunks.extend(chunks) + decompressed_chunks.append(dobj.decompress(b''.join(chunks))) + + if not flushes.draw(strategies.booleans()): + continue + + chunks = list(chunker.flush()) + compressed_chunks.extend(chunks) + decompressed_chunks.append(dobj.decompress(b''.join(chunks))) + + self.assertEqual(b''.join(decompressed_chunks), original[0:i]) + + chunks = list(chunker.finish()) + compressed_chunks.extend(chunks) + decompressed_chunks.append(dobj.decompress(b''.join(chunks))) + + self.assertEqual(dctx.decompress(b''.join(compressed_chunks), + max_output_size=len(original)), + original) + self.assertEqual(b''.join(decompressed_chunks), original) \ No newline at end of file diff --git a/contrib/python-zstandard/tests/test_data_structures.py b/contrib/python-zstandard/tests/test_data_structures.py --- a/contrib/python-zstandard/tests/test_data_structures.py +++ b/contrib/python-zstandard/tests/test_data_structures.py @@ -24,6 +24,7 @@ class TestCompressionParameters(unittest hash_log=zstd.HASHLOG_MAX, search_log=zstd.SEARCHLOG_MAX, min_match=zstd.SEARCHLENGTH_MAX - 1, + target_length=zstd.TARGETLENGTH_MAX, compression_strategy=zstd.STRATEGY_BTULTRA) def test_from_level(self): @@ -34,7 +35,6 @@ class TestCompressionParameters(unittest p = zstd.ZstdCompressionParameters.from_level(-4) self.assertEqual(p.window_log, 19) - self.assertEqual(p.compress_literals, 0) def test_members(self): p = zstd.ZstdCompressionParameters(window_log=10, @@ -64,19 +64,11 @@ class TestCompressionParameters(unittest self.assertEqual(p.job_size, 1048576) self.assertEqual(p.overlap_size_log, 6) - p = zstd.ZstdCompressionParameters(compression_level=2) - self.assertEqual(p.compress_literals, 1) - - p = zstd.ZstdCompressionParameters(compress_literals=False) - self.assertEqual(p.compress_literals, 0) - p = zstd.ZstdCompressionParameters(compression_level=-1) self.assertEqual(p.compression_level, -1) - self.assertEqual(p.compress_literals, 0) - p = zstd.ZstdCompressionParameters(compression_level=-2, compress_literals=True) + p = zstd.ZstdCompressionParameters(compression_level=-2) self.assertEqual(p.compression_level, -2) - self.assertEqual(p.compress_literals, 1) p = zstd.ZstdCompressionParameters(force_max_window=True) self.assertEqual(p.force_max_window, 1) diff --git a/contrib/python-zstandard/tests/test_data_structures_fuzzing.py b/contrib/python-zstandard/tests/test_data_structures_fuzzing.py --- a/contrib/python-zstandard/tests/test_data_structures_fuzzing.py +++ b/contrib/python-zstandard/tests/test_data_structures_fuzzing.py @@ -27,7 +27,7 @@ s_searchlog = strategies.integers(min_va s_searchlength = strategies.integers(min_value=zstd.SEARCHLENGTH_MIN, max_value=zstd.SEARCHLENGTH_MAX) s_targetlength = 
strategies.integers(min_value=zstd.TARGETLENGTH_MIN, - max_value=2**32) + max_value=zstd.TARGETLENGTH_MAX) s_strategy = strategies.sampled_from((zstd.STRATEGY_FAST, zstd.STRATEGY_DFAST, zstd.STRATEGY_GREEDY, diff --git a/contrib/python-zstandard/tests/test_decompressor.py b/contrib/python-zstandard/tests/test_decompressor.py --- a/contrib/python-zstandard/tests/test_decompressor.py +++ b/contrib/python-zstandard/tests/test_decompressor.py @@ -293,10 +293,6 @@ class TestDecompressor_stream_reader(uni def test_context_manager(self): dctx = zstd.ZstdDecompressor() - reader = dctx.stream_reader(b'foo') - with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'): - reader.read(1) - with dctx.stream_reader(b'foo') as reader: with self.assertRaisesRegexp(ValueError, 'cannot __enter__ multiple times'): with reader as reader2: @@ -331,17 +327,23 @@ class TestDecompressor_stream_reader(uni dctx = zstd.ZstdDecompressor() with dctx.stream_reader(b'foo') as reader: + self.assertFalse(reader.closed) self.assertTrue(reader.readable()) self.assertFalse(reader.writable()) self.assertTrue(reader.seekable()) self.assertFalse(reader.isatty()) + self.assertFalse(reader.closed) self.assertIsNone(reader.flush()) + self.assertFalse(reader.closed) + + self.assertTrue(reader.closed) def test_read_closed(self): dctx = zstd.ZstdDecompressor() with dctx.stream_reader(b'foo') as reader: reader.close() + self.assertTrue(reader.closed) with self.assertRaisesRegexp(ValueError, 'stream is closed'): reader.read(1) @@ -372,10 +374,10 @@ class TestDecompressor_stream_reader(uni self.assertEqual(reader.tell(), len(source)) # Read after EOF should return empty bytes. - self.assertEqual(reader.read(), b'') + self.assertEqual(reader.read(1), b'') self.assertEqual(reader.tell(), len(result)) - self.assertTrue(reader.closed()) + self.assertTrue(reader.closed) def test_read_buffer_small_chunks(self): cctx = zstd.ZstdCompressor() @@ -408,8 +410,11 @@ class TestDecompressor_stream_reader(uni chunk = reader.read(8192) self.assertEqual(chunk, source) self.assertEqual(reader.tell(), len(source)) - self.assertEqual(reader.read(), b'') + self.assertEqual(reader.read(1), b'') self.assertEqual(reader.tell(), len(source)) + self.assertFalse(reader.closed) + + self.assertTrue(reader.closed) def test_read_stream_small_chunks(self): cctx = zstd.ZstdCompressor() @@ -440,7 +445,9 @@ class TestDecompressor_stream_reader(uni while reader.read(16): pass - with self.assertRaisesRegexp(zstd.ZstdError, 'read\(\) must be called from an active'): + self.assertTrue(reader.closed) + + with self.assertRaisesRegexp(ValueError, 'stream is closed'): reader.read(10) def test_illegal_seeks(self): @@ -474,8 +481,7 @@ class TestDecompressor_stream_reader(uni with self.assertRaisesRegexp(ValueError, 'stream is closed'): reader.seek(4, os.SEEK_SET) - with self.assertRaisesRegexp( - zstd.ZstdError, 'seek\(\) must be called from an active context'): + with self.assertRaisesRegexp(ValueError, 'stream is closed'): reader.seek(0) def test_seek(self): @@ -492,6 +498,39 @@ class TestDecompressor_stream_reader(uni reader.seek(4, os.SEEK_CUR) self.assertEqual(reader.read(2), b'ar') + def test_no_context_manager(self): + source = b'foobar' * 60 + cctx = zstd.ZstdCompressor() + frame = cctx.compress(source) + + dctx = zstd.ZstdDecompressor() + reader = dctx.stream_reader(frame) + + self.assertEqual(reader.read(6), b'foobar') + self.assertEqual(reader.read(18), b'foobar' * 3) + self.assertFalse(reader.closed) + + # Calling close prevents subsequent 
use. + reader.close() + self.assertTrue(reader.closed) + + with self.assertRaisesRegexp(ValueError, 'stream is closed'): + reader.read(6) + + def test_read_after_error(self): + source = io.BytesIO(b'') + dctx = zstd.ZstdDecompressor() + + reader = dctx.stream_reader(source) + + with reader: + with self.assertRaises(TypeError): + reader.read() + + with reader: + with self.assertRaisesRegexp(ValueError, 'stream is closed'): + reader.read(100) + @make_cffi class TestDecompressor_decompressobj(unittest.TestCase): diff --git a/contrib/python-zstandard/tests/test_module_attributes.py b/contrib/python-zstandard/tests/test_module_attributes.py --- a/contrib/python-zstandard/tests/test_module_attributes.py +++ b/contrib/python-zstandard/tests/test_module_attributes.py @@ -12,7 +12,9 @@ from . common import ( @make_cffi class TestModuleAttributes(unittest.TestCase): def test_version(self): - self.assertEqual(zstd.ZSTD_VERSION, (1, 3, 4)) + self.assertEqual(zstd.ZSTD_VERSION, (1, 3, 6)) + + self.assertEqual(zstd.__version__, '0.10.1') def test_constants(self): self.assertEqual(zstd.MAX_COMPRESSION_LEVEL, 22) @@ -27,6 +29,8 @@ class TestModuleAttributes(unittest.Test 'DECOMPRESSION_RECOMMENDED_INPUT_SIZE', 'DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE', 'MAGIC_NUMBER', + 'BLOCKSIZELOG_MAX', + 'BLOCKSIZE_MAX', 'WINDOWLOG_MIN', 'WINDOWLOG_MAX', 'CHAINLOG_MIN', @@ -39,6 +43,7 @@ class TestModuleAttributes(unittest.Test 'SEARCHLENGTH_MIN', 'SEARCHLENGTH_MAX', 'TARGETLENGTH_MIN', + 'TARGETLENGTH_MAX', 'LDM_MINMATCH_MIN', 'LDM_MINMATCH_MAX', 'LDM_BUCKETSIZELOG_MAX', diff --git a/contrib/python-zstandard/tests/test_train_dictionary.py b/contrib/python-zstandard/tests/test_train_dictionary.py --- a/contrib/python-zstandard/tests/test_train_dictionary.py +++ b/contrib/python-zstandard/tests/test_train_dictionary.py @@ -57,7 +57,8 @@ class TestTrainDictionary(unittest.TestC d = zstd.train_dictionary(8192, generate_samples(), threads=-1, steps=1, d=16) - self.assertEqual(d.k, 50) + # This varies by platform. + self.assertIn(d.k, (50, 2000)) self.assertEqual(d.d, 16) @make_cffi diff --git a/contrib/python-zstandard/zstandard/__init__.py b/contrib/python-zstandard/zstandard/__init__.py --- a/contrib/python-zstandard/zstandard/__init__.py +++ b/contrib/python-zstandard/zstandard/__init__.py @@ -60,3 +60,6 @@ elif _module_policy == 'cffi': else: raise ImportError('unknown module import policy: %s; use default, cffi_fallback, ' 'cext, or cffi' % _module_policy) + +# Keep this in sync with python-zstandard.h. +__version__ = '0.10.1' diff --git a/contrib/python-zstandard/zstd.c b/contrib/python-zstandard/zstd.c --- a/contrib/python-zstandard/zstd.c +++ b/contrib/python-zstandard/zstd.c @@ -182,6 +182,7 @@ void compressobj_module_init(PyObject* m void compressor_module_init(PyObject* mod); void compressionparams_module_init(PyObject* mod); void constants_module_init(PyObject* mod); +void compressionchunker_module_init(PyObject* mod); void compressiondict_module_init(PyObject* mod); void compressionreader_module_init(PyObject* mod); void compressionwriter_module_init(PyObject* mod); @@ -209,7 +210,7 @@ void zstd_module_init(PyObject* m) { We detect this mismatch here and refuse to load the module if this scenario is detected. 
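For example, a build compiled against the vendored 1.3.6 headers but loaded with a zstd library whose ZSTD_versionNumber() reports 10304 (zstd 1.3.4) is rejected by the check below.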
*/ - if (ZSTD_VERSION_NUMBER != 10304 || ZSTD_versionNumber() != 10304) { + if (ZSTD_VERSION_NUMBER != 10306 || ZSTD_versionNumber() != 10306) { PyErr_SetString(PyExc_ImportError, "zstd C API mismatch; Python bindings not compiled against expected zstd version"); return; } @@ -219,6 +220,7 @@ void zstd_module_init(PyObject* m) { compressiondict_module_init(m); compressobj_module_init(m); compressor_module_init(m); + compressionchunker_module_init(m); compressionreader_module_init(m); compressionwriter_module_init(m); compressoriterator_module_init(m); diff --git a/contrib/python-zstandard/zstd/common/bitstream.h b/contrib/python-zstandard/zstd/common/bitstream.h --- a/contrib/python-zstandard/zstd/common/bitstream.h +++ b/contrib/python-zstandard/zstd/common/bitstream.h @@ -1,8 +1,7 @@ /* ****************************************************************** bitstream Part of FSE library - header file (to include) - Copyright (C) 2013-2017, Yann Collet. + Copyright (C) 2013-present, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) @@ -49,21 +48,10 @@ extern "C" { * Dependencies ******************************************/ #include "mem.h" /* unaligned access routines */ +#include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ #include "error_private.h" /* error codes and messages */ -/*-************************************* -* Debug -***************************************/ -#if defined(BIT_DEBUG) && (BIT_DEBUG>=1) -# include <assert.h> -#else -# ifndef assert -# define assert(condition) ((void)0) -# endif -#endif - - /*========================================= * Target specific =========================================*/ @@ -83,8 +71,7 @@ extern "C" { * A critical property of these streams is that they encode and decode in **reverse** direction. * So the first bit sequence you add will be the last to be read, like a LIFO stack. */ -typedef struct -{ +typedef struct { size_t bitContainer; unsigned bitPos; char* startPtr; @@ -118,8 +105,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_C /*-******************************************** * bitStream decoding API (read backward) **********************************************/ -typedef struct -{ +typedef struct { size_t bitContainer; unsigned bitsConsumed; const char* ptr; @@ -236,7 +222,8 @@ MEM_STATIC void BIT_addBits(BIT_CStream_ } /*! BIT_addBitsFast() : - * works only if `value` is _clean_, meaning all high bits above nbBits are 0 */ + * works only if `value` is _clean_, + * meaning all high bits above nbBits are 0 */ MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits) { diff --git a/contrib/python-zstandard/zstd/common/compiler.h b/contrib/python-zstandard/zstd/common/compiler.h --- a/contrib/python-zstandard/zstd/common/compiler.h +++ b/contrib/python-zstandard/zstd/common/compiler.h @@ -77,9 +77,9 @@ * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
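* (When DYNAMIC_BMI2 is 1, BMI2-specialized variants of hot functions are compiled through the target attribute and selected at run time from the cpuid flags gathered in cpu.h below.)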
*/ #ifndef DYNAMIC_BMI2 - #if (defined(__clang__) && __has_attribute(__target__)) \ + #if ((defined(__clang__) && __has_attribute(__target__)) \ || (defined(__GNUC__) \ - && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) \ + && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ && (defined(__x86_64__) || defined(_M_X86)) \ && !defined(__BMI2__) # define DYNAMIC_BMI2 1 @@ -88,15 +88,37 @@ #endif #endif -/* prefetch */ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ -# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ -# define PREFETCH(ptr) _mm_prefetch((const char*)ptr, _MM_HINT_T0) -#elif defined(__GNUC__) -# define PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0) +/* prefetch + * can be disabled, by declaring NO_PREFETCH macro + * All prefetch invocations use a single default locality 2, + * generating instruction prefetcht1, + * which, according to Intel, means "load data into L2 cache". + * This is a good enough "middle ground" for the time being, + * though in theory, it would be better to specialize locality depending on data being prefetched. + * Tests could not determine any sensible difference based on locality value. */ +#if defined(NO_PREFETCH) +# define PREFETCH(ptr) (void)(ptr) /* disabled */ #else -# define PREFETCH(ptr) /* disabled */ -#endif +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) +# else +# define PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* NO_PREFETCH */ + +#define CACHELINE_SIZE 64 + +#define PREFETCH_AREA(p, s) { \ + const char* const _ptr = (const char*)(p); \ + size_t const _size = (size_t)(s); \ + size_t _pos; \ + for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ + PREFETCH(_ptr + _pos); \ + } \ +} /* disable warnings */ #ifdef _MSC_VER /* Visual Studio */ diff --git a/contrib/python-zstandard/zstd/common/cpu.h b/contrib/python-zstandard/zstd/common/cpu.h --- a/contrib/python-zstandard/zstd/common/cpu.h +++ b/contrib/python-zstandard/zstd/common/cpu.h @@ -36,7 +36,7 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) U32 f1d = 0; U32 f7b = 0; U32 f7c = 0; -#ifdef _MSC_VER +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) int reg[4]; __cpuid((int*)reg, 0); { @@ -72,8 +72,7 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) "cpuid\n\t" "popl %%ebx\n\t" : "=a"(f1a), "=c"(f1c), "=d"(f1d) - : "a"(1) - :); + : "a"(1)); } if (n >= 7) { __asm__( diff --git a/contrib/python-zstandard/zstd/common/debug.c b/contrib/python-zstandard/zstd/common/debug.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/zstd/common/debug.c @@ -0,0 +1,44 @@ +/* ****************************************************************** + debug + Part of FSE library + Copyright (C) 2013-present, Yann Collet.
+ + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +****************************************************************** */ + + +/* + * This module only hosts one global variable + * which can be used to dynamically influence the verbosity of traces, + * such as DEBUGLOG and RAWLOG + */ + +#include "debug.h" + +int g_debuglevel = DEBUGLEVEL; diff --git a/contrib/python-zstandard/zstd/common/debug.h b/contrib/python-zstandard/zstd/common/debug.h new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/zstd/common/debug.h @@ -0,0 +1,123 @@ +/* ****************************************************************** + debug + Part of FSE library + Copyright (C) 2013-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +****************************************************************** */ + + +/* + * The purpose of this header is to enable debug functions. + * They regroup assert(), DEBUGLOG() and RAWLOG() for run-time, + * and DEBUG_STATIC_ASSERT() for compile-time. + * + * By default, DEBUGLEVEL==0, which means run-time debug is disabled. + * + * Level 1 enables assert() only. + * Starting level 2, traces can be generated and pushed to stderr. + * The higher the level, the more verbose the traces. + * + * It's possible to dynamically adjust level using variable g_debuglevel, + * which is only declared if DEBUGLEVEL>=2, + * and is a global variable, not multi-thread protected (use with care) + */ + +#ifndef DEBUG_H_12987983217 +#define DEBUG_H_12987983217 + +#if defined (__cplusplus) +extern "C" { +#endif + + +/* static assert is triggered at compile time, leaving no runtime artefact, + * but can only work with compile-time constants. + * This variant can only be used inside a function. */ +#define DEBUG_STATIC_ASSERT(c) (void)sizeof(char[(c) ? 1 : -1]) + + +/* DEBUGLEVEL is expected to be defined externally, + * typically through compiler command line. + * Value must be a number. */ +#ifndef DEBUGLEVEL +# define DEBUGLEVEL 0 +#endif + +/* recommended values for DEBUGLEVEL : + * 0 : no debug, all run-time functions disabled + * 1 : no display, enables assert() only + * 2 : reserved, for currently active debug path + * 3 : events once per object lifetime (CCtx, CDict, etc.) + * 4 : events once per frame + * 5 : events once per block + * 6 : events once per sequence (verbose) + * 7+: events at every position (*very* verbose) + * + * It's generally inconvenient to output traces > 5. + * In which case, it's possible to selectively enable higher verbosity levels + * by modifying g_debuglevel. + */ + +#if (DEBUGLEVEL>=1) +# include <assert.h> +#else +# ifndef assert /* assert may be already defined, due to prior #include <assert.h> */ +# define assert(condition) ((void)0) /* disable assert (default) */ +# endif +#endif + +#if (DEBUGLEVEL>=2) +# include <stdio.h> +extern int g_debuglevel; /* here, this variable is only declared, it actually lives in debug.c, and is shared by the whole process. It's typically used to enable very verbose levels on selective conditions (such as position in src) */ + +# define RAWLOG(l, ...) { \ + if (l<=g_debuglevel) { \ + fprintf(stderr, __VA_ARGS__); \ + } } +# define DEBUGLOG(l, ...) { \ + if (l<=g_debuglevel) { \ + fprintf(stderr, __FILE__ ": " __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define RAWLOG(l, ...) {} /* disabled */ +# define DEBUGLOG(l, ...)
{} /* disabled */ +#endif + + +#if defined (__cplusplus) +} +#endif + +#endif /* DEBUG_H_12987983217 */ diff --git a/contrib/python-zstandard/zstd/common/entropy_common.c b/contrib/python-zstandard/zstd/common/entropy_common.c --- a/contrib/python-zstandard/zstd/common/entropy_common.c +++ b/contrib/python-zstandard/zstd/common/entropy_common.c @@ -72,7 +72,21 @@ size_t FSE_readNCount (short* normalized unsigned charnum = 0; int previous0 = 0; - if (hbSize < 4) return ERROR(srcSize_wrong); + if (hbSize < 4) { + /* This function only works when hbSize >= 4 */ + char buffer[4]; + memset(buffer, 0, sizeof(buffer)); + memcpy(buffer, headerBuffer, hbSize); + { size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr, + buffer, sizeof(buffer)); + if (FSE_isError(countSize)) return countSize; + if (countSize > hbSize) return ERROR(corruption_detected); + return countSize; + } } + assert(hbSize >= 4); + + /* init */ + memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0])); /* all symbols not present in NCount have a frequency of 0 */ bitStream = MEM_readLE32(ip); nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */ if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge); @@ -105,6 +119,7 @@ size_t FSE_readNCount (short* normalized if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall); while (charnum < n0) normalizedCounter[charnum++] = 0; if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) { + assert((bitCount >> 3) <= 3); /* For first condition to work */ ip += bitCount>>3; bitCount &= 7; bitStream = MEM_readLE32(ip) >> bitCount; diff --git a/contrib/python-zstandard/zstd/common/fse.h b/contrib/python-zstandard/zstd/common/fse.h --- a/contrib/python-zstandard/zstd/common/fse.h +++ b/contrib/python-zstandard/zstd/common/fse.h @@ -72,6 +72,7 @@ extern "C" { #define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE) FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */ + /*-**************************************** * FSE simple functions ******************************************/ @@ -129,7 +130,7 @@ FSE_PUBLIC_API size_t FSE_compress2 (voi ******************************************/ /*! FSE_compress() does the following: -1. count symbol occurrence from source[] into table count[] +1. count symbol occurrence from source[] into table count[] (see hist.h) 2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog) 3. save normalized counters to memory buffer using writeNCount() 4. build encoding table 'CTable' from normalized counters @@ -147,15 +148,6 @@ or to save and provide normalized distri /* *** COMPRESSION *** */ -/*! FSE_count(): - Provides the precise count of each byte within a table 'count'. - 'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1). - *maxSymbolValuePtr will be updated if detected smaller than initial value. - @return : the count of the most frequent symbol (which is not identified). - if return == srcSize, there is only one symbol. - Can also return an error code, which can be tested with FSE_isError(). */ -FSE_PUBLIC_API size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize); - /*! FSE_optimalTableLog(): dynamically downsize 'tableLog' when conditions are met. It saves CPU time, by using smaller tables, while preserving or even improving compression ratio. 
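(For small inputs, tableLog is typically capped near log2(srcSize) - 2 : see FSE_optimalTableLog_internal() further below, which FSE_optimalTableLog() uses with minus==2.)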
@@ -167,7 +159,8 @@ FSE_PUBLIC_API unsigned FSE_optimalTable 'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1). @return : tableLog, or an errorCode, which can be tested using FSE_isError() */ -FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t srcSize, unsigned maxSymbolValue); +FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, + const unsigned* count, size_t srcSize, unsigned maxSymbolValue); /*! FSE_NCountWriteBound(): Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'. @@ -178,8 +171,9 @@ FSE_PUBLIC_API size_t FSE_NCountWriteBou Compactly save 'normalizedCounter' into 'buffer'. @return : size of the compressed table, or an errorCode, which can be tested using FSE_isError(). */ -FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); - +FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + const short* normalizedCounter, + unsigned maxSymbolValue, unsigned tableLog); /*! Constructor and Destructor of FSE_CTable. Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ @@ -250,7 +244,9 @@ If there is an error, the function will @return : size read from 'rBuffer', or an errorCode, which can be tested using FSE_isError(). maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */ -FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize); +FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize); /*! Constructor and Destructor of FSE_DTable. Note that its size depends on 'tableLog' */ @@ -325,33 +321,8 @@ If there is an error, the function will /* ***************************************** -* FSE advanced API -*******************************************/ -/* FSE_count_wksp() : - * Same as FSE_count(), but using an externally provided scratch buffer. - * `workSpace` size must be table of >= `1024` unsigned - */ -size_t FSE_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr, - const void* source, size_t sourceSize, unsigned* workSpace); - -/** FSE_countFast() : - * same as FSE_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr - */ -size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize); - -/* FSE_countFast_wksp() : - * Same as FSE_countFast(), but using an externally provided scratch buffer. - * `workSpace` must be a table of minimum `1024` unsigned - */ -size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* workSpace); - -/*! FSE_count_simple() : - * Same as FSE_countFast(), but does not use any additional memory (not even on stack). - * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` (presuming it's also the size of `count`). 
-*/ -size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize); - - + * FSE advanced API + ***************************************** */ unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); /**< same as FSE_optimalTableLog(), which used `minus==2` */ @@ -576,6 +547,39 @@ MEM_STATIC void FSE_flushCState(BIT_CStr } +/* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. + * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ +MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +{ + const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr; + return (symbolTT[symbolValue].deltaNbBits + ((1<<16)-1)) >> 16; +} + +/* FSE_bitCost() : + * Approximate symbol cost, as fractional value, using fixed-point format (accuracyLog fractional bits) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ +MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog) +{ + const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr; + U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16; + U32 const threshold = (minNbBits+1) << 16; + assert(tableLog < 16); + assert(accuracyLog < 31-tableLog); /* ensure enough room for renormalization double shift */ + { U32 const tableSize = 1 << tableLog; + U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize); + U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog; /* linear interpolation (very approximate) */ + U32 const bitMultiplier = 1 << accuracyLog; + assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold); + assert(normalizedDeltaFromThreshold <= bitMultiplier); + return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold; + } +} + + /* ====== Decompression ====== */ typedef struct { diff --git a/contrib/python-zstandard/zstd/common/fse_decompress.c b/contrib/python-zstandard/zstd/common/fse_decompress.c --- a/contrib/python-zstandard/zstd/common/fse_decompress.c +++ b/contrib/python-zstandard/zstd/common/fse_decompress.c @@ -49,7 +49,7 @@ * Error Management ****************************************************************/ #define FSE_isError ERR_isError -#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ +#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */ /* check and forward error code */ #define CHECK_F(f) { size_t const e = f; if (FSE_isError(e)) return e; } diff --git a/contrib/python-zstandard/zstd/common/huf.h b/contrib/python-zstandard/zstd/common/huf.h --- a/contrib/python-zstandard/zstd/common/huf.h +++ b/contrib/python-zstandard/zstd/common/huf.h @@ -1,7 +1,7 @@ /* ****************************************************************** - Huffman coder, part of New Generation Entropy library - header file - Copyright (C) 2013-2016, Yann Collet. + huff0 huffman codec, + part of Finite State Entropy library + Copyright (C) 2013-present, Yann Collet. 
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) @@ -163,25 +163,25 @@ HUF_PUBLIC_API size_t HUF_compress4X_wks /* static allocation of HUF's DTable */ typedef U32 HUF_DTable; #define HUF_DTABLE_SIZE(maxTableLog) (1 + (1<<(maxTableLog))) -#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \ +#define HUF_CREATE_STATIC_DTABLEX1(DTable, maxTableLog) \ HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) } -#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \ +#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \ HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) } /* **************************************** * Advanced decompression functions ******************************************/ -size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ -size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ +size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< decodes RLE and uncompressed */ size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */ size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */ -size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ -size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */ -size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ -size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ +size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */ +size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ /* **************************************** @@ -208,7 +208,7 @@ size_t HUF_compress4X_usingCTable(void* typedef enum { HUF_repeat_none, /**< Cannot use the previous table */ HUF_repeat_check, /**< Can use the previous table but it must be checked. 
Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ - HUF_repeat_valid /**< Can use the previous table and it is asumed to be valid */ + HUF_repeat_valid /**< Can use the previous table and it is assumed to be valid */ } HUF_repeat; /** HUF_compress4X_repeat() : * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. @@ -227,7 +227,9 @@ size_t HUF_compress4X_repeat(void* dst, */ #define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) -size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize); +size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const U32* count, U32 maxSymbolValue, U32 maxNbBits, + void* workSpace, size_t wkspSize); /*! HUF_readStats() : * Read compact Huffman tree, saved by HUF_writeCTable(). @@ -242,10 +244,15 @@ size_t HUF_readStats(BYTE* huffWeight, s * Loading a CTable saved with HUF_writeCTable() */ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize); +/** HUF_getNbBits() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX + * Note 1 : is not inlined, as HUF_CElt definition is private + * Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */ +U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue); /* * HUF_decompress() does the following: - * 1. select the decompression algorithm (X2, X4) based on pre-computed heuristics + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics * 2. build Huffman table from save, using HUF_readDTableX?() * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable() */ @@ -253,13 +260,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, /** HUF_selectDecoder() : * Tells which decoder is likely to decode faster, * based on a set of pre-computed metrics. - * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . + * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 . * Assumption : 0 < dstSize <= 128 KB */ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); /** * The minimum workspace size for the `workSpace` used in - * HUF_readDTableX2_wksp() and HUF_readDTableX4_wksp(). + * HUF_readDTableX1_wksp() and HUF_readDTableX2_wksp(). * * The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when * HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15. 
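* (Hence HUF_DECOMPRESS_WORKSPACE_SIZE below is fixed at (2 << 10) = 2048 bytes, covering the worst case with some headroom.)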
@@ -270,14 +277,14 @@ U32 HUF_selectDecoder (size_t dstSize, s #define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10) #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) +size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); -size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize); -size_t HUF_readDTableX4_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); -size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /* ====================== */ @@ -298,25 +305,25 @@ size_t HUF_compress1X_repeat(void* dst, void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); -size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ -size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); -size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ -size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */ -size_t HUF_decompress1X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ -size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ +size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */ +size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, 
size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /**< automatic selection of single or double symbol decoder, based on DTable */ +size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); -size_t HUF_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /* BMI2 variants. * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. */ size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); -size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); +size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); diff --git a/contrib/python-zstandard/zstd/common/mem.h b/contrib/python-zstandard/zstd/common/mem.h --- a/contrib/python-zstandard/zstd/common/mem.h +++ b/contrib/python-zstandard/zstd/common/mem.h @@ -57,11 +57,23 @@ MEM_STATIC void MEM_check(void) { MEM_ST typedef uint64_t U64; typedef int64_t S64; #else +# include <limits.h> +#if CHAR_BIT != 8 +# error "this implementation requires char to be exactly 8-bit type" +#endif typedef unsigned char BYTE; +#if USHRT_MAX != 65535 +# error "this implementation requires short to be exactly 16-bit type" +#endif typedef unsigned short U16; typedef signed short S16; +#if UINT_MAX != 4294967295 +# error "this implementation requires int to be exactly 32-bit type" +#endif typedef unsigned int U32; typedef signed int S32; +/* note : there are no limits defined for long long type in C90. + * limits exist in C99, however, in such case, <stdint.h> is preferred */ typedef unsigned long long U64; typedef signed long long S64; #endif diff --git a/contrib/python-zstandard/zstd/common/pool.c b/contrib/python-zstandard/zstd/common/pool.c --- a/contrib/python-zstandard/zstd/common/pool.c +++ b/contrib/python-zstandard/zstd/common/pool.c @@ -10,9 +10,10 @@ /* ====== Dependencies ======= */ -#include <stddef.h> /* size_t */ +#include <stddef.h> /* size_t */ +#include "debug.h" /* assert */ +#include "zstd_internal.h" /* ZSTD_malloc, ZSTD_free */ #include "pool.h" -#include "zstd_internal.h" /* ZSTD_malloc, ZSTD_free */ /* ====== Compiler specifics ====== */ #if defined(_MSC_VER) @@ -33,8 +34,9 @@ typedef struct POOL_job_s { struct POOL_ctx_s { ZSTD_customMem customMem; /* Keep track of the threads */ - ZSTD_pthread_t *threads; - size_t numThreads; + ZSTD_pthread_t* threads; + size_t threadCapacity; + size_t threadLimit; /* The queue is a circular buffer */ POOL_job *queue; @@ -58,10 +60,10 @@ struct POOL_ctx_s { }; /* POOL_thread() : - Work thread for the thread pool.
+ * Waits for jobs and executes them. + * @returns : NULL on failure else non-null. + */ static void* POOL_thread(void* opaque) { POOL_ctx* const ctx = (POOL_ctx*)opaque; if (!ctx) { return NULL; } @@ -69,14 +71,17 @@ static void* POOL_thread(void* opaque) { /* Lock the mutex and wait for a non-empty queue or until shutdown */ ZSTD_pthread_mutex_lock(&ctx->queueMutex); - while (ctx->queueEmpty && !ctx->shutdown) { + while ( ctx->queueEmpty + || (ctx->numThreadsBusy >= ctx->threadLimit) ) { + if (ctx->shutdown) { + /* even if !queueEmpty, (possible if numThreadsBusy >= threadLimit), + * a few threads will be shutdown while !queueEmpty, + * but enough threads will remain active to finish the queue */ + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return opaque; + } ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex); } - /* empty => shutting down: so stop */ - if (ctx->queueEmpty) { - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); - return opaque; - } /* Pop a job off the queue */ { POOL_job const job = ctx->queue[ctx->queueHead]; ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize; @@ -89,30 +94,32 @@ static void* POOL_thread(void* opaque) { job.function(job.opaque); /* If the intended queue size was 0, signal after finishing job */ + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + ctx->numThreadsBusy--; if (ctx->queueSize == 1) { - ZSTD_pthread_mutex_lock(&ctx->queueMutex); - ctx->numThreadsBusy--; - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); ZSTD_pthread_cond_signal(&ctx->queuePushCond); - } } + } + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + } } /* for (;;) */ - /* Unreachable */ + assert(0); /* Unreachable */ } POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) { return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem); } -POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) { +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, + ZSTD_customMem customMem) { POOL_ctx* ctx; - /* Check the parameters */ + /* Check parameters */ if (!numThreads) { return NULL; } /* Allocate the context and zero initialize */ ctx = (POOL_ctx*)ZSTD_calloc(sizeof(POOL_ctx), customMem); if (!ctx) { return NULL; } /* Initialize the job queue. - * It needs one extra space since one space is wasted to differentiate empty - * and full queues. + * It needs one extra space since one space is wasted to differentiate + * empty and full queues. 
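+ * (With the spare slot, queueHead == (queueTail + 1) % queueSize unambiguously means "full"; see isQueueFull() below.)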
*/ ctx->queueSize = queueSize + 1; ctx->queue = (POOL_job*)ZSTD_malloc(ctx->queueSize * sizeof(POOL_job), customMem); @@ -126,7 +133,7 @@ POOL_ctx* POOL_create_advanced(size_t nu ctx->shutdown = 0; /* Allocate space for the thread handles */ ctx->threads = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), customMem); - ctx->numThreads = 0; + ctx->threadCapacity = 0; ctx->customMem = customMem; /* Check for errors */ if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; } @@ -134,11 +141,12 @@ POOL_ctx* POOL_create_advanced(size_t nu { size_t i; for (i = 0; i < numThreads; ++i) { if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) { - ctx->numThreads = i; + ctx->threadCapacity = i; POOL_free(ctx); return NULL; } } - ctx->numThreads = numThreads; + ctx->threadCapacity = numThreads; + ctx->threadLimit = numThreads; } return ctx; } @@ -156,8 +164,8 @@ static void POOL_join(POOL_ctx* ctx) { ZSTD_pthread_cond_broadcast(&ctx->queuePopCond); /* Join all of the threads */ { size_t i; - for (i = 0; i < ctx->numThreads; ++i) { - ZSTD_pthread_join(ctx->threads[i], NULL); + for (i = 0; i < ctx->threadCapacity; ++i) { + ZSTD_pthread_join(ctx->threads[i], NULL); /* note : could fail */ } } } @@ -172,24 +180,68 @@ void POOL_free(POOL_ctx *ctx) { ZSTD_free(ctx, ctx->customMem); } + + size_t POOL_sizeof(POOL_ctx *ctx) { if (ctx==NULL) return 0; /* supports sizeof NULL */ return sizeof(*ctx) + ctx->queueSize * sizeof(POOL_job) - + ctx->numThreads * sizeof(ZSTD_pthread_t); + + ctx->threadCapacity * sizeof(ZSTD_pthread_t); +} + + +/* @return : 0 on success, 1 on error */ +static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads) +{ + if (numThreads <= ctx->threadCapacity) { + if (!numThreads) return 1; + ctx->threadLimit = numThreads; + return 0; + } + /* numThreads > threadCapacity */ + { ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem); + if (!threadPool) return 1; + /* replace existing thread pool */ + memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool)); + ZSTD_free(ctx->threads, ctx->customMem); + ctx->threads = threadPool; + /* Initialize additional threads */ + { size_t threadId; + for (threadId = ctx->threadCapacity; threadId < numThreads; ++threadId) { + if (ZSTD_pthread_create(&threadPool[threadId], NULL, &POOL_thread, ctx)) { + ctx->threadCapacity = threadId; + return 1; + } } + } } + /* successfully expanded */ + ctx->threadCapacity = numThreads; + ctx->threadLimit = numThreads; + return 0; +} + +/* @return : 0 on success, 1 on error */ +int POOL_resize(POOL_ctx* ctx, size_t numThreads) +{ + int result; + if (ctx==NULL) return 1; + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + result = POOL_resize_internal(ctx, numThreads); + ZSTD_pthread_cond_broadcast(&ctx->queuePopCond); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return result; } /** * Returns 1 if the queue is full and 0 otherwise. * - * If the queueSize is 1 (the pool was created with an intended queueSize of 0), - * then a queue is empty if there is a thread free and no job is waiting. + * When queueSize is 1 (pool was created with an intended queueSize of 0), + * then a queue is empty if there is a thread free _and_ no job is waiting. 
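+ * In that configuration the queue counts as full whenever all threads up to threadLimit are busy or a job is already pending.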
*/ static int isQueueFull(POOL_ctx const* ctx) { if (ctx->queueSize > 1) { return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize); } else { - return ctx->numThreadsBusy == ctx->numThreads || + return (ctx->numThreadsBusy == ctx->threadLimit) || !ctx->queueEmpty; } } @@ -263,6 +315,11 @@ void POOL_free(POOL_ctx* ctx) { (void)ctx; } +int POOL_resize(POOL_ctx* ctx, size_t numThreads) { + (void)ctx; (void)numThreads; + return 0; +} + void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) { (void)ctx; function(opaque); diff --git a/contrib/python-zstandard/zstd/common/pool.h b/contrib/python-zstandard/zstd/common/pool.h --- a/contrib/python-zstandard/zstd/common/pool.h +++ b/contrib/python-zstandard/zstd/common/pool.h @@ -30,40 +30,50 @@ typedef struct POOL_ctx_s POOL_ctx; */ POOL_ctx* POOL_create(size_t numThreads, size_t queueSize); -POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem); +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, + ZSTD_customMem customMem); /*! POOL_free() : - Free a thread pool returned by POOL_create(). -*/ + * Free a thread pool returned by POOL_create(). + */ void POOL_free(POOL_ctx* ctx); +/*! POOL_resize() : + * Expands or shrinks pool's number of threads. + * This is more efficient than releasing + creating a new context, + * since it tries to preserve and re-use existing threads. + * `numThreads` must be at least 1. + * @return : 0 when resize was successful, + * !0 (typically 1) if there is an error. + * note : only numThreads can be resized, queueSize remains unchanged. + */ +int POOL_resize(POOL_ctx* ctx, size_t numThreads); + /*! POOL_sizeof() : - return memory usage of pool returned by POOL_create(). -*/ + * @return threadpool memory usage + * note : compatible with NULL (returns 0 in this case) + */ size_t POOL_sizeof(POOL_ctx* ctx); /*! POOL_function : - The function type that can be added to a thread pool. -*/ + * The function type that can be added to a thread pool. + */ typedef void (*POOL_function)(void*); -/*! POOL_add_function : - The function type for a generic thread pool add function. -*/ -typedef void (*POOL_add_function)(void*, POOL_function, void*); /*! POOL_add() : - Add the job `function(opaque)` to the thread pool. `ctx` must be valid. - Possibly blocks until there is room in the queue. - Note : The function may be executed asynchronously, so `opaque` must live until the function has been completed. -*/ + * Add the job `function(opaque)` to the thread pool. `ctx` must be valid. + * Possibly blocks until there is room in the queue. + * Note : The function may be executed asynchronously, + * therefore, `opaque` must live until function has been completed. + */ void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque); /*! POOL_tryAdd() : - Add the job `function(opaque)` to the thread pool if a worker is available. - return immediately otherwise. - @return : 1 if successful, 0 if not. -*/ + * Add the job `function(opaque)` to thread pool _if_ a worker is available. + * Returns immediately even if not (does not block). + * @return : 1 if successful, 0 if not. 
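+ * (Contrast with POOL_add() above, which may block until there is room in the queue.)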
+ */
int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque);
diff --git a/contrib/python-zstandard/zstd/common/xxhash.c b/contrib/python-zstandard/zstd/common/xxhash.c
--- a/contrib/python-zstandard/zstd/common/xxhash.c
+++ b/contrib/python-zstandard/zstd/common/xxhash.c
@@ -98,6 +98,7 @@
 /* Modify the local functions below should you wish to use some other memory routines */
 /* for malloc(), free() */
 #include <stdlib.h>
+#include <stddef.h>   /* size_t */
 static void* XXH_malloc(size_t s) { return malloc(s); }
 static void  XXH_free (void* p) { free(p); }
 /* for memcpy() */
diff --git a/contrib/python-zstandard/zstd/common/zstd_common.c b/contrib/python-zstandard/zstd/common/zstd_common.c
--- a/contrib/python-zstandard/zstd/common/zstd_common.c
+++ b/contrib/python-zstandard/zstd/common/zstd_common.c
@@ -46,11 +46,6 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t
 * provides error code string from enum */
 const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
-/*! g_debuglog_enable :
- * turn on/off debug traces (global switch) */
-#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG >= 2)
-int g_debuglog_enable = 1;
-#endif
 /*=**************************************************************
diff --git a/contrib/python-zstandard/zstd/common/zstd_internal.h b/contrib/python-zstandard/zstd/common/zstd_internal.h
--- a/contrib/python-zstandard/zstd/common/zstd_internal.h
+++ b/contrib/python-zstandard/zstd/common/zstd_internal.h
@@ -21,6 +21,7 @@
 ***************************************/
 #include "compiler.h"
 #include "mem.h"
+#include "debug.h"  /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
 #include "error_private.h"
 #define ZSTD_STATIC_LINKING_ONLY
 #include "zstd.h"
@@ -38,43 +39,8 @@ extern "C" {
 #endif
-
-/*-*************************************
-* Debug
-***************************************/
-#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
-# include <assert.h>
-#else
-# ifndef assert
-# define assert(condition) ((void)0)
-# endif
-#endif
-
-#define ZSTD_STATIC_ASSERT(c) { enum { ZSTD_static_assert = 1/(int)(!!(c)) }; }
-
-#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2)
-# include <stdio.h>
-extern int g_debuglog_enable;
-/* recommended values for ZSTD_DEBUG display levels :
- * 1 : no display, enables assert() only
- * 2 : reserved for currently active debug path
- * 3 : events once per object lifetime (CCtx, CDict, etc.)
- * 4 : events once per frame
- * 5 : events once per block
- * 6 : events once per sequence (*very* verbose) */
-# define RAWLOG(l, ...) { \
-    if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) { \
-        fprintf(stderr, __VA_ARGS__); \
-    } }
-# define DEBUGLOG(l, ...) { \
-    if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) { \
-        fprintf(stderr, __FILE__ ": " __VA_ARGS__); \
-        fprintf(stderr, " \n"); \
-    } }
-#else
-# define RAWLOG(l, ...) {} /* disabled */
-# define DEBUGLOG(l, ...)
{} /* disabled */ -#endif +/* ---- static assert (debug) --- */ +#define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /*-************************************* @@ -113,8 +79,7 @@ static const U32 repStartValue[ZSTD_REP_ static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 }; static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 }; -#define ZSTD_FRAMEIDSIZE 4 -static const size_t ZSTD_frameIdSize = ZSTD_FRAMEIDSIZE; /* magic number size */ +#define ZSTD_FRAMEIDSIZE 4 /* magic number size */ #define ZSTD_BLOCKHEADERSIZE 3 /* C standard doesn't allow `static const` variable to be init using another `static const` variable */ static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE; @@ -227,6 +192,8 @@ typedef struct { BYTE* llCode; BYTE* mlCode; BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */ U32 longLengthPos; } seqStore_t; diff --git a/contrib/python-zstandard/zstd/compress/fse_compress.c b/contrib/python-zstandard/zstd/compress/fse_compress.c --- a/contrib/python-zstandard/zstd/compress/fse_compress.c +++ b/contrib/python-zstandard/zstd/compress/fse_compress.c @@ -1,6 +1,6 @@ /* ****************************************************************** FSE : Finite State Entropy encoder - Copyright (C) 2013-2015, Yann Collet. + Copyright (C) 2013-present, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) @@ -37,9 +37,11 @@ ****************************************************************/ #include /* malloc, free, qsort */ #include /* memcpy, memset */ -#include /* printf (debug) */ +#include "compiler.h" +#include "mem.h" /* U32, U16, etc. */ +#include "debug.h" /* assert, DEBUGLOG */ +#include "hist.h" /* HIST_count_wksp */ #include "bitstream.h" -#include "compiler.h" #define FSE_STATIC_LINKING_ONLY #include "fse.h" #include "error_private.h" @@ -49,7 +51,6 @@ * Error Management ****************************************************************/ #define FSE_isError ERR_isError -#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ /* ************************************************************** @@ -82,7 +83,9 @@ * wkspSize should be sized to handle worst case situation, which is `1< wkspSize) return ERROR(tableLog_tooLarge); tableU16[-2] = (U16) tableLog; tableU16[-1] = (U16) maxSymbolValue; + assert(tableLog < 16); /* required for threshold strategy to work */ /* For explanations on how to distribute symbol values over the table : - * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ + #endif /* symbol start positions */ { U32 u; @@ -122,13 +130,15 @@ size_t FSE_buildCTable_wksp(FSE_CTable* U32 symbol; for (symbol=0; symbol<=maxSymbolValue; symbol++) { int nbOccurences; - for (nbOccurences=0; nbOccurences highThreshold) position = (position + step) & tableMask; /* Low proba area */ + while (position > highThreshold) + position = (position + step) & tableMask; /* Low proba area */ } } - if (position!=0) return ERROR(GENERIC); /* Must have gone through all positions */ + assert(position==0); /* Must have initialized all positions */ } /* Build table */ @@ -143,7 +153,10 @@ size_t FSE_buildCTable_wksp(FSE_CTable* for (s=0; 
s<=maxSymbolValue; s++) { switch (normalizedCounter[s]) { - case 0: break; + case 0: + /* filling nonetheless, for compatibility with FSE_getMaxNbBits() */ + symbolTT[s].deltaNbBits = ((tableLog+1) << 16) - (1<1) { /* stops at 1 */ - if (previous0) { - unsigned start = charnum; - while (!normalizedCounter[charnum]) charnum++; - while (charnum >= start+24) { + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { + unsigned start = symbol; + while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++; + if (symbol == alphabetSize) break; /* incorrect distribution */ + while (symbol >= start+24) { start+=24; bitStream += 0xFFFFU << bitCount; - if ((!writeIsSafe) && (out > oend-2)) return ERROR(dstSize_tooSmall); /* Buffer overflow */ + if ((!writeIsSafe) && (out > oend-2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ out[0] = (BYTE) bitStream; out[1] = (BYTE)(bitStream>>8); out+=2; bitStream>>=16; } - while (charnum >= start+3) { + while (symbol >= start+3) { start+=3; bitStream += 3 << bitCount; bitCount += 2; } - bitStream += (charnum-start) << bitCount; + bitStream += (symbol-start) << bitCount; bitCount += 2; if (bitCount>16) { - if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall); /* Buffer overflow */ + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ out[0] = (BYTE)bitStream; out[1] = (BYTE)(bitStream>>8); out += 2; bitStream >>= 16; bitCount -= 16; } } - { int count = normalizedCounter[charnum++]; - int const max = (2*threshold-1)-remaining; + { int count = normalizedCounter[symbol++]; + int const max = (2*threshold-1) - remaining; remaining -= count < 0 ? -count : count; count++; /* +1 for extra accuracy */ - if (count>=threshold) count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) 
[threshold+max 2*threshold[ */ bitStream += count << bitCount; bitCount += nbBits; bitCount -= (count>=1; } } if (bitCount>16) { - if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall); /* Buffer overflow */ + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ out[0] = (BYTE)bitStream; out[1] = (BYTE)(bitStream>>8); out += 2; @@ -259,19 +290,23 @@ static size_t FSE_writeNCount_generic (v bitCount -= 16; } } + if (remaining != 1) + return ERROR(GENERIC); /* incorrect normalized distribution */ + assert(symbol <= alphabetSize); + /* flush remaining bitStream */ - if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall); /* Buffer overflow */ + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ out[0] = (BYTE)bitStream; out[1] = (BYTE)(bitStream>>8); out+= (bitCount+7) /8; - if (charnum > maxSymbolValue + 1) return ERROR(GENERIC); - return (out-ostart); } -size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +size_t FSE_writeNCount (void* buffer, size_t bufferSize, + const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) { if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported */ if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC); /* Unsupported */ @@ -279,179 +314,13 @@ size_t FSE_writeNCount (void* buffer, si if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog)) return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0); - return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1); -} - - - -/*-************************************************************** -* Counting histogram -****************************************************************/ -/*! FSE_count_simple - This function counts byte values within `src`, and store the histogram into table `count`. - It doesn't use any additional memory. - But this function is unsafe : it doesn't check that all values within `src` can fit into `count`. - For this reason, prefer using a table `count` with 256 elements. - @return : count of most numerous element. -*/ -size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, - const void* src, size_t srcSize) -{ - const BYTE* ip = (const BYTE*)src; - const BYTE* const end = ip + srcSize; - unsigned maxSymbolValue = *maxSymbolValuePtr; - unsigned max=0; - - memset(count, 0, (maxSymbolValue+1)*sizeof(*count)); - if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; } - - while (ip max) max = count[s]; } - - return (size_t)max; + return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */); } -/* FSE_count_parallel_wksp() : - * Same as FSE_count_parallel(), but using an externally provided scratch buffer. - * `workSpace` size must be a minimum of `1024 * sizeof(unsigned)`. - * @return : largest histogram frequency, or an error code (notably when histogram would be larger than *maxSymbolValuePtr). 
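/* Illustration (not part of the vendored diff): why the new `case 0` branch
 * in FSE_buildCTable_wksp() above fills symbolTT even for absent symbols.
 * Upstream zstd sets deltaNbBits = ((tableLog+1) << 16) - (1 << tableLog)
 * there, so that FSE_getMaxNbBits(), which recovers the bit budget as
 * (deltaNbBits + 0xFFFF) >> 16, answers tableLog+1 instead of reading
 * uninitialized memory. A worked check of that arithmetic: */
static unsigned example_maxNbBitsForAbsentSymbol(unsigned tableLog)
{
    unsigned const deltaNbBits = ((tableLog+1) << 16) - (1u << tableLog);
    /* tableLog==10 : ((11<<16) - 1024 + 0xFFFF) >> 16 == 11 */
    return (deltaNbBits + 0xFFFF) >> 16;   /* == tableLog+1 while tableLog < 16 */
}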
*/ -static size_t FSE_count_parallel_wksp( - unsigned* count, unsigned* maxSymbolValuePtr, - const void* source, size_t sourceSize, - unsigned checkMax, unsigned* const workSpace) -{ - const BYTE* ip = (const BYTE*)source; - const BYTE* const iend = ip+sourceSize; - unsigned maxSymbolValue = *maxSymbolValuePtr; - unsigned max=0; - U32* const Counting1 = workSpace; - U32* const Counting2 = Counting1 + 256; - U32* const Counting3 = Counting2 + 256; - U32* const Counting4 = Counting3 + 256; - - memset(workSpace, 0, 4*256*sizeof(unsigned)); - - /* safety checks */ - if (!sourceSize) { - memset(count, 0, maxSymbolValue + 1); - *maxSymbolValuePtr = 0; - return 0; - } - if (!maxSymbolValue) maxSymbolValue = 255; /* 0 == default */ - - /* by stripes of 16 bytes */ - { U32 cached = MEM_read32(ip); ip += 4; - while (ip < iend-15) { - U32 c = cached; cached = MEM_read32(ip); ip += 4; - Counting1[(BYTE) c ]++; - Counting2[(BYTE)(c>>8) ]++; - Counting3[(BYTE)(c>>16)]++; - Counting4[ c>>24 ]++; - c = cached; cached = MEM_read32(ip); ip += 4; - Counting1[(BYTE) c ]++; - Counting2[(BYTE)(c>>8) ]++; - Counting3[(BYTE)(c>>16)]++; - Counting4[ c>>24 ]++; - c = cached; cached = MEM_read32(ip); ip += 4; - Counting1[(BYTE) c ]++; - Counting2[(BYTE)(c>>8) ]++; - Counting3[(BYTE)(c>>16)]++; - Counting4[ c>>24 ]++; - c = cached; cached = MEM_read32(ip); ip += 4; - Counting1[(BYTE) c ]++; - Counting2[(BYTE)(c>>8) ]++; - Counting3[(BYTE)(c>>16)]++; - Counting4[ c>>24 ]++; - } - ip-=4; - } - - /* finish last symbols */ - while (ipmaxSymbolValue; s--) { - Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s]; - if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall); - } } - - { U32 s; - if (maxSymbolValue > 255) maxSymbolValue = 255; - for (s=0; s<=maxSymbolValue; s++) { - count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s]; - if (count[s] > max) max = count[s]; - } } - - while (!count[maxSymbolValue]) maxSymbolValue--; - *maxSymbolValuePtr = maxSymbolValue; - return (size_t)max; -} - -/* FSE_countFast_wksp() : - * Same as FSE_countFast(), but using an externally provided scratch buffer. - * `workSpace` size must be table of >= `1024` unsigned */ -size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, - const void* source, size_t sourceSize, - unsigned* workSpace) -{ - if (sourceSize < 1500) /* heuristic threshold */ - return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize); - return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace); -} - -/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */ -size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr, - const void* source, size_t sourceSize) -{ - unsigned tmpCounters[1024]; - return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters); -} - -/* FSE_count_wksp() : - * Same as FSE_count(), but using an externally provided scratch buffer. 
- * `workSpace` size must be table of >= `1024` unsigned */ -size_t FSE_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr, - const void* source, size_t sourceSize, unsigned* workSpace) -{ - if (*maxSymbolValuePtr < 255) - return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace); - *maxSymbolValuePtr = 255; - return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace); -} - -size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr, - const void* src, size_t srcSize) -{ - unsigned tmpCounters[1024]; - return FSE_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters); -} - - - /*-************************************************************** * FSE Compression Code ****************************************************************/ -/*! FSE_sizeof_CTable() : - FSE_CTable is a variable size structure which contains : - `U16 tableLog;` - `U16 maxSymbolValue;` - `U16 nextStateNumber[1 << tableLog];` // This size is variable - `FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];` // This size is variable -Allocation is manual (C standard does not support variable-size structures). -*/ -size_t FSE_sizeof_CTable (unsigned maxSymbolValue, unsigned tableLog) -{ - if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); - return FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); -} FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) { @@ -466,7 +335,7 @@ void FSE_freeCTable (FSE_CTable* ct) { f /* provides the minimum logSize to safely represent a distribution */ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) { - U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1; + U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; assert(srcSize > 1); /* Not supported, RLE should be used instead */ @@ -529,6 +398,9 @@ static size_t FSE_normalizeM2(short* nor } ToDistribute = (1 << tableLog) - distributed; + if (ToDistribute == 0) + return 0; + if ((total / ToDistribute) > lowOne) { /* risk of rounding to zero */ lowOne = (U32)((total * 3) / (ToDistribute * 2)); @@ -629,11 +501,11 @@ size_t FSE_normalizeCount (short* normal U32 s; U32 nTotal = 0; for (s=0; s<=maxSymbolValue; s++) - printf("%3i: %4i \n", s, normalizedCounter[s]); + RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]); for (s=0; s<=maxSymbolValue; s++) nTotal += abs(normalizedCounter[s]); if (nTotal != (1U< not compressible */ if (maxCount < (srcSize >> 7)) return 0; /* Heuristic : not compressible enough */ @@ -835,7 +707,7 @@ typedef struct { size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog) { fseWkspMax_t scratchBuffer; - FSE_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */ + DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */ if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer)); } diff --git a/contrib/python-zstandard/zstd/compress/hist.c b/contrib/python-zstandard/zstd/compress/hist.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/zstd/compress/hist.c @@ -0,0 +1,195 @@ +/* ****************************************************************** + hist : Histogram functions + part of Finite State Entropy project + Copyright (C) 2013-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + - Public forum : https://groups.google.com/forum/#!forum/lz4c +****************************************************************** */ + +/* --- dependencies --- */ +#include "mem.h" /* U32, BYTE, etc. */ +#include "debug.h" /* assert, DEBUGLOG */ +#include "error_private.h" /* ERROR */ +#include "hist.h" + + +/* --- Error management --- */ +unsigned HIST_isError(size_t code) { return ERR_isError(code); } + +/*-************************************************************** + * Histogram functions + ****************************************************************/ +unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize) +{ + const BYTE* ip = (const BYTE*)src; + const BYTE* const end = ip + srcSize; + unsigned maxSymbolValue = *maxSymbolValuePtr; + unsigned largestCount=0; + + memset(count, 0, (maxSymbolValue+1) * sizeof(*count)); + if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; } + + while (ip largestCount) largestCount = count[s]; + } + + return largestCount; +} + + +/* HIST_count_parallel_wksp() : + * store histogram into 4 intermediate tables, recombined at the end. + * this design makes better use of OoO cpus, + * and is noticeably faster when some values are heavily repeated. + * But it needs some additional workspace for intermediate tables. + * `workSpace` size must be a table of size >= HIST_WKSP_SIZE_U32. + * @return : largest histogram frequency, + * or an error code (notably when histogram would be larger than *maxSymbolValuePtr). */ +static size_t HIST_count_parallel_wksp( + unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + unsigned checkMax, + unsigned* const workSpace) +{ + const BYTE* ip = (const BYTE*)source; + const BYTE* const iend = ip+sourceSize; + unsigned maxSymbolValue = *maxSymbolValuePtr; + unsigned max=0; + U32* const Counting1 = workSpace; + U32* const Counting2 = Counting1 + 256; + U32* const Counting3 = Counting2 + 256; + U32* const Counting4 = Counting3 + 256; + + memset(workSpace, 0, 4*256*sizeof(unsigned)); + + /* safety checks */ + if (!sourceSize) { + memset(count, 0, maxSymbolValue + 1); + *maxSymbolValuePtr = 0; + return 0; + } + if (!maxSymbolValue) maxSymbolValue = 255; /* 0 == default */ + + /* by stripes of 16 bytes */ + { U32 cached = MEM_read32(ip); ip += 4; + while (ip < iend-15) { + U32 c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + } + ip-=4; + } + + /* finish last symbols */ + while (ipmaxSymbolValue; s--) { + Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s]; + if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall); + } } + + { U32 s; + if (maxSymbolValue > 255) maxSymbolValue = 255; + for (s=0; s<=maxSymbolValue; s++) { + count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s]; + if (count[s] > max) max = count[s]; + } } + + while 
(!count[maxSymbolValue]) maxSymbolValue--; + *maxSymbolValuePtr = maxSymbolValue; + return (size_t)max; +} + +/* HIST_countFast_wksp() : + * Same as HIST_countFast(), but using an externally provided scratch buffer. + * `workSpace` size must be table of >= HIST_WKSP_SIZE_U32 unsigned */ +size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + unsigned* workSpace) +{ + if (sourceSize < 1500) /* heuristic threshold */ + return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize); + return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace); +} + +/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */ +size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize) +{ + unsigned tmpCounters[HIST_WKSP_SIZE_U32]; + return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters); +} + +/* HIST_count_wksp() : + * Same as HIST_count(), but using an externally provided scratch buffer. + * `workSpace` size must be table of >= HIST_WKSP_SIZE_U32 unsigned */ +size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, unsigned* workSpace) +{ + if (*maxSymbolValuePtr < 255) + return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace); + *maxSymbolValuePtr = 255; + return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace); +} + +size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize) +{ + unsigned tmpCounters[HIST_WKSP_SIZE_U32]; + return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters); +} diff --git a/contrib/python-zstandard/zstd/compress/hist.h b/contrib/python-zstandard/zstd/compress/hist.h new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/zstd/compress/hist.h @@ -0,0 +1,92 @@ +/* ****************************************************************** + hist : Histogram functions + part of Finite State Entropy project + Copyright (C) 2013-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + - Public forum : https://groups.google.com/forum/#!forum/lz4c +****************************************************************** */ + +/* --- dependencies --- */ +#include /* size_t */ + + +/* --- simple histogram functions --- */ + +/*! HIST_count(): + * Provides the precise count of each byte within a table 'count'. + * 'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1). + * Updates *maxSymbolValuePtr with actual largest symbol value detected. + * @return : count of the most frequent symbol (which isn't identified). + * or an error code, which can be tested using HIST_isError(). + * note : if return == srcSize, there is only one symbol. + */ +size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize); + +unsigned HIST_isError(size_t code); /**< tells if a return value is an error code */ + + +/* --- advanced histogram functions --- */ + +#define HIST_WKSP_SIZE_U32 1024 +/** HIST_count_wksp() : + * Same as HIST_count(), but using an externally provided scratch buffer. + * Benefit is this function will use very little stack space. + * `workSpace` must be a table of unsigned of size >= HIST_WKSP_SIZE_U32 + */ +size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize, + unsigned* workSpace); + +/** HIST_countFast() : + * same as HIST_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr. + * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` + */ +size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize); + +/** HIST_countFast_wksp() : + * Same as HIST_countFast(), but using an externally provided scratch buffer. + * `workSpace` must be a table of unsigned of size >= HIST_WKSP_SIZE_U32 + */ +size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize, + unsigned* workSpace); + +/*! HIST_count_simple() : + * Same as HIST_countFast(), this function is unsafe, + * and will segfault if any value within `src` is `> *maxSymbolValuePtr`. + * It is also a bit slower for large inputs. + * However, it does not need any additional memory (not even on stack). + * @return : count of the most frequent symbol. + * Note this function doesn't produce any error (i.e. it must succeed). 
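/* Illustration (not part of the vendored diff): typical use of the new HIST_*
 * API declared above, which replaces the FSE_count*() family removed from
 * fse_compress.c. HIST_count_wksp() keeps stack usage small by borrowing a
 * caller-provided workspace; internally, counting is split across four
 * parallel tables (HIST_count_parallel_wksp) to exploit out-of-order
 * execution, while very small inputs on the fast path (< 1500 bytes) fall
 * back to HIST_count_simple(). */
#include <stddef.h>   /* size_t */

static void example_histogram(const void* src, size_t srcSize)
{
    unsigned count[256];
    unsigned wksp[HIST_WKSP_SIZE_U32];   /* 1024 unsigned == 4 KB scratch */
    unsigned maxSymbolValue = 255;       /* in: bound ; out: largest value seen */
    size_t const largest = HIST_count_wksp(count, &maxSymbolValue,
                                           src, srcSize, wksp);
    if (HIST_isError(largest)) return;   /* e.g. maxSymbolValue_tooSmall */
    if (largest == srcSize) { /* single repeated symbol : RLE candidate */ }
}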
+ */ +unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize); diff --git a/contrib/python-zstandard/zstd/compress/huf_compress.c b/contrib/python-zstandard/zstd/compress/huf_compress.c --- a/contrib/python-zstandard/zstd/compress/huf_compress.c +++ b/contrib/python-zstandard/zstd/compress/huf_compress.c @@ -45,8 +45,9 @@ ****************************************************************/ #include /* memcpy, memset */ #include /* printf (debug) */ +#include "compiler.h" #include "bitstream.h" -#include "compiler.h" +#include "hist.h" #define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */ #include "fse.h" /* header compression */ #define HUF_STATIC_LINKING_ONLY @@ -58,7 +59,7 @@ * Error Management ****************************************************************/ #define HUF_isError ERR_isError -#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ +#define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */ #define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e #define CHECK_F(f) { CHECK_V_F(_var_err__, f); } @@ -81,7 +82,7 @@ unsigned HUF_optimalTableLog(unsigned ma * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX. */ #define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6 -size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize) +static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize) { BYTE* const ostart = (BYTE*) dst; BYTE* op = ostart; @@ -100,9 +101,9 @@ size_t HUF_compressWeights (void* dst, s if (wtSize <= 1) return 0; /* Not compressible */ /* Scan input and build symbol stats */ - { CHECK_V_F(maxCount, FSE_count_simple(count, &maxSymbolValue, weightTable, wtSize) ); + { unsigned const maxCount = HIST_count_simple(count, &maxSymbolValue, weightTable, wtSize); /* never fails */ if (maxCount == wtSize) return 1; /* only a single symbol in src : rle */ - if (maxCount == 1) return 0; /* each symbol present maximum once => not compressible */ + if (maxCount == 1) return 0; /* each symbol present maximum once => not compressible */ } tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue); @@ -216,6 +217,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, return readSize; } +U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue) +{ + const HUF_CElt* table = (const HUF_CElt*)symbolTable; + assert(symbolValue <= HUF_SYMBOLVALUE_MAX); + return table[symbolValue].nbBits; +} + typedef struct nodeElt_s { U32 count; @@ -660,9 +668,9 @@ static size_t HUF_compress_internal ( } /* Scan input and build symbol stats */ - { CHECK_V_F(largest, FSE_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) ); + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) ); if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ - if (largest <= (srcSize >> 7)+1) return 0; /* heuristic : probably not compressible enough */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ } /* Check validity of previous table */ diff --git a/contrib/python-zstandard/zstd/compress/zstd_compress.c b/contrib/python-zstandard/zstd/compress/zstd_compress.c --- a/contrib/python-zstandard/zstd/compress/zstd_compress.c +++ b/contrib/python-zstandard/zstd/compress/zstd_compress.c 
@@ -8,21 +8,13 @@ * You may select, at your option, one of the above-listed licenses. */ - -/*-************************************* -* Tuning parameters -***************************************/ -#ifndef ZSTD_CLEVEL_DEFAULT -# define ZSTD_CLEVEL_DEFAULT 3 -#endif - - /*-************************************* * Dependencies ***************************************/ #include /* memset */ #include "cpu.h" #include "mem.h" +#include "hist.h" /* HIST_countFast_wksp */ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "fse.h" #define HUF_STATIC_LINKING_ONLY @@ -54,7 +46,6 @@ struct ZSTD_CDict_s { size_t workspaceSize; ZSTD_matchState_t matchState; ZSTD_compressedBlockState_t cBlockState; - ZSTD_compressionParameters cParams; ZSTD_customMem customMem; U32 dictID; }; /* typedef'd to ZSTD_CDict within "zstd.h" */ @@ -64,17 +55,26 @@ ZSTD_CCtx* ZSTD_createCCtx(void) return ZSTD_createCCtx_advanced(ZSTD_defaultCMem); } +static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager) +{ + assert(cctx != NULL); + memset(cctx, 0, sizeof(*cctx)); + cctx->customMem = memManager; + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + { size_t const err = ZSTD_CCtx_resetParameters(cctx); + assert(!ZSTD_isError(err)); + (void)err; + } +} + ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem) { ZSTD_STATIC_ASSERT(zcss_init==0); ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1)); if (!customMem.customAlloc ^ !customMem.customFree) return NULL; - { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_calloc(sizeof(ZSTD_CCtx), customMem); + { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_malloc(sizeof(ZSTD_CCtx), customMem); if (!cctx) return NULL; - cctx->customMem = customMem; - cctx->requestedParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; - cctx->requestedParams.fParams.contentSizeFlag = 1; - cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + ZSTD_initCCtx(cctx, customMem); return cctx; } } @@ -102,17 +102,24 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void *wor return cctx; } -size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) +static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) { - if (cctx==NULL) return 0; /* support free on NULL */ - if (cctx->staticSize) return ERROR(memory_allocation); /* not compatible with static CCtx */ + assert(cctx != NULL); + assert(cctx->staticSize == 0); ZSTD_free(cctx->workSpace, cctx->customMem); cctx->workSpace = NULL; ZSTD_freeCDict(cctx->cdictLocal); cctx->cdictLocal = NULL; #ifdef ZSTD_MULTITHREAD ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = NULL; #endif +} + +size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) +{ + if (cctx==NULL) return 0; /* support free on NULL */ + if (cctx->staticSize) return ERROR(memory_allocation); /* not compatible with static CCtx */ + ZSTD_freeCCtxContent(cctx); ZSTD_free(cctx, cctx->customMem); - return 0; /* reserved as a potential error code in the future */ + return 0; } @@ -143,21 +150,6 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CS /* private API call, for dictBuilder only */ const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } -ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( - const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize) -{ - ZSTD_compressionParameters cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize); - if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; - if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog; - if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog; - if 
(CCtxParams->cParams.chainLog) cParams.chainLog = CCtxParams->cParams.chainLog; - if (CCtxParams->cParams.searchLog) cParams.searchLog = CCtxParams->cParams.searchLog; - if (CCtxParams->cParams.searchLength) cParams.searchLength = CCtxParams->cParams.searchLength; - if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength; - if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy; - return cParams; -} - static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( ZSTD_compressionParameters cParams) { @@ -251,7 +243,6 @@ static int ZSTD_isUpdateAuthorized(ZSTD_ case ZSTD_p_minMatch: case ZSTD_p_targetLength: case ZSTD_p_compressionStrategy: - case ZSTD_p_compressLiterals: return 1; case ZSTD_p_format: @@ -268,6 +259,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_ case ZSTD_p_ldmMinMatch: case ZSTD_p_ldmBucketSizeLog: case ZSTD_p_ldmHashEveryLog: + case ZSTD_p_forceAttachDict: default: return 0; } @@ -302,7 +294,6 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* if (cctx->cdict) return ERROR(stage_wrong); return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value); - case ZSTD_p_compressLiterals: case ZSTD_p_contentSizeFlag: case ZSTD_p_checksumFlag: case ZSTD_p_dictIDFlag: @@ -313,6 +304,9 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* * default : 0 when using a CDict, 1 when using a Prefix */ return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value); + case ZSTD_p_forceAttachDict: + return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value); + case ZSTD_p_nbWorkers: if ((value>0) && cctx->staticSize) { return ERROR(parameter_unsupported); /* MT not compatible with static alloc */ @@ -351,7 +345,6 @@ size_t ZSTD_CCtxParam_setParameter( int cLevel = (int)value; /* cast expected to restore negative sign */ if (cLevel > ZSTD_maxCLevel()) cLevel = ZSTD_maxCLevel(); if (cLevel) { /* 0 : does not change current level */ - CCtxParams->disableLiteralCompression = (cLevel<0); /* negative levels disable huffman */ CCtxParams->compressionLevel = cLevel; } if (CCtxParams->compressionLevel >= 0) return CCtxParams->compressionLevel; @@ -399,10 +392,6 @@ size_t ZSTD_CCtxParam_setParameter( CCtxParams->cParams.strategy = (ZSTD_strategy)value; return (size_t)CCtxParams->cParams.strategy; - case ZSTD_p_compressLiterals: - CCtxParams->disableLiteralCompression = !value; - return !CCtxParams->disableLiteralCompression; - case ZSTD_p_contentSizeFlag : /* Content size written in frame header _when known_ (default:1) */ DEBUGLOG(4, "set content size flag = %u", (value>0)); @@ -423,6 +412,12 @@ size_t ZSTD_CCtxParam_setParameter( CCtxParams->forceWindow = (value > 0); return CCtxParams->forceWindow; + case ZSTD_p_forceAttachDict : + CCtxParams->attachDictPref = value ? + (value > 0 ? 
ZSTD_dictForceAttach : ZSTD_dictForceCopy) : + ZSTD_dictDefaultAttach; + return CCtxParams->attachDictPref; + case ZSTD_p_nbWorkers : #ifndef ZSTD_MULTITHREAD if (value>0) return ERROR(parameter_unsupported); @@ -477,6 +472,98 @@ size_t ZSTD_CCtxParam_setParameter( } } +size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned* value) +{ + return ZSTD_CCtxParam_getParameter(&cctx->requestedParams, param, value); +} + +size_t ZSTD_CCtxParam_getParameter( + ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, unsigned* value) +{ + switch(param) + { + case ZSTD_p_format : + *value = CCtxParams->format; + break; + case ZSTD_p_compressionLevel : + *value = CCtxParams->compressionLevel; + break; + case ZSTD_p_windowLog : + *value = CCtxParams->cParams.windowLog; + break; + case ZSTD_p_hashLog : + *value = CCtxParams->cParams.hashLog; + break; + case ZSTD_p_chainLog : + *value = CCtxParams->cParams.chainLog; + break; + case ZSTD_p_searchLog : + *value = CCtxParams->cParams.searchLog; + break; + case ZSTD_p_minMatch : + *value = CCtxParams->cParams.searchLength; + break; + case ZSTD_p_targetLength : + *value = CCtxParams->cParams.targetLength; + break; + case ZSTD_p_compressionStrategy : + *value = (unsigned)CCtxParams->cParams.strategy; + break; + case ZSTD_p_contentSizeFlag : + *value = CCtxParams->fParams.contentSizeFlag; + break; + case ZSTD_p_checksumFlag : + *value = CCtxParams->fParams.checksumFlag; + break; + case ZSTD_p_dictIDFlag : + *value = !CCtxParams->fParams.noDictIDFlag; + break; + case ZSTD_p_forceMaxWindow : + *value = CCtxParams->forceWindow; + break; + case ZSTD_p_forceAttachDict : + *value = CCtxParams->attachDictPref; + break; + case ZSTD_p_nbWorkers : +#ifndef ZSTD_MULTITHREAD + assert(CCtxParams->nbWorkers == 0); +#endif + *value = CCtxParams->nbWorkers; + break; + case ZSTD_p_jobSize : +#ifndef ZSTD_MULTITHREAD + return ERROR(parameter_unsupported); +#else + *value = CCtxParams->jobSize; + break; +#endif + case ZSTD_p_overlapSizeLog : +#ifndef ZSTD_MULTITHREAD + return ERROR(parameter_unsupported); +#else + *value = CCtxParams->overlapSizeLog; + break; +#endif + case ZSTD_p_enableLongDistanceMatching : + *value = CCtxParams->ldmParams.enableLdm; + break; + case ZSTD_p_ldmHashLog : + *value = CCtxParams->ldmParams.hashLog; + break; + case ZSTD_p_ldmMinMatch : + *value = CCtxParams->ldmParams.minMatchLength; + break; + case ZSTD_p_ldmBucketSizeLog : + *value = CCtxParams->ldmParams.bucketSizeLog; + break; + case ZSTD_p_ldmHashEveryLog : + *value = CCtxParams->ldmParams.hashEveryLog; + break; + default: return ERROR(parameter_unsupported); + } + return 0; +} + /** ZSTD_CCtx_setParametersUsingCCtxParams() : * just applies `params` into `cctx` * no action is performed, parameters are merely stored. @@ -487,6 +574,7 @@ size_t ZSTD_CCtxParam_setParameter( size_t ZSTD_CCtx_setParametersUsingCCtxParams( ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params) { + DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams"); if (cctx->streamStage != zcss_init) return ERROR(stage_wrong); if (cctx->cdict) return ERROR(stage_wrong); @@ -565,18 +653,19 @@ size_t ZSTD_CCtx_refPrefix_advanced( return 0; } -static void ZSTD_startNewCompression(ZSTD_CCtx* cctx) +/*! ZSTD_CCtx_reset() : + * Also dumps dictionary */ +void ZSTD_CCtx_reset(ZSTD_CCtx* cctx) { cctx->streamStage = zcss_init; cctx->pledgedSrcSizePlusOne = 0; } -/*! 
ZSTD_CCtx_reset() : - * Also dumps dictionary */ -void ZSTD_CCtx_reset(ZSTD_CCtx* cctx) +size_t ZSTD_CCtx_resetParameters(ZSTD_CCtx* cctx) { - ZSTD_startNewCompression(cctx); + if (cctx->streamStage != zcss_init) return ERROR(stage_wrong); cctx->cdict = NULL; + return ZSTD_CCtxParams_reset(&cctx->requestedParams); } /** ZSTD_checkCParams() : @@ -589,8 +678,9 @@ size_t ZSTD_checkCParams(ZSTD_compressio CLAMPCHECK(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX); CLAMPCHECK(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX); CLAMPCHECK(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX); - if ((U32)(cParams.targetLength) < ZSTD_TARGETLENGTH_MIN) - return ERROR(parameter_unsupported); + ZSTD_STATIC_ASSERT(ZSTD_TARGETLENGTH_MIN == 0); + if (cParams.targetLength > ZSTD_TARGETLENGTH_MAX) + return ERROR(parameter_outOfBound); if ((U32)(cParams.strategy) > (U32)ZSTD_btultra) return ERROR(parameter_unsupported); return 0; @@ -599,7 +689,8 @@ size_t ZSTD_checkCParams(ZSTD_compressio /** ZSTD_clampCParams() : * make CParam values within valid range. * @return : valid CParams */ -static ZSTD_compressionParameters ZSTD_clampCParams(ZSTD_compressionParameters cParams) +static ZSTD_compressionParameters +ZSTD_clampCParams(ZSTD_compressionParameters cParams) { # define CLAMP(val,min,max) { \ if (val (U32)ZSTD_btultra) cParams.strategy = ZSTD_btultra; + ZSTD_STATIC_ASSERT(ZSTD_TARGETLENGTH_MIN == 0); + if (cParams.targetLength > ZSTD_TARGETLENGTH_MAX) + cParams.targetLength = ZSTD_TARGETLENGTH_MAX; + CLAMP(cParams.strategy, ZSTD_fast, ZSTD_btultra); return cParams; } @@ -627,8 +720,11 @@ static U32 ZSTD_cycleLog(U32 hashLog, ZS optimize `cPar` for a given input (`srcSize` and `dictSize`). mostly downsizing to reduce memory consumption and initialization latency. Both `srcSize` and `dictSize` are optional (use 0 if unknown). - Note : cPar is considered validated at this stage. Use ZSTD_checkCParams() to ensure that condition. */ -ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize) + Note : cPar is assumed validated. Use ZSTD_checkCParams() to ensure this condition. 
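/* Illustration (not part of the vendored diff): the parameter round-trip and
 * the reset split introduced above. ZSTD_CCtx_getParameter() reads back what
 * ZSTD_CCtx_setParameter() stored; ZSTD_CCtx_reset() only abandons the
 * in-flight frame, while the new ZSTD_CCtx_resetParameters() also clears the
 * cdict reference and restores parameter defaults (and fails with stage_wrong
 * if a frame is in progress). */
static void example_paramRoundTrip(ZSTD_CCtx* cctx)
{
    unsigned level = 0;
    ZSTD_CCtx_setParameter(cctx, ZSTD_p_compressionLevel, 19);
    ZSTD_CCtx_getParameter(cctx, ZSTD_p_compressionLevel, &level);  /* 19 */

    ZSTD_CCtx_reset(cctx);             /* new frame ; level stays 19 */
    ZSTD_CCtx_resetParameters(cctx);   /* defaults restored, cdict == NULL */
    (void)level;
}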
*/ +static ZSTD_compressionParameters +ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize) { static const U64 minSrcSize = 513; /* (1<<9) + 1 */ static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); @@ -648,7 +744,7 @@ ZSTD_compressionParameters ZSTD_adjustCP ZSTD_highbit32(tSize-1) + 1; if (cPar.windowLog > srcLog) cPar.windowLog = srcLog; } - if (cPar.hashLog > cPar.windowLog) cPar.hashLog = cPar.windowLog; + if (cPar.hashLog > cPar.windowLog+1) cPar.hashLog = cPar.windowLog+1; { U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); if (cycleLog > cPar.windowLog) cPar.chainLog -= (cycleLog - cPar.windowLog); @@ -660,13 +756,34 @@ ZSTD_compressionParameters ZSTD_adjustCP return cPar; } -ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize) +ZSTD_compressionParameters +ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize) { cPar = ZSTD_clampCParams(cPar); return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize); } -static size_t ZSTD_sizeof_matchState(ZSTD_compressionParameters const* cParams, const U32 forCCtx) +ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize); + if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog; + if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog; + if (CCtxParams->cParams.chainLog) cParams.chainLog = CCtxParams->cParams.chainLog; + if (CCtxParams->cParams.searchLog) cParams.searchLog = CCtxParams->cParams.searchLog; + if (CCtxParams->cParams.searchLength) cParams.searchLength = CCtxParams->cParams.searchLength; + if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength; + if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy; + assert(!ZSTD_checkCParams(cParams)); + return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize); +} + +static size_t +ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + const U32 forCCtx) { size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); size_t const hSize = ((size_t)1) << cParams->hashLog; @@ -693,7 +810,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxPa size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); U32 const divider = (cParams.searchLength==3) ? 
3 : 4; size_t const maxNbSeq = blockSize / divider; - size_t const tokenSpace = blockSize + 11*maxNbSeq; + size_t const tokenSpace = WILDCOPY_OVERLENGTH + blockSize + 11*maxNbSeq; size_t const entropySpace = HUF_WORKSPACE_SIZE; size_t const blockStateSpace = 2 * sizeof(ZSTD_compressedBlockState_t); size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1); @@ -752,12 +869,14 @@ size_t ZSTD_estimateCStreamSize_usingCPa return ZSTD_estimateCStreamSize_usingCCtxParams(¶ms); } -static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) { +static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) +{ ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0); return ZSTD_estimateCStreamSize_usingCParams(cParams); } -size_t ZSTD_estimateCStreamSize(int compressionLevel) { +size_t ZSTD_estimateCStreamSize(int compressionLevel) +{ int level; size_t memBudget = 0; for (level=1; level<=compressionLevel; level++) { @@ -786,9 +905,27 @@ ZSTD_frameProgression ZSTD_getFrameProgr fp.ingested = cctx->consumedSrcSize + buffered; fp.consumed = cctx->consumedSrcSize; fp.produced = cctx->producedCSize; + fp.flushed = cctx->producedCSize; /* simplified; some data might still be left within streaming output buffer */ + fp.currentJobID = 0; + fp.nbActiveWorkers = 0; return fp; } } +/*! ZSTD_toFlushNow() + * Only useful for multithreading scenarios currently (nbWorkers >= 1). + */ +size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + return ZSTDMT_toFlushNow(cctx->mtctx); + } +#endif + (void)cctx; + return 0; /* over-simplification; could also check if context is currently running in streaming mode, and in which case, report how many bytes are left to be flushed within output buffer */ +} + + static U32 ZSTD_equivalentCParams(ZSTD_compressionParameters cParams1, ZSTD_compressionParameters cParams2) @@ -799,6 +936,20 @@ static U32 ZSTD_equivalentCParams(ZSTD_c & ((cParams1.searchLength==3) == (cParams2.searchLength==3)); /* hashlog3 space */ } +static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1, + ZSTD_compressionParameters cParams2) +{ + (void)cParams1; + (void)cParams2; + assert(cParams1.windowLog == cParams2.windowLog); + assert(cParams1.chainLog == cParams2.chainLog); + assert(cParams1.hashLog == cParams2.hashLog); + assert(cParams1.searchLog == cParams2.searchLog); + assert(cParams1.searchLength == cParams2.searchLength); + assert(cParams1.targetLength == cParams2.targetLength); + assert(cParams1.strategy == cParams2.strategy); +} + /** The parameters are equivalent if ldm is not enabled in both sets or * all the parameters are equivalent. */ static U32 ZSTD_equivalentLdmParams(ldmParams_t ldmParams1, @@ -817,33 +968,51 @@ typedef enum { ZSTDb_not_buffered, ZSTDb /* ZSTD_sufficientBuff() : * check internal buffers exist for streaming if buffPol == ZSTDb_buffered . * Note : they are assumed to be correctly sized if ZSTD_equivalentCParams()==1 */ -static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t blockSize1, +static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t maxNbSeq1, + size_t maxNbLit1, ZSTD_buffered_policy_e buffPol2, ZSTD_compressionParameters cParams2, U64 pledgedSrcSize) { size_t const windowSize2 = MAX(1, (size_t)MIN(((U64)1 << cParams2.windowLog), pledgedSrcSize)); size_t const blockSize2 = MIN(ZSTD_BLOCKSIZE_MAX, windowSize2); + size_t const maxNbSeq2 = blockSize2 / ((cParams2.searchLength == 3) ? 
3 : 4); + size_t const maxNbLit2 = blockSize2; size_t const neededBufferSize2 = (buffPol2==ZSTDb_buffered) ? windowSize2 + blockSize2 : 0; - DEBUGLOG(4, "ZSTD_sufficientBuff: is windowSize2=%u <= wlog1=%u", - (U32)windowSize2, cParams2.windowLog); - DEBUGLOG(4, "ZSTD_sufficientBuff: is blockSize2=%u <= blockSize1=%u", - (U32)blockSize2, (U32)blockSize1); - return (blockSize2 <= blockSize1) /* seqStore space depends on blockSize */ + DEBUGLOG(4, "ZSTD_sufficientBuff: is neededBufferSize2=%u <= bufferSize1=%u", + (U32)neededBufferSize2, (U32)bufferSize1); + DEBUGLOG(4, "ZSTD_sufficientBuff: is maxNbSeq2=%u <= maxNbSeq1=%u", + (U32)maxNbSeq2, (U32)maxNbSeq1); + DEBUGLOG(4, "ZSTD_sufficientBuff: is maxNbLit2=%u <= maxNbLit1=%u", + (U32)maxNbLit2, (U32)maxNbLit1); + return (maxNbLit2 <= maxNbLit1) + & (maxNbSeq2 <= maxNbSeq1) & (neededBufferSize2 <= bufferSize1); } /** Equivalence for resetCCtx purposes */ static U32 ZSTD_equivalentParams(ZSTD_CCtx_params params1, ZSTD_CCtx_params params2, - size_t buffSize1, size_t blockSize1, + size_t buffSize1, + size_t maxNbSeq1, size_t maxNbLit1, ZSTD_buffered_policy_e buffPol2, U64 pledgedSrcSize) { DEBUGLOG(4, "ZSTD_equivalentParams: pledgedSrcSize=%u", (U32)pledgedSrcSize); - return ZSTD_equivalentCParams(params1.cParams, params2.cParams) && - ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams) && - ZSTD_sufficientBuff(buffSize1, blockSize1, buffPol2, params2.cParams, pledgedSrcSize); + if (!ZSTD_equivalentCParams(params1.cParams, params2.cParams)) { + DEBUGLOG(4, "ZSTD_equivalentCParams() == 0"); + return 0; + } + if (!ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams)) { + DEBUGLOG(4, "ZSTD_equivalentLdmParams() == 0"); + return 0; + } + if (!ZSTD_sufficientBuff(buffSize1, maxNbSeq1, maxNbLit1, buffPol2, + params2.cParams, pledgedSrcSize)) { + DEBUGLOG(4, "ZSTD_sufficientBuff() == 0"); + return 0; + } + return 1; } static void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) @@ -851,10 +1020,10 @@ static void ZSTD_reset_compressedBlockSt int i; for (i = 0; i < ZSTD_REP_NUM; ++i) bs->rep[i] = repStartValue[i]; - bs->entropy.hufCTable_repeatMode = HUF_repeat_none; - bs->entropy.offcode_repeatMode = FSE_repeat_none; - bs->entropy.matchlength_repeatMode = FSE_repeat_none; - bs->entropy.litlength_repeatMode = FSE_repeat_none; + bs->entropy.huf.repeatMode = HUF_repeat_none; + bs->entropy.fse.offcode_repeatMode = FSE_repeat_none; + bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none; + bs->entropy.fse.litlength_repeatMode = FSE_repeat_none; } /*! ZSTD_invalidateMatchState() @@ -866,8 +1035,10 @@ static void ZSTD_invalidateMatchState(ZS ZSTD_window_clear(&ms->window); ms->nextToUpdate = ms->window.dictLimit + 1; + ms->nextToUpdate3 = ms->window.dictLimit + 1; ms->loadedDictEnd = 0; ms->opt.litLengthSum = 0; /* force reset of btopt stats */ + ms->dictMatchState = NULL; } /*! 
ZSTD_continueCCtx() : @@ -880,6 +1051,7 @@ static size_t ZSTD_continueCCtx(ZSTD_CCt cctx->blockSize = blockSize; /* previous block size could be different even for same windowLog, due to pledgedSrcSize */ cctx->appliedParams = params; + cctx->blockState.matchState.cParams = params.cParams; cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; cctx->consumedSrcSize = 0; cctx->producedCSize = 0; @@ -900,7 +1072,11 @@ static size_t ZSTD_continueCCtx(ZSTD_CCt typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset } ZSTD_compResetPolicy_e; -static void* ZSTD_reset_matchState(ZSTD_matchState_t* ms, void* ptr, ZSTD_compressionParameters const* cParams, ZSTD_compResetPolicy_e const crp, U32 const forCCtx) +static void* +ZSTD_reset_matchState(ZSTD_matchState_t* ms, + void* ptr, + const ZSTD_compressionParameters* cParams, + ZSTD_compResetPolicy_e const crp, U32 const forCCtx) { size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); size_t const hSize = ((size_t)1) << cParams->hashLog; @@ -912,6 +1088,9 @@ static void* ZSTD_reset_matchState(ZSTD_ ms->hashLog3 = hashLog3; memset(&ms->window, 0, sizeof(ms->window)); + ms->window.dictLimit = 1; /* start from 1, so that 1st position is valid */ + ms->window.lowLimit = 1; /* it ensures first and later CCtx usages compress the same */ + ms->window.nextSrc = ms->window.base + 1; /* see issue #1241 */ ZSTD_invalidateMatchState(ms); /* opt parser space */ @@ -937,14 +1116,24 @@ static void* ZSTD_reset_matchState(ZSTD_ ms->hashTable3 = ms->chainTable + chainSize; ptr = ms->hashTable3 + h3Size; + ms->cParams = *cParams; + assert(((size_t)ptr & 3) == 0); return ptr; } +#define ZSTD_WORKSPACETOOLARGE_FACTOR 3 /* define "workspace is too large" as this number of times larger than needed */ +#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128 /* when workspace is continuously too large + * during at least this number of times, + * context's memory usage is considered wasteful, + * because it's sized to handle a worst case scenario which rarely happens. + * In which case, resize it down to free some memory */ + /*! 
ZSTD_resetCCtx_internal() : note : `params` are assumed fully validated at this stage */ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - ZSTD_CCtx_params params, U64 pledgedSrcSize, + ZSTD_CCtx_params params, + U64 pledgedSrcSize, ZSTD_compResetPolicy_e const crp, ZSTD_buffered_policy_e const zbuff) { @@ -954,34 +1143,35 @@ static size_t ZSTD_resetCCtx_internal(ZS if (crp == ZSTDcrp_continue) { if (ZSTD_equivalentParams(zc->appliedParams, params, - zc->inBuffSize, zc->blockSize, - zbuff, pledgedSrcSize)) { - DEBUGLOG(4, "ZSTD_equivalentParams()==1 -> continue mode (wLog1=%u, blockSize1=%u)", - zc->appliedParams.cParams.windowLog, (U32)zc->blockSize); - return ZSTD_continueCCtx(zc, params, pledgedSrcSize); + zc->inBuffSize, + zc->seqStore.maxNbSeq, zc->seqStore.maxNbLit, + zbuff, pledgedSrcSize)) { + DEBUGLOG(4, "ZSTD_equivalentParams()==1 -> continue mode (wLog1=%u, blockSize1=%zu)", + zc->appliedParams.cParams.windowLog, zc->blockSize); + zc->workSpaceOversizedDuration += (zc->workSpaceOversizedDuration > 0); /* if it was too large, it still is */ + if (zc->workSpaceOversizedDuration <= ZSTD_WORKSPACETOOLARGE_MAXDURATION) + return ZSTD_continueCCtx(zc, params, pledgedSrcSize); } } DEBUGLOG(4, "ZSTD_equivalentParams()==0 -> reset CCtx"); if (params.ldmParams.enableLdm) { /* Adjust long distance matching parameters */ - params.ldmParams.windowLog = params.cParams.windowLog; ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); assert(params.ldmParams.hashEveryLog < 32); - zc->ldmState.hashPower = - ZSTD_ldm_getHashPower(params.ldmParams.minMatchLength); + zc->ldmState.hashPower = ZSTD_ldm_getHashPower(params.ldmParams.minMatchLength); } { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize)); size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); U32 const divider = (params.cParams.searchLength==3) ? 3 : 4; size_t const maxNbSeq = blockSize / divider; - size_t const tokenSpace = blockSize + 11*maxNbSeq; + size_t const tokenSpace = WILDCOPY_OVERLENGTH + blockSize + 11*maxNbSeq; size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0; size_t const buffInSize = (zbuff==ZSTDb_buffered) ? 
windowSize + blockSize : 0; size_t const matchStateSize = ZSTD_sizeof_matchState(&params.cParams, /* forCCtx */ 1); size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize); - void* ptr; + void* ptr; /* used to partition workSpace */ /* Check if workSpace is large enough, alloc a new one if needed */ { size_t const entropySpace = HUF_WORKSPACE_SIZE; @@ -993,14 +1183,20 @@ static size_t ZSTD_resetCCtx_internal(ZS size_t const neededSpace = entropySpace + blockStateSpace + ldmSpace + ldmSeqSpace + matchStateSize + tokenSpace + bufferSpace; - DEBUGLOG(4, "Need %uKB workspace, including %uKB for match state, and %uKB for buffers", - (U32)(neededSpace>>10), (U32)(matchStateSize>>10), (U32)(bufferSpace>>10)); - DEBUGLOG(4, "windowSize: %u - blockSize: %u", (U32)windowSize, (U32)blockSize); - - if (zc->workSpaceSize < neededSpace) { /* too small : resize */ - DEBUGLOG(4, "Need to update workSpaceSize from %uK to %uK", - (unsigned)(zc->workSpaceSize>>10), - (unsigned)(neededSpace>>10)); + + int const workSpaceTooSmall = zc->workSpaceSize < neededSpace; + int const workSpaceTooLarge = zc->workSpaceSize > ZSTD_WORKSPACETOOLARGE_FACTOR * neededSpace; + int const workSpaceWasteful = workSpaceTooLarge && (zc->workSpaceOversizedDuration > ZSTD_WORKSPACETOOLARGE_MAXDURATION); + zc->workSpaceOversizedDuration = workSpaceTooLarge ? zc->workSpaceOversizedDuration+1 : 0; + + DEBUGLOG(4, "Need %zuKB workspace, including %zuKB for match state, and %zuKB for buffers", + neededSpace>>10, matchStateSize>>10, bufferSpace>>10); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + + if (workSpaceTooSmall || workSpaceWasteful) { + DEBUGLOG(4, "Need to resize workSpaceSize from %zuKB to %zuKB", + zc->workSpaceSize >> 10, + neededSpace >> 10); /* static cctx : no resize, error out */ if (zc->staticSize) return ERROR(memory_allocation); @@ -1009,9 +1205,11 @@ static size_t ZSTD_resetCCtx_internal(ZS zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem); if (zc->workSpace == NULL) return ERROR(memory_allocation); zc->workSpaceSize = neededSpace; - ptr = zc->workSpace; - - /* Statically sized space. entropyWorkspace never moves (but prev/next block swap places) */ + zc->workSpaceOversizedDuration = 0; + + /* Statically sized space. + * entropyWorkspace never moves, + * though prev/next block swap places */ assert(((size_t)zc->workSpace & 3) == 0); /* ensure correct alignment */ assert(zc->workSpaceSize >= 2 * sizeof(ZSTD_compressedBlockState_t)); zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)zc->workSpace; @@ -1022,6 +1220,7 @@ static size_t ZSTD_resetCCtx_internal(ZS /* init params */ zc->appliedParams = params; + zc->blockState.matchState.cParams = params.cParams; zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; zc->consumedSrcSize = 0; zc->producedCSize = 0; @@ -1058,13 +1257,18 @@ static size_t ZSTD_resetCCtx_internal(ZS ptr = ZSTD_reset_matchState(&zc->blockState.matchState, ptr, &params.cParams, crp, /* forCCtx */ 1); /* sequences storage */ + zc->seqStore.maxNbSeq = maxNbSeq; zc->seqStore.sequencesStart = (seqDef*)ptr; ptr = zc->seqStore.sequencesStart + maxNbSeq; zc->seqStore.llCode = (BYTE*) ptr; zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq; zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq; zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq; - ptr = zc->seqStore.litStart + blockSize; + /* ZSTD_wildcopy() is used to copy into the literals buffer, * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+ */ + zc->seqStore.maxNbLit = blockSize; + ptr = zc->seqStore.litStart + blockSize + WILDCOPY_OVERLENGTH; /* ldm bucketOffsets table */ if (params.ldmParams.enableLdm) { @@ -1098,28 +1302,110 @@ void ZSTD_invalidateRepCodes(ZSTD_CCtx* assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window)); } -static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx, +/* These are the approximate sizes for each strategy past which copying the + * dictionary tables into the working context is faster than using them + * in-place. + */ +static const size_t attachDictSizeCutoffs[(unsigned)ZSTD_btultra+1] = { + 8 KB, /* unused */ + 8 KB, /* ZSTD_fast */ + 16 KB, /* ZSTD_dfast */ + 32 KB, /* ZSTD_greedy */ + 32 KB, /* ZSTD_lazy */ + 32 KB, /* ZSTD_lazy2 */ + 32 KB, /* ZSTD_btlazy2 */ + 32 KB, /* ZSTD_btopt */ + 8 KB /* ZSTD_btultra */ +}; + +static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, + U64 pledgedSrcSize) +{ + size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy]; + return ( pledgedSrcSize <= cutoff + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || params.attachDictPref == ZSTD_dictForceAttach ) + && params.attachDictPref != ZSTD_dictForceCopy + && !params.forceWindow; /* dictMatchState isn't correctly + * handled in _enforceMaxDist */ +} + +static size_t ZSTD_resetCCtx_byAttachingCDict( + ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + { + const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams; + unsigned const windowLog = params.cParams.windowLog; + assert(windowLog != 0); + /* Resize working context table params for input only, since the dict + * has its own tables. */ + params.cParams = ZSTD_adjustCParams_internal(*cdict_cParams, pledgedSrcSize, 0); + params.cParams.windowLog = windowLog; + ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + ZSTDcrp_continue, zbuff); + assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); + } + + { + const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc + - cdict->matchState.window.base); + const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit; + if (cdictLen == 0) { + /* don't even attach dictionaries with no contents */ + DEBUGLOG(4, "skipping attaching empty dictionary"); + } else { + DEBUGLOG(4, "attaching dictionary into context"); + cctx->blockState.matchState.dictMatchState = &cdict->matchState; + + /* prep working match state so dict matches never have negative indices + * when they are translated to the working context's index space. 
*/ + if (cctx->blockState.matchState.window.dictLimit < cdictEnd) { + cctx->blockState.matchState.window.nextSrc = + cctx->blockState.matchState.window.base + cdictEnd; + ZSTD_window_clear(&cctx->blockState.matchState.window); + } + cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit; + } + } + + cctx->dictID = cdict->dictID; + + /* copy block state */ + memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); + + return 0; +} + +static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, - unsigned windowLog, - ZSTD_frameParameters fParams, + ZSTD_CCtx_params params, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { - { ZSTD_CCtx_params params = cctx->requestedParams; + const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams; + + DEBUGLOG(4, "copying dictionary into context"); + + { unsigned const windowLog = params.cParams.windowLog; + assert(windowLog != 0); /* Copy only compression parameters related to tables. */ - params.cParams = cdict->cParams; - if (windowLog) params.cParams.windowLog = windowLog; - params.fParams = fParams; + params.cParams = *cdict_cParams; + params.cParams.windowLog = windowLog; ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, ZSTDcrp_noMemset, zbuff); - assert(cctx->appliedParams.cParams.strategy == cdict->cParams.strategy); - assert(cctx->appliedParams.cParams.hashLog == cdict->cParams.hashLog); - assert(cctx->appliedParams.cParams.chainLog == cdict->cParams.chainLog); + assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); + assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog); + assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog); } /* copy tables */ - { size_t const chainSize = (cdict->cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict->cParams.chainLog); - size_t const hSize = (size_t)1 << cdict->cParams.hashLog; + { size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 
0 : ((size_t)1 << cdict_cParams->chainLog); + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; size_t const tableSpace = (chainSize + hSize) * sizeof(U32); assert((U32*)cctx->blockState.matchState.chainTable == (U32*)cctx->blockState.matchState.hashTable + hSize); /* chainTable must follow hashTable */ assert((U32*)cctx->blockState.matchState.hashTable3 == (U32*)cctx->blockState.matchState.chainTable + chainSize); @@ -1127,6 +1413,7 @@ static size_t ZSTD_resetCCtx_usingCDict( assert((U32*)cdict->matchState.hashTable3 == (U32*)cdict->matchState.chainTable + chainSize); memcpy(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, tableSpace); /* presumes all tables follow each other */ } + /* Zero the hashTable3, since the cdict never fills it */ { size_t const h3Size = (size_t)1 << cctx->blockState.matchState.hashLog3; assert(cdict->matchState.hashLog3 == 0); @@ -1134,14 +1421,14 @@ static size_t ZSTD_resetCCtx_usingCDict( } /* copy dictionary offsets */ - { - ZSTD_matchState_t const* srcMatchState = &cdict->matchState; + { ZSTD_matchState_t const* srcMatchState = &cdict->matchState; ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState; dstMatchState->window = srcMatchState->window; dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; dstMatchState->nextToUpdate3= srcMatchState->nextToUpdate3; dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; } + cctx->dictID = cdict->dictID; /* copy block state */ @@ -1150,6 +1437,27 @@ static size_t ZSTD_resetCCtx_usingCDict( return 0; } +/* We have a choice between copying the dictionary context into the working + * context, or referencing the dictionary context from the working context + * in-place. We decide here which strategy to use. */ +static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + + DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)", (U32)pledgedSrcSize); + + if (ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) { + return ZSTD_resetCCtx_byAttachingCDict( + cctx, cdict, params, pledgedSrcSize, zbuff); + } else { + return ZSTD_resetCCtx_byCopyingCDict( + cctx, cdict, params, pledgedSrcSize, zbuff); + } +} + /*! ZSTD_copyCCtx_internal() : * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). 
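/* [Illustrative sketch, not part of the vendored diff] The new
 * ZSTD_resetCCtx_usingCDict() above chooses between two ways of using a
 * ZSTD_CDict: attaching (the working context references the CDict's match
 * state in place; cheap setup, best for small inputs) and copying (the CDict
 * tables are memcpy'd into the working context; costlier setup, faster
 * matching on large inputs). A minimal standalone restatement of that
 * decision rule; every name here (should_attach, dict_pref, ...) is
 * hypothetical, not the vendored API. */

    #include <stddef.h>
    #include <stdio.h>

    typedef enum { PREF_AUTO, PREF_FORCE_ATTACH, PREF_FORCE_COPY } dict_pref;

    static int should_attach(unsigned long long pledgedSrcSize, int srcSizeKnown,
                             size_t cutoff, dict_pref pref, int forceWindow)
    {
        /* Attach when the input is small or of unknown size, unless the caller
         * forces a copy or relies on forceWindow (the shape of
         * ZSTD_shouldAttachDict above). */
        return (pledgedSrcSize <= cutoff || !srcSizeKnown || pref == PREF_FORCE_ATTACH)
            && pref != PREF_FORCE_COPY
            && !forceWindow;
    }

    int main(void)
    {
        /* A 4 KB input against the 8 KB ZSTD_fast cutoff: attaching wins. */
        printf("attach? %d\n", should_attach(4096, 1, 8 * 1024, PREF_AUTO, 0));
        return 0;
    }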
@@ -1192,7 +1500,7 @@ static size_t ZSTD_copyCCtx_internal(ZST /* copy dictionary offsets */ { - ZSTD_matchState_t const* srcMatchState = &srcCCtx->blockState.matchState; + const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState; ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState; dstMatchState->window = srcMatchState->window; dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; @@ -1294,15 +1602,15 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* /* See doc/zstd_compression_format.md for detailed format description */ -size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize) +static size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); if (srcSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall); + MEM_writeLE24(dst, cBlockHeader24); memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize); - MEM_writeLE24(dst, (U32)(srcSize << 2) + (U32)bt_raw); - return ZSTD_blockHeaderSize+srcSize; + return ZSTD_blockHeaderSize + srcSize; } - static size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize) { BYTE* const ostart = (BYTE* const)dst; @@ -1356,16 +1664,24 @@ static size_t ZSTD_compressRleLiteralsBl } -static size_t ZSTD_minGain(size_t srcSize) { return (srcSize >> 6) + 2; } - -static size_t ZSTD_compressLiterals (ZSTD_entropyCTables_t const* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, +/* ZSTD_minGain() : + * minimum compression required + * to generate a compress block or a compressed literals section. + * note : use same formula for both situations */ +static size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) +{ + U32 const minlog = (strat==ZSTD_btultra) ? 7 : 6; + return (srcSize >> minlog) + 2; +} + +static size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + ZSTD_hufCTables_t* nextHuf, ZSTD_strategy strategy, int disableLiteralCompression, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32* workspace, const int bmi2) { - size_t const minGain = ZSTD_minGain(srcSize); + size_t const minGain = ZSTD_minGain(srcSize, strategy); size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); BYTE* const ostart = (BYTE*)dst; U32 singleStream = srcSize < 256; @@ -1376,27 +1692,25 @@ static size_t ZSTD_compressLiterals (ZST disableLiteralCompression); /* Prepare nextEntropy assuming reusing the existing table */ - nextEntropy->hufCTable_repeatMode = prevEntropy->hufCTable_repeatMode; - memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable, - sizeof(prevEntropy->hufCTable)); + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); if (disableLiteralCompression) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); /* small ? don't even attempt compression (speed opt) */ # define COMPRESS_LITERALS_SIZE_MIN 63 - { size_t const minLitSize = (prevEntropy->hufCTable_repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); } if (dstCapacity < lhSize+1) return ERROR(dstSize_tooSmall); /* not enough space for compression */ - { HUF_repeat repeat = prevEntropy->hufCTable_repeatMode; + { HUF_repeat repeat = prevHuf->repeatMode; int const preferRepeat = strategy < ZSTD_lazy ? 
srcSize <= 1024 : 0; if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; cLitSize = singleStream ? HUF_compress1X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, - workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat, bmi2) + workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) : HUF_compress4X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, - workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat, bmi2) + workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2); if (repeat != HUF_repeat_none) { /* reused the existing table */ hType = set_repeat; @@ -1404,17 +1718,17 @@ static size_t ZSTD_compressLiterals (ZST } if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) { - memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable, sizeof(prevEntropy->hufCTable)); + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); } if (cLitSize==1) { - memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable, sizeof(prevEntropy->hufCTable)); + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); } if (hType == set_compressed) { /* using a newly constructed table */ - nextEntropy->hufCTable_repeatMode = HUF_repeat_check; + nextHuf->repeatMode = HUF_repeat_check; } /* Build header */ @@ -1451,6 +1765,7 @@ void ZSTD_seqToCodes(const seqStore_t* s BYTE* const mlCodeTable = seqStorePtr->mlCode; U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); U32 u; + assert(nbSeq <= seqStorePtr->maxNbSeq); for (u=0; u<nbSeq; u++) { U32 const llv = sequences[u].litLength; U32 const mlv = sequences[u].matchLength; llCodeTable[u] = (BYTE)ZSTD_LLcode(llv); ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset); mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv); } if (seqStorePtr->longLengthID==1) llCodeTable[seqStorePtr->longLengthPos] = MaxLL; if (seqStorePtr->longLengthID==2) mlCodeTable[seqStorePtr->longLengthPos] = MaxML; } + +/** + * -log2(x / 256) lookup table for x in [0, 256). + * If x == 0: Return 0 + * Else: Return floor(-log2(x / 256) * 256) + */ +static unsigned const kInverseProbabiltyLog256[256] = { + 0, 2048, 1792, 1642, 1536, 1453, 1386, 1329, 1280, 1236, 1197, 1162, + 1130, 1100, 1073, 1047, 1024, 1001, 980, 960, 941, 923, 906, 889, + 874, 859, 844, 830, 817, 804, 791, 779, 768, 756, 745, 734, + 724, 714, 704, 694, 685, 676, 667, 658, 650, 642, 633, 626, + 618, 610, 603, 595, 588, 581, 574, 567, 561, 554, 548, 542, + 535, 529, 523, 517, 512, 506, 500, 495, 489, 484, 478, 473, + 468, 463, 458, 453, 448, 443, 438, 434, 429, 424, 420, 415, + 411, 407, 402, 398, 394, 390, 386, 382, 377, 373, 370, 366, + 362, 358, 354, 350, 347, 343, 339, 336, 332, 329, 325, 322, + 318, 315, 311, 308, 305, 302, 298, 295, 292, 289, 286, 282, + 279, 276, 273, 270, 267, 264, 261, 258, 256, 253, 250, 247, + 244, 241, 239, 236, 233, 230, 228, 225, 222, 220, 217, 215, + 212, 209, 207, 204, 202, 199, 197, 194, 192, 190, 187, 185, + 182, 180, 178, 175, 173, 171, 168, 166, 164, 162, 159, 157, + 155, 153, 151, 149, 146, 144, 142, 140, 138, 136, 134, 132, + 130, 128, 126, 123, 121, 119, 117, 115, 114, 112, 110, 108, + 106, 104, 102, 100, 98, 96, 94, 93, 91, 89, 87, 85, + 83, 82, 80, 78, 76, 74, 73, 71, 69, 67, 66, 64, + 62, 61, 59, 57, 55, 54, 52, 50, 49, 47, 46, 44, + 42, 41, 39, 37, 36, 34, 33, 31, 30, 28, 26, 25, + 23, 22, 20, 19, 17, 16, 14, 13, 11, 10, 8, 7, + 5, 4, 2, 1, +}; + + +/** + * Returns the cost in bits of encoding the distribution described by count + * using the entropy bound.
+ */ +static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t const total) +{ + unsigned cost = 0; + unsigned s; + for (s = 0; s <= max; ++s) { + unsigned norm = (unsigned)((256 * count[s]) / total); + if (count[s] != 0 && norm == 0) + norm = 1; + assert(count[s] < total); + cost += count[s] * kInverseProbabiltyLog256[norm]; + } + return cost >> 8; +} + + +/** + * Returns the cost in bits of encoding the distribution in count using the + * table described by norm. The max symbol support by norm is assumed >= max. + * norm must be valid for every symbol with non-zero probability in count. + */ +static size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + unsigned const* count, unsigned const max) +{ + unsigned const shift = 8 - accuracyLog; + size_t cost = 0; + unsigned s; + assert(accuracyLog <= 8); + for (s = 0; s <= max; ++s) { + unsigned const normAcc = norm[s] != -1 ? norm[s] : 1; + unsigned const norm256 = normAcc << shift; + assert(norm256 > 0); + assert(norm256 < 256); + cost += count[s] * kInverseProbabiltyLog256[norm256]; + } + return cost >> 8; +} + + +static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) { + void const* ptr = ctable; + U16 const* u16ptr = (U16 const*)ptr; + U32 const maxSymbolValue = MEM_read16(u16ptr + 1); + return maxSymbolValue; +} + + +/** + * Returns the cost in bits of encoding the distribution in count using ctable. + * Returns an error if ctable cannot represent all the symbols in count. + */ +static size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, + unsigned const* count, + unsigned const max) +{ + unsigned const kAccuracyLog = 8; + size_t cost = 0; + unsigned s; + FSE_CState_t cstate; + FSE_initCState(&cstate, ctable); + if (ZSTD_getFSEMaxSymbolValue(ctable) < max) { + DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u", + ZSTD_getFSEMaxSymbolValue(ctable), max); + return ERROR(GENERIC); + } + for (s = 0; s <= max; ++s) { + unsigned const tableLog = cstate.stateLog; + unsigned const badCost = (tableLog + 1) << kAccuracyLog; + unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog); + if (count[s] == 0) + continue; + if (bitCost >= badCost) { + DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s); + return ERROR(GENERIC); + } + cost += count[s] * bitCost; + } + return cost >> kAccuracyLog; +} + +/** + * Returns the cost in bytes of encoding the normalized count header. + * Returns an error if any of the helper functions return an error. 
+ */ +static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max, + size_t const nbSeq, unsigned const FSELog) +{ + BYTE wksp[FSE_NCOUNTBOUND]; + S16 norm[MaxSeq + 1]; + const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); + CHECK_F(FSE_normalizeCount(norm, tableLog, count, nbSeq, max)); + return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog); +} + + typedef enum { ZSTD_defaultDisallowed = 0, ZSTD_defaultAllowed = 1 } ZSTD_defaultPolicy_e; -MEM_STATIC -symbolEncodingType_e ZSTD_selectEncodingType( - FSE_repeat* repeatMode, size_t const mostFrequent, size_t nbSeq, - U32 defaultNormLog, ZSTD_defaultPolicy_e const isDefaultAllowed) +MEM_STATIC symbolEncodingType_e +ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, + ZSTD_defaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy) { -#define MIN_SEQ_FOR_DYNAMIC_FSE 64 -#define MAX_SEQ_FOR_STATIC_FSE 1000 ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); - if ((mostFrequent == nbSeq) && (!isDefaultAllowed || nbSeq > 2)) { + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { + /* Prefer set_basic over set_rle when there are 2 or less symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ + DEBUGLOG(5, "Selected set_basic"); + return set_basic; + } DEBUGLOG(5, "Selected set_rle"); - /* Prefer set_basic over set_rle when there are 2 or less symbols, - * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. - * If basic encoding isn't possible, always choose RLE. - */ - *repeatMode = FSE_repeat_check; return set_rle; } - if ( isDefaultAllowed - && (*repeatMode == FSE_repeat_valid) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { - DEBUGLOG(5, "Selected set_repeat"); - return set_repeat; - } - if ( isDefaultAllowed - && ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (defaultNormLog-1)))) ) { - DEBUGLOG(5, "Selected set_basic"); - /* The format allows default tables to be repeated, but it isn't useful. - * When using simple heuristics to select encoding type, we don't want - * to confuse these tables with dictionaries. When running more careful - * analysis, we don't need to waste time checking both repeating tables - * and default tables. - */ - *repeatMode = FSE_repeat_none; - return set_basic; + if (strategy < ZSTD_lazy) { + if (isDefaultAllowed) { + size_t const staticFse_nbSeq_max = 1000; + size_t const mult = 10 - strategy; + size_t const baseLog = 3; + size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog; /* 28-36 for offset, 56-72 for lengths */ + assert(defaultNormLog >= 5 && defaultNormLog <= 6); /* xx_DEFAULTNORMLOG */ + assert(mult <= 9 && mult >= 7); + if ( (*repeatMode == FSE_repeat_valid) + && (nbSeq < staticFse_nbSeq_max) ) { + DEBUGLOG(5, "Selected set_repeat"); + return set_repeat; + } + if ( (nbSeq < dynamicFse_nbSeq_min) + || (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) { + DEBUGLOG(5, "Selected set_basic"); + /* The format allows default tables to be repeated, but it isn't useful. + * When using simple heuristics to select encoding type, we don't want + * to confuse these tables with dictionaries. 
When running more careful + * analysis, we don't need to waste time checking both repeating tables + * and default tables. + */ + *repeatMode = FSE_repeat_none; + return set_basic; + } + } + } else { + size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC); + size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC); + size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog); + size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq); + + if (isDefaultAllowed) { + assert(!ZSTD_isError(basicCost)); + assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost))); + } + assert(!ZSTD_isError(NCountCost)); + assert(compressedCost < ERROR(maxCode)); + DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u", + (U32)basicCost, (U32)repeatCost, (U32)compressedCost); + if (basicCost <= repeatCost && basicCost <= compressedCost) { + DEBUGLOG(5, "Selected set_basic"); + assert(isDefaultAllowed); + *repeatMode = FSE_repeat_none; + return set_basic; + } + if (repeatCost <= compressedCost) { + DEBUGLOG(5, "Selected set_repeat"); + assert(!ZSTD_isError(repeatCost)); + return set_repeat; + } + assert(compressedCost < basicCost && compressedCost < repeatCost); } DEBUGLOG(5, "Selected set_compressed"); *repeatMode = FSE_repeat_check; return set_compressed; } -MEM_STATIC -size_t ZSTD_buildCTable(void* dst, size_t dstCapacity, - FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, - U32* count, U32 max, - BYTE const* codeTable, size_t nbSeq, - S16 const* defaultNorm, U32 defaultNormLog, U32 defaultMax, - FSE_CTable const* prevCTable, size_t prevCTableSize, - void* workspace, size_t workspaceSize) +MEM_STATIC size_t +ZSTD_buildCTable(void* dst, size_t dstCapacity, + FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, + U32* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, + const FSE_CTable* prevCTable, size_t prevCTableSize, + void* workspace, size_t workspaceSize) { BYTE* op = (BYTE*)dst; - BYTE const* const oend = op + dstCapacity; + const BYTE* const oend = op + dstCapacity; switch (type) { case set_rle: @@ -1674,7 +2162,7 @@ ZSTD_encodeSequences_bmi2( #endif -size_t ZSTD_encodeSequences( +static size_t ZSTD_encodeSequences( void* dst, size_t dstCapacity, FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, @@ -1706,10 +2194,11 @@ MEM_STATIC size_t ZSTD_compressSequences const int bmi2) { const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; U32 count[MaxSeq+1]; - FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; - FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; - FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ const seqDef* const sequences = seqStorePtr->sequencesStart; const BYTE* const ofCodeTable = seqStorePtr->ofCode; @@ -1720,15 +2209,17 @@ MEM_STATIC size_t ZSTD_compressSequences BYTE* op = ostart; size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; 
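/* [Illustrative sketch, not part of the vendored diff] The estimators above
 * (ZSTD_entropyCost, ZSTD_crossEntropyCost, ZSTD_fseBitCost, ZSTD_NCountCost)
 * let ZSTD_selectEncodingType() pick between set_basic, set_repeat and
 * set_compressed by predicted size for the stronger strategies, instead of
 * the old fixed heuristics. They work in 1/256-bit fixed point, where
 * kInverseProbabiltyLog256[norm] approximates floor(-log2(norm/256) * 256).
 * The same Shannon bound in plain floating point, with hypothetical names: */

    #include <math.h>
    #include <stdio.h>

    static double entropy_cost_bits(const unsigned* count, unsigned max, double total)
    {
        double bits = 0.0;
        unsigned s;
        for (s = 0; s <= max; s++) {
            if (count[s] == 0) continue;              /* absent symbols cost nothing */
            bits += count[s] * -log2(count[s] / total);
        }
        return bits;                                  /* lower bound on coded size */
    }

    int main(void)
    {
        unsigned const count[4] = { 60, 20, 10, 10 }; /* toy histogram of 100 codes */
        printf("entropy bound: %.1f bits\n", entropy_cost_bits(count, 3, 100.0));
        return 0;                                     /* prints about 157.1 bits */
    }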
BYTE* seqHead; + BYTE* lastNCount = NULL; ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); /* Compress literals */ { const BYTE* const literals = seqStorePtr->litStart; size_t const litSize = seqStorePtr->lit - literals; + int const disableLiteralCompression = (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0); size_t const cSize = ZSTD_compressLiterals( - prevEntropy, nextEntropy, - cctxParams->cParams.strategy, cctxParams->disableLiteralCompression, + &prevEntropy->huf, &nextEntropy->huf, + cctxParams->cParams.strategy, disableLiteralCompression, op, dstCapacity, literals, litSize, workspace, bmi2); @@ -1747,13 +2238,9 @@ MEM_STATIC size_t ZSTD_compressSequences else op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; if (nbSeq==0) { - memcpy(nextEntropy->litlengthCTable, prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable)); - nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; - memcpy(nextEntropy->offcodeCTable, prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable)); - nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; - memcpy(nextEntropy->matchlengthCTable, prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable)); - nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; - return op - ostart; + /* Copy the old tables over as if we repeated them */ + memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return op - ostart; } /* seqHead : flags for FSE encoding type */ @@ -1763,43 +2250,53 @@ MEM_STATIC size_t ZSTD_compressSequences ZSTD_seqToCodes(seqStorePtr); /* build CTable for Literal Lengths */ { U32 max = MaxLL; - size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace); + size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace); /* can't fail */ DEBUGLOG(5, "Building LL table"); - nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; - LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, mostFrequent, nbSeq, LL_defaultNormLog, ZSTD_defaultAllowed); + nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode; + LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode, count, max, mostFrequent, nbSeq, LLFSELog, prevEntropy->fse.litlengthCTable, LL_defaultNorm, LL_defaultNormLog, ZSTD_defaultAllowed, strategy); + assert(set_basic < set_compressed && set_rle < set_compressed); + assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, - count, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, - prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable), - workspace, HUF_WORKSPACE_SIZE); + count, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->fse.litlengthCTable, sizeof(prevEntropy->fse.litlengthCTable), + workspace, HUF_WORKSPACE_SIZE); if (ZSTD_isError(countSize)) return countSize; + if (LLtype == set_compressed) + lastNCount = op; op += countSize; } } /* build CTable for Offsets */ { U32 max = MaxOff; - size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace); + size_t const mostFrequent = HIST_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace); /* can't fail */ /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; DEBUGLOG(5, "Building OF table"); - nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; - Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, mostFrequent, nbSeq, OF_defaultNormLog, defaultPolicy); + nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode; + Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode, count, max, mostFrequent, nbSeq, OffFSELog, prevEntropy->fse.offcodeCTable, OF_defaultNorm, OF_defaultNormLog, defaultPolicy, strategy); + assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, - count, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, - prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable), - workspace, HUF_WORKSPACE_SIZE); + count, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->fse.offcodeCTable, sizeof(prevEntropy->fse.offcodeCTable), + workspace, HUF_WORKSPACE_SIZE); if (ZSTD_isError(countSize)) return countSize; + if (Offtype == set_compressed) + lastNCount = op; op += countSize; } } /* build CTable for MatchLengths */ { U32 max = MaxML; - size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace); + size_t const mostFrequent = HIST_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace); /* can't fail */ DEBUGLOG(5, "Building ML table"); - nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; - MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, mostFrequent, nbSeq, ML_defaultNormLog, ZSTD_defaultAllowed); + nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode; + MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode, count, max, mostFrequent, nbSeq, MLFSELog, prevEntropy->fse.matchlengthCTable, ML_defaultNorm, ML_defaultNormLog, ZSTD_defaultAllowed, strategy); + assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, - count, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, - prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable), - workspace, HUF_WORKSPACE_SIZE); + count, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->fse.matchlengthCTable, sizeof(prevEntropy->fse.matchlengthCTable), + workspace, HUF_WORKSPACE_SIZE); if (ZSTD_isError(countSize)) return countSize; + if (MLtype == set_compressed) + lastNCount = op; op += countSize; } } @@ -1814,21 +2311,37 @@ MEM_STATIC size_t ZSTD_compressSequences longOffsets, bmi2); if (ZSTD_isError(bitstreamSize)) return bitstreamSize; op += bitstreamSize; + /* zstd versions <= 1.3.4 mistakenly report corruption when + * FSE_readNCount() receives a buffer < 4 bytes. + * Fixed by https://github.com/facebook/zstd/pull/1146. + * This can happen when the last set_compressed table present is 2 + * bytes and the bitstream is only one byte. + * In this exceedingly rare case, we will simply emit an uncompressed + * block, since it isn't worth optimizing.
+ */ + if (lastNCount && (op - lastNCount) < 4) { + /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ + assert(op - lastNCount == 3); + DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " + "emitting an uncompressed block."); + return 0; + } } return op - ostart; } MEM_STATIC size_t ZSTD_compressSequences(seqStore_t* seqStorePtr, - ZSTD_entropyCTables_t const* prevEntropy, + const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, - ZSTD_CCtx_params const* cctxParams, + const ZSTD_CCtx_params* cctxParams, void* dst, size_t dstCapacity, size_t srcSize, U32* workspace, int bmi2) { size_t const cSize = ZSTD_compressSequences_internal( seqStorePtr, prevEntropy, nextEntropy, cctxParams, dst, dstCapacity, workspace, bmi2); + if (cSize == 0) return 0; /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. */ @@ -1837,40 +2350,55 @@ MEM_STATIC size_t ZSTD_compressSequences if (ZSTD_isError(cSize)) return cSize; /* Check compressibility */ - { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize); /* note : fixed formula, maybe should depend on compression level, or strategy */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); if (cSize >= maxCSize) return 0; /* block not compressed */ } - /* We check that dictionaries have offset codes available for the first - * block. After the first block, the offcode table might not have large - * enough codes to represent the offsets in the data. - */ - if (nextEntropy->offcode_repeatMode == FSE_repeat_valid) - nextEntropy->offcode_repeatMode = FSE_repeat_check; - return cSize; } /* ZSTD_selectBlockCompressor() : * Not static, but internal use only (used by long distance matcher) * assumption : strat is a valid strategy */ -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict) +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode) { - static const ZSTD_blockCompressor blockCompressor[2][(unsigned)ZSTD_btultra+1] = { + static const ZSTD_blockCompressor blockCompressor[3][(unsigned)ZSTD_btultra+1] = { { ZSTD_compressBlock_fast /* default for 0 */, - ZSTD_compressBlock_fast, ZSTD_compressBlock_doubleFast, ZSTD_compressBlock_greedy, - ZSTD_compressBlock_lazy, ZSTD_compressBlock_lazy2, ZSTD_compressBlock_btlazy2, - ZSTD_compressBlock_btopt, ZSTD_compressBlock_btultra }, + ZSTD_compressBlock_fast, + ZSTD_compressBlock_doubleFast, + ZSTD_compressBlock_greedy, + ZSTD_compressBlock_lazy, + ZSTD_compressBlock_lazy2, + ZSTD_compressBlock_btlazy2, + ZSTD_compressBlock_btopt, + ZSTD_compressBlock_btultra }, { ZSTD_compressBlock_fast_extDict /* default for 0 */, - ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_doubleFast_extDict, ZSTD_compressBlock_greedy_extDict, - ZSTD_compressBlock_lazy_extDict,ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict, - ZSTD_compressBlock_btopt_extDict, ZSTD_compressBlock_btultra_extDict } + ZSTD_compressBlock_fast_extDict, + ZSTD_compressBlock_doubleFast_extDict, + ZSTD_compressBlock_greedy_extDict, + ZSTD_compressBlock_lazy_extDict, + ZSTD_compressBlock_lazy2_extDict, + ZSTD_compressBlock_btlazy2_extDict, + ZSTD_compressBlock_btopt_extDict, + ZSTD_compressBlock_btultra_extDict }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, + 
ZSTD_compressBlock_doubleFast_dictMatchState, + ZSTD_compressBlock_greedy_dictMatchState, + ZSTD_compressBlock_lazy_dictMatchState, + ZSTD_compressBlock_lazy2_dictMatchState, + ZSTD_compressBlock_btlazy2_dictMatchState, + ZSTD_compressBlock_btopt_dictMatchState, + ZSTD_compressBlock_btultra_dictMatchState } }; + ZSTD_blockCompressor selectedCompressor; ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); assert((U32)strat >= (U32)ZSTD_fast); assert((U32)strat <= (U32)ZSTD_btultra); - return blockCompressor[extDict!=0][(U32)strat]; + selectedCompressor = blockCompressor[(int)dictMode][(U32)strat]; + assert(selectedCompressor != NULL); + return selectedCompressor; } static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr, @@ -1880,7 +2408,7 @@ static void ZSTD_storeLastLiterals(seqSt seqStorePtr->lit += lastLLSize; } -static void ZSTD_resetSeqStore(seqStore_t* ssPtr) +void ZSTD_resetSeqStore(seqStore_t* ssPtr) { ssPtr->lit = ssPtr->litStart; ssPtr->sequences = ssPtr->sequencesStart; @@ -1892,24 +2420,38 @@ static size_t ZSTD_compressBlock_interna const void* src, size_t srcSize) { ZSTD_matchState_t* const ms = &zc->blockState.matchState; - DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", - (U32)dstCapacity, ms->window.dictLimit, ms->nextToUpdate); + size_t cSize; + DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%zu, dictLimit=%u, nextToUpdate=%u)", + dstCapacity, ms->window.dictLimit, ms->nextToUpdate); + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); + if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.searchLength); - return 0; /* don't even attempt compression below a certain srcSize */ + cSize = 0; + goto out; /* don't even attempt compression below a certain srcSize */ } ZSTD_resetSeqStore(&(zc->seqStore)); + ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy; /* required for optimal parser to read stats from dictionary */ + + /* a gap between an attached dict and the current window is not safe, + * they must remain adjacent, and when that stops being the case, the dict + * must be unset */ + assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit); /* limited update after a very long match */ { const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; const U32 current = (U32)(istart-base); + if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1)); /* ensure no overflow */ if (current > ms->nextToUpdate + 384) ms->nextToUpdate = current - MIN(192, (U32)(current - ms->nextToUpdate - 384)); } /* select and store sequences */ - { U32 const extDict = ZSTD_window_hasExtDict(ms->window); + { ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms); size_t lastLLSize; { int i; for (i = 0; i < ZSTD_REP_NUM; ++i) @@ -1922,8 +2464,7 @@ static size_t ZSTD_compressBlock_interna ZSTD_ldm_blockCompress(&zc->externSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, - &zc->appliedParams.cParams, - src, srcSize, extDict); + src, srcSize); assert(zc->externSeqStore.pos <= zc->externSeqStore.size); } else if (zc->appliedParams.ldmParams.enableLdm) { rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0}; @@ -1939,31 +2480,38 @@ static size_t ZSTD_compressBlock_interna ZSTD_ldm_blockCompress(&ldmSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, - 
&zc->appliedParams.cParams, - src, srcSize, extDict); + src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); } else { /* not long range mode */ - ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, extDict); - lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, &zc->appliedParams.cParams, src, srcSize); + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode); + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); } { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); } } /* encode sequences and literals */ - { size_t const cSize = ZSTD_compressSequences(&zc->seqStore, - &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, - &zc->appliedParams, - dst, dstCapacity, - srcSize, zc->entropyWorkspace, zc->bmi2); - if (ZSTD_isError(cSize) || cSize == 0) return cSize; - /* confirm repcodes and entropy tables */ - { ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock; - zc->blockState.prevCBlock = zc->blockState.nextCBlock; - zc->blockState.nextCBlock = tmp; - } - return cSize; + cSize = ZSTD_compressSequences(&zc->seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + dst, dstCapacity, + srcSize, zc->entropyWorkspace, zc->bmi2); + +out: + if (!ZSTD_isError(cSize) && cSize != 0) { + /* confirm repcodes and entropy tables when emitting a compressed block */ + ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock; + zc->blockState.prevCBlock = zc->blockState.nextCBlock; + zc->blockState.nextCBlock = tmp; } + /* We check that dictionaries have offset codes available for the first + * block. After the first block, the offcode table might not have large + * enough codes to represent the offsets in the data. 
+ */ + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; } @@ -2005,13 +2553,13 @@ static size_t ZSTD_compress_frameChunk ( ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); - ZSTD_reduceIndex(cctx, correction); if (ms->nextToUpdate < correction) ms->nextToUpdate = 0; else ms->nextToUpdate -= correction; ms->loadedDictEnd = 0; + ms->dictMatchState = NULL; } - ZSTD_window_enforceMaxDist(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd); + ZSTD_window_enforceMaxDist(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; { size_t cSize = ZSTD_compressBlock_internal(cctx, @@ -2020,11 +2568,8 @@ static size_t ZSTD_compress_frameChunk ( if (ZSTD_isError(cSize)) return cSize; if (cSize == 0) { /* block is not compressible */ - U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(blockSize << 3); - if (blockSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall); - MEM_writeLE32(op, cBlockHeader24); /* 4th byte will be overwritten */ - memcpy(op + ZSTD_blockHeaderSize, ip, blockSize); - cSize = ZSTD_blockHeaderSize + blockSize; + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + if (ZSTD_isError(cSize)) return cSize; } else { U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); MEM_writeLE24(op, cBlockHeader24); @@ -2060,6 +2605,7 @@ static size_t ZSTD_writeFrameHeader(void BYTE const frameHeaderDecriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) ); size_t pos=0; + assert(!(params.fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)); if (dstCapacity < ZSTD_frameHeaderSize_max) return ERROR(dstSize_tooSmall); DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u", !params.fParams.noDictIDFlag, dictID, dictIDSizeCode); @@ -2122,7 +2668,7 @@ static size_t ZSTD_compressContinue_inte const void* src, size_t srcSize, U32 frame, U32 lastFrameChunk) { - ZSTD_matchState_t* ms = &cctx->blockState.matchState; + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; size_t fhSize = 0; DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", @@ -2143,8 +2689,25 @@ static size_t ZSTD_compressContinue_inte if (!ZSTD_window_update(&ms->window, src, srcSize)) { ms->nextToUpdate = ms->window.dictLimit; } - if (cctx->appliedParams.ldmParams.enableLdm) + if (cctx->appliedParams.ldmParams.enableLdm) { ZSTD_window_update(&cctx->ldmState.window, src, srcSize); + } + + if (!frame) { + /* overflow check and correction for block mode */ + if (ZSTD_window_needOverflowCorrection(ms->window, (const char*)src + srcSize)) { + U32 const cycleLog = ZSTD_cycleLog(cctx->appliedParams.cParams.chainLog, cctx->appliedParams.cParams.strategy); + U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, 1 << cctx->appliedParams.cParams.windowLog, src); + ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); + ZSTD_reduceIndex(cctx, correction); + if (ms->nextToUpdate < correction) ms->nextToUpdate = 0; + else ms->nextToUpdate -= correction; + ms->loadedDictEnd = 0; + ms->dictMatchState = NULL; + } + } 
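/* [Illustrative sketch, not part of the vendored diff] The raw-block fallback
 * above (ZSTD_noCompressBlock) and the compressed path both build the same
 * 3-byte little-endian block header: bit 0 = lastBlock, bits 1-2 = block
 * type, bits 3-23 = block size, exactly like cBlockHeader24. A hypothetical
 * standalone encoder for that header: */

    #include <stdint.h>
    #include <stdio.h>

    enum { BT_RAW = 0, BT_RLE = 1, BT_COMPRESSED = 2 };   /* mirrors bt_raw etc. */

    static void write_block_header(uint8_t dst[3], unsigned lastBlock,
                                   unsigned blockType, uint32_t blockSize)
    {
        uint32_t const header24 = lastBlock + (blockType << 1) + (blockSize << 3);
        dst[0] = (uint8_t)(header24 >>  0);               /* MEM_writeLE24 equivalent */
        dst[1] = (uint8_t)(header24 >>  8);
        dst[2] = (uint8_t)(header24 >> 16);
    }

    int main(void)
    {
        uint8_t hdr[3];
        write_block_header(hdr, 1, BT_COMPRESSED, 4096);  /* last block, 4 KB payload */
        printf("%02x %02x %02x\n", hdr[0], hdr[1], hdr[2]);  /* prints: 05 80 00 */
        return 0;
    }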
DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (U32)cctx->blockSize); { size_t const cSize = frame ? @@ -2153,7 +2716,9 @@ static size_t ZSTD_compressContinue_inte if (ZSTD_isError(cSize)) return cSize; cctx->consumedSrcSize += srcSize; cctx->producedCSize += (cSize + fhSize); - if (cctx->appliedParams.fParams.contentSizeFlag) { /* control src size */ + assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); + if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); if (cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne) { DEBUGLOG(4, "error : pledgedSrcSize = %u, while realSrcSize >= %u", (U32)cctx->pledgedSrcSizePlusOne-1, (U32)cctx->consumedSrcSize); @@ -2184,44 +2749,50 @@ size_t ZSTD_compressBlock(ZSTD_CCtx* cct { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); if (srcSize > blockSizeMax) return ERROR(srcSize_wrong); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); } /*! ZSTD_loadDictionaryContent() : * @return : 0, or an error code */ -static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const void* src, size_t srcSize) +static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, + ZSTD_dictTableLoadMethod_e dtlm) { const BYTE* const ip = (const BYTE*) src; const BYTE* const iend = ip + srcSize; - ZSTD_compressionParameters const* cParams = &params->cParams; ZSTD_window_update(&ms->window, src, srcSize); ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); + /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + if (srcSize <= HASH_READ_SIZE) return 0; switch(params->cParams.strategy) { case ZSTD_fast: - ZSTD_fillHashTable(ms, cParams, iend); + ZSTD_fillHashTable(ms, iend, dtlm); break; case ZSTD_dfast: - ZSTD_fillDoubleHashTable(ms, cParams, iend); + ZSTD_fillDoubleHashTable(ms, iend, dtlm); break; case ZSTD_greedy: case ZSTD_lazy: case ZSTD_lazy2: if (srcSize >= HASH_READ_SIZE) - ZSTD_insertAndFindFirstIndex(ms, cParams, iend-HASH_READ_SIZE); + ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE); break; case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ case ZSTD_btopt: case ZSTD_btultra: if (srcSize >= HASH_READ_SIZE) - ZSTD_updateTree(ms, cParams, iend-HASH_READ_SIZE, iend); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); break; default: @@ -2256,7 +2827,12 @@ static size_t ZSTD_checkDictNCount(short * assumptions : magic number supposed already checked * dictSize supposed > 8 */ -static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const void* dict, size_t dictSize, void* workspace) +static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_matchState_t* ms, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, + void* workspace) { const BYTE* dictPtr = (const BYTE*)dict; const BYTE* const dictEnd = dictPtr + dictSize; @@ -2265,13 +2841,15 @@ static size_t ZSTD_loadZstdDictionary(ZS size_t dictID; ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); + assert(dictSize > 8); + assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); dictPtr += 4; /* skip magic number */ dictID = params->fParams.noDictIDFlag ?
0 : MEM_readLE32(dictPtr); dictPtr += 4; { unsigned maxSymbolValue = 255; - size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.hufCTable, &maxSymbolValue, dictPtr, dictEnd-dictPtr); + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, dictEnd-dictPtr); if (HUF_isError(hufHeaderSize)) return ERROR(dictionary_corrupted); if (maxSymbolValue < 255) return ERROR(dictionary_corrupted); dictPtr += hufHeaderSize; @@ -2282,7 +2860,8 @@ static size_t ZSTD_loadZstdDictionary(ZS if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted); if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted); /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ - CHECK_E( FSE_buildCTable_wksp(bs->entropy.offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, workspace, HUF_WORKSPACE_SIZE), + /* fill all offset symbols to avoid garbage at end of table */ + CHECK_E( FSE_buildCTable_wksp(bs->entropy.fse.offcodeCTable, offcodeNCount, MaxOff, offcodeLog, workspace, HUF_WORKSPACE_SIZE), dictionary_corrupted); dictPtr += offcodeHeaderSize; } @@ -2294,7 +2873,7 @@ static size_t ZSTD_loadZstdDictionary(ZS if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted); /* Every match length code must have non-zero probability */ CHECK_F( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML)); - CHECK_E( FSE_buildCTable_wksp(bs->entropy.matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, workspace, HUF_WORKSPACE_SIZE), + CHECK_E( FSE_buildCTable_wksp(bs->entropy.fse.matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, workspace, HUF_WORKSPACE_SIZE), dictionary_corrupted); dictPtr += matchlengthHeaderSize; } @@ -2306,7 +2885,7 @@ static size_t ZSTD_loadZstdDictionary(ZS if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted); /* Every literal length code must have non-zero probability */ CHECK_F( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL)); - CHECK_E( FSE_buildCTable_wksp(bs->entropy.litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, workspace, HUF_WORKSPACE_SIZE), + CHECK_E( FSE_buildCTable_wksp(bs->entropy.fse.litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, workspace, HUF_WORKSPACE_SIZE), dictionary_corrupted); dictPtr += litlengthHeaderSize; } @@ -2332,22 +2911,25 @@ static size_t ZSTD_loadZstdDictionary(ZS if (bs->rep[u] > dictContentSize) return ERROR(dictionary_corrupted); } } - bs->entropy.hufCTable_repeatMode = HUF_repeat_valid; - bs->entropy.offcode_repeatMode = FSE_repeat_valid; - bs->entropy.matchlength_repeatMode = FSE_repeat_valid; - bs->entropy.litlength_repeatMode = FSE_repeat_valid; - CHECK_F(ZSTD_loadDictionaryContent(ms, params, dictPtr, dictContentSize)); + bs->entropy.huf.repeatMode = HUF_repeat_valid; + bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid; + bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid; + bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid; + CHECK_F(ZSTD_loadDictionaryContent(ms, params, dictPtr, dictContentSize, dtlm)); return dictID; } } /** ZSTD_compress_insertDictionary() : * @return : dictID, or an error code */ -static size_t ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matchState_t* ms, - ZSTD_CCtx_params const* params, - const void* dict, size_t dictSize, - ZSTD_dictContentType_e dictContentType, - void* workspace) +static size_t 
+ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_matchState_t* ms, + const ZSTD_CCtx_params* params, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + void* workspace) { DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); if ((dict==NULL) || (dictSize<=8)) return 0; @@ -2356,12 +2938,12 @@ static size_t ZSTD_compress_insertDictio /* dict restricted modes */ if (dictContentType == ZSTD_dct_rawContent) - return ZSTD_loadDictionaryContent(ms, params, dict, dictSize); + return ZSTD_loadDictionaryContent(ms, params, dict, dictSize, dtlm); if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { if (dictContentType == ZSTD_dct_auto) { DEBUGLOG(4, "raw content dictionary detected"); - return ZSTD_loadDictionaryContent(ms, params, dict, dictSize); + return ZSTD_loadDictionaryContent(ms, params, dict, dictSize, dtlm); } if (dictContentType == ZSTD_dct_fullDict) return ERROR(dictionary_wrong); @@ -2369,17 +2951,18 @@ static size_t ZSTD_compress_insertDictio } /* dict as full zstd dictionary */ - return ZSTD_loadZstdDictionary(bs, ms, params, dict, dictSize, workspace); + return ZSTD_loadZstdDictionary(bs, ms, params, dict, dictSize, dtlm, workspace); } /*! ZSTD_compressBegin_internal() : * @return : 0, or an error code */ -size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, - const void* dict, size_t dictSize, - ZSTD_dictContentType_e dictContentType, - const ZSTD_CDict* cdict, - ZSTD_CCtx_params params, U64 pledgedSrcSize, - ZSTD_buffered_policy_e zbuff) +static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) { DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params.cParams.windowLog); /* params are supposed to be fully validated at this point */ @@ -2387,9 +2970,7 @@ size_t ZSTD_compressBegin_internal(ZSTD_ assert(!((dict) && (cdict))); /* either dict or cdict, not both */ if (cdict && cdict->dictContentSize>0) { - cctx->requestedParams = params; - return ZSTD_resetCCtx_usingCDict(cctx, cdict, params.cParams.windowLog, - params.fParams, pledgedSrcSize, zbuff); + return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff); } CHECK_F( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, @@ -2397,7 +2978,7 @@ size_t ZSTD_compressBegin_internal(ZSTD_ { size_t const dictID = ZSTD_compress_insertDictionary( cctx->blockState.prevCBlock, &cctx->blockState.matchState, - &params, dict, dictSize, dictContentType, cctx->entropyWorkspace); + &params, dict, dictSize, dictContentType, dtlm, cctx->entropyWorkspace); if (ZSTD_isError(dictID)) return dictID; assert(dictID <= (size_t)(U32)-1); cctx->dictID = (U32)dictID; @@ -2408,6 +2989,7 @@ size_t ZSTD_compressBegin_internal(ZSTD_ size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, const ZSTD_CDict* cdict, ZSTD_CCtx_params params, unsigned long long pledgedSrcSize) @@ -2416,7 +2998,7 @@ size_t ZSTD_compressBegin_advanced_inter /* compression parameters verification and optimization */ CHECK_F( ZSTD_checkCParams(params.cParams) ); return ZSTD_compressBegin_internal(cctx, - dict, dictSize, dictContentType, + dict, dictSize, dictContentType, dtlm, cdict, params, pledgedSrcSize, ZSTDb_not_buffered); @@ -2431,7
+3013,7 @@ size_t ZSTD_compressBegin_advanced(ZSTD_ ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params); return ZSTD_compressBegin_advanced_internal(cctx, - dict, dictSize, ZSTD_dct_auto, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL /*cdict*/, cctxParams, pledgedSrcSize); } @@ -2442,7 +3024,7 @@ size_t ZSTD_compressBegin_usingDict(ZSTD ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params); DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (U32)dictSize); - return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, NULL, + return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); } @@ -2505,7 +3087,9 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx if (ZSTD_isError(cSize)) return cSize; endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize); if (ZSTD_isError(endResult)) return endResult; - if (cctx->appliedParams.fParams.contentSizeFlag) { /* control src size */ + assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); + if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); DEBUGLOG(4, "end of frame : controlling src size"); if (cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1) { DEBUGLOG(4, "error : pledgedSrcSize = %u, while realSrcSize = %u", @@ -2517,22 +3101,22 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const void* dict,size_t dictSize, - ZSTD_parameters params) + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params) { ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params); DEBUGLOG(4, "ZSTD_compress_internal"); return ZSTD_compress_advanced_internal(cctx, - dst, dstCapacity, - src, srcSize, - dict, dictSize, - cctxParams); + dst, dstCapacity, + src, srcSize, + dict, dictSize, + cctxParams); } -size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx, +size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict,size_t dictSize, @@ -2540,7 +3124,11 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx { DEBUGLOG(4, "ZSTD_compress_advanced"); CHECK_F(ZSTD_checkCParams(params.cParams)); - return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params); + return ZSTD_compress_internal(cctx, + dst, dstCapacity, + src, srcSize, + dict, dictSize, + params); } /* Internal */ @@ -2551,37 +3139,44 @@ size_t ZSTD_compress_advanced_internal( const void* dict,size_t dictSize, ZSTD_CCtx_params params) { - DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", - (U32)srcSize); - CHECK_F( ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, NULL, - params, srcSize, ZSTDb_not_buffered) ); + DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (U32)srcSize); + CHECK_F( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) ); return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); } -size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, - const void* dict, size_t dictSize, int 
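Aside: the new ZSTD_compressEnd logic keys the end-of-frame size check off pledgedSrcSizePlusOne alone. The "+1" encoding is what makes the added static assert meaningful: with ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1, an unknown pledge wraps to 0, the "no check" sentinel. A self-contained check of that arithmetic:

#include <assert.h>

#define CONTENTSIZE_UNKNOWN ((unsigned long long)-1)  /* mirrors ZSTD_CONTENTSIZE_UNKNOWN */

int main(void)
{
    unsigned long long const pledged = 123456;

    /* known size: stored as size+1, so the field is nonzero */
    assert(pledged + 1 != 0);

    /* unknown size: (-1) + 1 wraps to 0, disabling the end-of-frame check */
    assert(CONTENTSIZE_UNKNOWN + 1 == 0);
    return 0;
}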
compressionLevel) +size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, srcSize ? srcSize : 1, dict ? dictSize : 0); + ZSTD_parameters const params = ZSTD_getParams(compressionLevel, srcSize + (!srcSize), dict ? dictSize : 0); ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params); assert(params.fParams.contentSizeFlag == 1); - ZSTD_CCtxParam_setParameter(&cctxParams, ZSTD_p_compressLiterals, compressionLevel>=0); return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, cctxParams); } -size_t ZSTD_compressCCtx (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel) +size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel) { DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (U32)srcSize); + assert(cctx != NULL); return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel); } -size_t ZSTD_compress(void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel) +size_t ZSTD_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel) { size_t result; ZSTD_CCtx ctxBody; - memset(&ctxBody, 0, sizeof(ctxBody)); - ctxBody.customMem = ZSTD_defaultCMem; + ZSTD_initCCtx(&ctxBody, ZSTD_defaultCMem); result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel); - ZSTD_free(ctxBody.workSpace, ZSTD_defaultCMem); /* can't free ctxBody itself, as it's on stack; free only heap content */ + ZSTD_freeCCtxContent(&ctxBody); /* can't free ctxBody itself, as it's on stack; free only heap content */ return result; } @@ -2619,9 +3214,9 @@ static size_t ZSTD_initCDict_internal( ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams) { - DEBUGLOG(3, "ZSTD_initCDict_internal, dictContentType %u", (U32)dictContentType); + DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (U32)dictContentType); assert(!ZSTD_checkCParams(cParams)); - cdict->cParams = cParams; + cdict->matchState.cParams = cParams; if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) { cdict->dictBuffer = NULL; cdict->dictContent = dictBuffer; @@ -2654,7 +3249,7 @@ static size_t ZSTD_initCDict_internal( { size_t const dictID = ZSTD_compress_insertDictionary( &cdict->cBlockState, &cdict->matchState, ¶ms, cdict->dictContent, cdict->dictContentSize, - dictContentType, cdict->workspace); + dictContentType, ZSTD_dtlm_full, cdict->workspace); if (ZSTD_isError(dictID)) return dictID; assert(dictID <= (size_t)(U32)-1); cdict->dictID = (U32)dictID; @@ -2775,7 +3370,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict) { assert(cdict != NULL); - return cdict->cParams; + return cdict->matchState.cParams; } /* ZSTD_compressBegin_usingCDict_advanced() : @@ -2799,7 +3394,7 @@ size_t ZSTD_compressBegin_usingCDict_adv } params.fParams = fParams; return ZSTD_compressBegin_internal(cctx, - NULL, 0, ZSTD_dct_auto, + NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, cdict, params, pledgedSrcSize, ZSTDb_not_buffered); @@ -2813,7 +3408,7 @@ size_t ZSTD_compressBegin_usingCDict(ZST { ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; 
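Aside: two small cleanups above are easy to miss. The one-shot ZSTD_compress now initializes and frees its stack-allocated context through ZSTD_initCCtx/ZSTD_freeCCtxContent, and "srcSize ? srcSize : 1" becomes the branch-free "srcSize + (!srcSize)". The idiom relies on !x being exactly 1 when x == 0:

#include <assert.h>
#include <stddef.h>

int main(void)
{
    size_t s;
    for (s = 0; s < 5; s++) {
        /* !s is 1 only for s == 0, so 0 is clamped to 1 and every other
         * value passes through unchanged, with no branch */
        assert(s + (!s) == (s ? s : 1));
    }
    return 0;
}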
DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag); - return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, 0); + return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); } size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, @@ -2880,16 +3475,17 @@ size_t ZSTD_CStreamOutSize(void) static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx, const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType, const ZSTD_CDict* const cdict, - ZSTD_CCtx_params const params, unsigned long long const pledgedSrcSize) + ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize) { - DEBUGLOG(4, "ZSTD_resetCStream_internal (disableLiteralCompression=%i)", - params.disableLiteralCompression); + DEBUGLOG(4, "ZSTD_resetCStream_internal"); + /* Finalize the compression parameters */ + params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, dictSize); /* params are supposed to be fully validated at this point */ assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); assert(!((dict) && (cdict))); /* either dict or cdict, not both */ CHECK_F( ZSTD_compressBegin_internal(cctx, - dict, dictSize, dictContentType, + dict, dictSize, dictContentType, ZSTD_dtlm_fast, cdict, params, pledgedSrcSize, ZSTDb_buffered) ); @@ -2912,7 +3508,6 @@ size_t ZSTD_resetCStream(ZSTD_CStream* z DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (U32)pledgedSrcSize); if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; params.fParams.contentSizeFlag = 1; - params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, 0); return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize); } @@ -2925,6 +3520,7 @@ size_t ZSTD_initCStream_internal(ZSTD_CS ZSTD_CCtx_params params, unsigned long long pledgedSrcSize) { DEBUGLOG(4, "ZSTD_initCStream_internal"); + params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, dictSize); assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); assert(!((dict) && (cdict))); /* either dict or cdict, not both */ @@ -2991,25 +3587,21 @@ size_t ZSTD_initCStream_advanced(ZSTD_CS (U32)pledgedSrcSize, params.fParams.contentSizeFlag); CHECK_F( ZSTD_checkCParams(params.cParams) ); if ((pledgedSrcSize==0) && (params.fParams.contentSizeFlag==0)) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; /* for compatibility with older programs relying on this behavior. Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. This line will be removed in the future. 
*/ - { ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); - return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, cctxParams, pledgedSrcSize); - } + zcs->requestedParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); + return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, zcs->requestedParams, pledgedSrcSize); } size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize); - ZSTD_CCtx_params const cctxParams = - ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); - return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN); + ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel); + return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, zcs->requestedParams, ZSTD_CONTENTSIZE_UNKNOWN); } size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss) { U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; /* temporary : 0 interpreted as "unknown" during transition period. Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. `0` will be interpreted as "empty" in the future */ - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0); - ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); - return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, cctxParams, pledgedSrcSize); + ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel); + return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, zcs->requestedParams, pledgedSrcSize); } size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) @@ -3073,7 +3665,7 @@ size_t ZSTD_compressStream_generic(ZSTD_ ip = iend; op += cSize; zcs->frameEnded = 1; - ZSTD_startNewCompression(zcs); + ZSTD_CCtx_reset(zcs); someMoreWork = 0; break; } /* complete loading into inBuffer */ @@ -3126,7 +3718,7 @@ size_t ZSTD_compressStream_generic(ZSTD_ if (zcs->frameEnded) { DEBUGLOG(5, "Frame completed directly in outBuffer"); someMoreWork = 0; - ZSTD_startNewCompression(zcs); + ZSTD_CCtx_reset(zcs); } break; } @@ -3154,7 +3746,7 @@ size_t ZSTD_compressStream_generic(ZSTD_ if (zcs->frameEnded) { DEBUGLOG(5, "Frame completed on flush"); someMoreWork = 0; - ZSTD_startNewCompression(zcs); + ZSTD_CCtx_reset(zcs); break; } zcs->streamStage = zcss_load; @@ -3207,19 +3799,16 @@ size_t ZSTD_compress_generic (ZSTD_CCtx* params.cParams = ZSTD_getCParamsFromCCtxParams( &cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/); + #ifdef ZSTD_MULTITHREAD if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) { params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */ } if (params.nbWorkers > 0) { /* mt context creation */ - if (cctx->mtctx == NULL || (params.nbWorkers != ZSTDMT_getNbWorkers(cctx->mtctx))) { + if (cctx->mtctx == NULL) { DEBUGLOG(4, "ZSTD_compress_generic: creating new mtctx for nbWorkers=%u", params.nbWorkers); - if (cctx->mtctx != NULL) - DEBUGLOG(4, "ZSTD_compress_generic: previous nbWorkers was %u", - ZSTDMT_getNbWorkers(cctx->mtctx)); - ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = ZSTDMT_createCCtx_advanced(params.nbWorkers, cctx->customMem); if (cctx->mtctx == NULL) return ERROR(memory_allocation); } @@ -3251,8 +3840,9 @@ size_t ZSTD_compress_generic (ZSTD_CCtx* { size_t const flushMin = 
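Aside: the stream initializers now store the level straight into zcs->requestedParams via ZSTD_CCtxParams_init, and finished frames reset through the public ZSTD_CCtx_reset rather than the internal ZSTD_startNewCompression; that replacement only works because this version's reset rewinds frame state while keeping parameters. Together with the explicit ZSTD_CONTENTSIZE_UNKNOWN pledge seen above, a caller-side sketch of the pattern this supports (advanced API of the vendored zstd; error handling elided):

#define ZSTD_STATIC_LINKING_ONLY   /* advanced API in zstd 1.3.6 */
#include <zstd.h>

/* Sketch: compress several frames with one context. Parameters are set
 * once; each completed frame ends with a reset that keeps them. */
static void compress_one_frame(ZSTD_CCtx* cctx,
                               ZSTD_outBuffer* out, ZSTD_inBuffer* in)
{
    ZSTD_CCtx_setParameter(cctx, ZSTD_p_compressionLevel, 6);
    ZSTD_CCtx_setPledgedSrcSize(cctx, ZSTD_CONTENTSIZE_UNKNOWN);

    while (ZSTD_compress_generic(cctx, out, in, ZSTD_e_end) != 0)
        ;                          /* flush until the frame is done */

    ZSTD_CCtx_reset(cctx);         /* frame state rewound, params kept */
}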
ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp); if ( ZSTD_isError(flushMin) || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */ - ZSTD_startNewCompression(cctx); + ZSTD_CCtx_reset(cctx); } + DEBUGLOG(5, "completed ZSTD_compress_generic delegating to ZSTDMT_compressStream_generic"); return flushMin; } } #endif @@ -3308,82 +3898,83 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, #define ZSTD_MAX_CLEVEL 22 int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } +int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { { /* "default" - guarantees a monotonically increasing memory budget */ /* W, C, H, S, L, TL, strat */ { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */ - { 19, 13, 14, 1, 7, 1, ZSTD_fast }, /* level 1 */ - { 19, 15, 16, 1, 6, 1, ZSTD_fast }, /* level 2 */ - { 20, 16, 17, 1, 5, 8, ZSTD_dfast }, /* level 3 */ - { 20, 17, 18, 1, 5, 8, ZSTD_dfast }, /* level 4 */ - { 20, 17, 18, 2, 5, 16, ZSTD_greedy }, /* level 5 */ - { 21, 17, 19, 2, 5, 16, ZSTD_lazy }, /* level 6 */ - { 21, 18, 19, 3, 5, 16, ZSTD_lazy }, /* level 7 */ - { 21, 18, 20, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */ - { 21, 19, 20, 3, 5, 16, ZSTD_lazy2 }, /* level 9 */ - { 21, 19, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */ - { 22, 20, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */ + { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ + { 19, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ + { 20, 16, 17, 1, 5, 1, ZSTD_dfast }, /* level 3 */ + { 20, 18, 18, 1, 5, 1, ZSTD_dfast }, /* level 4 */ + { 20, 18, 18, 2, 5, 2, ZSTD_greedy }, /* level 5 */ + { 21, 18, 19, 2, 5, 4, ZSTD_lazy }, /* level 6 */ + { 21, 18, 19, 3, 5, 8, ZSTD_lazy2 }, /* level 7 */ + { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */ + { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ + { 21, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */ + { 21, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */ { 22, 20, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */ { 22, 21, 22, 4, 5, 32, ZSTD_btlazy2 }, /* level 13 */ { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ { 22, 22, 22, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ { 22, 21, 22, 4, 5, 48, ZSTD_btopt }, /* level 16 */ - { 23, 22, 22, 4, 4, 48, ZSTD_btopt }, /* level 17 */ - { 23, 22, 22, 5, 3, 64, ZSTD_btopt }, /* level 18 */ - { 23, 23, 22, 7, 3,128, ZSTD_btopt }, /* level 19 */ - { 25, 25, 23, 7, 3,128, ZSTD_btultra }, /* level 20 */ - { 26, 26, 24, 7, 3,256, ZSTD_btultra }, /* level 21 */ - { 27, 27, 25, 9, 3,512, ZSTD_btultra }, /* level 22 */ + { 23, 22, 22, 4, 4, 64, ZSTD_btopt }, /* level 17 */ + { 23, 23, 22, 6, 3,256, ZSTD_btopt }, /* level 18 */ + { 23, 24, 22, 7, 3,256, ZSTD_btultra }, /* level 19 */ + { 25, 25, 23, 7, 3,256, ZSTD_btultra }, /* level 20 */ + { 26, 26, 24, 7, 3,512, ZSTD_btultra }, /* level 21 */ + { 27, 27, 25, 9, 3,999, ZSTD_btultra }, /* level 22 */ }, { /* for srcSize <= 256 KB */ /* W, C, H, S, L, T, strat */ { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ - { 18, 13, 14, 1, 6, 1, ZSTD_fast }, /* level 1 */ - { 18, 14, 13, 1, 5, 8, ZSTD_dfast }, /* level 2 */ - { 18, 16, 15, 1, 5, 8, ZSTD_dfast }, /* level 3 */ - { 18, 15, 17, 1, 5, 8, ZSTD_greedy }, /* level 4.*/ - { 18, 16, 17, 4, 5, 8, ZSTD_greedy }, /* level 5.*/ - { 18, 16, 17, 3, 5, 8, ZSTD_lazy }, /* level 6.*/ - { 18, 17, 17, 4, 4, 8, ZSTD_lazy }, /* level 7 */ - { 18, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ - { 18, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ - { 
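Aside: alongside the retuned level tables, this version adds ZSTD_minCLevel() (its body is in the hunk above: the negation of ZSTD_TARGETLENGTH_MAX), so callers can bound negative "fast" levels the same way ZSTD_maxCLevel() bounds the top. A small illustrative helper:

#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_minCLevel is advanced API here */
#include <zstd.h>

/* Clamp a user-supplied level into the supported range; negative levels
 * select the "base for negative levels" rows of the tables above with
 * progressively larger targetLength (faster, worse ratio). */
static int clamp_level(int requested)
{
    int const lo = ZSTD_minCLevel();
    int const hi = ZSTD_maxCLevel();
    return requested < lo ? lo : (requested > hi ? hi : requested);
}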
18, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ - { 18, 18, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 11.*/ - { 18, 18, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 12.*/ - { 18, 19, 17, 7, 4, 8, ZSTD_btlazy2 }, /* level 13 */ - { 18, 18, 18, 4, 4, 16, ZSTD_btopt }, /* level 14.*/ - { 18, 18, 18, 4, 3, 16, ZSTD_btopt }, /* level 15.*/ - { 18, 19, 18, 6, 3, 32, ZSTD_btopt }, /* level 16.*/ - { 18, 19, 18, 8, 3, 64, ZSTD_btopt }, /* level 17.*/ - { 18, 19, 18, 9, 3,128, ZSTD_btopt }, /* level 18.*/ - { 18, 19, 18, 10, 3,256, ZSTD_btopt }, /* level 19.*/ - { 18, 19, 18, 11, 3,512, ZSTD_btultra }, /* level 20.*/ - { 18, 19, 18, 12, 3,512, ZSTD_btultra }, /* level 21.*/ - { 18, 19, 18, 13, 3,512, ZSTD_btultra }, /* level 22.*/ + { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 18, 14, 14, 1, 5, 1, ZSTD_dfast }, /* level 2 */ + { 18, 16, 16, 1, 4, 1, ZSTD_dfast }, /* level 3 */ + { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/ + { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/ + { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ + { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ + { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 18, 18, 19, 5, 4, 16, ZSTD_btlazy2 }, /* level 11.*/ + { 18, 19, 19, 6, 4, 16, ZSTD_btlazy2 }, /* level 12.*/ + { 18, 19, 19, 8, 4, 16, ZSTD_btlazy2 }, /* level 13 */ + { 18, 18, 19, 4, 4, 24, ZSTD_btopt }, /* level 14.*/ + { 18, 18, 19, 4, 3, 24, ZSTD_btopt }, /* level 15.*/ + { 18, 19, 19, 6, 3, 64, ZSTD_btopt }, /* level 16.*/ + { 18, 19, 19, 8, 3,128, ZSTD_btopt }, /* level 17.*/ + { 18, 19, 19, 10, 3,256, ZSTD_btopt }, /* level 18.*/ + { 18, 19, 19, 10, 3,256, ZSTD_btultra }, /* level 19.*/ + { 18, 19, 19, 11, 3,512, ZSTD_btultra }, /* level 20.*/ + { 18, 19, 19, 12, 3,512, ZSTD_btultra }, /* level 21.*/ + { 18, 19, 19, 13, 3,999, ZSTD_btultra }, /* level 22.*/ }, { /* for srcSize <= 128 KB */ /* W, C, H, S, L, T, strat */ - { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* level 0 - not used */ - { 17, 12, 13, 1, 6, 1, ZSTD_fast }, /* level 1 */ - { 17, 13, 16, 1, 5, 1, ZSTD_fast }, /* level 2 */ - { 17, 16, 16, 2, 5, 8, ZSTD_dfast }, /* level 3 */ - { 17, 13, 15, 3, 4, 8, ZSTD_greedy }, /* level 4 */ - { 17, 15, 17, 4, 4, 8, ZSTD_greedy }, /* level 5 */ - { 17, 16, 17, 3, 4, 8, ZSTD_lazy }, /* level 6 */ - { 17, 15, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ + { 17, 15, 16, 2, 5, 1, ZSTD_dfast }, /* level 3 */ + { 17, 17, 17, 2, 4, 1, ZSTD_dfast }, /* level 4 */ + { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ + { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ + { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ { 17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ { 17, 17, 17, 7, 4, 8, ZSTD_lazy2 }, /* level 11 */ - { 17, 17, 17, 8, 4, 8, ZSTD_lazy2 }, /* level 12 */ - { 17, 18, 17, 6, 4, 8, ZSTD_btlazy2 }, /* level 13.*/ - { 17, 17, 17, 7, 3, 8, ZSTD_btopt }, /* level 14.*/ - { 17, 17, 17, 7, 3, 16, ZSTD_btopt }, /* level 15.*/ - { 17, 18, 17, 7, 3, 32, ZSTD_btopt }, /* level 16.*/ - { 17, 18, 17, 7, 3, 64, ZSTD_btopt }, /* level 17.*/ - { 17, 18, 17, 7, 3,256, ZSTD_btopt }, /* level 18.*/ - { 17, 18, 17, 8, 3,256, ZSTD_btopt }, /* level 19.*/ + { 17, 18, 17, 6, 4, 16, ZSTD_btlazy2 }, /* 
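Aside: the four tables (default, <= 256 KB, <= 128 KB, <= 16 KB) are picked by source-size hint, since a large window buys nothing on small inputs. Elsewhere in zstd's sources the row selection is a branch-free sum of comparisons; reproduced here as an illustration, with cutoffs taken from the table banners (treat the exact shape as an assumption about this version, and note an unknown hint is normalized before this point in the real code):

/* tableID: 0 = default, 1 = <=256 KB, 2 = <=128 KB, 3 = <=16 KB */
static unsigned table_for_hint(unsigned long long srcSizeHint)
{
    return (srcSizeHint <= 256 * 1024)
         + (srcSizeHint <= 128 * 1024)
         + (srcSizeHint <=  16 * 1024);
}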
level 12 */ + { 17, 18, 17, 8, 4, 16, ZSTD_btlazy2 }, /* level 13.*/ + { 17, 18, 17, 4, 4, 32, ZSTD_btopt }, /* level 14.*/ + { 17, 18, 17, 6, 3, 64, ZSTD_btopt }, /* level 15.*/ + { 17, 18, 17, 7, 3,128, ZSTD_btopt }, /* level 16.*/ + { 17, 18, 17, 7, 3,256, ZSTD_btopt }, /* level 17.*/ + { 17, 18, 17, 8, 3,256, ZSTD_btopt }, /* level 18.*/ + { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 19.*/ { 17, 18, 17, 9, 3,256, ZSTD_btultra }, /* level 20.*/ { 17, 18, 17, 10, 3,256, ZSTD_btultra }, /* level 21.*/ { 17, 18, 17, 11, 3,512, ZSTD_btultra }, /* level 22.*/ @@ -3391,28 +3982,28 @@ static const ZSTD_compressionParameters { /* for srcSize <= 16 KB */ /* W, C, H, S, L, T, strat */ { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ - { 14, 14, 14, 1, 6, 1, ZSTD_fast }, /* level 1 */ - { 14, 14, 14, 1, 4, 1, ZSTD_fast }, /* level 2 */ - { 14, 14, 14, 1, 4, 6, ZSTD_dfast }, /* level 3.*/ - { 14, 14, 14, 4, 4, 6, ZSTD_greedy }, /* level 4.*/ - { 14, 14, 14, 3, 4, 6, ZSTD_lazy }, /* level 5.*/ - { 14, 14, 14, 4, 4, 6, ZSTD_lazy2 }, /* level 6 */ - { 14, 14, 14, 5, 4, 6, ZSTD_lazy2 }, /* level 7 */ - { 14, 14, 14, 6, 4, 6, ZSTD_lazy2 }, /* level 8.*/ - { 14, 15, 14, 6, 4, 6, ZSTD_btlazy2 }, /* level 9.*/ - { 14, 15, 14, 3, 3, 6, ZSTD_btopt }, /* level 10.*/ - { 14, 15, 14, 6, 3, 8, ZSTD_btopt }, /* level 11.*/ + { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ + { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ + { 14, 14, 14, 2, 4, 1, ZSTD_dfast }, /* level 3.*/ + { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4.*/ + { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ + { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ + { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ + { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ + { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ + { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ { 14, 15, 14, 6, 3, 16, ZSTD_btopt }, /* level 12.*/ { 14, 15, 14, 6, 3, 24, ZSTD_btopt }, /* level 13.*/ { 14, 15, 15, 6, 3, 48, ZSTD_btopt }, /* level 14.*/ { 14, 15, 15, 6, 3, 64, ZSTD_btopt }, /* level 15.*/ { 14, 15, 15, 6, 3, 96, ZSTD_btopt }, /* level 16.*/ { 14, 15, 15, 6, 3,128, ZSTD_btopt }, /* level 17.*/ - { 14, 15, 15, 6, 3,256, ZSTD_btopt }, /* level 18.*/ - { 14, 15, 15, 7, 3,256, ZSTD_btopt }, /* level 19.*/ + { 14, 15, 15, 8, 3,256, ZSTD_btopt }, /* level 18.*/ + { 14, 15, 15, 6, 3,256, ZSTD_btultra }, /* level 19.*/ { 14, 15, 15, 8, 3,256, ZSTD_btultra }, /* level 20.*/ { 14, 15, 15, 9, 3,256, ZSTD_btultra }, /* level 21.*/ - { 14, 15, 15, 10, 3,256, ZSTD_btultra }, /* level 22.*/ + { 14, 15, 15, 10, 3,512, ZSTD_btultra }, /* level 22.*/ }, }; diff --git a/contrib/python-zstandard/zstd/compress/zstd_compress_internal.h b/contrib/python-zstandard/zstd/compress/zstd_compress_internal.h --- a/contrib/python-zstandard/zstd/compress/zstd_compress_internal.h +++ b/contrib/python-zstandard/zstd/compress/zstd_compress_internal.h @@ -27,6 +27,7 @@ extern "C" { #endif + /*-************************************* * Constants ***************************************/ @@ -37,7 +38,8 @@ extern "C" { It's not a big deal though : candidate will just be sorted again. Additionnally, candidate position 1 will be lost. But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. 
- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy */ + The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy + Constant required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ /*-************************************* @@ -46,6 +48,12 @@ extern "C" { typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e; typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage; +typedef enum { + ZSTD_dictDefaultAttach = 0, + ZSTD_dictForceAttach = 1, + ZSTD_dictForceCopy = -1, +} ZSTD_dictAttachPref_e; + typedef struct ZSTD_prefixDict_s { const void* dict; size_t dictSize; @@ -53,14 +61,22 @@ typedef struct ZSTD_prefixDict_s { } ZSTD_prefixDict; typedef struct { - U32 hufCTable[HUF_CTABLE_SIZE_U32(255)]; + U32 CTable[HUF_CTABLE_SIZE_U32(255)]; + HUF_repeat repeatMode; +} ZSTD_hufCTables_t; + +typedef struct { FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)]; FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)]; FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)]; - HUF_repeat hufCTable_repeatMode; FSE_repeat offcode_repeatMode; FSE_repeat matchlength_repeatMode; FSE_repeat litlength_repeatMode; +} ZSTD_fseCTables_t; + +typedef struct { + ZSTD_hufCTables_t huf; + ZSTD_fseCTables_t fse; } ZSTD_entropyCTables_t; typedef struct { @@ -76,26 +92,27 @@ typedef struct { U32 rep[ZSTD_REP_NUM]; } ZSTD_optimal_t; +typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + typedef struct { /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ - U32* litFreq; /* table of literals statistics, of size 256 */ - U32* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ - U32* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ - U32* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ - ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ - ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ + U32* litFreq; /* table of literals statistics, of size 256 */ + U32* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + U32* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + U32* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ + ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ + ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ U32 litSum; /* nb of literals */ U32 litLengthSum; /* nb of litLength codes */ U32 matchLengthSum; /* nb of matchLength codes */ U32 offCodeSum; /* nb of offset codes */ - /* begin updated by ZSTD_setLog2Prices */ - U32 log2litSum; /* pow2 to compare log2(litfreq) to */ - U32 log2litLengthSum; /* pow2 to compare log2(llfreq) to */ - U32 log2matchLengthSum; /* pow2 to compare log2(mlfreq) to */ - U32 log2offCodeSum; /* pow2 to compare log2(offreq) to */ - /* end : updated by ZSTD_setLog2Prices */ - U32 staticPrices; /* prices follow a pre-defined cost structure, statistics are irrelevant */ + U32 litSumBasePrice; /* to compare to log2(litfreq) */ + U32 litLengthSumBasePrice; /* to compare to log2(llfreq) */ + U32 matchLengthSumBasePrice;/* to compare to log2(mlfreq) */ + U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ + ZSTD_OptPrice_e priceType; /* prices can be 
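Aside: the flat entropy struct is being split so that the Huffman state (table plus repeat mode) and the three FSE states travel as units; that is what lets the new optState_t.symbolCosts point at a complete set of pre-calculated dictionary statistics. The resulting layout, shown un-diffed for readability (types exactly as in the hunk below; depends on zstd's internal headers):

typedef struct {
    U32 CTable[HUF_CTABLE_SIZE_U32(255)];
    HUF_repeat repeatMode;
} ZSTD_hufCTables_t;

typedef struct {
    FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
    FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
    FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
    FSE_repeat offcode_repeatMode;
    FSE_repeat matchlength_repeatMode;
    FSE_repeat litlength_repeatMode;
} ZSTD_fseCTables_t;

typedef struct {
    ZSTD_hufCTables_t huf;
    ZSTD_fseCTables_t fse;
} ZSTD_entropyCTables_t;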
determined dynamically, or follow a pre-defined cost structure */ + const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ } optState_t; typedef struct { @@ -111,17 +128,20 @@ typedef struct { U32 lowLimit; /* below that point, no more data */ } ZSTD_window_t; -typedef struct { - ZSTD_window_t window; /* State for window round buffer management */ - U32 loadedDictEnd; /* index of end of dictionary */ - U32 nextToUpdate; /* index from which to continue table update */ - U32 nextToUpdate3; /* index from which to continue table update */ - U32 hashLog3; /* dispatch table : larger == faster, more memory */ +typedef struct ZSTD_matchState_t ZSTD_matchState_t; +struct ZSTD_matchState_t { + ZSTD_window_t window; /* State for window round buffer management */ + U32 loadedDictEnd; /* index of end of dictionary */ + U32 nextToUpdate; /* index from which to continue table update */ + U32 nextToUpdate3; /* index from which to continue table update */ + U32 hashLog3; /* dispatch table : larger == faster, more memory */ U32* hashTable; U32* hashTable3; U32* chainTable; optState_t opt; /* optimal parser state */ -} ZSTD_matchState_t; + const ZSTD_matchState_t *dictMatchState; + ZSTD_compressionParameters cParams; +}; typedef struct { ZSTD_compressedBlockState_t* prevCBlock; @@ -161,7 +181,7 @@ typedef struct { rawSeq* seq; /* The start of the sequences */ size_t pos; /* The position where reading stopped. <= size. */ size_t size; /* The number of sequences. <= capacity. */ - size_t capacity; /* The capacity of the `seq` pointer */ + size_t capacity; /* The capacity starting from `seq` pointer */ } rawSeqStore_t; struct ZSTD_CCtx_params_s { @@ -170,10 +190,11 @@ struct ZSTD_CCtx_params_s { ZSTD_frameParameters fParams; int compressionLevel; - int disableLiteralCompression; int forceWindow; /* force back-references to respect limit of * 1<= 6) +#if defined(DEBUGLEVEL) && (DEBUGLEVEL >= 6) static const BYTE* g_start = NULL; if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ { U32 const pos = (U32)((const BYTE*)literals - g_start); - DEBUGLOG(6, "Cpos%7u :%3u literals, match%3u bytes at dist.code%7u", + DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offsetCode); } #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); /* copy Literals */ - assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + 128 KB); + assert(seqStorePtr->maxNbLit <= 128 KB); + assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit); ZSTD_wildcopy(seqStorePtr->lit, literals, litLength); seqStorePtr->lit += litLength; @@ -420,6 +449,11 @@ ZSTD_count_2segments(const BYTE* ip, con const BYTE* const vEnd = MIN( ip + (mEnd - match), iEnd); size_t const matchLength = ZSTD_count(ip, match, vEnd); if (match + matchLength != mEnd) return matchLength; + DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength); + DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match); + DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip); + DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart); + DEBUGLOG(7, "final match length = %zu", matchLength + ZSTD_count(ip+matchLength, iStart, iEnd)); return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd); } @@ -497,6 +531,20 @@ MEM_STATIC U32 
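Aside: ZSTD_count_2segments is the workhorse for matches that begin in the extDict (or, new in this patch, the dictMatchState) segment and run on into the current prefix; the added DEBUGLOGs trace exactly that hand-off. A simplified byte-at-a-time rendition of its logic (the real code compares word-at-a-time via ZSTD_count):

#include <stddef.h>

/* Count matching bytes between ip (bounded by iEnd) and match (bounded by
 * mEnd); if the match runs all the way to mEnd, hop to iStart and keep
 * counting there: one logical match spanning two segments. */
static size_t count_2segments(const unsigned char* ip, const unsigned char* match,
                              const unsigned char* iEnd, const unsigned char* mEnd,
                              const unsigned char* iStart)
{
    size_t n = 0;
    const unsigned char* m = match;
    while (m < mEnd && ip + n < iEnd && ip[n] == *m) { n++; m++; }
    if (m != mEnd) return n;       /* mismatch before the segment boundary */
    m = iStart;                    /* boundary reached: continue in prefix */
    while (ip + n < iEnd && ip[n] == *m) { n++; m++; }
    return n;
}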
ZSTD_window_hasExtDict(ZS } /** + * ZSTD_matchState_dictMode(): + * Inspects the provided matchState and figures out what dictMode should be + * passed to the compressor. + */ +MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms) +{ + return ZSTD_window_hasExtDict(ms->window) ? + ZSTD_extDict : + ms->dictMatchState != NULL ? + ZSTD_dictMatchState : + ZSTD_noDict; +} + +/** * ZSTD_window_needOverflowCorrection(): * Returns non-zero if the indices are getting too large and need overflow * protection. @@ -563,31 +611,41 @@ MEM_STATIC U32 ZSTD_window_correctOverfl * ZSTD_window_enforceMaxDist(): * Updates lowLimit so that: * (srcEnd - base) - lowLimit == maxDist + loadedDictEnd + * * This allows a simple check that index >= lowLimit to see if index is valid. * This must be called before a block compression call, with srcEnd as the block * source end. + * * If loadedDictEndPtr is not NULL, we set it to zero once we update lowLimit. * This is because dictionaries are allowed to be referenced as long as the last * byte of the dictionary is in the window, but once they are out of range, * they cannot be referenced. If loadedDictEndPtr is NULL, we use * loadedDictEnd == 0. + * + * In normal dict mode, the dict is between lowLimit and dictLimit. In + * dictMatchState mode, lowLimit and dictLimit are the same, and the dictionary + * is below them. forceWindow and dictMatchState are therefore incompatible. */ MEM_STATIC void ZSTD_window_enforceMaxDist(ZSTD_window_t* window, void const* srcEnd, U32 maxDist, - U32* loadedDictEndPtr) + U32* loadedDictEndPtr, + const ZSTD_matchState_t** dictMatchStatePtr) { U32 const current = (U32)((BYTE const*)srcEnd - window->base); U32 loadedDictEnd = loadedDictEndPtr != NULL ? *loadedDictEndPtr : 0; + DEBUGLOG(5, "ZSTD_window_enforceMaxDist: current=%u, maxDist=%u", current, maxDist); if (current > maxDist + loadedDictEnd) { U32 const newLowLimit = current - maxDist; if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit; if (window->dictLimit < window->lowLimit) { - DEBUGLOG(5, "Update dictLimit from %u to %u", window->dictLimit, - window->lowLimit); + DEBUGLOG(5, "Update dictLimit to match lowLimit, from %u to %u", + window->dictLimit, window->lowLimit); window->dictLimit = window->lowLimit; } if (loadedDictEndPtr) *loadedDictEndPtr = 0; + if (dictMatchStatePtr) + *dictMatchStatePtr = NULL; } } @@ -603,12 +661,12 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_w { BYTE const* const ip = (BYTE const*)src; U32 contiguous = 1; + DEBUGLOG(5, "ZSTD_window_update"); /* Check if blocks follow each other */ if (src != window->nextSrc) { /* not contiguous */ size_t const distanceFromBase = (size_t)(window->nextSrc - window->base); - DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", - window->dictLimit); + DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit); window->lowLimit = window->dictLimit; assert(distanceFromBase == (size_t)(U32)distanceFromBase); /* should never overflow */ window->dictLimit = (U32)distanceFromBase; @@ -625,10 +683,38 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_w ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase; U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? 
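Aside: ZSTD_matchState_dictMode folds the three situations into one ZSTD_dictMode_e value (ZSTD_noDict, ZSTD_extDict, ZSTD_dictMatchState, as used throughout this patch). A hedged sketch of how a caller selects among the per-mode block compressors this patch introduces; it assumes the declarations from zstd_double_fast.h and zstd_compress_internal.h, and the real selector in zstd_compress.c indexes a table by strategy and mode:

typedef size_t (*blockCompressor_f)(ZSTD_matchState_t*, seqStore_t*,
                                    U32 rep[ZSTD_REP_NUM],
                                    const void* src, size_t srcSize);

static blockCompressor_f pick_doubleFast(const ZSTD_matchState_t* ms)
{
    switch (ZSTD_matchState_dictMode(ms)) {
    case ZSTD_extDict:        return ZSTD_compressBlock_doubleFast_extDict;
    case ZSTD_dictMatchState: return ZSTD_compressBlock_doubleFast_dictMatchState;
    case ZSTD_noDict:
    default:                  return ZSTD_compressBlock_doubleFast;
    }
}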
window->dictLimit : (U32)highInputIdx; window->lowLimit = lowLimitMax; + DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit); } return contiguous; } + +/* debug functions */ + +MEM_STATIC double ZSTD_fWeight(U32 rawStat) +{ + U32 const fp_accuracy = 8; + U32 const fp_multiplier = (1 << fp_accuracy); + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * fp_multiplier; + U32 const FWeight = (stat << fp_accuracy) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + fp_accuracy < 31); + return (double)weight / fp_multiplier; +} + +MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) +{ + unsigned u, sum; + for (u=0, sum=0; u<=max; u++) sum += table[u]; + DEBUGLOG(2, "total nb elts: %u", sum); + for (u=0; u<=max; u++) { + DEBUGLOG(2, "%2u: %5u (%.2f)", + u, table[u], ZSTD_fWeight(sum) - ZSTD_fWeight(table[u]) ); + } +} + #if defined (__cplusplus) } #endif @@ -640,7 +726,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_w * ============================================================== */ /* ZSTD_getCParamsFromCCtxParams() : - * cParams are built depending on compressionLevel, src size hints, + * cParams are built depending on compressionLevel, src size hints, * LDM and manually set compression parameters. */ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( @@ -656,6 +742,8 @@ size_t ZSTD_initCStream_internal(ZSTD_CS const ZSTD_CDict* cdict, ZSTD_CCtx_params params, unsigned long long pledgedSrcSize); +void ZSTD_resetSeqStore(seqStore_t* ssPtr); + /*! ZSTD_compressStream_generic() : * Private use only. To be called from zstdmt_compress.c in single-thread mode. */ size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, @@ -672,6 +760,7 @@ ZSTD_compressionParameters ZSTD_getCPara size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, const ZSTD_CDict* cdict, ZSTD_CCtx_params params, unsigned long long pledgedSrcSize); diff --git a/contrib/python-zstandard/zstd/compress/zstd_double_fast.c b/contrib/python-zstandard/zstd/compress/zstd_double_fast.c --- a/contrib/python-zstandard/zstd/compress/zstd_double_fast.c +++ b/contrib/python-zstandard/zstd/compress/zstd_double_fast.c @@ -13,9 +13,9 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, - ZSTD_compressionParameters const* cParams, - void const* end) + void const* end, ZSTD_dictTableLoadMethod_e dtlm) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashLarge = ms->hashTable; U32 const hBitsL = cParams->hashLog; U32 const mls = cParams->searchLength; @@ -40,6 +40,9 @@ void ZSTD_fillDoubleHashTable(ZSTD_match hashSmall[smHash] = current + i; if (i == 0 || hashLarge[lgHash] == 0) hashLarge[lgHash] = current + i; + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; } } } @@ -48,9 +51,10 @@ void ZSTD_fillDoubleHashTable(ZSTD_match FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_doubleFast_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize, - U32 const mls /* template */) + void const* src, size_t srcSize, + U32 const mls /* template */, ZSTD_dictMode_e const dictMode) { + ZSTD_compressionParameters const* cParams = &ms->cParams; U32* const hashLong = ms->hashTable; const U32 hBitsL = cParams->hashLog; U32* const hashSmall = ms->chainTable; @@ -59,70 +63,188 @@ size_t 
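Aside: the new debug helpers deserve a note. ZSTD_fWeight approximates log2 of a count in fixed point: the highest set bit gives the integer part and (stat << 8) >> hb a linear fractional correction, so ZSTD_debugTable's fWeight(sum) - fWeight(count) prints each symbol's approximate bit cost. A standalone check with the same arithmetic on plain types (link with -lm):

#include <math.h>
#include <stdio.h>

static unsigned highbit32(unsigned v) { unsigned r = 0; while (v >>= 1) r++; return r; }

static double fWeight(unsigned rawStat)
{
    unsigned const stat = rawStat + 1;        /* avoid log of 0 */
    unsigned const hb = highbit32(stat);      /* integer part of log2 */
    unsigned const bW = hb << 8;              /* 8 fractional bits */
    unsigned const fW = (stat << 8) >> hb;    /* linear correction, in [256,512) */
    return (double)(bW + fW) / 256.0;
}

int main(void)
{
    /* a symbol seen 125 times out of 1000 should cost ~log2(8) = 3 bits */
    printf("approx %.2f bits, exact %.2f bits\n",
           fWeight(1000) - fWeight(125), log2(1000.0 / 125.0));
    return 0;
}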
ZSTD_compressBlock_doubleFast_gen const BYTE* const istart = (const BYTE*)src; const BYTE* ip = istart; const BYTE* anchor = istart; - const U32 lowestIndex = ms->window.dictLimit; - const BYTE* const lowest = base + lowestIndex; + const U32 prefixLowestIndex = ms->window.dictLimit; + const BYTE* const prefixLowest = base + prefixLowestIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; U32 offset_1=rep[0], offset_2=rep[1]; U32 offsetSaved = 0; + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = + dictMode == ZSTD_dictMatchState ? + &dms->cParams : NULL; + const U32* const dictHashLong = dictMode == ZSTD_dictMatchState ? + dms->hashTable : NULL; + const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ? + dms->chainTable : NULL; + const U32 dictStartIndex = dictMode == ZSTD_dictMatchState ? + dms->window.dictLimit : 0; + const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ? + dms->window.base : NULL; + const BYTE* const dictStart = dictMode == ZSTD_dictMatchState ? + dictBase + dictStartIndex : NULL; + const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ? + dms->window.nextSrc : NULL; + const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ? + prefixLowestIndex - (U32)(dictEnd - dictBase) : + 0; + const U32 dictHBitsL = dictMode == ZSTD_dictMatchState ? + dictCParams->hashLog : hBitsL; + const U32 dictHBitsS = dictMode == ZSTD_dictMatchState ? + dictCParams->chainLog : hBitsS; + const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictStart); + + assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState); + /* init */ - ip += (ip==lowest); - { U32 const maxRep = (U32)(ip-lowest); + ip += (dictAndPrefixLength == 0); + if (dictMode == ZSTD_noDict) { + U32 const maxRep = (U32)(ip - prefixLowest); if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; } + if (dictMode == ZSTD_dictMatchState) { + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + } /* Main Search Loop */ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ size_t mLength; + U32 offset; size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); + size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); + size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); U32 const current = (U32)(ip-base); U32 const matchIndexL = hashLong[h2]; - U32 const matchIndexS = hashSmall[h]; + U32 matchIndexS = hashSmall[h]; const BYTE* matchLong = base + matchIndexL; const BYTE* match = base + matchIndexS; + const U32 repIndex = current + 1 - offset_1; + const BYTE* repMatch = (dictMode == ZSTD_dictMatchState + && repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; hashLong[h2] = hashSmall[h] = current; /* update hash tables */ - assert(offset_1 <= current); /* supposed guaranteed by construction */ - if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { - /* favor repcode */ + /* check dictMatchState repcode */ + if (dictMode == ZSTD_dictMatchState + && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
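Aside: indices pulled from the dictionary's own hash/chain tables live in the dictionary's index space; dictIndexDelta (computed above as prefixLowestIndex - (dictEnd - dictBase)) shifts them into the local space so offsets come out right, as in the "current - dictMatchIndexL - dictIndexDelta" computations. A self-contained arithmetic check with made-up geometry:

#include <assert.h>
typedef unsigned U32;

int main(void)
{
    /* hypothetical geometry: 100-byte dict, prefix starts at index 1000 */
    U32 const dictContentLen    = 100;
    U32 const prefixLowestIndex = 1000;
    U32 const dictIndexDelta    = prefixLowestIndex - dictContentLen; /* 900 */

    U32 const dictMatchIndex = 40;       /* match found in the dict's table */
    U32 const current        = 1016;     /* current position, local space */

    /* translate into local index space, then into an offset */
    U32 const localIndex = dictMatchIndex + dictIndexDelta;   /* 940 */
    U32 const offset     = current - localIndex;              /* 76 bytes back */
    assert(offset == 76);

    /* the last dict byte maps to local index 999, just below the prefix */
    assert(dictContentLen - 1 + dictIndexDelta == prefixLowestIndex - 1);
    return 0;
}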
dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH); + goto _match_stored; + } + + /* check noDict repcode */ + if ( dictMode == ZSTD_noDict + && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; ip++; ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH); - } else { - U32 offset; - if ( (matchIndexL > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip)) ) { + goto _match_stored; + } + + if (matchIndexL > prefixLowestIndex) { + /* check prefix long match */ + if (MEM_read64(matchLong) == MEM_read64(ip)) { mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; offset = (U32)(ip-matchLong); - while (((ip>anchor) & (matchLong>lowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ - } else if ( (matchIndexS > lowestIndex) && (MEM_read32(match) == MEM_read32(ip)) ) { - size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); - U32 const matchIndexL3 = hashLong[hl3]; - const BYTE* matchL3 = base + matchIndexL3; - hashLong[hl3] = current + 1; - if ( (matchIndexL3 > lowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1)) ) { + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } + } else if (dictMode == ZSTD_dictMatchState) { + /* check dictMatchState long match */ + U32 const dictMatchIndexL = dictHashLong[dictHL]; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + + if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) { + mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8; + offset = (U32)(current - dictMatchIndexL - dictIndexDelta); + while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */ + goto _match_found; + } + } + + if (matchIndexS > prefixLowestIndex) { + /* check prefix short match */ + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } + } else if (dictMode == ZSTD_dictMatchState) { + /* check dictMatchState short match */ + U32 const dictMatchIndexS = dictHashSmall[dictHS]; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + + if (match > dictStart && MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } + } + + ip += ((ip-anchor) >> kSearchStrength) + 1; + continue; + +_search_next_long: + + { + size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); + size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = current + 1; + + /* check prefix long +1 match */ + if (matchIndexL3 > prefixLowestIndex) { + if (MEM_read64(matchL3) == MEM_read64(ip+1)) { mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; ip++; offset = (U32)(ip-matchL3); - while (((ip>anchor) & (matchL3>lowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ - } else { - mLength = ZSTD_count(ip+4, match+4, iend) + 4; - offset = (U32)(ip-match); - while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; } - } else { - ip += ((ip-anchor) 
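Aside: the repcode guard above, (U32)((prefixLowestIndex-1) - repIndex) >= 3 with its "intentional underflow" comment, does two jobs in one unsigned compare: indices at or above prefixLowestIndex wrap around and pass, while dict-side indices pass only if the 4 bytes read at repMatch stay below the boundary. A standalone demonstration:

#include <assert.h>
typedef unsigned U32;

/* Nonzero when a 4-byte read starting at repIndex does not straddle the
 * dict/prefix boundary (or repIndex is already inside the prefix). */
static int rep_ok(U32 prefixLowestIndex, U32 repIndex)
{
    return (U32)((prefixLowestIndex - 1) - repIndex) >= 3;
}

int main(void)
{
    U32 const p = 1000;             /* prefixLowestIndex */
    assert( rep_ok(p, 900));        /* fully inside the dict segment */
    assert( rep_ok(p, 996));        /* bytes 996..999 still below boundary */
    assert(!rep_ok(p, 997));        /* bytes 997..1000 would straddle it */
    assert(!rep_ok(p, 999));
    assert( rep_ok(p, 1000));       /* in the prefix: subtraction wraps */
    assert( rep_ok(p, 1500));
    return 0;
}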
>> kSearchStrength) + 1; - continue; + } else if (dictMode == ZSTD_dictMatchState) { + /* check dict long +1 match */ + U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { + mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8; + ip++; + offset = (U32)(current + 1 - dictMatchIndexL3 - dictIndexDelta); + while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */ + goto _match_found; + } } - - offset_2 = offset_1; - offset_1 = offset; - - ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); } + /* if no long +1 match, explore the short match we found */ + if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) { + mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4; + offset = (U32)(current - matchIndexS); + while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } else { + mLength = ZSTD_count(ip+4, match+4, iend) + 4; + offset = (U32)(ip - match); + while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } + + /* fall-through */ + +_match_found: + offset_2 = offset_1; + offset_1 = offset; + + ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + +_match_stored: /* match found */ ip += mLength; anchor = ip; @@ -135,19 +257,44 @@ size_t ZSTD_compressBlock_doubleFast_gen hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base); /* check immediate repcode */ - while ( (ip <= ilimit) - && ( (offset_2>0) - & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { - /* store sequence */ - size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); - ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH); - ip += rLength; - anchor = ip; - continue; /* faster when present ... (?) */ - } } } + if (dictMode == ZSTD_dictMatchState) { + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState + && repIndex2 < prefixLowestIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } + } + + if (dictMode == ZSTD_noDict) { + while ( (ip <= ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } } /* save reps for next block */ rep[0] = offset_1 ? offset_1 : offsetSaved; @@ -160,102 +307,126 @@ size_t ZSTD_compressBlock_doubleFast_gen size_t ZSTD_compressBlock_doubleFast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) { - const U32 mls = cParams->searchLength; + const U32 mls = ms->cParams.searchLength; switch(mls) { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 4); + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict); case 5 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 5); + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict); case 6 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 6); + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict); case 7 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 7); + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict); + } +} + + +size_t ZSTD_compressBlock_doubleFast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + const U32 mls = ms->cParams.searchLength; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState); + case 5 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState); + case 6 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState); + case 7 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState); } } static size_t ZSTD_compressBlock_doubleFast_extDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize, + void const* src, size_t srcSize, U32 const mls /* template */) { + ZSTD_compressionParameters const* cParams = &ms->cParams; U32* const hashLong = ms->hashTable; U32 const hBitsL = cParams->hashLog; U32* const hashSmall = ms->chainTable; U32 const hBitsS = 
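Aside: the switch (mls) wrappers above are the template trick this file leans on: ZSTD_compressBlock_doubleFast_generic is FORCE_INLINE_TEMPLATE, so each case calls it with compile-time-constant mls and dictMode, and the compiler emits one specialized body per pair with every "dictMode == ..." test folded away. The pattern in miniature (toy hash, not zstd's):

static inline unsigned toy_hash(const unsigned char* p, unsigned mls)
{
    unsigned h = 0, i;
    for (i = 0; i < mls; i++)      /* constant-trip loop when mls is a literal */
        h = h * 31u + p[i];
    return h;
}

static unsigned dispatch(const unsigned char* p, unsigned mls)
{
    switch (mls) {
    default:
    case 4: return toy_hash(p, 4); /* each case specializes the inline body */
    case 5: return toy_hash(p, 5);
    case 6: return toy_hash(p, 6);
    case 7: return toy_hash(p, 7);
    }
}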
cParams->chainLog; - const BYTE* const base = ms->window.base; - const BYTE* const dictBase = ms->window.dictBase; const BYTE* const istart = (const BYTE*)src; const BYTE* ip = istart; const BYTE* anchor = istart; - const U32 lowestIndex = ms->window.lowLimit; - const BYTE* const dictStart = dictBase + lowestIndex; - const U32 dictLimit = ms->window.dictLimit; - const BYTE* const lowPrefixPtr = base + dictLimit; - const BYTE* const dictEnd = dictBase + dictLimit; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - 8; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const base = ms->window.base; + const BYTE* const prefixStart = base + prefixStartIndex; + const U32 dictStartIndex = ms->window.lowLimit; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dictBase + prefixStartIndex; U32 offset_1=rep[0], offset_2=rep[1]; + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize); + /* Search Loop */ while (ip < ilimit) { /* < instead of <=, because (ip+1) */ const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls); const U32 matchIndex = hashSmall[hSmall]; - const BYTE* matchBase = matchIndex < dictLimit ? dictBase : base; + const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; const BYTE* match = matchBase + matchIndex; const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8); const U32 matchLongIndex = hashLong[hLong]; - const BYTE* matchLongBase = matchLongIndex < dictLimit ? dictBase : base; + const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base; const BYTE* matchLong = matchLongBase + matchLongIndex; const U32 current = (U32)(ip-base); const U32 repIndex = current + 1 - offset_1; /* offset_1 expected <= current +1 */ - const BYTE* repBase = repIndex < dictLimit ? dictBase : base; - const BYTE* repMatch = repBase + repIndex; + const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; size_t mLength; hashSmall[hSmall] = hashLong[hLong] = current; /* update hash table */ - if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) - && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { - const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend; - mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4; + if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ + & (repIndex > dictStartIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; ip++; ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH); } else { - if ((matchLongIndex > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { - const BYTE* matchEnd = matchLongIndex < dictLimit ? dictEnd : iend; - const BYTE* lowMatchPtr = matchLongIndex < dictLimit ? dictStart : lowPrefixPtr; + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? 
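Aside: the renames above (lowestIndex and dictLimit becoming dictStartIndex and prefixStartIndex) make the extDict geometry explicit: one index space, two buffers. Resolving an index to a pointer is a single comparison against prefixStartIndex, exactly as the match lookups here do:

typedef unsigned char BYTE;
typedef unsigned U32;

/* Indices below prefixStartIndex live in the old (dict) buffer, the rest
 * in the current one; both bases are positioned so ptr == base + index
 * holds within each segment. */
static const BYTE* index_to_ptr(U32 matchIndex, U32 prefixStartIndex,
                                const BYTE* dictBase, const BYTE* base)
{
    return (matchIndex < prefixStartIndex ? dictBase : base) + matchIndex;
}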
dictStart : prefixStart; U32 offset; - mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, lowPrefixPtr) + 8; + mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8; offset = current - matchLongIndex; while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); - } else if ((matchIndex > lowestIndex) && (MEM_read32(match) == MEM_read32(ip))) { + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); U32 const matchIndex3 = hashLong[h3]; - const BYTE* const match3Base = matchIndex3 < dictLimit ? dictBase : base; + const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base; const BYTE* match3 = match3Base + matchIndex3; U32 offset; hashLong[h3] = current + 1; - if ( (matchIndex3 > lowestIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) { - const BYTE* matchEnd = matchIndex3 < dictLimit ? dictEnd : iend; - const BYTE* lowMatchPtr = matchIndex3 < dictLimit ? dictStart : lowPrefixPtr; - mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, lowPrefixPtr) + 8; + if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) { + const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? dictStart : prefixStart; + mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8; ip++; offset = current+1 - matchIndex3; while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */ } else { - const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend; - const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr; - mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, lowPrefixPtr) + 4; + const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; + mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; offset = current - matchIndex; while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ } @@ -282,12 +453,13 @@ static size_t ZSTD_compressBlock_doubleF while (ip <= ilimit) { U32 const current2 = (U32)(ip-base); U32 const repIndex2 = current2 - offset_2; - const BYTE* repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional overflow */ - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { - const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4; - U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ + & (repIndex2 > dictStartIndex)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
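Aside: each match site ends with the same "catch up" loop, extending the match backwards while earlier bytes also agree, bounded by anchor (literals already committed) and the lowest valid pointer of the match's segment. Factored out for clarity (illustrative; the real code keeps it inline in the hot loop):

#include <stddef.h>

typedef unsigned char BYTE;

/* Mirrors the repeated
 *   while (((ip>anchor) & (match>lowPtr)) && (ip[-1] == match[-1]))
 * loops: walk both pointers backwards, growing the match length. */
static size_t catch_up(const BYTE** ip, const BYTE** match, size_t mLength,
                       const BYTE* anchor, const BYTE* lowPtr)
{
    while ((*ip > anchor) && (*match > lowPtr) && ((*ip)[-1] == (*match)[-1])) {
        (*ip)--; (*match)--; mLength++;
    }
    return mLength;
}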
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH); hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; @@ -309,19 +481,19 @@ static size_t ZSTD_compressBlock_doubleF size_t ZSTD_compressBlock_doubleFast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) { - U32 const mls = cParams->searchLength; + U32 const mls = ms->cParams.searchLength; switch(mls) { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 4); + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); case 5 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 5); + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); case 6 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 6); + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); case 7 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 7); + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); } } diff --git a/contrib/python-zstandard/zstd/compress/zstd_double_fast.h b/contrib/python-zstandard/zstd/compress/zstd_double_fast.h --- a/contrib/python-zstandard/zstd/compress/zstd_double_fast.h +++ b/contrib/python-zstandard/zstd/compress/zstd_double_fast.h @@ -19,14 +19,16 @@ extern "C" { #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, - ZSTD_compressionParameters const* cParams, - void const* end); + void const* end, ZSTD_dictTableLoadMethod_e dtlm); size_t ZSTD_compressBlock_doubleFast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_doubleFast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_doubleFast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); #if defined (__cplusplus) diff --git a/contrib/python-zstandard/zstd/compress/zstd_fast.c b/contrib/python-zstandard/zstd/compress/zstd_fast.c --- a/contrib/python-zstandard/zstd/compress/zstd_fast.c +++ b/contrib/python-zstandard/zstd/compress/zstd_fast.c @@ -13,9 +13,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, - ZSTD_compressionParameters const* cParams, - void const* end) + void const* end, ZSTD_dictTableLoadMethod_e dtlm) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; U32 const hBits = cParams->hashLog; U32 const mls = cParams->searchLength; @@ -34,6 +34,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_ size_t const hash = ZSTD_hashPtr(ip + i, hBits, mls); if (i == 0 || hashTable[hash] == 0) hashTable[hash] = current + 
i; + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; } } } @@ -42,26 +45,65 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_fast_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const hlog, U32 const stepSize, U32 const mls) + U32 const mls, ZSTD_dictMode_e const dictMode) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; const BYTE* ip = istart; const BYTE* anchor = istart; - const U32 lowestIndex = ms->window.dictLimit; - const BYTE* const lowest = base + lowestIndex; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; U32 offset_1=rep[0], offset_2=rep[1]; U32 offsetSaved = 0; + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = + dictMode == ZSTD_dictMatchState ? + &dms->cParams : NULL; + const U32* const dictHashTable = dictMode == ZSTD_dictMatchState ? + dms->hashTable : NULL; + const U32 dictStartIndex = dictMode == ZSTD_dictMatchState ? + dms->window.dictLimit : 0; + const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ? + dms->window.base : NULL; + const BYTE* const dictStart = dictMode == ZSTD_dictMatchState ? + dictBase + dictStartIndex : NULL; + const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ? + dms->window.nextSrc : NULL; + const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ? + prefixStartIndex - (U32)(dictEnd - dictBase) : + 0; + const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); + const U32 dictHLog = dictMode == ZSTD_dictMatchState ? + dictCParams->hashLog : hlog; + + assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState); + + /* otherwise, we would get index underflow when translating a dict index + * into a local index */ + assert(dictMode != ZSTD_dictMatchState + || prefixStartIndex >= (U32)(dictEnd - dictBase)); + /* init */ - ip += (ip==lowest); - { U32 const maxRep = (U32)(ip-lowest); + ip += (dictAndPrefixLength == 0); + if (dictMode == ZSTD_noDict) { + U32 const maxRep = (U32)(ip - prefixStart); if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; } + if (dictMode == ZSTD_dictMatchState) { + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + } /* Main Search Loop */ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ @@ -70,26 +112,67 @@ size_t ZSTD_compressBlock_fast_generic( U32 const current = (U32)(ip-base); U32 const matchIndex = hashTable[h]; const BYTE* match = base + matchIndex; + const U32 repIndex = current + 1 - offset_1; + const BYTE* repMatch = (dictMode == ZSTD_dictMatchState + && repIndex < prefixStartIndex) ? 
+ dictBase + (repIndex - dictIndexDelta) : + base + repIndex; hashTable[h] = current; /* update hash table */ - if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + if ( (dictMode == ZSTD_dictMatchState) + && ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH); + } else if ( dictMode == ZSTD_noDict + && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; ip++; ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH); - } else { - if ( (matchIndex <= lowestIndex) - || (MEM_read32(match) != MEM_read32(ip)) ) { + } else if ( (matchIndex <= prefixStartIndex) ) { + if (dictMode == ZSTD_dictMatchState) { + size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); + U32 const dictMatchIndex = dictHashTable[dictHash]; + const BYTE* dictMatch = dictBase + dictMatchIndex; + if (dictMatchIndex <= dictStartIndex || + MEM_read32(dictMatch) != MEM_read32(ip)) { + assert(stepSize >= 1); + ip += ((ip-anchor) >> kSearchStrength) + stepSize; + continue; + } else { + /* found a dict match */ + U32 const offset = (U32)(current-dictMatchIndex-dictIndexDelta); + mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; + while (((ip>anchor) & (dictMatch>dictStart)) + && (ip[-1] == dictMatch[-1])) { + ip--; dictMatch--; mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + } + } else { assert(stepSize >= 1); ip += ((ip-anchor) >> kSearchStrength) + stepSize; continue; } + } else if (MEM_read32(match) != MEM_read32(ip)) { + /* it's not a match, and we're not going to check the dictionary */ + assert(stepSize >= 1); + ip += ((ip-anchor) >> kSearchStrength) + stepSize; + continue; + } else { + /* found a regular match */ + U32 const offset = (U32)(ip-match); mLength = ZSTD_count(ip+4, match+4, iend) + 4; - { U32 const offset = (U32)(ip-match); - while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ - offset_2 = offset_1; - offset_1 = offset; - ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); - } } + while (((ip>anchor) & (match>prefixStart)) + && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + } /* match found */ ip += mLength; @@ -97,21 +180,46 @@ size_t ZSTD_compressBlock_fast_generic( if (ip <= ilimit) { /* Fill Table */ + assert(base+current+2 > istart); /* check base overflow */ hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2; /* here because current+2 could be > iend-8 */ hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); + /* check immediate repcode */ - while ( (ip <= ilimit) - && ( (offset_2>0) - & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { - /* store sequence */ - size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ - 
hashTable[ZSTD_hashPtr(ip, hlog, mls)] = (U32)(ip-base); - ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH); - ip += rLength; - anchor = ip; - continue; /* faster when present ... (?) */ - } } } + if (dictMode == ZSTD_dictMatchState) { + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH); + hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } + } + + if (dictMode == ZSTD_noDict) { + while ( (ip <= ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashTable[ZSTD_hashPtr(ip, hlog, mls)] = (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } } /* save reps for next block */ rep[0] = offset_1 ? offset_1 : offsetSaved; @@ -124,42 +232,66 @@ size_t ZSTD_compressBlock_fast_generic( size_t ZSTD_compressBlock_fast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) { - U32 const hlog = cParams->hashLog; + ZSTD_compressionParameters const* cParams = &ms->cParams; U32 const mls = cParams->searchLength; - U32 const stepSize = cParams->targetLength; + assert(ms->dictMatchState == NULL); switch(mls) { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4); + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict); case 5 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5); + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict); case 6 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6); + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict); case 7 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7); + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict); + } +} + +size_t ZSTD_compressBlock_fast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32 const mls = cParams->searchLength; + assert(ms->dictMatchState != NULL); + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState); + case 5 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5, 
ZSTD_dictMatchState); + case 6 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState); + case 7 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState); } } static size_t ZSTD_compressBlock_fast_extDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, - U32 const hlog, U32 const stepSize, U32 const mls) + void const* src, size_t srcSize, U32 const mls) { - U32* hashTable = ms->hashTable; + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); const BYTE* const base = ms->window.base; const BYTE* const dictBase = ms->window.dictBase; const BYTE* const istart = (const BYTE*)src; const BYTE* ip = istart; const BYTE* anchor = istart; - const U32 lowestIndex = ms->window.lowLimit; - const BYTE* const dictStart = dictBase + lowestIndex; - const U32 dictLimit = ms->window.dictLimit; - const BYTE* const lowPrefixPtr = base + dictLimit; - const BYTE* const dictEnd = dictBase + dictLimit; + const U32 dictStartIndex = ms->window.lowLimit; + const BYTE* const dictStart = dictBase + dictStartIndex; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const dictEnd = dictBase + prefixStartIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - 8; U32 offset_1=rep[0], offset_2=rep[1]; @@ -167,33 +299,34 @@ static size_t ZSTD_compressBlock_fast_ex /* Search Loop */ while (ip < ilimit) { /* < instead of <=, because (ip+1) */ const size_t h = ZSTD_hashPtr(ip, hlog, mls); - const U32 matchIndex = hashTable[h]; - const BYTE* matchBase = matchIndex < dictLimit ? dictBase : base; - const BYTE* match = matchBase + matchIndex; - const U32 current = (U32)(ip-base); - const U32 repIndex = current + 1 - offset_1; /* offset_1 expected <= current +1 */ - const BYTE* repBase = repIndex < dictLimit ? dictBase : base; - const BYTE* repMatch = repBase + repIndex; + const U32 matchIndex = hashTable[h]; + const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; + const BYTE* match = matchBase + matchIndex; + const U32 current = (U32)(ip-base); + const U32 repIndex = current + 1 - offset_1; + const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; size_t mLength; hashTable[h] = current; /* update hash table */ + assert(offset_1 <= current +1); /* check repIndex */ - if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) + if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex)) && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { - const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend; - mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4; + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; ip++; ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH); } else { - if ( (matchIndex < lowestIndex) || + if ( (matchIndex < dictStartIndex) || (MEM_read32(match) != MEM_read32(ip)) ) { assert(stepSize >= 1); ip += ((ip-anchor) >> kSearchStrength) + stepSize; continue; } - { const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend; - const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr; + { const BYTE* matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; U32 offset; - mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, lowPrefixPtr) + 4; + mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ offset = current - matchIndex; offset_2 = offset_1; @@ -213,11 +346,11 @@ static size_t ZSTD_compressBlock_fast_ex while (ip <= ilimit) { U32 const current2 = (U32)(ip-base); U32 const repIndex2 = current2 - offset_2; - const BYTE* repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional overflow */ + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { - const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4; + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH); hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; @@ -239,21 +372,20 @@ static size_t ZSTD_compressBlock_fast_ex size_t ZSTD_compressBlock_fast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) { - U32 const hlog = cParams->hashLog; + ZSTD_compressionParameters const* cParams = &ms->cParams; U32 const mls = cParams->searchLength; - U32 const stepSize = cParams->targetLength; switch(mls) { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4); + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); case 5 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5); + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); case 6 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6); + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); case 7 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7); + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); } } diff --git a/contrib/python-zstandard/zstd/compress/zstd_fast.h b/contrib/python-zstandard/zstd/compress/zstd_fast.h --- a/contrib/python-zstandard/zstd/compress/zstd_fast.h +++ b/contrib/python-zstandard/zstd/compress/zstd_fast.h @@ -19,14 +19,16 @@ extern "C" { #include "zstd_compress_internal.h" void ZSTD_fillHashTable(ZSTD_matchState_t* ms, - ZSTD_compressionParameters const* cParams, - void const* end); + void const* end, ZSTD_dictTableLoadMethod_e dtlm); size_t ZSTD_compressBlock_fast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_fast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_fast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); #if defined (__cplusplus) } diff --git a/contrib/python-zstandard/zstd/compress/zstd_lazy.c b/contrib/python-zstandard/zstd/compress/zstd_lazy.c --- a/contrib/python-zstandard/zstd/compress/zstd_lazy.c +++ b/contrib/python-zstandard/zstd/compress/zstd_lazy.c @@ -16,11 +16,12 @@ * Binary Tree search ***************************************/ -void ZSTD_updateDUBT( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, +static void +ZSTD_updateDUBT(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend, U32 mls) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; U32 const hashLog = cParams->hashLog; @@ -59,11 +60,12 @@ void ZSTD_updateDUBT( * sort one already inserted but unsorted position * assumption : current >= btlow == (current - btmask) * doesn't fail */ -static void 
ZSTD_insertDUBT1( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, +static void +ZSTD_insertDUBT1(ZSTD_matchState_t* ms, U32 current, const BYTE* inputEnd, - U32 nbCompares, U32 btLow, int extDict) + U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const bt = ms->chainTable; U32 const btLog = cParams->chainLog - 1; U32 const btMask = (1 << btLog) - 1; @@ -92,10 +94,12 @@ static void ZSTD_insertDUBT1( size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ assert(matchIndex < current); - if ( (!extDict) + if ( (dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit) /* both in current segment*/ || (current < dictLimit) /* both in extDict */) { - const BYTE* const mBase = !extDict || ((matchIndex+matchLength) >= dictLimit) ? base : dictBase; + const BYTE* const mBase = ( (dictMode != ZSTD_extDict) + || (matchIndex+matchLength >= dictLimit)) ? + base : dictBase; assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */ || (current < dictLimit) ); match = mBase + matchIndex; @@ -138,13 +142,95 @@ static void ZSTD_insertDUBT1( } -static size_t ZSTD_DUBT_findBestMatch ( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, - const BYTE* const ip, const BYTE* const iend, - size_t* offsetPtr, - U32 const mls, - U32 const extDict) +static size_t +ZSTD_DUBT_findBetterDictMatch ( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + U32 nbCompares, + U32 const mls, + const ZSTD_dictMode_e dictMode) { + const ZSTD_matchState_t * const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; + const U32 * const dictHashTable = dms->hashTable; + U32 const hashLog = dmsCParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 dictMatchIndex = dictHashTable[h]; + + const BYTE* const base = ms->window.base; + const BYTE* const prefixStart = base + ms->window.dictLimit; + U32 const current = (U32)(ip-base); + const BYTE* const dictBase = dms->window.base; + const BYTE* const dictEnd = dms->window.nextSrc; + U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base); + U32 const dictLowLimit = dms->window.lowLimit; + U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit; + + U32* const dictBt = dms->chainTable; + U32 const btLog = dmsCParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? 
dictLowLimit : dictHighLimit - btMask; + + size_t commonLengthSmaller=0, commonLengthLarger=0, bestLength=0; + U32 matchEndIdx = current+8+1; + + (void)dictMode; + assert(dictMode == ZSTD_dictMatchState); + + while (nbCompares-- && (dictMatchIndex > dictLowLimit)) { + U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match = dictBase + dictMatchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (dictMatchIndex+matchLength >= dictHighLimit) + match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */ + + if (matchLength > bestLength) { + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(2, "ZSTD_DUBT_findBestDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", + current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex); + bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex; + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + DEBUGLOG(2, "matchLength:%6zu, match:%p, prefixStart:%p, ip:%p", matchLength, match, prefixStart, ip); + if (match[matchLength] < ip[matchLength]) { + if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ + commonLengthLarger = matchLength; + dictMatchIndex = nextPtr[0]; + } + } + + if (bestLength >= MINMATCH) { + U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + DEBUGLOG(2, "ZSTD_DUBT_findBestDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + current, (U32)bestLength, (U32)*offsetPtr, mIndex); + } + return bestLength; + +} + + +static size_t +ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; U32 const hashLog = cParams->hashLog; size_t const h = ZSTD_hashPtr(ip, hashLog, mls); @@ -195,8 +281,8 @@ static size_t ZSTD_DUBT_findBestMatch ( while (matchIndex) { /* will end on matchIndex == 0 */ U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1; U32 const nextCandidateIdx = *nextCandidateIdxPtr; - ZSTD_insertDUBT1(ms, cParams, matchIndex, iend, - nbCandidates, unsortLimit, extDict); + ZSTD_insertDUBT1(ms, matchIndex, iend, + nbCandidates, unsortLimit, dictMode); matchIndex = nextCandidateIdx; nbCandidates++; } @@ -221,7 +307,7 @@ static size_t ZSTD_DUBT_findBestMatch ( size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* 
guaranteed minimum nb of common bytes */ const BYTE* match; - if ((!extDict) || (matchIndex+matchLength >= dictLimit)) { + if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) { match = base + matchIndex; matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); } else { @@ -259,6 +345,10 @@ static size_t ZSTD_DUBT_findBestMatch ( *smallerPtr = *largerPtr = 0; + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch(ms, ip, iend, offsetPtr, nbCompares, mls, dictMode); + } + assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ if (bestLength >= MINMATCH) { @@ -272,61 +362,64 @@ static size_t ZSTD_DUBT_findBestMatch ( /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */ -static size_t ZSTD_BtFindBestMatch ( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, - const BYTE* const ip, const BYTE* const iLimit, - size_t* offsetPtr, - const U32 mls /* template */) +FORCE_INLINE_TEMPLATE size_t +ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) { DEBUGLOG(7, "ZSTD_BtFindBestMatch"); if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ - ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls); - return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, 0); + ZSTD_updateDUBT(ms, ip, iLimit, mls); + return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); } -static size_t ZSTD_BtFindBestMatch_selectMLS ( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +static size_t +ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) { - switch(cParams->searchLength) + switch(ms->cParams.searchLength) { default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 4); - case 5 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 5); + case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); + case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 6); + case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); } } -/** Tree updater, providing best match */ -static size_t ZSTD_BtFindBestMatch_extDict ( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, - const BYTE* const ip, const BYTE* const iLimit, - size_t* offsetPtr, - const U32 mls) +static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) { - DEBUGLOG(7, "ZSTD_BtFindBestMatch_extDict"); - if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ - ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls); - return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, 1); + switch(ms->cParams.searchLength) + { + default : /* includes case 3 */ + case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); + case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); + case 7 : + case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, 
ZSTD_dictMatchState); + } } -static size_t ZSTD_BtFindBestMatch_selectMLS_extDict ( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, +static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( + ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* const iLimit, size_t* offsetPtr) { - switch(cParams->searchLength) + switch(ms->cParams.searchLength) { default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 4); - case 5 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 5); + case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); + case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); case 7 : - case 6 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 6); + case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); } } @@ -340,7 +433,8 @@ static size_t ZSTD_BtFindBestMatch_selec /* Update chains up to ip (excluded) Assumption : always within prefix (i.e. not within extDict) */ static U32 ZSTD_insertAndFindFirstIndex_internal( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, const BYTE* ip, U32 const mls) { U32* const hashTable = ms->hashTable; @@ -362,22 +456,21 @@ static U32 ZSTD_insertAndFindFirstIndex_ return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; } -U32 ZSTD_insertAndFindFirstIndex( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, - const BYTE* ip) -{ - return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, cParams->searchLength); +U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.searchLength); } /* inlining is important to hardwire a hot branch (template emulation) */ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_generic ( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, size_t* offsetPtr, - const U32 mls, const U32 extDict) + const U32 mls, const ZSTD_dictMode_e dictMode) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const chainTable = ms->chainTable; const U32 chainSize = (1 << cParams->chainLog); const U32 chainMask = chainSize-1; @@ -397,7 +490,7 @@ size_t ZSTD_HcFindBestMatch_generic ( for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) { size_t currentMl=0; - if ((!extDict) || matchIndex >= dictLimit) { + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { const BYTE* const match = base + matchIndex; if (match[ml] == ip[ml]) /* potentially better */ currentMl = ZSTD_count(ip, match, iLimit); @@ -419,38 +512,87 @@ size_t ZSTD_HcFindBestMatch_generic ( matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask); } + if (dictMode == ZSTD_dictMatchState) { + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const U32* const dmsChainTable = dms->chainTable; + const U32 dmsChainSize = (1 << dms->cParams.chainLog); + const U32 dmsChainMask = dmsChainSize - 1; + const U32 dmsLowestIndex = dms->window.dictLimit; + const BYTE* const dmsBase = dms->window.base; + const BYTE* const dmsEnd = dms->window.nextSrc; + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + const U32 dmsMinChain = dmsSize > dmsChainSize ? 
dmsSize - dmsChainSize : 0; + + matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)]; + + for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + const BYTE* const match = dmsBase + matchIndex; + assert(match+4 <= dmsEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE; + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + + if (matchIndex <= dmsMinChain) break; + matchIndex = dmsChainTable[matchIndex & dmsChainMask]; + } + } + return ml; } FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* const iLimit, size_t* offsetPtr) { - switch(cParams->searchLength) + switch(ms->cParams.searchLength) { default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, 0); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, 0); + case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); + case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, 0); + case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); + } +} + + +static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.searchLength) + { + default : /* includes case 3 */ + case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); + case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); + case 7 : + case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); } } FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* const iLimit, - size_t* const offsetPtr) + size_t* offsetPtr) { - switch(cParams->searchLength) + switch(ms->cParams.searchLength) { default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, 1); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, 1); + case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); + case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, 1); + case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); } } @@ -462,30 +604,55 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, const void* src, size_t srcSize, - const U32 searchMethod, const U32 depth) + const U32 searchMethod, const U32 depth, + ZSTD_dictMode_e const dictMode) { const BYTE* const 
istart = (const BYTE*)src; const BYTE* ip = istart; const BYTE* anchor = istart; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - 8; - const BYTE* const base = ms->window.base + ms->window.dictLimit; + const BYTE* const base = ms->window.base; + const U32 prefixLowestIndex = ms->window.dictLimit; + const BYTE* const prefixLowest = base + prefixLowestIndex; typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - searchMax_f const searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS; + searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ? + (searchMethod ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) : + (searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS); U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const U32 dictLowestIndex = dictMode == ZSTD_dictMatchState ? + dms->window.dictLimit : 0; + const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ? + dms->window.base : NULL; + const BYTE* const dictLowest = dictMode == ZSTD_dictMatchState ? + dictBase + dictLowestIndex : NULL; + const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ? + dms->window.nextSrc : NULL; + const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ? + prefixLowestIndex - (U32)(dictEnd - dictBase) : + 0; + const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictLowest); + /* init */ - ip += (ip==base); + ip += (dictAndPrefixLength == 0); ms->nextToUpdate3 = ms->nextToUpdate; - { U32 const maxRep = (U32)(ip-base); + if (dictMode == ZSTD_noDict) { + U32 const maxRep = (U32)(ip - prefixLowest); if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; } + if (dictMode == ZSTD_dictMatchState) { + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + } /* Match Loop */ while (ip < ilimit) { @@ -494,15 +661,28 @@ size_t ZSTD_compressBlock_lazy_generic( const BYTE* start=ip+1; /* check repCode */ - if ((offset_1>0) & (MEM_read32(ip+1) == MEM_read32(ip+1 - offset_1))) { - /* repcode : we take it */ + if (dictMode == ZSTD_dictMatchState) { + const U32 repIndex = (U32)(ip - base) + 1 - offset_1; + const BYTE* repMatch = (dictMode == ZSTD_dictMatchState + && repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + if (depth==0) goto _storeSequence; + } + } + if ( dictMode == ZSTD_noDict + && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; if (depth==0) goto _storeSequence; } /* first search (depth 0) */ - { size_t offsetFound = 99999999; - size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound); + { size_t offsetFound = 999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); if (ml2 > matchLength) matchLength = ml2, start = ip, offset=offsetFound; } @@ -516,15 +696,31 @@ size_t ZSTD_compressBlock_lazy_generic( if (depth>=1) while (ip<ilimit) { ip ++; - if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + if ( (dictMode == ZSTD_noDict) + && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; int const gain2 = (int)(mlRep * 3); int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); if ((mlRep >= 4) && (gain2 > gain1)) matchLength = mlRep, offset = 0, start = ip; } - { size_t offset2=99999999; - size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2); + if (dictMode == ZSTD_dictMatchState) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offset = 0, start = ip; + } + } + { size_t offset2=999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offset2); int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { @@ -535,15 +731,31 @@ size_t ZSTD_compressBlock_lazy_generic( /* let's find an even better one */ if ((depth==2) && (ip<ilimit)) { ip ++; - if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { - size_t const ml2 = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; - int const gain2 = (int)(ml2 * 4); + if ( (dictMode == ZSTD_noDict) + && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); - if ((ml2 >= 4) && (gain2 > gain1)) - matchLength = ml2, offset = 0, start = ip; + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offset = 0, start = ip; } - { size_t offset2=99999999; - size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2); + if (dictMode == ZSTD_dictMatchState) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offset = 0, start = ip; + } + } + { size_t offset2=999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offset2); int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { @@ -560,9 +772,17 @@ size_t ZSTD_compressBlock_lazy_generic( */ /* catch up */ if (offset) { - while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > base)) - && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */ - { start--; matchLength++; } + if (dictMode == ZSTD_noDict) { + while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest)) + && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (dictMode == ZSTD_dictMatchState) { + U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); } /* store sequence */ @@ -573,16 +793,39 @@ size_t ZSTD_compressBlock_lazy_generic( } /* check immediate repcode */ - while ( ((ip <= ilimit) & (offset_2>0)) - && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { - /* store sequence */ - matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ - ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH); - ip += matchLength; - anchor = ip; - continue; /* faster when present ... (?) */ - } } + if (dictMode == ZSTD_dictMatchState) { + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex = current2 - offset_2; + const BYTE* repMatch = dictMode == ZSTD_dictMatchState + && repIndex < prefixLowestIndex ? + dictBase - dictIndexDelta + repIndex : + base + repIndex; + if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; + offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH); + ip += matchLength; + anchor = ip; + continue; + } + break; + } + } + + if (dictMode == ZSTD_noDict) { + while ( ((ip <= ilimit) & (offset_2>0)) + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } /* Save reps for next block */ rep[0] = offset_1 ? 
offset_1 : savedOffset; @@ -595,30 +838,58 @@ size_t ZSTD_compressBlock_lazy_generic( size_t ZSTD_compressBlock_btlazy2( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 1, 2, ZSTD_noDict); } size_t ZSTD_compressBlock_lazy2( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 2, ZSTD_noDict); } size_t ZSTD_compressBlock_lazy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 1, ZSTD_noDict); } size_t ZSTD_compressBlock_greedy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 0, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 1, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 1, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 0, ZSTD_dictMatchState); } @@ -626,7 +897,6 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_extDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, const void* src, size_t srcSize, const U32 searchMethod, const U32 depth) { @@ -644,9 +914,9 @@ size_t ZSTD_compressBlock_lazy_extDict_g const BYTE* const dictStart = dictBase + lowestIndex; typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS_extDict : ZSTD_HcFindBestMatch_extDict_selectMLS; + searchMax_f searchMax = searchMethod ? 
ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS; U32 offset_1 = rep[0], offset_2 = rep[1]; @@ -674,8 +944,8 @@ size_t ZSTD_compressBlock_lazy_extDict_g } } /* first search (depth 0) */ - { size_t offsetFound = 99999999; - size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound); + { size_t offsetFound = 999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); if (ml2 > matchLength) matchLength = ml2, start = ip, offset=offsetFound; } @@ -707,8 +977,8 @@ size_t ZSTD_compressBlock_lazy_extDict_g } } /* search match, depth 1 */ - { size_t offset2=99999999; - size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2); + { size_t offset2=999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offset2); int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { @@ -737,8 +1007,8 @@ size_t ZSTD_compressBlock_lazy_extDict_g } } /* search match, depth 2 */ - { size_t offset2=99999999; - size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2); + { size_t offset2=999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offset2); int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { @@ -794,31 +1064,31 @@ size_t ZSTD_compressBlock_lazy_extDict_g size_t ZSTD_compressBlock_greedy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0); + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 0); } size_t ZSTD_compressBlock_lazy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1); + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 1); } size_t ZSTD_compressBlock_lazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2); + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 2); } size_t ZSTD_compressBlock_btlazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2); + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 1, 2); } diff --git a/contrib/python-zstandard/zstd/compress/zstd_lazy.h b/contrib/python-zstandard/zstd/compress/zstd_lazy.h --- a/contrib/python-zstandard/zstd/compress/zstd_lazy.h +++ b/contrib/python-zstandard/zstd/compress/zstd_lazy.h @@ -17,37 +17,48 @@ extern "C" { #include "zstd_compress_internal.h" -U32 ZSTD_insertAndFindFirstIndex( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, - 
const BYTE* ip); +U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). pre-emptively increase value of ZSTD_DUBT_UNSORTED_MARK */ size_t ZSTD_compressBlock_btlazy2( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); size_t ZSTD_compressBlock_lazy2( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); size_t ZSTD_compressBlock_lazy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); size_t ZSTD_compressBlock_greedy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_greedy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); size_t ZSTD_compressBlock_lazy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); size_t ZSTD_compressBlock_lazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); size_t ZSTD_compressBlock_btlazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); #if defined (__cplusplus) } diff --git a/contrib/python-zstandard/zstd/compress/zstd_ldm.c b/contrib/python-zstandard/zstd/compress/zstd_ldm.c --- a/contrib/python-zstandard/zstd/compress/zstd_ldm.c +++ b/contrib/python-zstandard/zstd/compress/zstd_ldm.c @@ -9,6 +9,7 @@ #include "zstd_ldm.h" +#include "debug.h" #include "zstd_fast.h" /* ZSTD_fillHashTable() */ #include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ @@ -20,7 +21,7 @@ void ZSTD_ldm_adjustParameters(ldmParams_t* params, ZSTD_compressionParameters const* cParams) { - U32 const windowLog = cParams->windowLog; + params->windowLog = cParams->windowLog; ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; @@ -33,12 +34,13 @@ void ZSTD_ldm_adjustParameters(ldmParams 
params->minMatchLength = minMatch; } if (params->hashLog == 0) { - params->hashLog = MAX(ZSTD_HASHLOG_MIN, windowLog - LDM_HASH_RLOG); + params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG); assert(params->hashLog <= ZSTD_HASHLOG_MAX); } if (params->hashEveryLog == 0) { - params->hashEveryLog = - windowLog < params->hashLog ? 0 : windowLog - params->hashLog; + params->hashEveryLog = params->windowLog < params->hashLog + ? 0 + : params->windowLog - params->hashLog; } params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); } @@ -216,21 +218,18 @@ static size_t ZSTD_ldm_countBackwardsMat * The tables for the other strategies are filled within their * block compressors. */ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, - ZSTD_compressionParameters const* cParams, void const* end) { const BYTE* const iend = (const BYTE*)end; - switch(cParams->strategy) + switch(ms->cParams.strategy) { case ZSTD_fast: - ZSTD_fillHashTable(ms, cParams, iend); - ms->nextToUpdate = (U32)(iend - ms->window.base); + ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); break; case ZSTD_dfast: - ZSTD_fillDoubleHashTable(ms, cParams, iend); - ms->nextToUpdate = (U32)(iend - ms->window.base); + ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); break; case ZSTD_greedy: @@ -508,7 +507,7 @@ size_t ZSTD_ldm_generateSequences( * * Try invalidation after the sequence generation and test the * the offset against maxDist directly. */ - ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL); + ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL, NULL); /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */ newLeftoverSize = ZSTD_ldm_generateSequences_internal( ldmState, sequences, params, chunkStart, chunkSize); @@ -591,19 +590,19 @@ static rawSeq maybeSplitSequence(rawSeqS size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize, - int const extDict) + void const* src, size_t srcSize) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; unsigned const minMatch = cParams->searchLength; ZSTD_blockCompressor const blockCompressor = - ZSTD_selectBlockCompressor(cParams->strategy, extDict); - BYTE const* const base = ms->window.base; + ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms)); /* Input bounds */ BYTE const* const istart = (BYTE const*)src; BYTE const* const iend = istart + srcSize; /* Input positions */ BYTE const* ip = istart; + DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize); assert(rawSeqStore->pos <= rawSeqStore->size); assert(rawSeqStore->size <= rawSeqStore->capacity); /* Loop through each sequence and apply the block compressor to the lits */ @@ -621,14 +620,13 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor /* Fill tables for block compressor */ ZSTD_ldm_limitTableUpdate(ms, ip); - ZSTD_ldm_fillFastTables(ms, cParams, ip); + ZSTD_ldm_fillFastTables(ms, ip); /* Run the block compressor */ + DEBUGLOG(5, "calling block compressor on segment of size %u", sequence.litLength); { size_t const newLitLength = - blockCompressor(ms, seqStore, rep, cParams, ip, - sequence.litLength); + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); ip += sequence.litLength; - ms->nextToUpdate = (U32)(ip - base); /* Update the repcodes */ for (i = ZSTD_REP_NUM - 1; i > 0; i--) rep[i] = rep[i-1]; @@ -642,12 +640,7 @@ size_t 
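/* Illustrative aside on ZSTD_ldm_adjustParameters() above (not part of the
 * vendored diff). Assuming upstream's constants at this release
 * (LDM_HASH_RLOG == 7 in zstd_ldm.c, ZSTD_HASHLOG_MIN == 6), a 128 MB window
 * works out as:
 *
 *   windowLog    = 27
 *   hashLog      = MAX(6, 27 - 7) = 20     // 2^20 hash buckets
 *   hashEveryLog = 27 - 20 = 7             // sample 1 position in 2^7
 *
 * i.e. the long-distance matcher inserts roughly one table entry every
 * 128 bytes of input, keeping the table size proportional to the window. */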
ZSTD_ldm_blockCompress(rawSeqStor } /* Fill the tables for the block compressor */ ZSTD_ldm_limitTableUpdate(ms, ip); - ZSTD_ldm_fillFastTables(ms, cParams, ip); + ZSTD_ldm_fillFastTables(ms, ip); /* Compress the last literals */ - { - size_t const lastLiterals = blockCompressor(ms, seqStore, rep, cParams, - ip, iend - ip); - ms->nextToUpdate = (U32)(iend - base); - return lastLiterals; - } + return blockCompressor(ms, seqStore, rep, ip, iend - ip); } diff --git a/contrib/python-zstandard/zstd/compress/zstd_ldm.h b/contrib/python-zstandard/zstd/compress/zstd_ldm.h --- a/contrib/python-zstandard/zstd/compress/zstd_ldm.h +++ b/contrib/python-zstandard/zstd/compress/zstd_ldm.h @@ -61,9 +61,7 @@ size_t ZSTD_ldm_generateSequences( */ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, - void const* src, size_t srcSize, - int const extDict); + void const* src, size_t srcSize); /** * ZSTD_ldm_skipSequences(): diff --git a/contrib/python-zstandard/zstd/compress/zstd_opt.c b/contrib/python-zstandard/zstd/compress/zstd_opt.c --- a/contrib/python-zstandard/zstd/compress/zstd_opt.c +++ b/contrib/python-zstandard/zstd/compress/zstd_opt.c @@ -9,10 +9,11 @@ */ #include "zstd_compress_internal.h" +#include "hist.h" #include "zstd_opt.h" -#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats. Also used for matchSum (?) */ +#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ #define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to init next stats */ #define ZSTD_MAX_PRICE (1<<30) @@ -20,128 +21,210 @@ /*-************************************* * Price functions for optimal parser ***************************************/ -static void ZSTD_setLog2Prices(optState_t* optPtr) + +#if 0 /* approximation at bit level */ +# define BITCOST_ACCURACY 0 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat) ((void)opt, ZSTD_bitWeight(stat)) +#elif 0 /* fractional bit accuracy */ +# define BITCOST_ACCURACY 8 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) +#else /* opt==approx, ultra==accurate */ +# define BITCOST_ACCURACY 8 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat,opt) (opt ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) +#endif + +MEM_STATIC U32 ZSTD_bitWeight(U32 stat) +{ + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); +} + +MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) { - optPtr->log2litSum = ZSTD_highbit32(optPtr->litSum+1); - optPtr->log2litLengthSum = ZSTD_highbit32(optPtr->litLengthSum+1); - optPtr->log2matchLengthSum = ZSTD_highbit32(optPtr->matchLengthSum+1); - optPtr->log2offCodeSum = ZSTD_highbit32(optPtr->offCodeSum+1); + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); + return weight; +} + +/* debugging function, @return price in bytes */ +MEM_STATIC double ZSTD_fCost(U32 price) +{ + return (double)price / (BITCOST_MULTIPLIER*8); +} + +static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) +{ + optPtr->litSumBasePrice = WEIGHT(optPtr->litSum, optLevel); + optPtr->litLengthSumBasePrice = WEIGHT(optPtr->litLengthSum, optLevel); + optPtr->matchLengthSumBasePrice = WEIGHT(optPtr->matchLengthSum, optLevel); + optPtr->offCodeSumBasePrice = WEIGHT(optPtr->offCodeSum, optLevel); } +static U32 ZSTD_downscaleStat(U32* table, U32 lastEltIndex, int malus) +{ + U32 s, sum=0; + assert(ZSTD_FREQ_DIV+malus > 0 && ZSTD_FREQ_DIV+malus < 31); + for (s=0; s<=lastEltIndex; s++) { + table[s] = 1 + (table[s] >> (ZSTD_FREQ_DIV+malus)); + sum += table[s]; + } + return sum; +} + static void ZSTD_rescaleFreqs(optState_t* const optPtr, - const BYTE* const src, size_t const srcSize) + const BYTE* const src, size_t const srcSize, + int optLevel) { - optPtr->staticPrices = 0; + optPtr->priceType = zop_dynamic; + + if (optPtr->litLengthSum == 0) { /* first block : init */ + if (srcSize <= 1024) /* heuristic */ + optPtr->priceType = zop_predef; + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { /* huffman table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; - if (optPtr->litLengthSum == 0) { /* first init */ - unsigned u; - if (srcSize <= 1024) optPtr->staticPrices = 1; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; + { unsigned lit; + for (lit=0; lit<=MaxLit; lit++) { + U32 const scaleLog = 11; /* scale to 2K */ + U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit); + assert(bitCost <= scaleLog); + optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->litSum += optPtr->litFreq[lit]; + } } + + { unsigned ll; + FSE_CState_t llstate; + FSE_initCState(&llstate, optPtr->symbolCosts->fse.litlengthCTable); + optPtr->litLengthSum = 0; + for (ll=0; ll<=MaxLL; ll++) { + U32 const scaleLog = 10; /* scale to 1K */ + U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll); + assert(bitCost < scaleLog); + optPtr->litLengthFreq[ll] = bitCost ? 
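/* Illustrative aside on WEIGHT()/ZSTD_fracWeight() above (not part of the
 * vendored diff). Prices are kept in 1/256ths of a bit (BITCOST_ACCURACY == 8);
 * ZSTD_fracWeight() refines the integer log2 with a linear fractional term so
 * that, e.g., counts of 1000 and 1500 no longer price identically.
 * A standalone sketch:
 *
 *   #include <stdio.h>
 *
 *   static unsigned highbit32(unsigned v)
 *   { unsigned r = 0; while (v >>= 1) r++; return r; }
 *
 *   static unsigned fracWeight(unsigned rawStat)
 *   {
 *       unsigned const stat = rawStat + 1;           // avoid log2(0)
 *       unsigned const hb = highbit32(stat);         // integer part of log2
 *       unsigned const bWeight = hb << 8;            // whole bits, 1/256 units
 *       unsigned const fWeight = (stat << 8) >> hb;  // in [256,512): 256*(1+frac)
 *       return bWeight + fWeight;                    // ~256 * (1 + log2(stat))
 *   }
 *
 *   int main(void)   // prints 2804; 2804/256 ~= 10.95 ~= 1 + log2(1001)
 *   { printf("%u\n", fracWeight(1000)); return 0; }
 *
 * The constant +1-bit offset cancels out, because prices are always used as
 * differences against a WEIGHT() of the corresponding sum. */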
1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->litLengthSum += optPtr->litLengthFreq[ll]; + } } - assert(optPtr->litFreq!=NULL); - for (u=0; u<=MaxLit; u++) - optPtr->litFreq[u] = 0; - for (u=0; u<srcSize; u++) - optPtr->litFreq[src[u]]++; - optPtr->litSum = 0; - for (u=0; u<=MaxLit; u++) { - optPtr->litFreq[u] = 1 + (optPtr->litFreq[u] >> ZSTD_FREQ_DIV); - optPtr->litSum += optPtr->litFreq[u]; + { unsigned ml; + FSE_CState_t mlstate; + FSE_initCState(&mlstate, optPtr->symbolCosts->fse.matchlengthCTable); + optPtr->matchLengthSum = 0; + for (ml=0; ml<=MaxML; ml++) { + U32 const scaleLog = 10; + U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml); + assert(bitCost < scaleLog); + optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->matchLengthSum += optPtr->matchLengthFreq[ml]; + } } + + { unsigned of; + FSE_CState_t ofstate; + FSE_initCState(&ofstate, optPtr->symbolCosts->fse.offcodeCTable); + optPtr->offCodeSum = 0; + for (of=0; of<=MaxOff; of++) { + U32 const scaleLog = 10; + U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of); + assert(bitCost < scaleLog); + optPtr->offCodeFreq[of] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + + } else { /* not a dictionary */ + + assert(optPtr->litFreq != NULL); + { unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ + } + optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); + + { unsigned ll; + for (ll=0; ll<=MaxLL; ll++) + optPtr->litLengthFreq[ll] = 1; + } + optPtr->litLengthSum = MaxLL+1; + + { unsigned ml; + for (ml=0; ml<=MaxML; ml++) + optPtr->matchLengthFreq[ml] = 1; + } + optPtr->matchLengthSum = MaxML+1; + + { unsigned of; + for (of=0; of<=MaxOff; of++) + optPtr->offCodeFreq[of] = 1; + } + optPtr->offCodeSum = MaxOff+1; + } - for (u=0; u<=MaxLL; u++) - optPtr->litLengthFreq[u] = 1; - optPtr->litLengthSum = MaxLL+1; - for (u=0; u<=MaxML; u++) - optPtr->matchLengthFreq[u] = 1; - optPtr->matchLengthSum = MaxML+1; - for (u=0; u<=MaxOff; u++) - optPtr->offCodeFreq[u] = 1; - optPtr->offCodeSum = (MaxOff+1); - - } else { - unsigned u; + } else { /* new block : re-use previous statistics, scaled down */ - optPtr->litSum = 0; - for (u=0; u<=MaxLit; u++) { - optPtr->litFreq[u] = 1 + (optPtr->litFreq[u] >> (ZSTD_FREQ_DIV+1)); - optPtr->litSum += optPtr->litFreq[u]; - } - optPtr->litLengthSum = 0; - for (u=0; u<=MaxLL; u++) { - optPtr->litLengthFreq[u] = 1 + (optPtr->litLengthFreq[u]>>(ZSTD_FREQ_DIV+1)); - optPtr->litLengthSum += optPtr->litLengthFreq[u]; - } - optPtr->matchLengthSum = 0; - for (u=0; u<=MaxML; u++) { - optPtr->matchLengthFreq[u] = 1 + (optPtr->matchLengthFreq[u]>>ZSTD_FREQ_DIV); - optPtr->matchLengthSum += optPtr->matchLengthFreq[u]; - } - optPtr->offCodeSum = 0; - for (u=0; u<=MaxOff; u++) { - optPtr->offCodeFreq[u] = 1 + (optPtr->offCodeFreq[u]>>ZSTD_FREQ_DIV); - optPtr->offCodeSum += optPtr->offCodeFreq[u]; - } + optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); + optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0); + optPtr->matchLengthSum = ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0); + optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0); } - ZSTD_setLog2Prices(optPtr); + ZSTD_setBasePrices(optPtr, optLevel); } - /* ZSTD_rawLiteralsCost() : - * cost of literals (only) in given segment (which length can be null) - * does not include cost 
of literalLength symbol */ + * price of literals (only) in specified segment (which length can be 0). + * does not include price of literalLength symbol */ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, - const optState_t* const optPtr) + const optState_t* const optPtr, + int optLevel) { - if (optPtr->staticPrices) return (litLength*6); /* 6 bit per literal - no statistic used */ if (litLength == 0) return 0; + if (optPtr->priceType == zop_predef) + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ - /* literals */ - { U32 u; - U32 cost = litLength * optPtr->log2litSum; - for (u=0; u < litLength; u++) - cost -= ZSTD_highbit32(optPtr->litFreq[literals[u]]+1); - return cost; + /* dynamic statistics */ + { U32 price = litLength * optPtr->litSumBasePrice; + U32 u; + for (u=0; u < litLength; u++) { + assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ + price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); + } + return price; } } /* ZSTD_litLengthPrice() : * cost of literalLength symbol */ -static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr) +static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel) { - if (optPtr->staticPrices) return ZSTD_highbit32((U32)litLength+1); + if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel); - /* literal Length */ + /* dynamic statistics */ { U32 const llCode = ZSTD_LLcode(litLength); - U32 const price = LL_bits[llCode] + optPtr->log2litLengthSum - ZSTD_highbit32(optPtr->litLengthFreq[llCode]+1); - return price; + return (LL_bits[llCode] * BITCOST_MULTIPLIER) + (optPtr->litLengthSumBasePrice - WEIGHT(optPtr->litLengthFreq[llCode], optLevel)); } } -/* ZSTD_litLengthPrice() : - * cost of the literal part of a sequence, - * including literals themselves, and literalLength symbol */ -static U32 ZSTD_fullLiteralsCost(const BYTE* const literals, U32 const litLength, - const optState_t* const optPtr) -{ - return ZSTD_rawLiteralsCost(literals, litLength, optPtr) - + ZSTD_litLengthPrice(litLength, optPtr); -} - /* ZSTD_litLengthContribution() : * @return ( cost(litlength) - cost(0) ) * this value can then be added to rawLiteralsCost() * to provide a cost which is directly comparable to a match ending at same position */ -static int ZSTD_litLengthContribution(U32 const litLength, const optState_t* const optPtr) +static int ZSTD_litLengthContribution(U32 const litLength, const optState_t* const optPtr, int optLevel) { - if (optPtr->staticPrices) return ZSTD_highbit32(litLength+1); + if (optPtr->priceType >= zop_predef) return WEIGHT(litLength, optLevel); - /* literal Length */ + /* dynamic statistics */ { U32 const llCode = ZSTD_LLcode(litLength); - int const contribution = LL_bits[llCode] - + ZSTD_highbit32(optPtr->litLengthFreq[0]+1) - - ZSTD_highbit32(optPtr->litLengthFreq[llCode]+1); + int const contribution = (LL_bits[llCode] * BITCOST_MULTIPLIER) + + WEIGHT(optPtr->litLengthFreq[0], optLevel) /* note: log2litLengthSum cancel out */ + - WEIGHT(optPtr->litLengthFreq[llCode], optLevel); #if 1 return contribution; #else @@ -155,10 +238,11 @@ static int ZSTD_litLengthContribution(U3 * which can be compared to the ending cost of a match * should a new match start at this position */ static int ZSTD_literalsContribution(const BYTE* const literals, U32 const litLength, - const optState_t* const optPtr) + const optState_t* const optPtr, + 
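/* Illustrative aside on ZSTD_rawLiteralsCost() above (not part of the vendored
 * diff). Each literal is priced at roughly log2(litSum) - log2(litFreq[byte])
 * bits, i.e. its Shannon information content under the adaptive statistics,
 * expressed in the same 1/256th-of-a-bit fixed point:
 *
 *   #include <stdio.h>
 *
 *   static unsigned highbit32(unsigned v)
 *   { unsigned r = 0; while (v >>= 1) r++; return r; }
 *
 *   #define BITCOST_MULTIPLIER 256   // same fixed point as above
 *   static unsigned weight(unsigned stat)    // integer-log2 variant
 *   { return highbit32(stat + 1) * BITCOST_MULTIPLIER; }
 *
 *   int main(void)
 *   {
 *       unsigned const litSum = 4096;   // total literals observed
 *       unsigned const freq   = 512;    // observations of one byte value
 *       unsigned const price  = weight(litSum) - weight(freq);
 *       printf("%.2f bits\n", (double)price / BITCOST_MULTIPLIER);  // 3.00
 *       return 0;
 *   }
 */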
int optLevel) { - int const contribution = ZSTD_rawLiteralsCost(literals, litLength, optPtr) - + ZSTD_litLengthContribution(litLength, optPtr); + int const contribution = ZSTD_rawLiteralsCost(literals, litLength, optPtr, optLevel) + + ZSTD_litLengthContribution(litLength, optPtr, optLevel); return contribution; } @@ -166,31 +250,38 @@ static int ZSTD_literalsContribution(con * Provides the cost of the match part (offset + matchLength) of a sequence * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */ -FORCE_INLINE_TEMPLATE U32 ZSTD_getMatchPrice( - U32 const offset, U32 const matchLength, - const optState_t* const optPtr, - int const optLevel) +FORCE_INLINE_TEMPLATE U32 +ZSTD_getMatchPrice(U32 const offset, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) { U32 price; U32 const offCode = ZSTD_highbit32(offset+1); U32 const mlBase = matchLength - MINMATCH; assert(matchLength >= MINMATCH); - if (optPtr->staticPrices) /* fixed scheme, do not use statistics */ - return ZSTD_highbit32((U32)mlBase+1) + 16 + offCode; + if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ + return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); - price = offCode + optPtr->log2offCodeSum - ZSTD_highbit32(optPtr->offCodeFreq[offCode]+1); - if ((optLevel<2) /*static*/ && offCode >= 20) price += (offCode-19)*2; /* handicap for long distance offsets, favor decompression speed */ + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); + if ((optLevel<2) /*static*/ && offCode >= 20) + price += (offCode-19)*2 * BITCOST_MULTIPLIER; /* handicap for long distance offsets, favor decompression speed */ /* match Length */ { U32 const mlCode = ZSTD_MLcode(mlBase); - price += ML_bits[mlCode] + optPtr->log2matchLengthSum - ZSTD_highbit32(optPtr->matchLengthFreq[mlCode]+1); + price += (ML_bits[mlCode] * BITCOST_MULTIPLIER) + (optPtr->matchLengthSumBasePrice - WEIGHT(optPtr->matchLengthFreq[mlCode], optLevel)); } + price += BITCOST_MULTIPLIER / 5; /* heuristic : make matches a bit more costly to favor less sequences -> faster decompression speed */ + DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price); return price; } +/* ZSTD_updateStats() : + * assumption : literals + litLengtn <= iend */ static void ZSTD_updateStats(optState_t* const optPtr, U32 litLength, const BYTE* literals, U32 offsetCode, U32 matchLength) @@ -269,10 +360,11 @@ static U32 ZSTD_insertAndFindFirstIndexH * ip : assumed <= iend-8 . * @return : nb of positions added */ static U32 ZSTD_insertBt1( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, - U32 const mls, U32 const extDict) + U32 const mls, const int extDict) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; U32 const hashLog = cParams->hashLog; size_t const h = ZSTD_hashPtr(ip, hashLog, mls); @@ -293,6 +385,7 @@ static U32 ZSTD_insertBt1( U32* largerPtr = smallerPtr + 1; U32 dummy32; /* to be nullified at the end */ U32 const windowLow = ms->window.lowLimit; + U32 const matchLow = windowLow ? 
windowLow : 1; U32 matchEndIdx = current+8+1; size_t bestLength = 8; U32 nbCompares = 1U << cParams->searchLog; @@ -308,7 +401,7 @@ static U32 ZSTD_insertBt1( assert(ip <= iend-8); /* required for h calculation */ hashTable[h] = current; /* Update Hash Table */ - while (nbCompares-- && (matchIndex > windowLow)) { + while (nbCompares-- && (matchIndex >= matchLow)) { U32* const nextPtr = bt + 2*(matchIndex & btMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ assert(matchIndex < current); @@ -334,8 +427,8 @@ static U32 ZSTD_insertBt1( } #endif - if ((!extDict) || (matchIndex+matchLength >= dictLimit)) { - assert(matchIndex+matchLength >= dictLimit); /* might be wrong if extDict is incorrectly set to 0 */ + if (!extDict || (matchIndex+matchLength >= dictLimit)) { + assert(matchIndex+matchLength >= dictLimit); /* might be wrong if actually extDict */ match = base + matchIndex; matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); } else { @@ -379,35 +472,33 @@ static U32 ZSTD_insertBt1( FORCE_INLINE_TEMPLATE void ZSTD_updateTree_internal( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, + ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, - const U32 mls, const U32 extDict) + const U32 mls, const ZSTD_dictMode_e dictMode) { const BYTE* const base = ms->window.base; U32 const target = (U32)(ip - base); U32 idx = ms->nextToUpdate; - DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (extDict:%u)", - idx, target, extDict); + DEBUGLOG(5, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); while(idx < target) - idx += ZSTD_insertBt1(ms, cParams, base+idx, iend, mls, extDict); + idx += ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict); ms->nextToUpdate = target; } -void ZSTD_updateTree( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, - const BYTE* ip, const BYTE* iend) -{ - ZSTD_updateTree_internal(ms, cParams, ip, iend, cParams->searchLength, 0 /*extDict*/); +void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.searchLength, ZSTD_noDict); } FORCE_INLINE_TEMPLATE U32 ZSTD_insertBtAndGetAllMatches ( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, - const BYTE* const ip, const BYTE* const iLimit, int const extDict, + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, U32 rep[ZSTD_REP_NUM], U32 const ll0, ZSTD_match_t* matches, const U32 lengthToBeat, U32 const mls /* template */) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); const BYTE* const base = ms->window.base; U32 const current = (U32)(ip-base); @@ -426,6 +517,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( const BYTE* const prefixStart = base + dictLimit; U32 const btLow = btMask >= current ? 0 : current - btMask; U32 const windowLow = ms->window.lowLimit; + U32 const matchLow = windowLow ? windowLow : 1; U32* smallerPtr = bt + 2*(current&btMask); U32* largerPtr = bt + 2*(current&btMask) + 1; U32 matchEndIdx = current+8+1; /* farthest referenced position of any match => detects repetitive patterns */ @@ -433,8 +525,21 @@ U32 ZSTD_insertBtAndGetAllMatches ( U32 mnum = 0; U32 nbCompares = 1U << cParams->searchLog; + const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? 
ms->dictMatchState : NULL; + const ZSTD_compressionParameters* const dmsCParams = + dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL; + const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL; + const BYTE* const dmsEnd = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL; + U32 const dmsHighLimit = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0; + U32 const dmsLowLimit = dictMode == ZSTD_dictMatchState ? dms->window.lowLimit : 0; + U32 const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? windowLow - dmsHighLimit : 0; + U32 const dmsHashLog = dictMode == ZSTD_dictMatchState ? dmsCParams->hashLog : hashLog; + U32 const dmsBtLog = dictMode == ZSTD_dictMatchState ? dmsCParams->chainLog - 1 : btLog; + U32 const dmsBtMask = dictMode == ZSTD_dictMatchState ? (1U << dmsBtLog) - 1 : 0; + U32 const dmsBtLow = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit; + size_t bestLength = lengthToBeat-1; - DEBUGLOG(7, "ZSTD_insertBtAndGetAllMatches"); + DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", current); /* check repCode */ { U32 const lastR = ZSTD_REP_NUM + ll0; @@ -449,18 +554,26 @@ U32 ZSTD_insertBtAndGetAllMatches ( repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch; } } else { /* repIndex < dictLimit || repIndex >= current */ - const BYTE* const repMatch = dictBase + repIndex; + const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ? + dmsBase + repIndex - dmsIndexDelta : + dictBase + repIndex; assert(current >= windowLow); - if ( extDict /* this case only valid in extDict mode */ + if ( dictMode == ZSTD_extDict && ( ((repOffset-1) /*intentional overflow*/ < current - windowLow) /* equivalent to `current > repIndex >= windowLow` */ & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */) && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; + } + if (dictMode == ZSTD_dictMatchState + && ( ((repOffset-1) /*intentional overflow*/ < current - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `current > repIndex >= dmsLowLimit` */ + & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */ + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch; } } /* save longer solution */ if (repLen > bestLength) { - DEBUGLOG(8, "found rep-match %u of length %u", - repCode - ll0, (U32)repLen); + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); bestLength = repLen; matches[mnum].off = repCode - ll0; matches[mnum].len = (U32)repLen; @@ -473,10 +586,10 @@ U32 ZSTD_insertBtAndGetAllMatches ( /* HC3 match finder */ if ((mls == 3) /*static*/ && (bestLength < mls)) { U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, ip); - if ((matchIndex3 > windowLow) + if ((matchIndex3 >= matchLow) & (current - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) { size_t mlen; - if ((!extDict) /*static*/ || (matchIndex3 >= dictLimit)) { + if ((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) { const BYTE* const 
match = base + matchIndex3; mlen = ZSTD_count(ip, match, iLimit); } else { @@ -498,17 +611,21 @@ U32 ZSTD_insertBtAndGetAllMatches ( (ip+mlen == iLimit) ) { /* best possible length */ ms->nextToUpdate = current+1; /* skip insertion */ return 1; - } } } } + } + } + } + /* no dictMatchState lookup: dicts don't have a populated HC3 table */ + } hashTable[h] = current; /* Update Hash Table */ - while (nbCompares-- && (matchIndex > windowLow)) { + while (nbCompares-- && (matchIndex >= matchLow)) { U32* const nextPtr = bt + 2*(matchIndex & btMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ const BYTE* match; assert(current > matchIndex); - if ((!extDict) || (matchIndex+matchLength >= dictLimit)) { + if ((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex+matchLength >= dictLimit)) { assert(matchIndex+matchLength >= dictLimit); /* ensure the condition is correct when !extDict */ match = base + matchIndex; matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit); @@ -520,8 +637,8 @@ U32 ZSTD_insertBtAndGetAllMatches ( } if (matchLength > bestLength) { - DEBUGLOG(8, "found match of length %u at distance %u", - (U32)matchLength, current - matchIndex); + DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", + (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE); assert(matchEndIdx > matchIndex); if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; @@ -529,9 +646,10 @@ U32 ZSTD_insertBtAndGetAllMatches ( matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE; matches[mnum].len = (U32)matchLength; mnum++; - if (matchLength > ZSTD_OPT_NUM) break; - if (ip+matchLength == iLimit) { /* equal : no way to know if inf or sup */ - break; /* drop, to preserve bt consistency (miss a little bit of compression) */ + if ( (matchLength > ZSTD_OPT_NUM) + | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { + if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */ + break; /* drop, to preserve bt consistency (miss a little bit of compression) */ } } @@ -552,6 +670,47 @@ U32 ZSTD_insertBtAndGetAllMatches ( *smallerPtr = *largerPtr = 0; + if (dictMode == ZSTD_dictMatchState && nbCompares) { + size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls); + U32 dictMatchIndex = dms->hashTable[dmsH]; + const U32* const dmsBt = dms->chainTable; + commonLengthSmaller = commonLengthLarger = 0; + while (nbCompares-- && (dictMatchIndex > dmsLowLimit)) { + const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match = dmsBase + dictMatchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dmsEnd, prefixStart); + if (dictMatchIndex+matchLength >= dmsHighLimit) + match = base + dictMatchIndex + dmsIndexDelta; /* to prepare for next usage of match[matchLength] */ + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; + DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", + (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].len = (U32)matchLength; + mnum++; 
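/* Illustrative aside on the offset codes recorded above (not part of the
 * vendored diff). Real offsets are stored biased by ZSTD_REP_MOVE (== 2),
 * while codes 0..2 select a recent "repeat offset"; ZSTD_updateRep() rotates
 * that 3-entry history. A standalone mirror of the rule:
 *
 *   #include <stdio.h>
 *   #define REP_NUM  3
 *   #define REP_MOVE (REP_NUM - 1)
 *   typedef struct { unsigned rep[REP_NUM]; } reps_t;
 *
 *   static reps_t updateRep(unsigned const rep[3], unsigned offCode, unsigned ll0)
 *   {
 *       reps_t n = { { rep[0], rep[1], rep[2] } };
 *       if (offCode >= REP_NUM) {                    // real offset: shift history
 *           n.rep[2] = rep[1]; n.rep[1] = rep[0];
 *           n.rep[0] = offCode - REP_MOVE;
 *       } else {                                     // repeat offset
 *           unsigned const repCode = offCode + ll0;  // shifted when no literals
 *           if (repCode > 0) {                       // repCode==0 : no change
 *               unsigned const cur = (repCode == REP_NUM) ? rep[0] - 1 : rep[repCode];
 *               n.rep[2] = (repCode >= 2) ? rep[1] : rep[2];
 *               n.rep[1] = rep[0];
 *               n.rep[0] = cur;
 *           }
 *       }
 *       return n;
 *   }
 *
 *   int main(void)
 *   {
 *       unsigned const rep[3] = { 100, 200, 300 };
 *       reps_t const a = updateRep(rep, 1, 0);               // swaps rep0/rep1
 *       reps_t const b = updateRep(rep, 1024 + REP_MOVE, 0); // real offset 1024
 *       printf("%u %u %u\n", a.rep[0], a.rep[1], a.rep[2]);  // 200 100 300
 *       printf("%u %u %u\n", b.rep[0], b.rep[1], b.rep[2]);  // 1024 100 200
 *       return 0;
 *   }
 */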
+ if ( (matchLength > ZSTD_OPT_NUM) + | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (dictMatchIndex <= dmsBtLow) { break; } /* beyond tree size, stop the search */ + if (match[matchLength] < ip[matchLength]) { + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + commonLengthLarger = matchLength; + dictMatchIndex = nextPtr[0]; + } + } + } + assert(matchEndIdx > current+8); ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ return mnum; @@ -559,23 +718,24 @@ U32 ZSTD_insertBtAndGetAllMatches ( FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, - const BYTE* ip, const BYTE* const iHighLimit, int const extDict, + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode, U32 rep[ZSTD_REP_NUM], U32 const ll0, ZSTD_match_t* matches, U32 const lengthToBeat) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32 const matchLengthSearch = cParams->searchLength; - DEBUGLOG(7, "ZSTD_BtGetAllMatches"); + DEBUGLOG(8, "ZSTD_BtGetAllMatches"); if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ - ZSTD_updateTree_internal(ms, cParams, ip, iHighLimit, matchLengthSearch, extDict); + ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode); switch(matchLengthSearch) { - case 3 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 3); + case 3 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 3); default : - case 4 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 4); - case 5 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 5); + case 4 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 4); + case 5 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 5); case 7 : - case 6 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 6); + case 6 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 6); } } @@ -587,7 +747,7 @@ typedef struct repcodes_s { U32 rep[3]; } repcodes_t; -repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0) +static repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0) { repcodes_t newReps; if (offset >= ZSTD_REP_NUM) { /* full offset */ @@ -609,65 +769,17 @@ repcodes_t ZSTD_updateRep(U32 const rep[ } -typedef struct { - const BYTE* anchor; - U32 litlen; - U32 rawLitCost; -} cachedLiteralPrice_t; - -static U32 ZSTD_rawLiteralsCost_cached( - cachedLiteralPrice_t* const cachedLitPrice, - const BYTE* const anchor, U32 const litlen, - const optState_t* const optStatePtr) +static U32 ZSTD_totalLen(ZSTD_optimal_t sol) { - U32 startCost; - U32 remainingLength; - const BYTE* startPosition; - - if (anchor == cachedLitPrice->anchor) { - startCost = cachedLitPrice->rawLitCost; - startPosition = anchor + cachedLitPrice->litlen; - 
assert(litlen >= cachedLitPrice->litlen); - remainingLength = litlen - cachedLitPrice->litlen; - } else { - startCost = 0; - startPosition = anchor; - remainingLength = litlen; - } - - { U32 const rawLitCost = startCost + ZSTD_rawLiteralsCost(startPosition, remainingLength, optStatePtr); - cachedLitPrice->anchor = anchor; - cachedLitPrice->litlen = litlen; - cachedLitPrice->rawLitCost = rawLitCost; - return rawLitCost; - } + return sol.litlen + sol.mlen; } -static U32 ZSTD_fullLiteralsCost_cached( - cachedLiteralPrice_t* const cachedLitPrice, - const BYTE* const anchor, U32 const litlen, - const optState_t* const optStatePtr) -{ - return ZSTD_rawLiteralsCost_cached(cachedLitPrice, anchor, litlen, optStatePtr) - + ZSTD_litLengthPrice(litlen, optStatePtr); -} - -static int ZSTD_literalsContribution_cached( - cachedLiteralPrice_t* const cachedLitPrice, - const BYTE* const anchor, U32 const litlen, - const optState_t* const optStatePtr) -{ - int const contribution = ZSTD_rawLiteralsCost_cached(cachedLitPrice, anchor, litlen, optStatePtr) - + ZSTD_litLengthContribution(litlen, optStatePtr); - return contribution; -} - -FORCE_INLINE_TEMPLATE -size_t ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,seqStore_t* seqStore, - U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, - const void* src, size_t srcSize, - const int optLevel, const int extDict) +FORCE_INLINE_TEMPLATE size_t +ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const int optLevel, const ZSTD_dictMode_e dictMode) { optState_t* const optStatePtr = &ms->opt; const BYTE* const istart = (const BYTE*)src; @@ -677,72 +789,76 @@ size_t ZSTD_compressBlock_opt_generic(ZS const BYTE* const ilimit = iend - 8; const BYTE* const base = ms->window.base; const BYTE* const prefixStart = base + ms->window.dictLimit; + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); U32 const minMatch = (cParams->searchLength == 3) ? 
3 : 4; ZSTD_optimal_t* const opt = optStatePtr->priceTable; ZSTD_match_t* const matches = optStatePtr->matchTable; - cachedLiteralPrice_t cachedLitPrice; + ZSTD_optimal_t lastSequence; /* init */ DEBUGLOG(5, "ZSTD_compressBlock_opt_generic"); + assert(optLevel <= 2); ms->nextToUpdate3 = ms->nextToUpdate; - ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize); + ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel); ip += (ip==prefixStart); - memset(&cachedLitPrice, 0, sizeof(cachedLitPrice)); /* Match Loop */ while (ip < ilimit) { U32 cur, last_pos = 0; - U32 best_mlen, best_off; /* find first match */ { U32 const litlen = (U32)(ip - anchor); U32 const ll0 = !litlen; - U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, ip, iend, extDict, rep, ll0, matches, minMatch); + U32 const nbMatches = ZSTD_BtGetAllMatches(ms, ip, iend, dictMode, rep, ll0, matches, minMatch); if (!nbMatches) { ip++; continue; } /* initialize opt[0] */ { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; } - opt[0].mlen = 1; + opt[0].mlen = 0; /* means is_a_literal */ opt[0].litlen = litlen; + opt[0].price = ZSTD_literalsContribution(anchor, litlen, optStatePtr, optLevel); /* large match -> immediate encoding */ { U32 const maxML = matches[nbMatches-1].len; - DEBUGLOG(7, "found %u matches of maxLength=%u and offset=%u at cPos=%u => start new serie", - nbMatches, maxML, matches[nbMatches-1].off, (U32)(ip-prefixStart)); + U32 const maxOffset = matches[nbMatches-1].off; + DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new serie", + nbMatches, maxML, maxOffset, (U32)(ip-prefixStart)); if (maxML > sufficient_len) { - best_mlen = maxML; - best_off = matches[nbMatches-1].off; - DEBUGLOG(7, "large match (%u>%u), immediate encoding", - best_mlen, sufficient_len); + lastSequence.litlen = litlen; + lastSequence.mlen = maxML; + lastSequence.off = maxOffset; + DEBUGLOG(6, "large match (%u>%u), immediate encoding", + maxML, sufficient_len); cur = 0; - last_pos = 1; + last_pos = ZSTD_totalLen(lastSequence); goto _shortestPath; } } /* set prices for first matches starting position == 0 */ - { U32 const literalsPrice = ZSTD_fullLiteralsCost_cached(&cachedLitPrice, anchor, litlen, optStatePtr); + { U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); U32 pos; U32 matchNb; - for (pos = 0; pos < minMatch; pos++) { - opt[pos].mlen = 1; - opt[pos].price = ZSTD_MAX_PRICE; + for (pos = 1; pos < minMatch; pos++) { + opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ } for (matchNb = 0; matchNb < nbMatches; matchNb++) { U32 const offset = matches[matchNb].off; U32 const end = matches[matchNb].len; repcodes_t const repHistory = ZSTD_updateRep(rep, offset, ll0); for ( ; pos <= end ; pos++ ) { - U32 const matchPrice = literalsPrice + ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel); - DEBUGLOG(7, "rPos:%u => set initial price : %u", - pos, matchPrice); + U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel); + U32 const sequencePrice = literalsPrice + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); opt[pos].mlen = pos; opt[pos].off = offset; opt[pos].litlen = litlen; - opt[pos].price = matchPrice; + opt[pos].price = sequencePrice; + ZSTD_STATIC_ASSERT(sizeof(opt[pos].rep) == sizeof(repHistory)); memcpy(opt[pos].rep, &repHistory, sizeof(repHistory)); } } last_pos = pos-1; @@ -753,55 +869,67 @@ size_t ZSTD_compressBlock_opt_generic(ZS for (cur = 1; cur <= last_pos; cur++) { const BYTE* const inr = ip + cur; assert(cur < ZSTD_OPT_NUM); + DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) /* Fix current position with one literal if cheaper */ - { 
U32 const litlen = (opt[cur-1].mlen == 1) ? opt[cur-1].litlen + 1 : 1; - int price; /* note : contribution can be negative */ - if (cur > litlen) { - price = opt[cur - litlen].price + ZSTD_literalsContribution(inr-litlen, litlen, optStatePtr); - } else { - price = ZSTD_literalsContribution_cached(&cachedLitPrice, anchor, litlen, optStatePtr); - } + { U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1; + int const price = opt[cur-1].price + + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) + + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) + - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); assert(price < 1000000000); /* overflow check */ if (price <= opt[cur].price) { - DEBUGLOG(7, "rPos:%u : better price (%u<%u) using literal", - cur, price, opt[cur].price); - opt[cur].mlen = 1; + DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); + opt[cur].mlen = 0; opt[cur].off = 0; opt[cur].litlen = litlen; opt[cur].price = price; memcpy(opt[cur].rep, opt[cur-1].rep, sizeof(opt[cur].rep)); - } } + } else { + DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), + opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); + } + } /* last match must start at a minimum distance of 8 from oend */ if (inr > ilimit) continue; if (cur == last_pos) break; - if ( (optLevel==0) /*static*/ - && (opt[cur+1].price <= opt[cur].price) ) + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { + DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } - { U32 const ll0 = (opt[cur].mlen != 1); - U32 const litlen = (opt[cur].mlen == 1) ? opt[cur].litlen : 0; - U32 const previousPrice = (cur > litlen) ? opt[cur-litlen].price : 0; - U32 const basePrice = previousPrice + ZSTD_fullLiteralsCost(inr-litlen, litlen, optStatePtr); - U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, inr, iend, extDict, opt[cur].rep, ll0, matches, minMatch); + { U32 const ll0 = (opt[cur].mlen != 0); + U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; + U32 const previousPrice = opt[cur].price; + U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); + U32 const nbMatches = ZSTD_BtGetAllMatches(ms, inr, iend, dictMode, opt[cur].rep, ll0, matches, minMatch); U32 matchNb; - if (!nbMatches) continue; + if (!nbMatches) { + DEBUGLOG(7, "rPos:%u : no match found", cur); + continue; + } { U32 const maxML = matches[nbMatches-1].len; - DEBUGLOG(7, "rPos:%u, found %u matches, of maxLength=%u", - cur, nbMatches, maxML); + DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", + inr-istart, cur, nbMatches, maxML); if ( (maxML > sufficient_len) - | (cur + maxML >= ZSTD_OPT_NUM) ) { - best_mlen = maxML; - best_off = matches[nbMatches-1].off; - last_pos = cur + 1; + || (cur + maxML >= ZSTD_OPT_NUM) ) { + lastSequence.mlen = maxML; + lastSequence.off = matches[nbMatches-1].off; + lastSequence.litlen = litlen; + cur -= (opt[cur].mlen==0) ? 
opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ + last_pos = cur + ZSTD_totalLen(lastSequence); + if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ goto _shortestPath; - } - } + } } /* set prices using matches found at position == cur */ for (matchNb = 0; matchNb < nbMatches; matchNb++) { @@ -811,81 +939,97 @@ size_t ZSTD_compressBlock_opt_generic(ZS U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch; U32 mlen; - DEBUGLOG(7, "testing match %u => offCode=%u, mlen=%u, llen=%u", + DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", matchNb, matches[matchNb].off, lastML, litlen); - for (mlen = lastML; mlen >= startML; mlen--) { + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ U32 const pos = cur + mlen; int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); if ((pos > last_pos) || (price < opt[pos].price)) { - DEBUGLOG(7, "rPos:%u => new better price (%u<%u)", - pos, price, opt[pos].price); - while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); + while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ opt[pos].mlen = mlen; opt[pos].off = offset; opt[pos].litlen = litlen; opt[pos].price = price; + ZSTD_STATIC_ASSERT(sizeof(opt[pos].rep) == sizeof(repHistory)); memcpy(opt[pos].rep, &repHistory, sizeof(repHistory)); } else { - if (optLevel==0) break; /* gets ~+10% speed for about -0.01 ratio loss */ + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ } } } } } /* for (cur = 1; cur <= last_pos; cur++) */ - best_mlen = opt[last_pos].mlen; - best_off = opt[last_pos].off; - cur = last_pos - best_mlen; + lastSequence = opt[last_pos]; + cur = last_pos > ZSTD_totalLen(lastSequence) ? 
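/* Illustrative aside on the price table opt[] filled above (not part of the
 * vendored diff). The forward pass is a shortest-path relaxation: opt[p] is the
 * cheapest known way to reach position p, a literal adds a 1-byte edge
 * p-1 -> p, and each match found at p adds jump edges. A toy version with a
 * fixed literal cost and two hard-coded matches:
 *
 *   #include <stdio.h>
 *   #define NPOS 16
 *   #define INF  (1u << 30)
 *   typedef struct { unsigned pos, len, price; } match_t;
 *
 *   int main(void)
 *   {
 *       match_t const m[2] = { { 2, 6, 9 }, { 5, 10, 11 } };
 *       unsigned opt[NPOS], cur, i;
 *       opt[0] = 0;
 *       for (cur = 1; cur < NPOS; cur++) opt[cur] = INF;
 *       for (cur = 0; cur + 1 < NPOS; cur++) {
 *           if (opt[cur] == INF) continue;        // unreachable position
 *           if (opt[cur] + 6 < opt[cur + 1])      // literal edge, 6 units
 *               opt[cur + 1] = opt[cur] + 6;
 *           for (i = 0; i < 2; i++)               // match (jump) edges
 *               if (m[i].pos == cur && m[i].pos + m[i].len < NPOS
 *                   && opt[cur] + m[i].price < opt[m[i].pos + m[i].len])
 *                   opt[m[i].pos + m[i].len] = opt[cur] + m[i].price;
 *       }
 *       printf("%u\n", opt[NPOS - 1]);  // 41: five literals, then the long match
 *       return 0;
 *   }
 *
 * The backward pass below then walks the winning edges in reverse to emit the
 * selected sequences. */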
last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ + assert(cur < ZSTD_OPT_NUM); /* control overflow*/ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ - assert(opt[0].mlen == 1); + assert(opt[0].mlen == 0); + + { U32 const storeEnd = cur + 1; + U32 storeStart = storeEnd; + U32 seqPos = cur; - /* reverse traversal */ - DEBUGLOG(7, "start reverse traversal (last_pos:%u, cur:%u)", - last_pos, cur); - { U32 selectedMatchLength = best_mlen; - U32 selectedOffset = best_off; - U32 pos = cur; - while (1) { - U32 const mlen = opt[pos].mlen; - U32 const off = opt[pos].off; - opt[pos].mlen = selectedMatchLength; - opt[pos].off = selectedOffset; - selectedMatchLength = mlen; - selectedOffset = off; - if (mlen > pos) break; - pos -= mlen; - } } + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; + assert(storeEnd < ZSTD_OPT_NUM); + DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", + storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); + opt[storeEnd] = lastSequence; + while (seqPos > 0) { + U32 const backDist = ZSTD_totalLen(opt[seqPos]); + storeStart--; + DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", + seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); + opt[storeStart] = opt[seqPos]; + seqPos = (seqPos > backDist) ? seqPos - backDist : 0; + } - /* save sequences */ - { U32 pos; - for (pos=0; pos < last_pos; ) { - U32 const llen = (U32)(ip - anchor); - U32 const mlen = opt[pos].mlen; - U32 const offset = opt[pos].off; - if (mlen == 1) { ip++; pos++; continue; } /* literal position => move on */ - pos += mlen; ip += mlen; + /* save sequences */ + DEBUGLOG(6, "sending selected sequences into seqStore") + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; + U32 const offCode = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, llen, mlen); + + if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */ + assert(storePos == storeEnd); /* must be last sequence */ + ip = anchor + llen; /* last "sequence" is a bunch of literals => don't progress anchor */ + continue; /* will finish */ + } - /* repcodes update : like ZSTD_updateRep(), but update in place */ - if (offset >= ZSTD_REP_NUM) { /* full offset */ - rep[2] = rep[1]; - rep[1] = rep[0]; - rep[0] = offset - ZSTD_REP_MOVE; - } else { /* repcode */ - U32 const repCode = offset + (llen==0); - if (repCode) { /* note : if repCode==0, no change */ - U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; - if (repCode >= 2) rep[2] = rep[1]; + /* repcodes update : like ZSTD_updateRep(), but update in place */ + if (offCode >= ZSTD_REP_NUM) { /* full offset */ + rep[2] = rep[1]; rep[1] = rep[0]; - rep[0] = currentOffset; - } - } + rep[0] = offCode - ZSTD_REP_MOVE; + } else { /* repcode */ + U32 const repCode = offCode + (llen==0); + if (repCode) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? 
(rep[0] - 1) : rep[repCode]; + if (repCode >= 2) rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = currentOffset; + } } - ZSTD_updateStats(optStatePtr, llen, anchor, offset, mlen); - ZSTD_storeSeq(seqStore, llen, anchor, offset, mlen-MINMATCH); - anchor = ip; - } } - ZSTD_setLog2Prices(optStatePtr); + assert(anchor + llen <= iend); + ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); + ZSTD_storeSeq(seqStore, llen, anchor, offCode, mlen-MINMATCH); + anchor += advance; + ip = anchor; + } } + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ /* Return the last literals size */ @@ -895,29 +1039,94 @@ size_t ZSTD_compressBlock_opt_generic(ZS size_t ZSTD_compressBlock_btopt( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressBlock_btopt"); - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, 0 /*extDict*/); + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict); +} + + +/* used in 2-pass strategy */ +static U32 ZSTD_upscaleStat(U32* table, U32 lastEltIndex, int bonus) +{ + U32 s, sum=0; + assert(ZSTD_FREQ_DIV+bonus > 0); + for (s=0; s<=lastEltIndex; s++) { + table[s] <<= ZSTD_FREQ_DIV+bonus; + table[s]--; + sum += table[s]; + } + return sum; +} + +/* used in 2-pass strategy */ +MEM_STATIC void ZSTD_upscaleStats(optState_t* optPtr) +{ + optPtr->litSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0); + optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 1); + optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 1); + optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 1); } size_t ZSTD_compressBlock_btultra( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, 0 /*extDict*/); + DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); +#if 0 + /* 2-pass strategy (disabled) + * this strategy makes a first pass over first block to collect statistics + * and seed next round's statistics with it. + * The compression ratio gain is generally small (~0.5% on first block), + * the cost is 2x cpu time on first block. 
*/ + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) ) { /* no dictionary */ + U32 tmpRep[ZSTD_REP_NUM]; + DEBUGLOG(5, "ZSTD_compressBlock_btultra: first block: collecting statistics"); + assert(ms->nextToUpdate >= ms->window.dictLimit + && ms->nextToUpdate <= ms->window.dictLimit + 1); + memcpy(tmpRep, rep, sizeof(tmpRep)); + ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/ + ZSTD_resetSeqStore(seqStore); + /* invalidate first scan from history */ + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; + ms->window.lowLimit = ms->window.dictLimit; + ms->nextToUpdate = ms->window.dictLimit; + ms->nextToUpdate3 = ms->window.dictLimit; + /* re-inforce weight of collected statistics */ + ZSTD_upscaleStats(&ms->opt); + } +#endif + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState); } size_t ZSTD_compressBlock_btopt_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, 1 /*extDict*/); + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict); } size_t ZSTD_compressBlock_btultra_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize) + const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, 1 /*extDict*/); + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict); } diff --git a/contrib/python-zstandard/zstd/compress/zstd_opt.h b/contrib/python-zstandard/zstd/compress/zstd_opt.h --- a/contrib/python-zstandard/zstd/compress/zstd_opt.h +++ b/contrib/python-zstandard/zstd/compress/zstd_opt.h @@ -17,23 +17,29 @@ extern "C" { #include "zstd_compress_internal.h" -void ZSTD_updateTree( - ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams, - const BYTE* ip, const BYTE* iend); /* used in ZSTD_loadDictionaryContent() */ +/* used in ZSTD_loadDictionaryContent() */ +void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); size_t ZSTD_compressBlock_btopt( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); size_t ZSTD_compressBlock_btultra( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t 
srcSize); + +size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_btopt_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); size_t ZSTD_compressBlock_btultra_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize); + void const* src, size_t srcSize); #if defined (__cplusplus) } diff --git a/contrib/python-zstandard/zstd/compress/zstdmt_compress.c b/contrib/python-zstandard/zstd/compress/zstdmt_compress.c --- a/contrib/python-zstandard/zstd/compress/zstdmt_compress.c +++ b/contrib/python-zstandard/zstd/compress/zstdmt_compress.c @@ -37,18 +37,19 @@ #define ZSTD_RESIZE_SEQPOOL 0 /* ====== Debug ====== */ -#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2) +#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) \ + && !defined(_MSC_VER) \ + && !defined(__MINGW32__) # include <stdio.h> # include <unistd.h> # include <sys/times.h> -# define DEBUGLOGRAW(l, ...) if (l<=ZSTD_DEBUG) { fprintf(stderr, __VA_ARGS__); } # define DEBUG_PRINTHEX(l,p,n) { \ unsigned debug_u; \ for (debug_u=0; debug_u<(n); debug_u++) \ - DEBUGLOGRAW(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \ - DEBUGLOGRAW(l, " \n"); \ + RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \ + RAWLOG(l, " \n"); \ } static unsigned long long GetCurrentClockTimeMicroseconds(void) @@ -62,7 +63,7 @@ static unsigned long long GetCurrentCloc #define MUTEX_WAIT_TIME_DLEVEL 6 #define ZSTD_PTHREAD_MUTEX_LOCK(mutex) { \ - if (ZSTD_DEBUG >= MUTEX_WAIT_TIME_DLEVEL) { \ + if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) { \ unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \ ZSTD_pthread_mutex_lock(mutex); \ { unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \ @@ -160,6 +161,25 @@ static void ZSTDMT_setBufferSize(ZSTDMT_ ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); } + +static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, U32 nbWorkers) +{ + unsigned const maxNbBuffers = 2*nbWorkers + 3; + if (srcBufPool==NULL) return NULL; + if (srcBufPool->totalBuffers >= maxNbBuffers) /* good enough */ + return srcBufPool; + /* need a larger buffer pool */ + { ZSTD_customMem const cMem = srcBufPool->cMem; + size_t const bSize = srcBufPool->bufferSize; /* forward parameters */ + ZSTDMT_bufferPool* newBufPool; + ZSTDMT_freeBufferPool(srcBufPool); + newBufPool = ZSTDMT_createBufferPool(nbWorkers, cMem); + if (newBufPool==NULL) return newBufPool; + ZSTDMT_setBufferSize(newBufPool, bSize); + return newBufPool; + } +} + /** ZSTDMT_getBuffer() : * assumption : bufPool must be valid * @return : a buffer, with start pointer and size @@ -229,8 +249,8 @@ static buffer_t ZSTDMT_resizeBuffer(ZSTD /* store buffer for later re-use, up to pool capacity */ static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf) { + DEBUGLOG(5, "ZSTDMT_releaseBuffer"); if (buf.start == NULL) return; /* compatible with release on NULL */ - DEBUGLOG(5, "ZSTDMT_releaseBuffer"); ZSTD_pthread_mutex_lock(&bufPool->poolMutex); if (bufPool->nbBuffers < bufPool->totalBuffers) { bufPool->bTable[bufPool->nbBuffers++] = buf; /* stored for later use */ 
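/* Illustrative aside on ZSTDMT_expandBufferPool() above (not part of the
 * vendored diff). Pools only ever grow, and they grow by "free the small pool,
 * create a larger one, forward the tuned parameters" -- which presumably relies
 * on no buffer being in flight when the expansion is requested. A minimal
 * standalone mirror of the pattern:
 *
 *   #include <stdlib.h>
 *
 *   typedef struct { size_t bufferSize; unsigned totalBuffers; } pool_t;
 *
 *   static pool_t* createPool(unsigned nbWorkers)
 *   {
 *       pool_t* const p = (pool_t*)calloc(1, sizeof(pool_t));
 *       if (p) p->totalBuffers = 2 * nbWorkers + 3;   // same sizing rule
 *       return p;
 *   }
 *   static void freePool(pool_t* p) { free(p); }
 *
 *   static pool_t* expandPool(pool_t* src, unsigned nbWorkers)
 *   {
 *       unsigned const needed = 2 * nbWorkers + 3;
 *       if (src == NULL) return NULL;
 *       if (src->totalBuffers >= needed) return src;   // good enough
 *       {   size_t const bSize = src->bufferSize;      // save parameters
 *           pool_t* newPool;
 *           freePool(src);                             // release small pool
 *           newPool = createPool(nbWorkers);           // recreate larger
 *           if (newPool) newPool->bufferSize = bSize;  // forward parameters
 *           return newPool;
 *       }
 *   }
 *
 *   int main(void)
 *   {
 *       pool_t* p = createPool(2);    // sized for 2 workers: 7 buffers
 *       p = expandPool(p, 8);         // regrown for 8 workers: 19 buffers
 *       freePool(p);
 *       return 0;
 *   }
 */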
@@ -300,7 +320,8 @@ static void ZSTDMT_setNbSeq(ZSTDMT_seqPo static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem) { - ZSTDMT_seqPool* seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem); + ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem); + if (seqPool == NULL) return NULL; ZSTDMT_setNbSeq(seqPool, 0); return seqPool; } @@ -310,6 +331,10 @@ static void ZSTDMT_freeSeqPool(ZSTDMT_se ZSTDMT_freeBufferPool(seqPool); } +static ZSTDMT_seqPool* ZSTDMT_expandSeqPool(ZSTDMT_seqPool* pool, U32 nbWorkers) +{ + return ZSTDMT_expandBufferPool(pool, nbWorkers); +} /* ===== CCtx Pool ===== */ @@ -355,6 +380,18 @@ static ZSTDMT_CCtxPool* ZSTDMT_createCCt return cctxPool; } +static ZSTDMT_CCtxPool* ZSTDMT_expandCCtxPool(ZSTDMT_CCtxPool* srcPool, + unsigned nbWorkers) +{ + if (srcPool==NULL) return NULL; + if (nbWorkers <= srcPool->totalCCtx) return srcPool; /* good enough */ + /* need a larger cctx pool */ + { ZSTD_customMem const cMem = srcPool->cMem; + ZSTDMT_freeCCtxPool(srcPool); + return ZSTDMT_createCCtxPool(nbWorkers, cMem); + } +} + /* only works during initialization phase, not during compression */ static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool) { @@ -425,12 +462,11 @@ typedef struct { ZSTD_window_t ldmWindow; /* A thread-safe copy of ldmState.window */ } serialState_t; -static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool* seqPool, ZSTD_CCtx_params params) +static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool* seqPool, ZSTD_CCtx_params params, size_t jobSize) { /* Adjust parameters */ if (params.ldmParams.enableLdm) { DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10); - params.ldmParams.windowLog = params.cParams.windowLog; ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); assert(params.ldmParams.hashEveryLog < 32); @@ -453,7 +489,7 @@ static int ZSTDMT_serialState_reset(seri serialState->params.ldmParams.hashLog - serialState->params.ldmParams.bucketSizeLog; /* Size the seq pool tables */ - ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, params.jobSize)); + ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, jobSize)); /* Reset the window */ ZSTD_window_clear(&serialState->ldmState.window); serialState->ldmWindow = serialState->ldmState.window; @@ -473,6 +509,7 @@ static int ZSTDMT_serialState_reset(seri memset(serialState->ldmState.bucketOffsets, 0, bucketSize); } serialState->params = params; + serialState->params.jobSize = (U32)jobSize; return 0; } @@ -505,6 +542,7 @@ static void ZSTDMT_serialState_update(se /* Wait for our turn */ ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex); while (serialState->nextJobID < jobID) { + DEBUGLOG(5, "wait for serialState->cond"); ZSTD_pthread_cond_wait(&serialState->cond, &serialState->mutex); } /* A future job may error and skip our job */ @@ -514,6 +552,7 @@ static void ZSTDMT_serialState_update(se size_t error; assert(seqStore.seq != NULL && seqStore.pos == 0 && seqStore.size == 0 && seqStore.capacity > 0); + assert(src.size <= serialState->params.jobSize); ZSTD_window_update(&serialState->ldmState.window, src.start, src.size); error = ZSTD_ldm_generateSequences( &serialState->ldmState, &seqStore, @@ -593,14 +632,32 @@ typedef struct { unsigned frameChecksumNeeded; /* used only by mtctx */ } ZSTDMT_jobDescription; +#define JOB_ERROR(e) { \ + ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); \ + job->cSize = e; \ + 
ZSTD_pthread_mutex_unlock(&job->job_mutex); \ + goto _endJob; \ +} + /* ZSTDMT_compressionJob() is a POOL_function type */ -void ZSTDMT_compressionJob(void* jobDescription) +static void ZSTDMT_compressionJob(void* jobDescription) { ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription; ZSTD_CCtx_params jobParams = job->params; /* do not modify job->params ! copy it, modify the copy */ ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool); rawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool); buffer_t dstBuff = job->dstBuff; + size_t lastCBlockSize = 0; + + /* ressources */ + if (cctx==NULL) JOB_ERROR(ERROR(memory_allocation)); + if (dstBuff.start == NULL) { /* streaming job : doesn't provide a dstBuffer */ + dstBuff = ZSTDMT_getBuffer(job->bufPool); + if (dstBuff.start==NULL) JOB_ERROR(ERROR(memory_allocation)); + job->dstBuff = dstBuff; /* this value can be read in ZSTDMT_flush, when it copies the whole job */ + } + if (jobParams.ldmParams.enableLdm && rawSeqStore.seq == NULL) + JOB_ERROR(ERROR(memory_allocation)); /* Don't compute the checksum for chunks, since we compute it externally, * but write it in the header. @@ -609,47 +666,31 @@ void ZSTDMT_compressionJob(void* jobDesc /* Don't run LDM for the chunks, since we handle it externally */ jobParams.ldmParams.enableLdm = 0; - /* ressources */ - if (cctx==NULL) { - job->cSize = ERROR(memory_allocation); - goto _endJob; - } - if (dstBuff.start == NULL) { /* streaming job : doesn't provide a dstBuffer */ - dstBuff = ZSTDMT_getBuffer(job->bufPool); - if (dstBuff.start==NULL) { - job->cSize = ERROR(memory_allocation); - goto _endJob; - } - job->dstBuff = dstBuff; /* this value can be read in ZSTDMT_flush, when it copies the whole job */ - } /* init */ if (job->cdict) { - size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, job->cdict, jobParams, job->fullFrameSize); + size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, job->cdict, jobParams, job->fullFrameSize); assert(job->firstJob); /* only allowed for first job */ - if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; } + if (ZSTD_isError(initError)) JOB_ERROR(initError); } else { /* srcStart points at reloaded section */ U64 const pledgedSrcSize = job->firstJob ? 
job->fullFrameSize : job->src.size; { size_t const forceWindowError = ZSTD_CCtxParam_setParameter(&jobParams, ZSTD_p_forceMaxWindow, !job->firstJob); - if (ZSTD_isError(forceWindowError)) { - job->cSize = forceWindowError; - goto _endJob; - } } + if (ZSTD_isError(forceWindowError)) JOB_ERROR(forceWindowError); + } { size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */ + ZSTD_dtlm_fast, NULL, /*cdict*/ jobParams, pledgedSrcSize); - if (ZSTD_isError(initError)) { - job->cSize = initError; - goto _endJob; - } } } + if (ZSTD_isError(initError)) JOB_ERROR(initError); + } } /* Perform serial step as early as possible, but after CCtx initialization */ ZSTDMT_serialState_update(job->serial, cctx, rawSeqStore, job->src, job->jobID); if (!job->firstJob) { /* flush and overwrite frame header when it's not first job */ size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0); - if (ZSTD_isError(hSize)) { job->cSize = hSize; /* save error code */ goto _endJob; } + if (ZSTD_isError(hSize)) JOB_ERROR(hSize); DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize); ZSTD_invalidateRepCodes(cctx); } @@ -667,7 +708,7 @@ void ZSTDMT_compressionJob(void* jobDesc assert(job->cSize == 0); for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) { size_t const cSize = ZSTD_compressContinue(cctx, op, oend-op, ip, chunkSize); - if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; } + if (ZSTD_isError(cSize)) JOB_ERROR(cSize); ip += chunkSize; op += cSize; assert(op < oend); /* stats */ @@ -680,18 +721,16 @@ void ZSTDMT_compressionJob(void* jobDesc ZSTD_pthread_mutex_unlock(&job->job_mutex); } /* last block */ - assert(chunkSize > 0); assert((chunkSize & (chunkSize - 1)) == 0); /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */ + assert(chunkSize > 0); + assert((chunkSize & (chunkSize - 1)) == 0); /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */ if ((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/ ) { size_t const lastBlockSize1 = job->src.size & (chunkSize-1); size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1; size_t const cSize = (job->lastJob) ? ZSTD_compressEnd (cctx, op, oend-op, ip, lastBlockSize) : ZSTD_compressContinue(cctx, op, oend-op, ip, lastBlockSize); - if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; } - /* stats */ - ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); - job->cSize += cSize; - ZSTD_pthread_mutex_unlock(&job->job_mutex); + if (ZSTD_isError(cSize)) JOB_ERROR(cSize); + lastCBlockSize = cSize; } } _endJob: @@ -704,7 +743,9 @@ void ZSTDMT_compressionJob(void* jobDesc ZSTDMT_releaseCCtx(job->cctxPool, cctx); /* report */ ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); - job->consumed = job->src.size; + if (ZSTD_isError(job->cSize)) assert(lastCBlockSize == 0); + job->cSize += lastCBlockSize; + job->consumed = job->src.size; /* when job->consumed == job->src.size , compression job is presumed completed */ ZSTD_pthread_cond_signal(&job->job_cond); ZSTD_pthread_mutex_unlock(&job->job_mutex); } @@ -745,9 +786,9 @@ struct ZSTDMT_CCtx_s { ZSTD_CCtx_params params; size_t targetSectionSize; size_t targetPrefixSize; - roundBuff_t roundBuff; + int jobReady; /* 1 => one job is already prepared, but pool has shortage of workers. Don't create a new job. 
*/ inBuff_t inBuff; - int jobReady; /* 1 => one job is already prepared, but pool has shortage of workers. Don't create another one. */ + roundBuff_t roundBuff; serialState_t serial; unsigned singleBlockingThread; unsigned jobIDMask; @@ -798,6 +839,20 @@ static ZSTDMT_jobDescription* ZSTDMT_cre return jobTable; } +static size_t ZSTDMT_expandJobsTable (ZSTDMT_CCtx* mtctx, U32 nbWorkers) { + U32 nbJobs = nbWorkers + 2; + if (nbJobs > mtctx->jobIDMask+1) { /* need more job capacity */ + ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem); + mtctx->jobIDMask = 0; + mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, mtctx->cMem); + if (mtctx->jobs==NULL) return ERROR(memory_allocation); + assert((nbJobs != 0) && ((nbJobs & (nbJobs - 1)) == 0)); /* ensure nbJobs is a power of 2 */ + mtctx->jobIDMask = nbJobs - 1; + } + return 0; +} + + /* ZSTDMT_CCtxParam_setNbWorkers(): * Internal use only */ size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers) @@ -875,7 +930,7 @@ static void ZSTDMT_waitForAllJobsComplet unsigned const jobID = mtctx->doneJobID & mtctx->jobIDMask; ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex); while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) { - DEBUGLOG(5, "waiting for jobCompleted signal from job %u", mtctx->doneJobID); /* we want to block when waiting for data to flush */ + DEBUGLOG(4, "waiting for jobCompleted signal from job %u", mtctx->doneJobID); /* we want to block when waiting for data to flush */ ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex); } ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex); @@ -924,6 +979,8 @@ size_t ZSTDMT_CCtxParam_setMTCtxParamete if ( (value > 0) /* value==0 => automatic job size */ & (value < ZSTDMT_JOBSIZE_MIN) ) value = ZSTDMT_JOBSIZE_MIN; + if (value > ZSTDMT_JOBSIZE_MAX) + value = ZSTDMT_JOBSIZE_MAX; params->jobSize = value; return value; case ZSTDMT_p_overlapSectionLog : @@ -950,6 +1007,21 @@ size_t ZSTDMT_setMTCtxParameter(ZSTDMT_C } } +size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned* value) +{ + switch (parameter) { + case ZSTDMT_p_jobSize: + *value = mtctx->params.jobSize; + break; + case ZSTDMT_p_overlapSectionLog: + *value = mtctx->params.overlapSizeLog; + break; + default: + return ERROR(parameter_unsupported); + } + return 0; +} + /* Sets parameters relevant to the compression job, * initializing others to default values. 
*/ static ZSTD_CCtx_params ZSTDMT_initJobCCtxParams(ZSTD_CCtx_params const params) @@ -960,13 +1032,30 @@ static ZSTD_CCtx_params ZSTDMT_initJobCC jobParams.cParams = params.cParams; jobParams.fParams = params.fParams; jobParams.compressionLevel = params.compressionLevel; - jobParams.disableLiteralCompression = params.disableLiteralCompression; return jobParams; } + +/* ZSTDMT_resize() : + * @return : error code if fails, 0 on success */ +static size_t ZSTDMT_resize(ZSTDMT_CCtx* mtctx, unsigned nbWorkers) +{ + if (POOL_resize(mtctx->factory, nbWorkers)) return ERROR(memory_allocation); + CHECK_F( ZSTDMT_expandJobsTable(mtctx, nbWorkers) ); + mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, nbWorkers); + if (mtctx->bufPool == NULL) return ERROR(memory_allocation); + mtctx->cctxPool = ZSTDMT_expandCCtxPool(mtctx->cctxPool, nbWorkers); + if (mtctx->cctxPool == NULL) return ERROR(memory_allocation); + mtctx->seqPool = ZSTDMT_expandSeqPool(mtctx->seqPool, nbWorkers); + if (mtctx->seqPool == NULL) return ERROR(memory_allocation); + ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers); + return 0; +} + + /*! ZSTDMT_updateCParams_whileCompressing() : - * Updates only a selected set of compression parameters, to remain compatible with current frame. + * Updates a selected set of compression parameters, remaining compatible with currently active frame. * New parameters will be applied to next compression job. */ void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams) { @@ -981,38 +1070,36 @@ void ZSTDMT_updateCParams_whileCompressi } } -/* ZSTDMT_getNbWorkers(): - * @return nb threads currently active in mtctx. - * mtctx must be valid */ -unsigned ZSTDMT_getNbWorkers(const ZSTDMT_CCtx* mtctx) -{ - assert(mtctx != NULL); - return mtctx->params.nbWorkers; -} - /* ZSTDMT_getFrameProgression(): * tells how much data has been consumed (input) and produced (output) for current frame. * able to count progression inside worker threads. - * Note : mutex will be acquired during statistics collection. */ + * Note : mutex will be acquired during statistics collection inside workers. */ ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx) { ZSTD_frameProgression fps; - DEBUGLOG(6, "ZSTDMT_getFrameProgression"); + DEBUGLOG(5, "ZSTDMT_getFrameProgression"); + fps.ingested = mtctx->consumed + mtctx->inBuff.filled; fps.consumed = mtctx->consumed; - fps.produced = mtctx->produced; - fps.ingested = mtctx->consumed + mtctx->inBuff.filled; + fps.produced = fps.flushed = mtctx->produced; + fps.currentJobID = mtctx->nextJobID; + fps.nbActiveWorkers = 0; { unsigned jobNb; unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1); DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)", mtctx->doneJobID, lastJobNb, mtctx->jobReady) for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) { unsigned const wJobID = jobNb & mtctx->jobIDMask; - ZSTD_pthread_mutex_lock(&mtctx->jobs[wJobID].job_mutex); - { size_t const cResult = mtctx->jobs[wJobID].cSize; + ZSTDMT_jobDescription* jobPtr = &mtctx->jobs[wJobID]; + ZSTD_pthread_mutex_lock(&jobPtr->job_mutex); + { size_t const cResult = jobPtr->cSize; size_t const produced = ZSTD_isError(cResult) ? 0 : cResult; - fps.consumed += mtctx->jobs[wJobID].consumed; - fps.ingested += mtctx->jobs[wJobID].src.size; + size_t const flushed = ZSTD_isError(cResult) ? 
0 : jobPtr->dstFlushed; + assert(flushed <= produced); + fps.ingested += jobPtr->src.size; + fps.consumed += jobPtr->consumed; fps.produced += produced; + fps.flushed += flushed; + fps.nbActiveWorkers += (jobPtr->consumed < jobPtr->src.size); } ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); } @@ -1021,6 +1108,34 @@ ZSTD_frameProgression ZSTDMT_getFramePro } + +size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx) +{ + size_t toFlush; + unsigned const jobID = mtctx->doneJobID; + assert(jobID <= mtctx->nextJobID); + if (jobID == mtctx->nextJobID) return 0; /* no active job => nothing to flush */ + + /* look into oldest non-fully-flushed job */ + { unsigned const wJobID = jobID & mtctx->jobIDMask; + ZSTDMT_jobDescription* const jobPtr = &mtctx->jobs[wJobID]; + ZSTD_pthread_mutex_lock(&jobPtr->job_mutex); + { size_t const cResult = jobPtr->cSize; + size_t const produced = ZSTD_isError(cResult) ? 0 : cResult; + size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed; + assert(flushed <= produced); + toFlush = produced - flushed; + if (toFlush==0 && (jobPtr->consumed >= jobPtr->src.size)) { + /* doneJobID is not-fully-flushed, but toFlush==0 : doneJobID should be compressing some more data */ + assert(jobPtr->consumed < jobPtr->src.size); + } + } + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + } + + return toFlush; +} + + /* ------------------------------------------ */ /* ===== Multi-threaded compression ===== */ /* ------------------------------------------ */ @@ -1087,18 +1202,10 @@ static size_t ZSTDMT_compress_advanced_i assert(avgJobSize >= 256 KB); /* condition for ZSTD_compressBound(A) + ZSTD_compressBound(B) <= ZSTD_compressBound(A+B), required to compress directly into Dst (no additional buffer) */ ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(avgJobSize) ); - if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params)) + if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, avgJobSize)) return ERROR(memory_allocation); - if (nbJobs > mtctx->jobIDMask+1) { /* enlarge job table */ - U32 jobsTableSize = nbJobs; - ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem); - mtctx->jobIDMask = 0; - mtctx->jobs = ZSTDMT_createJobsTable(&jobsTableSize, mtctx->cMem); - if (mtctx->jobs==NULL) return ERROR(memory_allocation); - assert((jobsTableSize != 0) && ((jobsTableSize & (jobsTableSize - 1)) == 0)); /* ensure jobsTableSize is a power of 2 */ - mtctx->jobIDMask = jobsTableSize - 1; - } + CHECK_F( ZSTDMT_expandJobsTable(mtctx, nbJobs) ); /* only expands if necessary */ { unsigned u; for (u=0; ucctxPool->totalCCtx, params.disableLiteralCompression); - /* params are supposed to be fully validated at this point */ + DEBUGLOG(4, "ZSTDMT_initCStream_internal (pledgedSrcSize=%u, nbWorkers=%u, cctxPool=%u)", + (U32)pledgedSrcSize, params.nbWorkers, mtctx->cctxPool->totalCCtx); + + /* params supposed partially validated at this point */ assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); assert(!((dict) && (cdict))); /* either dict or cdict, not both */ - assert(mtctx->cctxPool->totalCCtx == params.nbWorkers); /* init */ - if (params.jobSize == 0) { - params.jobSize = 1U << ZSTDMT_computeTargetJobLog(params); - } + if (params.nbWorkers != mtctx->params.nbWorkers) + CHECK_F( ZSTDMT_resize(mtctx, params.nbWorkers) ); + + if (params.jobSize > 0 && params.jobSize < ZSTDMT_JOBSIZE_MIN) params.jobSize = ZSTDMT_JOBSIZE_MIN; if (params.jobSize > ZSTDMT_JOBSIZE_MAX) params.jobSize = ZSTDMT_JOBSIZE_MAX;
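/* Illustration (a sketch, not part of the patch) of the two clamps just
 * above, assuming ZSTDMT_JOBSIZE_MIN is 1 MB as defined in this version's
 * zstdmt_compress.h:
 *   params.jobSize = 64 KB  => raised to ZSTDMT_JOBSIZE_MIN (1 MB);
 *   params.jobSize = 0      => left at 0 ("automatic"), resolved further
 *                              down from ZSTDMT_computeTargetJobLog();
 *   params.jobSize > ZSTDMT_JOBSIZE_MAX => lowered to ZSTDMT_JOBSIZE_MAX. */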
mtctx->singleBlockingThread = (pledgedSrcSize <= ZSTDMT_JOBSIZE_MIN); /* do not trigger multi-threading when srcSize is too small */ @@ -1270,7 +1378,9 @@ size_t ZSTDMT_initCStream_internal( mtctx->targetPrefixSize = (size_t)1 << ZSTDMT_computeOverlapLog(params); DEBUGLOG(4, "overlapLog=%u => %u KB", params.overlapSizeLog, (U32)(mtctx->targetPrefixSize>>10)); mtctx->targetSectionSize = params.jobSize; - if (mtctx->targetSectionSize < ZSTDMT_JOBSIZE_MIN) mtctx->targetSectionSize = ZSTDMT_JOBSIZE_MIN; + if (mtctx->targetSectionSize == 0) { + mtctx->targetSectionSize = 1ULL << ZSTDMT_computeTargetJobLog(params); + } if (mtctx->targetSectionSize < mtctx->targetPrefixSize) mtctx->targetSectionSize = mtctx->targetPrefixSize; /* job size must be >= overlap size */ DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), params.jobSize); DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10)); @@ -1312,7 +1422,7 @@ size_t ZSTDMT_initCStream_internal( mtctx->allJobsCompleted = 0; mtctx->consumed = 0; mtctx->produced = 0; - if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params)) + if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, mtctx->targetSectionSize)) return ERROR(memory_allocation); return 0; } @@ -1420,7 +1530,7 @@ static size_t ZSTDMT_createCompressionJo mtctx->jobs[jobID].jobID = mtctx->nextJobID; mtctx->jobs[jobID].firstJob = (mtctx->nextJobID==0); mtctx->jobs[jobID].lastJob = endFrame; - mtctx->jobs[jobID].frameChecksumNeeded = endFrame && (mtctx->nextJobID>0) && mtctx->params.fParams.checksumFlag; + mtctx->jobs[jobID].frameChecksumNeeded = mtctx->params.fParams.checksumFlag && endFrame && (mtctx->nextJobID>0); mtctx->jobs[jobID].dstFlushed = 0; /* Update the round buffer pos and clear the input buffer to be reset */ @@ -1468,6 +1578,8 @@ static size_t ZSTDMT_createCompressionJo /*! ZSTDMT_flushProduced() : + * flush whatever data has been produced but not yet flushed in current job. + * move to next job if current one is fully flushed. * `output` : `pos` will be updated with amount of data flushed . * `blockToFlush` : if >0, the function will block and wait if there is no data available to flush . 
* @return : amount of data remaining within internal buffer, 0 if no more, 1 if unknown but > 0, or an error code */ @@ -1496,7 +1608,7 @@ static size_t ZSTDMT_flushProduced(ZSTDM /* try to flush something */ { size_t cSize = mtctx->jobs[wJobID].cSize; /* shared */ size_t const srcConsumed = mtctx->jobs[wJobID].consumed; /* shared */ - size_t const srcSize = mtctx->jobs[wJobID].src.size; /* read-only, could be done after mutex lock, but no-declaration-after-statement */ + size_t const srcSize = mtctx->jobs[wJobID].src.size; /* read-only, could be done after mutex lock, but no-declaration-after-statement */ ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); if (ZSTD_isError(cSize)) { DEBUGLOG(5, "ZSTDMT_flushProduced: job %u : compression error detected : %s", @@ -1516,6 +1628,7 @@ static size_t ZSTDMT_flushProduced(ZSTDM mtctx->jobs[wJobID].cSize += 4; /* can write this shared value, as worker is no longer active */ mtctx->jobs[wJobID].frameChecksumNeeded = 0; } + if (cSize > 0) { /* compression is ongoing or completed */ size_t const toFlush = MIN(cSize - mtctx->jobs[wJobID].dstFlushed, output->size - output->pos); DEBUGLOG(5, "ZSTDMT_flushProduced: Flushing %u bytes from job %u (completion:%u/%u, generated:%u)", @@ -1529,11 +1642,12 @@ static size_t ZSTDMT_flushProduced(ZSTDM output->pos += toFlush; mtctx->jobs[wJobID].dstFlushed += toFlush; /* can write : this value is only used by mtctx */ - if ( (srcConsumed == srcSize) /* job completed */ + if ( (srcConsumed == srcSize) /* job is completed */ && (mtctx->jobs[wJobID].dstFlushed == cSize) ) { /* output buffer fully flushed => free this job position */ DEBUGLOG(5, "Job %u completed (%u bytes), moving to next one", mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed); ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[wJobID].dstBuff); + DEBUGLOG(5, "dstBuffer released"); mtctx->jobs[wJobID].dstBuff = g_nullBuffer; mtctx->jobs[wJobID].cSize = 0; /* ensure this job slot is considered "not started" in future check */ mtctx->consumed += srcSize; @@ -1610,6 +1724,7 @@ static int ZSTDMT_doesOverlapWindow(buff range_t extDict; range_t prefix; + DEBUGLOG(5, "ZSTDMT_doesOverlapWindow"); extDict.start = window.dictBase + window.lowLimit; extDict.size = window.dictLimit - window.lowLimit; @@ -1630,12 +1745,13 @@ static void ZSTDMT_waitForLdmComplete(ZS { if (mtctx->params.ldmParams.enableLdm) { ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex; + DEBUGLOG(5, "ZSTDMT_waitForLdmComplete"); DEBUGLOG(5, "source [0x%zx, 0x%zx)", (size_t)buffer.start, (size_t)buffer.start + buffer.capacity); ZSTD_PTHREAD_MUTEX_LOCK(mutex); while (ZSTDMT_doesOverlapWindow(buffer, mtctx->serial.ldmWindow)) { - DEBUGLOG(6, "Waiting for LDM to finish..."); + DEBUGLOG(5, "Waiting for LDM to finish..."); ZSTD_pthread_cond_wait(&mtctx->serial.ldmWindowCond, mutex); } DEBUGLOG(6, "Done waiting for LDM to finish"); @@ -1655,6 +1771,7 @@ static int ZSTDMT_tryGetInputRange(ZSTDM size_t const target = mtctx->targetSectionSize; buffer_t buffer; + DEBUGLOG(5, "ZSTDMT_tryGetInputRange"); assert(mtctx->inBuff.buffer.start == NULL); assert(mtctx->roundBuff.capacity >= target); @@ -1668,7 +1785,7 @@ static int ZSTDMT_tryGetInputRange(ZSTDM buffer.start = start; buffer.capacity = prefixSize; if (ZSTDMT_isOverlapped(buffer, inUse)) { - DEBUGLOG(6, "Waiting for buffer..."); + DEBUGLOG(5, "Waiting for buffer..."); return 0; } ZSTDMT_waitForLdmComplete(mtctx, buffer); @@ -1680,7 +1797,7 @@ static int ZSTDMT_tryGetInputRange(ZSTDM buffer.capacity = target; if 
(ZSTDMT_isOverlapped(buffer, inUse)) { - DEBUGLOG(6, "Waiting for buffer..."); + DEBUGLOG(5, "Waiting for buffer..."); return 0; } assert(!ZSTDMT_isOverlapped(buffer, mtctx->inBuff.prefix)); @@ -1753,8 +1870,10 @@ size_t ZSTDMT_compressStream_generic(ZST /* It is only possible for this operation to fail if there are * still compression jobs ongoing. */ + DEBUGLOG(5, "ZSTDMT_tryGetInputRange failed"); assert(mtctx->doneJobID != mtctx->nextJobID); - } + } else + DEBUGLOG(5, "ZSTDMT_tryGetInputRange completed successfully : mtctx->inBuff.buffer.start = %p", mtctx->inBuff.buffer.start); } if (mtctx->inBuff.buffer.start != NULL) { size_t const toLoad = MIN(input->size - input->pos, mtctx->targetSectionSize - mtctx->inBuff.filled); @@ -1782,6 +1901,7 @@ size_t ZSTDMT_compressStream_generic(ZST /* check for potential compressed data ready to be flushed */ { size_t const remainingToFlush = ZSTDMT_flushProduced(mtctx, output, !forwardInputProgress, endOp); /* block if there was no forward input progress */ if (input->pos < input->size) return MAX(remainingToFlush, 1); /* input not consumed : do not end flush yet */ + DEBUGLOG(5, "end of ZSTDMT_compressStream_generic: remainingToFlush = %u", (U32)remainingToFlush); return remainingToFlush; } } diff --git a/contrib/python-zstandard/zstd/compress/zstdmt_compress.h b/contrib/python-zstandard/zstd/compress/zstdmt_compress.h --- a/contrib/python-zstandard/zstd/compress/zstdmt_compress.h +++ b/contrib/python-zstandard/zstd/compress/zstdmt_compress.h @@ -95,6 +95,11 @@ typedef enum { * @return : 0, or an error code (which can be tested using ZSTD_isError()) */ ZSTDLIB_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned value); +/* ZSTDMT_getMTCtxParameter() : + * Query the ZSTDMT_CCtx for a parameter value. + * @return : 0, or an error code (which can be tested using ZSTD_isError()) */ +ZSTDLIB_API size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned* value); + /*! ZSTDMT_compressStream_generic() : * Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream() @@ -114,11 +119,21 @@ ZSTDLIB_API size_t ZSTDMT_compressStream * === Not exposed in libzstd. Never invoke directly === * ======================================================== */ + /*! ZSTDMT_toFlushNow() + * Tell how many bytes are ready to be flushed immediately. + * Probe the oldest active job (not yet entirely flushed) and check its output buffer. + * If return 0, it means there is no active job, + * or, it means oldest job is still active, but everything produced has been flushed so far, + * therefore flushing is limited by speed of oldest job. */ +size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx); + +/*! ZSTDMT_CCtxParam_setMTCtxParameter() + * like ZSTDMT_setMTCtxParameter(), but into a ZSTD_CCtx_Params */ size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, unsigned value); -/* ZSTDMT_CCtxParam_setNbWorkers() - * Set nbWorkers, and clamp it. - * Also reset jobSize and overlapLog */ +/*! ZSTDMT_CCtxParam_setNbWorkers() + * Set nbWorkers, and clamp it. + * Also reset jobSize and overlapLog */ size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers); /*! ZSTDMT_updateCParams_whileCompressing() : @@ -126,14 +141,9 @@ size_t ZSTDMT_CCtxParam_setNbWorkers(ZST * New parameters will be applied to next compression job. 
*/ void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams); -/* ZSTDMT_getNbWorkers(): - * @return nb threads currently active in mtctx. - * mtctx must be valid */ -unsigned ZSTDMT_getNbWorkers(const ZSTDMT_CCtx* mtctx); - -/* ZSTDMT_getFrameProgression(): - * tells how much data has been consumed (input) and produced (output) for current frame. - * able to count progression inside worker threads. +/*! ZSTDMT_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for current frame. + * able to count progression inside worker threads. */ ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx); diff --git a/contrib/python-zstandard/zstd/decompress/huf_decompress.c b/contrib/python-zstandard/zstd/decompress/huf_decompress.c --- a/contrib/python-zstandard/zstd/decompress/huf_decompress.c +++ b/contrib/python-zstandard/zstd/decompress/huf_decompress.c @@ -1,6 +1,7 @@ /* ****************************************************************** - Huffman decoder, part of New Generation Entropy library - Copyright (C) 2013-2016, Yann Collet. + huff0 huffman decoder, + part of Finite State Entropy library + Copyright (C) 2013-present, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) @@ -29,16 +30,15 @@ You can contact the author at : - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy - - Public forum : https://groups.google.com/forum/#!forum/lz4c ****************************************************************** */ /* ************************************************************** * Dependencies ****************************************************************/ #include <string.h> /* memcpy, memset */ +#include "compiler.h" #include "bitstream.h" /* BIT_* */ -#include "compiler.h" -#include "fse.h" /* header compression */ +#include "fse.h" /* to compress headers */ #define HUF_STATIC_LINKING_ONLY #include "huf.h" #include "error_private.h" @@ -48,7 +48,6 @@ * Error Management ****************************************************************/ #define HUF_isError ERR_isError -#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ #define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; } @@ -75,15 +74,15 @@ static DTableDesc HUF_getDTableDesc(cons /*-***************************/ /* single-symbol decoding */ /*-***************************/ -typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2; /* single-symbol decoding */ +typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */ -size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) { U32 tableLog = 0; U32 nbSymbols = 0; size_t iSize; void* const dtPtr = DTable + 1; - HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; + HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr; U32* rankVal; BYTE* huffWeight; @@ -96,7 +95,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); - HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... 
*/ iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize); @@ -124,7 +123,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* U32 const w = huffWeight[n]; U32 const length = (1 << w) >> 1; U32 u; - HUF_DEltX2 D; + HUF_DEltX1 D; D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w); for (u = rankVal[w]; u < rankVal[w] + length; u++) dt[u] = D; @@ -134,17 +133,15 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* return iSize; } -size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize) +size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize) { U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_readDTableX2_wksp(DTable, src, srcSize, + return HUF_readDTableX1_wksp(DTable, src, srcSize, workSpace, sizeof(workSpace)); } -typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4; /* double-symbols decoding */ - FORCE_INLINE_TEMPLATE BYTE -HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog) +HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ BYTE const c = dt[val].byte; @@ -152,44 +149,44 @@ HUF_decodeSymbolX2(BIT_DStream_t* Dstrea return c; } -#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ - *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog) +#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ + *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) -#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ - HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) -#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ +#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ if (MEM_64bits()) \ - HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) HINT_INLINE size_t -HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog) +HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) { BYTE* const pStart = p; /* up to 4 symbols at a time */ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_1(p, bitDPtr); - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_1(p, bitDPtr); + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); } /* [0-3] symbols remaining */ if (MEM_32bits()) while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd)) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); /* no more data to retrieve from bitstream, no need to reload */ while (p < pEnd) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); return pEnd-pStart; } FORCE_INLINE_TEMPLATE size_t -HUF_decompress1X2_usingDTable_internal_body( +HUF_decompress1X1_usingDTable_internal_body( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable) @@ -197,14 +194,14 @@ HUF_decompress1X2_usingDTable_internal_b BYTE* op = (BYTE*)dst; BYTE* const oend = op + dstSize; const void* dtPtr = DTable + 1; - const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; BIT_DStream_t bitD; DTableDesc const dtd = HUF_getDTableDesc(DTable); U32 
const dtLog = dtd.tableLog; CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); - HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog); + HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog); if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); @@ -212,7 +209,7 @@ HUF_decompress1X2_usingDTable_internal_b } FORCE_INLINE_TEMPLATE size_t -HUF_decompress4X2_usingDTable_internal_body( +HUF_decompress4X1_usingDTable_internal_body( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable) @@ -224,7 +221,7 @@ HUF_decompress4X2_usingDTable_internal_b BYTE* const ostart = (BYTE*) dst; BYTE* const oend = ostart + dstSize; const void* const dtPtr = DTable + 1; - const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; /* Init */ BIT_DStream_t bitD1; @@ -260,22 +257,22 @@ HUF_decompress4X2_usingDTable_internal_b /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) { - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); - HUF_DECODE_SYMBOLX2_1(op2, &bitD2); - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_1(op1, &bitD1); + HUF_DECODE_SYMBOLX1_1(op2, &bitD2); + HUF_DECODE_SYMBOLX1_1(op3, &bitD3); + HUF_DECODE_SYMBOLX1_1(op4, &bitD4); + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_0(op1, &bitD1); + HUF_DECODE_SYMBOLX1_0(op2, &bitD2); + HUF_DECODE_SYMBOLX1_0(op3, &bitD3); + HUF_DECODE_SYMBOLX1_0(op4, &bitD4); BIT_reloadDStream(&bitD1); BIT_reloadDStream(&bitD2); BIT_reloadDStream(&bitD3); @@ -291,191 +288,10 @@ HUF_decompress4X2_usingDTable_internal_b /* note : op4 supposed already verified within main loop */ /* finish bitStreams one by one */ - HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog); - HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog); - HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog); - HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); - - /* check */ - { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); - if (!endCheck) return ERROR(corruption_detected); } - - /* decoded size */ - return dstSize; - } -} - - -FORCE_INLINE_TEMPLATE U32 -HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog) -{ - size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - memcpy(op, dt+val, 2); - BIT_skipBits(DStream, dt[val].nbBits); - return dt[val].length; -} - -FORCE_INLINE_TEMPLATE U32 -HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog) -{ - size_t const val = 
BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - memcpy(op, dt+val, 1); - if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); - else { - if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { - BIT_skipBits(DStream, dt[val].nbBits); - if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) - /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ - DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); - } } - return 1; -} - -#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \ - ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) - -#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \ - if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ - ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) - -#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \ - if (MEM_64bits()) \ - ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) - -HINT_INLINE size_t -HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, - const HUF_DEltX4* const dt, const U32 dtLog) -{ - BYTE* const pStart = p; - - /* up to 8 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { - HUF_DECODE_SYMBOLX4_2(p, bitDPtr); - HUF_DECODE_SYMBOLX4_1(p, bitDPtr); - HUF_DECODE_SYMBOLX4_2(p, bitDPtr); - HUF_DECODE_SYMBOLX4_0(p, bitDPtr); - } - - /* closer to end : up to 2 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) - HUF_DECODE_SYMBOLX4_0(p, bitDPtr); - - while (p <= pEnd-2) - HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ - - if (p < pEnd) - p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog); - - return p-pStart; -} - -FORCE_INLINE_TEMPLATE size_t -HUF_decompress1X4_usingDTable_internal_body( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - BIT_DStream_t bitD; - - /* Init */ - CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); - - /* decode */ - { BYTE* const ostart = (BYTE*) dst; - BYTE* const oend = ostart + dstSize; - const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ - const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr; - DTableDesc const dtd = HUF_getDTableDesc(DTable); - HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog); - } - - /* check */ - if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); - - /* decoded size */ - return dstSize; -} - - -FORCE_INLINE_TEMPLATE size_t -HUF_decompress4X4_usingDTable_internal_body( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ - - { const BYTE* const istart = (const BYTE*) cSrc; - BYTE* const ostart = (BYTE*) dst; - BYTE* const oend = ostart + dstSize; - const void* const dtPtr = DTable+1; - const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr; - - /* Init */ - BIT_DStream_t bitD1; - BIT_DStream_t bitD2; - BIT_DStream_t bitD3; - BIT_DStream_t bitD4; - size_t const length1 = MEM_readLE16(istart); - size_t const length2 = MEM_readLE16(istart+2); - size_t const length3 = MEM_readLE16(istart+4); - size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); - const BYTE* const istart1 = istart + 6; /* jumpTable */ - const BYTE* const istart2 = istart1 + length1; - const BYTE* const istart3 = istart2 + length2; - const BYTE* const istart4 = istart3 + 
length3; - size_t const segmentSize = (dstSize+3) / 4; - BYTE* const opStart2 = ostart + segmentSize; - BYTE* const opStart3 = opStart2 + segmentSize; - BYTE* const opStart4 = opStart3 + segmentSize; - BYTE* op1 = ostart; - BYTE* op2 = opStart2; - BYTE* op3 = opStart3; - BYTE* op4 = opStart4; - U32 endSignal; - DTableDesc const dtd = HUF_getDTableDesc(DTable); - U32 const dtLog = dtd.tableLog; - - if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ - CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); - CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); - CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); - CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); - - /* 16-32 symbols per loop (4-8 symbols per stream) */ - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); - for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) { - HUF_DECODE_SYMBOLX4_2(op1, &bitD1); - HUF_DECODE_SYMBOLX4_2(op2, &bitD2); - HUF_DECODE_SYMBOLX4_2(op3, &bitD3); - HUF_DECODE_SYMBOLX4_2(op4, &bitD4); - HUF_DECODE_SYMBOLX4_1(op1, &bitD1); - HUF_DECODE_SYMBOLX4_1(op2, &bitD2); - HUF_DECODE_SYMBOLX4_1(op3, &bitD3); - HUF_DECODE_SYMBOLX4_1(op4, &bitD4); - HUF_DECODE_SYMBOLX4_2(op1, &bitD1); - HUF_DECODE_SYMBOLX4_2(op2, &bitD2); - HUF_DECODE_SYMBOLX4_2(op3, &bitD3); - HUF_DECODE_SYMBOLX4_2(op4, &bitD4); - HUF_DECODE_SYMBOLX4_0(op1, &bitD1); - HUF_DECODE_SYMBOLX4_0(op2, &bitD2); - HUF_DECODE_SYMBOLX4_0(op3, &bitD3); - HUF_DECODE_SYMBOLX4_0(op4, &bitD4); - - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); - } - - /* check corruption */ - if (op1 > opStart2) return ERROR(corruption_detected); - if (op2 > opStart3) return ERROR(corruption_detected); - if (op3 > opStart4) return ERROR(corruption_detected); - /* note : op4 already verified within main loop */ - - /* finish bitStreams one by one */ - HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog); - HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog); - HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog); - HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog); + HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX1(op4, &bitD4, oend, dt, dtLog); /* check */ { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); @@ -493,7 +309,7 @@ typedef size_t (*HUF_decompress_usingDTa const HUF_DTable *DTable); #if DYNAMIC_BMI2 -#define X(fn) \ +#define HUF_DGEN(fn) \ \ static size_t fn##_default( \ void* dst, size_t dstSize, \ @@ -522,7 +338,7 @@ typedef size_t (*HUF_decompress_usingDTa #else -#define X(fn) \ +#define HUF_DGEN(fn) \ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ { \ @@ -532,112 +348,114 @@ typedef size_t (*HUF_decompress_usingDTa #endif -X(HUF_decompress1X2_usingDTable_internal) -X(HUF_decompress4X2_usingDTable_internal) -X(HUF_decompress1X4_usingDTable_internal) -X(HUF_decompress4X4_usingDTable_internal) +HUF_DGEN(HUF_decompress1X1_usingDTable_internal) +HUF_DGEN(HUF_decompress4X1_usingDTable_internal) -#undef X -size_t HUF_decompress1X2_usingDTable( +size_t HUF_decompress1X1_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) 
+{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) return ERROR(GENERIC); + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +} + + +size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX); + return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize); +} + +size_t HUF_decompress4X1_usingDTable( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable) { DTableDesc dtd = HUF_getDTableDesc(DTable); if (dtd.tableType != 0) return ERROR(GENERIC); - return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -} - -size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize) -{ - const BYTE* ip = (const BYTE*) cSrc; - - size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); - if (HUF_isError(hSize)) return hSize; - if (hSize >= cSrcSize) return ERROR(srcSize_wrong); - ip += hSize; cSrcSize -= hSize; - - return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); + return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); } - -size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize) -{ - U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, - workSpace, sizeof(workSpace)); -} - -size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) -{ - HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); - return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize); -} - -size_t HUF_decompress4X2_usingDTable( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc dtd = HUF_getDTableDesc(DTable); - if (dtd.tableType != 0) return ERROR(GENERIC); - return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -} - -static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, +static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) { const BYTE* ip = (const BYTE*) cSrc; - size_t const hSize = HUF_readDTableX2_wksp (dctx, cSrc, cSrcSize, + size_t const hSize = HUF_readDTableX1_wksp (dctx, cSrc, cSrcSize, workSpace, wkspSize); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return 
ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); + return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); } -size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize) { - return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); + return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); } -size_t HUF_decompress4X2_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) { U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, sizeof(workSpace)); } -size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) { - HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); - return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); + HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX); + return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); } /* *************************/ /* double-symbols decoding */ /* *************************/ -typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; -/* HUF_fillDTableX4Level2() : +typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ +typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; +typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; +typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; + + +/* HUF_fillDTableX2Level2() : * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ -static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed, +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed, const U32* rankValOrigin, const int minWeight, const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, U32 nbBitsBaseline, U16 baseSeq) { - HUF_DEltX4 DElt; + HUF_DEltX2 DElt; U32 rankVal[HUF_TABLELOG_MAX + 1]; /* get pre-calculated rankVal */ @@ -672,10 +490,8 @@ static void HUF_fillDTableX4Level2(HUF_D } } } -typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; -typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; -static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog, +static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, const sortedSymbol_t* sortedList, const U32 sortedListSize, const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, const U32 nbBitsBaseline) @@ -700,12 +516,12 @@ static void HUF_fillDTableX4(HUF_DEltX4* int minWeight = nbBits + scaleLog; if (minWeight < 1) minWeight = 1; sortedRank = rankStart[minWeight]; - HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits, + HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits, rankValOrigin[nbBits], minWeight, sortedList+sortedRank, sortedListSize-sortedRank, nbBitsBaseline, symbol); } else { - HUF_DEltX4 DElt; + HUF_DEltX2 DElt; MEM_writeLE16(&(DElt.sequence), symbol); DElt.nbBits = (BYTE)(nbBits); DElt.length = 
1; @@ -717,16 +533,16 @@ static void HUF_fillDTableX4(HUF_DEltX4* } } -size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src, - size_t srcSize, void* workSpace, - size_t wkspSize) +size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize) { U32 tableLog, maxW, sizeOfSort, nbSymbols; DTableDesc dtd = HUF_getDTableDesc(DTable); U32 const maxTableLog = dtd.maxTableLog; size_t iSize; void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ - HUF_DEltX4* const dt = (HUF_DEltX4*)dtPtr; + HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; U32 *rankStart; rankValCol_t* rankVal; @@ -752,7 +568,7 @@ size_t HUF_readDTableX4_wksp(HUF_DTable* rankStart = rankStart0 + 1; memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1)); - HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */ @@ -806,7 +622,7 @@ size_t HUF_readDTableX4_wksp(HUF_DTable* rankValPtr[w] = rankVal0[w] >> consumed; } } } } - HUF_fillDTableX4(dt, maxTableLog, + HUF_fillDTableX2(dt, maxTableLog, sortedSymbol, sizeOfSort, rankStart0, rankVal, maxW, tableLog+1); @@ -817,112 +633,296 @@ size_t HUF_readDTableX4_wksp(HUF_DTable* return iSize; } -size_t HUF_readDTableX4(HUF_DTable* DTable, const void* src, size_t srcSize) +size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize) { U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_readDTableX4_wksp(DTable, src, srcSize, + return HUF_readDTableX2_wksp(DTable, src, srcSize, workSpace, sizeof(workSpace)); } -size_t HUF_decompress1X4_usingDTable( + +FORCE_INLINE_TEMPLATE U32 +HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt+val, 2); + BIT_skipBits(DStream, dt[val].nbBits); + return dt[val].length; +} + +FORCE_INLINE_TEMPLATE U32 +HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt+val, 1); + if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); + else { + if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { + BIT_skipBits(DStream, dt[val].nbBits); + if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) + /* ugly hack; works only because it's the last symbol. 
Note : can't easily extract nbBits from just this symbol */ + DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); + } } + return 1; +} + +#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ + if (MEM_64bits()) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +HINT_INLINE size_t +HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, + const HUF_DEltX2* const dt, const U32 dtLog) +{ + BYTE* const pStart = p; + + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + + /* closer to end : up to 2 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + + while (p <= pEnd-2) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + + if (p < pEnd) + p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); + + return p-pStart; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress1X2_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + BIT_DStream_t bitD; + + /* Init */ + CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog); + } + + /* check */ + if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; +} + + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + const void* const dtPtr = DTable+1; + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + size_t const segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + U32 endSignal; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = 
dtd.tableLog; + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); + CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) { + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + } + + /* check corruption */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) return ERROR(corruption_detected); } + + /* decoded size */ + return dstSize; + } +} + +HUF_DGEN(HUF_decompress1X2_usingDTable_internal) +HUF_DGEN(HUF_decompress4X2_usingDTable_internal) + +size_t HUF_decompress1X2_usingDTable( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable) { DTableDesc dtd = HUF_getDTableDesc(DTable); if (dtd.tableType != 1) return ERROR(GENERIC); - return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); } -size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize) { const BYTE* ip = (const BYTE*) cSrc; - size_t const hSize = HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize, + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); } -size_t HUF_decompress1X4_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, +size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) 
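A note on the 4-stream framing decoded in HUF_decompress4X2_usingDTable_internal_body() above: the compressed payload opens with a 6-byte jump table holding the first three stream sizes as little-endian 16-bit values, and the fourth size is implied by whatever remains. A sketch of the layout and the bounds math (illustrative, mirroring the reads above):

    /* | len1:LE16 | len2:LE16 | len3:LE16 | stream1 | stream2 | stream3 | stream4 | */
    size_t const len1 = MEM_readLE16(istart);
    size_t const len2 = MEM_readLE16(istart + 2);
    size_t const len3 = MEM_readLE16(istart + 4);
    size_t const len4 = cSrcSize - (len1 + len2 + len3 + 6); /* implied; may wrap
                                                              * around, hence the
                                                              * length4 > cSrcSize
                                                              * corruption check */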
{ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_decompress1X4_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, + return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, workSpace, sizeof(workSpace)); } -size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) { - HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX); - return HUF_decompress1X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); + HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); + return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); } -size_t HUF_decompress4X4_usingDTable( +size_t HUF_decompress4X2_usingDTable( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable) { DTableDesc dtd = HUF_getDTableDesc(DTable); if (dtd.tableType != 1) return ERROR(GENERIC); - return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); } -static size_t HUF_decompress4X4_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, +static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) { const BYTE* ip = (const BYTE*) cSrc; - size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize, + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); } -size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize) { - return HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); + return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); } -size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, +size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) { U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, sizeof(workSpace)); } -size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) { - HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX); - return HUF_decompress4X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); + HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); + return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); } -/* ********************************/ -/* Generic decompression selector */ -/* ********************************/ +/* ***********************************/ +/* Universal decompression selectors */ +/* ***********************************/ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* 
DTable) { DTableDesc const dtd = HUF_getDTableDesc(DTable); - return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : - HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); + return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : + HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); } size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, @@ -930,8 +930,8 @@ size_t HUF_decompress4X_usingDTable(void const HUF_DTable* DTable) { DTableDesc const dtd = HUF_getDTableDesc(DTable); - return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : - HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); + return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : + HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); } @@ -960,12 +960,12 @@ static const algo_time_t algoTime[16 /* /** HUF_selectDecoder() : * Tells which decoder is likely to decode faster, * based on a set of pre-computed metrics. - * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . + * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 . * Assumption : 0 < dstSize <= 128 KB */ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) { assert(dstSize > 0); - assert(dstSize <= 128 KB); + assert(dstSize <= 128*1024); /* decoder timing evaluation */ { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */ U32 const D256 = (U32)(dstSize >> 8); @@ -980,7 +980,7 @@ typedef size_t (*decompressionAlgo)(void size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) { - static const decompressionAlgo decompress[2] = { HUF_decompress4X2, HUF_decompress4X4 }; + static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 }; /* validation checks */ if (dstSize == 0) return ERROR(dstSize_tooSmall); @@ -1002,8 +1002,8 @@ size_t HUF_decompress4X_DCtx (HUF_DTable if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); - return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : - HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ; + return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : + HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ; } } @@ -1025,8 +1025,8 @@ size_t HUF_decompress4X_hufOnly_wksp(HUF if (cSrcSize == 0) return ERROR(corruption_detected); { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); - return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize): - HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); + return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize): + HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); } } @@ -1041,9 +1041,9 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DT if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); - return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc, + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize): - HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); } } @@ -1060,27 +1060,27 @@ size_t HUF_decompress1X_DCtx(HUF_DTable* size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) { DTableDesc const dtd = HUF_getDTableDesc(DTable); - return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : - HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : + HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); } -size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) { const BYTE* ip = (const BYTE*) cSrc; - size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize); + size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); } size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) { DTableDesc const dtd = HUF_getDTableDesc(DTable); - return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : - HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : + HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); } size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) @@ -1090,7 +1090,7 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi if (cSrcSize == 0) return ERROR(corruption_detected); { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); - return algoNb ? HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : - HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : + HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); } } diff --git a/contrib/python-zstandard/zstd/decompress/zstd_decompress.c b/contrib/python-zstandard/zstd/decompress/zstd_decompress.c --- a/contrib/python-zstandard/zstd/decompress/zstd_decompress.c +++ b/contrib/python-zstandard/zstd/decompress/zstd_decompress.c @@ -40,12 +40,24 @@ # define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_DEFAULTMAX) + 1) #endif +/*! 
+ * NO_FORWARD_PROGRESS_MAX : + * maximum allowed nb of calls to ZSTD_decompressStream() and ZSTD_decompress_generic() + * without any forward progress + * (defined as: no byte read from input, and no byte flushed to output) + * before triggering an error. + */ +#ifndef ZSTD_NO_FORWARD_PROGRESS_MAX +# define ZSTD_NO_FORWARD_PROGRESS_MAX 16 +#endif + /*-******************************************************* * Dependencies *********************************************************/ #include /* memcpy, memmove, memset */ -#include "cpu.h" +#include "compiler.h" /* prefetch */ +#include "cpu.h" /* bmi2 */ #include "mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "fse.h" @@ -57,6 +69,9 @@ # include "zstd_legacy.h" #endif +static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict); +static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict); + /*-************************************* * Errors @@ -99,11 +114,10 @@ typedef struct { #define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log))) typedef struct { - ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; - ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; - ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ - U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; U32 rep[ZSTD_REP_NUM]; } ZSTD_entropyDTables_t; @@ -114,9 +128,10 @@ struct ZSTD_DCtx_s const ZSTD_seqSymbol* OFTptr; const HUF_DTable* HUFptr; ZSTD_entropyDTables_t entropy; + U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; /* space needed when building huffman tables */ const void* previousDstEnd; /* detect continuity */ - const void* base; /* start of current segment */ - const void* vBase; /* virtual start of previous segment if it was just before current one */ + const void* prefixStart; /* start of current segment */ + const void* virtualStart; /* virtual start of previous segment if it was just before current one */ const void* dictEnd; /* end of previous segment */ size_t expected; ZSTD_frameHeader fParams; @@ -127,7 +142,6 @@ struct ZSTD_DCtx_s U32 fseEntropy; XXH64_state_t xxhState; size_t headerSize; - U32 dictID; ZSTD_format_e format; const BYTE* litPtr; ZSTD_customMem customMem; @@ -136,9 +150,13 @@ struct ZSTD_DCtx_s size_t staticSize; int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. 
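To make the intent of ZSTD_NO_FORWARD_PROGRESS_MAX and the new noForwardProgress field concrete: the guard converts a caller bug that used to spin forever into a hard error. A sketch of such a loop against the public streaming API (hypothetical helper; the buffer handling is deliberately wrong):

    #include <zstd.h>

    /* Buggy drive loop: `out` is never flushed or reset, so once it fills,
     * neither input nor output advances. After ZSTD_NO_FORWARD_PROGRESS_MAX
     * such calls, ZSTD_decompressStream() now fails with dstSize_tooSmall
     * (or srcSize_wrong if it is the input that ran dry). */
    static size_t drain_all(ZSTD_DStream* ds, ZSTD_inBuffer* in,
                            void* dst, size_t dstCapacity)
    {
        ZSTD_outBuffer out = { dst, dstCapacity, 0 };
        size_t ret;
        do {
            ret = ZSTD_decompressStream(ds, &out, in);
            if (ZSTD_isError(ret)) return ret;   /* the guard trips here */
        } while (ret != 0);
        return out.pos;
    }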
*/ - /* streaming */ + /* dictionary */ ZSTD_DDict* ddictLocal; - const ZSTD_DDict* ddict; + const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */ + U32 dictID; + int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */ + + /* streaming */ ZSTD_dStreamStage streamStage; char* inBuff; size_t inBuffSize; @@ -153,6 +171,7 @@ struct ZSTD_DCtx_s U32 previousLegacyVersion; U32 legacyVersion; U32 hostageByte; + int noForwardProgress; /* workspace */ BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH]; @@ -173,7 +192,7 @@ size_t ZSTD_estimateDCtxSize(void) { ret static size_t ZSTD_startingInputLength(ZSTD_format_e format) { size_t const startingInputLength = (format==ZSTD_f_zstd1_magicless) ? - ZSTD_frameHeaderSize_prefix - ZSTD_frameIdSize : + ZSTD_frameHeaderSize_prefix - ZSTD_FRAMEIDSIZE : ZSTD_frameHeaderSize_prefix; ZSTD_STATIC_ASSERT(ZSTD_FRAMEHEADERSIZE_PREFIX >= ZSTD_FRAMEIDSIZE); /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */ @@ -188,10 +207,15 @@ static void ZSTD_initDCtx_internal(ZSTD_ dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; dctx->ddict = NULL; dctx->ddictLocal = NULL; + dctx->dictEnd = NULL; + dctx->ddictIsCold = 0; dctx->inBuff = NULL; dctx->inBuffSize = 0; dctx->outBuffSize = 0; dctx->streamStage = zdss_init; + dctx->legacyContext = NULL; + dctx->previousLegacyVersion = 0; + dctx->noForwardProgress = 0; dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); } @@ -215,8 +239,6 @@ ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_malloc(sizeof(*dctx), customMem); if (!dctx) return NULL; dctx->customMem = customMem; - dctx->legacyContext = NULL; - dctx->previousLegacyVersion = 0; ZSTD_initDCtx_internal(dctx); return dctx; } @@ -265,7 +287,7 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, c * Note 3 : Skippable Frame Identifiers are considered valid. */ unsigned ZSTD_isFrame(const void* buffer, size_t size) { - if (size < ZSTD_frameIdSize) return 0; + if (size < ZSTD_FRAMEIDSIZE) return 0; { U32 const magic = MEM_readLE32(buffer); if (magic == ZSTD_MAGICNUMBER) return 1; if ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) return 1; @@ -298,25 +320,28 @@ static size_t ZSTD_frameHeaderSize_inter /** ZSTD_frameHeaderSize() : * srcSize must be >= ZSTD_frameHeaderSize_prefix. - * @return : size of the Frame Header */ + * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) { return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1); } -/** ZSTD_getFrameHeader_internal() : +/** ZSTD_getFrameHeader_advanced() : * decode Frame Header, or require larger `srcSize`. 
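For reference while reading the ZSTD_isFrame() change above and the many ZSTD_FRAMEIDSIZE call sites below: a zstd frame opens with the 4-byte little-endian magic ZSTD_MAGICNUMBER (0xFD2FB528), while skippable frames use the 16 values 0x184D2A50..0x184D2A5F, which is why they are matched under a 0xFFFFFFF0 mask. In sketch form (assumes `buf` holds at least 4 bytes):

    U32 const magic       = MEM_readLE32(buf);
    int const isZstd      = (magic == ZSTD_MAGICNUMBER);              /* 0xFD2FB528 */
    int const isSkippable = ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START);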
* note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless * @return : 0, `zfhPtr` is correctly filled, * >0, `srcSize` is too small, value is wanted `srcSize` amount, * or an error code, which can be tested using ZSTD_isError() */ -static size_t ZSTD_getFrameHeader_internal(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) +size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) { const BYTE* ip = (const BYTE*)src; size_t const minInputSize = ZSTD_startingInputLength(format); + memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers do not understand that zfhPtr is only going to be read if the return value is zero, since they are 2 different signals */ if (srcSize < minInputSize) return minInputSize; + if (src==NULL) return ERROR(GENERIC); /* invalid parameter */ if ( (format != ZSTD_f_zstd1_magicless) && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { @@ -325,7 +350,7 @@ static size_t ZSTD_getFrameHeader_intern if (srcSize < ZSTD_skippableHeaderSize) return ZSTD_skippableHeaderSize; /* magic number + frame length */ memset(zfhPtr, 0, sizeof(*zfhPtr)); - zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_frameIdSize); + zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); zfhPtr->frameType = ZSTD_skippableFrame; return 0; } @@ -394,7 +419,7 @@ static size_t ZSTD_getFrameHeader_intern * or an error code, which can be tested using ZSTD_isError() */ size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) { - return ZSTD_getFrameHeader_internal(zfhPtr, src, srcSize, ZSTD_f_zstd1); + return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); } @@ -437,7 +462,7 @@ unsigned long long ZSTD_findDecompressed size_t skippableSize; if (srcSize < ZSTD_skippableHeaderSize) return ERROR(srcSize_wrong); - skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_frameIdSize) + skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_FRAMEIDSIZE) + ZSTD_skippableHeaderSize; if (srcSize < skippableSize) { return ZSTD_CONTENTSIZE_ERROR; @@ -491,7 +516,7 @@ unsigned long long ZSTD_getDecompressedS * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */ static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize) { - size_t const result = ZSTD_getFrameHeader_internal(&(dctx->fParams), src, headerSize, dctx->format); + size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format); if (ZSTD_isError(result)) return result; /* invalid header */ if (result>0) return ERROR(srcSize_wrong); /* headerSize too small */ if (dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID)) @@ -526,6 +551,7 @@ size_t ZSTD_getcBlockSize(const void* sr static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize) { + if (dst==NULL) return ERROR(dstSize_tooSmall); if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall); memcpy(dst, src, srcSize); return srcSize; @@ -542,6 +568,9 @@ static size_t ZSTD_setRleBlock(void* dst return regenSize; } +/* Hidden declaration for fullbench */ +size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize); /*!
ZSTD_decodeLiteralsBlock() : * @return : nb of bytes read from src (< srcSize ) * note : symbol not declared but exposed for fullbench */ @@ -558,6 +587,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt case set_repeat: if (dctx->litEntropy==0) return ERROR(dictionary_corrupted); /* fall-through */ + case set_compressed: if (srcSize < 5) return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */ { size_t lhSize, litSize, litCSize; @@ -589,15 +619,20 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected); if (litCSize + lhSize > srcSize) return ERROR(corruption_detected); + /* prefetch huffman table if cold */ + if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { + PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable)); + } + if (HUF_isError((litEncType==set_repeat) ? ( singleStream ? HUF_decompress1X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) : HUF_decompress4X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) ) : ( singleStream ? - HUF_decompress1X2_DCtx_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, - dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2) : + HUF_decompress1X1_DCtx_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, + dctx->workspace, sizeof(dctx->workspace), dctx->bmi2) : HUF_decompress4X_hufOnly_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, - dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2)))) + dctx->workspace, sizeof(dctx->workspace), dctx->bmi2)))) return ERROR(corruption_detected); dctx->litPtr = dctx->litBuffer; @@ -869,7 +904,8 @@ static size_t ZSTD_buildSeqTable(ZSTD_se symbolEncodingType_e type, U32 max, U32 maxLog, const void* src, size_t srcSize, const U32* baseValue, const U32* nbAdditionalBits, - const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable) + const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable, + int ddictIsCold, int nbSeq) { switch(type) { @@ -888,6 +924,12 @@ static size_t ZSTD_buildSeqTable(ZSTD_se return 0; case set_repeat: if (!flagRepeatTable) return ERROR(corruption_detected); + /* prefetch FSE table if used */ + if (ddictIsCold && (nbSeq > 24 /* heuristic */)) { + const void* const pStart = *DTablePtr; + size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog)); + PREFETCH_AREA(pStart, pSize); + } return 0; case set_compressed : { U32 tableLog; @@ -933,6 +975,9 @@ static const U32 ML_base[MaxML+1] = { 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; +/* Hidden declaration for fullbench */ +size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize) @@ -940,25 +985,25 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* const BYTE* const istart = (const BYTE* const)src; const BYTE* const iend = istart + srcSize; const BYTE* ip = istart; + int nbSeq; DEBUGLOG(5, "ZSTD_decodeSeqHeaders"); /* check */ if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong); /* SeqHead */ - { int nbSeq = *ip++; - if (!nbSeq) { *nbSeqPtr=0; return 1; } - if (nbSeq > 0x7F) { - if (nbSeq == 0xFF) { - if (ip+2 > iend) return ERROR(srcSize_wrong); - nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2; - } else { - if (ip >= iend) return
ERROR(srcSize_wrong); - nbSeq = ((nbSeq-0x80)<<8) + *ip++; - } + nbSeq = *ip++; + if (!nbSeq) { *nbSeqPtr=0; return 1; } + if (nbSeq > 0x7F) { + if (nbSeq == 0xFF) { + if (ip+2 > iend) return ERROR(srcSize_wrong); + nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2; + } else { + if (ip >= iend) return ERROR(srcSize_wrong); + nbSeq = ((nbSeq-0x80)<<8) + *ip++; } - *nbSeqPtr = nbSeq; } + *nbSeqPtr = nbSeq; /* FSE table descriptors */ if (ip+4 > iend) return ERROR(srcSize_wrong); /* minimum possible size */ @@ -972,7 +1017,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* LLtype, MaxLL, LLFSELog, ip, iend-ip, LL_base, LL_bits, - LL_defaultDTable, dctx->fseEntropy); + LL_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq); if (ZSTD_isError(llhSize)) return ERROR(corruption_detected); ip += llhSize; } @@ -981,7 +1027,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* OFtype, MaxOff, OffFSELog, ip, iend-ip, OF_base, OF_bits, - OF_defaultDTable, dctx->fseEntropy); + OF_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq); if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected); ip += ofhSize; } @@ -990,12 +1037,23 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* MLtype, MaxML, MLFSELog, ip, iend-ip, ML_base, ML_bits, - ML_defaultDTable, dctx->fseEntropy); + ML_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq); if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected); ip += mlhSize; } } + /* prefetch dictionary content */ + if (dctx->ddictIsCold) { + size_t const dictSize = (const char*)dctx->prefixStart - (const char*)dctx->virtualStart; + size_t const psmin = MIN(dictSize, (size_t)(64*nbSeq) /* heuristic */ ); + size_t const pSize = MIN(psmin, 128 KB /* protection */ ); + const void* const pStart = (const char*)dctx->dictEnd - pSize; + PREFETCH_AREA(pStart, pSize); + dctx->ddictIsCold = 0; + } + return ip-istart; } @@ -1075,7 +1133,7 @@ HINT_INLINE size_t ZSTD_execSequence(BYTE* op, BYTE* const oend, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd) + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; @@ -1087,7 +1145,7 @@ size_t ZSTD_execSequence(BYTE* op, /* check */ if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ if (iLitEnd > litLimit) return ERROR(corruption_detected); /* over-read beyond lit buffer */ - if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd); + if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); /* copy Literals */ ZSTD_copy8(op, *litPtr); @@ -1097,11 +1155,11 @@ size_t ZSTD_execSequence(BYTE* op, *litPtr = iLitEnd; /* update for next sequence */ /* copy Match */ - if (sequence.offset > (size_t)(oLitEnd - base)) { + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix -> go into extDict */ - if (sequence.offset > (size_t)(oLitEnd - vBase)) + if (sequence.offset > (size_t)(oLitEnd - virtualStart)) return ERROR(corruption_detected); - match = dictEnd + (match - base); + match = dictEnd + (match - prefixStart); if (match + sequence.matchLength <= dictEnd) { memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; @@ -1111,7 +1169,7 @@ size_t 
ZSTD_execSequence(BYTE* op, memmove(oLitEnd, match, length1); op = oLitEnd + length1; sequence.matchLength -= length1; - match = base; + match = prefixStart; if (op > oend_w || sequence.matchLength < MINMATCH) { U32 i; for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i]; @@ -1354,10 +1412,10 @@ ZSTD_decompressSequences_body( ZSTD_DCtx BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* const litEnd = litPtr + dctx->litSize; - const BYTE* const base = (const BYTE*) (dctx->base); - const BYTE* const vBase = (const BYTE*) (dctx->vBase); + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); - DEBUGLOG(5, "ZSTD_decompressSequences"); + DEBUGLOG(5, "ZSTD_decompressSequences_body"); /* Regen sequences */ if (nbSeq) { @@ -1372,14 +1430,14 @@ ZSTD_decompressSequences_body( ZSTD_DCtx for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) { nbSeq--; { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); if (ZSTD_isError(oneSeqSize)) return oneSeqSize; op += oneSeqSize; } } /* check if reached exact end */ - DEBUGLOG(5, "ZSTD_decompressSequences: after decode loop, remaining nbSeq : %i", nbSeq); + DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); if (nbSeq) return ERROR(corruption_detected); /* save reps for next block */ { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } @@ -1498,8 +1556,8 @@ ZSTD_decompressSequencesLong_body( BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* const litEnd = litPtr + dctx->litSize; - const BYTE* const prefixStart = (const BYTE*) (dctx->base); - const BYTE* const dictStart = (const BYTE*) (dctx->vBase); + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); /* Regen sequences */ @@ -1662,7 +1720,8 @@ static size_t ZSTD_decompressBlock_inter /* isLongOffset must be true if there are long offsets. * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. * We don't expect that to be the case in 64-bit mode. - * In block mode, window size is not known, so we have to be conservative. (note: but it could be evaluated from current-lowLimit) + * In block mode, window size is not known, so we have to be conservative. 
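The base to prefixStart and vBase to virtualStart renames in ZSTD_execSequence() above also make the extDict translation easier to follow: a match whose offset reaches back past the current prefix is redirected into the dictionary, which is addressed as if it sat immediately before the prefix. A self-contained sketch of that translation (hypothetical helper; the real code additionally splits matches that straddle the dictionary/prefix boundary):

    #include <stddef.h>
    typedef unsigned char BYTE;

    static const BYTE* resolve_match(const BYTE* oLitEnd, size_t offset,
                                     const BYTE* prefixStart,
                                     const BYTE* virtualStart,
                                     const BYTE* dictEnd)
    {
        const BYTE* match = oLitEnd - offset;
        if (offset > (size_t)(oLitEnd - prefixStart)) {      /* into extDict */
            if (offset > (size_t)(oLitEnd - virtualStart))
                return NULL;                                 /* corruption   */
            match = dictEnd - (size_t)(prefixStart - match); /* same distance,
                                                              * dictionary copy */
        }
        return match;
    }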
+ * (note: but it could be evaluated from current-lowLimit) */ ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))); DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); @@ -1701,8 +1760,8 @@ static void ZSTD_checkContinuity(ZSTD_DC { if (dst != dctx->previousDstEnd) { /* not contiguous */ dctx->dictEnd = dctx->previousDstEnd; - dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base)); - dctx->base = dst; + dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); + dctx->prefixStart = dst; dctx->previousDstEnd = dst; } } @@ -1729,10 +1788,10 @@ ZSTDLIB_API size_t ZSTD_insertBlock(ZSTD } -static size_t ZSTD_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length) +static size_t ZSTD_generateNxBytes(void* dst, size_t dstCapacity, BYTE value, size_t length) { if (length > dstCapacity) return ERROR(dstSize_tooSmall); - memset(dst, byte, length); + memset(dst, value, length); return length; } @@ -1749,7 +1808,7 @@ size_t ZSTD_findFrameCompressedSize(cons #endif if ( (srcSize >= ZSTD_skippableHeaderSize) && (MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START ) { - return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize); + return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_FRAMEIDSIZE); } else { const BYTE* ip = (const BYTE*)src; const BYTE* const ipstart = ip; @@ -1783,7 +1842,6 @@ size_t ZSTD_findFrameCompressedSize(cons if (zfh.checksumFlag) { /* Final frame content checksum */ if (remainingSize < 4) return ERROR(srcSize_wrong); ip += 4; - remainingSize -= 4; } return ip - ipstart; @@ -1871,9 +1929,6 @@ static size_t ZSTD_decompressFrame(ZSTD_ return op-ostart; } -static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict); -static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict); - static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, @@ -1881,6 +1936,9 @@ static size_t ZSTD_decompressMultiFrame( const ZSTD_DDict* ddict) { void* const dststart = dst; + int moreThan1Frame = 0; + + DEBUGLOG(5, "ZSTD_decompressMultiFrame"); assert(dict==NULL || ddict==NULL); /* either dict or ddict set, not both */ if (ddict) { @@ -1889,7 +1947,6 @@ static size_t ZSTD_decompressMultiFrame( } while (srcSize >= ZSTD_frameHeaderSize_prefix) { - U32 magicNumber; #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) if (ZSTD_isLegacy(src, srcSize)) { @@ -1911,24 +1968,21 @@ static size_t ZSTD_decompressMultiFrame( } #endif - magicNumber = MEM_readLE32(src); - DEBUGLOG(4, "reading magic number %08X (expecting %08X)", - (U32)magicNumber, (U32)ZSTD_MAGICNUMBER); - if (magicNumber != ZSTD_MAGICNUMBER) { + { U32 const magicNumber = MEM_readLE32(src); + DEBUGLOG(4, "reading magic number %08X (expecting %08X)", + (U32)magicNumber, (U32)ZSTD_MAGICNUMBER); if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { size_t skippableSize; if (srcSize < ZSTD_skippableHeaderSize) return ERROR(srcSize_wrong); - skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize) + skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_FRAMEIDSIZE) + ZSTD_skippableHeaderSize; if (srcSize < skippableSize) return ERROR(srcSize_wrong); src = (const BYTE *)src + skippableSize; srcSize -= skippableSize; continue; - } - return ERROR(prefix_unknown); - } + } } if (ddict) { 
/* we were called from ZSTD_decompress_usingDDict */ @@ -1942,11 +1996,25 @@ static size_t ZSTD_decompressMultiFrame( { const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, &src, &srcSize); + if ( (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown) + && (moreThan1Frame==1) ) { + /* at least one frame successfully completed, + * but following bytes are garbage : + * it's more likely to be a srcSize error, + * specifying more bytes than compressed size of frame(s). + * This error message replaces ERROR(prefix_unknown), + * which would be confusing, as the first header is actually correct. + * Note that one could be unlucky, it might be a corruption error instead, + * happening right at the place where we expect zstd magic bytes. + * But this is _much_ less likely than a srcSize field error. */ + return ERROR(srcSize_wrong); + } if (ZSTD_isError(res)) return res; /* no need to bound check, ZSTD_decompressFrame already has */ dst = (BYTE*)dst + res; dstCapacity -= res; } + moreThan1Frame = 1; } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */ if (srcSize) return ERROR(srcSize_wrong); /* input not entirely consumed */ @@ -1980,6 +2048,7 @@ size_t ZSTD_decompress(void* dst, size_t return regenSize; #else /* stack mode */ ZSTD_DCtx dctx; + ZSTD_initDCtx_internal(&dctx); return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize); #endif } @@ -2031,7 +2100,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx case ZSTDds_getFrameHeaderSize : assert(src != NULL); if (dctx->format == ZSTD_f_zstd1) { /* allows header */ - assert(srcSize >= ZSTD_frameIdSize); /* to read skippable magic number */ + assert(srcSize >= ZSTD_FRAMEIDSIZE); /* to read skippable magic number */ if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ memcpy(dctx->headerBuffer, src, srcSize); dctx->expected = ZSTD_skippableHeaderSize - srcSize; /* remaining to load to get full skippable frame header */ @@ -2141,7 +2210,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx assert(src != NULL); assert(srcSize <= ZSTD_skippableHeaderSize); memcpy(dctx->headerBuffer + (ZSTD_skippableHeaderSize - srcSize), src, srcSize); /* complete skippable header */ - dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_frameIdSize); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ dctx->stage = ZSTDds_skipFrame; return 0; @@ -2159,27 +2228,33 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) { dctx->dictEnd = dctx->previousDstEnd; - dctx->vBase = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base)); - dctx->base = dict; + dctx->virtualStart = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); + dctx->prefixStart = dict; dctx->previousDstEnd = (const char*)dict + dictSize; return 0; } -/* ZSTD_loadEntropy() : - * dict : must point at beginning of a valid zstd dictionary +/*! ZSTD_loadEntropy() : + * dict : must point at beginning of a valid zstd dictionary. 
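The moreThan1Frame bookkeeping above is easiest to see with a concrete input: one or more valid frames followed by trailing junk, with srcSize covering all of it. The observable effect, as a sketch (ZSTD_getErrorCode() is the zstd_errors.h helper):

    /* buf = <valid frame(s)> || <garbage> */
    size_t const r = ZSTD_decompress(dst, dstCapacity, buf, bufSize);
    /* before: ZSTD_getErrorCode(r) == ZSTD_error_prefix_unknown (misleading,
     *         since the first frame header was valid)
     * after : ZSTD_getErrorCode(r) == ZSTD_error_srcSize_wrong  (the likelier
     *         cause: srcSize overstates the compressed data) */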
* @return : size of entropy tables read */ -static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const dict, size_t const dictSize) +static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, + const void* const dict, size_t const dictSize) { const BYTE* dictPtr = (const BYTE*)dict; const BYTE* const dictEnd = dictPtr + dictSize; if (dictSize <= 8) return ERROR(dictionary_corrupted); + assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY); /* dict must be valid */ dictPtr += 8; /* skip header = magic + dictID */ - - { size_t const hSize = HUF_readDTableX4_wksp( - entropy->hufTable, dictPtr, dictEnd - dictPtr, - entropy->workspace, sizeof(entropy->workspace)); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable)); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable)); + ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE); + { void* const workspace = &entropy->LLTable; /* use fse tables as temporary workspace; implies fse tables are grouped together */ + size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable); + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, + workspace, workspaceSize); if (HUF_isError(hSize)) return ERROR(dictionary_corrupted); dictPtr += hSize; } @@ -2190,7 +2265,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entr if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted); if (offcodeMaxValue > MaxOff) return ERROR(dictionary_corrupted); if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted); - ZSTD_buildFSETable(entropy->OFTable, + ZSTD_buildFSETable( entropy->OFTable, offcodeNCount, offcodeMaxValue, OF_base, OF_bits, offcodeLog); @@ -2203,7 +2278,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entr if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted); if (matchlengthMaxValue > MaxML) return ERROR(dictionary_corrupted); if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted); - ZSTD_buildFSETable(entropy->MLTable, + ZSTD_buildFSETable( entropy->MLTable, matchlengthNCount, matchlengthMaxValue, ML_base, ML_bits, matchlengthLog); @@ -2216,7 +2291,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entr if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted); if (litlengthMaxValue > MaxLL) return ERROR(dictionary_corrupted); if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted); - ZSTD_buildFSETable(entropy->LLTable, + ZSTD_buildFSETable( entropy->LLTable, litlengthNCount, litlengthMaxValue, LL_base, LL_bits, litlengthLog); @@ -2242,7 +2317,7 @@ static size_t ZSTD_decompress_insertDict if (magic != ZSTD_MAGIC_DICTIONARY) { return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */ } } - dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_frameIdSize); + dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE); /* load entropy tables */ { size_t const eSize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize); @@ -2256,7 +2331,6 @@ static size_t ZSTD_decompress_insertDict return ZSTD_refDictContent(dctx, dict, dictSize); } -/* Note : this function cannot fail */ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) { assert(dctx != NULL); @@ -2264,8 +2338,8 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* d dctx->stage = ZSTDds_getFrameHeaderSize; 
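The two ZSTD_STATIC_ASSERTs in ZSTD_loadEntropy() above pin down a layout contract: LLTable, OFTable and MLTable must be contiguous so that, before they are filled, the trio can be lent to HUF_readDTableX2_wksp() as scratch space (which is what allowed the dedicated workspace member to move out of ZSTD_entropyDTables_t). The same trick in miniature, on a hypothetical struct:

    #include <assert.h>
    #include <stddef.h>

    struct tables { int ll[8]; int of[8]; int ml[8]; };

    /* Contiguity makes &t.ll through the end of ml one reusable scratch block. */
    static_assert(offsetof(struct tables, of)
                  == offsetof(struct tables, ll) + sizeof(((struct tables*)0)->ll),
                  "ll and of adjacent");
    static_assert(offsetof(struct tables, ml)
                  == offsetof(struct tables, of) + sizeof(((struct tables*)0)->of),
                  "of and ml adjacent");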
dctx->decodedSize = 0; dctx->previousDstEnd = NULL; - dctx->base = NULL; - dctx->vBase = NULL; + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; dctx->dictEnd = NULL; dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ dctx->litEntropy = dctx->fseEntropy = 0; @@ -2302,42 +2376,53 @@ struct ZSTD_DDict_s { static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict) { + assert(ddict != NULL); return ddict->dictContent; } static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict) { + assert(ddict != NULL); return ddict->dictSize; } -size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dstDCtx, const ZSTD_DDict* ddict) +size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) { - CHECK_F( ZSTD_decompressBegin(dstDCtx) ); - if (ddict) { /* support begin on NULL */ - dstDCtx->dictID = ddict->dictID; - dstDCtx->base = ddict->dictContent; - dstDCtx->vBase = ddict->dictContent; - dstDCtx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize; - dstDCtx->previousDstEnd = dstDCtx->dictEnd; + DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict"); + assert(dctx != NULL); + if (ddict) { + dctx->ddictIsCold = (dctx->dictEnd != (const char*)ddict->dictContent + ddict->dictSize); + DEBUGLOG(4, "DDict is %s", + dctx->ddictIsCold ? "~cold~" : "hot!"); + } + CHECK_F( ZSTD_decompressBegin(dctx) ); + if (ddict) { /* NULL ddict is equivalent to no dictionary */ + dctx->dictID = ddict->dictID; + dctx->prefixStart = ddict->dictContent; + dctx->virtualStart = ddict->dictContent; + dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize; + dctx->previousDstEnd = dctx->dictEnd; if (ddict->entropyPresent) { - dstDCtx->litEntropy = 1; - dstDCtx->fseEntropy = 1; - dstDCtx->LLTptr = ddict->entropy.LLTable; - dstDCtx->MLTptr = ddict->entropy.MLTable; - dstDCtx->OFTptr = ddict->entropy.OFTable; - dstDCtx->HUFptr = ddict->entropy.hufTable; - dstDCtx->entropy.rep[0] = ddict->entropy.rep[0]; - dstDCtx->entropy.rep[1] = ddict->entropy.rep[1]; - dstDCtx->entropy.rep[2] = ddict->entropy.rep[2]; + dctx->litEntropy = 1; + dctx->fseEntropy = 1; + dctx->LLTptr = ddict->entropy.LLTable; + dctx->MLTptr = ddict->entropy.MLTable; + dctx->OFTptr = ddict->entropy.OFTable; + dctx->HUFptr = ddict->entropy.hufTable; + dctx->entropy.rep[0] = ddict->entropy.rep[0]; + dctx->entropy.rep[1] = ddict->entropy.rep[1]; + dctx->entropy.rep[2] = ddict->entropy.rep[2]; } else { - dstDCtx->litEntropy = 0; - dstDCtx->fseEntropy = 0; + dctx->litEntropy = 0; + dctx->fseEntropy = 0; } } return 0; } -static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e dictContentType) +static size_t +ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, + ZSTD_dictContentType_e dictContentType) { ddict->dictID = 0; ddict->entropyPresent = 0; @@ -2355,10 +2440,12 @@ static size_t ZSTD_loadEntropy_inDDict(Z return 0; /* pure content mode */ } } - ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_frameIdSize); + ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE); /* load entropy tables */ - CHECK_E( ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted ); + CHECK_E( ZSTD_loadEntropy(&ddict->entropy, + ddict->dictContent, ddict->dictSize), + dictionary_corrupted ); ddict->entropyPresent = 1; return 0; } @@ -2372,6 +2459,7 @@ static size_t ZSTD_initDDict_internal(ZS if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) { ddict->dictBuffer = NULL; ddict->dictContent = dict; + 
if (!dict) dictSize = 0; } else { void* const internalBuffer = ZSTD_malloc(dictSize, ddict->cMem); ddict->dictBuffer = internalBuffer; @@ -2396,14 +2484,15 @@ ZSTD_DDict* ZSTD_createDDict_advanced(co if (!customMem.customAlloc ^ !customMem.customFree) return NULL; { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem); - if (!ddict) return NULL; + if (ddict == NULL) return NULL; ddict->cMem = customMem; - - if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, dictLoadMethod, dictContentType) )) { - ZSTD_freeDDict(ddict); - return NULL; - } - + { size_t const initResult = ZSTD_initDDict_internal(ddict, + dict, dictSize, + dictLoadMethod, dictContentType); + if (ZSTD_isError(initResult)) { + ZSTD_freeDDict(ddict); + return NULL; + } } return ddict; } } @@ -2430,23 +2519,25 @@ ZSTD_DDict* ZSTD_createDDict_byReference const ZSTD_DDict* ZSTD_initStaticDDict( - void* workspace, size_t workspaceSize, + void* sBuffer, size_t sBufferSize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) { - size_t const neededSpace = - sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); - ZSTD_DDict* const ddict = (ZSTD_DDict*)workspace; - assert(workspace != NULL); + size_t const neededSpace = sizeof(ZSTD_DDict) + + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); + ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer; + assert(sBuffer != NULL); assert(dict != NULL); - if ((size_t)workspace & 7) return NULL; /* 8-aligned */ - if (workspaceSize < neededSpace) return NULL; + if ((size_t)sBuffer & 7) return NULL; /* 8-aligned */ + if (sBufferSize < neededSpace) return NULL; if (dictLoadMethod == ZSTD_dlm_byCopy) { memcpy(ddict+1, dict, dictSize); /* local copy */ dict = ddict+1; } - if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, ZSTD_dlm_byRef, dictContentType) )) + if (ZSTD_isError( ZSTD_initDDict_internal(ddict, + dict, dictSize, + ZSTD_dlm_byRef, dictContentType) )) return NULL; return ddict; } @@ -2484,7 +2575,7 @@ unsigned ZSTD_getDictID_fromDict(const v { if (dictSize < 8) return 0; if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0; - return MEM_readLE32((const char*)dict + ZSTD_frameIdSize); + return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE); } /*! 
ZSTD_getDictID_fromDDict() : @@ -2560,12 +2651,15 @@ size_t ZSTD_freeDStream(ZSTD_DStream* zd } -/* *** Initialization *** */ +/* *** Initialization *** */ size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; } size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; } -size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) +size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) { if (dctx->streamStage != zdss_init) return ERROR(stage_wrong); ZSTD_freeDDict(dctx->ddictLocal); @@ -2607,6 +2701,7 @@ size_t ZSTD_initDStream_usingDict(ZSTD_D { DEBUGLOG(4, "ZSTD_initDStream_usingDict"); zds->streamStage = zdss_init; + zds->noForwardProgress = 0; CHECK_F( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) ); return ZSTD_frameHeaderSize_prefix; } @@ -2618,13 +2713,6 @@ size_t ZSTD_initDStream(ZSTD_DStream* zd return ZSTD_initDStream_usingDict(zds, NULL, 0); } -size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) -{ - if (dctx->streamStage != zdss_init) return ERROR(stage_wrong); - dctx->ddict = ddict; - return 0; -} - /* ZSTD_initDStream_usingDDict() : * ddict will just be referenced, and must outlive decompression session * this function cannot fail */ @@ -2663,6 +2751,13 @@ size_t ZSTD_setDStreamParameter(ZSTD_DSt return 0; } +size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + if (dctx->streamStage != zdss_init) return ERROR(stage_wrong); + dctx->ddict = ddict; + return 0; +} + size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize) { if (dctx->streamStage != zdss_init) return ERROR(stage_wrong); @@ -2767,7 +2862,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea return hint; } } #endif - { size_t const hSize = ZSTD_getFrameHeader_internal(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format); + { size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format); DEBUGLOG(5, "header size : %u", (U32)hSize); if (ZSTD_isError(hSize)) { #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) @@ -2828,7 +2923,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea CHECK_F(ZSTD_decompressBegin_usingDDict(zds, zds->ddict)); if ((MEM_readLE32(zds->headerBuffer) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ - zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_frameIdSize); + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); zds->stage = ZSTDds_skipFrame; } else { CHECK_F(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize)); @@ -2947,8 +3042,18 @@ size_t ZSTD_decompressStream(ZSTD_DStrea } } /* result */ - input->pos += (size_t)(ip-istart); - output->pos += (size_t)(op-ostart); + input->pos = (size_t)(ip - (const char*)(input->src)); + output->pos = (size_t)(op - (char*)(output->dst)); + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { + if (op==oend) return ERROR(dstSize_tooSmall); + if (ip==iend) return ERROR(srcSize_wrong); + assert(0); + } + } else { + zds->noForwardProgress = 0; + } { size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds); if (!nextSrcSizeHint) { /* frame fully decoded */ if (zds->outEnd == zds->outStart) { /* output fully flushed */ diff --git 
a/contrib/python-zstandard/zstd/dictBuilder/cover.c b/contrib/python-zstandard/zstd/dictBuilder/cover.c --- a/contrib/python-zstandard/zstd/dictBuilder/cover.c +++ b/contrib/python-zstandard/zstd/dictBuilder/cover.c @@ -29,6 +29,7 @@ #include "mem.h" /* read */ #include "pool.h" #include "threading.h" +#include "cover.h" #include "zstd_internal.h" /* includes zstd.h */ #ifndef ZDICT_STATIC_LINKING_ONLY #define ZDICT_STATIC_LINKING_ONLY @@ -39,6 +40,7 @@ * Constants ***************************************/ #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define DEFAULT_SPLITPOINT 1.0 /*-************************************* * Console display @@ -184,7 +186,7 @@ static void COVER_map_remove(COVER_map_t } /** - * Destroyes a map that is inited with COVER_map_init(). + * Destroys a map that is inited with COVER_map_init(). */ static void COVER_map_destroy(COVER_map_t *map) { if (map->data) { @@ -203,6 +205,8 @@ typedef struct { size_t *offsets; const size_t *samplesSizes; size_t nbSamples; + size_t nbTrainSamples; + size_t nbTestSamples; U32 *suffix; size_t suffixSize; U32 *freqs; @@ -220,9 +224,9 @@ static COVER_ctx_t *g_ctx = NULL; /** * Returns the sum of the sample sizes. */ -static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) { +size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) { size_t sum = 0; - size_t i; + unsigned i; for (i = 0; i < nbSamples; ++i) { sum += samplesSizes[i]; } @@ -377,14 +381,6 @@ static void COVER_group(COVER_ctx_t *ctx ctx->suffix[dmerId] = freq; } -/** - * A segment is a range in the source as well as the score of the segment. - */ -typedef struct { - U32 begin; - U32 end; - U32 score; -} COVER_segment_t; /** * Selects the best segment in an epoch. @@ -494,6 +490,10 @@ static int COVER_checkParameters(ZDICT_c if (parameters.d > parameters.k) { return 0; } + /* 0 < splitPoint <= 1 */ + if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){ + return 0; + } return 1; } @@ -531,9 +531,14 @@ static void COVER_ctx_destroy(COVER_ctx_ */ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, - unsigned d) { + unsigned d, double splitPoint) { const BYTE *const samples = (const BYTE *)samplesBuffer; const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples); + /* Split samples into testing and training sets */ + const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples; + const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples; + const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize; + const size_t testSamplesSize = splitPoint < 1.0 ? 
COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize; /* Checks */ if (totalSamplesSize < MAX(d, sizeof(U64)) || totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { @@ -541,15 +546,29 @@ static int COVER_ctx_init(COVER_ctx_t *c (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20)); return 0; } + /* Check if there are at least 5 training samples */ + if (nbTrainSamples < 5) { + DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples); + return 0; + } + /* Check if there's a testing sample */ + if (nbTestSamples < 1) { + DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples); + return 0; + } /* Zero the context */ memset(ctx, 0, sizeof(*ctx)); - DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples, - (U32)totalSamplesSize); + DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples, + (U32)trainingSamplesSize); + DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples, + (U32)testSamplesSize); ctx->samples = samples; ctx->samplesSizes = samplesSizes; ctx->nbSamples = nbSamples; + ctx->nbTrainSamples = nbTrainSamples; + ctx->nbTestSamples = nbTestSamples; /* Partial suffix array */ - ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1; + ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1; ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); /* Maps index to the dmerID */ ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); @@ -563,7 +582,7 @@ static int COVER_ctx_init(COVER_ctx_t *c ctx->freqs = NULL; ctx->d = d; - /* Fill offsets from the samlesSizes */ + /* Fill offsets from the samplesSizes */ { U32 i; ctx->offsets[0] = 0; @@ -581,10 +600,17 @@ static int COVER_ctx_init(COVER_ctx_t *c for (i = 0; i < ctx->suffixSize; ++i) { ctx->suffix[i] = i; } - /* qsort doesn't take an opaque pointer, so pass as a global */ + /* qsort doesn't take an opaque pointer, so pass as a global. + * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is. + */ g_ctx = ctx; +#if defined(__OpenBSD__) + mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32), + (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp)); +#else qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp)); +#endif } DISPLAYLEVEL(2, "Computing frequencies\n"); /* For each dmer group (group of positions with the same first d bytes): @@ -613,7 +639,7 @@ static size_t COVER_buildDictionary(cons /* Divide the data up into epochs of equal size. * We will select at least one segment from each epoch.
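The splitPoint plumbing above is plain proportional partitioning: for a splitPoint s in (0, 1], the first floor(nbSamples * s) samples build the dictionary and the rest score it, while s == 1.0 (the new DEFAULT_SPLITPOINT) reproduces the old train-and-score-on-everything behavior. Worked numbers as a sketch:

    /* Illustrative values: splitPoint = 0.75 over 100 samples */
    unsigned const nbSamples      = 100;
    double   const splitPoint     = 0.75;
    unsigned const nbTrainSamples = (unsigned)((double)nbSamples * splitPoint); /* 75 */
    unsigned const nbTestSamples  = nbSamples - nbTrainSamples;                 /* 25 */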
*/ - const U32 epochs = (U32)(dictBufferCapacity / parameters.k); + const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4)); const U32 epochSize = (U32)(ctx->suffixSize / epochs); size_t epoch; DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs, @@ -658,7 +684,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffe BYTE* const dict = (BYTE*)dictBuffer; COVER_ctx_t ctx; COVER_map_t activeDmers; - + parameters.splitPoint = 1.0; /* Initialize global data */ g_displayLevel = parameters.zParams.notificationLevel; /* Checks */ @@ -677,7 +703,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffe } /* Initialize context and activeDmers */ if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, - parameters.d)) { + parameters.d, parameters.splitPoint)) { return ERROR(GENERIC); } if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { @@ -704,28 +730,65 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffe } } -/** - * COVER_best_t is used for two purposes: - * 1. Synchronizing threads. - * 2. Saving the best parameters and dictionary. - * - * All of the methods except COVER_best_init() are thread safe if zstd is - * compiled with multithreaded support. - */ -typedef struct COVER_best_s { - ZSTD_pthread_mutex_t mutex; - ZSTD_pthread_cond_t cond; - size_t liveJobs; - void *dict; - size_t dictSize; - ZDICT_cover_params_t parameters; - size_t compressedSize; -} COVER_best_t; + + +size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, + const size_t *samplesSizes, const BYTE *samples, + size_t *offsets, + size_t nbTrainSamples, size_t nbSamples, + BYTE *const dict, size_t dictBufferCapacity) { + size_t totalCompressedSize = ERROR(GENERIC); + /* Pointers */ + ZSTD_CCtx *cctx; + ZSTD_CDict *cdict; + void *dst; + /* Local variables */ + size_t dstCapacity; + size_t i; + /* Allocate dst with enough space to compress the maximum sized sample */ + { + size_t maxSampleSize = 0; + i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0; + for (; i < nbSamples; ++i) { + maxSampleSize = MAX(samplesSizes[i], maxSampleSize); + } + dstCapacity = ZSTD_compressBound(maxSampleSize); + dst = malloc(dstCapacity); + } + /* Create the cctx and cdict */ + cctx = ZSTD_createCCtx(); + cdict = ZSTD_createCDict(dict, dictBufferCapacity, + parameters.zParams.compressionLevel); + if (!dst || !cctx || !cdict) { + goto _compressCleanup; + } + /* Compress each sample and sum their sizes (or error) */ + totalCompressedSize = dictBufferCapacity; + i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0; + for (; i < nbSamples; ++i) { + const size_t size = ZSTD_compress_usingCDict( + cctx, dst, dstCapacity, samples + offsets[i], + samplesSizes[i], cdict); + if (ZSTD_isError(size)) { + totalCompressedSize = ERROR(GENERIC); + goto _compressCleanup; + } + totalCompressedSize += size; + } +_compressCleanup: + ZSTD_freeCCtx(cctx); + ZSTD_freeCDict(cdict); + if (dst) { + free(dst); + } + return totalCompressedSize; +} + /** * Initialize the `COVER_best_t`. */ -static void COVER_best_init(COVER_best_t *best) { +void COVER_best_init(COVER_best_t *best) { if (best==NULL) return; /* compatible with init on NULL */ (void)ZSTD_pthread_mutex_init(&best->mutex, NULL); (void)ZSTD_pthread_cond_init(&best->cond, NULL); @@ -739,7 +802,7 @@ static void COVER_best_init(COVER_best_t /** * Wait until liveJobs == 0. 
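
The first hunk above also quarters the epoch count and clamps it to at least one. A small sketch with concrete numbers, assuming a hypothetical 110 KB dictionary target:

    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    /* Illustrative only: old versus new epoch counts in COVER_buildDictionary().
     * The MAX(1, ...) guard keeps the later ctx->suffixSize / epochs division
     * well defined when k is large relative to the buffer. */
    int main(void) {
        size_t dictBufferCapacity = 112640;   /* hypothetical 110 KB target */
        unsigned k = 1024;
        unsigned oldEpochs = (unsigned)(dictBufferCapacity / k);              /* 110 */
        unsigned newEpochs = MAX(1, (unsigned)(dictBufferCapacity / k / 4));  /* 27  */
        printf("old=%u new=%u\n", oldEpochs, newEpochs);
        return 0;
    }

Fewer, larger epochs mean each selected segment is chosen from a larger pool of candidates.
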
*/ -static void COVER_best_wait(COVER_best_t *best) { +void COVER_best_wait(COVER_best_t *best) { if (!best) { return; } @@ -753,7 +816,7 @@ static void COVER_best_wait(COVER_best_t /** * Call COVER_best_wait() and then destroy the COVER_best_t. */ -static void COVER_best_destroy(COVER_best_t *best) { +void COVER_best_destroy(COVER_best_t *best) { if (!best) { return; } @@ -769,7 +832,7 @@ static void COVER_best_destroy(COVER_bes * Called when a thread is about to be launched. * Increments liveJobs. */ -static void COVER_best_start(COVER_best_t *best) { +void COVER_best_start(COVER_best_t *best) { if (!best) { return; } @@ -783,7 +846,7 @@ static void COVER_best_start(COVER_best_ * Decrements liveJobs and signals any waiting threads if liveJobs == 0. * If this dictionary is the best so far save it and its parameters. */ -static void COVER_best_finish(COVER_best_t *best, size_t compressedSize, +void COVER_best_finish(COVER_best_t *best, size_t compressedSize, ZDICT_cover_params_t parameters, void *dict, size_t dictSize) { if (!best) { @@ -814,10 +877,10 @@ static void COVER_best_finish(COVER_best best->parameters = parameters; best->compressedSize = compressedSize; } - ZSTD_pthread_mutex_unlock(&best->mutex); if (liveJobs == 0) { ZSTD_pthread_cond_broadcast(&best->cond); } + ZSTD_pthread_mutex_unlock(&best->mutex); } } @@ -832,7 +895,7 @@ typedef struct COVER_tryParameters_data_ } COVER_tryParameters_data_t; /** - * Tries a set of parameters and upates the COVER_best_t with the results. + * Tries a set of parameters and updates the COVER_best_t with the results. * This function is thread safe if zstd is compiled with multithreaded support. * It takes its parameters as an *OWNING* opaque pointer to support threading. */ @@ -863,7 +926,7 @@ static void COVER_tryParameters(void *op dictBufferCapacity, parameters); dictBufferCapacity = ZDICT_finalizeDictionary( dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, - ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, + ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, parameters.zParams); if (ZDICT_isError(dictBufferCapacity)) { DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); @@ -871,49 +934,10 @@ static void COVER_tryParameters(void *op } } /* Check total compressed size */ - { - /* Pointers */ - ZSTD_CCtx *cctx; - ZSTD_CDict *cdict; - void *dst; - /* Local variables */ - size_t dstCapacity; - size_t i; - /* Allocate dst with enough space to compress the maximum sized sample */ - { - size_t maxSampleSize = 0; - for (i = 0; i < ctx->nbSamples; ++i) { - maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize); - } - dstCapacity = ZSTD_compressBound(maxSampleSize); - dst = malloc(dstCapacity); - } - /* Create the cctx and cdict */ - cctx = ZSTD_createCCtx(); - cdict = ZSTD_createCDict(dict, dictBufferCapacity, - parameters.zParams.compressionLevel); - if (!dst || !cctx || !cdict) { - goto _compressCleanup; - } - /* Compress each sample and sum their sizes (or error) */ - totalCompressedSize = dictBufferCapacity; - for (i = 0; i < ctx->nbSamples; ++i) { - const size_t size = ZSTD_compress_usingCDict( - cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i], - ctx->samplesSizes[i], cdict); - if (ZSTD_isError(size)) { - totalCompressedSize = ERROR(GENERIC); - goto _compressCleanup; - } - totalCompressedSize += size; - } - _compressCleanup: - ZSTD_freeCCtx(cctx); - ZSTD_freeCDict(cdict); - if (dst) { - free(dst); - } - } + totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes, + 
ctx->samples, ctx->offsets, + ctx->nbTrainSamples, ctx->nbSamples, + dict, dictBufferCapacity); _cleanup: COVER_best_finish(data->best, totalCompressedSize, parameters, dict, @@ -934,6 +958,8 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainF ZDICT_cover_params_t *parameters) { /* constants */ const unsigned nbThreads = parameters->nbThreads; + const double splitPoint = + parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; @@ -951,6 +977,10 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainF POOL_ctx *pool = NULL; /* Checks */ + if (splitPoint <= 0 || splitPoint > 1) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); + return ERROR(GENERIC); + } if (kMinK < kMaxD || kMaxK < kMinK) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); return ERROR(GENERIC); @@ -981,7 +1011,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainF /* Initialize the context for this value of d */ COVER_ctx_t ctx; LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d); - if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) { + if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) { LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); COVER_best_destroy(&best); POOL_free(pool); @@ -1006,6 +1036,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainF data->parameters = *parameters; data->parameters.k = k; data->parameters.d = d; + data->parameters.splitPoint = splitPoint; data->parameters.steps = kSteps; data->parameters.zParams.notificationLevel = g_displayLevel; /* Check the parameters */ diff --git a/contrib/python-zstandard/zstd/dictBuilder/cover.h b/contrib/python-zstandard/zstd/dictBuilder/cover.h new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/zstd/dictBuilder/cover.h @@ -0,0 +1,83 @@ +#include <stdio.h> /* fprintf */ +#include <stdlib.h> /* malloc, free, qsort */ +#include <string.h> /* memset */ +#include <time.h> /* clock */ +#include "mem.h" /* read */ +#include "pool.h" +#include "threading.h" +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "zdict.h" + +/** + * COVER_best_t is used for two purposes: + * 1. Synchronizing threads. + * 2. Saving the best parameters and dictionary. + * + * All of the methods except COVER_best_init() are thread safe if zstd is + * compiled with multithreaded support. + */ +typedef struct COVER_best_s { + ZSTD_pthread_mutex_t mutex; + ZSTD_pthread_cond_t cond; + size_t liveJobs; + void *dict; + size_t dictSize; + ZDICT_cover_params_t parameters; + size_t compressedSize; +} COVER_best_t; + +/** + * A segment is a range in the source as well as the score of the segment. + */ +typedef struct { + U32 begin; + U32 end; + U32 score; +} COVER_segment_t; + +/** + * Checks total compressed size of a dictionary + */ +size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, + const size_t *samplesSizes, const BYTE *samples, + size_t *offsets, + size_t nbTrainSamples, size_t nbSamples, + BYTE *const dict, size_t dictBufferCapacity); + +/** + * Returns the sum of the sample sizes. + */ +size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; + +/** + * Initialize the `COVER_best_t`. + */ +void COVER_best_init(COVER_best_t *best); + +/** + * Wait until liveJobs == 0.
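
cover.h now exposes the previously static COVER_best_* helpers so that the new fastcover.c can share them. Their lifecycle is a counted-jobs latch: start() increments liveJobs, finish() decrements it and records the best result, wait() blocks until the count drains. A standalone approximation using plain POSIX threads (best_t and the best_* names are simplified stand-ins, not the zstd types):

    /* Illustrative only: compile with -pthread. */
    #include <pthread.h>
    #include <stdio.h>

    typedef struct {
        pthread_mutex_t mutex;
        pthread_cond_t cond;
        size_t liveJobs;
        size_t bestScore;   /* stands in for compressedSize/dict/parameters */
    } best_t;

    static void best_start(best_t *b) {
        pthread_mutex_lock(&b->mutex);
        b->liveJobs++;
        pthread_mutex_unlock(&b->mutex);
    }

    static void best_finish(best_t *b, size_t score) {
        pthread_mutex_lock(&b->mutex);
        b->liveJobs--;
        if (score < b->bestScore) b->bestScore = score;   /* smaller is better */
        /* Broadcast while still holding the mutex, matching the reordering in
         * the COVER_best_finish() hunk above. */
        if (b->liveJobs == 0) pthread_cond_broadcast(&b->cond);
        pthread_mutex_unlock(&b->mutex);
    }

    static void best_wait(best_t *b) {
        pthread_mutex_lock(&b->mutex);
        while (b->liveJobs != 0) pthread_cond_wait(&b->cond, &b->mutex);
        pthread_mutex_unlock(&b->mutex);
    }

    static void *worker(void *arg) {
        best_finish((best_t *)arg, 42);   /* pretend this job compressed to 42 bytes */
        return NULL;
    }

    int main(void) {
        best_t b = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, (size_t)-1 };
        pthread_t t;
        best_start(&b);
        pthread_create(&t, NULL, worker, &b);
        best_wait(&b);
        pthread_join(&t, NULL);
        printf("best=%zu\n", b.bestScore);
        return 0;
    }
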
+ */ +void COVER_best_wait(COVER_best_t *best); + +/** + * Call COVER_best_wait() and then destroy the COVER_best_t. + */ +void COVER_best_destroy(COVER_best_t *best); + +/** + * Called when a thread is about to be launched. + * Increments liveJobs. + */ +void COVER_best_start(COVER_best_t *best); + +/** + * Called when a thread finishes executing, both on error or success. + * Decrements liveJobs and signals any waiting threads if liveJobs == 0. + * If this dictionary is the best so far save it and its parameters. + */ +void COVER_best_finish(COVER_best_t *best, size_t compressedSize, + ZDICT_cover_params_t parameters, void *dict, + size_t dictSize); diff --git a/contrib/python-zstandard/zstd/dictBuilder/divsufsort.c b/contrib/python-zstandard/zstd/dictBuilder/divsufsort.c --- a/contrib/python-zstandard/zstd/dictBuilder/divsufsort.c +++ b/contrib/python-zstandard/zstd/dictBuilder/divsufsort.c @@ -1637,7 +1637,7 @@ construct_SA(const unsigned char *T, int if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } k = SA + BUCKET_B(c2 = c0, c1); } - assert(k < j); + assert(k < j); assert(k != NULL); *k-- = s; } else { assert(((s == 0) && (T[s] == c1)) || (s < 0)); @@ -1701,7 +1701,7 @@ construct_BWT(const unsigned char *T, in if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } k = SA + BUCKET_B(c2 = c0, c1); } - assert(k < j); + assert(k < j); assert(k != NULL); *k-- = s; } else if(s != 0) { *j = ~s; @@ -1785,7 +1785,7 @@ construct_BWT_indexes(const unsigned cha if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } k = SA + BUCKET_B(c2 = c0, c1); } - assert(k < j); + assert(k < j); assert(k != NULL); *k-- = s; } else if(s != 0) { *j = ~s; diff --git a/contrib/python-zstandard/zstd/dictBuilder/fastcover.c b/contrib/python-zstandard/zstd/dictBuilder/fastcover.c new file mode 100644 --- /dev/null +++ b/contrib/python-zstandard/zstd/dictBuilder/fastcover.c @@ -0,0 +1,728 @@ +/*-************************************* +* Dependencies +***************************************/ +#include <stdio.h> /* fprintf */ +#include <stdlib.h> /* malloc, free, qsort */ +#include <string.h> /* memset */ +#include <time.h> /* clock */ + +#include "mem.h" /* read */ +#include "pool.h" +#include "threading.h" +#include "cover.h" +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "zdict.h" + + +/*-************************************* +* Constants +***************************************/ +#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define FASTCOVER_MAX_F 31 +#define FASTCOVER_MAX_ACCEL 10 +#define DEFAULT_SPLITPOINT 0.75 +#define DEFAULT_F 20 +#define DEFAULT_ACCEL 1 + + +/*-************************************* +* Console display +***************************************/ +static int g_displayLevel = 2; +#define DISPLAY(...) \ + { \ + fprintf(stderr, __VA_ARGS__); \ + fflush(stderr); \ + } +#define LOCALDISPLAYLEVEL(displayLevel, l, ...) \ + if (displayLevel >= l) { \ + DISPLAY(__VA_ARGS__); \ + } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ +#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__) + +#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \ + if (displayLevel >= l) { \ + if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \ + g_time = clock(); \ + DISPLAY(__VA_ARGS__); \ + } \ + } +#define DISPLAYUPDATE(l, ...)
LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__) +static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100; +static clock_t g_time = 0; + + +/*-************************************* +* Hash Functions +***************************************/ +static const U64 prime6bytes = 227718039650203ULL; +static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } + +static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } + + +/** + * Hash the d-byte value pointed to by p and mod 2^f + */ +static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) { + if (d == 6) { + return ZSTD_hash6Ptr(p, h) & ((1 << h) - 1); + } + return ZSTD_hash8Ptr(p, h) & ((1 << h) - 1); +} + + +/*-************************************* +* Acceleration +***************************************/ +typedef struct { + unsigned finalize; /* Percentage of training samples used for ZDICT_finalizeDictionary */ + unsigned skip; /* Number of dmer skipped between each dmer counted in computeFrequency */ +} FASTCOVER_accel_t; + + +static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = { + { 100, 0 }, /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */ + { 100, 0 }, /* accel = 1 */ + { 50, 1 }, /* accel = 2 */ + { 34, 2 }, /* accel = 3 */ + { 25, 3 }, /* accel = 4 */ + { 20, 4 }, /* accel = 5 */ + { 17, 5 }, /* accel = 6 */ + { 14, 6 }, /* accel = 7 */ + { 13, 7 }, /* accel = 8 */ + { 11, 8 }, /* accel = 9 */ + { 10, 9 }, /* accel = 10 */ +}; + + +/*-************************************* +* Context +***************************************/ +typedef struct { + const BYTE *samples; + size_t *offsets; + const size_t *samplesSizes; + size_t nbSamples; + size_t nbTrainSamples; + size_t nbTestSamples; + size_t nbDmers; + U32 *freqs; + unsigned d; + unsigned f; + FASTCOVER_accel_t accelParams; +} FASTCOVER_ctx_t; + + +/*-************************************* +* Helper functions +***************************************/ +/** + * Selects the best segment in an epoch. + * Segments of are scored according to the function: + * + * Let F(d) be the frequency of all dmers with hash value d. + * Let S_i be hash value of the dmer at position i of segment S which has length k. + * + * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1}) + * + * Once the dmer with hash value d is in the dictionay we set F(d) = 0. + */ +static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, + U32 *freqs, U32 begin, U32 end, + ZDICT_cover_params_t parameters, + U16* segmentFreqs) { + /* Constants */ + const U32 k = parameters.k; + const U32 d = parameters.d; + const U32 f = ctx->f; + const U32 dmersInK = k - d + 1; + + /* Try each segment (activeSegment) and save the best (bestSegment) */ + COVER_segment_t bestSegment = {0, 0, 0}; + COVER_segment_t activeSegment; + + /* Reset the activeDmers in the segment */ + /* The activeSegment starts at the beginning of the epoch. */ + activeSegment.begin = begin; + activeSegment.end = begin; + activeSegment.score = 0; + + /* Slide the activeSegment through the whole epoch. + * Save the best segment in bestSegment. 
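
FASTCOVER replaces COVER's exact suffix-array dmer ranking with hashing: every d-byte window is folded into one of 2^f counting buckets by the multiplicative hashes above. A self-contained sketch of the 8-byte path, where memcpy stands in for MEM_readLE64, so bucket values are host-endian here:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Illustrative only: the ZSTD_hash8-style multiplicative hash behind
     * FASTCOVER_hashPtrToIndex(). The shift already leaves exactly f bits, so
     * the extra & ((1 << f) - 1) in the library code appears to be a no-op
     * for this path. */
    static const uint64_t prime8bytes = 0xCF1BBCDCB7A56463ULL;

    static size_t hash8_index(const void *p, unsigned f) {
        uint64_t v;
        memcpy(&v, p, sizeof(v));   /* host byte order; upstream reads LE */
        return (size_t)((v * prime8bytes) >> (64 - f));
    }

    int main(void) {
        const char sample[] = "zstandard dictionaries";
        const unsigned f = 20;      /* DEFAULT_F: 2^20 frequency buckets */
        size_t i;
        for (i = 0; i + 8 <= sizeof(sample) - 1; ++i)
            printf("dmer at %2zu -> bucket %zu\n", i, hash8_index(sample + i, f));
        return 0;
    }
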
+ */ + while (activeSegment.end < end) { + /* Get hash value of current dmer */ + const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d); + + /* Add frequency of this index to score if this is the first occurence of index in active segment */ + if (segmentFreqs[index] == 0) { + activeSegment.score += freqs[index]; + } + /* Increment end of segment and segmentFreqs*/ + activeSegment.end += 1; + segmentFreqs[index] += 1; + /* If the window is now too large, drop the first position */ + if (activeSegment.end - activeSegment.begin == dmersInK + 1) { + /* Get hash value of the dmer to be eliminated from active segment */ + const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d); + segmentFreqs[delIndex] -= 1; + /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */ + if (segmentFreqs[delIndex] == 0) { + activeSegment.score -= freqs[delIndex]; + } + /* Increment start of segment */ + activeSegment.begin += 1; + } + + /* If this segment is the best so far save it */ + if (activeSegment.score > bestSegment.score) { + bestSegment = activeSegment; + } + } + + /* Zero out rest of segmentFreqs array */ + while (activeSegment.begin < end) { + const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d); + segmentFreqs[delIndex] -= 1; + activeSegment.begin += 1; + } + + { + /* Zero the frequency of hash value of each dmer covered by the chosen segment. */ + U32 pos; + for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { + const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d); + freqs[i] = 0; + } + } + + return bestSegment; +} + + +static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters, + size_t maxDictSize, unsigned f, + unsigned accel) { + /* k, d, and f are required parameters */ + if (parameters.d == 0 || parameters.k == 0) { + return 0; + } + /* d has to be 6 or 8 */ + if (parameters.d != 6 && parameters.d != 8) { + return 0; + } + /* k <= maxDictSize */ + if (parameters.k > maxDictSize) { + return 0; + } + /* d <= k */ + if (parameters.d > parameters.k) { + return 0; + } + /* 0 < f <= FASTCOVER_MAX_F*/ + if (f > FASTCOVER_MAX_F || f == 0) { + return 0; + } + /* 0 < splitPoint <= 1 */ + if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) { + return 0; + } + /* 0 < accel <= 10 */ + if (accel > 10 || accel == 0) { + return 0; + } + return 1; +} + + +/** + * Clean up a context initialized with `FASTCOVER_ctx_init()`. + */ +static void +FASTCOVER_ctx_destroy(FASTCOVER_ctx_t* ctx) +{ + if (!ctx) return; + + free(ctx->freqs); + ctx->freqs = NULL; + + free(ctx->offsets); + ctx->offsets = NULL; +} + + +/** + * Calculate for frequency of hash value of each dmer in ctx->samples + */ +static void +FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx) +{ + const unsigned f = ctx->f; + const unsigned d = ctx->d; + const unsigned skip = ctx->accelParams.skip; + const unsigned readLength = MAX(d, 8); + size_t i; + assert(ctx->nbTrainSamples >= 5); + assert(ctx->nbTrainSamples <= ctx->nbSamples); + for (i = 0; i < ctx->nbTrainSamples; i++) { + size_t start = ctx->offsets[i]; /* start of current dmer */ + size_t const currSampleEnd = ctx->offsets[i+1]; + while (start + readLength <= currSampleEnd) { + const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, d); + freqs[dmerIndex]++; + start = start + skip + 1; + } + } +} + + +/** + * Prepare a context for dictionary building. 
+ * The context is only dependent on the parameter `d` and can used multiple + * times. + * Returns 1 on success or zero on error. + * The context must be destroyed with `FASTCOVER_ctx_destroy()`. + */ +static int +FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx, + const void* samplesBuffer, + const size_t* samplesSizes, unsigned nbSamples, + unsigned d, double splitPoint, unsigned f, + FASTCOVER_accel_t accelParams) +{ + const BYTE* const samples = (const BYTE*)samplesBuffer; + const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples); + /* Split samples into testing and training sets */ + const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples; + const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples; + const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize; + const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize; + + /* Checks */ + if (totalSamplesSize < MAX(d, sizeof(U64)) || + totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) { + DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", + (U32)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20)); + return 0; + } + + /* Check if there are at least 5 training samples */ + if (nbTrainSamples < 5) { + DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples); + return 0; + } + + /* Check if there's testing sample */ + if (nbTestSamples < 1) { + DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples); + return 0; + } + + /* Zero the context */ + memset(ctx, 0, sizeof(*ctx)); + DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples, + (U32)trainingSamplesSize); + DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples, + (U32)testSamplesSize); + + ctx->samples = samples; + ctx->samplesSizes = samplesSizes; + ctx->nbSamples = nbSamples; + ctx->nbTrainSamples = nbTrainSamples; + ctx->nbTestSamples = nbTestSamples; + ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1; + ctx->d = d; + ctx->f = f; + ctx->accelParams = accelParams; + + /* The offsets of each file */ + ctx->offsets = (size_t*)calloc((nbSamples + 1), sizeof(size_t)); + if (ctx->offsets == NULL) { + DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n"); + FASTCOVER_ctx_destroy(ctx); + return 0; + } + + /* Fill offsets from the samplesSizes */ + { U32 i; + ctx->offsets[0] = 0; + assert(nbSamples >= 5); + for (i = 1; i <= nbSamples; ++i) { + ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; + } + } + + /* Initialize frequency array of size 2^f */ + ctx->freqs = (U32*)calloc(((U64)1 << f), sizeof(U32)); + if (ctx->freqs == NULL) { + DISPLAYLEVEL(1, "Failed to allocate frequency table \n"); + FASTCOVER_ctx_destroy(ctx); + return 0; + } + + DISPLAYLEVEL(2, "Computing frequencies\n"); + FASTCOVER_computeFrequency(ctx->freqs, ctx); + + return 1; +} + + +/** + * Given the prepared context build the dictionary. + */ +static size_t +FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx, + U32* freqs, + void* dictBuffer, size_t dictBufferCapacity, + ZDICT_cover_params_t parameters, + U16* segmentFreqs) +{ + BYTE *const dict = (BYTE *)dictBuffer; + size_t tail = dictBufferCapacity; + /* Divide the data up into epochs of equal size. + * We will select at least one segment from each epoch. 
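
FASTCOVER_computeFrequency() above visits only every (skip + 1)-th dmer, with skip drawn from the acceleration table, which is where the speed/accuracy trade comes from. A toy single-buffer version of that counting pass, with illustrative names:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Illustrative only: accel = 1 counts every dmer (skip = 0), accel = 2
     * every second one (skip = 1), and so on. */
    static size_t bucket(const unsigned char *p, unsigned f) {
        uint64_t v;
        memcpy(&v, p, sizeof(v));                       /* host byte order */
        return (size_t)((v * 0xCF1BBCDCB7A56463ULL) >> (64 - f));
    }

    int main(void) {
        const unsigned char sample[] = "abcabcabcabcabcabcabcabc";
        const unsigned f = 10, skip = 1;                /* 2^10 buckets, accel = 2 */
        uint32_t *freqs = (uint32_t *)calloc((size_t)1 << f, sizeof(*freqs));
        size_t pos, counted = 0;
        if (!freqs) return 1;
        for (pos = 0; pos + 8 <= sizeof(sample) - 1; pos += skip + 1) {
            freqs[bucket(sample + pos, f)]++;           /* one dmer counted */
            counted++;
        }
        printf("counted %zu dmers\n", counted);
        free(freqs);
        return 0;
    }
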
+ */ + const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k)); + const U32 epochSize = (U32)(ctx->nbDmers / epochs); + size_t epoch; + DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs, + epochSize); + /* Loop through the epochs until there are no more segments or the dictionary + * is full. + */ + for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) { + const U32 epochBegin = (U32)(epoch * epochSize); + const U32 epochEnd = epochBegin + epochSize; + size_t segmentSize; + /* Select a segment */ + COVER_segment_t segment = FASTCOVER_selectSegment( + ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs); + + /* If the segment covers no dmers, then we are out of content */ + if (segment.score == 0) { + break; + } + + /* Trim the segment if necessary and if it is too small then we are done */ + segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail); + if (segmentSize < parameters.d) { + break; + } + + /* We fill the dictionary from the back to allow the best segments to be + * referenced with the smallest offsets. + */ + tail -= segmentSize; + memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); + DISPLAYUPDATE( + 2, "\r%u%% ", + (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + return tail; +} + + +/** + * Parameters for FASTCOVER_tryParameters(). + */ +typedef struct FASTCOVER_tryParameters_data_s { + const FASTCOVER_ctx_t* ctx; + COVER_best_t* best; + size_t dictBufferCapacity; + ZDICT_cover_params_t parameters; +} FASTCOVER_tryParameters_data_t; + + +/** + * Tries a set of parameters and updates the COVER_best_t with the results. + * This function is thread safe if zstd is compiled with multithreaded support. + * It takes its parameters as an *OWNING* opaque pointer to support threading. 
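
FASTCOVER_selectSegment() earlier in the file slides a window of k - d + 1 dmers across an epoch, adding a bucket's frequency to the score only on its first occurrence inside the window and subtracting it when the last copy slides out. The bookkeeping, reduced to small integers so it can run standalone:

    #include <stdio.h>

    /* Illustrative only: a segment's score is the summed global frequency of
     * the distinct buckets it covers; segmentFreqs tracks multiplicity so
     * each bucket is scored once per window. */
    int main(void) {
        const unsigned buckets[] = {3, 1, 3, 2, 1, 3, 0, 2};   /* hashed dmers */
        const unsigned freqs[]   = {5, 2, 7, 1};               /* global counts */
        const unsigned n = 8, window = 3;                      /* k - d + 1 dmers */
        unsigned segmentFreqs[4] = {0};
        unsigned begin = 0, end = 0, score = 0, bestScore = 0, bestBegin = 0;
        while (end < n) {
            if (segmentFreqs[buckets[end]]++ == 0) score += freqs[buckets[end]];
            ++end;
            if (end - begin == window + 1) {                   /* window too large, drop front */
                if (--segmentFreqs[buckets[begin]] == 0) score -= freqs[buckets[begin]];
                ++begin;
            }
            if (score > bestScore) { bestScore = score; bestBegin = begin; }
        }
        printf("best window starts at %u, score %u\n", bestBegin, bestScore);
        return 0;
    }

Once a segment wins, the library zeroes the frequencies of the buckets it covers, so later epochs favour material the dictionary does not yet contain.
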
+ */ +static void FASTCOVER_tryParameters(void *opaque) +{ + /* Save parameters as local variables */ + FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque; + const FASTCOVER_ctx_t *const ctx = data->ctx; + const ZDICT_cover_params_t parameters = data->parameters; + size_t dictBufferCapacity = data->dictBufferCapacity; + size_t totalCompressedSize = ERROR(GENERIC); + /* Initialize array to keep track of frequency of dmer within activeSegment */ + U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16)); + /* Allocate space for hash table, dict, and freqs */ + BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32)); + if (!segmentFreqs || !dict || !freqs) { + DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n"); + goto _cleanup; + } + /* Copy the frequencies because we need to modify them */ + memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32)); + /* Build the dictionary */ + { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity, + parameters, segmentFreqs); + const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100); + dictBufferCapacity = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams); + if (ZDICT_isError(dictBufferCapacity)) { + DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); + goto _cleanup; + } + } + /* Check total compressed size */ + totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes, + ctx->samples, ctx->offsets, + ctx->nbTrainSamples, ctx->nbSamples, + dict, dictBufferCapacity); +_cleanup: + COVER_best_finish(data->best, totalCompressedSize, parameters, dict, + dictBufferCapacity); + free(data); + free(segmentFreqs); + free(dict); + free(freqs); +} + + +static void +FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams, + ZDICT_cover_params_t* coverParams) +{ + coverParams->k = fastCoverParams.k; + coverParams->d = fastCoverParams.d; + coverParams->steps = fastCoverParams.steps; + coverParams->nbThreads = fastCoverParams.nbThreads; + coverParams->splitPoint = fastCoverParams.splitPoint; + coverParams->zParams = fastCoverParams.zParams; +} + + +static void +FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams, + ZDICT_fastCover_params_t* fastCoverParams, + unsigned f, unsigned accel) +{ + fastCoverParams->k = coverParams.k; + fastCoverParams->d = coverParams.d; + fastCoverParams->steps = coverParams.steps; + fastCoverParams->nbThreads = coverParams.nbThreads; + fastCoverParams->splitPoint = coverParams.splitPoint; + fastCoverParams->f = f; + fastCoverParams->accel = accel; + fastCoverParams->zParams = coverParams.zParams; +} + + +ZDICTLIB_API size_t +ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, + const size_t* samplesSizes, unsigned nbSamples, + ZDICT_fastCover_params_t parameters) +{ + BYTE* const dict = (BYTE*)dictBuffer; + FASTCOVER_ctx_t ctx; + ZDICT_cover_params_t coverParams; + FASTCOVER_accel_t accelParams; + /* Initialize global data */ + g_displayLevel = parameters.zParams.notificationLevel; + /* Assign splitPoint and f if not provided */ + parameters.splitPoint = 1.0; + parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f; + parameters.accel = parameters.accel == 0 ? 
DEFAULT_ACCEL : parameters.accel; + /* Convert to cover parameter */ + memset(&coverParams, 0 , sizeof(coverParams)); + FASTCOVER_convertToCoverParams(parameters, &coverParams); + /* Checks */ + if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f, + parameters.accel)) { + DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + /* Assign corresponding FASTCOVER_accel_t to accelParams*/ + accelParams = FASTCOVER_defaultAccelParameters[parameters.accel]; + /* Initialize context */ + if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, + coverParams.d, parameters.splitPoint, parameters.f, + accelParams)) { + DISPLAYLEVEL(1, "Failed to initialize context\n"); + return ERROR(GENERIC); + } + /* Build the dictionary */ + DISPLAYLEVEL(2, "Building dictionary\n"); + { + /* Initialize array to keep track of frequency of dmer within activeSegment */ + U16* segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16)); + const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer, + dictBufferCapacity, coverParams, segmentFreqs); + const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100); + const size_t dictionarySize = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams); + if (!ZSTD_isError(dictionarySize)) { + DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", + (U32)dictionarySize); + } + FASTCOVER_ctx_destroy(&ctx); + free(segmentFreqs); + return dictionarySize; + } +} + + +ZDICTLIB_API size_t +ZDICT_optimizeTrainFromBuffer_fastCover( + void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, + const size_t* samplesSizes, unsigned nbSamples, + ZDICT_fastCover_params_t* parameters) +{ + ZDICT_cover_params_t coverParams; + FASTCOVER_accel_t accelParams; + /* constants */ + const unsigned nbThreads = parameters->nbThreads; + const double splitPoint = + parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; + const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; + const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; + const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; + const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k; + const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps; + const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1); + const unsigned kIterations = + (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); + const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f; + const unsigned accel = parameters->accel == 0 ? 
DEFAULT_ACCEL : parameters->accel; + /* Local variables */ + const int displayLevel = parameters->zParams.notificationLevel; + unsigned iteration = 1; + unsigned d; + unsigned k; + COVER_best_t best; + POOL_ctx *pool = NULL; + /* Checks */ + if (splitPoint <= 0 || splitPoint > 1) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n"); + return ERROR(GENERIC); + } + if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n"); + return ERROR(GENERIC); + } + if (kMinK < kMaxD || kMaxK < kMinK) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + if (nbThreads > 1) { + pool = POOL_create(nbThreads, 1); + if (!pool) { + return ERROR(memory_allocation); + } + } + /* Initialization */ + COVER_best_init(&best); + memset(&coverParams, 0 , sizeof(coverParams)); + FASTCOVER_convertToCoverParams(*parameters, &coverParams); + accelParams = FASTCOVER_defaultAccelParameters[accel]; + /* Turn down global display level to clean up display at level 2 and below */ + g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1; + /* Loop through d first because each new value needs a new context */ + LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n", + kIterations); + for (d = kMinD; d <= kMaxD; d += 2) { + /* Initialize the context for this value of d */ + FASTCOVER_ctx_t ctx; + LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d); + if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams)) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); + COVER_best_destroy(&best); + POOL_free(pool); + return ERROR(GENERIC); + } + /* Loop through k reusing the same context */ + for (k = kMinK; k <= kMaxK; k += kStepSize) { + /* Prepare the arguments */ + FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc( + sizeof(FASTCOVER_tryParameters_data_t)); + LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k); + if (!data) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n"); + COVER_best_destroy(&best); + FASTCOVER_ctx_destroy(&ctx); + POOL_free(pool); + return ERROR(GENERIC); + } + data->ctx = &ctx; + data->best = &best; + data->dictBufferCapacity = dictBufferCapacity; + data->parameters = coverParams; + data->parameters.k = k; + data->parameters.d = d; + data->parameters.splitPoint = splitPoint; + data->parameters.steps = kSteps; + data->parameters.zParams.notificationLevel = g_displayLevel; + /* Check the parameters */ + if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity, + data->ctx->f, accel)) { + DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n"); + free(data); + continue; + } + /* Call the function and pass ownership of data to it */ + COVER_best_start(&best); + if (pool) { + POOL_add(pool, &FASTCOVER_tryParameters, data); + } else { + FASTCOVER_tryParameters(data); + } + /* Print status */ + LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ", + (U32)((iteration * 100) / kIterations)); + ++iteration; + } + COVER_best_wait(&best); + FASTCOVER_ctx_destroy(&ctx); + } + LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", ""); + /* Fill the output buffer and parameters with 
output of the best parameters */ + { + const size_t dictSize = best.dictSize; + if (ZSTD_isError(best.compressedSize)) { + const size_t compressedSize = best.compressedSize; + COVER_best_destroy(&best); + POOL_free(pool); + return compressedSize; + } + FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel); + memcpy(dictBuffer, best.dict, dictSize); + COVER_best_destroy(&best); + POOL_free(pool); + return dictSize; + } + +} diff --git a/contrib/python-zstandard/zstd/dictBuilder/zdict.c b/contrib/python-zstandard/zstd/dictBuilder/zdict.c --- a/contrib/python-zstandard/zstd/dictBuilder/zdict.c +++ b/contrib/python-zstandard/zstd/dictBuilder/zdict.c @@ -293,7 +293,7 @@ static dictItem ZDICT_analyzePos( refinedEnd = refinedStart + selectedCount; } - /* evaluate gain based on new ref */ + /* evaluate gain based on new dict */ start = refinedStart; pos = suffix[refinedStart]; end = start; @@ -341,7 +341,7 @@ static dictItem ZDICT_analyzePos( for (i=MINMATCHLENGTH; i<=(int)maxLength; i++) savings[i] = savings[i-1] + (lengthList[i] * (i-3)); - DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n", + DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n", (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength); solution.pos = (U32)pos; @@ -581,7 +581,7 @@ static void ZDICT_fillNoise(void* buffer typedef struct { - ZSTD_CCtx* ref; /* contains reference to dictionary */ + ZSTD_CDict* dict; /* dictionary */ ZSTD_CCtx* zc; /* working context */ void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ } EStats_ress_t; @@ -597,8 +597,9 @@ static void ZDICT_countEStats(EStats_res size_t cSize; if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ - { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); - if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } + { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict); + if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; } + } cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } @@ -697,7 +698,7 @@ static size_t ZDICT_analyzeEntropy(void* short litLengthNCount[MaxLL+1]; U32 repOffset[MAXREPOFFSET]; offsetCount_t bestRepOffset[ZSTD_REP_NUM+1]; - EStats_ress_t esr; + EStats_ress_t esr = { NULL, NULL, NULL }; ZSTD_parameters params; U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total; size_t pos = 0, errorCode; @@ -708,14 +709,6 @@ static size_t ZDICT_analyzeEntropy(void* /* init */ DEBUGLOG(4, "ZDICT_analyzeEntropy"); - esr.ref = ZSTD_createCCtx(); - esr.zc = ZSTD_createCCtx(); - esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); - if (!esr.ref || !esr.zc || !esr.workPlace) { - eSize = ERROR(memory_allocation); - DISPLAYLEVEL(1, "Not enough memory \n"); - goto _cleanup; - } if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */ for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */ for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1; @@ -724,14 +717,17 @@ static size_t ZDICT_analyzeEntropy(void* memset(repOffset, 0, sizeof(repOffset)); repOffset[1] = repOffset[4] = repOffset[8] = 1; memset(bestRepOffset, 0, sizeof(bestRepOffset)); - if 
(compressionLevel<=0) compressionLevel = g_compressionLevel_default; + if (compressionLevel==0) compressionLevel = g_compressionLevel_default; params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); - { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0); - if (ZSTD_isError(beginResult)) { - DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult)); - eSize = ERROR(GENERIC); - goto _cleanup; - } } + + esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem); + esr.zc = ZSTD_createCCtx(); + esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); + if (!esr.dict || !esr.zc || !esr.workPlace) { + eSize = ERROR(memory_allocation); + DISPLAYLEVEL(1, "Not enough memory \n"); + goto _cleanup; + } /* collect stats on all samples */ for (u=0; u
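
For completeness, a hedged sketch of calling the optimizing fastcover trainer that this vendoring adds; the sample corpus is fabricated, the buffer sizes are arbitrary, and the prototypes are assumed to sit behind ZDICT_STATIC_LINKING_ONLY in the vendored zdict.h:

    /* Illustrative only: link against the vendored libzstd. */
    #define ZDICT_STATIC_LINKING_ONLY
    #include <stdio.h>
    #include <string.h>
    #include "zdict.h"

    int main(void) {
        static char samples[100 * 64];          /* 100 fake 64-byte samples */
        size_t samplesSizes[100];
        unsigned char dict[4096];
        ZDICT_fastCover_params_t params;
        size_t i, dictSize;
        for (i = 0; i < 100; ++i) {
            samplesSizes[i] = 64;
            memset(samples + 64 * i, (int)('a' + i % 4), 64);   /* fake, repetitive content */
        }
        memset(&params, 0, sizeof(params));     /* zeroed fields select the defaults */
        params.splitPoint = 0.0;                /* 0 picks DEFAULT_SPLITPOINT (0.75) */
        dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(
            dict, sizeof(dict), samples, samplesSizes, 100, &params);
        if (ZDICT_isError(dictSize)) {
            fprintf(stderr, "training failed: %s\n", ZDICT_getErrorName(dictSize));
            return 1;
        }
        printf("built %zu byte dictionary with k=%u d=%u f=%u accel=%u\n",
               dictSize, params.k, params.d, params.f, params.accel);
        return 0;
    }

On success the trainer writes the winning k, d, f, and accel back into params, as the conversion helpers above indicate.
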