Show More
cborutil.py
1061 lines
| 35.9 KiB
| text/x-python
|
PythonLexer
Gregory Szorc
|
r37729 | # cborutil.py - CBOR extensions | ||
# | ||||
# Copyright 2018 Gregory Szorc <gregory.szorc@gmail.com> | ||||
# | ||||
# This software may be used and distributed according to the terms of the | ||||
# GNU General Public License version 2 or any later version. | ||||
from __future__ import absolute_import | ||||
import struct | ||||
Gregory Szorc
|
r39448 | import sys | ||
Gregory Szorc
|
r37729 | |||
Matt Harbison
|
r39490 | from .. import pycompat | ||
Gregory Szorc
|
r37729 | # Very short very of RFC 7049... | ||
# | ||||
# Each item begins with a byte. The 3 high bits of that byte denote the | ||||
# "major type." The lower 5 bits denote the "subtype." Each major type | ||||
# has its own encoding mechanism. | ||||
# | ||||
# Most types have lengths. However, bytestring, string, array, and map | ||||
# can be indefinite length. These are denotes by a subtype with value 31. | ||||
# Sub-components of those types then come afterwards and are terminated | ||||
# by a "break" byte. | ||||
MAJOR_TYPE_UINT = 0 | ||||
MAJOR_TYPE_NEGINT = 1 | ||||
MAJOR_TYPE_BYTESTRING = 2 | ||||
MAJOR_TYPE_STRING = 3 | ||||
MAJOR_TYPE_ARRAY = 4 | ||||
MAJOR_TYPE_MAP = 5 | ||||
MAJOR_TYPE_SEMANTIC = 6 | ||||
MAJOR_TYPE_SPECIAL = 7 | ||||
SUBTYPE_MASK = 0b00011111 | ||||
Gregory Szorc
|
r39448 | SUBTYPE_FALSE = 20 | ||
SUBTYPE_TRUE = 21 | ||||
SUBTYPE_NULL = 22 | ||||
Gregory Szorc
|
r37729 | SUBTYPE_HALF_FLOAT = 25 | ||
SUBTYPE_SINGLE_FLOAT = 26 | ||||
SUBTYPE_DOUBLE_FLOAT = 27 | ||||
SUBTYPE_INDEFINITE = 31 | ||||
Gregory Szorc
|
r39448 | SEMANTIC_TAG_FINITE_SET = 258 | ||
Gregory Szorc
|
r37729 | # Indefinite types begin with their major type ORd with information value 31. | ||
BEGIN_INDEFINITE_BYTESTRING = struct.pack( | ||||
Augie Fackler
|
r43906 | '>B', MAJOR_TYPE_BYTESTRING << 5 | SUBTYPE_INDEFINITE | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r37729 | BEGIN_INDEFINITE_ARRAY = struct.pack( | ||
Augie Fackler
|
r43906 | '>B', MAJOR_TYPE_ARRAY << 5 | SUBTYPE_INDEFINITE | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r37729 | BEGIN_INDEFINITE_MAP = struct.pack( | ||
Augie Fackler
|
r43906 | '>B', MAJOR_TYPE_MAP << 5 | SUBTYPE_INDEFINITE | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r37729 | |||
Augie Fackler
|
r43906 | ENCODED_LENGTH_1 = struct.Struct('>B') | ||
ENCODED_LENGTH_2 = struct.Struct('>BB') | ||||
ENCODED_LENGTH_3 = struct.Struct('>BH') | ||||
ENCODED_LENGTH_4 = struct.Struct('>BL') | ||||
ENCODED_LENGTH_5 = struct.Struct('>BQ') | ||||
Gregory Szorc
|
r37729 | |||
# The break ends an indefinite length item. | ||||
BREAK = b'\xff' | ||||
BREAK_INT = 255 | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def encodelength(majortype, length): | ||
"""Obtain a value encoding the major type and its length.""" | ||||
if length < 24: | ||||
return ENCODED_LENGTH_1.pack(majortype << 5 | length) | ||||
elif length < 256: | ||||
return ENCODED_LENGTH_2.pack(majortype << 5 | 24, length) | ||||
elif length < 65536: | ||||
return ENCODED_LENGTH_3.pack(majortype << 5 | 25, length) | ||||
elif length < 4294967296: | ||||
return ENCODED_LENGTH_4.pack(majortype << 5 | 26, length) | ||||
else: | ||||
return ENCODED_LENGTH_5.pack(majortype << 5 | 27, length) | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencodebytestring(v): | ||
yield encodelength(MAJOR_TYPE_BYTESTRING, len(v)) | ||||
yield v | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencodebytestringfromiter(it): | ||
"""Convert an iterator of chunks to an indefinite bytestring. | ||||
Given an input that is iterable and each element in the iterator is | ||||
representable as bytes, emit an indefinite length bytestring. | ||||
""" | ||||
yield BEGIN_INDEFINITE_BYTESTRING | ||||
for chunk in it: | ||||
yield encodelength(MAJOR_TYPE_BYTESTRING, len(chunk)) | ||||
yield chunk | ||||
yield BREAK | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencodeindefinitebytestring(source, chunksize=65536): | ||
"""Given a large source buffer, emit as an indefinite length bytestring. | ||||
This is a generator of chunks constituting the encoded CBOR data. | ||||
""" | ||||
yield BEGIN_INDEFINITE_BYTESTRING | ||||
i = 0 | ||||
l = len(source) | ||||
while True: | ||||
Augie Fackler
|
r43346 | chunk = source[i : i + chunksize] | ||
Gregory Szorc
|
r37729 | i += len(chunk) | ||
yield encodelength(MAJOR_TYPE_BYTESTRING, len(chunk)) | ||||
yield chunk | ||||
if i >= l: | ||||
break | ||||
yield BREAK | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencodeint(v): | ||
if v >= 18446744073709551616 or v < -18446744073709551616: | ||||
Augie Fackler
|
r43347 | raise ValueError(b'big integers not supported') | ||
Gregory Szorc
|
r37729 | |||
if v >= 0: | ||||
yield encodelength(MAJOR_TYPE_UINT, v) | ||||
else: | ||||
yield encodelength(MAJOR_TYPE_NEGINT, abs(v) - 1) | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencodearray(l): | ||
"""Encode a known size iterable to an array.""" | ||||
yield encodelength(MAJOR_TYPE_ARRAY, len(l)) | ||||
for i in l: | ||||
for chunk in streamencode(i): | ||||
yield chunk | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencodearrayfromiter(it): | ||
"""Encode an iterator of items to an indefinite length array.""" | ||||
yield BEGIN_INDEFINITE_ARRAY | ||||
for i in it: | ||||
for chunk in streamencode(i): | ||||
yield chunk | ||||
yield BREAK | ||||
Augie Fackler
|
r43346 | |||
Augie Fackler
|
r37915 | def _mixedtypesortkey(v): | ||
return type(v).__name__, v | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencodeset(s): | ||
# https://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml defines | ||||
# semantic tag 258 for finite sets. | ||||
Gregory Szorc
|
r39448 | yield encodelength(MAJOR_TYPE_SEMANTIC, SEMANTIC_TAG_FINITE_SET) | ||
Gregory Szorc
|
r37729 | |||
Augie Fackler
|
r37915 | for chunk in streamencodearray(sorted(s, key=_mixedtypesortkey)): | ||
Gregory Szorc
|
r37729 | yield chunk | ||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencodemap(d): | ||
"""Encode dictionary to a generator. | ||||
Does not supporting indefinite length dictionaries. | ||||
""" | ||||
yield encodelength(MAJOR_TYPE_MAP, len(d)) | ||||
Augie Fackler
|
r43346 | for key, value in sorted( | ||
Gregory Szorc
|
r43376 | pycompat.iteritems(d), key=lambda x: _mixedtypesortkey(x[0]) | ||
Augie Fackler
|
r43346 | ): | ||
Gregory Szorc
|
r37729 | for chunk in streamencode(key): | ||
yield chunk | ||||
for chunk in streamencode(value): | ||||
yield chunk | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencodemapfromiter(it): | ||
"""Given an iterable of (key, value), encode to an indefinite length map.""" | ||||
yield BEGIN_INDEFINITE_MAP | ||||
for key, value in it: | ||||
for chunk in streamencode(key): | ||||
yield chunk | ||||
for chunk in streamencode(value): | ||||
yield chunk | ||||
yield BREAK | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencodebool(b): | ||
# major type 7, simple value 20 and 21. | ||||
yield b'\xf5' if b else b'\xf4' | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencodenone(v): | ||
# major type 7, simple value 22. | ||||
yield b'\xf6' | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | STREAM_ENCODERS = { | ||
bytes: streamencodebytestring, | ||||
int: streamencodeint, | ||||
Matt Harbison
|
r39490 | pycompat.long: streamencodeint, | ||
Gregory Szorc
|
r37729 | list: streamencodearray, | ||
tuple: streamencodearray, | ||||
dict: streamencodemap, | ||||
set: streamencodeset, | ||||
bool: streamencodebool, | ||||
type(None): streamencodenone, | ||||
} | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r37729 | def streamencode(v): | ||
"""Encode a value in a streaming manner. | ||||
Given an input object, encode it to CBOR recursively. | ||||
Returns a generator of CBOR encoded bytes. There is no guarantee | ||||
that each emitted chunk fully decodes to a value or sub-value. | ||||
Encoding is deterministic - unordered collections are sorted. | ||||
""" | ||||
fn = STREAM_ENCODERS.get(v.__class__) | ||||
if not fn: | ||||
Yuya Nishihara
|
r42680 | # handle subtypes such as encoding.localstr and util.sortdict | ||
for ty in STREAM_ENCODERS: | ||||
if not isinstance(v, ty): | ||||
continue | ||||
fn = STREAM_ENCODERS[ty] | ||||
break | ||||
if not fn: | ||||
Augie Fackler
|
r43347 | raise ValueError(b'do not know how to encode %s' % type(v)) | ||
Gregory Szorc
|
r37729 | |||
return fn(v) | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39448 | class CBORDecodeError(Exception): | ||
"""Represents an error decoding CBOR.""" | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39448 | if sys.version_info.major >= 3: | ||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39448 | def _elementtointeger(b, i): | ||
return b[i] | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39448 | else: | ||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39448 | def _elementtointeger(b, i): | ||
return ord(b[i]) | ||||
Augie Fackler
|
r43346 | |||
Augie Fackler
|
r43906 | STRUCT_BIG_UBYTE = struct.Struct('>B') | ||
Augie Fackler
|
r43347 | STRUCT_BIG_USHORT = struct.Struct(b'>H') | ||
STRUCT_BIG_ULONG = struct.Struct(b'>L') | ||||
STRUCT_BIG_ULONGLONG = struct.Struct(b'>Q') | ||||
Gregory Szorc
|
r39448 | |||
SPECIAL_NONE = 0 | ||||
SPECIAL_START_INDEFINITE_BYTESTRING = 1 | ||||
SPECIAL_START_ARRAY = 2 | ||||
SPECIAL_START_MAP = 3 | ||||
SPECIAL_START_SET = 4 | ||||
SPECIAL_INDEFINITE_BREAK = 5 | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39448 | def decodeitem(b, offset=0): | ||
"""Decode a new CBOR value from a buffer at offset. | ||||
This function attempts to decode up to one complete CBOR value | ||||
from ``b`` starting at offset ``offset``. | ||||
The beginning of a collection (such as an array, map, set, or | ||||
indefinite length bytestring) counts as a single value. For these | ||||
special cases, a state flag will indicate that a special value was seen. | ||||
When called, the function either returns a decoded value or gives | ||||
a hint as to how many more bytes are needed to do so. By calling | ||||
the function repeatedly given a stream of bytes, the caller can | ||||
build up the original values. | ||||
Returns a tuple with the following elements: | ||||
* Bool indicating whether a complete value was decoded. | ||||
* A decoded value if first value is True otherwise None | ||||
* Integer number of bytes. If positive, the number of bytes | ||||
read. If negative, the number of bytes we need to read to | ||||
decode this value or the next chunk in this value. | ||||
* One of the ``SPECIAL_*`` constants indicating special treatment | ||||
for this value. ``SPECIAL_NONE`` means this is a fully decoded | ||||
simple value (such as an integer or bool). | ||||
""" | ||||
initial = _elementtointeger(b, offset) | ||||
offset += 1 | ||||
majortype = initial >> 5 | ||||
subtype = initial & SUBTYPE_MASK | ||||
if majortype == MAJOR_TYPE_UINT: | ||||
complete, value, readcount = decodeuint(subtype, b, offset) | ||||
if complete: | ||||
return True, value, readcount + 1, SPECIAL_NONE | ||||
else: | ||||
return False, None, readcount, SPECIAL_NONE | ||||
elif majortype == MAJOR_TYPE_NEGINT: | ||||
# Negative integers are the same as UINT except inverted minus 1. | ||||
complete, value, readcount = decodeuint(subtype, b, offset) | ||||
if complete: | ||||
return True, -value - 1, readcount + 1, SPECIAL_NONE | ||||
else: | ||||
return False, None, readcount, SPECIAL_NONE | ||||
elif majortype == MAJOR_TYPE_BYTESTRING: | ||||
# Beginning of bytestrings are treated as uints in order to | ||||
# decode their length, which may be indefinite. | ||||
Augie Fackler
|
r43346 | complete, size, readcount = decodeuint( | ||
subtype, b, offset, allowindefinite=True | ||||
) | ||||
Gregory Szorc
|
r39448 | |||
# We don't know the size of the bytestring. It must be a definitive | ||||
# length since the indefinite subtype would be encoded in the initial | ||||
# byte. | ||||
if not complete: | ||||
return False, None, readcount, SPECIAL_NONE | ||||
# We know the length of the bytestring. | ||||
if size is not None: | ||||
# And the data is available in the buffer. | ||||
if offset + readcount + size <= len(b): | ||||
Augie Fackler
|
r43346 | value = b[offset + readcount : offset + readcount + size] | ||
Gregory Szorc
|
r39448 | return True, value, readcount + size + 1, SPECIAL_NONE | ||
# And we need more data in order to return the bytestring. | ||||
else: | ||||
wanted = len(b) - offset - readcount - size | ||||
return False, None, wanted, SPECIAL_NONE | ||||
# It is an indefinite length bytestring. | ||||
else: | ||||
return True, None, 1, SPECIAL_START_INDEFINITE_BYTESTRING | ||||
elif majortype == MAJOR_TYPE_STRING: | ||||
Augie Fackler
|
r43347 | raise CBORDecodeError(b'string major type not supported') | ||
Gregory Szorc
|
r39448 | |||
elif majortype == MAJOR_TYPE_ARRAY: | ||||
# Beginning of arrays are treated as uints in order to decode their | ||||
# length. We don't allow indefinite length arrays. | ||||
complete, size, readcount = decodeuint(subtype, b, offset) | ||||
if complete: | ||||
return True, size, readcount + 1, SPECIAL_START_ARRAY | ||||
else: | ||||
return False, None, readcount, SPECIAL_NONE | ||||
elif majortype == MAJOR_TYPE_MAP: | ||||
# Beginning of maps are treated as uints in order to decode their | ||||
# number of elements. We don't allow indefinite length arrays. | ||||
complete, size, readcount = decodeuint(subtype, b, offset) | ||||
if complete: | ||||
return True, size, readcount + 1, SPECIAL_START_MAP | ||||
else: | ||||
return False, None, readcount, SPECIAL_NONE | ||||
elif majortype == MAJOR_TYPE_SEMANTIC: | ||||
# Semantic tag value is read the same as a uint. | ||||
complete, tagvalue, readcount = decodeuint(subtype, b, offset) | ||||
if not complete: | ||||
return False, None, readcount, SPECIAL_NONE | ||||
# This behavior here is a little wonky. The main type being "decorated" | ||||
# by this semantic tag follows. A more robust parser would probably emit | ||||
# a special flag indicating this as a semantic tag and let the caller | ||||
# deal with the types that follow. But since we don't support many | ||||
# semantic tags, it is easier to deal with the special cases here and | ||||
# hide complexity from the caller. If we add support for more semantic | ||||
# tags, we should probably move semantic tag handling into the caller. | ||||
if tagvalue == SEMANTIC_TAG_FINITE_SET: | ||||
if offset + readcount >= len(b): | ||||
return False, None, -1, SPECIAL_NONE | ||||
Augie Fackler
|
r43346 | complete, size, readcount2, special = decodeitem( | ||
b, offset + readcount | ||||
) | ||||
Gregory Szorc
|
r39448 | |||
if not complete: | ||||
return False, None, readcount2, SPECIAL_NONE | ||||
if special != SPECIAL_START_ARRAY: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Martin von Zweigbergk
|
r43387 | b'expected array after finite set semantic tag' | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
return True, size, readcount + readcount2 + 1, SPECIAL_START_SET | ||||
else: | ||||
Augie Fackler
|
r43347 | raise CBORDecodeError(b'semantic tag %d not allowed' % tagvalue) | ||
Gregory Szorc
|
r39448 | |||
elif majortype == MAJOR_TYPE_SPECIAL: | ||||
# Only specific values for the information field are allowed. | ||||
if subtype == SUBTYPE_FALSE: | ||||
return True, False, 1, SPECIAL_NONE | ||||
elif subtype == SUBTYPE_TRUE: | ||||
return True, True, 1, SPECIAL_NONE | ||||
elif subtype == SUBTYPE_NULL: | ||||
return True, None, 1, SPECIAL_NONE | ||||
elif subtype == SUBTYPE_INDEFINITE: | ||||
return True, None, 1, SPECIAL_INDEFINITE_BREAK | ||||
# If value is 24, subtype is in next byte. | ||||
else: | ||||
Augie Fackler
|
r43347 | raise CBORDecodeError(b'special type %d not allowed' % subtype) | ||
Gregory Szorc
|
r39448 | else: | ||
assert False | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39448 | def decodeuint(subtype, b, offset=0, allowindefinite=False): | ||
"""Decode an unsigned integer. | ||||
``subtype`` is the lower 5 bits from the initial byte CBOR item | ||||
"header." ``b`` is a buffer containing bytes. ``offset`` points to | ||||
the index of the first byte after the byte that ``subtype`` was | ||||
derived from. | ||||
``allowindefinite`` allows the special indefinite length value | ||||
indicator. | ||||
Returns a 3-tuple of (successful, value, count). | ||||
The first element is a bool indicating if decoding completed. The 2nd | ||||
is the decoded integer value or None if not fully decoded or the subtype | ||||
is 31 and ``allowindefinite`` is True. The 3rd value is the count of bytes. | ||||
If positive, it is the number of additional bytes decoded. If negative, | ||||
it is the number of additional bytes needed to decode this value. | ||||
""" | ||||
# Small values are inline. | ||||
if subtype < 24: | ||||
return True, subtype, 0 | ||||
# Indefinite length specifier. | ||||
elif subtype == 31: | ||||
if allowindefinite: | ||||
return True, None, 0 | ||||
else: | ||||
Augie Fackler
|
r43347 | raise CBORDecodeError(b'indefinite length uint not allowed here') | ||
Gregory Szorc
|
r39448 | elif subtype >= 28: | ||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'unsupported subtype on integer type: %d' % subtype | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
if subtype == 24: | ||||
s = STRUCT_BIG_UBYTE | ||||
elif subtype == 25: | ||||
s = STRUCT_BIG_USHORT | ||||
elif subtype == 26: | ||||
s = STRUCT_BIG_ULONG | ||||
elif subtype == 27: | ||||
s = STRUCT_BIG_ULONGLONG | ||||
else: | ||||
Augie Fackler
|
r43347 | raise CBORDecodeError(b'bounds condition checking violation') | ||
Gregory Szorc
|
r39448 | |||
if len(b) - offset >= s.size: | ||||
return True, s.unpack_from(b, offset)[0], s.size | ||||
else: | ||||
return False, None, len(b) - offset - s.size | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39448 | class bytestringchunk(bytes): | ||
"""Represents a chunk/segment in an indefinite length bytestring. | ||||
This behaves like a ``bytes`` but in addition has the ``isfirst`` | ||||
and ``islast`` attributes indicating whether this chunk is the first | ||||
or last in an indefinite length bytestring. | ||||
""" | ||||
def __new__(cls, v, first=False, last=False): | ||||
self = bytes.__new__(cls, v) | ||||
self.isfirst = first | ||||
self.islast = last | ||||
return self | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39448 | class sansiodecoder(object): | ||
"""A CBOR decoder that doesn't perform its own I/O. | ||||
To use, construct an instance and feed it segments containing | ||||
CBOR-encoded bytes via ``decode()``. The return value from ``decode()`` | ||||
indicates whether a fully-decoded value is available, how many bytes | ||||
were consumed, and offers a hint as to how many bytes should be fed | ||||
in next time to decode the next value. | ||||
The decoder assumes it will decode N discrete CBOR values, not just | ||||
a single value. i.e. if the bytestream contains uints packed one after | ||||
the other, the decoder will decode them all, rather than just the initial | ||||
one. | ||||
When ``decode()`` indicates a value is available, call ``getavailable()`` | ||||
to return all fully decoded values. | ||||
``decode()`` can partially decode input. It is up to the caller to keep | ||||
track of what data was consumed and to pass unconsumed data in on the | ||||
next invocation. | ||||
The decoder decodes atomically at the *item* level. See ``decodeitem()``. | ||||
If an *item* cannot be fully decoded, the decoder won't record it as | ||||
partially consumed. Instead, the caller will be instructed to pass in | ||||
the initial bytes of this item on the next invocation. This does result | ||||
in some redundant parsing. But the overhead should be minimal. | ||||
This decoder only supports a subset of CBOR as required by Mercurial. | ||||
It lacks support for: | ||||
* Indefinite length arrays | ||||
* Indefinite length maps | ||||
* Use of indefinite length bytestrings as keys or values within | ||||
arrays, maps, or sets. | ||||
* Nested arrays, maps, or sets within sets | ||||
* Any semantic tag that isn't a mathematical finite set | ||||
* Floating point numbers | ||||
* Undefined special value | ||||
CBOR types are decoded to Python types as follows: | ||||
uint -> int | ||||
negint -> int | ||||
bytestring -> bytes | ||||
map -> dict | ||||
array -> list | ||||
True -> bool | ||||
False -> bool | ||||
null -> None | ||||
indefinite length bytestring chunk -> [bytestringchunk] | ||||
The only non-obvious mapping here is an indefinite length bytestring | ||||
to the ``bytestringchunk`` type. This is to facilitate streaming | ||||
indefinite length bytestrings out of the decoder and to differentiate | ||||
a regular bytestring from an indefinite length bytestring. | ||||
""" | ||||
_STATE_NONE = 0 | ||||
_STATE_WANT_MAP_KEY = 1 | ||||
_STATE_WANT_MAP_VALUE = 2 | ||||
_STATE_WANT_ARRAY_VALUE = 3 | ||||
_STATE_WANT_SET_VALUE = 4 | ||||
_STATE_WANT_BYTESTRING_CHUNK_FIRST = 5 | ||||
_STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT = 6 | ||||
def __init__(self): | ||||
# TODO add support for limiting size of bytestrings | ||||
# TODO add support for limiting number of keys / values in collections | ||||
# TODO add support for limiting size of buffered partial values | ||||
self.decodedbytecount = 0 | ||||
self._state = self._STATE_NONE | ||||
# Stack of active nested collections. Each entry is a dict describing | ||||
# the collection. | ||||
self._collectionstack = [] | ||||
# Fully decoded key to use for the current map. | ||||
self._currentmapkey = None | ||||
# Fully decoded values available for retrieval. | ||||
self._decodedvalues = [] | ||||
@property | ||||
def inprogress(self): | ||||
"""Whether the decoder has partially decoded a value.""" | ||||
return self._state != self._STATE_NONE | ||||
def decode(self, b, offset=0): | ||||
"""Attempt to decode bytes from an input buffer. | ||||
``b`` is a collection of bytes and ``offset`` is the byte | ||||
offset within that buffer from which to begin reading data. | ||||
``b`` must support ``len()`` and accessing bytes slices via | ||||
``__slice__``. Typically ``bytes`` instances are used. | ||||
Returns a tuple with the following fields: | ||||
* Bool indicating whether values are available for retrieval. | ||||
* Integer indicating the number of bytes that were fully consumed, | ||||
starting from ``offset``. | ||||
* Integer indicating the number of bytes that are desired for the | ||||
next call in order to decode an item. | ||||
""" | ||||
if not b: | ||||
return bool(self._decodedvalues), 0, 0 | ||||
initialoffset = offset | ||||
# We could easily split the body of this loop into a function. But | ||||
# Python performance is sensitive to function calls and collections | ||||
# are composed of many items. So leaving as a while loop could help | ||||
# with performance. One thing that may not help is the use of | ||||
# if..elif versus a lookup/dispatch table. There may be value | ||||
# in switching that. | ||||
while offset < len(b): | ||||
# Attempt to decode an item. This could be a whole value or a | ||||
# special value indicating an event, such as start or end of a | ||||
# collection or indefinite length type. | ||||
complete, value, readcount, special = decodeitem(b, offset) | ||||
if readcount > 0: | ||||
self.decodedbytecount += readcount | ||||
if not complete: | ||||
assert readcount < 0 | ||||
return ( | ||||
bool(self._decodedvalues), | ||||
offset - initialoffset, | ||||
-readcount, | ||||
) | ||||
offset += readcount | ||||
# No nested state. We either have a full value or beginning of a | ||||
# complex value to deal with. | ||||
if self._state == self._STATE_NONE: | ||||
# A normal value. | ||||
if special == SPECIAL_NONE: | ||||
self._decodedvalues.append(value) | ||||
elif special == SPECIAL_START_ARRAY: | ||||
Augie Fackler
|
r43346 | self._collectionstack.append( | ||
Augie Fackler
|
r43347 | {b'remaining': value, b'v': [],} | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | self._state = self._STATE_WANT_ARRAY_VALUE | ||
elif special == SPECIAL_START_MAP: | ||||
Augie Fackler
|
r43346 | self._collectionstack.append( | ||
Augie Fackler
|
r43347 | {b'remaining': value, b'v': {},} | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | self._state = self._STATE_WANT_MAP_KEY | ||
elif special == SPECIAL_START_SET: | ||||
Augie Fackler
|
r43346 | self._collectionstack.append( | ||
Augie Fackler
|
r43347 | {b'remaining': value, b'v': set(),} | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | self._state = self._STATE_WANT_SET_VALUE | ||
elif special == SPECIAL_START_INDEFINITE_BYTESTRING: | ||||
self._state = self._STATE_WANT_BYTESTRING_CHUNK_FIRST | ||||
else: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'unhandled special state: %d' % special | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
# This value becomes an element of the current array. | ||||
elif self._state == self._STATE_WANT_ARRAY_VALUE: | ||||
# Simple values get appended. | ||||
if special == SPECIAL_NONE: | ||||
c = self._collectionstack[-1] | ||||
Augie Fackler
|
r43347 | c[b'v'].append(value) | ||
c[b'remaining'] -= 1 | ||||
Gregory Szorc
|
r39448 | |||
# self._state doesn't need changed. | ||||
# An array nested within an array. | ||||
elif special == SPECIAL_START_ARRAY: | ||||
lastc = self._collectionstack[-1] | ||||
newvalue = [] | ||||
Augie Fackler
|
r43347 | lastc[b'v'].append(newvalue) | ||
lastc[b'remaining'] -= 1 | ||||
Gregory Szorc
|
r39448 | |||
Augie Fackler
|
r43346 | self._collectionstack.append( | ||
Augie Fackler
|
r43347 | {b'remaining': value, b'v': newvalue,} | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
# self._state doesn't need changed. | ||||
# A map nested within an array. | ||||
elif special == SPECIAL_START_MAP: | ||||
lastc = self._collectionstack[-1] | ||||
newvalue = {} | ||||
Augie Fackler
|
r43347 | lastc[b'v'].append(newvalue) | ||
lastc[b'remaining'] -= 1 | ||||
Gregory Szorc
|
r39448 | |||
Augie Fackler
|
r43346 | self._collectionstack.append( | ||
Augie Fackler
|
r43347 | {b'remaining': value, b'v': newvalue} | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
self._state = self._STATE_WANT_MAP_KEY | ||||
elif special == SPECIAL_START_SET: | ||||
lastc = self._collectionstack[-1] | ||||
newvalue = set() | ||||
Augie Fackler
|
r43347 | lastc[b'v'].append(newvalue) | ||
lastc[b'remaining'] -= 1 | ||||
Gregory Szorc
|
r39448 | |||
Augie Fackler
|
r43346 | self._collectionstack.append( | ||
Augie Fackler
|
r43347 | {b'remaining': value, b'v': newvalue,} | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
self._state = self._STATE_WANT_SET_VALUE | ||||
elif special == SPECIAL_START_INDEFINITE_BYTESTRING: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'indefinite length bytestrings ' | ||
b'not allowed as array values' | ||||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
else: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'unhandled special item when ' | ||
b'expecting array value: %d' % special | ||||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
# This value becomes the key of the current map instance. | ||||
elif self._state == self._STATE_WANT_MAP_KEY: | ||||
if special == SPECIAL_NONE: | ||||
self._currentmapkey = value | ||||
self._state = self._STATE_WANT_MAP_VALUE | ||||
elif special == SPECIAL_START_INDEFINITE_BYTESTRING: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'indefinite length bytestrings ' | ||
b'not allowed as map keys' | ||||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
Augie Fackler
|
r43346 | elif special in ( | ||
SPECIAL_START_ARRAY, | ||||
SPECIAL_START_MAP, | ||||
SPECIAL_START_SET, | ||||
): | ||||
raise CBORDecodeError( | ||||
Martin von Zweigbergk
|
r43387 | b'collections not supported as map keys' | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
# We do not allow special values to be used as map keys. | ||||
else: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'unhandled special item when ' | ||
b'expecting map key: %d' % special | ||||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
# This value becomes the value of the current map key. | ||||
elif self._state == self._STATE_WANT_MAP_VALUE: | ||||
# Simple values simply get inserted into the map. | ||||
if special == SPECIAL_NONE: | ||||
lastc = self._collectionstack[-1] | ||||
Augie Fackler
|
r43347 | lastc[b'v'][self._currentmapkey] = value | ||
lastc[b'remaining'] -= 1 | ||||
Gregory Szorc
|
r39448 | |||
self._state = self._STATE_WANT_MAP_KEY | ||||
# A new array is used as the map value. | ||||
elif special == SPECIAL_START_ARRAY: | ||||
lastc = self._collectionstack[-1] | ||||
newvalue = [] | ||||
Augie Fackler
|
r43347 | lastc[b'v'][self._currentmapkey] = newvalue | ||
lastc[b'remaining'] -= 1 | ||||
Gregory Szorc
|
r39448 | |||
Augie Fackler
|
r43346 | self._collectionstack.append( | ||
Augie Fackler
|
r43347 | {b'remaining': value, b'v': newvalue,} | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
self._state = self._STATE_WANT_ARRAY_VALUE | ||||
# A new map is used as the map value. | ||||
elif special == SPECIAL_START_MAP: | ||||
lastc = self._collectionstack[-1] | ||||
newvalue = {} | ||||
Augie Fackler
|
r43347 | lastc[b'v'][self._currentmapkey] = newvalue | ||
lastc[b'remaining'] -= 1 | ||||
Gregory Szorc
|
r39448 | |||
Augie Fackler
|
r43346 | self._collectionstack.append( | ||
Augie Fackler
|
r43347 | {b'remaining': value, b'v': newvalue,} | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
self._state = self._STATE_WANT_MAP_KEY | ||||
# A new set is used as the map value. | ||||
elif special == SPECIAL_START_SET: | ||||
lastc = self._collectionstack[-1] | ||||
newvalue = set() | ||||
Augie Fackler
|
r43347 | lastc[b'v'][self._currentmapkey] = newvalue | ||
lastc[b'remaining'] -= 1 | ||||
Gregory Szorc
|
r39448 | |||
Augie Fackler
|
r43346 | self._collectionstack.append( | ||
Augie Fackler
|
r43347 | {b'remaining': value, b'v': newvalue,} | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
self._state = self._STATE_WANT_SET_VALUE | ||||
elif special == SPECIAL_START_INDEFINITE_BYTESTRING: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'indefinite length bytestrings not ' | ||
b'allowed as map values' | ||||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
else: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'unhandled special item when ' | ||
b'expecting map value: %d' % special | ||||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
self._currentmapkey = None | ||||
# This value is added to the current set. | ||||
elif self._state == self._STATE_WANT_SET_VALUE: | ||||
if special == SPECIAL_NONE: | ||||
lastc = self._collectionstack[-1] | ||||
Augie Fackler
|
r43347 | lastc[b'v'].add(value) | ||
lastc[b'remaining'] -= 1 | ||||
Gregory Szorc
|
r39448 | |||
elif special == SPECIAL_START_INDEFINITE_BYTESTRING: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'indefinite length bytestrings not ' | ||
b'allowed as set values' | ||||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
Augie Fackler
|
r43346 | elif special in ( | ||
SPECIAL_START_ARRAY, | ||||
SPECIAL_START_MAP, | ||||
SPECIAL_START_SET, | ||||
): | ||||
raise CBORDecodeError( | ||||
Martin von Zweigbergk
|
r43387 | b'collections not allowed as set values' | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
# We don't allow non-trivial types to exist as set values. | ||||
else: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'unhandled special item when ' | ||
b'expecting set value: %d' % special | ||||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
# This value represents the first chunk in an indefinite length | ||||
# bytestring. | ||||
elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_FIRST: | ||||
# We received a full chunk. | ||||
if special == SPECIAL_NONE: | ||||
Augie Fackler
|
r43346 | self._decodedvalues.append( | ||
bytestringchunk(value, first=True) | ||||
) | ||||
Gregory Szorc
|
r39448 | |||
self._state = self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT | ||||
# The end of stream marker. This means it is an empty | ||||
# indefinite length bytestring. | ||||
elif special == SPECIAL_INDEFINITE_BREAK: | ||||
# We /could/ convert this to a b''. But we want to preserve | ||||
# the nature of the underlying data so consumers expecting | ||||
# an indefinite length bytestring get one. | ||||
Augie Fackler
|
r43346 | self._decodedvalues.append( | ||
bytestringchunk(b'', first=True, last=True) | ||||
) | ||||
Gregory Szorc
|
r39448 | |||
# Since indefinite length bytestrings can't be used in | ||||
# collections, we must be at the root level. | ||||
assert not self._collectionstack | ||||
self._state = self._STATE_NONE | ||||
else: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'unexpected special value when ' | ||
b'expecting bytestring chunk: %d' % special | ||||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
# This value represents the non-initial chunk in an indefinite | ||||
# length bytestring. | ||||
elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT: | ||||
# We received a full chunk. | ||||
if special == SPECIAL_NONE: | ||||
self._decodedvalues.append(bytestringchunk(value)) | ||||
# The end of stream marker. | ||||
elif special == SPECIAL_INDEFINITE_BREAK: | ||||
self._decodedvalues.append(bytestringchunk(b'', last=True)) | ||||
# Since indefinite length bytestrings can't be used in | ||||
# collections, we must be at the root level. | ||||
assert not self._collectionstack | ||||
self._state = self._STATE_NONE | ||||
else: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'unexpected special value when ' | ||
b'expecting bytestring chunk: %d' % special | ||||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
else: | ||||
Augie Fackler
|
r43346 | raise CBORDecodeError( | ||
Augie Fackler
|
r43347 | b'unhandled decoder state: %d' % self._state | ||
Augie Fackler
|
r43346 | ) | ||
Gregory Szorc
|
r39448 | |||
# We could have just added the final value in a collection. End | ||||
# all complete collections at the top of the stack. | ||||
while True: | ||||
# Bail if we're not waiting on a new collection item. | ||||
Augie Fackler
|
r43346 | if self._state not in ( | ||
self._STATE_WANT_ARRAY_VALUE, | ||||
self._STATE_WANT_MAP_KEY, | ||||
self._STATE_WANT_SET_VALUE, | ||||
): | ||||
Gregory Szorc
|
r39448 | break | ||
# Or we are expecting more items for this collection. | ||||
lastc = self._collectionstack[-1] | ||||
Augie Fackler
|
r43347 | if lastc[b'remaining']: | ||
Gregory Szorc
|
r39448 | break | ||
# The collection at the top of the stack is complete. | ||||
# Discard it, as it isn't needed for future items. | ||||
self._collectionstack.pop() | ||||
# If this is a nested collection, we don't emit it, since it | ||||
# will be emitted by its parent collection. But we do need to | ||||
# update state to reflect what the new top-most collection | ||||
# on the stack is. | ||||
if self._collectionstack: | ||||
self._state = { | ||||
list: self._STATE_WANT_ARRAY_VALUE, | ||||
dict: self._STATE_WANT_MAP_KEY, | ||||
set: self._STATE_WANT_SET_VALUE, | ||||
Augie Fackler
|
r43347 | }[type(self._collectionstack[-1][b'v'])] | ||
Gregory Szorc
|
r39448 | |||
# If this is the root collection, emit it. | ||||
else: | ||||
Augie Fackler
|
r43347 | self._decodedvalues.append(lastc[b'v']) | ||
Gregory Szorc
|
r39448 | self._state = self._STATE_NONE | ||
return ( | ||||
bool(self._decodedvalues), | ||||
offset - initialoffset, | ||||
0, | ||||
) | ||||
def getavailable(self): | ||||
"""Returns an iterator over fully decoded values. | ||||
Once values are retrieved, they won't be available on the next call. | ||||
""" | ||||
l = list(self._decodedvalues) | ||||
self._decodedvalues = [] | ||||
return l | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39450 | class bufferingdecoder(object): | ||
"""A CBOR decoder that buffers undecoded input. | ||||
This is a glorified wrapper around ``sansiodecoder`` that adds a buffering | ||||
layer. All input that isn't consumed by ``sansiodecoder`` will be buffered | ||||
and concatenated with any new input that arrives later. | ||||
TODO consider adding limits as to the maximum amount of data that can | ||||
be buffered. | ||||
""" | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39450 | def __init__(self): | ||
self._decoder = sansiodecoder() | ||||
Gregory Szorc
|
r40066 | self._chunks = [] | ||
self._wanted = 0 | ||||
Gregory Szorc
|
r39450 | |||
def decode(self, b): | ||||
"""Attempt to decode bytes to CBOR values. | ||||
Returns a tuple with the following fields: | ||||
* Bool indicating whether new values are available for retrieval. | ||||
* Integer number of bytes decoded from the new input. | ||||
* Integer number of bytes wanted to decode the next value. | ||||
""" | ||||
Gregory Szorc
|
r40160 | # We /might/ be able to support passing a bytearray all the | ||
# way through. For now, let's cheat. | ||||
if isinstance(b, bytearray): | ||||
b = bytes(b) | ||||
Gregory Szorc
|
r40066 | # Our strategy for buffering is to aggregate the incoming chunks in a | ||
# list until we've received enough data to decode the next item. | ||||
# This is slightly more complicated than using an ``io.BytesIO`` | ||||
# or continuously concatenating incoming data. However, because it | ||||
# isn't constantly reallocating backing memory for a growing buffer, | ||||
# it prevents excessive memory thrashing and is significantly faster, | ||||
# especially in cases where the percentage of input chunks that don't | ||||
# decode into a full item is high. | ||||
Gregory Szorc
|
r39450 | |||
Gregory Szorc
|
r40066 | if self._chunks: | ||
# A previous call said we needed N bytes to decode the next item. | ||||
# But this call doesn't provide enough data. We buffer the incoming | ||||
# chunk without attempting to decode. | ||||
if len(b) < self._wanted: | ||||
self._chunks.append(b) | ||||
self._wanted -= len(b) | ||||
return False, 0, self._wanted | ||||
# Else we may have enough data to decode the next item. Aggregate | ||||
# old data with new and reset the buffer. | ||||
newlen = len(b) | ||||
self._chunks.append(b) | ||||
b = b''.join(self._chunks) | ||||
self._chunks = [] | ||||
oldlen = len(b) - newlen | ||||
Gregory Szorc
|
r39450 | else: | ||
oldlen = 0 | ||||
available, readcount, wanted = self._decoder.decode(b) | ||||
Gregory Szorc
|
r40066 | self._wanted = wanted | ||
Gregory Szorc
|
r39450 | |||
if readcount < len(b): | ||||
Gregory Szorc
|
r40066 | self._chunks.append(b[readcount:]) | ||
Gregory Szorc
|
r39450 | |||
return available, readcount - oldlen, wanted | ||||
def getavailable(self): | ||||
return self._decoder.getavailable() | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39448 | def decodeall(b): | ||
"""Decode all CBOR items present in an iterable of bytes. | ||||
In addition to regular decode errors, raises CBORDecodeError if the | ||||
entirety of the passed buffer does not fully decode to complete CBOR | ||||
values. This includes failure to decode any value, incomplete collection | ||||
types, incomplete indefinite length items, and extra data at the end of | ||||
the buffer. | ||||
""" | ||||
if not b: | ||||
return [] | ||||
decoder = sansiodecoder() | ||||
havevalues, readcount, wantbytes = decoder.decode(b) | ||||
if readcount != len(b): | ||||
Augie Fackler
|
r43347 | raise CBORDecodeError(b'input data not fully consumed') | ||
Gregory Szorc
|
r39448 | |||
if decoder.inprogress: | ||||
Augie Fackler
|
r43347 | raise CBORDecodeError(b'input data not complete') | ||
Gregory Szorc
|
r39448 | |||
return decoder.getavailable() | ||||