##// END OF EJS Templates
cborutil: change buffering strategy...
Gregory Szorc -
r40066:62160d30 default
parent child Browse files
Show More
@@ -1,968 +1,990 b''
1 # cborutil.py - CBOR extensions
1 # cborutil.py - CBOR extensions
2 #
2 #
3 # Copyright 2018 Gregory Szorc <gregory.szorc@gmail.com>
3 # Copyright 2018 Gregory Szorc <gregory.szorc@gmail.com>
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import struct
10 import struct
11 import sys
11 import sys
12
12
13 from .. import pycompat
13 from .. import pycompat
14
14
15 # Very short version of RFC 7049...
15 # Very short version of RFC 7049...
16 #
16 #
17 # Each item begins with a byte. The 3 high bits of that byte denote the
17 # Each item begins with a byte. The 3 high bits of that byte denote the
18 # "major type." The lower 5 bits denote the "subtype." Each major type
18 # "major type." The lower 5 bits denote the "subtype." Each major type
19 # has its own encoding mechanism.
19 # has its own encoding mechanism.
20 #
20 #
21 # Most types have lengths. However, bytestring, string, array, and map
21 # Most types have lengths. However, bytestring, string, array, and map
22 # can be indefinite length. These are denoted by a subtype with value 31.
22 # can be indefinite length. These are denoted by a subtype with value 31.
23 # Sub-components of those types then come afterwards and are terminated
23 # Sub-components of those types then come afterwards and are terminated
24 # by a "break" byte.
24 # by a "break" byte.
25
25
# Major types: the value stored in the 3 high bits of an item's initial byte.
MAJOR_TYPE_UINT = 0
MAJOR_TYPE_NEGINT = 1
MAJOR_TYPE_BYTESTRING = 2
MAJOR_TYPE_STRING = 3
MAJOR_TYPE_ARRAY = 4
MAJOR_TYPE_MAP = 5
MAJOR_TYPE_SEMANTIC = 6
MAJOR_TYPE_SPECIAL = 7

# Mask selecting the lower 5 bits (the "subtype") of an initial byte.
SUBTYPE_MASK = 0b00011111

# Subtype values used with MAJOR_TYPE_SPECIAL, plus the indefinite length
# marker.
SUBTYPE_FALSE = 20
SUBTYPE_TRUE = 21
SUBTYPE_NULL = 22
SUBTYPE_HALF_FLOAT = 25
SUBTYPE_SINGLE_FLOAT = 26
SUBTYPE_DOUBLE_FLOAT = 27
SUBTYPE_INDEFINITE = 31

# Semantic tag identifying the array that follows as a finite set.
SEMANTIC_TAG_FINITE_SET = 258

# Indefinite types begin with their major type ORd with information value 31.
BEGIN_INDEFINITE_BYTESTRING = struct.pack(
    r'>B', MAJOR_TYPE_BYTESTRING << 5 | SUBTYPE_INDEFINITE)
BEGIN_INDEFINITE_ARRAY = struct.pack(
    r'>B', MAJOR_TYPE_ARRAY << 5 | SUBTYPE_INDEFINITE)
BEGIN_INDEFINITE_MAP = struct.pack(
    r'>B', MAJOR_TYPE_MAP << 5 | SUBTYPE_INDEFINITE)

# Pre-compiled packers for an initial byte followed by a length of 0, 1, 2,
# 4, or 8 bytes (big-endian).
ENCODED_LENGTH_1 = struct.Struct(r'>B')
ENCODED_LENGTH_2 = struct.Struct(r'>BB')
ENCODED_LENGTH_3 = struct.Struct(r'>BH')
ENCODED_LENGTH_4 = struct.Struct(r'>BL')
ENCODED_LENGTH_5 = struct.Struct(r'>BQ')

# The break ends an indefinite length item.
BREAK = b'\xff'
BREAK_INT = 255
64
64
def encodelength(majortype, length):
    """Obtain a value encoding the major type and its length.

    ``majortype`` is shifted into the 3 high bits of the initial byte.
    A ``length`` below 24 is stored inline in the low 5 bits. Larger values
    are emitted as an initial byte whose low bits are 24, 25, 26, or 27
    followed by ``length`` packed as a 1, 2, 4, or 8 byte big-endian
    unsigned integer, respectively.

    Returns the packed bytes.
    """
    if length < 24:
        return ENCODED_LENGTH_1.pack(majortype << 5 | length)
    elif length < 256:
        return ENCODED_LENGTH_2.pack(majortype << 5 | 24, length)
    elif length < 65536:
        return ENCODED_LENGTH_3.pack(majortype << 5 | 25, length)
    elif length < 4294967296:
        return ENCODED_LENGTH_4.pack(majortype << 5 | 26, length)
    else:
        return ENCODED_LENGTH_5.pack(majortype << 5 | 27, length)
77
77
def streamencodebytestring(v):
    """Encode ``v`` as a definite length bytestring.

    This is a generator emitting the length header followed by ``v`` itself.
    """
    yield encodelength(MAJOR_TYPE_BYTESTRING, len(v))
    yield v
81
81
def streamencodebytestringfromiter(it):
    """Convert an iterator of chunks to an indefinite bytestring.

    Given an input that is iterable and each element in the iterator is
    representable as bytes, emit an indefinite length bytestring.

    This is a generator: it emits the indefinite bytestring marker, then a
    length header and the chunk itself for every element of ``it``, then
    the break byte.
    """
    yield BEGIN_INDEFINITE_BYTESTRING

    for chunk in it:
        yield encodelength(MAJOR_TYPE_BYTESTRING, len(chunk))
        yield chunk

    yield BREAK
95
95
def streamencodeindefinitebytestring(source, chunksize=65536):
    """Given a large source buffer, emit as an indefinite length bytestring.

    This is a generator of chunks constituting the encoded CBOR data.

    ``source`` is sliced into pieces of at most ``chunksize`` bytes, each
    emitted as a definite length bytestring. An empty ``source`` still
    produces a single zero-length chunk.

    Raises ``ValueError`` (when iteration starts) if ``chunksize`` is not
    positive.
    """
    # A non-positive chunk size can never consume a non-empty source and
    # would make the loop below emit empty chunks forever.
    if chunksize <= 0:
        raise ValueError('chunksize must be positive')

    yield BEGIN_INDEFINITE_BYTESTRING

    i = 0
    l = len(source)

    while True:
        chunk = source[i:i + chunksize]
        i += len(chunk)

        yield encodelength(MAJOR_TYPE_BYTESTRING, len(chunk))
        yield chunk

        if i >= l:
            break

    yield BREAK
117
117
def streamencodeint(v):
    """Encode an integer as a CBOR unsigned or negative integer.

    Non-negative values are emitted with the uint major type. Negative
    values are emitted with the negint major type carrying ``abs(v) - 1``.

    Raises ``ValueError`` if ``v >= 2**64`` or ``v < -2**64``.
    """
    if v >= 18446744073709551616 or v < -18446744073709551616:
        raise ValueError('big integers not supported')

    if v >= 0:
        yield encodelength(MAJOR_TYPE_UINT, v)
    else:
        yield encodelength(MAJOR_TYPE_NEGINT, abs(v) - 1)
126
126
def streamencodearray(l):
    """Encode a known size iterable to an array.

    Emits a definite length array header for ``len(l)`` and then the
    encoding of every element, in iteration order.
    """

    yield encodelength(MAJOR_TYPE_ARRAY, len(l))

    for i in l:
        for chunk in streamencode(i):
            yield chunk
135
135
def streamencodearrayfromiter(it):
    """Encode an iterator of items to an indefinite length array.

    Emits the indefinite array marker, the encoding of every item yielded
    by ``it``, and finally the break byte.
    """

    yield BEGIN_INDEFINITE_ARRAY

    for i in it:
        for chunk in streamencode(i):
            yield chunk

    yield BREAK
146
146
147 def _mixedtypesortkey(v):
147 def _mixedtypesortkey(v):
148 return type(v).__name__, v
148 return type(v).__name__, v
149
149
def streamencodeset(s):
    """Encode a set as a tagged, definite length array.

    Elements are sorted with ``_mixedtypesortkey`` before being encoded.
    """
    # https://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml defines
    # semantic tag 258 for finite sets.
    yield encodelength(MAJOR_TYPE_SEMANTIC, SEMANTIC_TAG_FINITE_SET)

    for chunk in streamencodearray(sorted(s, key=_mixedtypesortkey)):
        yield chunk
157
157
def streamencodemap(d):
    """Encode dictionary to a generator.

    Emits a definite length map header for ``len(d)`` followed by each key
    and its value. Entries are ordered by ``_mixedtypesortkey`` applied to
    the key.

    Does not support indefinite length dictionaries.
    """
    yield encodelength(MAJOR_TYPE_MAP, len(d))

    # ``items()`` exists on both Python 2 and 3 dictionaries, unlike
    # ``iteritems()``; sorted() materializes a list either way.
    for key, value in sorted(d.items(),
                             key=lambda x: _mixedtypesortkey(x[0])):
        for chunk in streamencode(key):
            yield chunk
        for chunk in streamencode(value):
            yield chunk
171
171
def streamencodemapfromiter(it):
    """Given an iterable of (key, value), encode to an indefinite length map.

    Pairs are emitted in the order ``it`` yields them, between the
    indefinite map marker and the break byte.
    """
    yield BEGIN_INDEFINITE_MAP

    for key, value in it:
        for chunk in streamencode(key):
            yield chunk
        for chunk in streamencode(value):
            yield chunk

    yield BREAK
183
183
def streamencodebool(b):
    """Emit the single byte encoding a boolean."""
    # major type 7, simple value 20 and 21.
    yield b'\xf5' if b else b'\xf4'
187
187
def streamencodenone(v):
    """Emit the single byte encoding null. ``v`` is ignored."""
    # major type 7, simple value 22.
    yield b'\xf6'
191
191
# Maps a Python type to the generator function that encodes values of that
# type. Consulted by streamencode().
STREAM_ENCODERS = {
    bytes: streamencodebytestring,
    int: streamencodeint,
    pycompat.long: streamencodeint,
    list: streamencodearray,
    tuple: streamencodearray,
    dict: streamencodemap,
    set: streamencodeset,
    bool: streamencodebool,
    type(None): streamencodenone,
}
203
203
def streamencode(v):
    """Encode a value in a streaming manner.

    Given an input object, encode it to CBOR recursively.

    Returns a generator of CBOR encoded bytes. There is no guarantee
    that each emitted chunk fully decodes to a value or sub-value.

    Encoding is deterministic - unordered collections are sorted.

    The encoder is looked up by the exact class of ``v``, so instances of
    subclasses of the supported types are not matched. Raises
    ``ValueError`` when no encoder is registered for that class.
    """
    fn = STREAM_ENCODERS.get(v.__class__)

    if not fn:
        raise ValueError('do not know how to encode %s' % type(v))

    return fn(v)
220
220
class CBORDecodeError(Exception):
    """Represents an error decoding CBOR."""
223
223
224 if sys.version_info.major >= 3:
224 if sys.version_info.major >= 3:
225 def _elementtointeger(b, i):
225 def _elementtointeger(b, i):
226 return b[i]
226 return b[i]
227 else:
227 else:
228 def _elementtointeger(b, i):
228 def _elementtointeger(b, i):
229 return ord(b[i])
229 return ord(b[i])
230
230
# Pre-compiled unpackers for big-endian unsigned integers of 1, 2, 4, and
# 8 bytes.
STRUCT_BIG_UBYTE = struct.Struct(r'>B')
STRUCT_BIG_USHORT = struct.Struct(r'>H')
STRUCT_BIG_ULONG = struct.Struct(r'>L')
STRUCT_BIG_ULONGLONG = struct.Struct(r'>Q')

# State flags returned as the last element of decodeitem()'s result.
SPECIAL_NONE = 0
SPECIAL_START_INDEFINITE_BYTESTRING = 1
SPECIAL_START_ARRAY = 2
SPECIAL_START_MAP = 3
SPECIAL_START_SET = 4
SPECIAL_INDEFINITE_BREAK = 5
242
242
def decodeitem(b, offset=0):
    """Decode a new CBOR value from a buffer at offset.

    This function attempts to decode up to one complete CBOR value
    from ``b`` starting at offset ``offset``.

    The beginning of a collection (such as an array, map, set, or
    indefinite length bytestring) counts as a single value. For these
    special cases, a state flag will indicate that a special value was seen.

    When called, the function either returns a decoded value or gives
    a hint as to how many more bytes are needed to do so. By calling
    the function repeatedly given a stream of bytes, the caller can
    build up the original values.

    Returns a tuple with the following elements:

    * Bool indicating whether a complete value was decoded.
    * A decoded value if first value is True otherwise None. For the start
      of an array, map, or set this is the number of elements announced by
      the header.
    * Integer number of bytes. If positive, the number of bytes
      read. If negative, the number of bytes we need to read to
      decode this value or the next chunk in this value.
    * One of the ``SPECIAL_*`` constants indicating special treatment
      for this value. ``SPECIAL_NONE`` means this is a fully decoded
      simple value (such as an integer or bool).

    Raises ``CBORDecodeError`` for the string major type, for semantic
    tags other than the finite set tag, for a finite set tag that is not
    followed by an array, and for unsupported special or integer subtypes.
    """

    initial = _elementtointeger(b, offset)
    # From here on ``offset`` points at the first byte after the initial one.
    offset += 1

    majortype = initial >> 5
    subtype = initial & SUBTYPE_MASK

    if majortype == MAJOR_TYPE_UINT:
        complete, value, readcount = decodeuint(subtype, b, offset)

        if complete:
            # + 1 accounts for the initial byte.
            return True, value, readcount + 1, SPECIAL_NONE
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_NEGINT:
        # Negative integers are the same as UINT except inverted minus 1.
        complete, value, readcount = decodeuint(subtype, b, offset)

        if complete:
            return True, -value - 1, readcount + 1, SPECIAL_NONE
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_BYTESTRING:
        # Beginning of bytestrings are treated as uints in order to
        # decode their length, which may be indefinite.
        complete, size, readcount = decodeuint(subtype, b, offset,
                                               allowindefinite=True)

        # We don't know the size of the bytestring. It must be a definitive
        # length since the indefinite subtype would be encoded in the initial
        # byte.
        if not complete:
            return False, None, readcount, SPECIAL_NONE

        # We know the length of the bytestring.
        if size is not None:
            # And the data is available in the buffer.
            if offset + readcount + size <= len(b):
                value = b[offset + readcount:offset + readcount + size]
                return True, value, readcount + size + 1, SPECIAL_NONE

            # And we need more data in order to return the bytestring.
            else:
                # Negative: the number of bytes still missing.
                wanted = len(b) - offset - readcount - size
                return False, None, wanted, SPECIAL_NONE

        # It is an indefinite length bytestring.
        else:
            return True, None, 1, SPECIAL_START_INDEFINITE_BYTESTRING

    elif majortype == MAJOR_TYPE_STRING:
        raise CBORDecodeError('string major type not supported')

    elif majortype == MAJOR_TYPE_ARRAY:
        # Beginning of arrays are treated as uints in order to decode their
        # length. We don't allow indefinite length arrays.
        complete, size, readcount = decodeuint(subtype, b, offset)

        if complete:
            return True, size, readcount + 1, SPECIAL_START_ARRAY
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_MAP:
        # Beginning of maps are treated as uints in order to decode their
        # number of elements. We don't allow indefinite length maps.
        complete, size, readcount = decodeuint(subtype, b, offset)

        if complete:
            return True, size, readcount + 1, SPECIAL_START_MAP
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_SEMANTIC:
        # Semantic tag value is read the same as a uint.
        complete, tagvalue, readcount = decodeuint(subtype, b, offset)

        if not complete:
            return False, None, readcount, SPECIAL_NONE

        # This behavior here is a little wonky. The main type being "decorated"
        # by this semantic tag follows. A more robust parser would probably emit
        # a special flag indicating this as a semantic tag and let the caller
        # deal with the types that follow. But since we don't support many
        # semantic tags, it is easier to deal with the special cases here and
        # hide complexity from the caller. If we add support for more semantic
        # tags, we should probably move semantic tag handling into the caller.
        if tagvalue == SEMANTIC_TAG_FINITE_SET:
            # The tagged item's initial byte isn't in the buffer yet.
            if offset + readcount >= len(b):
                return False, None, -1, SPECIAL_NONE

            complete, size, readcount2, special = decodeitem(b,
                                                            offset + readcount)

            if not complete:
                return False, None, readcount2, SPECIAL_NONE

            if special != SPECIAL_START_ARRAY:
                raise CBORDecodeError('expected array after finite set '
                                      'semantic tag')

            return True, size, readcount + readcount2 + 1, SPECIAL_START_SET

        else:
            raise CBORDecodeError('semantic tag %d not allowed' % tagvalue)

    elif majortype == MAJOR_TYPE_SPECIAL:
        # Only specific values for the information field are allowed.
        if subtype == SUBTYPE_FALSE:
            return True, False, 1, SPECIAL_NONE
        elif subtype == SUBTYPE_TRUE:
            return True, True, 1, SPECIAL_NONE
        elif subtype == SUBTYPE_NULL:
            return True, None, 1, SPECIAL_NONE
        elif subtype == SUBTYPE_INDEFINITE:
            return True, None, 1, SPECIAL_INDEFINITE_BREAK
        # If value is 24, subtype is in next byte.
        else:
            raise CBORDecodeError('special type %d not allowed' % subtype)
    else:
        assert False
392
392
def decodeuint(subtype, b, offset=0, allowindefinite=False):
    """Decode an unsigned integer.

    ``subtype`` is the lower 5 bits from the initial byte CBOR item
    "header." ``b`` is a buffer containing bytes. ``offset`` points to
    the index of the first byte after the byte that ``subtype`` was
    derived from.

    ``allowindefinite`` allows the special indefinite length value
    indicator.

    Returns a 3-tuple of (successful, value, count).

    The first element is a bool indicating if decoding completed. The 2nd
    is the decoded integer value or None if not fully decoded or the subtype
    is 31 and ``allowindefinite`` is True. The 3rd value is the count of bytes.
    If positive, it is the number of additional bytes decoded. If negative,
    it is the number of additional bytes needed to decode this value.

    Raises ``CBORDecodeError`` if ``subtype`` is 31 and ``allowindefinite``
    is False, or if ``subtype`` is 28, 29, or 30.
    """

    # Small values are inline.
    if subtype < 24:
        return True, subtype, 0
    # Indefinite length specifier.
    elif subtype == 31:
        if allowindefinite:
            return True, None, 0
        else:
            raise CBORDecodeError('indefinite length uint not allowed here')
    elif subtype >= 28:
        raise CBORDecodeError('unsupported subtype on integer type: %d' %
                              subtype)

    # Subtypes 24-27 select the width of the integer that follows.
    if subtype == 24:
        s = STRUCT_BIG_UBYTE
    elif subtype == 25:
        s = STRUCT_BIG_USHORT
    elif subtype == 26:
        s = STRUCT_BIG_ULONG
    elif subtype == 27:
        s = STRUCT_BIG_ULONGLONG
    else:
        raise CBORDecodeError('bounds condition checking violation')

    if len(b) - offset >= s.size:
        return True, s.unpack_from(b, offset)[0], s.size
    else:
        # Negative: how many bytes short of a full integer the buffer is.
        return False, None, len(b) - offset - s.size
441
441
class bytestringchunk(bytes):
    """Represents a chunk/segment in an indefinite length bytestring.

    This behaves like a ``bytes`` but in addition has the ``isfirst``
    and ``islast`` attributes indicating whether this chunk is the first
    or last in an indefinite length bytestring.
    """

    def __new__(cls, v, first=False, last=False):
        # bytes is immutable, so the value has to be supplied in __new__;
        # the flags are then attached to the new instance.
        self = bytes.__new__(cls, v)
        self.isfirst = first
        self.islast = last

        return self
456
456
457 class sansiodecoder(object):
457 class sansiodecoder(object):
458 """A CBOR decoder that doesn't perform its own I/O.
458 """A CBOR decoder that doesn't perform its own I/O.
459
459
460 To use, construct an instance and feed it segments containing
460 To use, construct an instance and feed it segments containing
461 CBOR-encoded bytes via ``decode()``. The return value from ``decode()``
461 CBOR-encoded bytes via ``decode()``. The return value from ``decode()``
462 indicates whether a fully-decoded value is available, how many bytes
462 indicates whether a fully-decoded value is available, how many bytes
463 were consumed, and offers a hint as to how many bytes should be fed
463 were consumed, and offers a hint as to how many bytes should be fed
464 in next time to decode the next value.
464 in next time to decode the next value.
465
465
466 The decoder assumes it will decode N discrete CBOR values, not just
466 The decoder assumes it will decode N discrete CBOR values, not just
467 a single value. i.e. if the bytestream contains uints packed one after
467 a single value. i.e. if the bytestream contains uints packed one after
468 the other, the decoder will decode them all, rather than just the initial
468 the other, the decoder will decode them all, rather than just the initial
469 one.
469 one.
470
470
471 When ``decode()`` indicates a value is available, call ``getavailable()``
471 When ``decode()`` indicates a value is available, call ``getavailable()``
472 to return all fully decoded values.
472 to return all fully decoded values.
473
473
474 ``decode()`` can partially decode input. It is up to the caller to keep
474 ``decode()`` can partially decode input. It is up to the caller to keep
475 track of what data was consumed and to pass unconsumed data in on the
475 track of what data was consumed and to pass unconsumed data in on the
476 next invocation.
476 next invocation.
477
477
478 The decoder decodes atomically at the *item* level. See ``decodeitem()``.
478 The decoder decodes atomically at the *item* level. See ``decodeitem()``.
479 If an *item* cannot be fully decoded, the decoder won't record it as
479 If an *item* cannot be fully decoded, the decoder won't record it as
480 partially consumed. Instead, the caller will be instructed to pass in
480 partially consumed. Instead, the caller will be instructed to pass in
481 the initial bytes of this item on the next invocation. This does result
481 the initial bytes of this item on the next invocation. This does result
482 in some redundant parsing. But the overhead should be minimal.
482 in some redundant parsing. But the overhead should be minimal.
483
483
484 This decoder only supports a subset of CBOR as required by Mercurial.
484 This decoder only supports a subset of CBOR as required by Mercurial.
485 It lacks support for:
485 It lacks support for:
486
486
487 * Indefinite length arrays
487 * Indefinite length arrays
488 * Indefinite length maps
488 * Indefinite length maps
489 * Use of indefinite length bytestrings as keys or values within
489 * Use of indefinite length bytestrings as keys or values within
490 arrays, maps, or sets.
490 arrays, maps, or sets.
491 * Nested arrays, maps, or sets within sets
491 * Nested arrays, maps, or sets within sets
492 * Any semantic tag that isn't a mathematical finite set
492 * Any semantic tag that isn't a mathematical finite set
493 * Floating point numbers
493 * Floating point numbers
494 * Undefined special value
494 * Undefined special value
495
495
496 CBOR types are decoded to Python types as follows:
496 CBOR types are decoded to Python types as follows:
497
497
498 uint -> int
498 uint -> int
499 negint -> int
499 negint -> int
500 bytestring -> bytes
500 bytestring -> bytes
501 map -> dict
501 map -> dict
502 array -> list
502 array -> list
503 True -> bool
503 True -> bool
504 False -> bool
504 False -> bool
505 null -> None
505 null -> None
506 indefinite length bytestring chunk -> [bytestringchunk]
506 indefinite length bytestring chunk -> [bytestringchunk]
507
507
508 The only non-obvious mapping here is an indefinite length bytestring
508 The only non-obvious mapping here is an indefinite length bytestring
509 to the ``bytestringchunk`` type. This is to facilitate streaming
509 to the ``bytestringchunk`` type. This is to facilitate streaming
510 indefinite length bytestrings out of the decoder and to differentiate
510 indefinite length bytestrings out of the decoder and to differentiate
511 a regular bytestring from an indefinite length bytestring.
511 a regular bytestring from an indefinite length bytestring.
512 """
512 """
513
513
514 _STATE_NONE = 0
514 _STATE_NONE = 0
515 _STATE_WANT_MAP_KEY = 1
515 _STATE_WANT_MAP_KEY = 1
516 _STATE_WANT_MAP_VALUE = 2
516 _STATE_WANT_MAP_VALUE = 2
517 _STATE_WANT_ARRAY_VALUE = 3
517 _STATE_WANT_ARRAY_VALUE = 3
518 _STATE_WANT_SET_VALUE = 4
518 _STATE_WANT_SET_VALUE = 4
519 _STATE_WANT_BYTESTRING_CHUNK_FIRST = 5
519 _STATE_WANT_BYTESTRING_CHUNK_FIRST = 5
520 _STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT = 6
520 _STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT = 6
521
521
522 def __init__(self):
522 def __init__(self):
523 # TODO add support for limiting size of bytestrings
523 # TODO add support for limiting size of bytestrings
524 # TODO add support for limiting number of keys / values in collections
524 # TODO add support for limiting number of keys / values in collections
525 # TODO add support for limiting size of buffered partial values
525 # TODO add support for limiting size of buffered partial values
526
526
527 self.decodedbytecount = 0
527 self.decodedbytecount = 0
528
528
529 self._state = self._STATE_NONE
529 self._state = self._STATE_NONE
530
530
531 # Stack of active nested collections. Each entry is a dict describing
531 # Stack of active nested collections. Each entry is a dict describing
532 # the collection.
532 # the collection.
533 self._collectionstack = []
533 self._collectionstack = []
534
534
535 # Fully decoded key to use for the current map.
535 # Fully decoded key to use for the current map.
536 self._currentmapkey = None
536 self._currentmapkey = None
537
537
538 # Fully decoded values available for retrieval.
538 # Fully decoded values available for retrieval.
539 self._decodedvalues = []
539 self._decodedvalues = []
540
540
541 @property
541 @property
542 def inprogress(self):
542 def inprogress(self):
543 """Whether the decoder has partially decoded a value."""
543 """Whether the decoder has partially decoded a value."""
544 return self._state != self._STATE_NONE
544 return self._state != self._STATE_NONE
545
545
546 def decode(self, b, offset=0):
546 def decode(self, b, offset=0):
547 """Attempt to decode bytes from an input buffer.
547 """Attempt to decode bytes from an input buffer.
548
548
549 ``b`` is a collection of bytes and ``offset`` is the byte
549 ``b`` is a collection of bytes and ``offset`` is the byte
550 offset within that buffer from which to begin reading data.
550 offset within that buffer from which to begin reading data.
551
551
552 ``b`` must support ``len()`` and accessing bytes slices via
552 ``b`` must support ``len()`` and accessing bytes slices via
553 ``__getitem__``. Typically ``bytes`` instances are used.
553 ``__getitem__``. Typically ``bytes`` instances are used.
554
554
555 Returns a tuple with the following fields:
555 Returns a tuple with the following fields:
556
556
557 * Bool indicating whether values are available for retrieval.
557 * Bool indicating whether values are available for retrieval.
558 * Integer indicating the number of bytes that were fully consumed,
558 * Integer indicating the number of bytes that were fully consumed,
559 starting from ``offset``.
559 starting from ``offset``.
560 * Integer indicating the number of bytes that are desired for the
560 * Integer indicating the number of bytes that are desired for the
561 next call in order to decode an item.
561 next call in order to decode an item.
562 """
562 """
563 if not b:
563 if not b:
564 return bool(self._decodedvalues), 0, 0
564 return bool(self._decodedvalues), 0, 0
565
565
566 initialoffset = offset
566 initialoffset = offset
567
567
568 # We could easily split the body of this loop into a function. But
568 # We could easily split the body of this loop into a function. But
569 # Python performance is sensitive to function calls and collections
569 # Python performance is sensitive to function calls and collections
570 # are composed of many items. So leaving as a while loop could help
570 # are composed of many items. So leaving as a while loop could help
571 # with performance. One thing that may not help is the use of
571 # with performance. One thing that may not help is the use of
572 # if..elif versus a lookup/dispatch table. There may be value
572 # if..elif versus a lookup/dispatch table. There may be value
573 # in switching that.
573 # in switching that.
574 while offset < len(b):
574 while offset < len(b):
575 # Attempt to decode an item. This could be a whole value or a
575 # Attempt to decode an item. This could be a whole value or a
576 # special value indicating an event, such as start or end of a
576 # special value indicating an event, such as start or end of a
577 # collection or indefinite length type.
577 # collection or indefinite length type.
578 complete, value, readcount, special = decodeitem(b, offset)
578 complete, value, readcount, special = decodeitem(b, offset)
579
579
580 if readcount > 0:
580 if readcount > 0:
581 self.decodedbytecount += readcount
581 self.decodedbytecount += readcount
582
582
583 if not complete:
583 if not complete:
584 assert readcount < 0
584 assert readcount < 0
585 return (
585 return (
586 bool(self._decodedvalues),
586 bool(self._decodedvalues),
587 offset - initialoffset,
587 offset - initialoffset,
588 -readcount,
588 -readcount,
589 )
589 )
590
590
591 offset += readcount
591 offset += readcount
592
592
593 # No nested state. We either have a full value or beginning of a
593 # No nested state. We either have a full value or beginning of a
594 # complex value to deal with.
594 # complex value to deal with.
595 if self._state == self._STATE_NONE:
595 if self._state == self._STATE_NONE:
596 # A normal value.
596 # A normal value.
597 if special == SPECIAL_NONE:
597 if special == SPECIAL_NONE:
598 self._decodedvalues.append(value)
598 self._decodedvalues.append(value)
599
599
600 elif special == SPECIAL_START_ARRAY:
600 elif special == SPECIAL_START_ARRAY:
601 self._collectionstack.append({
601 self._collectionstack.append({
602 'remaining': value,
602 'remaining': value,
603 'v': [],
603 'v': [],
604 })
604 })
605 self._state = self._STATE_WANT_ARRAY_VALUE
605 self._state = self._STATE_WANT_ARRAY_VALUE
606
606
607 elif special == SPECIAL_START_MAP:
607 elif special == SPECIAL_START_MAP:
608 self._collectionstack.append({
608 self._collectionstack.append({
609 'remaining': value,
609 'remaining': value,
610 'v': {},
610 'v': {},
611 })
611 })
612 self._state = self._STATE_WANT_MAP_KEY
612 self._state = self._STATE_WANT_MAP_KEY
613
613
614 elif special == SPECIAL_START_SET:
614 elif special == SPECIAL_START_SET:
615 self._collectionstack.append({
615 self._collectionstack.append({
616 'remaining': value,
616 'remaining': value,
617 'v': set(),
617 'v': set(),
618 })
618 })
619 self._state = self._STATE_WANT_SET_VALUE
619 self._state = self._STATE_WANT_SET_VALUE
620
620
621 elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
621 elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
622 self._state = self._STATE_WANT_BYTESTRING_CHUNK_FIRST
622 self._state = self._STATE_WANT_BYTESTRING_CHUNK_FIRST
623
623
624 else:
624 else:
625 raise CBORDecodeError('unhandled special state: %d' %
625 raise CBORDecodeError('unhandled special state: %d' %
626 special)
626 special)
627
627
628 # This value becomes an element of the current array.
628 # This value becomes an element of the current array.
629 elif self._state == self._STATE_WANT_ARRAY_VALUE:
629 elif self._state == self._STATE_WANT_ARRAY_VALUE:
630 # Simple values get appended.
630 # Simple values get appended.
631 if special == SPECIAL_NONE:
631 if special == SPECIAL_NONE:
632 c = self._collectionstack[-1]
632 c = self._collectionstack[-1]
633 c['v'].append(value)
633 c['v'].append(value)
634 c['remaining'] -= 1
634 c['remaining'] -= 1
635
635
636 # self._state doesn't need to change.
636 # self._state doesn't need to change.
637
637
638 # An array nested within an array.
638 # An array nested within an array.
639 elif special == SPECIAL_START_ARRAY:
639 elif special == SPECIAL_START_ARRAY:
640 lastc = self._collectionstack[-1]
640 lastc = self._collectionstack[-1]
641 newvalue = []
641 newvalue = []
642
642
643 lastc['v'].append(newvalue)
643 lastc['v'].append(newvalue)
644 lastc['remaining'] -= 1
644 lastc['remaining'] -= 1
645
645
646 self._collectionstack.append({
646 self._collectionstack.append({
647 'remaining': value,
647 'remaining': value,
648 'v': newvalue,
648 'v': newvalue,
649 })
649 })
650
650
651 # self._state doesn't need to change.
651 # self._state doesn't need to change.
652
652
653 # A map nested within an array.
653 # A map nested within an array.
654 elif special == SPECIAL_START_MAP:
654 elif special == SPECIAL_START_MAP:
655 lastc = self._collectionstack[-1]
655 lastc = self._collectionstack[-1]
656 newvalue = {}
656 newvalue = {}
657
657
658 lastc['v'].append(newvalue)
658 lastc['v'].append(newvalue)
659 lastc['remaining'] -= 1
659 lastc['remaining'] -= 1
660
660
661 self._collectionstack.append({
661 self._collectionstack.append({
662 'remaining': value,
662 'remaining': value,
663 'v': newvalue
663 'v': newvalue
664 })
664 })
665
665
666 self._state = self._STATE_WANT_MAP_KEY
666 self._state = self._STATE_WANT_MAP_KEY
667
667
668 elif special == SPECIAL_START_SET:
668 elif special == SPECIAL_START_SET:
669 lastc = self._collectionstack[-1]
669 lastc = self._collectionstack[-1]
670 newvalue = set()
670 newvalue = set()
671
671
672 lastc['v'].append(newvalue)
672 lastc['v'].append(newvalue)
673 lastc['remaining'] -= 1
673 lastc['remaining'] -= 1
674
674
675 self._collectionstack.append({
675 self._collectionstack.append({
676 'remaining': value,
676 'remaining': value,
677 'v': newvalue,
677 'v': newvalue,
678 })
678 })
679
679
680 self._state = self._STATE_WANT_SET_VALUE
680 self._state = self._STATE_WANT_SET_VALUE
681
681
682 elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
682 elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
683 raise CBORDecodeError('indefinite length bytestrings '
683 raise CBORDecodeError('indefinite length bytestrings '
684 'not allowed as array values')
684 'not allowed as array values')
685
685
686 else:
686 else:
687 raise CBORDecodeError('unhandled special item when '
687 raise CBORDecodeError('unhandled special item when '
688 'expecting array value: %d' % special)
688 'expecting array value: %d' % special)
689
689
690 # This value becomes the key of the current map instance.
690 # This value becomes the key of the current map instance.
691 elif self._state == self._STATE_WANT_MAP_KEY:
691 elif self._state == self._STATE_WANT_MAP_KEY:
692 if special == SPECIAL_NONE:
692 if special == SPECIAL_NONE:
693 self._currentmapkey = value
693 self._currentmapkey = value
694 self._state = self._STATE_WANT_MAP_VALUE
694 self._state = self._STATE_WANT_MAP_VALUE
695
695
696 elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
696 elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
697 raise CBORDecodeError('indefinite length bytestrings '
697 raise CBORDecodeError('indefinite length bytestrings '
698 'not allowed as map keys')
698 'not allowed as map keys')
699
699
700 elif special in (SPECIAL_START_ARRAY, SPECIAL_START_MAP,
700 elif special in (SPECIAL_START_ARRAY, SPECIAL_START_MAP,
701 SPECIAL_START_SET):
701 SPECIAL_START_SET):
702 raise CBORDecodeError('collections not supported as map '
702 raise CBORDecodeError('collections not supported as map '
703 'keys')
703 'keys')
704
704
705 # We do not allow special values to be used as map keys.
705 # We do not allow special values to be used as map keys.
706 else:
706 else:
707 raise CBORDecodeError('unhandled special item when '
707 raise CBORDecodeError('unhandled special item when '
708 'expecting map key: %d' % special)
708 'expecting map key: %d' % special)
709
709
710 # This value becomes the value of the current map key.
710 # This value becomes the value of the current map key.
711 elif self._state == self._STATE_WANT_MAP_VALUE:
711 elif self._state == self._STATE_WANT_MAP_VALUE:
712 # Simple values simply get inserted into the map.
712 # Simple values simply get inserted into the map.
713 if special == SPECIAL_NONE:
713 if special == SPECIAL_NONE:
714 lastc = self._collectionstack[-1]
714 lastc = self._collectionstack[-1]
715 lastc['v'][self._currentmapkey] = value
715 lastc['v'][self._currentmapkey] = value
716 lastc['remaining'] -= 1
716 lastc['remaining'] -= 1
717
717
718 self._state = self._STATE_WANT_MAP_KEY
718 self._state = self._STATE_WANT_MAP_KEY
719
719
720 # A new array is used as the map value.
720 # A new array is used as the map value.
721 elif special == SPECIAL_START_ARRAY:
721 elif special == SPECIAL_START_ARRAY:
722 lastc = self._collectionstack[-1]
722 lastc = self._collectionstack[-1]
723 newvalue = []
723 newvalue = []
724
724
725 lastc['v'][self._currentmapkey] = newvalue
725 lastc['v'][self._currentmapkey] = newvalue
726 lastc['remaining'] -= 1
726 lastc['remaining'] -= 1
727
727
728 self._collectionstack.append({
728 self._collectionstack.append({
729 'remaining': value,
729 'remaining': value,
730 'v': newvalue,
730 'v': newvalue,
731 })
731 })
732
732
733 self._state = self._STATE_WANT_ARRAY_VALUE
733 self._state = self._STATE_WANT_ARRAY_VALUE
734
734
735 # A new map is used as the map value.
735 # A new map is used as the map value.
736 elif special == SPECIAL_START_MAP:
736 elif special == SPECIAL_START_MAP:
737 lastc = self._collectionstack[-1]
737 lastc = self._collectionstack[-1]
738 newvalue = {}
738 newvalue = {}
739
739
740 lastc['v'][self._currentmapkey] = newvalue
740 lastc['v'][self._currentmapkey] = newvalue
741 lastc['remaining'] -= 1
741 lastc['remaining'] -= 1
742
742
743 self._collectionstack.append({
743 self._collectionstack.append({
744 'remaining': value,
744 'remaining': value,
745 'v': newvalue,
745 'v': newvalue,
746 })
746 })
747
747
748 self._state = self._STATE_WANT_MAP_KEY
748 self._state = self._STATE_WANT_MAP_KEY
749
749
750 # A new set is used as the map value.
750 # A new set is used as the map value.
751 elif special == SPECIAL_START_SET:
751 elif special == SPECIAL_START_SET:
752 lastc = self._collectionstack[-1]
752 lastc = self._collectionstack[-1]
753 newvalue = set()
753 newvalue = set()
754
754
755 lastc['v'][self._currentmapkey] = newvalue
755 lastc['v'][self._currentmapkey] = newvalue
756 lastc['remaining'] -= 1
756 lastc['remaining'] -= 1
757
757
758 self._collectionstack.append({
758 self._collectionstack.append({
759 'remaining': value,
759 'remaining': value,
760 'v': newvalue,
760 'v': newvalue,
761 })
761 })
762
762
763 self._state = self._STATE_WANT_SET_VALUE
763 self._state = self._STATE_WANT_SET_VALUE
764
764
765 elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
765 elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
766 raise CBORDecodeError('indefinite length bytestrings not '
766 raise CBORDecodeError('indefinite length bytestrings not '
767 'allowed as map values')
767 'allowed as map values')
768
768
769 else:
769 else:
770 raise CBORDecodeError('unhandled special item when '
770 raise CBORDecodeError('unhandled special item when '
771 'expecting map value: %d' % special)
771 'expecting map value: %d' % special)
772
772
773 self._currentmapkey = None
773 self._currentmapkey = None
774
774
775 # This value is added to the current set.
775 # This value is added to the current set.
776 elif self._state == self._STATE_WANT_SET_VALUE:
776 elif self._state == self._STATE_WANT_SET_VALUE:
777 if special == SPECIAL_NONE:
777 if special == SPECIAL_NONE:
778 lastc = self._collectionstack[-1]
778 lastc = self._collectionstack[-1]
779 lastc['v'].add(value)
779 lastc['v'].add(value)
780 lastc['remaining'] -= 1
780 lastc['remaining'] -= 1
781
781
782 elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
782 elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
783 raise CBORDecodeError('indefinite length bytestrings not '
783 raise CBORDecodeError('indefinite length bytestrings not '
784 'allowed as set values')
784 'allowed as set values')
785
785
786 elif special in (SPECIAL_START_ARRAY,
786 elif special in (SPECIAL_START_ARRAY,
787 SPECIAL_START_MAP,
787 SPECIAL_START_MAP,
788 SPECIAL_START_SET):
788 SPECIAL_START_SET):
789 raise CBORDecodeError('collections not allowed as set '
789 raise CBORDecodeError('collections not allowed as set '
790 'values')
790 'values')
791
791
792 # We don't allow non-trivial types to exist as set values.
792 # We don't allow non-trivial types to exist as set values.
793 else:
793 else:
794 raise CBORDecodeError('unhandled special item when '
794 raise CBORDecodeError('unhandled special item when '
795 'expecting set value: %d' % special)
795 'expecting set value: %d' % special)
796
796
797 # This value represents the first chunk in an indefinite length
797 # This value represents the first chunk in an indefinite length
798 # bytestring.
798 # bytestring.
799 elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_FIRST:
799 elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_FIRST:
800 # We received a full chunk.
800 # We received a full chunk.
801 if special == SPECIAL_NONE:
801 if special == SPECIAL_NONE:
802 self._decodedvalues.append(bytestringchunk(value,
802 self._decodedvalues.append(bytestringchunk(value,
803 first=True))
803 first=True))
804
804
805 self._state = self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT
805 self._state = self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT
806
806
807 # The end of stream marker. This means it is an empty
807 # The end of stream marker. This means it is an empty
808 # indefinite length bytestring.
808 # indefinite length bytestring.
809 elif special == SPECIAL_INDEFINITE_BREAK:
809 elif special == SPECIAL_INDEFINITE_BREAK:
810 # We /could/ convert this to a b''. But we want to preserve
810 # We /could/ convert this to a b''. But we want to preserve
811 # the nature of the underlying data so consumers expecting
811 # the nature of the underlying data so consumers expecting
812 # an indefinite length bytestring get one.
812 # an indefinite length bytestring get one.
813 self._decodedvalues.append(bytestringchunk(b'',
813 self._decodedvalues.append(bytestringchunk(b'',
814 first=True,
814 first=True,
815 last=True))
815 last=True))
816
816
817 # Since indefinite length bytestrings can't be used in
817 # Since indefinite length bytestrings can't be used in
818 # collections, we must be at the root level.
818 # collections, we must be at the root level.
819 assert not self._collectionstack
819 assert not self._collectionstack
820 self._state = self._STATE_NONE
820 self._state = self._STATE_NONE
821
821
822 else:
822 else:
823 raise CBORDecodeError('unexpected special value when '
823 raise CBORDecodeError('unexpected special value when '
824 'expecting bytestring chunk: %d' %
824 'expecting bytestring chunk: %d' %
825 special)
825 special)
826
826
827 # This value represents the non-initial chunk in an indefinite
827 # This value represents the non-initial chunk in an indefinite
828 # length bytestring.
828 # length bytestring.
829 elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT:
829 elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT:
830 # We received a full chunk.
830 # We received a full chunk.
831 if special == SPECIAL_NONE:
831 if special == SPECIAL_NONE:
832 self._decodedvalues.append(bytestringchunk(value))
832 self._decodedvalues.append(bytestringchunk(value))
833
833
834 # The end of stream marker.
834 # The end of stream marker.
835 elif special == SPECIAL_INDEFINITE_BREAK:
835 elif special == SPECIAL_INDEFINITE_BREAK:
836 self._decodedvalues.append(bytestringchunk(b'', last=True))
836 self._decodedvalues.append(bytestringchunk(b'', last=True))
837
837
838 # Since indefinite length bytestrings can't be used in
838 # Since indefinite length bytestrings can't be used in
839 # collections, we must be at the root level.
839 # collections, we must be at the root level.
840 assert not self._collectionstack
840 assert not self._collectionstack
841 self._state = self._STATE_NONE
841 self._state = self._STATE_NONE
842
842
843 else:
843 else:
844 raise CBORDecodeError('unexpected special value when '
844 raise CBORDecodeError('unexpected special value when '
845 'expecting bytestring chunk: %d' %
845 'expecting bytestring chunk: %d' %
846 special)
846 special)
847
847
848 else:
848 else:
849 raise CBORDecodeError('unhandled decoder state: %d' %
849 raise CBORDecodeError('unhandled decoder state: %d' %
850 self._state)
850 self._state)
851
851
852 # We could have just added the final value in a collection. End
852 # We could have just added the final value in a collection. End
853 # all complete collections at the top of the stack.
853 # all complete collections at the top of the stack.
854 while True:
854 while True:
855 # Bail if we're not waiting on a new collection item.
855 # Bail if we're not waiting on a new collection item.
856 if self._state not in (self._STATE_WANT_ARRAY_VALUE,
856 if self._state not in (self._STATE_WANT_ARRAY_VALUE,
857 self._STATE_WANT_MAP_KEY,
857 self._STATE_WANT_MAP_KEY,
858 self._STATE_WANT_SET_VALUE):
858 self._STATE_WANT_SET_VALUE):
859 break
859 break
860
860
861 # Or we are expecting more items for this collection.
861 # Or we are expecting more items for this collection.
862 lastc = self._collectionstack[-1]
862 lastc = self._collectionstack[-1]
863
863
864 if lastc['remaining']:
864 if lastc['remaining']:
865 break
865 break
866
866
867 # The collection at the top of the stack is complete.
867 # The collection at the top of the stack is complete.
868
868
869 # Discard it, as it isn't needed for future items.
869 # Discard it, as it isn't needed for future items.
870 self._collectionstack.pop()
870 self._collectionstack.pop()
871
871
872 # If this is a nested collection, we don't emit it, since it
872 # If this is a nested collection, we don't emit it, since it
873 # will be emitted by its parent collection. But we do need to
873 # will be emitted by its parent collection. But we do need to
874 # update state to reflect what the new top-most collection
874 # update state to reflect what the new top-most collection
875 # on the stack is.
875 # on the stack is.
876 if self._collectionstack:
876 if self._collectionstack:
877 self._state = {
877 self._state = {
878 list: self._STATE_WANT_ARRAY_VALUE,
878 list: self._STATE_WANT_ARRAY_VALUE,
879 dict: self._STATE_WANT_MAP_KEY,
879 dict: self._STATE_WANT_MAP_KEY,
880 set: self._STATE_WANT_SET_VALUE,
880 set: self._STATE_WANT_SET_VALUE,
881 }[type(self._collectionstack[-1]['v'])]
881 }[type(self._collectionstack[-1]['v'])]
882
882
883 # If this is the root collection, emit it.
883 # If this is the root collection, emit it.
884 else:
884 else:
885 self._decodedvalues.append(lastc['v'])
885 self._decodedvalues.append(lastc['v'])
886 self._state = self._STATE_NONE
886 self._state = self._STATE_NONE
887
887
888 return (
888 return (
889 bool(self._decodedvalues),
889 bool(self._decodedvalues),
890 offset - initialoffset,
890 offset - initialoffset,
891 0,
891 0,
892 )
892 )
893
893
894 def getavailable(self):
894 def getavailable(self):
895 """Returns a list of fully decoded values.
895 """Returns a list of fully decoded values.
896
896
897 Once values are retrieved, they won't be available on the next call.
897 Once values are retrieved, they won't be available on the next call.
898 """
898 """
899
899
900 l = list(self._decodedvalues)
900 l = list(self._decodedvalues)
901 self._decodedvalues = []
901 self._decodedvalues = []
902 return l
902 return l
903
903
904 class bufferingdecoder(object):
904 class bufferingdecoder(object):
905 """A CBOR decoder that buffers undecoded input.
905 """A CBOR decoder that buffers undecoded input.
906
906
907 This is a glorified wrapper around ``sansiodecoder`` that adds a buffering
907 This is a glorified wrapper around ``sansiodecoder`` that adds a buffering
908 layer. All input that isn't consumed by ``sansiodecoder`` will be buffered
908 layer. All input that isn't consumed by ``sansiodecoder`` will be buffered
909 and concatenated with any new input that arrives later.
909 and concatenated with any new input that arrives later.
910
910
911 TODO consider adding limits as to the maximum amount of data that can
911 TODO consider adding limits as to the maximum amount of data that can
912 be buffered.
912 be buffered.
913 """
913 """
914 def __init__(self):
914 def __init__(self):
915 self._decoder = sansiodecoder()
915 self._decoder = sansiodecoder()
916 self._leftover = None
916 self._chunks = []
917 self._wanted = 0
917
918
918 def decode(self, b):
919 def decode(self, b):
919 """Attempt to decode bytes to CBOR values.
920 """Attempt to decode bytes to CBOR values.
920
921
921 Returns a tuple with the following fields:
922 Returns a tuple with the following fields:
922
923
923 * Bool indicating whether new values are available for retrieval.
924 * Bool indicating whether new values are available for retrieval.
924 * Integer number of bytes decoded from the new input.
925 * Integer number of bytes decoded from the new input.
925 * Integer number of bytes wanted to decode the next value.
926 * Integer number of bytes wanted to decode the next value.
926 """
927 """
928 # Our strategy for buffering is to aggregate the incoming chunks in a
929 # list until we've received enough data to decode the next item.
930 # This is slightly more complicated than using an ``io.BytesIO``
931 # or continuously concatenating incoming data. However, because it
932 # isn't constantly reallocating backing memory for a growing buffer,
933 # it prevents excessive memory thrashing and is significantly faster,
934 # especially in cases where the percentage of input chunks that don't
935 # decode into a full item is high.
927
936
928 if self._leftover:
937 if self._chunks:
929 oldlen = len(self._leftover)
938 # A previous call said we needed N bytes to decode the next item.
930 b = self._leftover + b
939 # But this call doesn't provide enough data. We buffer the incoming
931 self._leftover = None
940 # chunk without attempting to decode.
941 if len(b) < self._wanted:
942 self._chunks.append(b)
943 self._wanted -= len(b)
944 return False, 0, self._wanted
945
946 # Else we may have enough data to decode the next item. Aggregate
947 # old data with new and reset the buffer.
948 newlen = len(b)
949 self._chunks.append(b)
950 b = b''.join(self._chunks)
951 self._chunks = []
952 oldlen = len(b) - newlen
953
932 else:
954 else:
933 b = b
934 oldlen = 0
955 oldlen = 0
935
956
936 available, readcount, wanted = self._decoder.decode(b)
957 available, readcount, wanted = self._decoder.decode(b)
958 self._wanted = wanted
937
959
938 if readcount < len(b):
960 if readcount < len(b):
939 self._leftover = b[readcount:]
961 self._chunks.append(b[readcount:])
940
962
941 return available, readcount - oldlen, wanted
963 return available, readcount - oldlen, wanted
942
964
943 def getavailable(self):
965 def getavailable(self):
944 return self._decoder.getavailable()
966 return self._decoder.getavailable()
945
967
946 def decodeall(b):
968 def decodeall(b):
947 """Decode all CBOR items present in an iterable of bytes.
969 """Decode all CBOR items present in an iterable of bytes.
948
970
949 In addition to regular decode errors, raises CBORDecodeError if the
971 In addition to regular decode errors, raises CBORDecodeError if the
950 entirety of the passed buffer does not fully decode to complete CBOR
972 entirety of the passed buffer does not fully decode to complete CBOR
951 values. This includes failure to decode any value, incomplete collection
973 values. This includes failure to decode any value, incomplete collection
952 types, incomplete indefinite length items, and extra data at the end of
974 types, incomplete indefinite length items, and extra data at the end of
953 the buffer.
975 the buffer.
954 """
976 """
955 if not b:
977 if not b:
956 return []
978 return []
957
979
958 decoder = sansiodecoder()
980 decoder = sansiodecoder()
959
981
960 havevalues, readcount, wantbytes = decoder.decode(b)
982 havevalues, readcount, wantbytes = decoder.decode(b)
961
983
962 if readcount != len(b):
984 if readcount != len(b):
963 raise CBORDecodeError('input data not fully consumed')
985 raise CBORDecodeError('input data not fully consumed')
964
986
965 if decoder.inprogress:
987 if decoder.inprogress:
966 raise CBORDecodeError('input data not complete')
988 raise CBORDecodeError('input data not complete')
967
989
968 return decoder.getavailable()
990 return decoder.getavailable()
General Comments 0
You need to be logged in to leave comments. Login now