##// END OF EJS Templates
cborutil: remove Python 2 definition of _elementtointeger()...
Gregory Szorc -
r49797:bce8f66d default
parent child Browse files
Show More
@@ -1,1081 +1,1072 b''
1 # cborutil.py - CBOR extensions
1 # cborutil.py - CBOR extensions
2 #
2 #
3 # Copyright 2018 Gregory Szorc <gregory.szorc@gmail.com>
3 # Copyright 2018 Gregory Szorc <gregory.szorc@gmail.com>
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8
8
9 import struct
9 import struct
10 import sys
11
10
12
11
13 # Very short version of RFC 7049...
12 # Very short version of RFC 7049...
14 #
13 #
15 # Each item begins with a byte. The 3 high bits of that byte denote the
14 # Each item begins with a byte. The 3 high bits of that byte denote the
16 # "major type." The lower 5 bits denote the "subtype." Each major type
15 # "major type." The lower 5 bits denote the "subtype." Each major type
17 # has its own encoding mechanism.
16 # has its own encoding mechanism.
18 #
17 #
19 # Most types have lengths. However, bytestring, string, array, and map
18 # Most types have lengths. However, bytestring, string, array, and map
20 # can be indefinite length. These are denoted by a subtype with value 31.
19 # can be indefinite length. These are denoted by a subtype with value 31.
21 # Sub-components of those types then come afterwards and are terminated
20 # Sub-components of those types then come afterwards and are terminated
22 # by a "break" byte.
21 # by a "break" byte.
23
22
# Major types: the value stored in the 3 high bits of an item's initial byte.
MAJOR_TYPE_UINT = 0
MAJOR_TYPE_NEGINT = 1
MAJOR_TYPE_BYTESTRING = 2
MAJOR_TYPE_STRING = 3
MAJOR_TYPE_ARRAY = 4
MAJOR_TYPE_MAP = 5
MAJOR_TYPE_SEMANTIC = 6
MAJOR_TYPE_SPECIAL = 7

# Mask selecting the low 5 bits (the "subtype") of an item's initial byte.
SUBTYPE_MASK = 0b00011111

# Subtype values used with MAJOR_TYPE_SPECIAL.
SUBTYPE_FALSE = 20
SUBTYPE_TRUE = 21
SUBTYPE_NULL = 22
SUBTYPE_HALF_FLOAT = 25
SUBTYPE_SINGLE_FLOAT = 26
SUBTYPE_DOUBLE_FLOAT = 27
SUBTYPE_INDEFINITE = 31

SEMANTIC_TAG_FINITE_SET = 258

# Indefinite types begin with their major type ORd with information value 31.
BEGIN_INDEFINITE_BYTESTRING = struct.pack(
    '>B', MAJOR_TYPE_BYTESTRING << 5 | SUBTYPE_INDEFINITE
)
BEGIN_INDEFINITE_ARRAY = struct.pack(
    '>B', MAJOR_TYPE_ARRAY << 5 | SUBTYPE_INDEFINITE
)
BEGIN_INDEFINITE_MAP = struct.pack(
    '>B', MAJOR_TYPE_MAP << 5 | SUBTYPE_INDEFINITE
)

# Pre-compiled big-endian layouts for an initial byte followed by a length
# of 0, 1, 2, 4 or 8 bytes.
ENCODED_LENGTH_1 = struct.Struct('>B')
ENCODED_LENGTH_2 = struct.Struct('>BB')
ENCODED_LENGTH_3 = struct.Struct('>BH')
ENCODED_LENGTH_4 = struct.Struct('>BL')
ENCODED_LENGTH_5 = struct.Struct('>BQ')

# The break ends an indefinite length item.
BREAK = b'\xff'
BREAK_INT = 255
65
64
66
65
def encodelength(majortype, length):
    """Obtain a value encoding the major type and its length.

    ``majortype`` is one of the ``MAJOR_TYPE_*`` constants and ``length``
    is the unsigned integer to store alongside it.

    Returns the packed bytes: the initial byte, followed by 0, 1, 2, 4 or
    8 big-endian bytes holding ``length``. The shortest form that can hold
    ``length`` is chosen.

    Values that do not fit the chosen ``struct`` layout (negative numbers,
    or numbers needing more than 8 bytes) are rejected by ``struct`` itself.
    """
    if length < 24:
        # Small values live directly in the low 5 bits of the initial byte.
        return ENCODED_LENGTH_1.pack(majortype << 5 | length)
    elif length < 256:
        return ENCODED_LENGTH_2.pack(majortype << 5 | 24, length)
    elif length < 65536:
        return ENCODED_LENGTH_3.pack(majortype << 5 | 25, length)
    elif length < 4294967296:
        return ENCODED_LENGTH_4.pack(majortype << 5 | 26, length)
    else:
        return ENCODED_LENGTH_5.pack(majortype << 5 | 27, length)
79
78
80
79
def streamencodebytestring(v):
    """Encode ``v`` as a definite length bytestring.

    Yields the length header and then ``v`` itself, unmodified.
    """
    yield encodelength(MAJOR_TYPE_BYTESTRING, len(v))
    yield v
84
83
85
84
def streamencodebytestringfromiter(it):
    """Convert an iterator of chunks to an indefinite bytestring.

    Given an input that is iterable and each element in the iterator is
    representable as bytes, emit an indefinite length bytestring.

    Each element is emitted as its own definite length bytestring chunk
    between the indefinite-bytestring marker and the break byte.
    """
    yield BEGIN_INDEFINITE_BYTESTRING

    for chunk in it:
        yield encodelength(MAJOR_TYPE_BYTESTRING, len(chunk))
        yield chunk

    yield BREAK
99
98
100
99
def streamencodeindefinitebytestring(source, chunksize=65536):
    """Given a large source buffer, emit as an indefinite length bytestring.

    This is a generator of chunks constituting the encoded CBOR data.

    ``source`` is sliced into pieces of at most ``chunksize`` bytes, each
    emitted as a definite length bytestring chunk. An empty ``source``
    still produces a single empty chunk.

    Raises ``ValueError`` (on first iteration, since this is a generator)
    if ``chunksize`` is not positive.
    """
    # A non-positive chunk size never advances through a non-empty source,
    # so the loop below would never terminate.
    if chunksize <= 0:
        raise ValueError(b'chunksize must be positive')

    yield BEGIN_INDEFINITE_BYTESTRING

    i = 0
    l = len(source)

    while True:
        chunk = source[i : i + chunksize]
        i += len(chunk)

        yield encodelength(MAJOR_TYPE_BYTESTRING, len(chunk))
        yield chunk

        if i >= l:
            break

    yield BREAK
122
121
123
122
def streamencodeint(v):
    """Encode an integer as a CBOR uint or negint.

    Non-negative values are emitted with the uint major type. Negative
    values are emitted with the negint major type, storing ``abs(v) - 1``.

    Raises ``ValueError`` if ``v`` is outside ``[-2**64, 2**64 - 1]``,
    the range whose encoded magnitude fits in 8 bytes.
    """
    if v >= 18446744073709551616 or v < -18446744073709551616:
        raise ValueError(b'big integers not supported')

    if v >= 0:
        yield encodelength(MAJOR_TYPE_UINT, v)
    else:
        yield encodelength(MAJOR_TYPE_NEGINT, abs(v) - 1)
132
131
133
132
def streamencodearray(l):
    """Encode a known size iterable to an array.

    ``l`` must support ``len()``; its length is written up front and each
    element is then encoded in iteration order via ``streamencode()``.
    """

    yield encodelength(MAJOR_TYPE_ARRAY, len(l))

    for i in l:
        yield from streamencode(i)
142
141
143
142
def streamencodearrayfromiter(it):
    """Encode an iterator of items to an indefinite length array.

    Each item is encoded via ``streamencode()`` between the
    indefinite-array marker and the break byte.
    """

    yield BEGIN_INDEFINITE_ARRAY

    for i in it:
        yield from streamencode(i)

    yield BREAK
154
153
155
154
156 def _mixedtypesortkey(v):
155 def _mixedtypesortkey(v):
157 return type(v).__name__, v
156 return type(v).__name__, v
158
157
159
158
def streamencodeset(s):
    """Encode a set as a tagged array.

    The finite-set semantic tag is emitted first, followed by the elements
    encoded as a definite length array. Elements are sorted with
    ``_mixedtypesortkey`` so the output does not depend on set iteration
    order.
    """
    # https://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml defines
    # semantic tag 258 for finite sets.
    yield encodelength(MAJOR_TYPE_SEMANTIC, SEMANTIC_TAG_FINITE_SET)

    yield from streamencodearray(sorted(s, key=_mixedtypesortkey))
167
166
168
167
def streamencodemap(d):
    """Encode dictionary to a generator.

    Does not support indefinite length dictionaries.

    The entry count is written up front. Entries are then emitted as
    alternating encoded key and encoded value, ordered by applying
    ``_mixedtypesortkey`` to each key.
    """
    yield encodelength(MAJOR_TYPE_MAP, len(d))

    entries = sorted(d.items(), key=lambda x: _mixedtypesortkey(x[0]))

    for key, value in entries:
        yield from streamencode(key)
        yield from streamencode(value)
181
180
182
181
def streamencodemapfromiter(it):
    """Given an iterable of (key, value), encode to an indefinite length map.

    Pairs are emitted in iteration order (no sorting is applied) between
    the indefinite-map marker and the break byte.
    """
    yield BEGIN_INDEFINITE_MAP

    for key, value in it:
        yield from streamencode(key)
        yield from streamencode(value)

    yield BREAK
194
193
195
194
def streamencodebool(b):
    """Encode a boolean as a single byte.

    Yields ``b'\\xf5'`` when ``b`` is truthy and ``b'\\xf4'`` otherwise.
    """
    # major type 7, simple value 20 and 21.
    yield b'\xf5' if b else b'\xf4'
199
198
200
199
def streamencodenone(v):
    """Encode ``None`` as the CBOR null byte.

    ``v`` is accepted only so the signature matches the other stream
    encoders; its value is not inspected.
    """
    # major type 7, simple value 22.
    yield b'\xf6'
204
203
205
204
# Maps a Python type to the generator function that encodes values of that
# type. ``streamencode()`` first looks up the exact class here and only then
# falls back to an ``isinstance()`` scan in this insertion order.
STREAM_ENCODERS = {
    bytes: streamencodebytestring,
    int: streamencodeint,
    list: streamencodearray,
    tuple: streamencodearray,
    dict: streamencodemap,
    set: streamencodeset,
    bool: streamencodebool,
    type(None): streamencodenone,
}
217
216
218
217
def streamencode(v):
    """Encode a value in a streaming manner.

    Given an input object, encode it to CBOR recursively.

    Returns a generator of CBOR encoded bytes. There is no guarantee
    that each emitted chunk fully decodes to a value or sub-value.

    Encoding is deterministic - unordered collections are sorted.

    Raises ``ValueError`` if no encoder is registered in
    ``STREAM_ENCODERS`` for the type of ``v`` or any of its base types.
    """
    fn = STREAM_ENCODERS.get(v.__class__)

    if not fn:
        # handle subtypes such as encoding.localstr and util.sortdict
        for ty, candidate in STREAM_ENCODERS.items():
            if isinstance(v, ty):
                fn = candidate
                break

    if not fn:
        # ``%s`` in a bytes format string only accepts bytes-like objects,
        # so the type has to be rendered to bytes before interpolating it.
        typename = repr(type(v)).encode('ascii', 'backslashreplace')
        raise ValueError(b'do not know how to encode %s' % typename)

    return fn(v)
243
242
244
243
class CBORDecodeError(Exception):
    """Represents an error decoding CBOR."""
247
246
248
247
249 if sys.version_info.major >= 3:
248 def _elementtointeger(b, i):
250
249 return b[i]
251 def _elementtointeger(b, i):
252 return b[i]
253
254
255 else:
256
257 def _elementtointeger(b, i):
258 return ord(b[i])
259
250
260
251
# Pre-compiled big-endian unsigned integer layouts used by ``decodeuint()``.
STRUCT_BIG_UBYTE = struct.Struct('>B')
STRUCT_BIG_USHORT = struct.Struct('>H')
STRUCT_BIG_ULONG = struct.Struct('>L')
STRUCT_BIG_ULONGLONG = struct.Struct('>Q')

# Flags returned as the last element of ``decodeitem()``'s result tuple.
SPECIAL_NONE = 0
SPECIAL_START_INDEFINITE_BYTESTRING = 1
SPECIAL_START_ARRAY = 2
SPECIAL_START_MAP = 3
SPECIAL_START_SET = 4
SPECIAL_INDEFINITE_BREAK = 5
272
263
273
264
def decodeitem(b, offset=0):
    """Decode a new CBOR value from a buffer at offset.

    This function attempts to decode up to one complete CBOR value
    from ``b`` starting at offset ``offset``.

    The beginning of a collection (such as an array, map, set, or
    indefinite length bytestring) counts as a single value. For these
    special cases, a state flag will indicate that a special value was seen.

    When called, the function either returns a decoded value or gives
    a hint as to how many more bytes are needed to do so. By calling
    the function repeatedly given a stream of bytes, the caller can
    build up the original values.

    Returns a tuple with the following elements:

    * Bool indicating whether a complete value was decoded.
    * A decoded value if first value is True otherwise None
    * Integer number of bytes. If positive, the number of bytes
      read. If negative, the number of bytes we need to read to
      decode this value or the next chunk in this value.
    * One of the ``SPECIAL_*`` constants indicating special treatment
      for this value. ``SPECIAL_NONE`` means this is a fully decoded
      simple value (such as an integer or bool).

    Raises ``CBORDecodeError`` for the string major type, for semantic
    tags other than the finite set tag, for a finite set tag that is not
    followed by an array, and for special subtypes other than false, true,
    null and break. Errors raised by ``decodeuint()`` propagate unchanged.
    """

    initial = _elementtointeger(b, offset)
    offset += 1

    majortype = initial >> 5
    subtype = initial & SUBTYPE_MASK

    if majortype == MAJOR_TYPE_UINT:
        complete, value, readcount = decodeuint(subtype, b, offset)

        if complete:
            # +1 accounts for the initial byte consumed above.
            return True, value, readcount + 1, SPECIAL_NONE
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_NEGINT:
        # Negative integers are the same as UINT except inverted minus 1.
        complete, value, readcount = decodeuint(subtype, b, offset)

        if complete:
            return True, -value - 1, readcount + 1, SPECIAL_NONE
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_BYTESTRING:
        # Beginning of bytestrings are treated as uints in order to
        # decode their length, which may be indefinite.
        complete, size, readcount = decodeuint(
            subtype, b, offset, allowindefinite=True
        )

        # We don't know the size of the bytestring. It must be a definite
        # length since the indefinite subtype would be encoded in the initial
        # byte.
        if not complete:
            return False, None, readcount, SPECIAL_NONE

        # We know the length of the bytestring.
        if size is not None:
            # And the data is available in the buffer.
            if offset + readcount + size <= len(b):
                value = b[offset + readcount : offset + readcount + size]
                return True, value, readcount + size + 1, SPECIAL_NONE

            # And we need more data in order to return the bytestring.
            else:
                # Negative: the number of bytes still missing.
                wanted = len(b) - offset - readcount - size
                return False, None, wanted, SPECIAL_NONE

        # It is an indefinite length bytestring.
        else:
            return True, None, 1, SPECIAL_START_INDEFINITE_BYTESTRING

    elif majortype == MAJOR_TYPE_STRING:
        raise CBORDecodeError(b'string major type not supported')

    elif majortype == MAJOR_TYPE_ARRAY:
        # Beginning of arrays are treated as uints in order to decode their
        # length. We don't allow indefinite length arrays.
        complete, size, readcount = decodeuint(subtype, b, offset)

        if complete:
            return True, size, readcount + 1, SPECIAL_START_ARRAY
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_MAP:
        # Beginning of maps are treated as uints in order to decode their
        # number of elements. We don't allow indefinite length maps.
        complete, size, readcount = decodeuint(subtype, b, offset)

        if complete:
            return True, size, readcount + 1, SPECIAL_START_MAP
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_SEMANTIC:
        # Semantic tag value is read the same as a uint.
        complete, tagvalue, readcount = decodeuint(subtype, b, offset)

        if not complete:
            return False, None, readcount, SPECIAL_NONE

        # This behavior here is a little wonky. The main type being "decorated"
        # by this semantic tag follows. A more robust parser would probably emit
        # a special flag indicating this as a semantic tag and let the caller
        # deal with the types that follow. But since we don't support many
        # semantic tags, it is easier to deal with the special cases here and
        # hide complexity from the caller. If we add support for more semantic
        # tags, we should probably move semantic tag handling into the caller.
        if tagvalue == SEMANTIC_TAG_FINITE_SET:
            # The tagged item's initial byte is not in the buffer yet.
            if offset + readcount >= len(b):
                return False, None, -1, SPECIAL_NONE

            complete, size, readcount2, special = decodeitem(
                b, offset + readcount
            )

            if not complete:
                return False, None, readcount2, SPECIAL_NONE

            if special != SPECIAL_START_ARRAY:
                raise CBORDecodeError(
                    b'expected array after finite set semantic tag'
                )

            return True, size, readcount + readcount2 + 1, SPECIAL_START_SET

        else:
            raise CBORDecodeError(b'semantic tag %d not allowed' % tagvalue)

    elif majortype == MAJOR_TYPE_SPECIAL:
        # Only specific values for the information field are allowed.
        if subtype == SUBTYPE_FALSE:
            return True, False, 1, SPECIAL_NONE
        elif subtype == SUBTYPE_TRUE:
            return True, True, 1, SPECIAL_NONE
        elif subtype == SUBTYPE_NULL:
            return True, None, 1, SPECIAL_NONE
        elif subtype == SUBTYPE_INDEFINITE:
            return True, None, 1, SPECIAL_INDEFINITE_BREAK
        # If value is 24, subtype is in next byte.
        else:
            raise CBORDecodeError(b'special type %d not allowed' % subtype)
    else:
        # Not reachable for a byte-valued ``initial`` (3 bits give 0-7, all
        # handled above). Raise explicitly rather than assert so the check
        # is not stripped when Python runs with -O.
        raise CBORDecodeError(b'unhandled major type %d' % majortype)
426
417
427
418
def decodeuint(subtype, b, offset=0, allowindefinite=False):
    """Decode an unsigned integer.

    ``subtype`` is the lower 5 bits from the initial byte CBOR item
    "header." ``b`` is a buffer containing bytes. ``offset`` points to
    the index of the first byte after the byte that ``subtype`` was
    derived from.

    ``allowindefinite`` allows the special indefinite length value
    indicator.

    Returns a 3-tuple of (successful, value, count).

    The first element is a bool indicating if decoding completed. The 2nd
    is the decoded integer value or None if not fully decoded or the subtype
    is 31 and ``allowindefinite`` is True. The 3rd value is the count of bytes.
    If positive, it is the number of additional bytes decoded. If negative,
    it is the number of additional bytes needed to decode this value.

    Raises ``CBORDecodeError`` if ``subtype`` is 31 while
    ``allowindefinite`` is False, or if ``subtype`` is 28, 29 or 30.
    """

    # Small values are inline.
    if subtype < 24:
        return True, subtype, 0
    # Indefinite length specifier.
    elif subtype == 31:
        if allowindefinite:
            return True, None, 0
        else:
            raise CBORDecodeError(b'indefinite length uint not allowed here')
    elif subtype >= 28:
        raise CBORDecodeError(
            b'unsupported subtype on integer type: %d' % subtype
        )

    # Subtypes 24-27 select how many bytes following the header hold the
    # value.
    if subtype == 24:
        s = STRUCT_BIG_UBYTE
    elif subtype == 25:
        s = STRUCT_BIG_USHORT
    elif subtype == 26:
        s = STRUCT_BIG_ULONG
    elif subtype == 27:
        s = STRUCT_BIG_ULONGLONG
    else:
        # Every other value was handled above; kept as a defensive check.
        raise CBORDecodeError(b'bounds condition checking violation')

    if len(b) - offset >= s.size:
        return True, s.unpack_from(b, offset)[0], s.size
    else:
        # Negative: how many bytes are still missing from the buffer.
        return False, None, len(b) - offset - s.size
477
468
478
469
class bytestringchunk(bytes):
    """Represents a chunk/segment in an indefinite length bytestring.

    This behaves like a ``bytes`` but in addition has the ``isfirst``
    and ``islast`` attributes indicating whether this chunk is the first
    or last in an indefinite length bytestring.
    """

    def __new__(cls, v, first=False, last=False):
        # bytes is immutable, so the value has to be supplied in __new__;
        # the flags are then attached to the freshly created instance.
        self = bytes.__new__(cls, v)
        self.isfirst = first
        self.islast = last

        return self
493
484
494
485
class sansiodecoder(object):
    """Incremental CBOR decoder that leaves all I/O to the caller.

    Construct an instance and hand it chunks of CBOR-encoded bytes through
    ``decode()``. Every call reports whether fully decoded values are
    waiting, how many input bytes were consumed, and a hint for how many
    bytes should be supplied before the next item can be decoded.
    Finished values are collected with ``getavailable()``.

    The input is treated as a sequence of N independent top-level CBOR
    values rather than a single one: if several uints are packed back to
    back, all of them are decoded, not only the first.

    ``decode()`` may consume only part of what it is given. Tracking which
    bytes were consumed, and re-submitting the unconsumed remainder on the
    next call, is the caller's job.

    Decoding is atomic per *item* (see ``decodeitem()``). An item that
    cannot be decoded in full is not recorded as partially consumed; the
    caller is asked to pass the item's leading bytes again next time. That
    costs a little redundant parsing, which should be negligible.

    Only the CBOR subset that Mercurial needs is implemented. Missing:

    * Indefinite length arrays
    * Indefinite length maps
    * Indefinite length bytestrings used as keys or values inside
      arrays, maps, or sets
    * Arrays, maps, or sets nested inside sets
    * Semantic tags other than the mathematical finite set
    * Floating point numbers
    * The undefined special value

    CBOR types map onto Python types like so:

    uint -> int
    negint -> int
    bytestring -> bytes
    map -> dict
    array -> list
    True -> bool
    False -> bool
    null -> None
    indefinite length bytestring chunk -> [bytestringchunk]

    The one surprising entry is the last: chunks of an indefinite length
    bytestring are surfaced as ``bytestringchunk`` instances. This lets
    such bytestrings be streamed out of the decoder and keeps them
    distinguishable from ordinary, definite length bytestrings.
    """

    # Decoder states. The numeric values (0 through 6, in this order) are
    # interpolated into error messages, so they must stay stable.
    (
        _STATE_NONE,
        _STATE_WANT_MAP_KEY,
        _STATE_WANT_MAP_VALUE,
        _STATE_WANT_ARRAY_VALUE,
        _STATE_WANT_SET_VALUE,
        _STATE_WANT_BYTESTRING_CHUNK_FIRST,
        _STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT,
    ) = range(7)

    def __init__(self):
        # TODO add support for limiting size of bytestrings
        # TODO add support for limiting number of keys / values in collections
        # TODO add support for limiting size of buffered partial values

        # Running total of bytes belonging to completely decoded items.
        self.decodedbytecount = 0

        self._state = self._STATE_NONE

        # Collections currently being filled, innermost last. Every entry
        # is a dict holding the container (b'v') and the number of
        # elements it is still waiting for (b'remaining').
        self._collectionstack = []

        # Key already decoded for the map entry whose value is pending.
        self._currentmapkey = None

        # Completed top-level values not yet handed to the caller.
        self._decodedvalues = []

    @property
    def inprogress(self):
        """True while a value has been started but not yet completed."""
        return self._state != self._STATE_NONE

    def decode(self, b, offset=0):
        """Decode as many items as possible from an input buffer.

        ``b`` holds the bytes to decode and ``offset`` is the position in
        ``b`` at which reading starts.

        ``b`` must support ``len()`` and slice access; ``bytes`` is the
        typical type.

        Returns a 3-tuple of:

        * bool: whether decoded values are ready to be retrieved.
        * int: number of bytes fully consumed, counted from ``offset``.
        * int: number of bytes wanted on the next call in order to decode
          an item (0 when the input was consumed completely).

        Raises ``CBORDecodeError`` when an item arrives that is not
        permitted in the current decoder state.
        """
        if not b:
            return bool(self._decodedvalues), 0, 0

        startoffset = offset
        stack = self._collectionstack

        # Events that open a collection, mapped to the container type to
        # instantiate and the state to enter while filling it.
        openers = {
            SPECIAL_START_ARRAY: (list, self._STATE_WANT_ARRAY_VALUE),
            SPECIAL_START_MAP: (dict, self._STATE_WANT_MAP_KEY),
            SPECIAL_START_SET: (set, self._STATE_WANT_SET_VALUE),
        }

        # State to return to once a nested child finishes, keyed by the
        # type of the container that is now innermost.
        resume = {
            list: self._STATE_WANT_ARRAY_VALUE,
            dict: self._STATE_WANT_MAP_KEY,
            set: self._STATE_WANT_SET_VALUE,
        }

        # States in which the innermost collection may have just received
        # its last element.
        collecting = (
            self._STATE_WANT_ARRAY_VALUE,
            self._STATE_WANT_MAP_KEY,
            self._STATE_WANT_SET_VALUE,
        )

        # The per-item logic deliberately stays inline instead of living in
        # helper methods: collections consist of many items and function
        # calls are comparatively expensive in Python.
        while offset < len(b):
            # An item is either a complete value or an event such as the
            # start or end of a collection / indefinite length type.
            complete, value, readcount, special = decodeitem(b, offset)

            if readcount > 0:
                self.decodedbytecount += readcount

            if not complete:
                # A negative readcount encodes how many more bytes the
                # item needs; nothing of it is consumed.
                assert readcount < 0
                return (
                    bool(self._decodedvalues),
                    offset - startoffset,
                    -readcount,
                )

            offset += readcount
            state = self._state

            # Top level: a whole value or the start of a complex one.
            if state == self._STATE_NONE:
                if special == SPECIAL_NONE:
                    self._decodedvalues.append(value)

                elif special in openers:
                    ctype, nextstate = openers[special]
                    stack.append({b'remaining': value, b'v': ctype()})
                    self._state = nextstate

                elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
                    self._state = self._STATE_WANT_BYTESTRING_CHUNK_FIRST

                else:
                    raise CBORDecodeError(
                        b'unhandled special state: %d' % special
                    )

            # The item is an element of the innermost array.
            elif state == self._STATE_WANT_ARRAY_VALUE:
                if special == SPECIAL_NONE:
                    parent = stack[-1]
                    parent[b'v'].append(value)
                    parent[b'remaining'] -= 1

                # A collection nested in the array: attach it to the
                # parent right away, then start filling it.
                elif special in openers:
                    ctype, nextstate = openers[special]
                    parent = stack[-1]
                    child = ctype()

                    parent[b'v'].append(child)
                    parent[b'remaining'] -= 1

                    stack.append({b'remaining': value, b'v': child})
                    self._state = nextstate

                elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
                    raise CBORDecodeError(
                        b'indefinite length bytestrings '
                        b'not allowed as array values'
                    )

                else:
                    raise CBORDecodeError(
                        b'unhandled special item when '
                        b'expecting array value: %d' % special
                    )

            # The item is the key of the next entry in the innermost map.
            elif state == self._STATE_WANT_MAP_KEY:
                if special == SPECIAL_NONE:
                    self._currentmapkey = value
                    self._state = self._STATE_WANT_MAP_VALUE

                elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
                    raise CBORDecodeError(
                        b'indefinite length bytestrings '
                        b'not allowed as map keys'
                    )

                elif special in openers:
                    raise CBORDecodeError(
                        b'collections not supported as map keys'
                    )

                # No other special value may serve as a map key either.
                else:
                    raise CBORDecodeError(
                        b'unhandled special item when '
                        b'expecting map key: %d' % special
                    )

            # The item is the value belonging to the pending map key.
            elif state == self._STATE_WANT_MAP_VALUE:
                if special == SPECIAL_NONE:
                    parent = stack[-1]
                    parent[b'v'][self._currentmapkey] = value
                    parent[b'remaining'] -= 1

                    self._state = self._STATE_WANT_MAP_KEY

                # A collection used as the map value.
                elif special in openers:
                    ctype, nextstate = openers[special]
                    parent = stack[-1]
                    child = ctype()

                    parent[b'v'][self._currentmapkey] = child
                    parent[b'remaining'] -= 1

                    stack.append({b'remaining': value, b'v': child})
                    self._state = nextstate

                elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
                    raise CBORDecodeError(
                        b'indefinite length bytestrings not '
                        b'allowed as map values'
                    )

                else:
                    raise CBORDecodeError(
                        b'unhandled special item when '
                        b'expecting map value: %d' % special
                    )

                # The key has been used; only reached when no error was
                # raised above.
                self._currentmapkey = None

            # The item is a member of the innermost set.
            elif state == self._STATE_WANT_SET_VALUE:
                if special == SPECIAL_NONE:
                    parent = stack[-1]
                    parent[b'v'].add(value)
                    parent[b'remaining'] -= 1

                elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
                    raise CBORDecodeError(
                        b'indefinite length bytestrings not '
                        b'allowed as set values'
                    )

                elif special in openers:
                    raise CBORDecodeError(
                        b'collections not allowed as set values'
                    )

                # Sets may only contain trivial values.
                else:
                    raise CBORDecodeError(
                        b'unhandled special item when '
                        b'expecting set value: %d' % special
                    )

            # The item is the first chunk of an indefinite length
            # bytestring.
            elif state == self._STATE_WANT_BYTESTRING_CHUNK_FIRST:
                if special == SPECIAL_NONE:
                    self._decodedvalues.append(
                        bytestringchunk(value, first=True)
                    )
                    self._state = self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT

                # An immediate break means the bytestring is empty. It is
                # not collapsed to a plain b'' so that consumers expecting
                # an indefinite length bytestring still receive one.
                elif special == SPECIAL_INDEFINITE_BREAK:
                    self._decodedvalues.append(
                        bytestringchunk(b'', first=True, last=True)
                    )

                    # Indefinite length bytestrings are rejected inside
                    # collections, so this can only be the root level.
                    assert not stack
                    self._state = self._STATE_NONE

                else:
                    raise CBORDecodeError(
                        b'unexpected special value when '
                        b'expecting bytestring chunk: %d' % special
                    )

            # The item is a later chunk of an indefinite length bytestring.
            elif state == self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT:
                if special == SPECIAL_NONE:
                    self._decodedvalues.append(bytestringchunk(value))

                # The break marker terminates the bytestring.
                elif special == SPECIAL_INDEFINITE_BREAK:
                    self._decodedvalues.append(bytestringchunk(b'', last=True))

                    # Indefinite length bytestrings are rejected inside
                    # collections, so this can only be the root level.
                    assert not stack
                    self._state = self._STATE_NONE

                else:
                    raise CBORDecodeError(
                        b'unexpected special value when '
                        b'expecting bytestring chunk: %d' % special
                    )

            else:
                raise CBORDecodeError(b'unhandled decoder state: %d' % state)

            # The item may have completed the innermost collection, which
            # in turn may complete its parent, and so on. Unwind every
            # finished collection from the top of the stack.
            while self._state in collecting:
                top = stack[-1]

                # Still waiting for elements of this collection.
                if top[b'remaining']:
                    break

                stack.pop()

                if stack:
                    # A nested collection is emitted as part of its parent;
                    # just resume filling the new innermost collection.
                    self._state = resume[type(stack[-1][b'v'])]
                else:
                    # The root collection is done: publish it.
                    self._decodedvalues.append(top[b'v'])
                    self._state = self._STATE_NONE

        return (
            bool(self._decodedvalues),
            offset - startoffset,
            0,
        )

    def getavailable(self):
        """Return a list of the fully decoded values and forget them.

        Values returned here are not returned again by a later call.
        """
        ready, self._decodedvalues = self._decodedvalues, []
        return ready
986
977
987
978
class bufferingdecoder(object):
    """A CBOR decoder that buffers undecoded input.

    This is a thin wrapper around ``sansiodecoder`` that adds a buffering
    layer. Any input the wrapped decoder does not consume is retained and
    concatenated with new input that arrives on a later call.

    TODO consider adding limits as to the maximum amount of data that can
    be buffered.
    """

    def __init__(self):
        self._decoder = sansiodecoder()
        # Input chunks received but not yet consumed by the decoder.
        self._chunks = []
        # Bytes still needed before another decode attempt is worthwhile,
        # as last reported by the wrapped decoder.
        self._wanted = 0

    def decode(self, b):
        """Attempt to decode bytes to CBOR values.

        ``b`` is the newly arrived input. A ``bytearray`` is accepted and
        converted to ``bytes``.

        Returns a tuple with the following fields:

        * Bool indicating whether new values are available for retrieval.
        * Integer number of bytes decoded from the new input. This is
          never negative: when not even the previously buffered bytes
          could be consumed, 0 is reported.
        * Integer number of bytes wanted to decode the next value.
        """
        # We /might/ be able to support passing a bytearray all the
        # way through. For now, let's cheat.
        if isinstance(b, bytearray):
            b = bytes(b)

        # Incoming chunks are collected in a list until enough data has
        # arrived to decode the next item, and only then joined. Compared
        # with an ``io.BytesIO`` or repeated concatenation this avoids
        # constantly reallocating a growing buffer, which matters when many
        # input chunks do not complete an item.

        if self._chunks:
            # A previous call said N more bytes are needed and this call
            # does not provide them: buffer without attempting to decode.
            if len(b) < self._wanted:
                self._chunks.append(b)
                self._wanted -= len(b)
                return False, 0, self._wanted

            # Otherwise there may be enough data now. Merge old and new
            # input and reset the buffer.
            newlen = len(b)
            self._chunks.append(b)
            b = b''.join(self._chunks)
            self._chunks = []
            oldlen = len(b) - newlen

        else:
            oldlen = 0

        available, readcount, wanted = self._decoder.decode(b)
        self._wanted = wanted

        # Keep whatever the decoder left unconsumed for the next call.
        if readcount < len(b):
            self._chunks.append(b[readcount:])

        # ``readcount`` covers the merged buffer, so the share belonging to
        # previously buffered data is subtracted. The wanted-bytes hint can
        # grow after more of an item has been seen (e.g. once a length
        # header is readable), in which case the decoder may consume fewer
        # bytes than were buffered; clamp so the count never goes negative.
        return available, max(readcount - oldlen, 0), wanted

    def getavailable(self):
        """Return the values decoded so far (see ``sansiodecoder``)."""
        return self._decoder.getavailable()
1057
1048
1058
1049
def decodeall(b):
    """Decode every CBOR item contained in a buffer of bytes.

    Returns the list of decoded values; empty input yields an empty list.

    On top of ordinary decode errors, ``CBORDecodeError`` is raised when
    the buffer as a whole does not decode to complete CBOR values. That
    covers failing to decode any value, unfinished collections, unfinished
    indefinite length items, and trailing data at the end of the buffer.
    """
    if not b:
        return []

    decoder = sansiodecoder()

    # Only the consumed-byte count (second field) matters here; completeness
    # is checked through ``inprogress`` below.
    consumed = decoder.decode(b)[1]

    if consumed != len(b):
        raise CBORDecodeError(b'input data not fully consumed')

    if decoder.inprogress:
        raise CBORDecodeError(b'input data not complete')

    return decoder.getavailable()
General Comments 0
You need to be logged in to leave comments. Login now