##// END OF EJS Templates
Backport PR #6077: allow unicode keys in dicts in json_clean...
MinRK -
Show More
@@ -1,263 +1,243
1 """Utilities to manipulate JSON objects.
2 """
3 #-----------------------------------------------------------------------------
4 # Copyright (C) 2010-2011 The IPython Development Team
5 #
6 # Distributed under the terms of the BSD License. The full license is in
7 # the file COPYING.txt, distributed as part of this software.
8 #-----------------------------------------------------------------------------
1 """Utilities to manipulate JSON objects."""
2
3 # Copyright (c) IPython Development Team.
4 # Distributed under the terms of the Modified BSD License.
9 5
10 #-----------------------------------------------------------------------------
11 # Imports
12 #-----------------------------------------------------------------------------
13 # stdlib
14 6 import math
15 7 import re
16 8 import types
17 9 from datetime import datetime
18 10
19 11 try:
20 12 # base64.encodestring is deprecated in Python 3.x
21 13 from base64 import encodebytes
22 14 except ImportError:
23 15 # Python 2.x
24 16 from base64 import encodestring as encodebytes
25 17
26 18 from IPython.utils import py3compat
27 19 from IPython.utils.py3compat import string_types, unicode_type, iteritems
28 20 from IPython.utils.encoding import DEFAULT_ENCODING
29 21 next_attr_name = '__next__' if py3compat.PY3 else 'next'
30 22
31 23 #-----------------------------------------------------------------------------
32 24 # Globals and constants
33 25 #-----------------------------------------------------------------------------
34 26
35 27 # timestamp formats
36 28 ISO8601 = "%Y-%m-%dT%H:%M:%S.%f"
37 29 ISO8601_PAT=re.compile(r"^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(\.\d{1,6})?Z?([\+\-]\d{2}:?\d{2})?$")
38 30
39 31 # holy crap, strptime is not threadsafe.
40 32 # Calling it once at import seems to help.
41 33 datetime.strptime("1", "%d")
42 34
43 35 #-----------------------------------------------------------------------------
44 36 # Classes and functions
45 37 #-----------------------------------------------------------------------------
46 38
def rekey(dikt):
    """Rekey a dict that has been forced to use str keys where there should be
    ints by json.

    Every string key that parses as an int (tried first) or as a float is
    replaced, in place, by the parsed number.  All other keys are left alone.

    Parameters
    ----------
    dikt : dict
        The dictionary to fix up.  It is modified in place.

    Returns
    -------
    dikt : dict
        The same dictionary object that was passed in.

    Raises
    ------
    KeyError
        If the numeric form of a string key is already present as a key.
    """
    # Iterate over a snapshot of the keys: the loop body pops and inserts
    # keys, which must not happen while iterating over the dict itself.
    for k in list(dikt):
        if not isinstance(k, string_types):
            continue
        try:
            nk = int(k)
        except ValueError:
            try:
                nk = float(k)
            except ValueError:
                # neither an int nor a float: keep the key as it is
                continue
        if nk in dikt:
            raise KeyError("already have key %r" % nk)
        dikt[nk] = dikt.pop(k)
    return dikt
68 60
def parse_date(s):
    """Turn an ISO8601 timestamp string into a datetime object.

    ``None`` and strings that do not match ``ISO8601_PAT`` are handed back
    unmodified.  A matching string is parsed with ``datetime.strptime`` and
    the resulting datetime is returned.
    """
    if s is None:
        return s
    match = ISO8601_PAT.match(s)
    if not match:
        return s
    # FIXME: add actual timezone support
    # the timezone group is matched but then simply dropped
    stamp, fraction, _tz = match.groups()
    # the ISO8601 format string contains %f, so always supply a fraction
    return datetime.strptime(stamp + (fraction or '.0'), ISO8601)
88 80
def extract_dates(obj):
    """Recursively replace ISO8601 strings in unpacked JSON with datetimes.

    Dicts are rebuilt as new dicts and lists/tuples as new lists, so the
    input container is never modified.  Strings go through ``parse_date``;
    any other object is returned as is.
    """
    if isinstance(obj, dict):
        # build a fresh dict so the caller's object is not clobbered
        return dict((key, extract_dates(value)) for key, value in iteritems(obj))
    if isinstance(obj, (list, tuple)):
        return [extract_dates(item) for item in obj]
    if isinstance(obj, string_types):
        return parse_date(obj)
    return obj
101 93
def squash_dates(obj):
    """Recursively replace datetime objects with their ISO8601 strings.

    Dicts are copied and lists/tuples are rebuilt as lists, so the input
    container is never modified.  Objects of any other type are returned
    as is.
    """
    if isinstance(obj, dict):
        # work on a copy so the caller's dict is not clobbered
        copied = dict(obj)
        return dict((key, squash_dates(value)) for key, value in iteritems(copied))
    if isinstance(obj, (list, tuple)):
        return [squash_dates(item) for item in obj]
    if isinstance(obj, datetime):
        return obj.isoformat()
    return obj
113 105
def date_default(obj):
    """default function for packing datetime objects in JSON.

    Parameters
    ----------
    obj : object
        The object the JSON encoder could not serialize by itself.

    Returns
    -------
    str
        ``obj.isoformat()`` when ``obj`` is a datetime.

    Raises
    ------
    TypeError
        For any object that is not a datetime.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    # Wrap obj in a 1-tuple: a bare tuple on the right of % would be unpacked
    # as the format arguments and produce a wrong (or unrelated) message.
    raise TypeError("%r is not JSON serializable" % (obj,))
120 112
121 113
122 114 # constants for identifying png/jpeg data
123 115 PNG = b'\x89PNG\r\n\x1a\n'
124 116 # front of PNG base64-encoded
125 117 PNG64 = b'iVBORw0KG'
126 118 JPEG = b'\xff\xd8'
127 119 # front of JPEG base64-encoded
128 120 JPEG64 = b'/9'
129 121 # front of PDF base64-encoded
130 122 PDF64 = b'JVBER'
131 123
def encode_images(format_dict):
    """b64-encodes images in a displaypub format dict

    Perhaps this should be handled in json_clean itself?

    Parameters
    ----------

    format_dict : dict
        A dictionary of display data keyed by mime-type

    Returns
    -------

    format_dict : dict
        A copy of the same dictionary,
        but binary image data ('image/png', 'image/jpeg' or 'application/pdf')
        is base64-encoded.

    """
    encoded = format_dict.copy()

    # mime-type -> leading bytes of a payload that is already base64-encoded
    binary_formats = (
        ('image/png', PNG64),
        ('image/jpeg', JPEG64),
        ('application/pdf', PDF64),
    )
    for mime, b64_prefix in binary_formats:
        data = format_dict.get(mime)
        if not isinstance(data, bytes):
            # missing, or already text: leave the copied entry untouched
            continue
        # make sure we don't double-encode
        if not data.startswith(b64_prefix):
            data = encodebytes(data)
        encoded[mime] = data.decode('ascii')

    return encoded
176 168
177 169
def json_clean(obj):
    """Clean an object to ensure it's safe to encode in JSON.

    Unicode strings, None and bools are returned unmodified.  Other ints and
    floats are cast to plain ``int``/``float`` (NaN and infinities become
    their repr), bytes are decoded, tuples, sets, generators and iterators
    are converted to lists, and lists and dicts are copied with their
    contents cleaned recursively.  Anything else is replaced by its repr.

    Note: dicts whose keys could cause collisions upon encoding (such as a dict
    with both the number 1 and the string '1' as keys) will cause a ValueError
    to be raised.

    Parameters
    ----------
    obj : any python object

    Returns
    -------
    out : object

      A version of the input which will not cause an encoding error when
      encoded as JSON.  Note that this function does not *encode* its inputs,
      it simply sanitizes it so that there will be no encoding errors later.

    Raises
    ------
    ValueError
        If two keys of a dict have the same unicode form.
    """
    # types that are 'atomic' and ok in json as-is.
    atomic_ok = (unicode_type, type(None))

    # containers that we need to convert into lists
    container_to_list = (tuple, set, types.GeneratorType)

    if isinstance(obj, float):
        # cast out-of-range floats to their reprs
        if math.isnan(obj) or math.isinf(obj):
            return repr(obj)
        return float(obj)

    if isinstance(obj, int):
        # cast int to int, in case subclasses override __str__ (e.g. boost enum, #4598)
        if isinstance(obj, bool):
            # bools are ints, but we don't want to cast them to 0,1
            return obj
        return int(obj)

    if isinstance(obj, atomic_ok):
        return obj

    if isinstance(obj, bytes):
        return obj.decode(DEFAULT_ENCODING, 'replace')

    # known containers, plus anything that has both __iter__ and a next method
    if isinstance(obj, container_to_list) or (
        hasattr(obj, '__iter__') and hasattr(obj, next_attr_name)):
        obj = list(obj)

    if isinstance(obj, list):
        return [json_clean(x) for x in obj]

    if isinstance(obj, dict):
        # First, validate that the dict won't lose data in conversion due to
        # key collisions after stringification.  This can happen with keys like
        # True and 'true' or 1 and '1', which collide in JSON.
        nkeys = len(obj)
        # unicode_type rather than str, so that unicode keys are accepted
        nkeys_collapsed = len(set(map(unicode_type, obj)))
        if nkeys != nkeys_collapsed:
            raise ValueError('dict cannot be safely converted to JSON: '
                             'key collision would lead to dropped values')
        # If all OK, proceed by making the new dict that will be json-safe
        out = {}
        for k, v in iteritems(obj):
            out[unicode_type(k)] = json_clean(v)
        return out

    # If we get here, we don't know how to handle the object, so we just get
    # its repr and return that.  This will catch lambdas, open sockets, class
    # objects, and any other complicated contraption that json can't encode
    return repr(obj)
@@ -1,151 +1,144
1 """Test suite for our JSON utilities.
2 """
3 #-----------------------------------------------------------------------------
4 # Copyright (C) 2010-2011 The IPython Development Team
5 #
6 # Distributed under the terms of the BSD License. The full license is in
7 # the file COPYING.txt, distributed as part of this software.
8 #-----------------------------------------------------------------------------
9
10 #-----------------------------------------------------------------------------
11 # Imports
12 #-----------------------------------------------------------------------------
13 # stdlib
1 # coding: utf-8
2 """Test suite for our JSON utilities."""
3
4 # Copyright (c) IPython Development Team.
5 # Distributed under the terms of the Modified BSD License.
6
14 7 import datetime
15 8 import json
16 9 from base64 import decodestring
17 10
18 # third party
19 11 import nose.tools as nt
20 12
21 # our own
22 13 from IPython.utils import jsonutil, tz
23 14 from ..jsonutil import json_clean, encode_images
24 15 from ..py3compat import unicode_to_str, str_to_bytes, iteritems
25 16
26 #-----------------------------------------------------------------------------
27 # Test functions
28 #-----------------------------------------------------------------------------
17
class Int(int):
    """int subclass whose str() differs from the plain integer's."""

    def __str__(self):
        # deliberately not the decimal form a plain int would give
        return 'Int(%i)' % self
32 21
def test():
    # Table of (input, expected output) pairs for json_clean.  None as the
    # expected output means the result must equal the input.
    cases = [
        # start with scalars
        (1, None),
        (1.0, None),
        ('a', None),
        (True, None),
        (False, None),
        (None, None),
        # complex numbers for now just go to strings, as otherwise they
        # are unserializable
        (1j, '1j'),
        # Containers
        ([1, 2], None),
        ((1, 2), [1, 2]),
        (set([1, 2]), [1, 2]),
        (dict(x=1), None),
        ({'x': 1, 'y':[1,2,3], '1':'int'}, None),
        # More exotic objects
        ((x for x in range(3)), [0, 1, 2]),
        (iter([1, 2]), [1, 2]),
        (Int(5), 5),
    ]

    for value, expected in cases:
        cleaned = json_clean(value)
        # validate our cleanup
        nt.assert_equal(cleaned, value if expected is None else expected)
        # and ensure that what we return, indeed encodes cleanly
        json.loads(json.dumps(cleaned))
65 54
66 55
67 56
def test_encode_images():
    # invalid data, but the header and footer are from real files
    raw = {
        'image/png' : b'\x89PNG\r\n\x1a\nblahblahnotactuallyvalidIEND\xaeB`\x82',
        'image/jpeg' : b'\xff\xd8\xff\xe0\x00\x10JFIFblahblahjpeg(\xa0\x0f\xff\xd9',
        'application/pdf' : b'%PDF-1.\ntrailer<</Root<</Pages<</Kids[<</MediaBox[0 0 3 3]>>]>>>>>>',
    }

    first_pass = encode_images(raw)
    for mime, payload in iteritems(raw):
        # the encoded value is unicode, decodestring wants bytes
        roundtrip = decodestring(first_pass[mime].encode('ascii'))
        nt.assert_equal(roundtrip, payload)

    # encoding the already-encoded dict again must give an equal dict
    second_pass = encode_images(first_pass)
    nt.assert_equal(first_pass, second_pass)

    # same values again, converted with unicode_to_str
    as_str = {}
    for mime, text in iteritems(first_pass):
        as_str[mime] = unicode_to_str(text)
    third_pass = encode_images(as_str)
    nt.assert_equal(third_pass, as_str)
    for mime, payload in iteritems(raw):
        # third_pass has str, want bytes
        roundtrip = decodestring(str_to_bytes(third_pass[mime]))
        nt.assert_equal(roundtrip, payload)
96 85
def test_lambda():
    # a lambda has no JSON form: json_clean must return a string naming it
    cleaned = json_clean(lambda : 1)
    nt.assert_is_instance(cleaned, str)
    nt.assert_in('<lambda>', cleaned)
    # and that string must be encodable
    json.dumps(cleaned)
102 91
def test_extract_dates():
    # the same instant written with every accepted Z / UTC-offset suffix
    stamps = [
        '2013-07-03T16:34:52.249482',
        '2013-07-03T16:34:52.249482Z',
        '2013-07-03T16:34:52.249482Z-0800',
        '2013-07-03T16:34:52.249482Z+0800',
        '2013-07-03T16:34:52.249482Z+08:00',
        '2013-07-03T16:34:52.249482Z-08:00',
        '2013-07-03T16:34:52.249482-0800',
        '2013-07-03T16:34:52.249482+0800',
        '2013-07-03T16:34:52.249482+08:00',
        '2013-07-03T16:34:52.249482-08:00',
    ]
    parsed = jsonutil.extract_dates(stamps)
    reference = parsed[0]
    for result in parsed:
        # every variant must parse, and to the same datetime as the first
        nt.assert_true(isinstance(result, datetime.datetime))
        nt.assert_equal(result, reference)
121 110
def test_parse_ms_precision():
    base = '2013-07-03T16:34:52'
    digits = '1234567890'

    # no fractional part at all still parses
    nt.assert_is_instance(jsonutil.parse_date(base), datetime.datetime)

    for ndigits in range(len(digits)):
        stamp = base + '.' + digits[:ndigits]
        result = jsonutil.parse_date(stamp)
        # one to six fractional digits parse; anything else stays a string
        expected_type = datetime.datetime if 1 <= ndigits <= 6 else str
        nt.assert_is_instance(result, expected_type)
135 124
def test_date_default():
    payload = dict(today=datetime.datetime.now(), utcnow=tz.utcnow())
    dumped = json.dumps(payload, default=jsonutil.date_default)
    # "+00" must appear exactly once in the dumped text
    nt.assert_in("+00", dumped)
    nt.assert_equal(dumped.count("+00"), 1)
    restored = jsonutil.extract_dates(json.loads(dumped))
    for value in restored.values():
        nt.assert_is_instance(value, datetime.datetime)
144 133
def test_exception():
    # dicts whose keys collide once the keys are turned into strings
    colliding = [
        {1:'number', '1':'string'},
        {True:'bool', 'True':'string'},
    ]
    for bad in colliding:
        nt.assert_raises(ValueError, json_clean, bad)
151
140
def test_unicode_dict():
    # a dict with a non-ASCII unicode key must come back equal to the input
    original = {u'üniço∂e': u'üniço∂e'}
    cleaned = jsonutil.json_clean(original)
    nt.assert_equal(original, cleaned)
General Comments 0
You need to be logged in to leave comments. Login now