upstream/ipython Commit - r17154:565f82ac

Backport PR : allow unicode keys in dicts in json_clean...

MinRK -

r17154:565f82ac

parent child

IPython/utils/jsonutil.py

0 +7 -27

-             """Utilities to manipulate JSON objects.
-             """
-             #-----------------------------------------------------------------------------
-             #  Copyright (C) 2010-2011  The IPython Development Team
+             #
-             #  Distributed under the terms of the BSD License.  The full license is in
-             #  the file COPYING.txt, distributed as part of this software.
-             #-----------------------------------------------------------------------------
+             """Utilities to manipulate JSON objects."""
+             # Copyright (c) IPython Development Team.
+             # Distributed under the terms of the Modified BSD License.
-             #-----------------------------------------------------------------------------
-             # Imports
-             #-----------------------------------------------------------------------------
-             # stdlib
              import math
              import re
              import types
              from datetime import datetime
              try:
                  # base64.encodestring is deprecated in Python 3.x
                  from base64 import encodebytes
              except ImportError:
                  # Python 2.x
                  from base64 import encodestring as encodebytes
              from IPython.utils import py3compat
              from IPython.utils.py3compat import string_types, unicode_type, iteritems
              from IPython.utils.encoding import DEFAULT_ENCODING
              next_attr_name = '__next__' if py3compat.PY3 else 'next'
              #-----------------------------------------------------------------------------
              # Globals and constants
              #-----------------------------------------------------------------------------
              # timestamp formats
              ISO8601 = "%Y-%m-%dT%H:%M:%S.%f"
              ISO8601_PAT=re.compile(r"^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(\.\d{1,6})?Z?([\+\-]\d{2}:?\d{2})?$")
              # datetime.strptime is not thread-safe on its first call, because it
              # lazily imports the _strptime module.  Calling it once at import time
              # works around that.
              datetime.strptime("1", "%d")
              #-----------------------------------------------------------------------------
              # Classes and functions
              #-----------------------------------------------------------------------------
              def rekey(dikt):
                  """Restore numeric keys that JSON forced into strings.
                  JSON objects only allow string keys, so int/float keys come back from a
                  round-trip as strings.  Every string key that parses as an int (tried
                  first) or as a float is replaced by the parsed number; all other keys are
                  left untouched.
                  Parameters
                  ----------
                  dikt : dict
                      The dict to fix up.  It is modified in place.
                  Returns
                  -------
                  dikt : dict
                      The same dict object that was passed in.
                  Raises
                  ------
                  KeyError
                      If the numeric form of a string key is already present in the dict.
                  """
                  # Iterate over a snapshot of the keys: the loop body pops and inserts
                  # keys, and mutating a dict while iterating over it directly can raise
                  # RuntimeError or skip/revisit entries.
                  for k in list(dikt):
                      if not isinstance(k, string_types):
                          continue
                      try:
                          nk = int(k)
                      except ValueError:
                          try:
                              nk = float(k)
                          except ValueError:
                              # not a number at all: keep the string key
                              continue
                      if nk in dikt:
                          raise KeyError("already have key %r"%nk)
                      dikt[nk] = dikt.pop(k)
                  return dikt
              def parse_date(s):
                  """Turn an ISO8601 timestamp string into a datetime.
                  None, and any string that does not match ISO8601_PAT, is handed back
                  unchanged.  A timezone suffix is matched but discarded, so the
                  result carries no tzinfo.
                  """
                  match = None if s is None else ISO8601_PAT.match(s)
                  if not match:
                      return s
                  # FIXME: add actual timezone support
                  # for now the timezone group is simply ignored
                  stamp, fraction, _tz = match.groups()
                  # strptime's %f needs a fractional part, so supply one when absent
                  return datetime.strptime(stamp + (fraction or '.0'), ISO8601)
              def extract_dates(obj):
                  """Recursively replace ISO8601 strings in unpacked JSON with datetimes.
                  Dicts and lists/tuples are rebuilt (tuples come back as lists) rather
                  than modified in place; strings go through parse_date; anything else is
                  returned as-is.
                  """
                  if isinstance(obj, dict):
                      # build a fresh dict so the caller's object is not clobbered
                      return dict((key, extract_dates(value)) for key, value in iteritems(obj))
                  if isinstance(obj, (list, tuple)):
                      return [extract_dates(item) for item in obj]
                  if isinstance(obj, string_types):
                      return parse_date(obj)
                  return obj
              def squash_dates(obj):
                  """Recursively replace datetime objects with their ISO8601 strings.
                  Dicts and lists/tuples are rebuilt (tuples come back as lists) rather
                  than modified in place; datetimes become obj.isoformat(); anything else
                  is returned as-is.
                  """
                  if isinstance(obj, dict):
                      # build a fresh dict so the caller's object is not clobbered
                      return dict((key, squash_dates(value)) for key, value in iteritems(obj))
                  if isinstance(obj, (list, tuple)):
                      return [squash_dates(item) for item in obj]
                  if isinstance(obj, datetime):
                      return obj.isoformat()
                  return obj
              def date_default(obj):
                  """``default`` hook for json.dumps that knows how to pack datetimes.
                  Parameters
                  ----------
                  obj : object
                      The object json could not serialize on its own.
                  Returns
                  -------
                  str
                      ``obj.isoformat()`` when obj is a datetime.
                  Raises
                  ------
                  TypeError
                      For any object that is not a datetime.
                  """
                  if isinstance(obj, datetime):
                      return obj.isoformat()
                  # Wrap obj in a 1-tuple: a bare tuple on the right of % would be
                  # unpacked as the format arguments and garble the message.
                  raise TypeError("%r is not JSON serializable" % (obj,))
              # magic bytes that open raw PNG and JPEG data
              PNG = b'\x89PNG\r\n\x1a\n'
              JPEG = b'\xff\xd8'
              # what the start of PNG, JPEG and PDF data looks like once base64-encoded
              PNG64 = b'iVBORw0KG'
              JPEG64 = b'/9'
              PDF64 = b'JVBER'
              def encode_images(format_dict):
                  """Base64-encode the binary entries of a displaypub format dict.
                  Perhaps this should be handled in json_clean itself?
                  Parameters
                  ----------
                  format_dict : dict
                      Display data keyed by mime-type.
                  Returns
                  -------
                  format_dict : dict
                      A shallow copy of the input in which bytes values under
                      'image/png', 'image/jpeg' and 'application/pdf' have been
                      base64-encoded (unless they already start with the matching
                      base64 prefix) and decoded to ascii text.
                  """
                  encoded = format_dict.copy()
                  for mimetype, b64_prefix in (
                      ('image/png', PNG64),
                      ('image/jpeg', JPEG64),
                      ('application/pdf', PDF64),
                  ):
                      payload = format_dict.get(mimetype)
                      if not isinstance(payload, bytes):
                          continue
                      # make sure we don't double-encode
                      if not payload.startswith(b64_prefix):
                          payload = encodebytes(payload)
                      encoded[mimetype] = payload.decode('ascii')
                  return encoded
              def json_clean(obj):
                  """Clean an object to ensure it's safe to encode in JSON.
                  Atomic, immutable objects are returned unmodified.  Sets and tuples are
                  converted to lists, lists are copied and dicts are also copied.
                  Note: dicts whose keys could cause collisions upon encoding (such as a dict
                  with both the number 1 and the string '1' as keys) will cause a ValueError
                  to be raised.
                  Parameters
                  ----------
                  obj : any python object
                  Returns
                  -------
                  out : object
                    A version of the input which will not cause an encoding error when
                    encoded as JSON.  Note that this function does not *encode* its inputs,
                    it simply sanitizes it so that there will be no encoding errors later.
                  Raises
                  ------
                  ValueError
                    If two keys of a dict collide once they are converted to strings.
-                 Examples
-                 --------
-                 >>> json_clean(4)
-                 >>> json_clean(list(range(10)))
-                 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
-                 >>> sorted(json_clean(dict(x=1, y=2)).items())
-                 [('x', 1), ('y', 2)]
-                 >>> sorted(json_clean(dict(x=1, y=2, z=[1,2,3])).items())
-                 [('x', 1), ('y', 2), ('z', [1, 2, 3])]
-                 >>> json_clean(True)
-                 True
                  """
                  # types that are 'atomic' and ok in json as-is.
                  atomic_ok = (unicode_type, type(None))
                  # containers that we need to convert into lists
                  container_to_list = (tuple, set, types.GeneratorType)
                  if isinstance(obj, float):
                      # cast out-of-range floats to their reprs
                      if math.isnan(obj) or math.isinf(obj):
                          return repr(obj)
                      return float(obj)
                  if isinstance(obj, int):
                      # cast int to int, in case subclasses override __str__ (e.g. boost enum, #4598)
                      if isinstance(obj, bool):
                          # bools are ints, but we don't want to cast them to 0,1
                          return obj
                      return int(obj)
                  if isinstance(obj, atomic_ok):
                      return obj
                  # bytes are decoded to text; undecodable bytes are replaced instead
                  # of raising
                  if isinstance(obj, bytes):
                      return obj.decode(DEFAULT_ENCODING, 'replace')
                  # the listed container types, plus anything that has both __iter__ and
                  # the next/__next__ method named by next_attr_name, are turned into
                  # lists so the list branch below can clean their items
                  if isinstance(obj, container_to_list) or (
                      hasattr(obj, '__iter__') and hasattr(obj, next_attr_name)):
                      obj = list(obj)
                  if isinstance(obj, list):
                      return [json_clean(x) for x in obj]
                  if isinstance(obj, dict):
                      # First, validate that the dict won't lose data in conversion due to
                      # key collisions after stringification.  This can happen with keys like
                      # True and 'true' or 1 and '1', which collide in JSON.
                      nkeys = len(obj)
-                     nkeys_collapsed = len(set(map(str, obj)))
+                     nkeys_collapsed = len(set(map(unicode_type, obj)))
                      if nkeys != nkeys_collapsed:
-                         raise ValueError('dict can not be safely converted to JSON: '
+                         raise ValueError('dict cannot be safely converted to JSON: '
                                           'key collision would lead to dropped values')
                      # If all OK, proceed by making the new dict that will be json-safe
                      out = {}
                      for k,v in iteritems(obj):
-                         out[str(k)] = json_clean(v)
+                         out[unicode_type(k)] = json_clean(v)
                      return out
                  # If we get here, we don't know how to handle the object, so we just get
                  # its repr and return that.  This will catch lambdas, open sockets, class
                  # objects, and any other complicated contraption that json can't encode
                  return repr(obj)

IPython/utils/tests/test_jsonutil.py

0 +12 -19

-             """Test suite for our JSON utilities.
-             """
-             #-----------------------------------------------------------------------------
-             #  Copyright (C) 2010-2011  The IPython Development Team
+             #
-             #  Distributed under the terms of the BSD License.  The full license is in
-             #  the file COPYING.txt, distributed as part of this software.
-             #-----------------------------------------------------------------------------
-             #-----------------------------------------------------------------------------
-             # Imports
-             #-----------------------------------------------------------------------------
-             # stdlib
+             # coding: utf-8
+             """Test suite for our JSON utilities."""
+             # Copyright (c) IPython Development Team.
+             # Distributed under the terms of the Modified BSD License.
              import datetime
              import json
              from base64 import decodestring
-             # third party
              import nose.tools as nt
-             # our own
              from IPython.utils import jsonutil, tz
              from ..jsonutil import json_clean, encode_images
              from ..py3compat import unicode_to_str, str_to_bytes, iteritems
-             #-----------------------------------------------------------------------------
-             # Test functions
-             #-----------------------------------------------------------------------------
              class Int(int):
                  """int subclass whose str() is not the plain number."""
                  def __str__(self):
                      # 1-tuple form of the same %-format; the value printed is unchanged
                      return 'Int(%i)' % (self,)
              def test():
                  """json_clean maps scalars, containers and iterators to the expected values."""
                  # (input, expected) pairs.  An expected value of None means the output
                  # should simply equal the input.
                  cases = [
                      # start with scalars
                      (1, None),
                      (1.0, None),
                      ('a', None),
                      (True, None),
                      (False, None),
                      (None, None),
                      # complex numbers for now just go to strings, as otherwise they
                      # are unserializable
                      (1j, '1j'),
                      # Containers
                      ([1, 2], None),
                      ((1, 2), [1, 2]),
                      (set([1, 2]), [1, 2]),
                      (dict(x=1), None),
                      ({'x': 1, 'y':[1,2,3], '1':'int'}, None),
                      # More exotic objects
                      ((x for x in range(3)), [0, 1, 2]),
                      (iter([1, 2]), [1, 2]),
                      (Int(5), 5),
                  ]
                  for original, expected in cases:
                      if expected is None:
                          expected = original
                      cleaned = json_clean(original)
                      # validate our cleanup
                      nt.assert_equal(cleaned, expected)
                      # and ensure that what we return, indeed encodes cleanly
                      json.loads(json.dumps(cleaned))
              def test_encode_images():
                  """encode_images encodes raw bytes once and leaves encoded data alone."""
                  # invalid data, but the header and footer are from real files
                  pngdata = b'\x89PNG\r\n\x1a\nblahblahnotactuallyvalidIEND\xaeB`\x82'
                  jpegdata = b'\xff\xd8\xff\xe0\x00\x10JFIFblahblahjpeg(\xa0\x0f\xff\xd9'
                  pdfdata = b'%PDF-1.\ntrailer<</Root<</Pages<</Kids[<</MediaBox[0 0 3 3]>>]>>>>>>'
                  fmt = {
                      'image/png'  : pngdata,
                      'image/jpeg' : jpegdata,
                      'application/pdf' : pdfdata
                  }
                  encoded = encode_images(fmt)
                  for mimetype, raw in iteritems(fmt):
                      # encoded has unicode, want bytes
                      roundtrip = decodestring(encoded[mimetype].encode('ascii'))
                      nt.assert_equal(roundtrip, raw)
                  # encoding already-encoded data must be a no-op
                  encoded2 = encode_images(encoded)
                  nt.assert_equal(encoded, encoded2)
                  # same check with native-str values; the loop variable gets its own
                  # name so it does not rebind the `encoded` dict being iterated
                  b64_str = {}
                  for mimetype, b64_value in iteritems(encoded):
                      b64_str[mimetype] = unicode_to_str(b64_value)
                  encoded3 = encode_images(b64_str)
                  nt.assert_equal(encoded3, b64_str)
                  for mimetype, raw in iteritems(fmt):
                      # encoded3 has str, want bytes
                      roundtrip = decodestring(str_to_bytes(encoded3[mimetype]))
                      nt.assert_equal(roundtrip, raw)
              def test_lambda():
                  """A lambda is cleaned to a str mentioning '<lambda>' that json can dump."""
                  cleaned = json_clean(lambda : 1)
                  nt.assert_is_instance(cleaned, str)
                  nt.assert_in('<lambda>', cleaned)
                  # must not raise
                  json.dumps(cleaned)
              def test_extract_dates():
                  """Every timezone spelling of one timestamp parses to the same datetime."""
                  timestamps = [
                      '2013-07-03T16:34:52.249482',
                      '2013-07-03T16:34:52.249482Z',
                      '2013-07-03T16:34:52.249482Z-0800',
                      '2013-07-03T16:34:52.249482Z+0800',
                      '2013-07-03T16:34:52.249482Z+08:00',
                      '2013-07-03T16:34:52.249482Z-08:00',
                      '2013-07-03T16:34:52.249482-0800',
                      '2013-07-03T16:34:52.249482+0800',
                      '2013-07-03T16:34:52.249482+08:00',
                      '2013-07-03T16:34:52.249482-08:00',
                  ]
                  parsed = jsonutil.extract_dates(timestamps)
                  # the suffix-free timestamp is the reference all others must equal
                  first = parsed[0]
                  for item in parsed:
                      nt.assert_true(isinstance(item, datetime.datetime))
                      nt.assert_equal(item, first)
              def test_parse_ms_precision():
                  """parse_date accepts 1 to 6 fractional digits and rejects anything else."""
                  base = '2013-07-03T16:34:52'
                  digits = '1234567890'
                  nt.assert_is_instance(jsonutil.parse_date(base), datetime.datetime)
                  for ndigits in range(len(digits)):
                      parsed = jsonutil.parse_date(base + '.' + digits[:ndigits])
                      # a bare trailing '.' (0 digits) or more than 6 digits is not a
                      # valid timestamp, so the string is returned unparsed
                      expected_type = datetime.datetime if 1 <= ndigits <= 6 else str
                      nt.assert_is_instance(parsed, expected_type)
              def test_date_default():
                  """date_default lets json.dumps pack datetimes that extract_dates restores."""
                  payload = {
                      'today': datetime.datetime.now(),
                      'utcnow': tz.utcnow(),
                  }
                  packed = json.dumps(payload, default=jsonutil.date_default)
                  # exactly one of the two timestamps should carry a "+00" offset
                  nt.assert_in("+00", packed)
                  nt.assert_equal(packed.count("+00"), 1)
                  restored = jsonutil.extract_dates(json.loads(packed))
                  for value in restored.values():
                      nt.assert_is_instance(value, datetime.datetime)
              def test_exception():
                  """Dicts whose keys collide after stringification make json_clean raise."""
                  colliding = [
                      {1:'number', '1':'string'},
                      {True:'bool', 'True':'string'},
                  ]
                  for bad in colliding:
                      nt.assert_raises(ValueError, json_clean, bad)
+             def test_unicode_dict():
+                 # a dict with a non-ASCII unicode key and value must come back equal
+                 data = {u'üniço∂e': u'üniço∂e'}
+                 clean = jsonutil.json_clean(data)
+                 nt.assert_equal(data, clean)

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages