8817bb2e |
1 | """Implementation of JSONDecoder |
2 | """ |
3 | import re |
4 | import sys |
5 | import struct |
6 | |
7 | from simplejson.scanner import make_scanner |
8 | try: |
9 | from simplejson._speedups import scanstring as c_scanstring |
10 | except ImportError: |
11 | c_scanstring = None |
12 | |
# Public names exported by a star-import of this module.
__all__ = ['JSONDecoder']
14 | |
# Regex flags shared by the patterns compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
16 | |
17 | def _floatconstants(): |
18 | _BYTES = '7FF80000000000007FF0000000000000'.decode('hex') |
19 | if sys.byteorder != 'big': |
20 | _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1] |
21 | nan, inf = struct.unpack('dd', _BYTES) |
22 | return nan, inf, -inf |
23 | |
24 | NaN, PosInf, NegInf = _floatconstants() |
25 | |
26 | |
def linecol(doc, pos):
    """Return the 1-based ``(lineno, colno)`` of index ``pos`` in ``doc``.

    ``doc`` is the full document text and ``pos`` a character index into
    it.  Lines are separated by ``'\\n'``.  The column is counted from 1 on
    every line, including the first one, so that positions reported on
    line 1 use the same convention as positions on later lines.
    """
    lineno = doc.count('\n', 0, pos) + 1
    # rfind() yields -1 when no newline precedes pos, which makes the first
    # line come out as pos + 1 -- i.e. 1-based, like every other line.
    colno = pos - doc.rfind('\n', 0, pos)
    return lineno, colno
34 | |
35 | |
def errmsg(msg, doc, pos, end=None):
    """Build an error message that locates ``pos`` (and ``end``) in ``doc``.

    ``msg`` is the human-readable prefix.  Line and column numbers come from
    ``linecol``.  When ``end`` is omitted the message describes a single
    position; otherwise it describes the span from ``pos`` to ``end``.
    Returns the formatted string.
    """
    # Note that this function is called from _speedups
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = '%s: line %d column %d - line %d column %d (char %d - %d)'
        return template % (
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    template = '%s: line %d column %d (char %d)'
    return template % (msg, start_line, start_col, pos)
49 | |
50 | |
# Float values for the three non-standard numeric literals, keyed by the
# literal's spelling.
_CONSTANTS = {
    'NaN': NaN,
    'Infinity': PosInf,
    '-Infinity': NegInf,
}
56 | |
# Group 1: a (possibly empty, non-greedy) run of characters; group 2: the
# first quote, backslash or control character (\x00-\x1f) that ends the run.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Character following a backslash -> the character it stands for.
BACKSLASH = {
    '"': u'"',
    '\\': u'\\',
    '/': u'/',
    'b': u'\b',
    'f': u'\f',
    'n': u'\n',
    'r': u'\r',
    't': u'\t',
}

# Encoding py_scanstring falls back to when it is called with encoding=None.
DEFAULT_ENCODING = "utf-8"
64 | |
_HEX_DIGITS = '0123456789abcdefABCDEF'


def _hex4(digits):
    """Return the value of exactly four hex digits, or None if malformed."""
    if len(digits) != 4:
        return None
    # int(x, 16) alone is too lenient: it also accepts '0x41', '+041',
    # surrounding whitespace and non-ASCII digits.
    for c in digits:
        if c not in _HEX_DIGITS:
            return None
    return int(digits, 16)


def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    ``encoding`` is used to decode non-unicode content and defaults to
    DEFAULT_ENCODING.  ``_b`` and ``_m`` are pre-bound lookups and are not
    meant to be passed by callers.

    A ``\\uXXXX`` escape must consist of exactly four hexadecimal digits.
    On builds where ``sys.maxunicode > 65535`` a high surrogate must be
    followed by a ``\\uXXXX`` low surrogate (0xdc00-0xdfff); anything else
    raises ValueError instead of producing an unrelated code point.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used in "unterminated string" errors.
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character %r at" % (terminator,)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # Unicode escape sequence: s[end] is the 'u', digits follow
            uni = _hex4(s[end + 1:end + 5])
            next_end = end + 5
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if not s[end + 5:end + 7] == '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _hex4(s[end + 7:end + 11])
                # The second half must be a well-formed low surrogate,
                # otherwise the arithmetic below yields a wrong character.
                if uni2 is None or not (0xdc00 <= uni2 <= 0xdfff):
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
140 | |
141 | |
# Prefer the C scanstring from _speedups; fall back to the pure-Python one
# when it is not available.
if c_scanstring:
    scanstring = c_scanstring
else:
    scanstring = py_scanstring

# The whitespace characters skipped between tokens: as a plain string for
# single-character membership tests, and as a pattern matching a whole run.
WHITESPACE_STR = ' \t\n\r'
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
147 | |
def JSONObject(s_and_end, encoding, strict, scan_once, object_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object and return ``(result, end)``.

    ``s_and_end`` is the 2-tuple ``(s, end)``: the document and the index at
    which member parsing starts (the first key's quote, whitespace, or the
    closing brace).  It is unpacked explicitly rather than in the signature,
    which behaves the same for positional callers and is also valid on
    interpreters without tuple parameters.

    ``encoding`` and ``strict`` are forwarded to ``scanstring`` for keys,
    ``scan_once(s, idx)`` parses one value and signals "no value here" by
    raising StopIteration.  ``object_hook``, when not None, is called with
    the resulting dict -- including the empty dict produced by ``{}`` -- and
    its return value is used as the result.  ``_w`` and ``_ws`` are
    pre-bound helpers and are not meant to be passed by callers.

    Raises ValueError for a missing property name, ``:`` or ``,`` delimiter,
    or value.  The returned index is the one just past the closing brace.
    """
    s, end = s_and_end
    pairs = {}
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            # The hook must see every decoded object, empty ones included.
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting : delimiter", s, end))

        end += 1

        # Skip whitespace after the colon; running off the end of s is left
        # for scan_once to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs[key] = value

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            # end was already advanced past nextchar, hence the - 1.
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        end += 1
        if nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end - 1))

    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
222 | |
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the elements of a JSON array and return ``(values, end)``.

    ``s_and_end`` is the 2-tuple ``(s, end)``: the document and the index at
    which element parsing starts.  It is unpacked explicitly rather than in
    the signature, which behaves the same for positional callers and is
    also valid on interpreters without tuple parameters.

    ``scan_once(s, idx)`` parses one value and signals "no value here" by
    raising StopIteration.  ``_w`` and ``_ws`` are pre-bound helpers and are
    not meant to be passed by callers.

    Raises ValueError when a value or a ``,`` delimiter is missing; the
    delimiter error points at the offending character itself.  The returned
    index is the one just past the closing bracket.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end was already advanced past nextchar; report the character
            # that was actually found, as JSONObject does.
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        # Skip whitespace after the comma; running off the end of s is left
        # for scan_once to report on the next iteration.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
258 | |
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    Default mapping from JSON to Python when decoding:

    * object        -> dict
    * array         -> list
    * string        -> unicode
    * number (int)  -> int, long
    * number (real) -> float
    * true          -> True
    * false         -> False
    * null          -> None

    ``NaN``, ``Infinity`` and ``-Infinity`` are also accepted and decoded to
    the corresponding ``float`` values, although the JSON spec does not
    include them.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True):
        """Configure the decoder.

        ``encoding`` is the encoding used to interpret ``str`` objects
        decoded by this instance (utf-8 by default); it has no effect on
        ``unicode`` input.  Only encodings that are a superset of ASCII
        currently work; text in other encodings should be passed in as
        ``unicode``.

        ``object_hook``, if given, is called with the result of every JSON
        object decoded, and its return value replaces the ``dict``.  This
        allows custom deserializations (e.g. JSON-RPC class hinting).

        ``parse_float``, if given, is called with the string of every JSON
        float to be decoded; the default is equivalent to float(num_str).
        Another datatype or parser (e.g. decimal.Decimal) may be used.

        ``parse_int``, if given, is called with the string of every JSON
        int to be decoded; the default is equivalent to int(num_str).
        Another datatype or parser (e.g. float) may be used.

        ``parse_constant``, if given, is called with one of the strings
        -Infinity, Infinity, NaN.  It can be used to raise an exception
        when invalid JSON numbers are encountered.

        ``strict`` is stored on the instance as ``self.strict``.

        """
        # Plain settings.
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        # Converters: fall back to the defaults when no override is given.
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        self.parse_int = parse_int or int
        self.parse_float = parse_float or float
        # Parsing primitives, set before the scanner is built from self.
        self.parse_string = scanstring
        self.parse_array = JSONArray
        self.parse_object = JSONObject
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Decode ``s`` (a ``str`` or ``unicode`` instance containing a JSON
        document) and return its Python representation.

        Whitespace before and after the document is skipped.  Raises
        ValueError if anything else follows the document.

        """
        start = _w(s, 0).end()
        obj, end = self.raw_decode(s, idx=start)
        end = _w(s, end).end()
        if end == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, end, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode one JSON document found in ``s`` (a ``str`` or ``unicode``)
        starting at index ``idx``, and return a 2-tuple of its Python
        representation and the index in ``s`` where the document ended.

        Unlike ``decode``, trailing data after the document is not an
        error, so this can be used on strings with extraneous data at the
        end.  Raises ValueError if no document could be decoded.

        """
        try:
            scanned = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        obj, end = scanned
        return obj, end
354 | return obj, end |