From bd634ff55d5bacb1767674f19297756323cdc5ae Mon Sep 17 00:00:00 2001 From: Jamie Cockburn Date: Wed, 7 Jun 2023 20:39:11 +0100 Subject: [PATCH 1/4] added buffered reading to tokenizer --- src/json_stream/tokenizer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/json_stream/tokenizer.py b/src/json_stream/tokenizer.py index 21b0bb4..7cb072f 100644 --- a/src/json_stream/tokenizer.py +++ b/src/json_stream/tokenizer.py @@ -365,9 +365,14 @@ def process_char(char): return advance, next_state state = State.WHITESPACE - c = stream.read(1) - index = 0 - while c: + buffer = stream.read(io.DEFAULT_BUFFER_SIZE) + c = None + index = -1 + advance = True + while buffer: + if advance: + c, buffer = buffer[0], buffer[1:] or stream.read(io.DEFAULT_BUFFER_SIZE) + index += 1 try: advance, state = process_char(c) except ValueError as e: @@ -376,9 +381,6 @@ def process_char(char): completed = False token = [] yield now_token - if advance: - c = stream.read(1) - index += 1 process_char(SpecialChar.EOF) if completed: yield now_token From 1a72977d56355d360f01dbea71e29e6cea0baec1 Mon Sep 17 00:00:00 2001 From: Jamie Cockburn Date: Mon, 12 Jun 2023 23:32:41 +0100 Subject: [PATCH 2/4] added json string stream --- .../{tokenizer.py => tokenizer/__init__.py} | 2 +- src/json_stream/tokenizer/strings.py | 128 ++++++++++++++ src/json_stream/tokenizer/tests/__init__.py | 0 .../tokenizer/tests/test_strings.py | 158 ++++++++++++++++++ .../{ => tokenizer}/tests/test_tokenizer.py | 4 + 5 files changed, 291 insertions(+), 1 deletion(-) rename src/json_stream/{tokenizer.py => tokenizer/__init__.py} (99%) create mode 100644 src/json_stream/tokenizer/strings.py create mode 100644 src/json_stream/tokenizer/tests/__init__.py create mode 100644 src/json_stream/tokenizer/tests/test_strings.py rename src/json_stream/{ => tokenizer}/tests/test_tokenizer.py (97%) diff --git a/src/json_stream/tokenizer.py b/src/json_stream/tokenizer/__init__.py similarity index 99% rename from src/json_stream/tokenizer.py rename to src/json_stream/tokenizer/__init__.py index 7cb072f..8f91b78 100644 --- a/src/json_stream/tokenizer.py +++ b/src/json_stream/tokenizer/__init__.py @@ -90,7 +90,7 @@ def is_delimiter(char): now_token = "" def process_char(char): - nonlocal token, completed, now_token, unicode_buffer + nonlocal completed, now_token, unicode_buffer advance = True add_char = False next_state = state diff --git a/src/json_stream/tokenizer/strings.py b/src/json_stream/tokenizer/strings.py new file mode 100644 index 0000000..a90636a --- /dev/null +++ b/src/json_stream/tokenizer/strings.py @@ -0,0 +1,128 @@ +import io +import unicodedata +from typing import Union + +from json_stream.tokenizer import State, SURROGATE + +STRING_ESCAPE_CODES = { + '\\': '\\', + '/': '/', + '"': '"', + 'b': '\b', + 'f': '\f', + 'n': '\n', + 't': '\t', + 'r': '\r' +} + + +class JsonStringReader(io.TextIOBase): + def __init__(self, stream: io.TextIOBase, initial_buffer=''): + self.stream = stream + self.buffer = initial_buffer + self.unicode_buffer = '' + self.state = State.STRING + self.complete = False + + def read(self, size: Union[int, None] = None) -> str: + result = '' + length = io.DEFAULT_BUFFER_SIZE + while not self.complete and (size is None or not result): + if size: + length = size - len(result) + result += self._read_chunk(length) + return result + + def _read_chunk(self, size: Union[int, None] = ...) 
-> str: + chunk = self.buffer or self.stream.read(size) + if not chunk: + raise ValueError("Unterminated string at end of file") + state = self.state + unicode_buffer = self.unicode_buffer + result = "" + start = 0 + for i, c in enumerate(chunk): + if i == size: + if state == State.STRING: + result += chunk[start:i] + self.buffer = chunk[i:] + break + if state == State.STRING: + if c == '"': + result += chunk[start:i] + self.complete = True + self.buffer = chunk[i + 1:] + break + elif c == "\\": + state = State.STRING_ESCAPE + result += chunk[start:i] + start = i + 1 + + elif state == State.STRING_ESCAPE: + char = STRING_ESCAPE_CODES.get(c) + start = i + 1 + if char: + result += char + state = State.STRING + elif c == 'u': + state = State.UNICODE + else: + raise ValueError("Invalid string escape: {}".format(c)) + + elif state == State.UNICODE: + unicode_buffer += c + start = i + 1 + if len(unicode_buffer) == 4: + try: + code_point = int(unicode_buffer, 16) + except ValueError: + raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer}") + char = chr(code_point) + if unicodedata.category(char) == SURROGATE: + state = State.UNICODE_SURROGATE_START + else: + result += char + unicode_buffer = '' + state = State.STRING + + elif state == State.UNICODE_SURROGATE_START: + if c == "\\": + state = State.UNICODE_SURROGATE_STRING_ESCAPE + start = i + 1 + else: + raise ValueError(f"Unpaired UTF-16 surrogate") + + elif state == State.UNICODE_SURROGATE_STRING_ESCAPE: + if c == "u": + state = State.UNICODE_SURROGATE + start = i + 1 + else: + raise ValueError(f"Unpaired UTF-16 surrogate") + + elif state == State.UNICODE_SURROGATE: + unicode_buffer += c + start = i + 1 + if len(unicode_buffer) == 8: + code_point_1 = int(unicode_buffer[:4], 16) + try: + code_point_2 = int(unicode_buffer[4:], 16) + except ValueError: + raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer[4:]}") + if unicodedata.category(chr(code_point_2)) != SURROGATE: + raise ValueError(f"Second half of UTF-16 surrogate pair is not a surrogate!") + try: + pair = int.to_bytes(code_point_1, 2, 'little') + int.to_bytes(code_point_2, 2, 'little') + result += pair.decode('utf-16-le') + except ValueError: + raise ValueError( + f"Error decoding UTF-16 surrogate pair \\u{unicode_buffer[:4]}\\u{unicode_buffer[4:]}" + ) + unicode_buffer = '' + state = State.STRING + else: + result += chunk[start:] + self.buffer = '' + + self.state = state + self.unicode_buffer = unicode_buffer + return result diff --git a/src/json_stream/tokenizer/tests/__init__.py b/src/json_stream/tokenizer/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/json_stream/tokenizer/tests/test_strings.py b/src/json_stream/tokenizer/tests/test_strings.py new file mode 100644 index 0000000..fc21a22 --- /dev/null +++ b/src/json_stream/tokenizer/tests/test_strings.py @@ -0,0 +1,158 @@ +import re +from io import StringIO +from unittest import TestCase + +from json_stream.tokenizer.strings import JsonStringReader + + +class TestJsonStringReader(TestCase): + def test_string_parsing(self): + self.assertStringEquals("word", r'"word"') + self.assertStringEquals("this char at end: Ȃ", r'"this char at end: \u0202"') + self.assertStringEquals("this char in middle: Ȃ.", r'"this char in middle: \u0202."') + + def test_empty_string(self): + self.assertStringEquals("", r'""') + + def test_escaping(self): + self.assertStringEquals("with\tescape", r'"with\tescape"') + self.assertStringEquals("with\n a different escape", r'"with\n a different escape"') + 
self.assertStringEquals("using a \bbackspace", r'"using a \bbackspace"') + self.assertStringEquals("now we have \f a formfeed", r'"now we have \f a formfeed"') + self.assertStringEquals('"a quote"', r'"\"a quote\""') + self.assertStringEquals("/", r'"\/"') + + def test_unicode_literal(self): + self.assertStringEquals('Ä', r'"\u00c4"') + self.assertStringEquals("꽸", r'"\uaf78"') + self.assertStringEquals("訋", r'"\u8A0b"') + self.assertStringEquals("돧", r'"\uB3e7"') + self.assertStringEquals("ዯ", r'"\u12eF"') + + def test_invalid_string_escape(self): + self.assertStringRaises(r'"\h"', "Invalid string escape: h") + self.assertStringRaises(r'"\2"', "Invalid string escape: 2") + self.assertStringRaises(r'"\!"', "Invalid string escape: !") + + def test_unicode_literal_truncated(self): + self.assertStringRaises(r'"\u00c"', re.escape(r'Invalid unicode literal: \u00c"')) + + def test_unicode_literal_bad_hex(self): + self.assertStringRaises(r'"\u00x4"', re.escape(r"Invalid unicode literal: \u00x4")) + + def test_unicode_surrogate_pair_literal(self): + self.assertStringEquals('𝄞', r'"\ud834\udd1e"') + + def test_unicode_surrogate_pair_unpaired(self): + self.assertStringRaises(r'"\ud834"', "Unpaired UTF-16 surrogate") + self.assertStringRaises(r'"\ud834', "Unterminated string at end of file") + self.assertStringRaises(r'"\ud834\x', "Unpaired UTF-16 surrogate") + self.assertStringRaises(r'"\ud834' + '\\', "Unterminated string at end of file") + + def test_unicode_surrogate_pair_non_surrogate(self): + self.assertStringRaises(r'"\ud834\u00c4"', "Second half of UTF-16 surrogate pair is not a surrogate!") + + def test_unicode_surrogate_pair_literal_truncated(self): + self.assertStringRaises(r'"\ud834\u00c"', re.escape(r'Invalid unicode literal: \u00c"')) + + def test_unicode_surrogate_pair_literal_bad_hex(self): + self.assertStringRaises(r'"\ud834\u00x4"', re.escape(r"Invalid unicode literal: \u00x4")) + + def test_unicode_surrogate_pair_literal_invalid(self): + message = re.escape(r"Error decoding UTF-16 surrogate pair \ud834\ud834") + self.assertStringRaises(r'"\ud834\ud834"', message) + + def test_unicode_surrogate_pair_literal_unterminated(self): + self.assertStringRaises(r'"\ud834\ud83', r"Unterminated string at end of file") + + def test_unterminated_strings(self): + self.assertStringRaises('"unterminated', "Unterminated string at end of file") + + def test_unterminated_strings_while_in_escape(self): + self.assertStringRaises(r'"\"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u!"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u!!"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u!!!', "Unterminated string at end of file") + + def test_with_initial_buffer(self): + self.assertStringEquals("there will be more string", buffer='"there will be ', stream='more string"') + + def test_remainder(self): + reader, f = self.assertStringEquals( + "after the string", + stream='"after the string"there is more stuff', + remaining_buffer='there is more stuff', + ) + self.assertRead(reader, f, '', remaining_buffer='there is more stuff') + + def test_remainder_read_past_end_of_string(self): + reader, f = self.assertStringEquals( + "after the string", + stream='"after the string"there is more stuff', + remaining_buffer='the', remaining_stream='re is more stuff', amount=20 + ) + self.assertRead(reader, f, '', remaining_buffer='the', remaining_stream='re is more stuff', amount=20) + 
+ def test_remainder_when_string_ends_after_initial_buffer(self): + reader, f = self.assertStringEquals( + "after the string", + buffer='"after the', stream=' string"there is more stuff', + remaining_buffer='there is more stuff', + ) + self.assertRead(reader, f, '', remaining_buffer='there is more stuff') + + def test_remainder_when_string_ends_within_initial_buffer(self): + reader, f = self.assertStringEquals( + "after the string", + buffer='"after the string"there', stream=' is more stuff', + remaining_buffer='there', remaining_stream=' is more stuff', + ) + self.assertRead(reader, f, '', remaining_buffer='there', remaining_stream=' is more stuff') + + def test_read_part_shorter_initial_buffer(self): + reader, f = self.assertStringEquals( + "there", + buffer='"there will be ', stream='more string"', + remaining_buffer=' will be ', remaining_stream='more string"', amount=5, complete=False, + ) + self.assertRead(reader, f, ' will be more string') + + def test_read_part_longer_than_initial_buffer(self): + reader, f = self.assertStringEquals( + "there will be ", + buffer='"there will be ', stream='more string"', + remaining_buffer='', remaining_stream='more string"', amount=20, complete=False, + ) + self.assertRead(reader, f, 'more string') + + def test_read_over_split_escape(self): + json = r'"abcde\u00c4edcba"' + for i in range(len(json)): + buffer, stream = json[:i], json[i:] + self.assertStringEquals("abcdeÄedcba", buffer=buffer, stream=stream) + + def assertStringEquals(self, result, stream, buffer='', remaining_buffer='', remaining_stream='', amount=None, + complete=True): + if buffer: + buffer = buffer[1:] + else: + stream = stream[1:] + f = StringIO(stream) + reader = JsonStringReader(f, buffer) + self.assertRead(reader, f, result, remaining_buffer, remaining_stream, amount, complete) + return reader, f + + def assertRead(self, reader, stream, result, remaining_buffer='', remaining_stream='', amount=None, complete=True): + self.assertEqual(result, reader.read(amount)) + self.assertEqual(remaining_buffer, reader.buffer) + pos = stream.tell() + self.assertEqual(remaining_stream, stream.read()) + stream.seek(pos) + self.assertEqual(complete, reader.complete) + + def assertStringRaises(self, s, error): + stream = StringIO(s[1:]) + f = JsonStringReader(stream) + with self.assertRaisesRegex(ValueError, error): + f.read() diff --git a/src/json_stream/tests/test_tokenizer.py b/src/json_stream/tokenizer/tests/test_tokenizer.py similarity index 97% rename from src/json_stream/tests/test_tokenizer.py rename to src/json_stream/tokenizer/tests/test_tokenizer.py index e27b4a7..61a6068 100644 --- a/src/json_stream/tests/test_tokenizer.py +++ b/src/json_stream/tokenizer/tests/test_tokenizer.py @@ -199,3 +199,7 @@ def test_unicode_surrogate_pair_literal_invalid(self): def test_unicode_surrogate_pair_literal_unterminated(self): with self.assertRaisesRegex(ValueError, r"Unterminated unicode literal at end of file"): list(tokenize(StringIO(r'"\ud834\ud83'))) + + def test_unicode_surrogate_pair_literal_unterminated_first_half(self): + with self.assertRaisesRegex(ValueError, r"Unterminated unicode literal at end of file"): + list(tokenize(StringIO(r'"\ud83'))) From 4ee2bbad6977ad75e1d04519b7e8f477cb8c1919 Mon Sep 17 00:00:00 2001 From: Jamie Cockburn Date: Mon, 12 Jun 2023 23:55:25 +0100 Subject: [PATCH 3/4] integrated json string reader with tokenizer --- src/json_stream/tokenizer/__init__.py | 189 +++++------------- src/json_stream/tokenizer/strings.py | 48 +++-- .../tokenizer/tests/test_tokenizer.py | 
10 +- 3 files changed, 88 insertions(+), 159 deletions(-) diff --git a/src/json_stream/tokenizer/__init__.py b/src/json_stream/tokenizer/__init__.py index 8f91b78..6295dac 100644 --- a/src/json_stream/tokenizer/__init__.py +++ b/src/json_stream/tokenizer/__init__.py @@ -6,9 +6,9 @@ Copyright (c) 2019 Daniel Yule """ import io -import unicodedata +from typing import Optional, Tuple -SURROGATE = 'Cs' +from json_stream.tokenizer.strings import JsonStringReader class TokenType: @@ -29,7 +29,6 @@ class State: FLOATING_POINT_0 = 6 FLOATING_POINT = 8 STRING = 9 - STRING_ESCAPE = 10 STRING_END = 11 TRUE_1 = 12 TRUE_2 = 13 @@ -41,10 +40,6 @@ class State: NULL_1 = 19 NULL_2 = 20 NULL_3 = 21 - UNICODE = 22 - UNICODE_SURROGATE_START = 23 - UNICODE_SURROGATE_STRING_ESCAPE = 24 - UNICODE_SURROGATE = 25 class SpecialChar: @@ -78,22 +73,20 @@ def _ensure_text(stream): return stream -def tokenize(stream): +def tokenize(stream, strings_as_streams=False): stream = _ensure_text(stream) def is_delimiter(char): return char.isspace() or char in "{}[]:," or char == SpecialChar.EOF token = [] - unicode_buffer = "" completed = False - now_token = "" + now_token: Optional[Tuple] = None def process_char(char): - nonlocal completed, now_token, unicode_buffer + nonlocal completed, now_token, state, buffer, index advance = True add_char = False - next_state = state if state == State.WHITESPACE: if char == "{": completed = True @@ -113,36 +106,40 @@ def process_char(char): elif char == ":": completed = True now_token = (TokenType.OPERATOR, ":") - elif char == "\"": - next_state = State.STRING + elif char == '"': + state = State.STRING + now_token = (TokenType.STRING, JsonStringReader(stream, buffer)) + if strings_as_streams: + completed = True + advance = False elif char in "123456789": - next_state = State.INTEGER + state = State.INTEGER add_char = True elif char == "0": - next_state = State.INTEGER_0 + state = State.INTEGER_0 add_char = True elif char == "-": - next_state = State.INTEGER_SIGN + state = State.INTEGER_SIGN add_char = True elif char == "f": - next_state = State.FALSE_1 + state = State.FALSE_1 elif char == "t": - next_state = State.TRUE_1 + state = State.TRUE_1 elif char == "n": - next_state = State.NULL_1 + state = State.NULL_1 elif not char.isspace() and not char == SpecialChar.EOF: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.INTEGER: if char in "0123456789": add_char = True elif char == ".": - next_state = State.FLOATING_POINT_0 + state = State.FLOATING_POINT_0 add_char = True elif char == "e" or char == 'E': - next_state = State.INTEGER_EXP_0 + state = State.INTEGER_EXP_0 add_char = True elif is_delimiter(char): - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.NUMBER, int("".join(token))) advance = False @@ -150,13 +147,13 @@ def process_char(char): raise ValueError("A number must contain only digits. Got '{}'".format(char)) elif state == State.INTEGER_0: if char == ".": - next_state = State.FLOATING_POINT_0 + state = State.FLOATING_POINT_0 add_char = True elif char == "e" or char == 'E': - next_state = State.INTEGER_EXP_0 + state = State.INTEGER_EXP_0 add_char = True elif is_delimiter(char): - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.NUMBER, 0) advance = False @@ -164,16 +161,16 @@ def process_char(char): raise ValueError("A 0 must be followed by a '.' or a 'e'. 
Got '{0}'".format(char)) elif state == State.INTEGER_SIGN: if char == "0": - next_state = State.INTEGER_0 + state = State.INTEGER_0 add_char = True elif char in "123456789": - next_state = State.INTEGER + state = State.INTEGER add_char = True else: raise ValueError("A - must be followed by a digit. Got '{0}'".format(char)) elif state == State.INTEGER_EXP_0: if char == "+" or char == "-" or char in "0123456789": - next_state = State.INTEGER_EXP + state = State.INTEGER_EXP add_char = True else: raise ValueError("An e in a number must be followed by a '+', '-' or digit. Got '{0}'".format(char)) @@ -183,7 +180,7 @@ def process_char(char): elif is_delimiter(char): completed = True now_token = (TokenType.NUMBER, float("".join(token))) - next_state = State.WHITESPACE + state = State.WHITESPACE advance = False else: raise ValueError("A number exponent must consist only of digits. Got '{}'".format(char)) @@ -191,179 +188,100 @@ def process_char(char): if char in "0123456789": add_char = True elif char == "e" or char == "E": - next_state = State.INTEGER_EXP_0 + state = State.INTEGER_EXP_0 add_char = True elif is_delimiter(char): completed = True now_token = (TokenType.NUMBER, float("".join(token))) - next_state = State.WHITESPACE + state = State.WHITESPACE advance = False else: raise ValueError("A number must include only digits") elif state == State.FLOATING_POINT_0: if char in "0123456789": - next_state = State.FLOATING_POINT + state = State.FLOATING_POINT add_char = True else: raise ValueError("A number with a decimal point must be followed by a fractional part") elif state == State.FALSE_1: if char == "a": - next_state = State.FALSE_2 + state = State.FALSE_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.FALSE_2: if char == "l": - next_state = State.FALSE_3 + state = State.FALSE_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.FALSE_3: if char == "s": - next_state = State.FALSE_4 + state = State.FALSE_4 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.FALSE_4: if char == "e": - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.BOOLEAN, False) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.TRUE_1: if char == "r": - next_state = State.TRUE_2 + state = State.TRUE_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.TRUE_2: if char == "u": - next_state = State.TRUE_3 + state = State.TRUE_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.TRUE_3: if char == "e": - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.BOOLEAN, True) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.NULL_1: if char == "u": - next_state = State.NULL_2 + state = State.NULL_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.NULL_2: if char == "l": - next_state = State.NULL_3 + state = State.NULL_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.NULL_3: if char == "l": - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.NULL, None) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.STRING: - if char == "\"": + reader: JsonStringReader = now_token[1] + try: + s = reader.read() + finally: + index 
+= reader.index + if not strings_as_streams: + now_token = (TokenType.STRING, s) completed = True - now_token = (TokenType.STRING, "".join(token)) - next_state = State.STRING_END - elif char == "\\": - next_state = State.STRING_ESCAPE - elif char == SpecialChar.EOF: - raise ValueError("Unterminated string at end of file") - else: - add_char = True + buffer = reader.buffer + state = State.STRING_END elif state == State.STRING_END: if is_delimiter(char): advance = False - next_state = State.WHITESPACE + state = State.WHITESPACE else: raise ValueError("Expected whitespace or an operator after string. Got '{}'".format(char)) - elif state == State.STRING_ESCAPE: - next_state = State.STRING - if char == "\\" or char == "\"": - add_char = True - elif char == "b": - char = "\b" - add_char = True - elif char == "f": - char = "\f" - add_char = True - elif char == "n": - char = "\n" - add_char = True - elif char == "t": - char = "\t" - add_char = True - elif char == "r": - char = "\r" - add_char = True - elif char == "/": - char = "/" - add_char = True - elif char == "u": - next_state = State.UNICODE - unicode_buffer = "" - else: - raise ValueError("Invalid string escape: {}".format(char)) - elif state == State.UNICODE: - if char == SpecialChar.EOF: - raise ValueError('Unterminated unicode literal at end of file') - unicode_buffer += char - if len(unicode_buffer) == 4: - try: - code_point = int(unicode_buffer, 16) - except ValueError: - raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer}") - char = chr(code_point) - if unicodedata.category(char) == SURROGATE: - next_state = State.UNICODE_SURROGATE_START - else: - next_state = State.STRING - add_char = True - elif state == State.UNICODE_SURROGATE_START: - if char == "\\": - next_state = State.UNICODE_SURROGATE_STRING_ESCAPE - elif char == SpecialChar.EOF: - raise ValueError("Unpaired UTF-16 surrogate at end of file") - else: - raise ValueError(f"Unpaired UTF-16 surrogate") - - elif state == State.UNICODE_SURROGATE_STRING_ESCAPE: - if char == "u": - next_state = State.UNICODE_SURROGATE - elif char == SpecialChar.EOF: - raise ValueError("Unpaired UTF-16 surrogate at end of file") - else: - raise ValueError(f"Unpaired UTF-16 surrogate") - - elif state == State.UNICODE_SURROGATE: - if char == SpecialChar.EOF: - raise ValueError('Unterminated unicode literal at end of file') - unicode_buffer += char - if len(unicode_buffer) == 8: - code_point_1 = int(unicode_buffer[:4], 16) - try: - code_point_2 = int(unicode_buffer[4:], 16) - except ValueError: - raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer[4:]}") - char = chr(code_point_2) - if unicodedata.category(char) != SURROGATE: - raise ValueError(f"Second half of UTF-16 surrogate pair is not a surrogate!") - try: - pair = int.to_bytes(code_point_1, 2, 'little') + int.to_bytes(code_point_2, 2, 'little') - char = pair.decode('utf-16-le') - except ValueError: - raise ValueError( - f"Error decoding UTF-16 surrogate pair \\u{unicode_buffer[:4]}\\u{unicode_buffer[4:]}" - ) - next_state = State.STRING - add_char = True if add_char: token.append(char) - return advance, next_state + return advance + state = State.WHITESPACE buffer = stream.read(io.DEFAULT_BUFFER_SIZE) c = None @@ -374,13 +292,14 @@ def process_char(char): c, buffer = buffer[0], buffer[1:] or stream.read(io.DEFAULT_BUFFER_SIZE) index += 1 try: - advance, state = process_char(c) + advance = process_char(c) except ValueError as e: raise ValueError("".join([e.args[0], " at index {}".format(index)])) if completed: completed = False 
token = [] yield now_token + process_char(SpecialChar.EOF) if completed: yield now_token diff --git a/src/json_stream/tokenizer/strings.py b/src/json_stream/tokenizer/strings.py index a90636a..41fb55d 100644 --- a/src/json_stream/tokenizer/strings.py +++ b/src/json_stream/tokenizer/strings.py @@ -1,8 +1,7 @@ import io import unicodedata from typing import Union - -from json_stream.tokenizer import State, SURROGATE +from io import DEFAULT_BUFFER_SIZE STRING_ESCAPE_CODES = { '\\': '\\', @@ -15,18 +14,28 @@ 'r': '\r' } +SURROGATE = 'Cs' + +CHAR = 1 +STRING_ESCAPE = 2 +UNICODE = 4 +UNICODE_SURROGATE_START = 5 +UNICODE_SURROGATE_STRING_ESCAPE = 6 +UNICODE_SURROGATE = 7 + class JsonStringReader(io.TextIOBase): def __init__(self, stream: io.TextIOBase, initial_buffer=''): self.stream = stream self.buffer = initial_buffer self.unicode_buffer = '' - self.state = State.STRING + self.state = CHAR self.complete = False + self.index = 0 def read(self, size: Union[int, None] = None) -> str: result = '' - length = io.DEFAULT_BUFFER_SIZE + length = DEFAULT_BUFFER_SIZE while not self.complete and (size is None or not result): if size: length = size - len(result) @@ -42,34 +51,35 @@ def _read_chunk(self, size: Union[int, None] = ...) -> str: result = "" start = 0 for i, c in enumerate(chunk): + self.index += 1 if i == size: - if state == State.STRING: + if state == CHAR: result += chunk[start:i] self.buffer = chunk[i:] break - if state == State.STRING: + if state == CHAR: if c == '"': result += chunk[start:i] self.complete = True self.buffer = chunk[i + 1:] break elif c == "\\": - state = State.STRING_ESCAPE + state = STRING_ESCAPE result += chunk[start:i] start = i + 1 - elif state == State.STRING_ESCAPE: + elif state == STRING_ESCAPE: char = STRING_ESCAPE_CODES.get(c) start = i + 1 if char: result += char - state = State.STRING + state = CHAR elif c == 'u': - state = State.UNICODE + state = UNICODE else: raise ValueError("Invalid string escape: {}".format(c)) - elif state == State.UNICODE: + elif state == UNICODE: unicode_buffer += c start = i + 1 if len(unicode_buffer) == 4: @@ -79,27 +89,27 @@ def _read_chunk(self, size: Union[int, None] = ...) -> str: raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer}") char = chr(code_point) if unicodedata.category(char) == SURROGATE: - state = State.UNICODE_SURROGATE_START + state = UNICODE_SURROGATE_START else: result += char unicode_buffer = '' - state = State.STRING + state = CHAR - elif state == State.UNICODE_SURROGATE_START: + elif state == UNICODE_SURROGATE_START: if c == "\\": - state = State.UNICODE_SURROGATE_STRING_ESCAPE + state = UNICODE_SURROGATE_STRING_ESCAPE start = i + 1 else: raise ValueError(f"Unpaired UTF-16 surrogate") - elif state == State.UNICODE_SURROGATE_STRING_ESCAPE: + elif state == UNICODE_SURROGATE_STRING_ESCAPE: if c == "u": - state = State.UNICODE_SURROGATE + state = UNICODE_SURROGATE start = i + 1 else: raise ValueError(f"Unpaired UTF-16 surrogate") - elif state == State.UNICODE_SURROGATE: + elif state == UNICODE_SURROGATE: unicode_buffer += c start = i + 1 if len(unicode_buffer) == 8: @@ -118,7 +128,7 @@ def _read_chunk(self, size: Union[int, None] = ...) 
-> str: f"Error decoding UTF-16 surrogate pair \\u{unicode_buffer[:4]}\\u{unicode_buffer[4:]}" ) unicode_buffer = '' - state = State.STRING + state = CHAR else: result += chunk[start:] self.buffer = '' diff --git a/src/json_stream/tokenizer/tests/test_tokenizer.py b/src/json_stream/tokenizer/tests/test_tokenizer.py index 61a6068..141755f 100644 --- a/src/json_stream/tokenizer/tests/test_tokenizer.py +++ b/src/json_stream/tokenizer/tests/test_tokenizer.py @@ -90,7 +90,7 @@ def test_string_parsing(self): self.tokenize_sequence(r'"\2"') with self.assertRaisesRegex(ValueError, "Invalid string escape: ! at index 2"): self.tokenize_sequence(r'"\!"') - with self.assertRaisesRegex(ValueError, "Unterminated unicode literal at end of file"): + with self.assertRaisesRegex(ValueError, "Unterminated string at end of file at index 4"): self.tokenize_sequence(r'"\u!"') def test_unterminated_strings(self): @@ -172,11 +172,11 @@ def test_unicode_surrogate_pair_literal(self): def test_unicode_surrogate_pair_unpaired(self): with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at index 7"): list(tokenize(StringIO(r'"\ud834"'))) - with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at end of file"): + with self.assertRaisesRegex(ValueError, "Unterminated string at end of file"): list(tokenize(StringIO(r'"\ud834'))) with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at index 8"): list(tokenize(StringIO(r'"\ud834\x'))) - with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at end of file"): + with self.assertRaisesRegex(ValueError, "Unterminated string at end of file"): list(tokenize(StringIO(r'"\ud834' + '\\'))) def test_unicode_surrogate_pair_non_surrogate(self): @@ -197,9 +197,9 @@ def test_unicode_surrogate_pair_literal_invalid(self): list(tokenize(StringIO(r'"\ud834\ud834"'))) def test_unicode_surrogate_pair_literal_unterminated(self): - with self.assertRaisesRegex(ValueError, r"Unterminated unicode literal at end of file"): + with self.assertRaisesRegex(ValueError, r"Unterminated string at end of file at index 11"): list(tokenize(StringIO(r'"\ud834\ud83'))) def test_unicode_surrogate_pair_literal_unterminated_first_half(self): - with self.assertRaisesRegex(ValueError, r"Unterminated unicode literal at end of file"): + with self.assertRaisesRegex(ValueError, r"Unterminated string at end of file"): list(tokenize(StringIO(r'"\ud83'))) From b75d9167d3f8033f88a5a749352076da7940ceda Mon Sep 17 00:00:00 2001 From: Jamie Cockburn Date: Tue, 13 Jun 2023 19:56:55 +0100 Subject: [PATCH 4/4] simple performance improvements --- src/json_stream/base.py | 143 +++++++----- src/json_stream/loader.py | 7 +- src/json_stream/tokenizer/__init__.py | 214 +++++++++--------- .../tokenizer/tests/test_tokenizer.py | 8 +- src/json_stream/visitor.py | 4 +- 5 files changed, 201 insertions(+), 175 deletions(-) diff --git a/src/json_stream/base.py b/src/json_stream/base.py index 8e03bc8..f40ce2c 100644 --- a/src/json_stream/base.py +++ b/src/json_stream/base.py @@ -1,65 +1,51 @@ -import collections import copy -from abc import ABC -from collections import OrderedDict +from collections import OrderedDict, deque from itertools import chain from typing import Optional, Iterator, Any -from json_stream.tokenizer import TokenType +from json_stream.tokenizer import ( + OPERATOR, + STRING, +) + +COLON = (OPERATOR, ":") class TransientAccessException(Exception): pass -class StreamingJSONBase(ABC): +class StreamingJSONBase(object): INCOMPLETE_ERROR = "Unexpected end of file" - 
@classmethod - def factory(cls, token, token_stream, persistent): - if persistent: - if token == '{': - return PersistentStreamingJSONObject(token_stream) - if token == '[': - return PersistentStreamingJSONList(token_stream) - else: - if token == '{': - return TransientStreamingJSONObject(token_stream) - if token == '[': - return TransientStreamingJSONList(token_stream) - raise ValueError(f"Unknown operator {token}") # pragma: no cover - - _persistent_children: bool + __slots__ = '_persistent_children', '_stream', '_child', 'streaming' def __init__(self, token_stream): + # this is inlined in subclasses self.streaming = True self._stream = token_stream self._child: Optional[StreamingJSONBase] = None - def _clear_child(self): - if self._child is not None: - self._child.read_all() - self._child = None - def _iter_items(self): + if not self.streaming: + return + load = self._load_item while True: - if not self.streaming: - return - self._clear_child() + # clear child + if self._child is not None: + # inlined from read_all() + deque(self._child._iter_items(), maxlen=0) + self._child = None + try: - item = self._load_item() + yield load() except StopIteration: if self.streaming: raise ValueError(self.INCOMPLETE_ERROR) return - yield item - - def _done(self): - self.streaming = False - raise StopIteration() def read_all(self): - collections.deque(self._iter_items(), maxlen=0) + deque(self._iter_items(), maxlen=0) def _load_item(self): raise NotImplementedError() # pragma: no cover @@ -83,9 +69,15 @@ def __deepcopy__(self, memo): raise copy.Error("Copying json_steam objects leads to a bad time") -class PersistentStreamingJSONBase(StreamingJSONBase, ABC): +class PersistentStreamingJSONBase(StreamingJSONBase): + __slots__ = '_data' + def __init__(self, token_stream): - super().__init__(token_stream) + # inlined from super + self.streaming = True + self._stream = token_stream + self._child: Optional[StreamingJSONBase] = None + self._data = self._init_persistent_data() self._persistent_children = True @@ -107,9 +99,15 @@ def __repr__(self): # pragma: no cover return f"<{type(self).__name__}: {repr(self._data)}, {'STREAMING' if self.streaming else 'DONE'}>" -class TransientStreamingJSONBase(StreamingJSONBase, ABC): +class TransientStreamingJSONBase(StreamingJSONBase): + __slots__ = '_started', + def __init__(self, token_stream): - super().__init__(token_stream) + # inlined from super + self.streaming = True + self._stream = token_stream + self._child: Optional[StreamingJSONBase] = None + self._started = False self._persistent_children = False @@ -137,22 +135,26 @@ def __repr__(self): # pragma: no cover return f"<{type(self).__name__}: TRANSIENT, {'STREAMING' if self.streaming else 'DONE'}>" -class StreamingJSONList(StreamingJSONBase, ABC): +class StreamingJSONList(StreamingJSONBase): INCOMPLETE_ERROR = "Unterminated list at end of file" + __slots__ = () + def _load_item(self): - token_type, v = next(self._stream) - if token_type == TokenType.OPERATOR: + stream = self._stream + token_type, v = next(stream) + if token_type == OPERATOR: if v == ']': - self._done() + self.streaming = False + raise StopIteration() if v == ',': - token_type, v = next(self._stream) + token_type, v = next(stream) elif v in '{[': pass else: # pragma: no cover raise ValueError(f"Expecting value, comma or ], got {v}") - if token_type == TokenType.OPERATOR: - self._child = v = self.factory(v, self._stream, self._persistent_children) + if token_type == OPERATOR: + self._child = v = factory[self._persistent_children, v](stream) 
return v def _get__iter__(self): @@ -160,6 +162,8 @@ def _get__iter__(self): class PersistentStreamingJSONList(PersistentStreamingJSONBase, StreamingJSONList): + __slots__ = () + def _init_persistent_data(self): return [] @@ -185,8 +189,16 @@ def __getitem__(self, k) -> Any: class TransientStreamingJSONList(TransientStreamingJSONBase, StreamingJSONList): + __slots__ = "_index", + def __init__(self, token_stream): - super().__init__(token_stream) + # inlined from super + self.streaming = True + self._stream = token_stream + self._child: Optional[StreamingJSONBase] = None + self._started = False + self._persistent_children = False + self._index = -1 def _load_item(self): @@ -203,26 +215,29 @@ def _find_item(self, i): raise IndexError(f"Index {i} out of range") -class StreamingJSONObject(StreamingJSONBase, ABC): +class StreamingJSONObject(StreamingJSONBase): INCOMPLETE_ERROR = "Unterminated object at end of file" + __slots__ = () + def _load_item(self): - token_type, k = next(self._stream) - if token_type == TokenType.OPERATOR: + stream = self._stream + token_type, k = next(stream) + if token_type == OPERATOR: if k == '}': - self._done() + self.streaming = False + raise StopIteration() if k == ',': - token_type, k = next(self._stream) - if token_type != TokenType.STRING: # pragma: no cover + token_type, k = next(stream) + if token_type != STRING: # pragma: no cover raise ValueError(f"Expecting string, comma or }}, got {k} ({token_type})") - token_type, token = next(self._stream) - if token_type != TokenType.OPERATOR or token != ":": + if next(stream) != COLON: raise ValueError("Expecting :") # pragma: no cover - token_type, v = next(self._stream) - if token_type == TokenType.OPERATOR: - self._child = v = self.factory(v, self._stream, self._persistent_children) + token_type, v = next(stream) + if token_type == OPERATOR: + self._child = v = factory[self._persistent_children, v](stream) return k, v def _get__iter__(self): @@ -251,6 +266,8 @@ def get(self, k, default=None) -> Any: class PersistentStreamingJSONObject(PersistentStreamingJSONBase, StreamingJSONObject): + __slots__ = () + def _init_persistent_data(self): return OrderedDict() @@ -277,6 +294,8 @@ def __getitem__(self, k) -> Any: class TransientStreamingJSONObject(TransientStreamingJSONBase, StreamingJSONObject): + __slots__ = () + def _find_item(self, k): was_started = self._started try: @@ -299,3 +318,11 @@ def keys(self): def values(self): self._check_started() return (v for k, v in self._iter_items()) + + +factory = { + (True, '{'): PersistentStreamingJSONObject, + (True, '['): PersistentStreamingJSONList, + (False, '{'): TransientStreamingJSONObject, + (False, '['): TransientStreamingJSONList, +} diff --git a/src/json_stream/loader.py b/src/json_stream/loader.py index 680e801..3265a3d 100644 --- a/src/json_stream/loader.py +++ b/src/json_stream/loader.py @@ -1,12 +1,13 @@ -from json_stream.base import StreamingJSONBase, TokenType +from json_stream.base import factory from json_stream.iterators import ensure_file from json_stream.select_tokenizer import default_tokenizer +from json_stream.tokenizer import OPERATOR def load(fp_or_iterable, persistent=False, tokenizer=default_tokenizer): fp = ensure_file(fp_or_iterable) token_stream = tokenizer(fp) token_type, token = next(token_stream) - if token_type == TokenType.OPERATOR: - return StreamingJSONBase.factory(token, token_stream, persistent) + if token_type == OPERATOR: + return factory[persistent, token](token_stream) return token diff --git a/src/json_stream/tokenizer/__init__.py 
b/src/json_stream/tokenizer/__init__.py index 6295dac..c5c1c20 100644 --- a/src/json_stream/tokenizer/__init__.py +++ b/src/json_stream/tokenizer/__init__.py @@ -11,45 +11,43 @@ from json_stream.tokenizer.strings import JsonStringReader -class TokenType: - OPERATOR = 0 - STRING = 1 - NUMBER = 2 - BOOLEAN = 3 - NULL = 4 +# TokenType +OPERATOR = 0 +STRING = 1 +NUMBER = 2 +BOOLEAN = 3 +NULL = 4 +# State +WHITESPACE = 0 +INTEGER_0 = 1 +INTEGER_SIGN = 2 +INTEGER = 3 +INTEGER_EXP = 4 +INTEGER_EXP_0 = 5 +FLOATING_POINT_0 = 6 +FLOATING_POINT = 8 +STR = 9 +STR_END = 11 +TRUE_1 = 12 +TRUE_2 = 13 +TRUE_3 = 14 +FALSE_1 = 15 +FALSE_2 = 16 +FALSE_3 = 17 +FALSE_4 = 18 +NULL_1 = 19 +NULL_2 = 20 +NULL_3 = 21 -class State: - WHITESPACE = 0 - INTEGER_0 = 1 - INTEGER_SIGN = 2 - INTEGER = 3 - INTEGER_EXP = 4 - INTEGER_EXP_0 = 5 - FLOATING_POINT_0 = 6 - FLOATING_POINT = 8 - STRING = 9 - STRING_END = 11 - TRUE_1 = 12 - TRUE_2 = 13 - TRUE_3 = 14 - FALSE_1 = 15 - FALSE_2 = 16 - FALSE_3 = 17 - FALSE_4 = 18 - NULL_1 = 19 - NULL_2 = 20 - NULL_3 = 21 - - -class SpecialChar: - # Kind of a hack but simple: if we used the empty string "" to represent - # EOF, expressions like `char in "0123456789"` would be true for EOF, which - # is confusing. If we used a non-string, they would result in TypeErrors. - # By using the string "EOF", they work as expected. The only thing we have - # to be careful about is to not ever use "EOF" in any such strings used for - # char membership checking, which we have no reason to do anyway. - EOF = "EOF" +# SpecialChar +# Kind of a hack but simple: if we used the empty string "" to represent +# EOF, expressions like `char in "0123456789"` would be true for EOF, which +# is confusing. If we used a non-string, they would result in TypeErrors. +# By using the string "EOF", they work as expected. The only thing we have +# to be careful about is to not ever use "EOF" in any such strings used for +# char membership checking, which we have no reason to do anyway. 
+EOF = "EOF" def _guess_encoding(stream): @@ -77,7 +75,7 @@ def tokenize(stream, strings_as_streams=False): stream = _ensure_text(stream) def is_delimiter(char): - return char.isspace() or char in "{}[]:," or char == SpecialChar.EOF + return char.isspace() or char in "{}[]:," or char == EOF token = [] completed = False @@ -87,193 +85,193 @@ def process_char(char): nonlocal completed, now_token, state, buffer, index advance = True add_char = False - if state == State.WHITESPACE: + if state == WHITESPACE: if char == "{": completed = True - now_token = (TokenType.OPERATOR, "{") + now_token = (OPERATOR, "{") elif char == "}": completed = True - now_token = (TokenType.OPERATOR, "}") + now_token = (OPERATOR, "}") elif char == "[": completed = True - now_token = (TokenType.OPERATOR, "[") + now_token = (OPERATOR, "[") elif char == "]": completed = True - now_token = (TokenType.OPERATOR, "]") + now_token = (OPERATOR, "]") elif char == ",": completed = True - now_token = (TokenType.OPERATOR, ",") + now_token = (OPERATOR, ",") elif char == ":": completed = True - now_token = (TokenType.OPERATOR, ":") + now_token = (OPERATOR, ":") elif char == '"': - state = State.STRING - now_token = (TokenType.STRING, JsonStringReader(stream, buffer)) + state = STR + now_token = (STRING, JsonStringReader(stream, buffer)) if strings_as_streams: completed = True advance = False elif char in "123456789": - state = State.INTEGER + state = INTEGER add_char = True elif char == "0": - state = State.INTEGER_0 + state = INTEGER_0 add_char = True elif char == "-": - state = State.INTEGER_SIGN + state = INTEGER_SIGN add_char = True elif char == "f": - state = State.FALSE_1 + state = FALSE_1 elif char == "t": - state = State.TRUE_1 + state = TRUE_1 elif char == "n": - state = State.NULL_1 - elif not char.isspace() and not char == SpecialChar.EOF: + state = NULL_1 + elif not char.isspace() and not char == EOF: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.INTEGER: + elif state == INTEGER: if char in "0123456789": add_char = True elif char == ".": - state = State.FLOATING_POINT_0 + state = FLOATING_POINT_0 add_char = True elif char == "e" or char == 'E': - state = State.INTEGER_EXP_0 + state = INTEGER_EXP_0 add_char = True elif is_delimiter(char): - state = State.WHITESPACE + state = WHITESPACE completed = True - now_token = (TokenType.NUMBER, int("".join(token))) + now_token = (NUMBER, int("".join(token))) advance = False else: raise ValueError("A number must contain only digits. Got '{}'".format(char)) - elif state == State.INTEGER_0: + elif state == INTEGER_0: if char == ".": - state = State.FLOATING_POINT_0 + state = FLOATING_POINT_0 add_char = True elif char == "e" or char == 'E': - state = State.INTEGER_EXP_0 + state = INTEGER_EXP_0 add_char = True elif is_delimiter(char): - state = State.WHITESPACE + state = WHITESPACE completed = True - now_token = (TokenType.NUMBER, 0) + now_token = (NUMBER, 0) advance = False else: raise ValueError("A 0 must be followed by a '.' or a 'e'. Got '{0}'".format(char)) - elif state == State.INTEGER_SIGN: + elif state == INTEGER_SIGN: if char == "0": - state = State.INTEGER_0 + state = INTEGER_0 add_char = True elif char in "123456789": - state = State.INTEGER + state = INTEGER add_char = True else: raise ValueError("A - must be followed by a digit. 
Got '{0}'".format(char)) - elif state == State.INTEGER_EXP_0: + elif state == INTEGER_EXP_0: if char == "+" or char == "-" or char in "0123456789": - state = State.INTEGER_EXP + state = INTEGER_EXP add_char = True else: raise ValueError("An e in a number must be followed by a '+', '-' or digit. Got '{0}'".format(char)) - elif state == State.INTEGER_EXP: + elif state == INTEGER_EXP: if char in "0123456789": add_char = True elif is_delimiter(char): completed = True - now_token = (TokenType.NUMBER, float("".join(token))) - state = State.WHITESPACE + now_token = (NUMBER, float("".join(token))) + state = WHITESPACE advance = False else: raise ValueError("A number exponent must consist only of digits. Got '{}'".format(char)) - elif state == State.FLOATING_POINT: + elif state == FLOATING_POINT: if char in "0123456789": add_char = True elif char == "e" or char == "E": - state = State.INTEGER_EXP_0 + state = INTEGER_EXP_0 add_char = True elif is_delimiter(char): completed = True - now_token = (TokenType.NUMBER, float("".join(token))) - state = State.WHITESPACE + now_token = (NUMBER, float("".join(token))) + state = WHITESPACE advance = False else: raise ValueError("A number must include only digits") - elif state == State.FLOATING_POINT_0: + elif state == FLOATING_POINT_0: if char in "0123456789": - state = State.FLOATING_POINT + state = FLOATING_POINT add_char = True else: raise ValueError("A number with a decimal point must be followed by a fractional part") - elif state == State.FALSE_1: + elif state == FALSE_1: if char == "a": - state = State.FALSE_2 + state = FALSE_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.FALSE_2: + elif state == FALSE_2: if char == "l": - state = State.FALSE_3 + state = FALSE_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.FALSE_3: + elif state == FALSE_3: if char == "s": - state = State.FALSE_4 + state = FALSE_4 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.FALSE_4: + elif state == FALSE_4: if char == "e": - state = State.WHITESPACE + state = WHITESPACE completed = True - now_token = (TokenType.BOOLEAN, False) + now_token = (BOOLEAN, False) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.TRUE_1: + elif state == TRUE_1: if char == "r": - state = State.TRUE_2 + state = TRUE_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.TRUE_2: + elif state == TRUE_2: if char == "u": - state = State.TRUE_3 + state = TRUE_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.TRUE_3: + elif state == TRUE_3: if char == "e": - state = State.WHITESPACE + state = WHITESPACE completed = True - now_token = (TokenType.BOOLEAN, True) + now_token = (BOOLEAN, True) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.NULL_1: + elif state == NULL_1: if char == "u": - state = State.NULL_2 + state = NULL_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.NULL_2: + elif state == NULL_2: if char == "l": - state = State.NULL_3 + state = NULL_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.NULL_3: + elif state == NULL_3: if char == "l": - state = State.WHITESPACE + state = WHITESPACE completed = True - now_token = (TokenType.NULL, None) + now_token = (NULL, None) else: raise ValueError("Invalid JSON character: 
'{0}'".format(char)) - elif state == State.STRING: + elif state == STR: reader: JsonStringReader = now_token[1] try: s = reader.read() finally: index += reader.index if not strings_as_streams: - now_token = (TokenType.STRING, s) + now_token = (STRING, s) completed = True buffer = reader.buffer - state = State.STRING_END - elif state == State.STRING_END: + state = STR_END + elif state == STR_END: if is_delimiter(char): advance = False - state = State.WHITESPACE + state = WHITESPACE else: raise ValueError("Expected whitespace or an operator after string. Got '{}'".format(char)) @@ -282,7 +280,7 @@ def process_char(char): return advance - state = State.WHITESPACE + state = WHITESPACE buffer = stream.read(io.DEFAULT_BUFFER_SIZE) c = None index = -1 @@ -300,6 +298,6 @@ def process_char(char): token = [] yield now_token - process_char(SpecialChar.EOF) + process_char(EOF) if completed: yield now_token diff --git a/src/json_stream/tokenizer/tests/test_tokenizer.py b/src/json_stream/tokenizer/tests/test_tokenizer.py index 141755f..fbf3812 100644 --- a/src/json_stream/tokenizer/tests/test_tokenizer.py +++ b/src/json_stream/tokenizer/tests/test_tokenizer.py @@ -9,7 +9,7 @@ from io import StringIO from unittest import TestCase -from json_stream.tokenizer import tokenize, TokenType +from json_stream.tokenizer import tokenize, NUMBER, OPERATOR, STRING class TestJsonTokenization(TestCase): @@ -21,21 +21,21 @@ def assertNumberEquals(self, expected, actual): self.assertEqual(1, len(token_list)) ttype, token = token_list[0] self.assertEqual(expected, token) - self.assertEqual(ttype, TokenType.NUMBER) + self.assertEqual(ttype, NUMBER) def assertOperatorEquals(self, expected, actual): token_list = self.tokenize_sequence(actual) ttype, token = token_list[0] self.assertEqual(expected, token) - self.assertEqual(ttype, TokenType.OPERATOR) + self.assertEqual(ttype, OPERATOR) def assertStringEquals(self, *, expected, json_input): token_list = self.tokenize_sequence(json_input) self.assertEqual(1, len(token_list)) ttype, token = token_list[0] self.assertEqual(expected, token) - self.assertEqual(ttype, TokenType.STRING) + self.assertEqual(ttype, STRING) def test_number_parsing(self): self.assertNumberEquals(0, "0") diff --git a/src/json_stream/visitor.py b/src/json_stream/visitor.py index 99edd38..7570679 100644 --- a/src/json_stream/visitor.py +++ b/src/json_stream/visitor.py @@ -1,4 +1,4 @@ -from json_stream.base import StreamingJSONObject, StreamingJSONList, StreamingJSONBase +from json_stream.base import StreamingJSONObject, StreamingJSONList, factory from json_stream.iterators import ensure_file from json_stream.select_tokenizer import default_tokenizer @@ -23,5 +23,5 @@ def visit(fp_or_iterator, visitor, tokenizer=default_tokenizer): fp = ensure_file(fp_or_iterator) token_stream = tokenizer(fp) _, token = next(token_stream) - obj = StreamingJSONBase.factory(token, token_stream, persistent=False) + obj = factory[False, token](token_stream) _visit(obj, visitor, ())