From bd634ff55d5bacb1767674f19297756323cdc5ae Mon Sep 17 00:00:00 2001 From: Jamie Cockburn Date: Wed, 7 Jun 2023 20:39:11 +0100 Subject: [PATCH 1/4] added buffered reading to tokenizer --- src/json_stream/tokenizer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/json_stream/tokenizer.py b/src/json_stream/tokenizer.py index 21b0bb4..7cb072f 100644 --- a/src/json_stream/tokenizer.py +++ b/src/json_stream/tokenizer.py @@ -365,9 +365,14 @@ def process_char(char): return advance, next_state state = State.WHITESPACE - c = stream.read(1) - index = 0 - while c: + buffer = stream.read(io.DEFAULT_BUFFER_SIZE) + c = None + index = -1 + advance = True + while buffer: + if advance: + c, buffer = buffer[0], buffer[1:] or stream.read(io.DEFAULT_BUFFER_SIZE) + index += 1 try: advance, state = process_char(c) except ValueError as e: @@ -376,9 +381,6 @@ def process_char(char): completed = False token = [] yield now_token - if advance: - c = stream.read(1) - index += 1 process_char(SpecialChar.EOF) if completed: yield now_token From 1a72977d56355d360f01dbea71e29e6cea0baec1 Mon Sep 17 00:00:00 2001 From: Jamie Cockburn Date: Mon, 12 Jun 2023 23:32:41 +0100 Subject: [PATCH 2/4] added json string stream --- .../{tokenizer.py => tokenizer/__init__.py} | 2 +- src/json_stream/tokenizer/strings.py | 128 ++++++++++++++ src/json_stream/tokenizer/tests/__init__.py | 0 .../tokenizer/tests/test_strings.py | 158 ++++++++++++++++++ .../{ => tokenizer}/tests/test_tokenizer.py | 4 + 5 files changed, 291 insertions(+), 1 deletion(-) rename src/json_stream/{tokenizer.py => tokenizer/__init__.py} (99%) create mode 100644 src/json_stream/tokenizer/strings.py create mode 100644 src/json_stream/tokenizer/tests/__init__.py create mode 100644 src/json_stream/tokenizer/tests/test_strings.py rename src/json_stream/{ => tokenizer}/tests/test_tokenizer.py (97%) diff --git a/src/json_stream/tokenizer.py b/src/json_stream/tokenizer/__init__.py similarity index 99% rename from src/json_stream/tokenizer.py rename to src/json_stream/tokenizer/__init__.py index 7cb072f..8f91b78 100644 --- a/src/json_stream/tokenizer.py +++ b/src/json_stream/tokenizer/__init__.py @@ -90,7 +90,7 @@ def is_delimiter(char): now_token = "" def process_char(char): - nonlocal token, completed, now_token, unicode_buffer + nonlocal completed, now_token, unicode_buffer advance = True add_char = False next_state = state diff --git a/src/json_stream/tokenizer/strings.py b/src/json_stream/tokenizer/strings.py new file mode 100644 index 0000000..a90636a --- /dev/null +++ b/src/json_stream/tokenizer/strings.py @@ -0,0 +1,128 @@ +import io +import unicodedata +from typing import Union + +from json_stream.tokenizer import State, SURROGATE + +STRING_ESCAPE_CODES = { + '\\': '\\', + '/': '/', + '"': '"', + 'b': '\b', + 'f': '\f', + 'n': '\n', + 't': '\t', + 'r': '\r' +} + + +class JsonStringReader(io.TextIOBase): + def __init__(self, stream: io.TextIOBase, initial_buffer=''): + self.stream = stream + self.buffer = initial_buffer + self.unicode_buffer = '' + self.state = State.STRING + self.complete = False + + def read(self, size: Union[int, None] = None) -> str: + result = '' + length = io.DEFAULT_BUFFER_SIZE + while not self.complete and (size is None or not result): + if size: + length = size - len(result) + result += self._read_chunk(length) + return result + + def _read_chunk(self, size: Union[int, None] = ...) 
-> str: + chunk = self.buffer or self.stream.read(size) + if not chunk: + raise ValueError("Unterminated string at end of file") + state = self.state + unicode_buffer = self.unicode_buffer + result = "" + start = 0 + for i, c in enumerate(chunk): + if i == size: + if state == State.STRING: + result += chunk[start:i] + self.buffer = chunk[i:] + break + if state == State.STRING: + if c == '"': + result += chunk[start:i] + self.complete = True + self.buffer = chunk[i + 1:] + break + elif c == "\\": + state = State.STRING_ESCAPE + result += chunk[start:i] + start = i + 1 + + elif state == State.STRING_ESCAPE: + char = STRING_ESCAPE_CODES.get(c) + start = i + 1 + if char: + result += char + state = State.STRING + elif c == 'u': + state = State.UNICODE + else: + raise ValueError("Invalid string escape: {}".format(c)) + + elif state == State.UNICODE: + unicode_buffer += c + start = i + 1 + if len(unicode_buffer) == 4: + try: + code_point = int(unicode_buffer, 16) + except ValueError: + raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer}") + char = chr(code_point) + if unicodedata.category(char) == SURROGATE: + state = State.UNICODE_SURROGATE_START + else: + result += char + unicode_buffer = '' + state = State.STRING + + elif state == State.UNICODE_SURROGATE_START: + if c == "\\": + state = State.UNICODE_SURROGATE_STRING_ESCAPE + start = i + 1 + else: + raise ValueError(f"Unpaired UTF-16 surrogate") + + elif state == State.UNICODE_SURROGATE_STRING_ESCAPE: + if c == "u": + state = State.UNICODE_SURROGATE + start = i + 1 + else: + raise ValueError(f"Unpaired UTF-16 surrogate") + + elif state == State.UNICODE_SURROGATE: + unicode_buffer += c + start = i + 1 + if len(unicode_buffer) == 8: + code_point_1 = int(unicode_buffer[:4], 16) + try: + code_point_2 = int(unicode_buffer[4:], 16) + except ValueError: + raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer[4:]}") + if unicodedata.category(chr(code_point_2)) != SURROGATE: + raise ValueError(f"Second half of UTF-16 surrogate pair is not a surrogate!") + try: + pair = int.to_bytes(code_point_1, 2, 'little') + int.to_bytes(code_point_2, 2, 'little') + result += pair.decode('utf-16-le') + except ValueError: + raise ValueError( + f"Error decoding UTF-16 surrogate pair \\u{unicode_buffer[:4]}\\u{unicode_buffer[4:]}" + ) + unicode_buffer = '' + state = State.STRING + else: + result += chunk[start:] + self.buffer = '' + + self.state = state + self.unicode_buffer = unicode_buffer + return result diff --git a/src/json_stream/tokenizer/tests/__init__.py b/src/json_stream/tokenizer/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/json_stream/tokenizer/tests/test_strings.py b/src/json_stream/tokenizer/tests/test_strings.py new file mode 100644 index 0000000..fc21a22 --- /dev/null +++ b/src/json_stream/tokenizer/tests/test_strings.py @@ -0,0 +1,158 @@ +import re +from io import StringIO +from unittest import TestCase + +from json_stream.tokenizer.strings import JsonStringReader + + +class TestJsonStringReader(TestCase): + def test_string_parsing(self): + self.assertStringEquals("word", r'"word"') + self.assertStringEquals("this char at end: Ȃ", r'"this char at end: \u0202"') + self.assertStringEquals("this char in middle: Ȃ.", r'"this char in middle: \u0202."') + + def test_empty_string(self): + self.assertStringEquals("", r'""') + + def test_escaping(self): + self.assertStringEquals("with\tescape", r'"with\tescape"') + self.assertStringEquals("with\n a different escape", r'"with\n a different escape"') + 
self.assertStringEquals("using a \bbackspace", r'"using a \bbackspace"') + self.assertStringEquals("now we have \f a formfeed", r'"now we have \f a formfeed"') + self.assertStringEquals('"a quote"', r'"\"a quote\""') + self.assertStringEquals("/", r'"\/"') + + def test_unicode_literal(self): + self.assertStringEquals('Ä', r'"\u00c4"') + self.assertStringEquals("꽸", r'"\uaf78"') + self.assertStringEquals("訋", r'"\u8A0b"') + self.assertStringEquals("돧", r'"\uB3e7"') + self.assertStringEquals("ዯ", r'"\u12eF"') + + def test_invalid_string_escape(self): + self.assertStringRaises(r'"\h"', "Invalid string escape: h") + self.assertStringRaises(r'"\2"', "Invalid string escape: 2") + self.assertStringRaises(r'"\!"', "Invalid string escape: !") + + def test_unicode_literal_truncated(self): + self.assertStringRaises(r'"\u00c"', re.escape(r'Invalid unicode literal: \u00c"')) + + def test_unicode_literal_bad_hex(self): + self.assertStringRaises(r'"\u00x4"', re.escape(r"Invalid unicode literal: \u00x4")) + + def test_unicode_surrogate_pair_literal(self): + self.assertStringEquals('𝄞', r'"\ud834\udd1e"') + + def test_unicode_surrogate_pair_unpaired(self): + self.assertStringRaises(r'"\ud834"', "Unpaired UTF-16 surrogate") + self.assertStringRaises(r'"\ud834', "Unterminated string at end of file") + self.assertStringRaises(r'"\ud834\x', "Unpaired UTF-16 surrogate") + self.assertStringRaises(r'"\ud834' + '\\', "Unterminated string at end of file") + + def test_unicode_surrogate_pair_non_surrogate(self): + self.assertStringRaises(r'"\ud834\u00c4"', "Second half of UTF-16 surrogate pair is not a surrogate!") + + def test_unicode_surrogate_pair_literal_truncated(self): + self.assertStringRaises(r'"\ud834\u00c"', re.escape(r'Invalid unicode literal: \u00c"')) + + def test_unicode_surrogate_pair_literal_bad_hex(self): + self.assertStringRaises(r'"\ud834\u00x4"', re.escape(r"Invalid unicode literal: \u00x4")) + + def test_unicode_surrogate_pair_literal_invalid(self): + message = re.escape(r"Error decoding UTF-16 surrogate pair \ud834\ud834") + self.assertStringRaises(r'"\ud834\ud834"', message) + + def test_unicode_surrogate_pair_literal_unterminated(self): + self.assertStringRaises(r'"\ud834\ud83', r"Unterminated string at end of file") + + def test_unterminated_strings(self): + self.assertStringRaises('"unterminated', "Unterminated string at end of file") + + def test_unterminated_strings_while_in_escape(self): + self.assertStringRaises(r'"\"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u!"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u!!"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u!!!', "Unterminated string at end of file") + + def test_with_initial_buffer(self): + self.assertStringEquals("there will be more string", buffer='"there will be ', stream='more string"') + + def test_remainder(self): + reader, f = self.assertStringEquals( + "after the string", + stream='"after the string"there is more stuff', + remaining_buffer='there is more stuff', + ) + self.assertRead(reader, f, '', remaining_buffer='there is more stuff') + + def test_remainder_read_past_end_of_string(self): + reader, f = self.assertStringEquals( + "after the string", + stream='"after the string"there is more stuff', + remaining_buffer='the', remaining_stream='re is more stuff', amount=20 + ) + self.assertRead(reader, f, '', remaining_buffer='the', remaining_stream='re is more stuff', amount=20) + 
+ def test_remainder_when_string_ends_after_initial_buffer(self): + reader, f = self.assertStringEquals( + "after the string", + buffer='"after the', stream=' string"there is more stuff', + remaining_buffer='there is more stuff', + ) + self.assertRead(reader, f, '', remaining_buffer='there is more stuff') + + def test_remainder_when_string_ends_within_initial_buffer(self): + reader, f = self.assertStringEquals( + "after the string", + buffer='"after the string"there', stream=' is more stuff', + remaining_buffer='there', remaining_stream=' is more stuff', + ) + self.assertRead(reader, f, '', remaining_buffer='there', remaining_stream=' is more stuff') + + def test_read_part_shorter_initial_buffer(self): + reader, f = self.assertStringEquals( + "there", + buffer='"there will be ', stream='more string"', + remaining_buffer=' will be ', remaining_stream='more string"', amount=5, complete=False, + ) + self.assertRead(reader, f, ' will be more string') + + def test_read_part_longer_than_initial_buffer(self): + reader, f = self.assertStringEquals( + "there will be ", + buffer='"there will be ', stream='more string"', + remaining_buffer='', remaining_stream='more string"', amount=20, complete=False, + ) + self.assertRead(reader, f, 'more string') + + def test_read_over_split_escape(self): + json = r'"abcde\u00c4edcba"' + for i in range(len(json)): + buffer, stream = json[:i], json[i:] + self.assertStringEquals("abcdeÄedcba", buffer=buffer, stream=stream) + + def assertStringEquals(self, result, stream, buffer='', remaining_buffer='', remaining_stream='', amount=None, + complete=True): + if buffer: + buffer = buffer[1:] + else: + stream = stream[1:] + f = StringIO(stream) + reader = JsonStringReader(f, buffer) + self.assertRead(reader, f, result, remaining_buffer, remaining_stream, amount, complete) + return reader, f + + def assertRead(self, reader, stream, result, remaining_buffer='', remaining_stream='', amount=None, complete=True): + self.assertEqual(result, reader.read(amount)) + self.assertEqual(remaining_buffer, reader.buffer) + pos = stream.tell() + self.assertEqual(remaining_stream, stream.read()) + stream.seek(pos) + self.assertEqual(complete, reader.complete) + + def assertStringRaises(self, s, error): + stream = StringIO(s[1:]) + f = JsonStringReader(stream) + with self.assertRaisesRegex(ValueError, error): + f.read() diff --git a/src/json_stream/tests/test_tokenizer.py b/src/json_stream/tokenizer/tests/test_tokenizer.py similarity index 97% rename from src/json_stream/tests/test_tokenizer.py rename to src/json_stream/tokenizer/tests/test_tokenizer.py index e27b4a7..61a6068 100644 --- a/src/json_stream/tests/test_tokenizer.py +++ b/src/json_stream/tokenizer/tests/test_tokenizer.py @@ -199,3 +199,7 @@ def test_unicode_surrogate_pair_literal_invalid(self): def test_unicode_surrogate_pair_literal_unterminated(self): with self.assertRaisesRegex(ValueError, r"Unterminated unicode literal at end of file"): list(tokenize(StringIO(r'"\ud834\ud83'))) + + def test_unicode_surrogate_pair_literal_unterminated_first_half(self): + with self.assertRaisesRegex(ValueError, r"Unterminated unicode literal at end of file"): + list(tokenize(StringIO(r'"\ud83'))) From 4ee2bbad6977ad75e1d04519b7e8f477cb8c1919 Mon Sep 17 00:00:00 2001 From: Jamie Cockburn Date: Mon, 12 Jun 2023 23:55:25 +0100 Subject: [PATCH 3/4] integrated json string reader with tokenizer --- src/json_stream/tokenizer/__init__.py | 189 +++++------------- src/json_stream/tokenizer/strings.py | 48 +++-- .../tokenizer/tests/test_tokenizer.py | 
10 +- 3 files changed, 88 insertions(+), 159 deletions(-) diff --git a/src/json_stream/tokenizer/__init__.py b/src/json_stream/tokenizer/__init__.py index 8f91b78..6295dac 100644 --- a/src/json_stream/tokenizer/__init__.py +++ b/src/json_stream/tokenizer/__init__.py @@ -6,9 +6,9 @@ Copyright (c) 2019 Daniel Yule """ import io -import unicodedata +from typing import Optional, Tuple -SURROGATE = 'Cs' +from json_stream.tokenizer.strings import JsonStringReader class TokenType: @@ -29,7 +29,6 @@ class State: FLOATING_POINT_0 = 6 FLOATING_POINT = 8 STRING = 9 - STRING_ESCAPE = 10 STRING_END = 11 TRUE_1 = 12 TRUE_2 = 13 @@ -41,10 +40,6 @@ class State: NULL_1 = 19 NULL_2 = 20 NULL_3 = 21 - UNICODE = 22 - UNICODE_SURROGATE_START = 23 - UNICODE_SURROGATE_STRING_ESCAPE = 24 - UNICODE_SURROGATE = 25 class SpecialChar: @@ -78,22 +73,20 @@ def _ensure_text(stream): return stream -def tokenize(stream): +def tokenize(stream, strings_as_streams=False): stream = _ensure_text(stream) def is_delimiter(char): return char.isspace() or char in "{}[]:," or char == SpecialChar.EOF token = [] - unicode_buffer = "" completed = False - now_token = "" + now_token: Optional[Tuple] = None def process_char(char): - nonlocal completed, now_token, unicode_buffer + nonlocal completed, now_token, state, buffer, index advance = True add_char = False - next_state = state if state == State.WHITESPACE: if char == "{": completed = True @@ -113,36 +106,40 @@ def process_char(char): elif char == ":": completed = True now_token = (TokenType.OPERATOR, ":") - elif char == "\"": - next_state = State.STRING + elif char == '"': + state = State.STRING + now_token = (TokenType.STRING, JsonStringReader(stream, buffer)) + if strings_as_streams: + completed = True + advance = False elif char in "123456789": - next_state = State.INTEGER + state = State.INTEGER add_char = True elif char == "0": - next_state = State.INTEGER_0 + state = State.INTEGER_0 add_char = True elif char == "-": - next_state = State.INTEGER_SIGN + state = State.INTEGER_SIGN add_char = True elif char == "f": - next_state = State.FALSE_1 + state = State.FALSE_1 elif char == "t": - next_state = State.TRUE_1 + state = State.TRUE_1 elif char == "n": - next_state = State.NULL_1 + state = State.NULL_1 elif not char.isspace() and not char == SpecialChar.EOF: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.INTEGER: if char in "0123456789": add_char = True elif char == ".": - next_state = State.FLOATING_POINT_0 + state = State.FLOATING_POINT_0 add_char = True elif char == "e" or char == 'E': - next_state = State.INTEGER_EXP_0 + state = State.INTEGER_EXP_0 add_char = True elif is_delimiter(char): - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.NUMBER, int("".join(token))) advance = False @@ -150,13 +147,13 @@ def process_char(char): raise ValueError("A number must contain only digits. Got '{}'".format(char)) elif state == State.INTEGER_0: if char == ".": - next_state = State.FLOATING_POINT_0 + state = State.FLOATING_POINT_0 add_char = True elif char == "e" or char == 'E': - next_state = State.INTEGER_EXP_0 + state = State.INTEGER_EXP_0 add_char = True elif is_delimiter(char): - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.NUMBER, 0) advance = False @@ -164,16 +161,16 @@ def process_char(char): raise ValueError("A 0 must be followed by a '.' or a 'e'. 
Got '{0}'".format(char)) elif state == State.INTEGER_SIGN: if char == "0": - next_state = State.INTEGER_0 + state = State.INTEGER_0 add_char = True elif char in "123456789": - next_state = State.INTEGER + state = State.INTEGER add_char = True else: raise ValueError("A - must be followed by a digit. Got '{0}'".format(char)) elif state == State.INTEGER_EXP_0: if char == "+" or char == "-" or char in "0123456789": - next_state = State.INTEGER_EXP + state = State.INTEGER_EXP add_char = True else: raise ValueError("An e in a number must be followed by a '+', '-' or digit. Got '{0}'".format(char)) @@ -183,7 +180,7 @@ def process_char(char): elif is_delimiter(char): completed = True now_token = (TokenType.NUMBER, float("".join(token))) - next_state = State.WHITESPACE + state = State.WHITESPACE advance = False else: raise ValueError("A number exponent must consist only of digits. Got '{}'".format(char)) @@ -191,179 +188,100 @@ def process_char(char): if char in "0123456789": add_char = True elif char == "e" or char == "E": - next_state = State.INTEGER_EXP_0 + state = State.INTEGER_EXP_0 add_char = True elif is_delimiter(char): completed = True now_token = (TokenType.NUMBER, float("".join(token))) - next_state = State.WHITESPACE + state = State.WHITESPACE advance = False else: raise ValueError("A number must include only digits") elif state == State.FLOATING_POINT_0: if char in "0123456789": - next_state = State.FLOATING_POINT + state = State.FLOATING_POINT add_char = True else: raise ValueError("A number with a decimal point must be followed by a fractional part") elif state == State.FALSE_1: if char == "a": - next_state = State.FALSE_2 + state = State.FALSE_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.FALSE_2: if char == "l": - next_state = State.FALSE_3 + state = State.FALSE_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.FALSE_3: if char == "s": - next_state = State.FALSE_4 + state = State.FALSE_4 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.FALSE_4: if char == "e": - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.BOOLEAN, False) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.TRUE_1: if char == "r": - next_state = State.TRUE_2 + state = State.TRUE_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.TRUE_2: if char == "u": - next_state = State.TRUE_3 + state = State.TRUE_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.TRUE_3: if char == "e": - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.BOOLEAN, True) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.NULL_1: if char == "u": - next_state = State.NULL_2 + state = State.NULL_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.NULL_2: if char == "l": - next_state = State.NULL_3 + state = State.NULL_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.NULL_3: if char == "l": - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.NULL, None) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.STRING: - if char == "\"": + reader: JsonStringReader = now_token[1] + try: + s = reader.read() + finally: + index 
+= reader.index + if not strings_as_streams: + now_token = (TokenType.STRING, s) completed = True - now_token = (TokenType.STRING, "".join(token)) - next_state = State.STRING_END - elif char == "\\": - next_state = State.STRING_ESCAPE - elif char == SpecialChar.EOF: - raise ValueError("Unterminated string at end of file") - else: - add_char = True + buffer = reader.buffer + state = State.STRING_END elif state == State.STRING_END: if is_delimiter(char): advance = False - next_state = State.WHITESPACE + state = State.WHITESPACE else: raise ValueError("Expected whitespace or an operator after string. Got '{}'".format(char)) - elif state == State.STRING_ESCAPE: - next_state = State.STRING - if char == "\\" or char == "\"": - add_char = True - elif char == "b": - char = "\b" - add_char = True - elif char == "f": - char = "\f" - add_char = True - elif char == "n": - char = "\n" - add_char = True - elif char == "t": - char = "\t" - add_char = True - elif char == "r": - char = "\r" - add_char = True - elif char == "/": - char = "/" - add_char = True - elif char == "u": - next_state = State.UNICODE - unicode_buffer = "" - else: - raise ValueError("Invalid string escape: {}".format(char)) - elif state == State.UNICODE: - if char == SpecialChar.EOF: - raise ValueError('Unterminated unicode literal at end of file') - unicode_buffer += char - if len(unicode_buffer) == 4: - try: - code_point = int(unicode_buffer, 16) - except ValueError: - raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer}") - char = chr(code_point) - if unicodedata.category(char) == SURROGATE: - next_state = State.UNICODE_SURROGATE_START - else: - next_state = State.STRING - add_char = True - elif state == State.UNICODE_SURROGATE_START: - if char == "\\": - next_state = State.UNICODE_SURROGATE_STRING_ESCAPE - elif char == SpecialChar.EOF: - raise ValueError("Unpaired UTF-16 surrogate at end of file") - else: - raise ValueError(f"Unpaired UTF-16 surrogate") - - elif state == State.UNICODE_SURROGATE_STRING_ESCAPE: - if char == "u": - next_state = State.UNICODE_SURROGATE - elif char == SpecialChar.EOF: - raise ValueError("Unpaired UTF-16 surrogate at end of file") - else: - raise ValueError(f"Unpaired UTF-16 surrogate") - - elif state == State.UNICODE_SURROGATE: - if char == SpecialChar.EOF: - raise ValueError('Unterminated unicode literal at end of file') - unicode_buffer += char - if len(unicode_buffer) == 8: - code_point_1 = int(unicode_buffer[:4], 16) - try: - code_point_2 = int(unicode_buffer[4:], 16) - except ValueError: - raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer[4:]}") - char = chr(code_point_2) - if unicodedata.category(char) != SURROGATE: - raise ValueError(f"Second half of UTF-16 surrogate pair is not a surrogate!") - try: - pair = int.to_bytes(code_point_1, 2, 'little') + int.to_bytes(code_point_2, 2, 'little') - char = pair.decode('utf-16-le') - except ValueError: - raise ValueError( - f"Error decoding UTF-16 surrogate pair \\u{unicode_buffer[:4]}\\u{unicode_buffer[4:]}" - ) - next_state = State.STRING - add_char = True if add_char: token.append(char) - return advance, next_state + return advance + state = State.WHITESPACE buffer = stream.read(io.DEFAULT_BUFFER_SIZE) c = None @@ -374,13 +292,14 @@ def process_char(char): c, buffer = buffer[0], buffer[1:] or stream.read(io.DEFAULT_BUFFER_SIZE) index += 1 try: - advance, state = process_char(c) + advance = process_char(c) except ValueError as e: raise ValueError("".join([e.args[0], " at index {}".format(index)])) if completed: completed = False 
token = [] yield now_token + process_char(SpecialChar.EOF) if completed: yield now_token diff --git a/src/json_stream/tokenizer/strings.py b/src/json_stream/tokenizer/strings.py index a90636a..41fb55d 100644 --- a/src/json_stream/tokenizer/strings.py +++ b/src/json_stream/tokenizer/strings.py @@ -1,8 +1,7 @@ import io import unicodedata from typing import Union - -from json_stream.tokenizer import State, SURROGATE +from io import DEFAULT_BUFFER_SIZE STRING_ESCAPE_CODES = { '\\': '\\', @@ -15,18 +14,28 @@ 'r': '\r' } +SURROGATE = 'Cs' + +CHAR = 1 +STRING_ESCAPE = 2 +UNICODE = 4 +UNICODE_SURROGATE_START = 5 +UNICODE_SURROGATE_STRING_ESCAPE = 6 +UNICODE_SURROGATE = 7 + class JsonStringReader(io.TextIOBase): def __init__(self, stream: io.TextIOBase, initial_buffer=''): self.stream = stream self.buffer = initial_buffer self.unicode_buffer = '' - self.state = State.STRING + self.state = CHAR self.complete = False + self.index = 0 def read(self, size: Union[int, None] = None) -> str: result = '' - length = io.DEFAULT_BUFFER_SIZE + length = DEFAULT_BUFFER_SIZE while not self.complete and (size is None or not result): if size: length = size - len(result) @@ -42,34 +51,35 @@ def _read_chunk(self, size: Union[int, None] = ...) -> str: result = "" start = 0 for i, c in enumerate(chunk): + self.index += 1 if i == size: - if state == State.STRING: + if state == CHAR: result += chunk[start:i] self.buffer = chunk[i:] break - if state == State.STRING: + if state == CHAR: if c == '"': result += chunk[start:i] self.complete = True self.buffer = chunk[i + 1:] break elif c == "\\": - state = State.STRING_ESCAPE + state = STRING_ESCAPE result += chunk[start:i] start = i + 1 - elif state == State.STRING_ESCAPE: + elif state == STRING_ESCAPE: char = STRING_ESCAPE_CODES.get(c) start = i + 1 if char: result += char - state = State.STRING + state = CHAR elif c == 'u': - state = State.UNICODE + state = UNICODE else: raise ValueError("Invalid string escape: {}".format(c)) - elif state == State.UNICODE: + elif state == UNICODE: unicode_buffer += c start = i + 1 if len(unicode_buffer) == 4: @@ -79,27 +89,27 @@ def _read_chunk(self, size: Union[int, None] = ...) -> str: raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer}") char = chr(code_point) if unicodedata.category(char) == SURROGATE: - state = State.UNICODE_SURROGATE_START + state = UNICODE_SURROGATE_START else: result += char unicode_buffer = '' - state = State.STRING + state = CHAR - elif state == State.UNICODE_SURROGATE_START: + elif state == UNICODE_SURROGATE_START: if c == "\\": - state = State.UNICODE_SURROGATE_STRING_ESCAPE + state = UNICODE_SURROGATE_STRING_ESCAPE start = i + 1 else: raise ValueError(f"Unpaired UTF-16 surrogate") - elif state == State.UNICODE_SURROGATE_STRING_ESCAPE: + elif state == UNICODE_SURROGATE_STRING_ESCAPE: if c == "u": - state = State.UNICODE_SURROGATE + state = UNICODE_SURROGATE start = i + 1 else: raise ValueError(f"Unpaired UTF-16 surrogate") - elif state == State.UNICODE_SURROGATE: + elif state == UNICODE_SURROGATE: unicode_buffer += c start = i + 1 if len(unicode_buffer) == 8: @@ -118,7 +128,7 @@ def _read_chunk(self, size: Union[int, None] = ...) 
-> str: f"Error decoding UTF-16 surrogate pair \\u{unicode_buffer[:4]}\\u{unicode_buffer[4:]}" ) unicode_buffer = '' - state = State.STRING + state = CHAR else: result += chunk[start:] self.buffer = '' diff --git a/src/json_stream/tokenizer/tests/test_tokenizer.py b/src/json_stream/tokenizer/tests/test_tokenizer.py index 61a6068..141755f 100644 --- a/src/json_stream/tokenizer/tests/test_tokenizer.py +++ b/src/json_stream/tokenizer/tests/test_tokenizer.py @@ -90,7 +90,7 @@ def test_string_parsing(self): self.tokenize_sequence(r'"\2"') with self.assertRaisesRegex(ValueError, "Invalid string escape: ! at index 2"): self.tokenize_sequence(r'"\!"') - with self.assertRaisesRegex(ValueError, "Unterminated unicode literal at end of file"): + with self.assertRaisesRegex(ValueError, "Unterminated string at end of file at index 4"): self.tokenize_sequence(r'"\u!"') def test_unterminated_strings(self): @@ -172,11 +172,11 @@ def test_unicode_surrogate_pair_literal(self): def test_unicode_surrogate_pair_unpaired(self): with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at index 7"): list(tokenize(StringIO(r'"\ud834"'))) - with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at end of file"): + with self.assertRaisesRegex(ValueError, "Unterminated string at end of file"): list(tokenize(StringIO(r'"\ud834'))) with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at index 8"): list(tokenize(StringIO(r'"\ud834\x'))) - with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at end of file"): + with self.assertRaisesRegex(ValueError, "Unterminated string at end of file"): list(tokenize(StringIO(r'"\ud834' + '\\'))) def test_unicode_surrogate_pair_non_surrogate(self): @@ -197,9 +197,9 @@ def test_unicode_surrogate_pair_literal_invalid(self): list(tokenize(StringIO(r'"\ud834\ud834"'))) def test_unicode_surrogate_pair_literal_unterminated(self): - with self.assertRaisesRegex(ValueError, r"Unterminated unicode literal at end of file"): + with self.assertRaisesRegex(ValueError, r"Unterminated string at end of file at index 11"): list(tokenize(StringIO(r'"\ud834\ud83'))) def test_unicode_surrogate_pair_literal_unterminated_first_half(self): - with self.assertRaisesRegex(ValueError, r"Unterminated unicode literal at end of file"): + with self.assertRaisesRegex(ValueError, r"Unterminated string at end of file"): list(tokenize(StringIO(r'"\ud83'))) From b75d9167d3f8033f88a5a749352076da7940ceda Mon Sep 17 00:00:00 2001 From: Jamie Cockburn Date: Tue, 13 Jun 2023 19:56:55 +0100 Subject: [PATCH 4/4] simple performance improvements --- src/json_stream/base.py | 143 +++++++----- src/json_stream/loader.py | 7 +- src/json_stream/tokenizer/__init__.py | 214 +++++++++--------- .../tokenizer/tests/test_tokenizer.py | 8 +- src/json_stream/visitor.py | 4 +- 5 files changed, 201 insertions(+), 175 deletions(-) diff --git a/src/json_stream/base.py b/src/json_stream/base.py index 8e03bc8..f40ce2c 100644 --- a/src/json_stream/base.py +++ b/src/json_stream/base.py @@ -1,65 +1,51 @@ -import collections import copy -from abc import ABC -from collections import OrderedDict +from collections import OrderedDict, deque from itertools import chain from typing import Optional, Iterator, Any -from json_stream.tokenizer import TokenType +from json_stream.tokenizer import ( + OPERATOR, + STRING, +) + +COLON = (OPERATOR, ":") class TransientAccessException(Exception): pass -class StreamingJSONBase(ABC): +class StreamingJSONBase(object): INCOMPLETE_ERROR = "Unexpected end of file" - 
@classmethod - def factory(cls, token, token_stream, persistent): - if persistent: - if token == '{': - return PersistentStreamingJSONObject(token_stream) - if token == '[': - return PersistentStreamingJSONList(token_stream) - else: - if token == '{': - return TransientStreamingJSONObject(token_stream) - if token == '[': - return TransientStreamingJSONList(token_stream) - raise ValueError(f"Unknown operator {token}") # pragma: no cover - - _persistent_children: bool + __slots__ = '_persistent_children', '_stream', '_child', 'streaming' def __init__(self, token_stream): + # this is inlined in subclasses self.streaming = True self._stream = token_stream self._child: Optional[StreamingJSONBase] = None - def _clear_child(self): - if self._child is not None: - self._child.read_all() - self._child = None - def _iter_items(self): + if not self.streaming: + return + load = self._load_item while True: - if not self.streaming: - return - self._clear_child() + # clear child + if self._child is not None: + # inlined from read_all() + deque(self._child._iter_items(), maxlen=0) + self._child = None + try: - item = self._load_item() + yield load() except StopIteration: if self.streaming: raise ValueError(self.INCOMPLETE_ERROR) return - yield item - - def _done(self): - self.streaming = False - raise StopIteration() def read_all(self): - collections.deque(self._iter_items(), maxlen=0) + deque(self._iter_items(), maxlen=0) def _load_item(self): raise NotImplementedError() # pragma: no cover @@ -83,9 +69,15 @@ def __deepcopy__(self, memo): raise copy.Error("Copying json_steam objects leads to a bad time") -class PersistentStreamingJSONBase(StreamingJSONBase, ABC): +class PersistentStreamingJSONBase(StreamingJSONBase): + __slots__ = '_data' + def __init__(self, token_stream): - super().__init__(token_stream) + # inlined from super + self.streaming = True + self._stream = token_stream + self._child: Optional[StreamingJSONBase] = None + self._data = self._init_persistent_data() self._persistent_children = True @@ -107,9 +99,15 @@ def __repr__(self): # pragma: no cover return f"<{type(self).__name__}: {repr(self._data)}, {'STREAMING' if self.streaming else 'DONE'}>" -class TransientStreamingJSONBase(StreamingJSONBase, ABC): +class TransientStreamingJSONBase(StreamingJSONBase): + __slots__ = '_started', + def __init__(self, token_stream): - super().__init__(token_stream) + # inlined from super + self.streaming = True + self._stream = token_stream + self._child: Optional[StreamingJSONBase] = None + self._started = False self._persistent_children = False @@ -137,22 +135,26 @@ def __repr__(self): # pragma: no cover return f"<{type(self).__name__}: TRANSIENT, {'STREAMING' if self.streaming else 'DONE'}>" -class StreamingJSONList(StreamingJSONBase, ABC): +class StreamingJSONList(StreamingJSONBase): INCOMPLETE_ERROR = "Unterminated list at end of file" + __slots__ = () + def _load_item(self): - token_type, v = next(self._stream) - if token_type == TokenType.OPERATOR: + stream = self._stream + token_type, v = next(stream) + if token_type == OPERATOR: if v == ']': - self._done() + self.streaming = False + raise StopIteration() if v == ',': - token_type, v = next(self._stream) + token_type, v = next(stream) elif v in '{[': pass else: # pragma: no cover raise ValueError(f"Expecting value, comma or ], got {v}") - if token_type == TokenType.OPERATOR: - self._child = v = self.factory(v, self._stream, self._persistent_children) + if token_type == OPERATOR: + self._child = v = factory[self._persistent_children, v](stream) 
return v def _get__iter__(self): @@ -160,6 +162,8 @@ def _get__iter__(self): class PersistentStreamingJSONList(PersistentStreamingJSONBase, StreamingJSONList): + __slots__ = () + def _init_persistent_data(self): return [] @@ -185,8 +189,16 @@ def __getitem__(self, k) -> Any: class TransientStreamingJSONList(TransientStreamingJSONBase, StreamingJSONList): + __slots__ = "_index", + def __init__(self, token_stream): - super().__init__(token_stream) + # inlined from super + self.streaming = True + self._stream = token_stream + self._child: Optional[StreamingJSONBase] = None + self._started = False + self._persistent_children = False + self._index = -1 def _load_item(self): @@ -203,26 +215,29 @@ def _find_item(self, i): raise IndexError(f"Index {i} out of range") -class StreamingJSONObject(StreamingJSONBase, ABC): +class StreamingJSONObject(StreamingJSONBase): INCOMPLETE_ERROR = "Unterminated object at end of file" + __slots__ = () + def _load_item(self): - token_type, k = next(self._stream) - if token_type == TokenType.OPERATOR: + stream = self._stream + token_type, k = next(stream) + if token_type == OPERATOR: if k == '}': - self._done() + self.streaming = False + raise StopIteration() if k == ',': - token_type, k = next(self._stream) - if token_type != TokenType.STRING: # pragma: no cover + token_type, k = next(stream) + if token_type != STRING: # pragma: no cover raise ValueError(f"Expecting string, comma or }}, got {k} ({token_type})") - token_type, token = next(self._stream) - if token_type != TokenType.OPERATOR or token != ":": + if next(stream) != COLON: raise ValueError("Expecting :") # pragma: no cover - token_type, v = next(self._stream) - if token_type == TokenType.OPERATOR: - self._child = v = self.factory(v, self._stream, self._persistent_children) + token_type, v = next(stream) + if token_type == OPERATOR: + self._child = v = factory[self._persistent_children, v](stream) return k, v def _get__iter__(self): @@ -251,6 +266,8 @@ def get(self, k, default=None) -> Any: class PersistentStreamingJSONObject(PersistentStreamingJSONBase, StreamingJSONObject): + __slots__ = () + def _init_persistent_data(self): return OrderedDict() @@ -277,6 +294,8 @@ def __getitem__(self, k) -> Any: class TransientStreamingJSONObject(TransientStreamingJSONBase, StreamingJSONObject): + __slots__ = () + def _find_item(self, k): was_started = self._started try: @@ -299,3 +318,11 @@ def keys(self): def values(self): self._check_started() return (v for k, v in self._iter_items()) + + +factory = { + (True, '{'): PersistentStreamingJSONObject, + (True, '['): PersistentStreamingJSONList, + (False, '{'): TransientStreamingJSONObject, + (False, '['): TransientStreamingJSONList, +} diff --git a/src/json_stream/loader.py b/src/json_stream/loader.py index 680e801..3265a3d 100644 --- a/src/json_stream/loader.py +++ b/src/json_stream/loader.py @@ -1,12 +1,13 @@ -from json_stream.base import StreamingJSONBase, TokenType +from json_stream.base import factory from json_stream.iterators import ensure_file from json_stream.select_tokenizer import default_tokenizer +from json_stream.tokenizer import OPERATOR def load(fp_or_iterable, persistent=False, tokenizer=default_tokenizer): fp = ensure_file(fp_or_iterable) token_stream = tokenizer(fp) token_type, token = next(token_stream) - if token_type == TokenType.OPERATOR: - return StreamingJSONBase.factory(token, token_stream, persistent) + if token_type == OPERATOR: + return factory[persistent, token](token_stream) return token diff --git a/src/json_stream/tokenizer/__init__.py 
b/src/json_stream/tokenizer/__init__.py index 6295dac..c5c1c20 100644 --- a/src/json_stream/tokenizer/__init__.py +++ b/src/json_stream/tokenizer/__init__.py @@ -11,45 +11,43 @@ from json_stream.tokenizer.strings import JsonStringReader -class TokenType: - OPERATOR = 0 - STRING = 1 - NUMBER = 2 - BOOLEAN = 3 - NULL = 4 +# TokenType +OPERATOR = 0 +STRING = 1 +NUMBER = 2 +BOOLEAN = 3 +NULL = 4 +# State +WHITESPACE = 0 +INTEGER_0 = 1 +INTEGER_SIGN = 2 +INTEGER = 3 +INTEGER_EXP = 4 +INTEGER_EXP_0 = 5 +FLOATING_POINT_0 = 6 +FLOATING_POINT = 8 +STR = 9 +STR_END = 11 +TRUE_1 = 12 +TRUE_2 = 13 +TRUE_3 = 14 +FALSE_1 = 15 +FALSE_2 = 16 +FALSE_3 = 17 +FALSE_4 = 18 +NULL_1 = 19 +NULL_2 = 20 +NULL_3 = 21 -class State: - WHITESPACE = 0 - INTEGER_0 = 1 - INTEGER_SIGN = 2 - INTEGER = 3 - INTEGER_EXP = 4 - INTEGER_EXP_0 = 5 - FLOATING_POINT_0 = 6 - FLOATING_POINT = 8 - STRING = 9 - STRING_END = 11 - TRUE_1 = 12 - TRUE_2 = 13 - TRUE_3 = 14 - FALSE_1 = 15 - FALSE_2 = 16 - FALSE_3 = 17 - FALSE_4 = 18 - NULL_1 = 19 - NULL_2 = 20 - NULL_3 = 21 - - -class SpecialChar: - # Kind of a hack but simple: if we used the empty string "" to represent - # EOF, expressions like `char in "0123456789"` would be true for EOF, which - # is confusing. If we used a non-string, they would result in TypeErrors. - # By using the string "EOF", they work as expected. The only thing we have - # to be careful about is to not ever use "EOF" in any such strings used for - # char membership checking, which we have no reason to do anyway. - EOF = "EOF" +# SpecialChar +# Kind of a hack but simple: if we used the empty string "" to represent +# EOF, expressions like `char in "0123456789"` would be true for EOF, which +# is confusing. If we used a non-string, they would result in TypeErrors. +# By using the string "EOF", they work as expected. The only thing we have +# to be careful about is to not ever use "EOF" in any such strings used for +# char membership checking, which we have no reason to do anyway. 
+EOF = "EOF" def _guess_encoding(stream): @@ -77,7 +75,7 @@ def tokenize(stream, strings_as_streams=False): stream = _ensure_text(stream) def is_delimiter(char): - return char.isspace() or char in "{}[]:," or char == SpecialChar.EOF + return char.isspace() or char in "{}[]:," or char == EOF token = [] completed = False @@ -87,193 +85,193 @@ def process_char(char): nonlocal completed, now_token, state, buffer, index advance = True add_char = False - if state == State.WHITESPACE: + if state == WHITESPACE: if char == "{": completed = True - now_token = (TokenType.OPERATOR, "{") + now_token = (OPERATOR, "{") elif char == "}": completed = True - now_token = (TokenType.OPERATOR, "}") + now_token = (OPERATOR, "}") elif char == "[": completed = True - now_token = (TokenType.OPERATOR, "[") + now_token = (OPERATOR, "[") elif char == "]": completed = True - now_token = (TokenType.OPERATOR, "]") + now_token = (OPERATOR, "]") elif char == ",": completed = True - now_token = (TokenType.OPERATOR, ",") + now_token = (OPERATOR, ",") elif char == ":": completed = True - now_token = (TokenType.OPERATOR, ":") + now_token = (OPERATOR, ":") elif char == '"': - state = State.STRING - now_token = (TokenType.STRING, JsonStringReader(stream, buffer)) + state = STR + now_token = (STRING, JsonStringReader(stream, buffer)) if strings_as_streams: completed = True advance = False elif char in "123456789": - state = State.INTEGER + state = INTEGER add_char = True elif char == "0": - state = State.INTEGER_0 + state = INTEGER_0 add_char = True elif char == "-": - state = State.INTEGER_SIGN + state = INTEGER_SIGN add_char = True elif char == "f": - state = State.FALSE_1 + state = FALSE_1 elif char == "t": - state = State.TRUE_1 + state = TRUE_1 elif char == "n": - state = State.NULL_1 - elif not char.isspace() and not char == SpecialChar.EOF: + state = NULL_1 + elif not char.isspace() and not char == EOF: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.INTEGER: + elif state == INTEGER: if char in "0123456789": add_char = True elif char == ".": - state = State.FLOATING_POINT_0 + state = FLOATING_POINT_0 add_char = True elif char == "e" or char == 'E': - state = State.INTEGER_EXP_0 + state = INTEGER_EXP_0 add_char = True elif is_delimiter(char): - state = State.WHITESPACE + state = WHITESPACE completed = True - now_token = (TokenType.NUMBER, int("".join(token))) + now_token = (NUMBER, int("".join(token))) advance = False else: raise ValueError("A number must contain only digits. Got '{}'".format(char)) - elif state == State.INTEGER_0: + elif state == INTEGER_0: if char == ".": - state = State.FLOATING_POINT_0 + state = FLOATING_POINT_0 add_char = True elif char == "e" or char == 'E': - state = State.INTEGER_EXP_0 + state = INTEGER_EXP_0 add_char = True elif is_delimiter(char): - state = State.WHITESPACE + state = WHITESPACE completed = True - now_token = (TokenType.NUMBER, 0) + now_token = (NUMBER, 0) advance = False else: raise ValueError("A 0 must be followed by a '.' or a 'e'. Got '{0}'".format(char)) - elif state == State.INTEGER_SIGN: + elif state == INTEGER_SIGN: if char == "0": - state = State.INTEGER_0 + state = INTEGER_0 add_char = True elif char in "123456789": - state = State.INTEGER + state = INTEGER add_char = True else: raise ValueError("A - must be followed by a digit. 
Got '{0}'".format(char)) - elif state == State.INTEGER_EXP_0: + elif state == INTEGER_EXP_0: if char == "+" or char == "-" or char in "0123456789": - state = State.INTEGER_EXP + state = INTEGER_EXP add_char = True else: raise ValueError("An e in a number must be followed by a '+', '-' or digit. Got '{0}'".format(char)) - elif state == State.INTEGER_EXP: + elif state == INTEGER_EXP: if char in "0123456789": add_char = True elif is_delimiter(char): completed = True - now_token = (TokenType.NUMBER, float("".join(token))) - state = State.WHITESPACE + now_token = (NUMBER, float("".join(token))) + state = WHITESPACE advance = False else: raise ValueError("A number exponent must consist only of digits. Got '{}'".format(char)) - elif state == State.FLOATING_POINT: + elif state == FLOATING_POINT: if char in "0123456789": add_char = True elif char == "e" or char == "E": - state = State.INTEGER_EXP_0 + state = INTEGER_EXP_0 add_char = True elif is_delimiter(char): completed = True - now_token = (TokenType.NUMBER, float("".join(token))) - state = State.WHITESPACE + now_token = (NUMBER, float("".join(token))) + state = WHITESPACE advance = False else: raise ValueError("A number must include only digits") - elif state == State.FLOATING_POINT_0: + elif state == FLOATING_POINT_0: if char in "0123456789": - state = State.FLOATING_POINT + state = FLOATING_POINT add_char = True else: raise ValueError("A number with a decimal point must be followed by a fractional part") - elif state == State.FALSE_1: + elif state == FALSE_1: if char == "a": - state = State.FALSE_2 + state = FALSE_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.FALSE_2: + elif state == FALSE_2: if char == "l": - state = State.FALSE_3 + state = FALSE_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.FALSE_3: + elif state == FALSE_3: if char == "s": - state = State.FALSE_4 + state = FALSE_4 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.FALSE_4: + elif state == FALSE_4: if char == "e": - state = State.WHITESPACE + state = WHITESPACE completed = True - now_token = (TokenType.BOOLEAN, False) + now_token = (BOOLEAN, False) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.TRUE_1: + elif state == TRUE_1: if char == "r": - state = State.TRUE_2 + state = TRUE_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.TRUE_2: + elif state == TRUE_2: if char == "u": - state = State.TRUE_3 + state = TRUE_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.TRUE_3: + elif state == TRUE_3: if char == "e": - state = State.WHITESPACE + state = WHITESPACE completed = True - now_token = (TokenType.BOOLEAN, True) + now_token = (BOOLEAN, True) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.NULL_1: + elif state == NULL_1: if char == "u": - state = State.NULL_2 + state = NULL_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.NULL_2: + elif state == NULL_2: if char == "l": - state = State.NULL_3 + state = NULL_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) - elif state == State.NULL_3: + elif state == NULL_3: if char == "l": - state = State.WHITESPACE + state = WHITESPACE completed = True - now_token = (TokenType.NULL, None) + now_token = (NULL, None) else: raise ValueError("Invalid JSON character: 
'{0}'".format(char)) - elif state == State.STRING: + elif state == STR: reader: JsonStringReader = now_token[1] try: s = reader.read() finally: index += reader.index if not strings_as_streams: - now_token = (TokenType.STRING, s) + now_token = (STRING, s) completed = True buffer = reader.buffer - state = State.STRING_END - elif state == State.STRING_END: + state = STR_END + elif state == STR_END: if is_delimiter(char): advance = False - state = State.WHITESPACE + state = WHITESPACE else: raise ValueError("Expected whitespace or an operator after string. Got '{}'".format(char)) @@ -282,7 +280,7 @@ def process_char(char): return advance - state = State.WHITESPACE + state = WHITESPACE buffer = stream.read(io.DEFAULT_BUFFER_SIZE) c = None index = -1 @@ -300,6 +298,6 @@ def process_char(char): token = [] yield now_token - process_char(SpecialChar.EOF) + process_char(EOF) if completed: yield now_token diff --git a/src/json_stream/tokenizer/tests/test_tokenizer.py b/src/json_stream/tokenizer/tests/test_tokenizer.py index 141755f..fbf3812 100644 --- a/src/json_stream/tokenizer/tests/test_tokenizer.py +++ b/src/json_stream/tokenizer/tests/test_tokenizer.py @@ -9,7 +9,7 @@ from io import StringIO from unittest import TestCase -from json_stream.tokenizer import tokenize, TokenType +from json_stream.tokenizer import tokenize, NUMBER, OPERATOR, STRING class TestJsonTokenization(TestCase): @@ -21,21 +21,21 @@ def assertNumberEquals(self, expected, actual): self.assertEqual(1, len(token_list)) ttype, token = token_list[0] self.assertEqual(expected, token) - self.assertEqual(ttype, TokenType.NUMBER) + self.assertEqual(ttype, NUMBER) def assertOperatorEquals(self, expected, actual): token_list = self.tokenize_sequence(actual) ttype, token = token_list[0] self.assertEqual(expected, token) - self.assertEqual(ttype, TokenType.OPERATOR) + self.assertEqual(ttype, OPERATOR) def assertStringEquals(self, *, expected, json_input): token_list = self.tokenize_sequence(json_input) self.assertEqual(1, len(token_list)) ttype, token = token_list[0] self.assertEqual(expected, token) - self.assertEqual(ttype, TokenType.STRING) + self.assertEqual(ttype, STRING) def test_number_parsing(self): self.assertNumberEquals(0, "0") diff --git a/src/json_stream/visitor.py b/src/json_stream/visitor.py index 99edd38..7570679 100644 --- a/src/json_stream/visitor.py +++ b/src/json_stream/visitor.py @@ -1,4 +1,4 @@ -from json_stream.base import StreamingJSONObject, StreamingJSONList, StreamingJSONBase +from json_stream.base import StreamingJSONObject, StreamingJSONList, factory from json_stream.iterators import ensure_file from json_stream.select_tokenizer import default_tokenizer @@ -23,5 +23,5 @@ def visit(fp_or_iterator, visitor, tokenizer=default_tokenizer): fp = ensure_file(fp_or_iterator) token_stream = tokenizer(fp) _, token = next(token_stream) - obj = StreamingJSONBase.factory(token, token_stream, persistent=False) + obj = factory[False, token](token_stream) _visit(obj, visitor, ())