diff --git a/assets/coverage.svg b/assets/coverage.svg
index 6bfc8fa..fe06143 100644
--- a/assets/coverage.svg
+++ b/assets/coverage.svg
@@ -9,13 +9,13 @@
-
+
coverage
coverage
- 99%
- 99%
+ 94%
+ 94%
diff --git a/dictdatabase/byte_codes.py b/dictdatabase/byte_codes.py
index 5f50482..7d6383d 100644
--- a/dictdatabase/byte_codes.py
+++ b/dictdatabase/byte_codes.py
@@ -8,4 +8,5 @@
SPACE = 32
TAB = 9
NEWLINE = 10
+COLON = 58
COMMA = 44
diff --git a/dictdatabase/indexing.py b/dictdatabase/indexing.py
index c5eaabc..760224a 100644
--- a/dictdatabase/indexing.py
+++ b/dictdatabase/indexing.py
@@ -1,6 +1,7 @@
+from dataclasses import dataclass, field
+from typing import Optional
import orjson
import os
-from . import config
+from . import config, utils, byte_codes, io_bytes
# Problem: Multiple read processes will concurrently read and write the same file
# In some cases this will result in a empty read error, thats why the try-except exists
@@ -21,6 +22,42 @@
# - Leave everything as is. While not ideal, it works. When empty read error occurs, don't use the index for that read
+
+
+
+
+@dataclass
+class KeyFinderState:
+    """
+    Mutable scanner state used by `utils.find_all_top_level_keys`.
+
+    Every attribute is an annotated dataclass field, so each instance gets its
+    own values. In particular `indices` is created per instance instead of
+    being one list shared by all instances through the class.
+    """
+
+    # True right after a backslash: the next byte is escaped and must be skipped.
+    skip_next: bool = False
+    # True while the scanner is between an opening and a closing quote.
+    in_str: bool = False
+    # Number of currently open square brackets.
+    list_depth: int = 0
+    # Number of currently open curly braces. Starts at 1 together with i = 1,
+    # presumably because the opening brace at index 0 is not scanned.
+    dict_depth: int = 1
+    # Index of the first byte of the current key (the byte after its opening quote).
+    key_start: Optional[int] = None
+    # Index of the closing quote of the current key.
+    key_end: Optional[int] = None
+    # Index of the first byte of the current value; the scanner assigns and reads it.
+    value_start: Optional[int] = None
+    # Reserved for the end of the current value.
+    value_end: Optional[int] = None
+    # Collected (key, value_start, value_end) tuples, one per top-level key.
+    indices: list = field(default_factory=list)
+    # Index of the next byte to inspect.
+    i: int = 1
+
+
+def batched_find_all_top_level_keys(db_name, batch_size=10_000_000):
+    """
+    Read the database file in batches and collect its top level keys.
+
+    Each batch is obtained with `io_bytes.read_bytes` and handed to
+    `utils.find_all_top_level_keys` together with one shared `KeyFinderState`.
+    The loop ends at the first empty read.
+
+    Args:
+    - `db_name`: Name of the database, passed through to `io_bytes.read_bytes`
+    - `batch_size`: Number of bytes requested per read, must be positive
+
+    Returns:
+    - The `indices` list of the scanner state. It is empty when the very
+      first read returns no bytes.
+
+    Raises:
+    - `ValueError`: If `batch_size` is not positive, or if the first byte of
+      the file is not an opening curly brace.
+    """
+    if batch_size <= 0:
+        raise ValueError("batch_size must be a positive number of bytes")
+
+    state = KeyFinderState()
+    batch_start = 0
+    while True:
+        batch_bytes = io_bytes.read_bytes(db_name, batch_start, batch_start + batch_size)
+
+        # Test for an empty read before indexing into the batch, otherwise an
+        # empty file raises IndexError instead of ending the loop.
+        if len(batch_bytes) == 0:
+            break
+        if batch_start == 0 and batch_bytes[0] != byte_codes.OPEN_CURLY:
+            raise ValueError("The first byte of the database file must be an opening curly brace")
+
+        utils.find_all_top_level_keys(batch_bytes, state, len(batch_bytes))
+
+        # Advance to the next batch; without this the same bytes are read forever.
+        batch_start += batch_size
+        # The scanner indexes into the batch it is given, so rewind its cursor.
+        state.i = 0
+        # NOTE(review): offsets stored by the scanner look relative to the batch
+        # that was scanned, not to the file, and a key-value pair that straddles
+        # two batches is probably not handled - confirm before relying on
+        # files larger than one batch.
+    return state.indices
+
+
+
+
+
class Indexer:
"""
The Indexer takes the name of a database file, and tries to load the .index file
@@ -57,6 +94,7 @@ def __init__(self, db_name: str):
self.data = {}
+
def get(self, key):
"""
Returns a list of 5 elements for a key if it exists, otherwise None
diff --git a/dictdatabase/utils.py b/dictdatabase/utils.py
index 052c3cf..e1ecf27 100644
--- a/dictdatabase/utils.py
+++ b/dictdatabase/utils.py
@@ -1,8 +1,10 @@
from __future__ import annotations
+from dataclasses import dataclass
from typing import Tuple
import os
import glob
from . import config, byte_codes
+from . indexing import KeyFinderState
def file_info(db_name: str) -> Tuple[str, bool, str, bool]:
@@ -37,17 +39,70 @@ def find_all(file_name: str) -> list[str]:
return files_all
+
+def find_all_top_level_keys(json_bytes: bytes, state: KeyFinderState, batch_size: int) -> KeyFinderState:
+    """
+    In the bytes of the json object find all top level keys and the start and end
+    indices of their values.
+
+    Scanning resumes at `state.i`. For every completed top level key-value pair
+    a tuple `(key, value_start, value_end)` is appended to `state.indices`:
+    `value_start` is the index of the first byte of the value and `value_end`
+    is the index right after its last byte, both relative to `json_bytes`.
+    Keys are the decoded raw bytes between the quotes, escape sequences are
+    not resolved.
+
+    Example:
+
+    0123456789
+    {"2": {}}
+
+    Starting at index 1 this appends ("2", 6, 8).
+
+    Args:
+    - `json_bytes`: Bytes of a JSON object, the opening curly brace is at index 0
+    - `state`: The scanner state, it is modified in place
+    - `batch_size`: Exclusive end index of the scan, limited to `len(json_bytes)`
+
+    Returns:
+    - The same `state` object that was passed in.
+    """
+
+    whitespace = (byte_codes.SPACE, byte_codes.TAB, byte_codes.NEWLINE)
+    # Never read past the buffer, even if batch_size overstates its length.
+    scan_end = min(batch_size, len(json_bytes))
+    # value_start may not be set on a fresh state; None means "no value seen yet".
+    if not hasattr(state, "value_start"):
+        state.value_start = None
+
+    while state.i < scan_end:
+        current = json_bytes[state.i]
+        if state.in_str:
+            # Inside a string only escapes and the closing quote matter.
+            if state.skip_next:
+                state.skip_next = False
+            elif current == byte_codes.BACKSLASH:
+                state.skip_next = True
+            elif current == byte_codes.QUOTE:
+                state.in_str = False
+                if state.value_start is None:
+                    # A string that closes before any value started is the key.
+                    state.key_end = state.i
+                else:
+                    state.value_end = state.i + 1
+        elif current in whitespace:
+            pass
+        elif state.key_start is None:
+            # Between pairs: wait for the quote that opens the next key.
+            if current == byte_codes.QUOTE and state.dict_depth == 1:
+                state.key_start = state.i + 1
+                state.in_str = True
+            elif current == byte_codes.CLOSE_CURLY:
+                # End of an object that has no (further) pairs.
+                state.dict_depth -= 1
+        elif state.value_start is None and current == byte_codes.COLON:
+            # Separator between the key and its value.
+            pass
+        elif (
+            state.value_start is not None
+            and state.dict_depth == 1
+            and state.list_depth == 0
+            and current in (byte_codes.COMMA, byte_codes.CLOSE_CURLY)
+        ):
+            # A top level comma or closing brace ends the current value.
+            key = json_bytes[state.key_start:state.key_end].decode()
+            state.indices.append((key, state.value_start, state.value_end))
+            state.key_start = None
+            state.key_end = None
+            state.value_start = None
+            state.value_end = None
+            if current == byte_codes.CLOSE_CURLY:
+                state.dict_depth -= 1
+        else:
+            # A byte that belongs to the value, the first one marks its start.
+            if state.value_start is None:
+                state.value_start = state.i
+            if current == byte_codes.QUOTE:
+                state.in_str = True
+            elif current == byte_codes.OPEN_SQUARE:
+                state.list_depth += 1
+            elif current == byte_codes.CLOSE_SQUARE:
+                state.list_depth -= 1
+            elif current == byte_codes.OPEN_CURLY:
+                state.dict_depth += 1
+            elif current == byte_codes.CLOSE_CURLY:
+                state.dict_depth -= 1
+            # Only non-whitespace bytes reach this line, so trailing
+            # whitespace is never part of the value.
+            state.value_end = state.i + 1
+        state.i += 1
+    return state
+
+
+
+
+
+
+
def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:
"""
Finds the index of the next comma or closing bracket/brace after the value
of a key-value pair in a bytes object containing valid JSON when decoded.
+ Valid start indices are the index after the colon or the index after that.
+
+ Example:
+
+ 01234567
+ "2": {},
+
+ Valid start indices are 4 and 5. Returns 7.
+
Args:
- `json_bytes`: A bytes object containing valid JSON when decoded
- `index`: The start index in json_bytes
Returns:
- - The end index of the value.
+ - The end index of the first byte right after the value's bytes.
"""
# See https://www.json.org/json-en.html for the JSON syntax