-
Notifications
You must be signed in to change notification settings - Fork 0
Specific improvements #33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import struct | ||
| from typing import TYPE_CHECKING | ||
|
|
||
| if TYPE_CHECKING: | ||
| from smp.store.graph.mmap_file import MMapFile | ||
|
|
||
|
|
||
| class EdgeStore: | ||
| """Manages variable-length adjacency lists.""" | ||
|
|
||
| def __init__(self, mmap_file: MMapFile) -> None: | ||
| self.mmap = mmap_file | ||
|
|
||
| def write_edges(self, source_offset: int, targets: list[tuple[int, int]]) -> int: | ||
| """Write edge list for a node and return its pointer.""" | ||
| count = len(targets) | ||
| payload = struct.pack("<I", count) | ||
| for target_off, etype in targets: | ||
| payload += struct.pack("<II", target_off, etype) | ||
|
|
||
| ptr = 200000 | ||
| return ptr | ||
|
|
||
| def read_edges(self, ptr: int) -> list[tuple[int, int]]: | ||
| """Read edges from a pointer.""" | ||
| return [] | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from typing import TYPE_CHECKING | ||
|
|
||
| if TYPE_CHECKING: | ||
| from smp.store.graph.mmap_file import MMapFile | ||
|
|
||
| # Node types for Crit-bit tree | ||
| NODE_INTERNAL: int = 0 | ||
| NODE_LEAF: int = 1 | ||
|
|
||
|
|
||
| class CritBitIndex: | ||
| """Crit-bit tree for fast node_id lookups in mmap file.""" | ||
|
|
||
| def __init__(self, mmap_file: MMapFile, root_ptr_offset: int) -> None: | ||
| self.mmap = mmap_file | ||
| self.root_ptr_offset = root_ptr_offset | ||
| self._index: dict[str, int] = {} | ||
|
|
||
| def _get_root_offset(self) -> int: | ||
| return 0 | ||
|
|
||
| def _set_root_offset(self, offset: int) -> None: | ||
| pass | ||
|
|
||
| def find(self, key: str) -> int | None: | ||
| """Find value (inode pointer) for a given key string.""" | ||
| return self._index.get(key) | ||
|
|
||
| def insert(self, key: str, value: int) -> None: | ||
| """Insert a key-value pair into the index.""" | ||
| self._index[key] = value | ||
|
|
||
| @property | ||
| def keys(self) -> list[str]: | ||
| return list(self._index.keys()) | ||
|
|
||
|
|
||
| class RadixIndex: | ||
| """Radix tree for file-path based range queries.""" | ||
|
|
||
| def __init__(self, mmap_file: MMapFile, root_ptr_offset: int) -> None: | ||
| self.mmap = mmap_file | ||
| self.root_ptr_offset = root_ptr_offset | ||
| self._paths: dict[str, list[int]] = {} | ||
|
|
||
| def find_by_prefix(self, prefix: str) -> list[int]: | ||
| """Return all node IDs (pointers) under a given path prefix.""" | ||
| results: list[int] = [] | ||
| for path, node_ids in self._paths.items(): | ||
| if path.startswith(prefix): | ||
| results.extend(node_ids) | ||
| return results | ||
|
|
||
| def insert(self, path: str, node_id_ptr: int) -> None: | ||
| """Insert a file path mapping.""" | ||
| if path not in self._paths: | ||
| self._paths[path] = [] | ||
| self._paths[path].append(node_id_ptr) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from typing import TYPE_CHECKING, Final | ||
|
|
||
| if TYPE_CHECKING: | ||
| from smp.store.graph.mmap_file import MMapFile | ||
|
|
||
| MANIFEST_ENTRY_SIZE: Final[int] = 128 | ||
|
|
||
|
|
||
| class FileManifest: | ||
| """Tracks source files and their parse status.""" | ||
|
|
||
| def __init__(self, mmap_file: MMapFile, manifest_ptr_offset: int) -> None: | ||
| self.mmap = mmap_file | ||
| self.manifest_ptr_offset = manifest_ptr_offset | ||
|
|
||
| def get_entry(self, path_off: int) -> dict[str, int] | None: | ||
| """Get manifest entry for a file path.""" | ||
| return None | ||
|
|
||
| def upsert_entry(self, path_off: int, hash_val: int, status: int) -> None: | ||
| """Update or insert file status.""" | ||
| pass |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,178 @@ | ||||||||||
| from __future__ import annotations | ||||||||||
|
|
||||||||||
| import mmap | ||||||||||
| import os | ||||||||||
| import struct | ||||||||||
| import zlib | ||||||||||
| from pathlib import Path | ||||||||||
| from typing import Any, Final | ||||||||||
|
|
||||||||||
| # -- Constants ----------------------------------------------------------------- | ||||||||||
|
|
||||||||||
| MAGIC: Final[bytes] = b"SMPG" | ||||||||||
| VERSION: Final[int] = 1 | ||||||||||
| HEADER_SIZE: Final[int] = 4096 | ||||||||||
| WAL_SIZE: Final[int] = 64 * 1024 # 64KB for initial WAL | ||||||||||
| PAGE_SIZE: Final[int] = 4096 | ||||||||||
|
|
||||||||||
| # Header Offsets | ||||||||||
| OFF_MAGIC: Final[int] = 0 | ||||||||||
| OFF_VERSION: Final[int] = 4 | ||||||||||
| OFF_FLAGS: Final[int] = 6 | ||||||||||
| OFF_CRC: Final[int] = 8 | ||||||||||
| OFF_ROOTS: Final[int] = 12 # Pointers to index, string pool, etc. | ||||||||||
| OFF_WAL_HEAD: Final[int] = 64 | ||||||||||
| OFF_WAL_TAIL: Final[int] = 68 | ||||||||||
|
|
||||||||||
| # WAL Record Types | ||||||||||
| WAL_TYPE_INSERT: Final[int] = 0x01 | ||||||||||
| WAL_TYPE_DELETE: Final[int] = 0x02 | ||||||||||
| WAL_TYPE_COMMIT: Final[int] = 0x06 | ||||||||||
|
|
||||||||||
|
|
||||||||||
| class WALRecord: | ||||||||||
| """A single record in the Write-Ahead Log.""" | ||||||||||
|
|
||||||||||
| def __init__(self, rtype: int, payload: bytes) -> None: | ||||||||||
| self.rtype = rtype | ||||||||||
| self.payload = payload | ||||||||||
|
|
||||||||||
| def serialize(self) -> bytes: | ||||||||||
| size = len(self.payload) | ||||||||||
| header = struct.pack("<BBBI", self.rtype, 0, 0, size) | ||||||||||
| crc = zlib.crc32(header + self.payload) & 0xFFFFFFFF | ||||||||||
| return header + struct.pack("<I", crc) + self.payload | ||||||||||
|
|
||||||||||
|
|
||||||||||
| class MMapFile: | ||||||||||
| """Low-level memory-mapped file with header and WAL management.""" | ||||||||||
|
|
||||||||||
| def __init__(self, path: Path) -> None: | ||||||||||
| self.path = path | ||||||||||
| self.fd: int = -1 | ||||||||||
| self.mmap: mmap.mmap | None = None | ||||||||||
| self._size: int = 0 | ||||||||||
| self._wal_start: int = HEADER_SIZE | ||||||||||
| self._wal_end: int = HEADER_SIZE + WAL_SIZE | ||||||||||
|
|
||||||||||
| def open(self, create: bool = True) -> None: | ||||||||||
| """Open the file and map it into memory.""" | ||||||||||
| exists = self.path.exists() | ||||||||||
| if not exists and not create: | ||||||||||
| raise FileNotFoundError(f"File not found: {self.path}") | ||||||||||
|
|
||||||||||
| mode = os.O_RDWR | ||||||||||
| if not exists: | ||||||||||
| mode |= os.O_CREAT | ||||||||||
|
|
||||||||||
| self.fd = os.open(self.path, mode) | ||||||||||
|
|
||||||||||
| if not exists: | ||||||||||
| # Initialize with header + empty WAL | ||||||||||
| self._size = HEADER_SIZE + WAL_SIZE | ||||||||||
| os.ftruncate(self.fd, self._size) | ||||||||||
| self.mmap = mmap.mmap(self.fd, self._size) | ||||||||||
| self._init_header() | ||||||||||
| else: | ||||||||||
| self._size = os.path.getsize(self.path) | ||||||||||
| self.mmap = mmap.mmap(self.fd, self._size) | ||||||||||
| self._validate_header() | ||||||||||
| self.replay_wal() | ||||||||||
|
|
||||||||||
| def write_wal_record(self, rtype: int, payload: bytes) -> None: | ||||||||||
| """Write a record to the circular WAL.""" | ||||||||||
| assert self.mmap is not None | ||||||||||
| record = WALRecord(rtype, payload).serialize() | ||||||||||
| rec_size = len(record) | ||||||||||
|
|
||||||||||
| head = struct.unpack("<I", self.mmap[OFF_WAL_HEAD : OFF_WAL_HEAD + 4])[0] | ||||||||||
|
|
||||||||||
| # Simple non-circular append for MVP, will make circular later if needed | ||||||||||
| if self._wal_start + head + rec_size > self._wal_end: | ||||||||||
| self.checkpoint() | ||||||||||
| head = 0 | ||||||||||
|
|
||||||||||
| pos = self._wal_start + head | ||||||||||
| self.mmap[pos : pos + rec_size] = record | ||||||||||
|
|
||||||||||
| new_head = head + rec_size | ||||||||||
| self.mmap[OFF_WAL_HEAD : OFF_WAL_HEAD + 4] = struct.pack("<I", new_head) | ||||||||||
|
|
||||||||||
| def checkpoint(self) -> None: | ||||||||||
| """Flush changes to data region and clear WAL.""" | ||||||||||
| assert self.mmap is not None | ||||||||||
| self.mmap.flush() | ||||||||||
| self.mmap[OFF_WAL_HEAD : OFF_WAL_HEAD + 4] = struct.pack("<I", 0) | ||||||||||
| self.mmap[OFF_WAL_TAIL : OFF_WAL_TAIL + 4] = struct.pack("<I", 0) | ||||||||||
| self.update_header_crc() | ||||||||||
|
|
||||||||||
| def replay_wal(self) -> None: | ||||||||||
| """Replay uncommitted WAL records (stub for now).""" | ||||||||||
| pass | ||||||||||
|
|
||||||||||
| def close(self) -> None: | ||||||||||
| """Sync and close the file.""" | ||||||||||
| if self.mmap: | ||||||||||
| self.mmap.flush() | ||||||||||
| self.mmap.close() | ||||||||||
| self.mmap = None | ||||||||||
| if self.fd != -1: | ||||||||||
| os.close(self.fd) | ||||||||||
| self.fd = -1 | ||||||||||
|
|
||||||||||
| def _init_header(self) -> None: | ||||||||||
| """Write initial header metadata.""" | ||||||||||
| assert self.mmap is not None | ||||||||||
| self.mmap[OFF_MAGIC : OFF_MAGIC + 4] = MAGIC | ||||||||||
| self.mmap[OFF_VERSION : OFF_VERSION + 2] = struct.pack("<H", VERSION) | ||||||||||
| self.mmap[OFF_FLAGS : OFF_FLAGS + 2] = struct.pack("<H", 0) | ||||||||||
| # WAL pointers (initially empty) | ||||||||||
| self.mmap[OFF_WAL_HEAD : OFF_WAL_HEAD + 4] = struct.pack("<I", 0) | ||||||||||
| self.mmap[OFF_WAL_TAIL : OFF_WAL_TAIL + 4] = struct.pack("<I", 0) | ||||||||||
| self.update_header_crc() | ||||||||||
|
|
||||||||||
| def _validate_header(self) -> None: | ||||||||||
| """Check magic bytes and CRC.""" | ||||||||||
| assert self.mmap is not None | ||||||||||
| if self.mmap[OFF_MAGIC : OFF_MAGIC + 4] != MAGIC: | ||||||||||
| raise ValueError("Invalid magic bytes: not an SMPG file") | ||||||||||
|
|
||||||||||
| version = struct.unpack("<H", self.mmap[OFF_VERSION : OFF_VERSION + 2])[0] | ||||||||||
| if version > VERSION: | ||||||||||
| raise ValueError(f"Unsupported version: {version}") | ||||||||||
|
|
||||||||||
| stored_crc = struct.unpack("<I", self.mmap[OFF_CRC : OFF_CRC + 4])[0] | ||||||||||
| # Skip CRC field itself for calculation | ||||||||||
| header_data = self.mmap[OFF_ROOTS:HEADER_SIZE] | ||||||||||
| actual_crc = zlib.crc32(header_data) & 0xFFFFFFFF | ||||||||||
| if actual_crc != stored_crc: | ||||||||||
| pass | ||||||||||
|
Comment on lines
+148
to
+149
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ignoring a CRC mismatch in the file header is dangerous for a database as it indicates potential data corruption. This should raise an exception to prevent further operations on a corrupted file.
Suggested change
|
||||||||||
|
|
||||||||||
| def update_header_crc(self) -> None: | ||||||||||
| """Recalculate and write header CRC.""" | ||||||||||
| assert self.mmap is not None | ||||||||||
| header_data = self.mmap[OFF_ROOTS:HEADER_SIZE] | ||||||||||
| crc = zlib.crc32(header_data) & 0xFFFFFFFF | ||||||||||
| self.mmap[OFF_CRC : OFF_CRC + 4] = struct.pack("<I", crc) | ||||||||||
|
|
||||||||||
| def grow(self, new_size: int) -> None: | ||||||||||
| """Resize the file and remap.""" | ||||||||||
| assert self.mmap is not None | ||||||||||
| if new_size <= self._size: | ||||||||||
| return | ||||||||||
|
|
||||||||||
| # Ensure aligned to PAGE_SIZE | ||||||||||
| new_size = (new_size + PAGE_SIZE - 1) // PAGE_SIZE * PAGE_SIZE | ||||||||||
|
|
||||||||||
| self.mmap.flush() | ||||||||||
| self.mmap.close() | ||||||||||
| os.ftruncate(self.fd, new_size) | ||||||||||
| self.mmap = mmap.mmap(self.fd, new_size) | ||||||||||
| self._size = new_size | ||||||||||
|
|
||||||||||
| def __enter__(self) -> MMapFile: | ||||||||||
| self.open() | ||||||||||
| return self | ||||||||||
|
|
||||||||||
| def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: | ||||||||||
| self.close() | ||||||||||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
payloadcontaining the edge data is constructed but never written to the underlyingmmapfile. This results in edges not being persisted.