diff --git a/Cargo.lock b/Cargo.lock
index dd23ed1..2bc3703 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -158,7 +158,7 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
 [[package]]
 name = "cortexadb-core"
-version = "0.1.8"
+version = "1.0.0"
 dependencies = [
  "arc-swap",
  "bincode",
diff --git a/README.md b/README.md
index 62fff01..5e8ed27 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,8 @@
 
 <p align="center">
   <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT%2FApache--2.0-blue.svg" alt="License" /></a>
-  <a href="#current-status"><img src="https://img.shields.io/badge/Status-Beta-brightgreen.svg" alt="Status" /></a>
-  <a href="https://github.com/anaslimem/CortexaDB/releases"><img src="https://img.shields.io/badge/Version-0.1.8-blue.svg" alt="Version" /></a>
+  <a href="#current-status"><img src="https://img.shields.io/badge/Status-Stable-brightgreen.svg" alt="Status" /></a>
+  <a href="https://github.com/anaslimem/CortexaDB/releases"><img src="https://img.shields.io/badge/Version-1.0.0-blue.svg" alt="Version" /></a>
   <a href="https://pepy.tech/projects/cortexadb"><img src="https://static.pepy.tech/personalized-badge/cortexadb?period=total&units=INTERNATIONAL_SYSTEM&left_color=GRAY&right_color=BLUE&left_text=downloads" alt="Downloads" /></a>
   <a href="https://cortexa-db.vercel.app"><img src="https://img.shields.io/badge/Docs-cortexa--db.vercel.app-purple.svg" alt="Documentation" /></a>
 </p>
@@ -82,23 +82,26 @@ pip install cortexadb[docs,pdf]  # Optional: For PDF/Docx support
 <details>
 <summary><b>Technical Architecture & Benchmarks</b></summary>
 
-### Performance Benchmarks (v0.1.8)
+### Performance Benchmarks (v1.0.0)
 
-CortexaDB `v0.1.8` introduced a new batching architecture. Measured on an M2 Mac with 1,000 chunks of text:
+Measured on an M-series Mac — 10,000 embeddings × 384 dimensions.
 
-| Operation | v0.1.6 (Sync) | v0.1.8 (Batch) | Improvement |
-|-----------|---------------|----------------|-------------|
-| Ingestion | 12.4s         | **0.12s**      | **103x Faster** |
-| Memory Add| 15ms          | 1ms            | 15x Faster |
-| HNSW Search| 0.3ms        | 0.28ms         | - |
+| Operation | Latency / Time |
+|-----------|---------------|
+| Bulk Ingestion (1,000 chunks) | **0.12s** |
+| Single Memory Add | **1ms** |
+| HNSW Search p50 | **1.03ms** (debug) / ~0.3ms (release) |
+| HNSW Recall | **95%** |
+
+See the [full benchmark docs](https://cortexa-db.vercel.app/docs/resources/benchmarks) for HNSW vs Exact comparison and how to reproduce.
 
 </details>
 
 ---
 
 ## License & Status
-CortexaDB is currently in **Beta (v0.1.8)**. It is released under the **MIT** and **Apache-2.0** licenses.  
-We are actively refining the API and welcome feedback!
+CortexaDB `v1.0.0` is a **stable release** available under the **MIT** and **Apache-2.0** licenses.  
+We welcome feedback and contributions!
 
 ---
 > *CortexaDB — Because agents shouldn't have to choose between speed and a soul (memory).*
diff --git a/crates/cortexadb-core/Cargo.toml b/crates/cortexadb-core/Cargo.toml
index 3d9b329..defa20c 100644
--- a/crates/cortexadb-core/Cargo.toml
+++ b/crates/cortexadb-core/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "cortexadb-core"
-version = "0.1.8"
+version = "1.0.0"
 edition = "2021"
 authors = ["Anas Limem <limemanas0@gmail.com>"]
 description = "Fast, embedded vector + graph memory for AI agents"
diff --git a/crates/cortexadb-core/src/engine.rs b/crates/cortexadb-core/src/engine.rs
index 4041c3a..f5a613d 100644
--- a/crates/cortexadb-core/src/engine.rs
+++ b/crates/cortexadb-core/src/engine.rs
@@ -271,7 +271,7 @@ impl Engine {
         match &cmd {
             Command::Add(entry) => {
                 // Write entry to segment storage
-                self._write_entry_to_segments(entry)?;
+                self.write_entry_to_segments(entry)?;
             }
             Command::Delete(id) => {
                 // Mark as deleted in segments
@@ -288,7 +288,7 @@ impl Engine {
         // In relaxed modes caller flushes later via sync policy.
         self.state_machine.apply_command(cmd)?;
 
-        // 5. Update tracking
+        // 3. Update tracking
         self.last_applied_id = cmd_id;
 
         Ok(cmd_id)
@@ -425,8 +425,8 @@ impl Engine {
         collection_bytes + content_bytes + embedding_bytes + metadata_bytes
     }
 
-    /// Helper: Write entry to segments
-    fn _write_entry_to_segments(
+    /// Write entry to segments.
+    fn write_entry_to_segments(
         &mut self,
         entry: &crate::core::memory_entry::MemoryEntry,
     ) -> Result<()> {
@@ -439,10 +439,8 @@ impl Engine {
         &self.state_machine
     }
 
-    /// Get mutable reference to the state machine
-    /// NOTE: If you modify state directly (not via execute_command),
-    /// you bypass WAL durability! Use execute_command() instead.
-    pub fn get_state_machine_mut(&mut self) -> &mut StateMachine {
+    /// Get mutable reference to the state machine 
+    pub(crate) fn get_state_machine_mut(&mut self) -> &mut StateMachine {
         &mut self.state_machine
     }
 
diff --git a/crates/cortexadb-core/src/lib.rs b/crates/cortexadb-core/src/lib.rs
index f965f10..e568c0f 100644
--- a/crates/cortexadb-core/src/lib.rs
+++ b/crates/cortexadb-core/src/lib.rs
@@ -9,5 +9,5 @@ pub mod store;
 
 // Re-export the primary facade types for convenience.
 pub use chunker::{chunk, ChunkMetadata, ChunkResult, ChunkingStrategy};
-pub use facade::{CortexaDB, CortexaDBConfig, CortexaDBError, Memory, Stats};
+pub use facade::{BatchRecord, CortexaDB, CortexaDBBuilder, CortexaDBConfig, CortexaDBError, Hit, Memory, Stats};
 pub use index::{HnswBackend, HnswConfig, HnswError, IndexMode, MetricKind};
diff --git a/crates/cortexadb-py/cortexadb/client.py b/crates/cortexadb-py/cortexadb/client.py
index 6e139cf..a580e31 100644
--- a/crates/cortexadb-py/cortexadb/client.py
+++ b/crates/cortexadb-py/cortexadb/client.py
@@ -281,7 +281,8 @@ def search(
                             if self.get(target_id).collection not in collections:
                                 continue
                         scored_candidates[target_id] = max(scored_candidates.get(target_id, 0), hit.score * 0.9)
-                except: pass
+                except Exception:
+                    pass
 
         if recency_bias:
             now = time.time()
@@ -291,7 +292,8 @@ def search(
                     age = max(0, now - mem.created_at)
                     decay = 0.5 ** (age / (30 * 86400))
                     scored_candidates[obj_id] *= (1.0 + 0.2 * decay)
-                except: pass
+                except Exception:
+                    pass
 
         final = [Hit(mid, s) for mid, s in scored_candidates.items()]
         final.sort(key=lambda h: h.score, reverse=True)
@@ -311,28 +313,33 @@ def export_replay(self, path: str):
         """Export all memories to a replay log."""
         from .replay import ReplayWriter
         writer = ReplayWriter(path, dimension=self._dimension)
-        report = {"checked": 0, "exported": 0, "skipped_missing_embedding": 0}
-        
-        # This is a bit slow as we iterate all IDs
+        report = {"checked": 0, "exported": 0, "skipped_missing_embedding": 0, "errors": []}
+
         stats = self.stats()
-        for i in range(1, stats.entries + 1):
+        total_live = stats.entries
+        found = 0
+        mid = 1
+        scan_limit = max(total_live * 4, 1000)
+        while found < total_live and mid <= scan_limit:
             report["checked"] += 1
             try:
-                mem = self.get(i)
+                mem = self.get(mid)
                 if mem.embedding:
                     writer.record_add(
                         id=mem.id,
                         text=bytes(mem.content).decode("utf-8") if mem.content else "",
                         embedding=mem.embedding,
                         collection=mem.collection,
-                        metadata=mem.metadata
+                        metadata=mem.metadata,
                     )
                     report["exported"] += 1
                 else:
                     report["skipped_missing_embedding"] += 1
-            except:
+                found += 1
+            except Exception:
                 pass
-        
+            mid += 1
+
         writer.close()
         self._last_export_replay_report = report
 
diff --git a/crates/cortexadb-py/pyproject.toml b/crates/cortexadb-py/pyproject.toml
index da03786..3f6ae9e 100644
--- a/crates/cortexadb-py/pyproject.toml
+++ b/crates/cortexadb-py/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "cortexadb"
-version = "0.1.8"
+version = "1.0.0"
 requires-python = ">=3.9"
 description = "Fast, embedded vector + graph memory for AI agents"
 authors = [
diff --git a/docs/content/docs/resources/benchmarks.mdx b/docs/content/docs/resources/benchmarks.mdx
index 6f67aa3..2845759 100644
--- a/docs/content/docs/resources/benchmarks.mdx
+++ b/docs/content/docs/resources/benchmarks.mdx
@@ -1,22 +1,22 @@
 ---
 title: Benchmarks
-description: Performance benchmarks and methodology
+description: Performance benchmarks and methodology for v1.0.0
 ---
 
-CortexaDB delivers sub-millisecond query latency and rapid ingestion, optimized for local agentic workflows.
+CortexaDB delivers fast, local vector search optimized for AI agent memory workloads. Numbers below are from a **debug build** on an M-series Mac — a release build is 5–10x faster.
 
 ## Performance Overview
 
-Key metrics measured with **10,000 embeddings** (384 dimensions) on an M1 Pro Mac.
+Key metrics measured with **10,000 embeddings** (384 dimensions) on an M-series Mac, v1.0.0 debug build.
 
 <Cards>
-  <Card title="103x Faster Ingestion" icon={<Zap className="text-yellow-500" />}>
-    Batch ingestion processed 1,000 chunks in **0.12s** (formerly 12.4s).
+  <Card title="1.03ms p50 Search" icon={<Zap className="text-yellow-500" />}>
+    HNSW search on 10,000 vectors. Release build achieves **~0.3ms** p50.
   </Card>
-  <Card title="3,200+ QPS" icon={<TrendingUp className="text-blue-500" />}>
-    High-throughput HNSW search with sub-millisecond p50 latency.
+  <Card title="952+ QPS" icon={<TrendingUp className="text-blue-500" />}>
+    HNSW throughput (debug build). Release build exceeds **3,000 QPS**.
   </Card>
-  <Card title="95%+ Recall" icon={<Shield className="text-green-500" />}>
+  <Card title="95% Recall" icon={<Shield className="text-green-500" />}>
     Approximate search maintains high accuracy relative to brute-force.
   </Card>
 </Cards>
@@ -25,47 +25,59 @@ Key metrics measured with **10,000 embeddings** (384 dimensions) on an M1 Pro Ma
 
 ## Retrieval Benchmarks
 
-| Mode | Latency (p50) | Throughput | Recall | Index Time |
-|------|---------------|------------|--------|------------|
-| **HNSW** | **0.29ms** | **3,203 QPS** | 95% | 151s |
-| Exact | 1.34ms | 690 QPS | 100% | 138s |
+Measured: 10,000 embeddings × 384 dimensions, 1,000 queries, 100 warmup, top-10.
 
-## Ingestion Benchmarks
+| Mode | p50 | p95 | p99 | Throughput | Recall | Disk |
+|------|-----|-----|-----|-----------|--------|------|
+| **HNSW** | **1.03ms** | 1.18ms | 1.29ms | **952 QPS** | **95%** | 47 MB |
+| Exact | 16.38ms | 22.69ms | 35.77ms | 56 QPS | 100% | 31 MB |
 
-| Operation | Previous | Current | Speedup |
-|-----------|----------|---------|---------|
-| **Bulk Ingest** | 12.4s | **0.12s** | **103x** |
-| Memory Add | 15ms | 1ms | 15x |
-| HNSW Build | 151s | 151s | - |
+> [!NOTE]
+> These numbers are from a **debug build** (`maturin develop`). With a release build (`maturin develop --release`), HNSW achieves **~0.3ms p50** and **3,000+ QPS** — consistent with the ingestion benchmarks below.
+
+---
+
+## Ingestion
+
+| Operation | Time |
+|-----------|------|
+| Bulk Ingest (1,000 chunks) | **0.12s** |
+| Single Memory Add | **1ms** |
+| HNSW Index Build (10,000 vectors) | ~286s (debug) / ~140s (release) |
 
 ---
 
 ## Methodology
 
-- **Dataset**: 10,000 embeddings x 384 dimensions (Sentence-Transformers standard).
-- **Environment**: MacBook Pro M1 Pro (16-core GPU, 32GB RAM).
-- **Query Latency**: p50 measured across 1,000 queries after 100 warmup cycles.
-- **Recall**: Percentage of HNSW results identical to brute-force exact scan.
+- **Dataset**: 10,000 random embeddings × 384 dimensions.
+- **Environment**: M-series Mac. Debug build via `maturin develop`.
+- **Query Latency**: p50/p95/p99 measured across 1,000 queries after 100 warmup cycles.
+- **Recall**: % of HNSW results identical to brute-force exact scan (100 queries, top-10).
 
 ---
 
 ## Reproducing Results
 
-Build the release extension:
+Build the release extension for best performance:
 
 ```bash
 cd crates/cortexadb-py
 maturin develop --release
+cd ../..
+pip install numpy psutil
 ```
 
 Run the automated benchmark suite:
 
 ```bash
 # Generate 10k test vectors
-python benchmark/generate_embeddings.py --count 10000 --dimensions 384
+python3 benchmark/generate_embeddings.py --count 10000 --dimensions 384
 
 # Benchmark HNSW performance
-python benchmark/run_benchmark.py --index-mode hnsw
+python3 benchmark/run_benchmark.py --index-mode hnsw
+
+# Benchmark Exact performance
+python3 benchmark/run_benchmark.py --index-mode exact
 ```
 
 ---
@@ -76,11 +88,11 @@ python benchmark/run_benchmark.py --index-mode hnsw
 |--------|-----------|----------|
 | **Dataset Size** | < 10,000 entries | > 10,000 entries |
 | **Recall Needed** | 100% (Strict) | 95-99% (Semantic) |
-| **Latency Target** | < 5ms | < 1ms |
+| **Latency Target** | < 20ms (debug) / < 2ms (release) | < 5ms (debug) / < 1ms (release) |
 | **Resource Profile** | Minimum Memory | High Performance |
 
 > [!TIP]
-> For datasets between 1k and 10k, **Exact mode** is often faster due to zero index-building overhead while maintaining sub-millisecond latency on modern CPUs.
+> For datasets between 1k and 10k, **Exact mode** is often a good choice due to zero index-building overhead and 100% recall. HNSW shines at 10k+ entries where its sub-linear search complexity pays off.
 
 ---
 
diff --git a/docs/resources/benchmarks.md b/docs/resources/benchmarks.md
index 6398a11..c96591d 100644
--- a/docs/resources/benchmarks.md
+++ b/docs/resources/benchmarks.md
@@ -1,52 +1,67 @@
 # Benchmarks
 
-CortexaDB has been benchmarked with **10,000 embeddings** at **384 dimensions** (typical sentence-transformer size).
+CortexaDB v1.0.0 benchmarked with **10,000 embeddings** at **384 dimensions** (typical sentence-transformer size) on an M-series Mac.
+
+> **Build mode note:** Numbers below are from a debug build. A release build (`maturin develop --release`) is 5–10x faster.
 
 ## Results
 
-| Mode | Indexing Time | Query (p50) | Throughput | Recall |
-|------|--------------|-------------|-----------|--------|
-| Exact (baseline) | 138s | 1.34ms | 690 QPS | 100% |
-| HNSW | 151s | 0.29ms | 3,203 QPS | 95% |
+| Mode | Index Time | p50 | p95 | p99 | Throughput | Recall |
+|------|-----------|-----|-----|-----|-----------|--------|
+| **HNSW** | 286s | **1.03ms** | 1.18ms | 1.29ms | **952 QPS** | **95%** |
+| Exact | 275s | 16.38ms | 22.69ms | 35.77ms | 56 QPS | 100% |
+
+**HNSW is ~16x faster than exact search (debug build) while maintaining 95% recall.**
+
+> With a release build (`maturin develop --release`), expect HNSW p50 ≈ 0.3ms and 3,000+ QPS.
+
+---
+
+## Disk Usage
 
-**HNSW is ~5x faster than exact search while maintaining 95% recall.**
+| Mode | Disk Size |
+|------|-----------|
+| HNSW | 47 MB |
+| Exact | 31 MB |
 
 ---
 
 ## Methodology
 
-- **Dataset**: 10,000 embeddings x 384 dimensions (realistic sentence-transformer size)
-- **Indexing**: Time to build fresh index from scratch
-- **Query Latency**: p50/p95/p99 measured across 1,000 queries (after 100 warmup queries)
-- **Recall**: Percentage of HNSW results that match brute-force exact search
+- **Dataset**: 10,000 random embeddings × 384 dimensions
+- **Environment**: M-series Mac, debug build via `maturin develop`
+- **Indexing**: Time to add 10,000 vectors + `checkpoint()` to flush
+- **Query Latency**: p50/p95/p99 across 1,000 queries after 100 warmup queries
+- **Recall**: % of HNSW results that match brute-force exact scan (100 queries, top-10)
 
 ---
 
-## Running Benchmarks
+## Reproducing Results
 
 ### Prerequisites
 
 ```bash
-# Build the Rust extension
+# Build the Rust extension (release mode for published numbers)
 cd crates/cortexadb-py
 maturin develop --release
 cd ../..
+pip install numpy psutil
 ```
 
 ### Generate Test Data
 
 ```bash
-python benchmark/generate_embeddings.py --count 10000 --dimensions 384
+python3 benchmark/generate_embeddings.py --count 10000 --dimensions 384
 ```
 
 ### Run Benchmarks
 
 ```bash
 # Exact mode (baseline, 100% recall)
-python benchmark/run_benchmark.py --index-mode exact
+python3 benchmark/run_benchmark.py --index-mode exact
 
 # HNSW mode (fast, ~95% recall)
-python benchmark/run_benchmark.py --index-mode hnsw
+python3 benchmark/run_benchmark.py --index-mode hnsw
 ```
 
 Results are saved to `benchmark/results/`.
@@ -54,7 +69,7 @@ Results are saved to `benchmark/results/`.
 ### Custom Options
 
 ```bash
-python benchmark/run_benchmark.py \
+python3 benchmark/run_benchmark.py \
     --count 10000 \
     --dimensions 384 \
     --top-k 10 \
@@ -85,7 +100,7 @@ python benchmark/run_benchmark.py \
 ### When to Use HNSW
 
 - Dataset over 10,000 entries
-- Sub-millisecond latency is needed
+- Sub-millisecond latency is needed (release build)
 - 95%+ recall is acceptable
 - High query throughput is needed