From 30241ac5b5a1dd932ba7ca2b6dc367639ce27c3c Mon Sep 17 00:00:00 2001
From: Noah Lyons <n.lyons53@gmail.com>
Date: Wed, 25 Mar 2026 11:29:02 -0400
Subject: [PATCH 1/2] Fix IndexError in CacheOrder.pop() when both deques are
 empty

`trim_to(n_bytes=0)` loops on `self._n_bytes > n_bytes` without checking
whether the CacheOrder has entries left to evict.  When both internal
deques are empty, `CacheOrder.pop()` compares their lengths, `0 >= 0`
selects the regular deque, and `popleft()` raises `IndexError`.

Guard `trim_to`'s byte-eviction loop with `len(self._lru) > 0` and add
an explicit empty check in `CacheOrder.pop()` for a clearer error
message if a caller ever bypasses the guard.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 mlx_lm/server.py     |  4 +++-
 tests/test_server.py | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/mlx_lm/server.py b/mlx_lm/server.py
index 7fc91fa2a..753418264 100644
--- a/mlx_lm/server.py
+++ b/mlx_lm/server.py
@@ -196,6 +196,8 @@ def remove(self, model, tokens):
                 self._lru_checkpoints.remove((model, tokens))
 
         def pop(self):
+            if not self._lru and not self._lru_checkpoints:
+                raise IndexError("pop from empty CacheOrder")
             if len(self._lru) >= len(self._lru_checkpoints):
                 return self._lru.popleft()
             else:
@@ -344,7 +346,7 @@ def trim_to(
         while len(self._lru) > n_sequences:
             model, tokens = self._lru.pop()
             self._delete(model, tokens)
-        while self._n_bytes > n_bytes:
+        while self._n_bytes > n_bytes and len(self._lru) > 0:
             model, tokens = self._lru.pop()
             self._delete(model, tokens)
 
diff --git a/tests/test_server.py b/tests/test_server.py
index c5a815e4f..6df30018b 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -560,6 +560,23 @@ def test_lru_bytes(self):
         self.assertEqual(c, None)
         self.assertEqual(t, [3, 4])
 
+    def test_trim_to_zero_bytes_on_empty_cache(self):
+        cache = LRUPromptCache(max_size=10)
+        # Should not raise IndexError on empty cache
+        cache.trim_to(n_bytes=0)
+        self.assertEqual(len(cache), 0)
+
+    def test_trim_to_zero_bytes_evicts_all(self):
+        cache = LRUPromptCache(max_size=10)
+        model = ("test", None, None)
+        cache.insert_cache(model, [1, 2], [MockCache("aaa")])
+        cache.insert_cache(model, [3, 4], [MockCache("bbb")])
+        self.assertEqual(len(cache), 2)
+
+        cache.trim_to(n_bytes=0)
+        self.assertEqual(len(cache), 0)
+        self.assertEqual(cache.nbytes, 0)
+
 
 if __name__ == "__main__":
     unittest.main()

From 5f19ec66e98709fc749ec0ed7c1d8119bf244116 Mon Sep 17 00:00:00 2001
From: Noah Lyons <n.lyons53@gmail.com>
Date: Fri, 27 Mar 2026 11:56:24 -0400
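Subject: [PATCH 2/2] fix: fail closed on prompt cache accounting drift

The `len(self._lru) > 0` guard lets `trim_to`'s byte-eviction loop exit
with `self._n_bytes` still above `n_bytes` once there is nothing left to
evict.  Raise `RuntimeError` in that case instead of returning silently,
and add a test that forces the mismatch by setting `_n_bytes` directly.

---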
 mlx_lm/server.py     | 4 ++++
 tests/test_server.py | 7 +++++++
 2 files changed, 11 insertions(+)

diff --git a/mlx_lm/server.py b/mlx_lm/server.py
index 753418264..549ab43b8 100644
--- a/mlx_lm/server.py
+++ b/mlx_lm/server.py
@@ -349,6 +349,10 @@ def trim_to(
         while self._n_bytes > n_bytes and len(self._lru) > 0:
             model, tokens = self._lru.pop()
             self._delete(model, tokens)
+        if self._n_bytes > n_bytes:
+            raise RuntimeError(
+                "LRUPromptCache byte accounting drifted out of sync with cache order"
+            )
 
     def log_cache_stats(self):
         ncaches, nbytes = len(self), self.nbytes
diff --git a/tests/test_server.py b/tests/test_server.py
index 6df30018b..4474a6f39 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -566,6 +566,13 @@ def test_trim_to_zero_bytes_on_empty_cache(self):
         cache.trim_to(n_bytes=0)
         self.assertEqual(len(cache), 0)
 
+    def test_trim_to_raises_on_inconsistent_byte_accounting(self):
+        cache = LRUPromptCache(max_size=10)
+        cache._n_bytes = 1
+
+        with self.assertRaisesRegex(RuntimeError, "byte accounting"):
+            cache.trim_to(n_bytes=0)
+
     def test_trim_to_zero_bytes_evicts_all(self):
         cache = LRUPromptCache(max_size=10)
         model = ("test", None, None)