From 75c31edbd28ac549cc1a0e64c6dfe8300d06a1c4 Mon Sep 17 00:00:00 2001 From: Marcelo Trylesinski Date: Mon, 8 Dec 2025 14:49:51 +0100 Subject: [PATCH] Support cache on response --- deploy/src/db.ts | 1 + deploy/src/index.ts | 3 +- deploy/src/types.ts | 1 + gateway/src/gateway.ts | 8 +- gateway/src/index.ts | 5 + gateway/src/middleware/cache.ts | 158 ++++++++++++++++++ gateway/src/middleware/storage.ts | 37 ++++ gateway/src/types.ts | 1 + gateway/test/cache.spec.ts | 121 ++++++++++++++ gateway/test/worker.ts | 20 +++ ...87cc1a5c8eb66b52fcb8fc7fe0c79e37c3baa.yaml | 86 ++++++++++ ...8ce863c757e7d9339cd0badcab66d33ec5714.yaml | 88 ++++++++++ ...406fc9feeafb5264cc083cb1083a9c838e80c.yaml | 90 ++++++++++ ...b8273f9e45df86a46af3b6a1ef014074413a8.yaml | 80 +++++++++ ...383b111fa3b75552a1ad141a59bf64e6046cb.yaml | 95 +++++++++++ 15 files changed, 792 insertions(+), 2 deletions(-) create mode 100644 gateway/src/middleware/cache.ts create mode 100644 gateway/src/middleware/storage.ts create mode 100644 gateway/test/cache.spec.ts create mode 100644 proxy-vcr/proxy_vcr/cassettes/openai-159ef69d00bcffdce018936eaf487cc1a5c8eb66b52fcb8fc7fe0c79e37c3baa.yaml create mode 100644 proxy-vcr/proxy_vcr/cassettes/openai-281c234fea62893bcc6cf9938f48ce863c757e7d9339cd0badcab66d33ec5714.yaml create mode 100644 proxy-vcr/proxy_vcr/cassettes/openai-4aa76245acbac6ffcef36851e34406fc9feeafb5264cc083cb1083a9c838e80c.yaml create mode 100644 proxy-vcr/proxy_vcr/cassettes/openai-85c50c20d79a55446c8ee8cce3db8273f9e45df86a46af3b6a1ef014074413a8.yaml create mode 100644 proxy-vcr/proxy_vcr/cassettes/openai-94748ba2f45b546b778e93a23f7383b111fa3b75552a1ad141a59bf64e6046cb.yaml diff --git a/deploy/src/db.ts b/deploy/src/db.ts index 5e8f122..5373c4c 100644 --- a/deploy/src/db.ts +++ b/deploy/src/db.ts @@ -66,6 +66,7 @@ export class ConfigDB extends KeysDbD1 { providers: providersWithKeys, routingGroups, otelSettings: user?.otel ?? project.otel, + cacheEnabled: keyInfo.cacheEnabled, } } } diff --git a/deploy/src/index.ts b/deploy/src/index.ts index c9d868c..07fe94e 100644 --- a/deploy/src/index.ts +++ b/deploy/src/index.ts @@ -16,7 +16,7 @@ along with this program. If not, see . */ import { env } from 'cloudflare:workers' -import { type GatewayOptions, gatewayFetch, LimitDbD1 } from '@pydantic/ai-gateway' +import { type GatewayOptions, gatewayFetch, KVCacheStorage, LimitDbD1 } from '@pydantic/ai-gateway' import { instrument } from '@pydantic/logfire-cf-workers' import logfire from 'logfire' import { config } from './config' @@ -40,6 +40,7 @@ const handler = { kv: env.KV, kvVersion: await hash(JSON.stringify(config)), subFetch: fetch, + cache: { storage: new KVCacheStorage(env.KV) }, } try { return await gatewayFetch(request, url, ctx, gatewayEnv) diff --git a/deploy/src/types.ts b/deploy/src/types.ts index 62ddb84..64a1344 100644 --- a/deploy/src/types.ts +++ b/deploy/src/types.ts @@ -44,4 +44,5 @@ export interface ApiKey { spendingLimitMonthly?: number spendingLimitTotal?: number providers: ProviderKey[] | '__all__' + cacheEnabled?: boolean } diff --git a/gateway/src/gateway.ts b/gateway/src/gateway.ts index 2b38dd0..a49fc91 100644 --- a/gateway/src/gateway.ts +++ b/gateway/src/gateway.ts @@ -3,6 +3,7 @@ import { type GatewayOptions, noopLimiter } from '.' 
import { apiKeyAuth, setApiKeyCache } from './auth' import { currentScopeIntervals, type ExceededScope, endOfMonth, endOfWeek, type SpendScope } from './db' import { type HandlerResponse, RequestHandler } from './handler' +import { CacheMiddleware } from './middleware/cache' import { OtelTrace } from './otel' import { genAiOtelAttributes } from './otel/attributes' import type { ApiKeyInfo, ProviderProxy } from './types' @@ -174,6 +175,11 @@ export async function gatewayWithLimiter( const otel = new OtelTrace(request, apiKeyInfo.otelSettings, options) + const middlewares = options.proxyMiddlewares ?? [] + if (options.cache) { + middlewares.push(new CacheMiddleware({ storage: options.cache.storage })) + } + let result: HandlerResponse | null = null for (const providerProxy of providerProxies) { @@ -187,7 +193,7 @@ export async function gatewayWithLimiter( apiKeyInfo, restOfPath, otelSpan, - middlewares: options.proxyMiddlewares, + middlewares, }) try { diff --git a/gateway/src/index.ts b/gateway/src/index.ts index bc9a8a2..43eb398 100644 --- a/gateway/src/index.ts +++ b/gateway/src/index.ts @@ -18,6 +18,7 @@ import logfire from 'logfire' import type { KeysDb, LimitDb } from './db' import { gateway } from './gateway' import type { Middleware, Next } from './handler' +import type { CacheStorage as GatewayCacheStorage } from './middleware/storage' import type { RateLimiter } from './rateLimiter' import { refreshGenaiPrices } from './refreshGenaiPrices' import type { SubFetch } from './types' @@ -27,6 +28,8 @@ export { changeProjectState as setProjectState, deleteApiKeyCache, setApiKeyCach export type { Middleware, Next } export * from './db' export type { RequestHandler } from './handler' +export { CacheMiddleware, type CacheOptions } from './middleware/cache' +export { type CachedResponse, type CacheStorage, KVCacheStorage } from './middleware/storage' export * from './rateLimiter' export * from './types' @@ -42,6 +45,8 @@ export interface GatewayOptions { proxyPrefixLength?: number /** proxyMiddlewares: perform actions before and after the request is made to the providers */ proxyMiddlewares?: Middleware[] + /** Cache configuration */ + cache?: { storage: GatewayCacheStorage } } export async function gatewayFetch( diff --git a/gateway/src/middleware/cache.ts b/gateway/src/middleware/cache.ts new file mode 100644 index 0000000..7fced8f --- /dev/null +++ b/gateway/src/middleware/cache.ts @@ -0,0 +1,158 @@ +import logfire from 'logfire' +import type { HandlerResponse, Middleware, Next, RequestHandler } from '../handler' +import type { CachedResponse, CacheStorage as GatewayCacheStorage } from './storage' + +export interface CacheOptions { + storage: GatewayCacheStorage +} + +export class CacheMiddleware implements Middleware { + private options: CacheOptions + + constructor(options: CacheOptions) { + this.options = options + } + + dispatch(next: Next): Next { + return async (handler: RequestHandler) => { + if (!handler.apiKeyInfo.cacheEnabled) { + return await next(handler) + } + + const { method, url, headers } = handler.request + // Clone the request to read the body without consuming the original + const requestBody = await handler.request.clone().text() + const requestUrl = new URL(url) + requestUrl.pathname = handler.restOfPath + const path = requestUrl.toString() + + const apiKeyId = handler.apiKeyInfo.id + const hash = await this.calculateHash(method, path, requestBody, apiKeyId) + + const shouldBypassCache = this.shouldBypassCache(headers) + + if (!shouldBypassCache) { + const cached = 
await this.getCachedResponse(hash) + + if (cached) { + logfire.info('Cache hit', { hash, apiKeyId: handler.apiKeyInfo.id }) + return this.toCachedHandlerResponse(requestBody, cached) + } + } + + const result = await next(handler) + + const shouldStoreCache = this.shouldStoreCache(handler.request, result) + if (shouldStoreCache) { + handler.runAfter('cache-store', this.storeCachedResponse(hash, result)) + } + + return this.addCacheHeaders(result, shouldBypassCache ? 'BYPASS' : 'MISS') + } + } + + private shouldBypassCache(requestHeaders: Headers): boolean { + const cacheControl = requestHeaders.get('cache-control') + return cacheControl?.includes('no-cache') || cacheControl?.includes('no-store') || false + } + + private shouldStoreCache(request: Request, result: HandlerResponse): boolean { + const cacheControl = request.headers.get('cache-control') + + if (cacheControl?.includes('no-store')) { + return false + } + + if ('responseStream' in result) { + return false + } + + if ('error' in result || 'unexpectedStatus' in result || 'response' in result || 'modelNotFound' in result) { + return false + } + + return true + } + + private async calculateHash(method: string, url: string, body: string, apiKeyId: number): Promise<string> { + const data = `${apiKeyId}:${method}:${url}:${body}` + const encoder = new TextEncoder() + const dataBuffer = encoder.encode(data) + const hashBuffer = await crypto.subtle.digest('SHA-256', dataBuffer) + const hashArray = Array.from(new Uint8Array(hashBuffer)) + const hashHex = hashArray.map((b) => b.toString(16).padStart(2, '0')).join('') + + return hashHex + } + + private async getCachedResponse(hash: string): Promise<CachedResponse | null> { + try { + return await this.options.storage.get(hash) + } catch (error) { + logfire.reportError('Error getting cached response', error as Error, { hash }) + return null + } + } + + private async storeCachedResponse(hash: string, result: HandlerResponse): Promise<void> { + if (!('successStatus' in result) || 'responseStream' in result) { + return + } + + try { + const { successStatus, responseHeaders, responseBody, requestModel, responseModel } = result + + const headers: Record<string, string> = {} + responseHeaders.forEach((value, key) => { + headers[key] = value + }) + + const cached: CachedResponse = { + status: successStatus, + headers, + body: responseBody, + timestamp: Date.now(), + requestModel, + responseModel, + } + + await this.options.storage.set(hash, cached) + + const sizeBytes = new TextEncoder().encode(responseBody).length + + logfire.info('Response cached', { hash, sizeBytes }) + } catch (error) { + logfire.reportError('Error storing cached response', error as Error, { hash }) + } + } + + private toCachedHandlerResponse( + requestBody: string, + cached: CachedResponse, + ): Extract<HandlerResponse, { successStatus: number }> { + const responseHeaders = new Headers(cached.headers) + const age = Math.floor((Date.now() - cached.timestamp) / 1000) + + responseHeaders.set('Age', age.toString()) + responseHeaders.set('X-Cache-Status', 'HIT') + + return { + successStatus: cached.status, + responseHeaders, + responseBody: cached.body, + requestBody, + requestModel: cached.requestModel, + responseModel: cached.responseModel ?? 
'unknown', + usage: { input_tokens: 0, output_tokens: 0 }, + cost: 0, + } + } + + private addCacheHeaders(result: HandlerResponse, status: 'HIT' | 'MISS' | 'BYPASS'): HandlerResponse { + if ('responseHeaders' in result) { + result.responseHeaders.set('X-Cache-Status', status) + } + + return result + } +} diff --git a/gateway/src/middleware/storage.ts b/gateway/src/middleware/storage.ts new file mode 100644 index 0000000..ca5c187 --- /dev/null +++ b/gateway/src/middleware/storage.ts @@ -0,0 +1,37 @@ +export interface CachedResponse { + status: number + headers: Record<string, string> + body: string + timestamp: number + requestModel?: string + responseModel?: string +} + +export interface CacheStorage { + get(hash: string): Promise<CachedResponse | null> + set(hash: string, response: CachedResponse): Promise<void> +} + +export class KVCacheStorage implements CacheStorage { + private kv: KVNamespace + private namespace: string + private ttl: number + + constructor(kv: KVNamespace, namespace: string = 'response', ttl: number = 86400) { + this.kv = kv + this.namespace = namespace + this.ttl = ttl + } + + async get(hash: string): Promise<CachedResponse | null> { + const kvKey = cacheKey(this.namespace, hash) + return await this.kv.get(kvKey, 'json') + } + + async set(hash: string, response: CachedResponse): Promise<void> { + const kvKey = cacheKey(this.namespace, hash) + await this.kv.put(kvKey, JSON.stringify(response), { expirationTtl: this.ttl }) + } +} + +const cacheKey = (namespace: string, hash: string): string => `${namespace}:${hash}` diff --git a/gateway/src/types.ts b/gateway/src/types.ts index f3db208..3db368c 100644 --- a/gateway/src/types.ts +++ b/gateway/src/types.ts @@ -36,6 +36,7 @@ export interface ApiKeyInfo { // among values with same priority, use weight for randomized load balancing; if missing, treat as 1 routingGroups: Record otelSettings?: OtelSettings + cacheEnabled?: boolean } export type ProviderID = diff --git a/gateway/test/cache.spec.ts b/gateway/test/cache.spec.ts new file mode 100644 index 0000000..512864b --- /dev/null +++ b/gateway/test/cache.spec.ts @@ -0,0 +1,121 @@ +import { describe, expect } from 'vitest' +import { test } from './setup' + +const requestBody = { + model: 'gpt-5', + messages: [{ role: 'user', content: 'What is the capital of France?' 
}], + max_completion_tokens: 1200, +} +const requestHeaders = { + Authorization: 'cache-enabled', + 'Content-Type': 'application/json', + Accept: 'application/json', + 'Accept-Encoding': 'deflate', +} + +describe('cache', () => { + test('should return MISS on first request', async ({ gateway }) => { + const { fetch } = gateway + + const firstResponse = await fetch('https://example.com/openai/chat/completions', { + method: 'POST', + headers: requestHeaders, + body: JSON.stringify(requestBody), + }) + expect(firstResponse.status).toBe(200) + expect(Object.fromEntries(firstResponse.headers.entries())).toMatchInlineSnapshot(` + { + "content-length": "564", + "content-type": "application/json", + "pydantic-ai-gateway-price-estimate": "0.0008USD", + "server": "uvicorn", + "x-cache-status": "MISS", + } + `) + + const cachedResponse = await fetch('https://example.com/openai/chat/completions', { + method: 'POST', + headers: requestHeaders, + body: JSON.stringify(requestBody), + }) + + expect(cachedResponse.status).toBe(200) + expect(Object.fromEntries(cachedResponse.headers.entries())).toMatchInlineSnapshot(` + { + "age": "0", + "content-length": "564", + "content-type": "application/json", + "pydantic-ai-gateway-price-estimate": "0.0008USD", + "server": "uvicorn", + "x-cache-status": "HIT", + } + `) + + // Sleep to get a different age. + await new Promise((resolve) => setTimeout(resolve, 1000)) + + const cachedResponse2 = await fetch('https://example.com/openai/chat/completions', { + method: 'POST', + headers: requestHeaders, + body: JSON.stringify(requestBody), + }) + expect(Number(cachedResponse2.headers.get('Age'))).toBeGreaterThan(0) + }) + + test('should return BYPASS with cache-control: no-cache', async ({ gateway }) => { + const { fetch } = gateway + + const response = await fetch('https://example.com/openai/chat/completions', { + method: 'POST', + headers: { ...requestHeaders, 'Cache-Control': 'no-cache' }, + body: JSON.stringify(requestBody), + }) + + expect(response.headers.get('X-Cache-Status')).toBe('BYPASS') + }) + + test('should return BYPASS with cache-control: no-store', async ({ gateway }) => { + const { fetch } = gateway + + const response = await fetch('https://example.com/openai/chat/completions', { + method: 'POST', + headers: { ...requestHeaders, 'Cache-Control': 'no-store' }, + body: JSON.stringify(requestBody), + }) + + expect(response.headers.get('X-Cache-Status')).toBe('BYPASS') + }) + + test('should not cache streaming responses', async ({ gateway }) => { + const { fetch } = gateway + + const response = await fetch('https://example.com/openai/chat/completions', { + method: 'POST', + headers: requestHeaders, + body: JSON.stringify({ ...requestBody, stream: true }), + }) + expect(response.headers.get('X-Cache-Status')).toBe('MISS') + }) + + test('different request bodies should have different cache keys', async ({ gateway }) => { + const { fetch } = gateway + + const response1 = await fetch('https://example.com/openai/chat/completions', { + method: 'POST', + headers: requestHeaders, + body: JSON.stringify({ ...requestBody, messages: [{ role: 'user', content: 'Message A' }] }), + }) + expect(response1.headers.get('X-Cache-Status')).toBe('MISS') + + const response2 = await fetch('https://example.com/openai/chat/completions', { + method: 'POST', + headers: requestHeaders, + body: JSON.stringify({ ...requestBody, messages: [{ role: 'user', content: 'Message B' }] }), + }) + expect(response2.headers.get('X-Cache-Status')).toBe('MISS') + + const body1 = await response1.json() + 
const body2 = await response2.json() + expect(body1).not.toBe(body2) + }) +}) diff --git a/gateway/test/worker.ts b/gateway/test/worker.ts index faade12..5cca989 100644 --- a/gateway/test/worker.ts +++ b/gateway/test/worker.ts @@ -4,6 +4,7 @@ import { gatewayFetch, type KeyStatus, KeysDbD1, + KVCacheStorage, LimitDbD1, type Middleware, type ProviderProxy, @@ -43,6 +44,7 @@ export function buildGatewayEnv( proxyPrefixLength, proxyMiddlewares, rateLimiter, + cache: { storage: new KVCacheStorage(env.KV) }, } } @@ -55,6 +57,7 @@ export namespace IDS { export const keyTinyLimit = 6 export const keyFallbackTest = 7 export const keyFallbackAnthropicGoogleVertex = 8 + export const keyCacheEnabled = 9 } class TestKeysDB extends KeysDbD1 { @@ -247,6 +250,23 @@ class TestKeysDB extends KeysDbD1 { ], routingGroups: { anthropic: [{ key: 'anthropic' }, { key: 'google-vertex' }] }, } + case 'cache-enabled': + return { + id: IDS.keyCacheEnabled, + user: IDS.userDefault, + project: IDS.projectDefault, + org: IDS.orgDefault, + key, + status: 'active', + providers: this.allProviders, + routingGroups: { openai: [{ key: 'openai' }] }, + cacheEnabled: true, + otelSettings: { + writeToken: 'write-token', + baseUrl: 'https://logfire.pydantic.dev', + exporterProtocol: 'http/json', + }, + } default: return null } diff --git a/proxy-vcr/proxy_vcr/cassettes/openai-159ef69d00bcffdce018936eaf487cc1a5c8eb66b52fcb8fc7fe0c79e37c3baa.yaml b/proxy-vcr/proxy_vcr/cassettes/openai-159ef69d00bcffdce018936eaf487cc1a5c8eb66b52fcb8fc7fe0c79e37c3baa.yaml new file mode 100644 index 0000000..5f8f10e --- /dev/null +++ b/proxy-vcr/proxy_vcr/cassettes/openai-159ef69d00bcffdce018936eaf487cc1a5c8eb66b52fcb8fc7fe0c79e37c3baa.yaml @@ -0,0 +1,86 @@ +interactions: +- request: + body: '{"model":"gpt-5","messages":[{"role":"user","content":"What is the capital + of France?"}],"max_completion_tokens":1200}' + headers: + accept: + - '*/*' + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '118' + content-type: + - application/json + host: + - api.openai.com + user-agent: + - python-httpx/0.28.1 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA3SSTU8CMRCG7/srmp53yYLyEW5ITFAveCFRQza1HaDQbZt21qiE/25aPrpGvPQw + z8yb953OPiOESkHHhPINQ15bVUx3i+3L3WT7PnlUg9nzN586qxf3D/O7p9cZzcOEed8Cx/NUh5va + KkBp9BFzBwwhqHaHg36v7HbLUQS1EaDC2Npi0S96Za9flKOiHJ7mNkZy8HRM3jJCCNnHNzjUAj7p + mJT5uVKD92wNdHxpIoQ6o0KFMu+lR6aR5glyoxF0ND1nTvpOGzpYNZ4Fa7pRqgWY1gZZiBZtLU/k + cDGyklr6TeWAeaODuEdjaaSHjJBlDNb88kqtM7XFCs0Oomz35ihH0yITHPZPEA0yleqjUX5FrRKA + TCrfWgzljG9ApMm0RdYIaVoga2X7a+aa9jG31OukMrj9Vz8BzsEiiMo6EJL/TpzaHIQ7+6/tsuTo + mHpwH5JDhRJc+AgBK9ao4w1Q/+UR6mol9RqcdTIeQvjr7JD9AAAA//8DAOJaZgUFAwAA + headers: + CF-RAY: + - 9aaca9d12fb3cc4d-MAD + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Mon, 08 Dec 2025 13:38:31 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + openai-organization: + - pydantic-28gund + openai-processing-ms: + - '2410' + openai-project: + - proj_dKobscVY9YJxeEaDJen54e3d + openai-version: + - '2020-10-01' + x-envoy-upstream-service-time: + - '2428' + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '15000' + x-ratelimit-limit-tokens: + - 
'40000000' + x-ratelimit-remaining-requests: + - '14999' + x-ratelimit-remaining-tokens: + - '39999990' + x-ratelimit-reset-requests: + - 4ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_276a4c432c6c4f04902eca21289abb10 + status: + code: 200 + message: OK +version: 1 diff --git a/proxy-vcr/proxy_vcr/cassettes/openai-281c234fea62893bcc6cf9938f48ce863c757e7d9339cd0badcab66d33ec5714.yaml b/proxy-vcr/proxy_vcr/cassettes/openai-281c234fea62893bcc6cf9938f48ce863c757e7d9339cd0badcab66d33ec5714.yaml new file mode 100644 index 0000000..e9e67f2 --- /dev/null +++ b/proxy-vcr/proxy_vcr/cassettes/openai-281c234fea62893bcc6cf9938f48ce863c757e7d9339cd0badcab66d33ec5714.yaml @@ -0,0 +1,88 @@ +interactions: +- request: + body: '{"model":"gpt-5","messages":[{"role":"user","content":"Message A"}],"max_completion_tokens":1200}' + headers: + accept: + - '*/*' + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '97' + content-type: + - application/json + host: + - api.openai.com + user-agent: + - python-httpx/0.28.1 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA3STz04bMRDG73mKkU+tlKBk2/Anl4pSlapSVYlWtFJB0dQ7mxi8thnPQhcUiQcp + L8eTVHYCu6hw8WF+nm/G841vBgDKlGoGSi9RdB3s6OD8OB4eXRz/bL9XxZuz6Wcz4eKQ8OuF/DhS + w5Thf5+RloesLe3rYEmMd2usmVAoqU52tqfFeLL9djeD2pdkU9oiyGg6KsbFdDTeHY13NnlLbzRF + NYNfAwCAm3ymDl1Jf9QMxsOHSE0x4oLU7PESgGJvU0RhjCYKOlHDDmrvhFxu+pO/givf2BJa34A1 + 5wQ1gXhYoistwf3t3y9rfdi/v717ByduBB8YKwHPYOrA/pLASAp/a+oa2VxTQsLookV5gPsObXud + pF3m2iIbaRM6yDMCBKZgW3jVq/n+/vbuddYWz0kKKs+QZBmYKmJymk7ciftIZKFiyr0HjEIgS4Kq + sRY2A0pVYyBtqjazhUe71R8LU9VETKa4xtoeQOe8YDI1G3K6IatHCyrjTFzOmTB6l8YaxQeV6WoA + cJotbZ64pAL7Oshc/Dll2d21muo2qGPF9mRDxQvaPtgbPiM3L0nQ2NjbCaVRL6nsUrsFwqY0vgcG + vcf9385z2uuHG7foVCZ7xYsFOqA1BaFyHphKo58+urvGlP7YS9cex5xbVpH40miaiyFOVpRUYWPX + +69iG4XqeWXcgjiwyZ8guT1YDf4BAAD//wMApzAU5wEEAAA= + headers: + CF-RAY: + - 9aacb6fae9ca24b7-MAD + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Mon, 08 Dec 2025 13:47:35 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + openai-organization: + - pydantic-28gund + openai-processing-ms: + - '7188' + openai-project: + - proj_dKobscVY9YJxeEaDJen54e3d + openai-version: + - '2020-10-01' + x-envoy-upstream-service-time: + - '7427' + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '15000' + x-ratelimit-limit-tokens: + - '40000000' + x-ratelimit-remaining-requests: + - '14999' + x-ratelimit-remaining-tokens: + - '39999994' + x-ratelimit-reset-requests: + - 4ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_6fe764a946564a8e990747a7b308de1e + status: + code: 200 + message: OK +version: 1 diff --git a/proxy-vcr/proxy_vcr/cassettes/openai-4aa76245acbac6ffcef36851e34406fc9feeafb5264cc083cb1083a9c838e80c.yaml b/proxy-vcr/proxy_vcr/cassettes/openai-4aa76245acbac6ffcef36851e34406fc9feeafb5264cc083cb1083a9c838e80c.yaml new file mode 100644 index 0000000..2b49a4d --- /dev/null +++ b/proxy-vcr/proxy_vcr/cassettes/openai-4aa76245acbac6ffcef36851e34406fc9feeafb5264cc083cb1083a9c838e80c.yaml @@ -0,0 +1,90 @@ +interactions: +- request: + body: '{"model":"gpt-5","messages":[{"role":"user","content":"Message B"}],"max_completion_tokens":1200}' + 
headers: + accept: + - '*/*' + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '97' + content-type: + - application/json + host: + - api.openai.com + user-agent: + - python-httpx/0.28.1 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA3RTy04bQRC8+ytacwJpDbYj28SXiKBEoCgCBSmXgKzOTK938Dw2M70Yg5D8G5HC + z/lLotk17KKQyxy6umr6Uf3QAxBaiRkIWSBLW5r+yfJ7PB9OTn/dVLefP53fV1/U8Mau7MidHp+J + LDH8zxuS/Mw6kN6Whlh718AyEDIl1eF0Mh4NhpPxpAasV2QSbVFyf9wfDUbj/uCoP5jueIXXkqKY + wY8eAMBD/aYKnaI7MYNB9hyxFCMuSMxekgBE8CZFBMaoI6NjkbWg9I7J1UWf+MooWPsKYoGBgAuC + vDIGmO4YfA7bzZ+vzQfwcbt5AnQKVgVy4mw3vxUYvSSwBOxBeVhpLkDzhyt35c7LNIcIZyDRQUGm + rOHZlevDRfA+D4QKfABSmiH3AaTBoHl9yN5RyvpGq6CZYM/6QCnDoslAeid1pAzyoMkps86AWB7s + J8ZlZS0GfU9JlwO6aJBrrZOAOQNCoNKs64C3ZWpZc1N1p9Pj7eYpdXBhCCOBdtJUigDdOn0eOaB2 + HGEPK6XJScogVZyBIbfgIgNLSle2GQ1Z1Obw0qBc7kP0u2kwauMDaD7oLiZQXkVMtnCVMR0AnfOM + 9TiTJa53yOOLCXLtdCzmgTB6lxYb2ZeiRh97ANe1qapXPhFl8LbkOfsl1bJHjZpoPdxi78bvdyh7 + RtMBJtPsDbm5otRi7LhSSJQFqZbaWjjN0XeAXqe5f8t5S7tpXLtFqzJqLu3ND1pASiqZ1LwMpLR8 + 3XSbFihd+f/SXsZclywihVstac6aQlqFohwr01ygiOvIZOe5dgsKZdD1GaZt9x57fwEAAP//AwCV + ZWyCgwQAAA== + headers: + CF-RAY: + - 9aacb72ce91224b7-MAD + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Mon, 08 Dec 2025 13:47:45 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + openai-organization: + - pydantic-28gund + openai-processing-ms: + - '8718' + openai-project: + - proj_dKobscVY9YJxeEaDJen54e3d + openai-version: + - '2020-10-01' + x-envoy-upstream-service-time: + - '8883' + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '15000' + x-ratelimit-limit-tokens: + - '40000000' + x-ratelimit-remaining-requests: + - '14999' + x-ratelimit-remaining-tokens: + - '39999994' + x-ratelimit-reset-requests: + - 4ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_79b538eabeb64e568ddf746f7b40f4d8 + status: + code: 200 + message: OK +version: 1 diff --git a/proxy-vcr/proxy_vcr/cassettes/openai-85c50c20d79a55446c8ee8cce3db8273f9e45df86a46af3b6a1ef014074413a8.yaml b/proxy-vcr/proxy_vcr/cassettes/openai-85c50c20d79a55446c8ee8cce3db8273f9e45df86a46af3b6a1ef014074413a8.yaml new file mode 100644 index 0000000..ad7cc1b --- /dev/null +++ b/proxy-vcr/proxy_vcr/cassettes/openai-85c50c20d79a55446c8ee8cce3db8273f9e45df86a46af3b6a1ef014074413a8.yaml @@ -0,0 +1,80 @@ +interactions: +- request: + body: '{"model":"gpt-5","messages":[{"role":"user","content":"What is the capital + of France?"}],"max_completion_tokens":1}' + headers: + accept: + - '*/*' + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '115' + content-type: + - application/json + host: + - api.openai.com + user-agent: + - python-httpx/0.28.1 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: "{\n \"error\": {\n \"message\": \"Could not finish the message + because max_tokens or model output limit was reached. 
Please try again with + higher max_tokens.\",\n \"type\": \"invalid_request_error\",\n \"param\": + null,\n \"code\": null\n }\n}" + headers: + CF-RAY: + - 9aaca9bfaaf2cc4d-MAD + Connection: + - keep-alive + Content-Length: + - '235' + Content-Type: + - application/json + Date: + - Mon, 08 Dec 2025 13:38:26 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + openai-organization: + - pydantic-28gund + openai-processing-ms: + - '306' + openai-project: + - proj_dKobscVY9YJxeEaDJen54e3d + openai-version: + - '2020-10-01' + x-envoy-upstream-service-time: + - '449' + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '15000' + x-ratelimit-limit-tokens: + - '40000000' + x-ratelimit-remaining-requests: + - '14999' + x-ratelimit-remaining-tokens: + - '39999990' + x-ratelimit-reset-requests: + - 4ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_65f02d8d9a39474ab7ba122fc0ba2927 + status: + code: 400 + message: Bad Request +version: 1 diff --git a/proxy-vcr/proxy_vcr/cassettes/openai-94748ba2f45b546b778e93a23f7383b111fa3b75552a1ad141a59bf64e6046cb.yaml b/proxy-vcr/proxy_vcr/cassettes/openai-94748ba2f45b546b778e93a23f7383b111fa3b75552a1ad141a59bf64e6046cb.yaml new file mode 100644 index 0000000..5f34dca --- /dev/null +++ b/proxy-vcr/proxy_vcr/cassettes/openai-94748ba2f45b546b778e93a23f7383b111fa3b75552a1ad141a59bf64e6046cb.yaml @@ -0,0 +1,95 @@ +interactions: +- request: + body: '{"model":"gpt-5","messages":[{"role":"user","content":"What is the capital + of France?"}],"max_completion_tokens":1200,"stream":true,"stream_options":{"include_usage":true}}' + headers: + accept: + - '*/*' + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '172' + content-type: + - application/json + host: + - api.openai.com + user-agent: + - python-httpx/0.28.1 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: 'data: {"id":"chatcmpl-CkVmZ8JZ8oof5i1bL0G7ixPcF9J7h","object":"chat.completion.chunk","created":1765201295,"model":"gpt-5-2025-08-07","service_tier":"default","system_fingerprint":null,"choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"finish_reason":null}],"usage":null,"obfuscation":"PIQTUtyOdh"} + + + data: {"id":"chatcmpl-CkVmZ8JZ8oof5i1bL0G7ixPcF9J7h","object":"chat.completion.chunk","created":1765201295,"model":"gpt-5-2025-08-07","service_tier":"default","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"Paris"},"finish_reason":null}],"usage":null,"obfuscation":"re3rLzM"} + + + data: {"id":"chatcmpl-CkVmZ8JZ8oof5i1bL0G7ixPcF9J7h","object":"chat.completion.chunk","created":1765201295,"model":"gpt-5-2025-08-07","service_tier":"default","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"."},"finish_reason":null}],"usage":null,"obfuscation":"sRTMX9sXRGg"} + + + data: {"id":"chatcmpl-CkVmZ8JZ8oof5i1bL0G7ixPcF9J7h","object":"chat.completion.chunk","created":1765201295,"model":"gpt-5-2025-08-07","service_tier":"default","system_fingerprint":null,"choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":null,"obfuscation":"q16gDt"} + + + data: 
{"id":"chatcmpl-CkVmZ8JZ8oof5i1bL0G7ixPcF9J7h","object":"chat.completion.chunk","created":1765201295,"model":"gpt-5-2025-08-07","service_tier":"default","system_fingerprint":null,"choices":[],"usage":{"prompt_tokens":13,"completion_tokens":11,"total_tokens":24,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}},"obfuscation":"xR2uRrbBUrh"} + + + data: [DONE] + + + ' + headers: + CF-RAY: + - 9aacae5afd4e3ed1-MAD + Connection: + - keep-alive + Content-Type: + - text/event-stream; charset=utf-8 + Date: + - Mon, 08 Dec 2025 13:41:36 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + openai-organization: + - pydantic-28gund + openai-processing-ms: + - '1135' + openai-project: + - proj_dKobscVY9YJxeEaDJen54e3d + openai-version: + - '2020-10-01' + x-envoy-upstream-service-time: + - '1282' + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '15000' + x-ratelimit-limit-tokens: + - '40000000' + x-ratelimit-remaining-requests: + - '14999' + x-ratelimit-remaining-tokens: + - '39999990' + x-ratelimit-reset-requests: + - 4ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_33fa052094294105968f87c4e65778f4 + status: + code: 200 + message: OK +version: 1