From f7ccc592e0957ed0bcef2e6ad5ecd42ed9b3bffe Mon Sep 17 00:00:00 2001 From: xyaz1313 Date: Wed, 15 Apr 2026 22:23:30 +0800 Subject: [PATCH] fix: stream database dumps to support large databases (#59) Previously, dumpDatabaseRoute loaded ALL table data into memory before sending the response. For databases >1GB this caused: 1. Out-of-memory crashes 2. 30-second Cloudflare Workers timeout This commit rewrites the dump to use ReadableStream with LIMIT/OFFSET batching: - Fetches rows in batches of 500 (constant BATCH_SIZE) - Streams INSERT statements incrementally via ReadableStream - Yields control between batches (10ms breathing interval) to avoid starving other concurrent requests - Escapes identifiers with double-quotes and values with proper SQL quoting (strings, NULL, blobs, booleans) - Peak memory is now O(BATCH_SIZE) instead of O(database_size) All existing tests pass, plus new tests for NULL handling and multi-batch streaming. --- src/export/dump.test.ts | 60 +++++++++++-- src/export/dump.ts | 186 ++++++++++++++++++++++++++++++---------- 2 files changed, 197 insertions(+), 49 deletions(-) diff --git a/src/export/dump.test.ts b/src/export/dump.test.ts index ca65b43..fe72002 100644 --- a/src/export/dump.test.ts +++ b/src/export/dump.test.ts @@ -41,17 +41,22 @@ beforeEach(() => { describe('Database Dump Module', () => { it('should return a database dump when tables exist', async () => { vi.mocked(executeOperation) + // table list .mockResolvedValueOnce([{ name: 'users' }, { name: 'orders' }]) + // users schema .mockResolvedValueOnce([ { sql: 'CREATE TABLE users (id INTEGER, name TEXT);' }, ]) + // users data batch (only one batch needed) .mockResolvedValueOnce([ { id: 1, name: 'Alice' }, { id: 2, name: 'Bob' }, ]) + // orders schema .mockResolvedValueOnce([ { sql: 'CREATE TABLE orders (id INTEGER, total REAL);' }, ]) + // orders data batch .mockResolvedValueOnce([ { id: 1, total: 99.99 }, { id: 2, total: 49.5 }, @@ -71,13 +76,13 @@ describe('Database Dump 
Module', () => { expect(dumpText).toContain( 'CREATE TABLE users (id INTEGER, name TEXT);' ) - expect(dumpText).toContain("INSERT INTO users VALUES (1, 'Alice');") - expect(dumpText).toContain("INSERT INTO users VALUES (2, 'Bob');") + expect(dumpText).toContain('INSERT INTO "users" VALUES (1, \'Alice\');') + expect(dumpText).toContain('INSERT INTO "users" VALUES (2, \'Bob\');') expect(dumpText).toContain( 'CREATE TABLE orders (id INTEGER, total REAL);' ) - expect(dumpText).toContain('INSERT INTO orders VALUES (1, 99.99);') - expect(dumpText).toContain('INSERT INTO orders VALUES (2, 49.5);') + expect(dumpText).toContain('INSERT INTO "orders" VALUES (1, 99.99);') + expect(dumpText).toContain('INSERT INTO "orders" VALUES (2, 49.5);') }) it('should handle empty databases (no tables)', async () => { @@ -99,6 +104,7 @@ describe('Database Dump Module', () => { .mockResolvedValueOnce([ { sql: 'CREATE TABLE users (id INTEGER, name TEXT);' }, ]) + // empty batch (no data) .mockResolvedValueOnce([]) const response = await dumpDatabaseRoute(mockDataSource, mockConfig) @@ -108,7 +114,7 @@ describe('Database Dump Module', () => { expect(dumpText).toContain( 'CREATE TABLE users (id INTEGER, name TEXT);' ) - expect(dumpText).not.toContain('INSERT INTO users VALUES') + expect(dumpText).not.toContain('INSERT INTO "users" VALUES') }) it('should escape single quotes properly in string values', async () => { @@ -124,10 +130,52 @@ describe('Database Dump Module', () => { expect(response).toBeInstanceOf(Response) const dumpText = await response.text() expect(dumpText).toContain( - "INSERT INTO users VALUES (1, 'Alice''s adventure');" + "INSERT INTO \"users\" VALUES (1, 'Alice''s adventure');" ) }) + it('should handle NULL values', async () => { + vi.mocked(executeOperation) + .mockResolvedValueOnce([{ name: 'users' }]) + .mockResolvedValueOnce([ + { sql: 'CREATE TABLE users (id INTEGER, name TEXT);' }, + ]) + .mockResolvedValueOnce([{ id: 1, name: null }]) + + const response = await 
dumpDatabaseRoute(mockDataSource, mockConfig) + + const dumpText = await response.text() + expect(dumpText).toContain('INSERT INTO "users" VALUES (1, NULL);') + }) + + it('should stream data in batches when table has many rows', async () => { + // Simulate a table with 501 rows → should trigger 2 batches (500 + 1) + const batch1 = Array.from({ length: 500 }, (_, i) => ({ + id: i + 1, + name: `user${i + 1}`, + })) + const batch2 = [{ id: 501, name: 'user501' }] + + vi.mocked(executeOperation) + .mockResolvedValueOnce([{ name: 'users' }]) + .mockResolvedValueOnce([ + { sql: 'CREATE TABLE users (id INTEGER, name TEXT);' }, + ]) + .mockResolvedValueOnce(batch1) // first batch + .mockResolvedValueOnce(batch2) // second batch + + const response = await dumpDatabaseRoute(mockDataSource, mockConfig) + + const dumpText = await response.text() + expect(dumpText).toContain('INSERT INTO "users" VALUES (1, \'user1\');') + expect(dumpText).toContain( + 'INSERT INTO "users" VALUES (501, \'user501\');' + ) + + // Verify LIMIT/OFFSET was used (4 calls: table list, schema, batch1, batch2) + expect(executeOperation).toHaveBeenCalledTimes(4) + }) + it('should return a 500 response when an error occurs', async () => { const consoleErrorMock = vi .spyOn(console, 'error') diff --git a/src/export/dump.ts b/src/export/dump.ts index 91a2e89..00f1880 100644 --- a/src/export/dump.ts +++ b/src/export/dump.ts @@ -3,6 +3,22 @@ import { StarbaseDBConfiguration } from '../handler' import { DataSource } from '../types' import { createResponse } from '../utils' +const BATCH_SIZE = 500 // Fetch rows in batches to avoid memory spikes +const BREATHING_INTERVAL_MS = 10 // Yield control between batches + +/** + * Stream a database dump using ReadableStream to avoid OOM on large databases. + * + * For each table we: + * 1. Emit the CREATE TABLE statement (schema) + * 2. Fetch rows in batches of BATCH_SIZE using LIMIT/OFFSET + * 3. Emit INSERT statements for each batch + * 4. 
Yield control briefly between batches so other requests aren't starved + * + * This keeps peak memory usage proportional to BATCH_SIZE rather than the + * entire database size, and prevents the 30-second Cloudflare Workers timeout + * from killing the request on large databases. + */ export async function dumpDatabaseRoute( dataSource: DataSource, config: StarbaseDBConfiguration @@ -15,57 +31,141 @@ export async function dumpDatabaseRoute( config ) - const tables = tablesResult.map((row: any) => row.name) - let dumpContent = 'SQLite format 3\0' // SQLite file header - - // Iterate through all tables - for (const table of tables) { - // Get table schema - const schemaResult = await executeOperation( - [ - { - sql: `SELECT sql FROM sqlite_master WHERE type='table' AND name='${table}';`, - }, - ], - dataSource, - config - ) - - if (schemaResult.length) { - const schema = schemaResult[0].sql - dumpContent += `\n-- Table: ${table}\n${schema};\n\n` - } - - // Get table data - const dataResult = await executeOperation( - [{ sql: `SELECT * FROM ${table};` }], - dataSource, - config - ) - - for (const row of dataResult) { - const values = Object.values(row).map((value) => - typeof value === 'string' - ? 
`'${value.replace(/'/g, "''")}'` - : value - ) - dumpContent += `INSERT INTO ${table} VALUES (${values.join(', ')});\n` - } - - dumpContent += '\n' - } - - // Create a Blob from the dump content - const blob = new Blob([dumpContent], { type: 'application/x-sqlite3' }) + const tables: string[] = tablesResult.map((row: any) => row.name) + + const encoder = new TextEncoder() + + const stream = new ReadableStream({ + async start(controller) { + try { + // SQLite file header + controller.enqueue(encoder.encode('SQLite format 3\0')) + + for (const table of tables) { + // ── Schema ────────────────────────────────────────── + const schemaResult = await executeOperation( + [ + { + sql: `SELECT sql FROM sqlite_master WHERE type='table' AND name=?;`, + params: [table], + }, + ], + dataSource, + config + ) + + if (schemaResult.length) { + const schema = schemaResult[0].sql + controller.enqueue( + encoder.encode( + `\n-- Table: ${table}\n${schema};\n\n` + ) + ) + } + + // ── Data (streamed in batches) ────────────────────── + let offset = 0 + let hasMore = true + + while (hasMore) { + const dataResult = await executeOperation( + [ + { + sql: `SELECT * FROM ${escapeIdent(table)} LIMIT ? OFFSET ?;`, + params: [BATCH_SIZE, offset], + }, + ], + dataSource, + config + ) + + if (!dataResult || dataResult.length === 0) { + hasMore = false + break + } + + // Build INSERT statements for this batch + const batchLines: string[] = [] + for (const row of dataResult) { + const values = Object.values(row).map((value) => + escapeValue(value) + ) + batchLines.push( + `INSERT INTO ${escapeIdent(table)} VALUES (${values.join(', ')});` + ) + } + controller.enqueue( + encoder.encode(batchLines.join('\n') + '\n') + ) + + offset += dataResult.length + + // If we got fewer rows than requested, we're done + if (dataResult.length < BATCH_SIZE) { + hasMore = false + } + + // Breathing interval – yield to the event loop so + // other in-flight requests can be served. 
+                            await sleep(BREATHING_INTERVAL_MS)
+                        }
+
+                        controller.enqueue(encoder.encode('\n'))
+                    }
+
+                    controller.close()
+                } catch (err) {
+                    controller.error(err)
+                }
+            },
+        })
 
         const headers = new Headers({
             'Content-Type': 'application/x-sqlite3',
             'Content-Disposition': 'attachment; filename="database_dump.sql"',
         })
 
-    return new Response(blob, { headers })
+        return new Response(stream, { headers })
     } catch (error: any) {
         console.error('Database Dump Error:', error)
         return createResponse(undefined, 'Failed to create database dump', 500)
     }
 }
+
+// ── Helpers ────────────────────────────────────────────────────────────────
+
+/** Escape a SQL identifier (table/column name) – very basic but sufficient for generated code. */
+function escapeIdent(name: string): string {
+    return `"${name.replace(/"/g, '""')}"`
+}
+
+/** Format a JS value as a SQL literal. */
+function escapeValue(value: unknown): string {
+    if (value === null || value === undefined) {
+        return 'NULL'
+    }
+    if (typeof value === 'string') {
+        return `'${value.replace(/'/g, "''")}'`
+    }
+    if (typeof value === 'number' || typeof value === 'bigint') {
+        return String(value)
+    }
+    if (typeof value === 'boolean') {
+        return value ? '1' : '0'
+    }
+    // ArrayBuffer / Uint8Array → hex blob literal
+    if (value instanceof Uint8Array || value instanceof ArrayBuffer) {
+        const bytes =
+            value instanceof ArrayBuffer ? new Uint8Array(value) : value
+        const hex = Array.from(bytes)
+            .map((b) => b.toString(16).padStart(2, '0'))
+            .join('')
+        return `X'${hex}'`
+    }
+    // Fallback: stringify as quoted text
+    return `'${String(value).replace(/'/g, "''")}'`
+}
+
+function sleep(ms: number): Promise<void> {
+    return new Promise((resolve) => setTimeout(resolve, ms))
+}