From 2006959ea745adf5cb5704ee87c1b00f9b425d5f Mon Sep 17 00:00:00 2001 From: djstrong Date: Tue, 30 Sep 2025 14:27:50 +0200 Subject: [PATCH 01/28] feat: add CSV conversion command to ensrainbow CLI - Introduced `convert-csv` command for converting CSV files to .ensrainbow format. - Added support for single and two-column CSV formats. - Implemented error handling for invalid CSV data. - Created tests for various CSV scenarios, including special characters and invalid formats. - Updated package dependencies to include `csv-simple-parser` for CSV parsing. --- apps/ensrainbow/package.json | 3 +- apps/ensrainbow/src/cli.ts | 49 +++- .../src/commands/convert-csv-command.test.ts | 241 +++++++++++++++++ .../src/commands/convert-csv-command.ts | 248 ++++++++++++++++++ .../test/fixtures/test_labels_1col.csv | 10 + .../test/fixtures/test_labels_2col.csv | 10 + .../fixtures/test_labels_invalid_first.csv | 3 + .../fixtures/test_labels_invalid_hash.csv | 4 + .../fixtures/test_labels_special_chars.csv | 10 + pnpm-lock.yaml | 17 +- 10 files changed, 591 insertions(+), 4 deletions(-) create mode 100644 apps/ensrainbow/src/commands/convert-csv-command.test.ts create mode 100644 apps/ensrainbow/src/commands/convert-csv-command.ts create mode 100644 apps/ensrainbow/test/fixtures/test_labels_1col.csv create mode 100644 apps/ensrainbow/test/fixtures/test_labels_2col.csv create mode 100644 apps/ensrainbow/test/fixtures/test_labels_invalid_first.csv create mode 100644 apps/ensrainbow/test/fixtures/test_labels_invalid_hash.csv create mode 100644 apps/ensrainbow/test/fixtures/test_labels_special_chars.csv diff --git a/apps/ensrainbow/package.json b/apps/ensrainbow/package.json index ea7c2b95c..af46315e9 100644 --- a/apps/ensrainbow/package.json +++ b/apps/ensrainbow/package.json @@ -38,7 +38,8 @@ "progress": "^2.0.3", "protobufjs": "^7.4.0", "viem": "catalog:", - "yargs": "^17.7.2" + "yargs": "^17.7.2", + "csv-simple-parser": "^2.0.2" }, "devDependencies": { "@ensnode/shared-configs": "workspace:*", diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 3fdc0d530..063c48df2 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -13,6 +13,7 @@ import { } from "@ensnode/ensnode-sdk"; import { convertCommand } from "@/commands/convert-command"; +import { convertCsvCommand } from "@/commands/convert-csv-command"; // import { ingestCommand } from "@/commands/ingest-command"; import { ingestProtobufCommand } from "@/commands/ingest-protobuf-command"; import { purgeCommand } from "@/commands/purge-command"; @@ -61,6 +62,13 @@ interface ConvertArgs { "label-set-version": LabelSetVersion; } +interface ConvertCsvArgs { + "input-file": string; + "output-file": string; + "label-set-id": LabelSetId; + "label-set-version": LabelSetVersion; +} + export interface CLIOptions { exitProcess?: boolean; } @@ -184,7 +192,7 @@ export function createCLI(options: CLIOptions = {}) { ) .command( "convert", - "Convert rainbow tables from SQL dump to protobuf format", + "Convert rainbow tables from SQL dump to ensrainbow format", (yargs: Argv) => { return yargs .option("input-file", { @@ -194,7 +202,7 @@ export function createCLI(options: CLIOptions = {}) { }) .option("output-file", { type: "string", - description: "Path to the output protobuf file", + description: "Path to the output ensrainbow file", default: join(process.cwd(), "rainbow-records.ensrainbow"), }) .option("label-set-id", { @@ -219,6 +227,43 @@ export function createCLI(options: CLIOptions = {}) { }); }, ) + .command( + "convert-csv", + 
"Convert rainbow tables from CSV format to ensrainbow format", + (yargs: Argv) => { + return yargs + .option("input-file", { + type: "string", + description: "Path to the CSV input file", + demandOption: true, + }) + .option("output-file", { + type: "string", + description: "Path to the output ensrainbow file", + default: join(process.cwd(), "rainbow-records.ensrainbow"), + }) + .option("label-set-id", { + type: "string", + description: "Label set id for the rainbow record collection", + demandOption: true, + }) + .coerce("label-set-id", buildLabelSetId) + .option("label-set-version", { + type: "number", + description: "Label set version for the rainbow record collection", + demandOption: true, + }) + .coerce("label-set-version", buildLabelSetVersion); + }, + async (argv: ArgumentsCamelCase) => { + await convertCsvCommand({ + inputFile: argv["input-file"], + outputFile: argv["output-file"], + labelSetId: argv["label-set-id"], + labelSetVersion: argv["label-set-version"], + }); + }, + ) .demandCommand(1, "You must specify a command") .fail((msg, err, yargs) => { if (process.env.VITEST) { diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts new file mode 100644 index 000000000..2be46d924 --- /dev/null +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -0,0 +1,241 @@ +import { tmpdir } from "os"; +import { join } from "path"; +import { mkdtemp, rm, stat, writeFile } from "fs/promises"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +import { createCLI } from "@/cli"; +import { type LabelSetId, type LabelSetVersion } from "@ensnode/ensnode-sdk"; +import { convertCsvCommand } from "./convert-csv-command"; + +// Path to test fixtures +const TEST_FIXTURES_DIR = join(__dirname, "..", "..", "test", "fixtures"); + +describe("convert-csv-command", () => { + let tempDir: string; + + beforeEach(async () => { + vi.stubEnv("NODE_ENV", "test"); + tempDir = await mkdtemp(join(tmpdir(), "ensrainbow-csv-test-")); + }); + + afterEach(async () => { + vi.unstubAllEnvs(); + vi.restoreAllMocks(); + await rm(tempDir, { recursive: true, force: true }); + }); + + describe("CSV conversion and ingestion", () => { + it("should convert single column CSV and successfully ingest into database", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); + const outputFile = join(tempDir, "output_1col.ensrainbow"); + const dataDir = join(tempDir, "db_1col"); + + // Convert CSV to ensrainbow format + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-csv-one-col" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + // Verify the output file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Ingest the converted file into database + const cli = createCLI({ exitProcess: false }); + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + // Verify database was created + const dbStats = await stat(dataDir); + expect(dbStats.isDirectory()).toBe(true); + + // Verify database contents by validating it + await cli.parse(["validate", "--data-dir", dataDir, "--lite"]); + + // Database validation passed, which means records are accessible + }); + + it("should convert two column CSV with provided hashes and ingest successfully", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_2col.csv"); + const outputFile = 
join(tempDir, "output_2col.ensrainbow"); + const dataDir = join(tempDir, "db_2col"); + + // Convert CSV to ensrainbow format + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-csv-two-col" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + // Verify the output file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Ingest the converted file into database + const cli = createCLI({ exitProcess: false }); + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + // Verify database was created + const dbStats = await stat(dataDir); + expect(dbStats.isDirectory()).toBe(true); + }); + + it("should fail when CSV has inconsistent column count", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_invalid_first.csv"); + const outputFile = join(tempDir, "output_invalid.ensrainbow"); + + // Convert CSV to ensrainbow format (should fail on inconsistent columns) + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-csv-invalid" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }), + ).rejects.toThrow(/CSV conversion failed due to invalid data/); + }); + + it("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_special_chars.csv"); + const outputFile = join(tempDir, "output_special.ensrainbow"); + + // Convert CSV to ensrainbow format + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-csv-special" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + // Verify output file was created + const outputStats = await stat(outputFile); + expect(outputStats.isFile()).toBe(true); + expect(outputStats.size).toBeGreaterThan(0); + + // Verify special characters were processed correctly by checking logs + // The conversion completed successfully, which means csv-simple-parser + // handled emojis, unicode, quoted fields with commas, etc. 
+ expect(true).toBe(true); // Test passes if conversion doesn't crash + }); + + it("should fail when CSV contains invalid labelhash format", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_invalid_hash.csv"); + const outputFile = join(tempDir, "output_invalid_hash.ensrainbow"); + + // Convert CSV to ensrainbow format (should fail on invalid hash format) + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-csv-invalid-hash" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }), + ).rejects.toThrow(/CSV conversion failed due to invalid data/); + }); + }); + + describe("Error handling", () => { + it("should throw error for non-existent input file", async () => { + const inputFile = join(tempDir, "non-existent.csv"); + const outputFile = join(tempDir, "output.ensrainbow"); + + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-missing" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }), + ).rejects.toThrow(); + }); + }); + + describe("CLI integration", () => { + it("should work through the full CLI pipeline", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); + const outputFile = join(tempDir, "cli_output.ensrainbow"); + const dataDir = join(tempDir, "cli_db"); + + const cli = createCLI({ exitProcess: false }); + + // Test convert-csv command through CLI + await cli.parse([ + "convert-csv", + "--input-file", + inputFile, + "--output-file", + outputFile, + "--label-set-id", + "test-cli-csv", + "--label-set-version", + "0", + ]); + + // Verify file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Test ingestion through CLI + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + // Verify database was created + const dbStats = await stat(dataDir); + expect(dbStats.isDirectory()).toBe(true); + }); + }); + + describe("Streaming performance", () => { + it("should handle small CSV files efficiently", async () => { + const inputFile = join(tempDir, "small_test.csv"); + const outputFile = join(tempDir, "output_small.ensrainbow"); + const dataDir = join(tempDir, "db_small"); + + // Create a CSV with 100 records to test streaming + const records = []; + for (let i = 0; i < 100; i++) { + records.push(`label${i}`); + } + await writeFile(inputFile, records.join("\n")); + + const startTime = Date.now(); + + // Convert CSV + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-small" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + const conversionTime = Date.now() - startTime; + + // Should complete conversion quickly (less than 2 seconds for 100 records) + expect(conversionTime).toBeLessThan(2000); + + // Verify file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Test ingestion + const cli = createCLI({ exitProcess: false }); + const ingestStartTime = Date.now(); + + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + const ingestTime = Date.now() - ingestStartTime; + + // Should complete ingestion quickly (less than 3 seconds for 100 records) + expect(ingestTime).toBeLessThan(3000); + + // Verify database was created + const dbStats = await stat(dataDir); + expect(dbStats.isDirectory()).toBe(true); + }); + }); +}); diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts 
b/apps/ensrainbow/src/commands/convert-csv-command.ts new file mode 100644 index 000000000..1c04fbf5c --- /dev/null +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -0,0 +1,248 @@ +/** + * ENSRAINBOW CSV FILE CREATION COMMAND + * + * Converts CSV files to .ensrainbow format with csv-simple-parser + * Supports 1-column (label only) and 2-column (label,labelhash) formats + */ + +import { createReadStream, createWriteStream } from "fs"; +import { createInterface } from "readline"; +import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk"; +import parse from "csv-simple-parser"; +import { labelhash } from "viem"; +import { logger } from "../utils/logger.js"; +import { + CURRENT_ENSRAINBOW_FILE_FORMAT_VERSION, + createRainbowProtobufRoot, +} from "../utils/protobuf-schema.js"; + +/** + * Parse CSV using csv-simple-parser + */ +function parseCsvLine(line: string): string[] { + const result = parse(line); + return result.length > 0 ? (result[0] as string[]) : []; +} + +// No label validation - ENS accepts any UTF-8 string + +export interface ConvertCsvCommandOptions { + inputFile: string; + outputFile: string; + labelSetId: string; + labelSetVersion: number; +} + +interface ConversionStats { + totalLines: number; + processedRecords: number; + skippedRecords: number; + invalidLabels: number; + duplicates: number; + startTime: Date; + endTime?: Date; +} + +/** + * Process a single CSV line with csv-simple-parser and validation + */ +function processStreamingCsvLine(line: string, expectedColumns: number): string[] { + if (line.trim() === "") { + throw new Error("Empty line"); + } + + const parsedLine = parseCsvLine(line); + + // Validate column count + if (parsedLine.length !== expectedColumns) { + throw new Error( + `Expected ${expectedColumns} columns, but found ${parsedLine.length} in line: ${line}`, + ); + } + + return parsedLine; +} + +/** + * Setup input stream for reading CSV line by line + */ +function setupReadStream(inputFile: string) { + const fileStream = createReadStream(inputFile, { encoding: "utf8" }); + return createInterface({ + input: fileStream, + crlfDelay: Infinity, + }); +} + +/** + * Setup output stream for writing protobuf + */ +function setupWriteStream(outputFile: string) { + // For now, just write directly to file without gzip compression + return createWriteStream(outputFile); +} + +/** + * Write protobuf header + */ +function writeHeader( + outputStream: NodeJS.WritableStream, + RainbowRecordCollectionType: any, + labelSetId: string, + labelSetVersion: number, +) { + const headerCollection = RainbowRecordCollectionType.fromObject({ + format_identifier: "ensrainbow", + ensrainbow_file_format_version: CURRENT_ENSRAINBOW_FILE_FORMAT_VERSION, + label_set_id: labelSetId, + label_set_version: labelSetVersion, + records: [], // Header has no records + }); + // Encode and write the header collection with length-prefix encoding + outputStream.write( + Buffer.from(RainbowRecordCollectionType.encodeDelimited(headerCollection).finish()), + ); + logger.info("Wrote header message with version, label set id and label set version."); +} + +/** + * Log conversion summary + */ +function logSummary(stats: ConversionStats) { + stats.endTime = new Date(); + const duration = stats.endTime.getTime() - stats.startTime.getTime(); + + logger.info("=== Conversion Summary ==="); + logger.info(`Total lines processed: ${stats.totalLines}`); + logger.info(`Valid records: ${stats.processedRecords}`); + logger.info(`Skipped records: ${stats.skippedRecords}`); + 
logger.info(`Invalid labels: ${stats.invalidLabels}`);
+  logger.info(`Duplicates found: ${stats.duplicates}`);
+  logger.info(`Duration: ${duration}ms`);
+}
+
+/**
+ * Main CSV conversion command with true streaming using csv-simple-parser
+ */
+export async function convertCsvCommand(options: ConvertCsvCommandOptions): Promise<void> {
+  const stats: ConversionStats = {
+    totalLines: 0,
+    processedRecords: 0,
+    skippedRecords: 0,
+    invalidLabels: 0,
+    duplicates: 0,
+    startTime: new Date(),
+  };
+
+  try {
+    logger.info("Starting conversion from CSV to protobuf format...");
+    logger.info(`Input file: ${options.inputFile}`);
+    logger.info(`Output file: ${options.outputFile}`);
+    logger.info(`Label set id: ${options.labelSetId}`);
+    logger.info(`Label set version: ${options.labelSetVersion}`);
+
+    // Setup protobuf schema
+    const { RainbowRecordType, RainbowRecordCollectionType } = createRainbowProtobufRoot();
+
+    // Setup streams
+    const outputStream = setupWriteStream(options.outputFile);
+
+    // Write header
+    writeHeader(
+      outputStream,
+      RainbowRecordCollectionType,
+      options.labelSetId,
+      options.labelSetVersion,
+    );
+
+    logger.info("Reading and processing CSV file line by line with streaming...");
+
+    // Setup streaming CSV reader
+    const rl = setupReadStream(options.inputFile);
+
+    let expectedColumns: number | null = null;
+    let lineNumber = 0;
+    let processedRecords = 0;
+
+    // Process line by line with csv-simple-parser
+    for await (const line of rl) {
+      lineNumber++;
+
+      // Skip empty lines
+      if (line.trim() === "") {
+        continue;
+      }
+
+      try {
+        // For the first line, detect column count
+        if (expectedColumns === null) {
+          const firstLineParsed = parseCsvLine(line);
+          expectedColumns = firstLineParsed.length;
+          logger.info(`Detected ${expectedColumns} columns using csv-simple-parser`);
+        }
+
+        // Parse current line with csv-simple-parser
+        const parsedColumns = processStreamingCsvLine(line, expectedColumns);
+
+        // Get label (no validation - ENS accepts any UTF-8 string)
+        const label = parsedColumns[0];
+
+        // Build rainbow record immediately (streaming)
+        let rainbowRecord;
+
+        if (parsedColumns.length === 1) {
+          // Single column: compute labelhash using labelhash function
+          const labelHashBytes = labelHashToBytes(labelhash(label));
+
+          rainbowRecord = {
+            labelhash: Buffer.from(labelHashBytes),
+            label: label,
+          };
+        } else {
+          // Two columns: validate and use provided hash
+          const [, providedHash] = parsedColumns;
+
+          // Ensure the hash has 0x prefix for labelHashToBytes
+          const maybeLabelHash = providedHash.startsWith("0x") ? providedHash : `0x${providedHash}`;
+          const labelHash = labelHashToBytes(maybeLabelHash as LabelHash);
+
+          rainbowRecord = {
+            labelhash: Buffer.from(labelHash),
+            label: label,
+          };
+        }
+
+        // Create protobuf message and write immediately
+        const recordMessage = RainbowRecordType.fromObject(rainbowRecord);
+        outputStream.write(Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish()));
+
+        processedRecords++;
+
+        // Log progress for large files
+        if (processedRecords % 10000 === 0) {
+          logger.info(`Processed ${processedRecords} records so far...`);
+        }
+      } catch (error) {
+        const errorMessage = error instanceof Error ? 
error.message : String(error); + throw new Error( + `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`, + ); + } + } + + stats.totalLines = lineNumber; + stats.processedRecords = processedRecords; + + // Close output stream + outputStream.end(); + + logger.info(`✅ Processed ${processedRecords} records with streaming csv-simple-parser`); + + logSummary(stats); + logger.info("✅ CSV conversion completed successfully!"); + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error); + logger.error("❌ CSV conversion failed:", errorMessage); + throw error; + } +} diff --git a/apps/ensrainbow/test/fixtures/test_labels_1col.csv b/apps/ensrainbow/test/fixtures/test_labels_1col.csv new file mode 100644 index 000000000..d809bd116 --- /dev/null +++ b/apps/ensrainbow/test/fixtures/test_labels_1col.csv @@ -0,0 +1,10 @@ +alice +bob +charlie +domaintest +example +foundation +governance +hello +world +test123 diff --git a/apps/ensrainbow/test/fixtures/test_labels_2col.csv b/apps/ensrainbow/test/fixtures/test_labels_2col.csv new file mode 100644 index 000000000..f410bf758 --- /dev/null +++ b/apps/ensrainbow/test/fixtures/test_labels_2col.csv @@ -0,0 +1,10 @@ +alice,0x9c0257114eb9399a2985f8e75dad7600c5d89fe3824ffa99ec1c3eb8bf3b0501 +bob,0x38e47a7b719dce63662aeaf43440326f551b8a7ee198cee35cb5d517f2d296a2 +charlie,0x87a213ce1ee769e28decedefb98f6fe48890a74ba84957ebf877fb591e37e0de +domaintest,0xc2d1b32ab4268fbba175baa3dcab1eb8299bc784030b080f28eaf1b9336c0445 +example,0x6fd43e7cffc31bb581d7421c8698e29aa2bd8e7186a394b85299908b4eb9b175 +foundation,0x0d5c1bd818a4086f28314415cb375a937593efab66f8f7d2903bf2a13ed35070 +governance,0xabea6fd3db56a6e6d0242111b43ebb13d1c42709651c032c7894962023a1f90a +hello,0x1c8aff950685c2ed4bc3174f3472287b56d9517b9c948127319a09a7a36deac8 +world,0x8452c9b9140222b08593a26daa782707297be9f7b3e8281d7b4974769f19afd0 +test123,0xf81b517a242b218999ec8eec0ea6e2ddbef2a367a14e93f4a32a39e260f686ad diff --git a/apps/ensrainbow/test/fixtures/test_labels_invalid_first.csv b/apps/ensrainbow/test/fixtures/test_labels_invalid_first.csv new file mode 100644 index 000000000..3d0b7b7e0 --- /dev/null +++ b/apps/ensrainbow/test/fixtures/test_labels_invalid_first.csv @@ -0,0 +1,3 @@ +label1,hash1,extra_column +validlabel +another_valid diff --git a/apps/ensrainbow/test/fixtures/test_labels_invalid_hash.csv b/apps/ensrainbow/test/fixtures/test_labels_invalid_hash.csv new file mode 100644 index 000000000..484983db9 --- /dev/null +++ b/apps/ensrainbow/test/fixtures/test_labels_invalid_hash.csv @@ -0,0 +1,4 @@ +validlabel,0x1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef +invalidhash,not-a-hex-hash +anotherlabel,0x123 +toolong,0x1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef123456789 diff --git a/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv b/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv new file mode 100644 index 000000000..a1cc2a55f --- /dev/null +++ b/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv @@ -0,0 +1,10 @@ +🔥emoji-label🚀 +"label,with,commas" +"label with newline\n character" +Ąśćžłñ-unicode +"label-with-null\0byte" +"quoted label with spaces" +中文-chinese +😀😁😂🤣-multiple-emojis +"special""quotes""inside" +café-àçćént diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 2045e22d5..8c8c0b79b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -462,6 +462,9 @@ importers: classic-level: specifier: ^1.4.1 version: 1.4.1 + csv-simple-parser: + specifier: ^2.0.2 + version: 
2.0.2 hono: specifier: 'catalog:' version: 4.10.3 @@ -4161,6 +4164,9 @@ packages: csstype@3.2.3: resolution: {integrity: sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==} + csv-simple-parser@2.0.2: + resolution: {integrity: sha512-G9KUSB7Bh8mRjZcg340FJM96tJYPPfb+UjR6T+dOcdRLChmwOTP6jB9+rJwmqDoaPHMJW/CXabYbJ1ZEjbkrrg==} + cytoscape-cose-bilkent@4.1.0: resolution: {integrity: sha512-wgQlVIUJF13Quxiv5e1gstZ08rnZj2XaLHGoFMYXz7SkNfCDOOteKBE6SYRfA9WxxI/iBc3ajfDoc6hb/MRAHQ==} peerDependencies: @@ -4404,6 +4410,9 @@ packages: destr@2.0.5: resolution: {integrity: sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA==} + detect-eol@3.0.1: + resolution: {integrity: sha512-ncnuLiZCKO7Kt+3CpwUIV8QnnwpBsSFxGQBY6Nve18K2aOrTim2xpzDa8YunHkePt39OCfV2qOX+b7xjYSDRWg==} + detect-indent@6.1.0: resolution: {integrity: sha512-reYkTUJAZb9gUuZ2RvVCNhVHdg62RHnJ7WJl8ftMi4diZ6NWlciOzQN88pUhSELEwflJht4oQDv0F0BMlwaYtA==} engines: {node: '>=8'} @@ -8672,7 +8681,7 @@ snapshots: '@expressive-code/plugin-shiki@0.41.3': dependencies: '@expressive-code/core': 0.41.3 - shiki: 3.14.0 + shiki: 3.15.0 '@expressive-code/plugin-text-markers@0.41.3': dependencies: @@ -11662,6 +11671,10 @@ snapshots: csstype@3.2.3: {} + csv-simple-parser@2.0.2: + dependencies: + detect-eol: 3.0.1 + cytoscape-cose-bilkent@4.1.0(cytoscape@3.33.1): dependencies: cose-base: 1.0.3 @@ -11905,6 +11918,8 @@ snapshots: destr@2.0.5: {} + detect-eol@3.0.1: {} + detect-indent@6.1.0: {} detect-libc@2.1.2: {} From b49144124c2550bc02385f8a4268b6462cd8dbec Mon Sep 17 00:00:00 2001 From: djstrong Date: Tue, 30 Sep 2025 17:25:09 +0200 Subject: [PATCH 02/28] refactor --- apps/ensrainbow/src/cli.ts | 9 +- .../src/commands/convert-csv-command.ts | 242 ++++++++++-------- 2 files changed, 148 insertions(+), 103 deletions(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 063c48df2..940692729 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -67,6 +67,7 @@ interface ConvertCsvArgs { "output-file": string; "label-set-id": LabelSetId; "label-set-version": LabelSetVersion; + "progress-interval"?: number; } export interface CLIOptions { @@ -253,7 +254,12 @@ export function createCLI(options: CLIOptions = {}) { description: "Label set version for the rainbow record collection", demandOption: true, }) - .coerce("label-set-version", buildLabelSetVersion); + .coerce("label-set-version", buildLabelSetVersion) + .option("progress-interval", { + type: "number", + description: "Number of records to process before logging progress", + default: 10000, + }); }, async (argv: ArgumentsCamelCase) => { await convertCsvCommand({ @@ -261,6 +267,7 @@ export function createCLI(options: CLIOptions = {}) { outputFile: argv["output-file"], labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], + progressInterval: argv["progress-interval"], }); }, ) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 1c04fbf5c..0b4ed5d6b 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -17,11 +17,14 @@ import { } from "../utils/protobuf-schema.js"; /** - * Parse CSV using csv-simple-parser + * Parse CSV using csv-simple-parser with proper type safety */ function parseCsvLine(line: string): string[] { const result = parse(line); - return result.length > 0 ? 
(result[0] as string[]) : []; + if (result.length === 0) return []; + const firstRow = result[0]; + if (!Array.isArray(firstRow)) return []; + return firstRow.filter((item) => typeof item === "string"); } // No label validation - ENS accepts any UTF-8 string @@ -31,14 +34,15 @@ export interface ConvertCsvCommandOptions { outputFile: string; labelSetId: string; labelSetVersion: number; + progressInterval?: number; } +// Configuration constants +const DEFAULT_PROGRESS_INTERVAL = 10000; + interface ConversionStats { totalLines: number; processedRecords: number; - skippedRecords: number; - invalidLabels: number; - duplicates: number; startTime: Date; endTime?: Date; } @@ -115,12 +119,123 @@ function logSummary(stats: ConversionStats) { logger.info("=== Conversion Summary ==="); logger.info(`Total lines processed: ${stats.totalLines}`); logger.info(`Valid records: ${stats.processedRecords}`); - logger.info(`Skipped records: ${stats.skippedRecords}`); - logger.info(`Invalid labels: ${stats.invalidLabels}`); - logger.info(`Duplicates found: ${stats.duplicates}`); logger.info(`Duration: ${duration}ms`); } +/** + * Initialize conversion setup and logging + */ +function initializeConversion(options: ConvertCsvCommandOptions) { + logger.info("Starting conversion from CSV to protobuf format..."); + logger.info(`Input file: ${options.inputFile}`); + logger.info(`Output file: ${options.outputFile}`); + logger.info(`Label set id: ${options.labelSetId}`); + logger.info(`Label set version: ${options.labelSetVersion}`); + + const { RainbowRecordType, RainbowRecordCollectionType } = createRainbowProtobufRoot(); + const outputStream = setupWriteStream(options.outputFile); + + writeHeader( + outputStream, + RainbowRecordCollectionType, + options.labelSetId, + options.labelSetVersion, + ); + + logger.info("Reading and processing CSV file line by line with streaming..."); + + return { RainbowRecordType, outputStream }; +} + +/** + * Create rainbow record from parsed CSV columns + */ +function createRainbowRecord(parsedColumns: string[]): { labelhash: Buffer; label: string } { + const label = parsedColumns[0]; + + if (parsedColumns.length === 1) { + // Single column: compute labelhash using labelhash function + const labelHashBytes = labelHashToBytes(labelhash(label)); + return { + labelhash: Buffer.from(labelHashBytes), + label: label, + }; + } else { + // Two columns: validate and use provided hash + const [, providedHash] = parsedColumns; + const maybeLabelHash = providedHash.startsWith("0x") ? 
providedHash : `0x${providedHash}`;
+    const labelHash = labelHashToBytes(maybeLabelHash as LabelHash);
+    return {
+      labelhash: Buffer.from(labelHash),
+      label: label,
+    };
+  }
+}
+
+/**
+ * Process a single CSV record
+ */
+function processRecord(
+  line: string,
+  expectedColumns: number,
+  RainbowRecordType: any,
+  outputStream: NodeJS.WritableStream,
+): void {
+  const parsedColumns = processStreamingCsvLine(line, expectedColumns);
+  const rainbowRecord = createRainbowRecord(parsedColumns);
+
+  // Create protobuf message and write immediately
+  const recordMessage = RainbowRecordType.fromObject(rainbowRecord);
+  outputStream.write(Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish()));
+}
+
+/**
+ * Process the entire CSV file
+ */
+async function processCSVFile(
+  rl: ReturnType<typeof setupReadStream>,
+  RainbowRecordType: any,
+  outputStream: NodeJS.WritableStream,
+  progressInterval: number,
+): Promise<{ totalLines: number; processedRecords: number }> {
+  let expectedColumns: number | null = null;
+  let lineNumber = 0;
+  let processedRecords = 0;
+
+  for await (const line of rl) {
+    lineNumber++;
+
+    // Skip empty lines
+    if (line.trim() === "") {
+      continue;
+    }
+
+    try {
+      // For the first line, detect column count
+      if (expectedColumns === null) {
+        const firstLineParsed = parseCsvLine(line);
+        expectedColumns = firstLineParsed.length;
+        logger.info(`Detected ${expectedColumns} columns using csv-simple-parser`);
+      }
+
+      processRecord(line, expectedColumns, RainbowRecordType, outputStream);
+      processedRecords++;
+
+      // Log progress for large files
+      if (processedRecords % progressInterval === 0) {
+        logger.info(`Processed ${processedRecords} records so far...`);
+      }
+    } catch (error) {
+      const errorMessage = error instanceof Error ? error.message : String(error);
+      throw new Error(
+        `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`,
+      );
+    }
+  }
+
+  return { totalLines: lineNumber, processedRecords };
+}
+
 /**
  * Main CSV conversion command with true streaming using csv-simple-parser
  */
 export async function convertCsvCommand(options: ConvertCsvCommandOptions): Promise<void> {
   const stats: ConversionStats = {
     totalLines: 0,
     processedRecords: 0,
-    skippedRecords: 0,
-    invalidLabels: 0,
-    duplicates: 0,
     startTime: new Date(),
   };
 
+  let rl: ReturnType<typeof setupReadStream> | null = null;
+
   try {
-    logger.info("Starting conversion from CSV to protobuf format...");
-    logger.info(`Input file: ${options.inputFile}`);
-    logger.info(`Output file: ${options.outputFile}`);
-    logger.info(`Label set id: ${options.labelSetId}`);
-    logger.info(`Label set version: ${options.labelSetVersion}`);
+    const { RainbowRecordType, outputStream } = initializeConversion(options);
 
-    // Setup protobuf schema
-    const { RainbowRecordType, RainbowRecordCollectionType } = createRainbowProtobufRoot();
+    // Setup streaming CSV reader
+    rl = setupReadStream(options.inputFile);
 
-    // Setup streams
-    const outputStream = setupWriteStream(options.outputFile);
+    const progressInterval = options.progressInterval ?? 
DEFAULT_PROGRESS_INTERVAL; - // Write header - writeHeader( + // Process the CSV file + const { totalLines, processedRecords } = await processCSVFile( + rl, + RainbowRecordType, outputStream, - RainbowRecordCollectionType, - options.labelSetId, - options.labelSetVersion, + progressInterval, ); - logger.info("Reading and processing CSV file line by line with streaming..."); - - // Setup streaming CSV reader - const rl = setupReadStream(options.inputFile); - - let expectedColumns: number | null = null; - let lineNumber = 0; - let processedRecords = 0; - - // Process line by line with csv-simple-parser - for await (const line of rl) { - lineNumber++; - - // Skip empty lines - if (line.trim() === "") { - continue; - } - - try { - // For the first line, detect column count - if (expectedColumns === null) { - const firstLineParsed = parseCsvLine(line); - expectedColumns = firstLineParsed.length; - logger.info(`Detected ${expectedColumns} columns using csv-simple-parser`); - } - - // Parse current line with csv-simple-parser - const parsedColumns = processStreamingCsvLine(line, expectedColumns); - - // Get label (no validation - ENS accepts any UTF-8 string) - const label = parsedColumns[0]; - - // Build rainbow record immediately (streaming) - let rainbowRecord; - - if (parsedColumns.length === 1) { - // Single column: compute labelhash using labelhash function - const labelHashBytes = labelHashToBytes(labelhash(label)); - - rainbowRecord = { - labelhash: Buffer.from(labelHashBytes), - label: label, - }; - } else { - // Two columns: validate and use provided hash - const [, providedHash] = parsedColumns; - - // Ensure the hash has 0x prefix for labelHashToBytes - const maybeLabelHash = providedHash.startsWith("0x") ? providedHash : `0x${providedHash}`; - const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); - - rainbowRecord = { - labelhash: Buffer.from(labelHash), - label: label, - }; - } - - // Create protobuf message and write immediately - const recordMessage = RainbowRecordType.fromObject(rainbowRecord); - outputStream.write(Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish())); - - processedRecords++; - - // Log progress for large files - if (processedRecords % 10000 === 0) { - logger.info(`Processed ${processedRecords} records so far...`); - } - } catch (error) { - const errorMessage = error instanceof Error ? error.message : String(error); - throw new Error( - `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`, - ); - } - } - - stats.totalLines = lineNumber; + stats.totalLines = totalLines; stats.processedRecords = processedRecords; // Close output stream outputStream.end(); logger.info(`✅ Processed ${processedRecords} records with streaming csv-simple-parser`); - logSummary(stats); logger.info("✅ CSV conversion completed successfully!"); } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error); logger.error("❌ CSV conversion failed:", errorMessage); throw error; + } finally { + // Ensure readline interface is properly closed to prevent resource leaks + if (rl) { + rl.close(); + } } } From 4c18e0b904791a51fb1baf0d3092b58908361629 Mon Sep 17 00:00:00 2001 From: "kwrobel.eth" Date: Tue, 30 Sep 2025 14:29:40 +0200 Subject: [PATCH 03/28] Create brave-kiwis-notice.md --- .changeset/brave-kiwis-notice.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/brave-kiwis-notice.md diff --git a/.changeset/brave-kiwis-notice.md b/.changeset/brave-kiwis-notice.md new file mode 100644 index 000000000..fbdba8bfc --- /dev/null +++ b/.changeset/brave-kiwis-notice.md @@ -0,0 +1,5 @@ +--- +"ensrainbow": patch +--- + +feat: add CSV conversion command to ensrainbow CLI From 5aefe9dab4bff69fbcadf14879186838edd78184 Mon Sep 17 00:00:00 2001 From: djstrong Date: Wed, 1 Oct 2025 17:21:11 +0200 Subject: [PATCH 04/28] fix tests --- .../src/commands/convert-csv-command.test.ts | 54 ++++++++++++++----- .../src/commands/convert-csv-command.ts | 7 ++- .../test/fixtures/test_labels_1col.csv | 1 + .../test/fixtures/test_labels_2col.csv | 2 +- .../fixtures/test_labels_special_chars.csv | 3 +- 5 files changed, 47 insertions(+), 20 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 2be46d924..16a6c5cdb 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -4,8 +4,10 @@ import { mkdtemp, rm, stat, writeFile } from "fs/promises"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { createCLI } from "@/cli"; -import { type LabelSetId, type LabelSetVersion } from "@ensnode/ensnode-sdk"; +import { labelHashToBytes, type LabelSetId, type LabelSetVersion } from "@ensnode/ensnode-sdk"; import { convertCsvCommand } from "./convert-csv-command"; +import { ENSRainbowDB } from "@/lib/database"; +import { labelhash } from "viem"; // Path to test fixtures const TEST_FIXTURES_DIR = join(__dirname, "..", "..", "test", "fixtures"); @@ -47,14 +49,13 @@ describe("convert-csv-command", () => { const cli = createCLI({ exitProcess: false }); await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); - // Verify database was created - const dbStats = await stat(dataDir); - expect(dbStats.isDirectory()).toBe(true); - - // Verify database contents by validating it - await cli.parse(["validate", "--data-dir", dataDir, "--lite"]); - - // Database validation passed, which means records are accessible + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + const recordsCount = await db.getPrecalculatedRainbowRecordCount(); + expect(recordsCount).toBe(11); + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("123"))))?.label).toBe("123"); + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + await db.close(); }); it("should convert two column CSV with provided hashes and ingest successfully", async () => { @@ -79,9 +80,13 @@ describe("convert-csv-command", () => { const cli = createCLI({ exitProcess: false }); await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); - // Verify database was created - const dbStats = await stat(dataDir); - expect(dbStats.isDirectory()).toBe(true); + const db = await ENSRainbowDB.open(dataDir); + 
expect(await db.validate()).toBe(true); + const recordsCount = await db.getPrecalculatedRainbowRecordCount(); + expect(recordsCount).toBe(10); + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("test123"))))?.label).toBe("test123"); + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + await db.close(); }); it("should fail when CSV has inconsistent column count", async () => { @@ -99,9 +104,10 @@ describe("convert-csv-command", () => { ).rejects.toThrow(/CSV conversion failed due to invalid data/); }); - it("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { + it.only("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { const inputFile = join(TEST_FIXTURES_DIR, "test_labels_special_chars.csv"); const outputFile = join(tempDir, "output_special.ensrainbow"); + const dataDir = join(tempDir, "db_special"); // Convert CSV to ensrainbow format await convertCsvCommand({ @@ -119,7 +125,27 @@ describe("convert-csv-command", () => { // Verify special characters were processed correctly by checking logs // The conversion completed successfully, which means csv-simple-parser // handled emojis, unicode, quoted fields with commas, etc. - expect(true).toBe(true); // Test passes if conversion doesn't crash + + // Ingest the converted file into database + const cli = createCLI({ exitProcess: false }); + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + const recordsCount = await db.getPrecalculatedRainbowRecordCount(); + expect(recordsCount).toBe(10); + const labels = [ + "🔥emoji-label🚀", + "special\"quotes\"inside", + "label with newline\n character", + "label-with-null\0byte", + ]; + for (const label of labels) { + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash(label))))?.label).toBe(label); + } + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + await db.close(); + }); it("should fail when CSV contains invalid labelhash format", async () => { diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 0b4ed5d6b..7b08da655 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -20,15 +20,13 @@ import { * Parse CSV using csv-simple-parser with proper type safety */ function parseCsvLine(line: string): string[] { - const result = parse(line); + const result = parse(line, {optimistic: false}); if (result.length === 0) return []; const firstRow = result[0]; if (!Array.isArray(firstRow)) return []; - return firstRow.filter((item) => typeof item === "string"); + return firstRow.map((item) => String(item)); } -// No label validation - ENS accepts any UTF-8 string - export interface ConvertCsvCommandOptions { inputFile: string; outputFile: string; @@ -156,6 +154,7 @@ function createRainbowRecord(parsedColumns: string[]): { labelhash: Buffer; labe if (parsedColumns.length === 1) { // Single column: compute labelhash using labelhash function const labelHashBytes = labelHashToBytes(labelhash(label)); + console.log(label); return { labelhash: Buffer.from(labelHashBytes), label: label, diff --git a/apps/ensrainbow/test/fixtures/test_labels_1col.csv b/apps/ensrainbow/test/fixtures/test_labels_1col.csv index d809bd116..302ef8d63 100644 --- 
a/apps/ensrainbow/test/fixtures/test_labels_1col.csv +++ b/apps/ensrainbow/test/fixtures/test_labels_1col.csv @@ -8,3 +8,4 @@ governance hello world test123 +123 diff --git a/apps/ensrainbow/test/fixtures/test_labels_2col.csv b/apps/ensrainbow/test/fixtures/test_labels_2col.csv index f410bf758..e02a65762 100644 --- a/apps/ensrainbow/test/fixtures/test_labels_2col.csv +++ b/apps/ensrainbow/test/fixtures/test_labels_2col.csv @@ -1,7 +1,7 @@ alice,0x9c0257114eb9399a2985f8e75dad7600c5d89fe3824ffa99ec1c3eb8bf3b0501 bob,0x38e47a7b719dce63662aeaf43440326f551b8a7ee198cee35cb5d517f2d296a2 charlie,0x87a213ce1ee769e28decedefb98f6fe48890a74ba84957ebf877fb591e37e0de -domaintest,0xc2d1b32ab4268fbba175baa3dcab1eb8299bc784030b080f28eaf1b9336c0445 +domaintest,0x56827be2a1678c2593e2a613fe8c4138ec451ab019d70cd890e007f99b513be1 example,0x6fd43e7cffc31bb581d7421c8698e29aa2bd8e7186a394b85299908b4eb9b175 foundation,0x0d5c1bd818a4086f28314415cb375a937593efab66f8f7d2903bf2a13ed35070 governance,0xabea6fd3db56a6e6d0242111b43ebb13d1c42709651c032c7894962023a1f90a diff --git a/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv b/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv index a1cc2a55f..300cfc70a 100644 --- a/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv +++ b/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv @@ -1,6 +1,7 @@ 🔥emoji-label🚀 "label,with,commas" -"label with newline\n character" +"label with newline + character" Ąśćžłñ-unicode "label-with-null\0byte" "quoted label with spaces" From f2c8f20309c1d5e3f40c3ad8dc530e1200f697de Mon Sep 17 00:00:00 2001 From: djstrong Date: Wed, 1 Oct 2025 18:07:33 +0200 Subject: [PATCH 05/28] use fast-csv package --- apps/ensrainbow/package.json | 2 +- .../src/commands/convert-csv-command.test.ts | 29 +-- .../src/commands/convert-csv-command.ts | 175 +++++++----------- .../fixtures/test_labels_special_chars.csv | Bin 235 -> 234 bytes pnpm-lock.yaml | 60 ++++-- 5 files changed, 134 insertions(+), 132 deletions(-) diff --git a/apps/ensrainbow/package.json b/apps/ensrainbow/package.json index af46315e9..046cb2e2e 100644 --- a/apps/ensrainbow/package.json +++ b/apps/ensrainbow/package.json @@ -39,7 +39,7 @@ "protobufjs": "^7.4.0", "viem": "catalog:", "yargs": "^17.7.2", - "csv-simple-parser": "^2.0.2" + "@fast-csv/parse": "^5.0.0" }, "devDependencies": { "@ensnode/shared-configs": "workspace:*", diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 16a6c5cdb..795e53bdc 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -4,10 +4,10 @@ import { mkdtemp, rm, stat, writeFile } from "fs/promises"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { createCLI } from "@/cli"; -import { labelHashToBytes, type LabelSetId, type LabelSetVersion } from "@ensnode/ensnode-sdk"; -import { convertCsvCommand } from "./convert-csv-command"; import { ENSRainbowDB } from "@/lib/database"; +import { type LabelSetId, type LabelSetVersion, labelHashToBytes } from "@ensnode/ensnode-sdk"; import { labelhash } from "viem"; +import { convertCsvCommand } from "./convert-csv-command"; // Path to test fixtures const TEST_FIXTURES_DIR = join(__dirname, "..", "..", "test", "fixtures"); @@ -53,8 +53,10 @@ describe("convert-csv-command", () => { expect(await db.validate()).toBe(true); const recordsCount = await db.getPrecalculatedRainbowRecordCount(); 
expect(recordsCount).toBe(11); - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("123"))))?.label).toBe("123"); - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("123"))))?.label).toBe( + "123", + ); + expect(await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234")))).toBe(null); await db.close(); }); @@ -84,8 +86,10 @@ describe("convert-csv-command", () => { expect(await db.validate()).toBe(true); const recordsCount = await db.getPrecalculatedRainbowRecordCount(); expect(recordsCount).toBe(10); - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("test123"))))?.label).toBe("test123"); - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("test123"))))?.label, + ).toBe("test123"); + expect(await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234")))).toBe(null); await db.close(); }); @@ -104,7 +108,7 @@ describe("convert-csv-command", () => { ).rejects.toThrow(/CSV conversion failed due to invalid data/); }); - it.only("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { + it("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { const inputFile = join(TEST_FIXTURES_DIR, "test_labels_special_chars.csv"); const outputFile = join(tempDir, "output_special.ensrainbow"); const dataDir = join(tempDir, "db_special"); @@ -135,17 +139,18 @@ describe("convert-csv-command", () => { const recordsCount = await db.getPrecalculatedRainbowRecordCount(); expect(recordsCount).toBe(10); const labels = [ - "🔥emoji-label🚀", - "special\"quotes\"inside", + "🔥emoji-label🚀", + 'special"quotes"inside', "label with newline\n character", "label-with-null\0byte", ]; for (const label of labels) { - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash(label))))?.label).toBe(label); + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash(label))))?.label, + ).toBe(label); } - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + expect(await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234")))).toBe(null); await db.close(); - }); it("should fail when CSV contains invalid labelhash format", async () => { diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 7b08da655..14ae2d4b3 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -1,14 +1,13 @@ /** * ENSRAINBOW CSV FILE CREATION COMMAND * - * Converts CSV files to .ensrainbow format with csv-simple-parser + * Converts CSV files to .ensrainbow format with fast-csv * Supports 1-column (label only) and 2-column (label,labelhash) formats */ import { createReadStream, createWriteStream } from "fs"; -import { createInterface } from "readline"; import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk"; -import parse from "csv-simple-parser"; +import { parse } from "@fast-csv/parse"; import { labelhash } from "viem"; import { logger } from "../utils/logger.js"; import { @@ -16,17 +15,6 @@ import { createRainbowProtobufRoot, } from "../utils/protobuf-schema.js"; -/** - * Parse CSV using csv-simple-parser with proper type safety - */ -function parseCsvLine(line: 
string): string[] { - const result = parse(line, {optimistic: false}); - if (result.length === 0) return []; - const firstRow = result[0]; - if (!Array.isArray(firstRow)) return []; - return firstRow.map((item) => String(item)); -} - export interface ConvertCsvCommandOptions { inputFile: string; outputFile: string; @@ -45,37 +33,6 @@ interface ConversionStats { endTime?: Date; } -/** - * Process a single CSV line with csv-simple-parser and validation - */ -function processStreamingCsvLine(line: string, expectedColumns: number): string[] { - if (line.trim() === "") { - throw new Error("Empty line"); - } - - const parsedLine = parseCsvLine(line); - - // Validate column count - if (parsedLine.length !== expectedColumns) { - throw new Error( - `Expected ${expectedColumns} columns, but found ${parsedLine.length} in line: ${line}`, - ); - } - - return parsedLine; -} - -/** - * Setup input stream for reading CSV line by line - */ -function setupReadStream(inputFile: string) { - const fileStream = createReadStream(inputFile, { encoding: "utf8" }); - return createInterface({ - input: fileStream, - crlfDelay: Infinity, - }); -} - /** * Setup output stream for writing protobuf */ @@ -146,12 +103,12 @@ function initializeConversion(options: ConvertCsvCommandOptions) { } /** - * Create rainbow record from parsed CSV columns + * Create rainbow record from parsed CSV row */ -function createRainbowRecord(parsedColumns: string[]): { labelhash: Buffer; label: string } { - const label = parsedColumns[0]; +function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string } { + const label = String(row[0]); - if (parsedColumns.length === 1) { + if (row.length === 1) { // Single column: compute labelhash using labelhash function const labelHashBytes = labelHashToBytes(labelhash(label)); console.log(label); @@ -161,7 +118,7 @@ function createRainbowRecord(parsedColumns: string[]): { labelhash: Buffer; labe }; } else { // Two columns: validate and use provided hash - const [, providedHash] = parsedColumns; + const providedHash = String(row[1]); const maybeLabelHash = providedHash.startsWith("0x") ? 
providedHash : `0x${providedHash}`;
     const labelHash = labelHashToBytes(maybeLabelHash as LabelHash);
     return {
       labelhash: Buffer.from(labelHash),
       label: label,
     };
   }
 }
 
 /**
  * Process a single CSV record
  */
 function processRecord(
-  line: string,
+  row: string[],
   expectedColumns: number,
   RainbowRecordType: any,
   outputStream: NodeJS.WritableStream,
+  lineNumber: number,
 ): void {
-  const parsedColumns = processStreamingCsvLine(line, expectedColumns);
-  const rainbowRecord = createRainbowRecord(parsedColumns);
+  // Validate column count
+  if (row.length !== expectedColumns) {
+    throw new Error(
+      `Expected ${expectedColumns} columns, but found ${row.length} in line ${lineNumber}`,
+    );
+  }
+
+  const rainbowRecord = createRainbowRecord(row);
 
   // Create protobuf message and write immediately
   const recordMessage = RainbowRecordType.fromObject(rainbowRecord);
@@ -189,54 +153,67 @@ function processRecord(
 }
 
 /**
- * Process the entire CSV file
+ * Process the entire CSV file using fast-csv
  */
 async function processCSVFile(
-  rl: ReturnType<typeof setupReadStream>,
+  inputFile: string,
   RainbowRecordType: any,
   outputStream: NodeJS.WritableStream,
   progressInterval: number,
 ): Promise<{ totalLines: number; processedRecords: number }> {
-  let expectedColumns: number | null = null;
-  let lineNumber = 0;
-  let processedRecords = 0;
-
-  for await (const line of rl) {
-    lineNumber++;
-
-    // Skip empty lines
-    if (line.trim() === "") {
-      continue;
-    }
-
-    try {
-      // For the first line, detect column count
-      if (expectedColumns === null) {
-        const firstLineParsed = parseCsvLine(line);
-        expectedColumns = firstLineParsed.length;
-        logger.info(`Detected ${expectedColumns} columns using csv-simple-parser`);
-      }
-
-      processRecord(line, expectedColumns, RainbowRecordType, outputStream);
-      processedRecords++;
-
-      // Log progress for large files
-      if (processedRecords % progressInterval === 0) {
-        logger.info(`Processed ${processedRecords} records so far...`);
-      }
-    } catch (error) {
-      const errorMessage = error instanceof Error ? error.message : String(error);
-      throw new Error(
-        `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`,
-      );
-    }
-  }
-
-  return { totalLines: lineNumber, processedRecords };
+  return new Promise((resolve, reject) => {
+    let expectedColumns: number | null = null;
+    let lineNumber = 0;
+    let processedRecords = 0;
+
+    const fileStream = createReadStream(inputFile, { encoding: "utf8" });
+
+    const csvStream = parse()
+      .on("data", (row: string[]) => {
+        lineNumber++;
+
+        try {
+          // For the first row, detect column count
+          if (expectedColumns === null) {
+            expectedColumns = row.length;
+            logger.info(`Detected ${expectedColumns} columns using fast-csv`);
+          }
+
+          processRecord(row, expectedColumns, RainbowRecordType, outputStream, lineNumber);
+          processedRecords++;
+
+          // Log progress for large files
+          if (processedRecords % progressInterval === 0) {
+            logger.info(`Processed ${processedRecords} records so far...`);
+          }
+        } catch (error) {
+          const errorMessage = error instanceof Error ? 
error.message : String(error);
+          csvStream.destroy();
+          fileStream.destroy();
+          reject(
+            new Error(
+              `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`,
+            ),
+          );
+        }
+      })
+      .on("error", (error: Error) => {
+        reject(new Error(`CSV parsing error: ${error.message}`));
+      })
+      .on("end", () => {
+        resolve({ totalLines: lineNumber, processedRecords });
+      });
+
+    fileStream
+      .on("error", (error: Error) => {
+        reject(error);
+      })
+      .pipe(csvStream);
+  });
 }
 
 /**
- * Main CSV conversion command with true streaming using csv-simple-parser
+ * Main CSV conversion command with true streaming using fast-csv
  */
 export async function convertCsvCommand(options: ConvertCsvCommandOptions): Promise<void> {
   const stats: ConversionStats = {
     totalLines: 0,
     processedRecords: 0,
     startTime: new Date(),
   };
 
-  let rl: ReturnType<typeof setupReadStream> | null = null;
-
   try {
     const { RainbowRecordType, outputStream } = initializeConversion(options);
 
-    // Setup streaming CSV reader
-    rl = setupReadStream(options.inputFile);
-
     const progressInterval = options.progressInterval ?? 
error.message : String(error); logger.error("❌ CSV conversion failed:", errorMessage); throw error; - } finally { - // Ensure readline interface is properly closed to prevent resource leaks - if (rl) { - rl.close(); - } } } diff --git a/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv b/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv index 300cfc70a9f1230c7346e7b38832f742eb463706..ac2a1f80d8fad7fafbcde1febbe21d95dd15e545 100644 GIT binary patch delta 11 ScmaFO_=<5tE+fOl{2Blp00dG1 delta 12 TcmaFG_?mG-E>n!b#Jm~+A!G#J diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8c8c0b79b..3dea391e0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -456,15 +456,15 @@ importers: '@ensnode/ensrainbow-sdk': specifier: workspace:* version: link:../../packages/ensrainbow-sdk + '@fast-csv/parse': + specifier: ^5.0.0 + version: 5.0.5 '@hono/node-server': specifier: ^1.4.1 version: 1.19.5(hono@4.10.3) classic-level: specifier: ^1.4.1 version: 1.4.1 - csv-simple-parser: - specifier: ^2.0.2 - version: 2.0.2 hono: specifier: 'catalog:' version: 4.10.3 @@ -1518,6 +1518,9 @@ packages: '@expressive-code/plugin-text-markers@0.41.3': resolution: {integrity: sha512-SN8tkIzDpA0HLAscEYD2IVrfLiid6qEdE9QLlGVSxO1KEw7qYvjpbNBQjUjMr5/jvTJ7ys6zysU2vLPHE0sb2g==} + '@fast-csv/parse@5.0.5': + resolution: {integrity: sha512-M0IbaXZDbxfOnpVE5Kps/a6FGlILLhtLsvWd9qNH3d2TxNnpbNkFf3KD26OmJX6MHq7PdQAl5htStDwnuwHx6w==} + '@fastify/busboy@3.2.0': resolution: {integrity: sha512-m9FVDXU3GT2ITSe0UaMA5rU3QkfC/UXtCU8y0gSN/GugTqtVldOBWIB5V6V3sbmenVZUIpU6f+mPEO2+m5iTaA==} @@ -4164,9 +4167,6 @@ packages: csstype@3.2.3: resolution: {integrity: sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==} - csv-simple-parser@2.0.2: - resolution: {integrity: sha512-G9KUSB7Bh8mRjZcg340FJM96tJYPPfb+UjR6T+dOcdRLChmwOTP6jB9+rJwmqDoaPHMJW/CXabYbJ1ZEjbkrrg==} - cytoscape-cose-bilkent@4.1.0: resolution: {integrity: sha512-wgQlVIUJF13Quxiv5e1gstZ08rnZj2XaLHGoFMYXz7SkNfCDOOteKBE6SYRfA9WxxI/iBc3ajfDoc6hb/MRAHQ==} peerDependencies: @@ -4410,9 +4410,6 @@ packages: destr@2.0.5: resolution: {integrity: sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA==} - detect-eol@3.0.1: - resolution: {integrity: sha512-ncnuLiZCKO7Kt+3CpwUIV8QnnwpBsSFxGQBY6Nve18K2aOrTim2xpzDa8YunHkePt39OCfV2qOX+b7xjYSDRWg==} - detect-indent@6.1.0: resolution: {integrity: sha512-reYkTUJAZb9gUuZ2RvVCNhVHdg62RHnJ7WJl8ftMi4diZ6NWlciOzQN88pUhSELEwflJht4oQDv0F0BMlwaYtA==} engines: {node: '>=8'} @@ -5480,12 +5477,30 @@ packages: lodash.debounce@4.0.8: resolution: {integrity: sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==} + lodash.escaperegexp@4.1.2: + resolution: {integrity: sha512-TM9YBvyC84ZxE3rgfefxUWiQKLilstD6k7PTGt6wfbtXF8ixIJLOL3VYyV/z+ZiPLsVxAsKAFVwWlWeb2Y8Yyw==} + + lodash.groupby@4.6.0: + resolution: {integrity: sha512-5dcWxm23+VAoz+awKmBaiBvzox8+RqMgFhi7UvX9DHZr2HdxHXM/Wrf8cfKpsW37RNrvtPn6hSwNqurSILbmJw==} + + lodash.isfunction@3.0.9: + resolution: {integrity: sha512-AirXNj15uRIMMPihnkInB4i3NHeb4iBtNg9WRWuK2o31S+ePwwNmDPaTL3o7dTJ+VXNZim7rFs4rxN4YU1oUJw==} + + lodash.isnil@4.0.0: + resolution: {integrity: sha512-up2Mzq3545mwVnMhTDMdfoG1OurpA/s5t88JmQX809eH3C8491iu2sfKhTfhQtKY78oPNhiaHJUpT/dUDAAtng==} + + lodash.isundefined@3.0.1: + resolution: {integrity: sha512-MXB1is3s899/cD8jheYYE2V9qTHwKvt+npCwpD+1Sxm3Q3cECXCiYHjeHWXNwr6Q0SOBPrYUDxendrO6goVTEA==} + lodash.sortby@4.7.0: resolution: {integrity: 
sha512-HDWXG8isMntAyRF5vZ7xKuEvOhT4AhlRt/3czTSjvGUxjYCBVRQY48ViDHyfYz9VIoBkW4TMGQNapx+l3RUwdA==} lodash.startcase@4.4.0: resolution: {integrity: sha512-+WKqsK294HMSc2jEbNgpHpd0JfIBhp7rEV4aqXWqFr6AlXov+SlcgB1Fv01y2kGe3Gc8nMW7VA0SrGuSkRfIEg==} + lodash.uniq@4.5.0: + resolution: {integrity: sha512-xfBaXQd9ryd9dlSDvnvI0lvxfLJlYAZzXomUYzLKtUeOQvOP5piqAWuGtrhWeqaXK9hhoM/iyJc5AV+XfsX3HQ==} + lodash@4.17.21: resolution: {integrity: sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==} @@ -8687,6 +8702,15 @@ snapshots: dependencies: '@expressive-code/core': 0.41.3 + '@fast-csv/parse@5.0.5': + dependencies: + lodash.escaperegexp: 4.1.2 + lodash.groupby: 4.6.0 + lodash.isfunction: 3.0.9 + lodash.isnil: 4.0.0 + lodash.isundefined: 3.0.1 + lodash.uniq: 4.5.0 + '@fastify/busboy@3.2.0': {} '@floating-ui/core@1.7.3': @@ -11671,10 +11695,6 @@ snapshots: csstype@3.2.3: {} - csv-simple-parser@2.0.2: - dependencies: - detect-eol: 3.0.1 - cytoscape-cose-bilkent@4.1.0(cytoscape@3.33.1): dependencies: cose-base: 1.0.3 @@ -11918,8 +11938,6 @@ snapshots: destr@2.0.5: {} - detect-eol@3.0.1: {} - detect-indent@6.1.0: {} detect-libc@2.1.2: {} @@ -13029,10 +13047,22 @@ snapshots: lodash.debounce@4.0.8: {} + lodash.escaperegexp@4.1.2: {} + + lodash.groupby@4.6.0: {} + + lodash.isfunction@3.0.9: {} + + lodash.isnil@4.0.0: {} + + lodash.isundefined@3.0.1: {} + lodash.sortby@4.7.0: {} lodash.startcase@4.4.0: {} + lodash.uniq@4.5.0: {} + lodash@4.17.21: {} long@5.3.2: {} From e20932db1e0c53549aa1a35aecd5eb76be8564cc Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 6 Oct 2025 16:44:32 +0200 Subject: [PATCH 06/28] add documentation for csv convert --- .../src/commands/convert-csv-command.test.ts | 8 +- .../ensrainbow/concepts/creating-files.mdx | 593 ++++++++++++++++++ .../docs/ensrainbow/concepts/data-model.mdx | 11 +- .../docs/ensrainbow/contributing/index.mdx | 5 +- 4 files changed, 604 insertions(+), 13 deletions(-) create mode 100644 docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 795e53bdc..58c7af900 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -126,10 +126,6 @@ describe("convert-csv-command", () => { expect(outputStats.isFile()).toBe(true); expect(outputStats.size).toBeGreaterThan(0); - // Verify special characters were processed correctly by checking logs - // The conversion completed successfully, which means csv-simple-parser - // handled emojis, unicode, quoted fields with commas, etc. 
- // Ingest the converted file into database const cli = createCLI({ exitProcess: false }); await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); @@ -141,8 +137,8 @@ describe("convert-csv-command", () => { const labels = [ "🔥emoji-label🚀", 'special"quotes"inside', - "label with newline\n character", - "label-with-null\0byte", + "label with newline\n character", // new line + "label-with-null\0byte", // null byte ]; for (const label of labels) { expect( diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx new file mode 100644 index 000000000..f2c9c34cf --- /dev/null +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -0,0 +1,593 @@ +--- +title: Creating ENSRainbow Files +description: Complete guide to creating .ensrainbow files from SQL dumps and CSV data. +sidebar: + label: Creating Files + order: 3 +keywords: [ensrainbow, file creation, conversion, sql, csv] +--- + +ENSRainbow provides two methods for creating `.ensrainbow` files from different data sources. This guide helps you choose the right method and provides step-by-step instructions. + +## Prerequisites + +Before creating `.ensrainbow` files, ensure you have: + +1. **ENSNode repository cloned**: + ```bash + git clone https://github.com/namehash/ensnode.git + cd ensnode + ``` + +2. **Dependencies installed**: + ```bash + pnpm install + ``` + +3. **Working directory**: Navigate to the ENSRainbow directory: + ```bash + cd apps/ensrainbow + ``` + +All commands in this guide assume you're in the `apps/ensrainbow` directory unless otherwise specified. + +## Overview + +A `.ensrainbow` file is ENSRainbow's binary format for storing label-to-labelhash mappings. It uses Protocol Buffers for efficient serialization and supports streaming for large datasets. + +For detailed information about the file format structure, see the [Data Model](/ensrainbow/concepts/data-model) documentation. + +## Choosing Your Conversion Method + +| Method | Input Format | Use Case | Command | +|--------|-------------|----------|---------| +| **SQL Conversion** | Gzipped SQL dump (`ens_names.sql.gz`) | Converting legacy ENS Subgraph data | `pnpm run convert` | +| **CSV Conversion** | CSV file (1 or 2 columns) | Custom datasets, test data, external sources | `pnpm run convert-csv` | + +### When to Use SQL Conversion + +- Converting existing ENS Subgraph rainbow tables +- Working with legacy `ens_names.sql.gz` files +- Migrating from previous ENS data formats + +### When to Use CSV Conversion + +- Creating test datasets +- Converting data from external sources +- Working with custom label collections +- Building incremental label sets + +## Method 1: Converting from SQL Dumps + +The `convert` command processes gzipped SQL dump files from the ENS Subgraph. 
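+
+Before converting, you can sanity-check that the dump contains the expected `COPY` block of labelhash/label pairs (optional, but it catches a wrong or truncated download early). This is a minimal sketch; the exact table and column names inside your dump may differ:
+
+```bash
+# Peek at the start of the gzipped dump without extracting it
+zcat ens_names.sql.gz | head -n 20
+```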
+ +### Command Syntax + +```bash +pnpm run convert \ + --input-file \ + --output-file \ + --label-set-id \ + --label-set-version +``` + +### Required Parameters + +- `--input-file`: Path to the gzipped SQL dump file +- `--label-set-id`: Identifier for the label set (e.g., `subgraph`, `discovery-a`) +- `--label-set-version`: Version number for the label set (non-negative integer) + +### Optional Parameters + +- `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`) + +### Example: Converting ENS Subgraph Data + +```bash +# Convert main ENS Subgraph data +pnpm run convert \ + --input-file ens_names.sql.gz \ + --output-file subgraph_0.ensrainbow \ + --label-set-id subgraph \ + --label-set-version 0 +``` + +### Example: Converting Test Data + +```bash +# Convert ens-test-env data +pnpm run convert \ + --input-file test/fixtures/ens_test_env_names.sql.gz \ + --output-file ens-test-env_0.ensrainbow \ + --label-set-id ens-test-env \ + --label-set-version 0 +``` + +### How It Works + +1. **Streams** the gzipped SQL file to avoid memory issues +2. **Parses** SQL COPY statements to extract label/labelhash pairs +3. **Validates** each record and skips invalid entries +4. **Writes** protobuf messages with length-delimited encoding +5. **Creates** a header message followed by individual record messages + +## Method 2: Converting from CSV Files + +The `convert-csv` command processes CSV files with flexible column formats. + +### Command Syntax + +```bash +pnpm run convert-csv \ + --input-file \ + --output-file \ + --label-set-id \ + --label-set-version \ + [--progress-interval ] +``` + +### Required Parameters + +- `--input-file`: Path to the CSV file +- `--label-set-id`: Identifier for the label set +- `--label-set-version`: Version number for the label set + +### Optional Parameters + +- `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`) +- `--progress-interval`: Progress logging frequency (default: 10000 records) + +### CSV Format Support + +The CSV converter supports two formats: + +#### Single Column Format (Label Only) +```csv +ethereum +vitalik +ens +``` + +The converter automatically computes labelhashes using the `labelhash()` function. + +#### Two Column Format (Label + Labelhash) +```csv +ethereum,0x541111248b45b7a8dc3f5579f630e74cb01456ea6ac067d3f4d793245a255155 +vitalik,0xaf2caa1c2ca1d027f1ac823b529d0a67cd144264b2789fa2ea4d63a67c7103cc +ens,0x5cee339e13375638553bdf5a6e36ba80fb9f6a4f0783680884d92b558aa471da +``` + +The converter validates that provided labelhashes match the computed hash for each label. + +### Example: Creating Test Dataset + +```bash +# Create test dataset from CSV +pnpm run convert-csv \ + --input-file test-labels.csv \ + --output-file test-dataset_0.ensrainbow \ + --label-set-id test-dataset \ + --label-set-version 0 +``` + +### Example: Creating Discovery Dataset + +```bash +# Create discovery dataset (initially empty) +echo "" > empty.csv +pnpm run convert-csv \ + --input-file empty.csv \ + --output-file discovery-a_0.ensrainbow \ + --label-set-id discovery-a \ + --label-set-version 0 +``` + +### How It Works + +1. **Detects** CSV format automatically (1 or 2 columns) +2. **Streams** CSV parsing using fast-csv for memory efficiency +3. **Validates** column count and data format +4. **Computes** or validates labelhashes as needed +5. **Writes** protobuf messages with the same format as SQL conversion + +## Common Workflows + +### Workflow 1: Migrating from ENS Subgraph + +```bash +# 1. 
Convert SQL dump to .ensrainbow +pnpm run convert \ + --input-file ens_names.sql.gz \ + --output-file subgraph_0.ensrainbow \ + --label-set-id subgraph \ + --label-set-version 0 + +# 2. Ingest into LevelDB +pnpm run ingest-ensrainbow \ + --input-file subgraph_0.ensrainbow \ + --data-dir data-subgraph + +# 3. Validate the database +pnpm run validate --data-dir data-subgraph + +# 4. Start the API server +pnpm run serve --data-dir data-subgraph --port 3223 +``` + +### Workflow 2: Creating Test Environment + +```bash +# 1. Convert test data +pnpm run convert \ + --input-file test/fixtures/ens_test_env_names.sql.gz \ + --output-file ens-test-env_0.ensrainbow \ + --label-set-id ens-test-env \ + --label-set-version 0 + +# 2. Ingest test data +pnpm run ingest-ensrainbow \ + --input-file ens-test-env_0.ensrainbow \ + --data-dir data-test-env + +# 3. Run with test data +pnpm run serve --data-dir data-test-env --port 3223 +``` + +### Workflow 3: Building Custom Dataset + +```bash +# 1. Create CSV with your labels +echo "mylabel1 +mylabel2 +mylabel3" > custom-labels.csv + +# 2. Convert to .ensrainbow +pnpm run convert-csv \ + --input-file custom-labels.csv \ + --output-file custom_0.ensrainbow \ + --label-set-id custom \ + --label-set-version 0 + +# 3. Ingest and serve +pnpm run ingest-ensrainbow \ + --input-file custom_0.ensrainbow \ + --data-dir data-custom + +pnpm run serve --data-dir data-custom --port 3223 +``` + +### Workflow 4: Using Custom Label Set Server + +```bash +# 1. Configure custom label set server +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# 2. Download from custom server +# The script downloads to labelsets/ subdirectory +./scripts/download-ensrainbow-files.sh my-dataset 0 + +# 3. Ingest and serve +# Files are downloaded to labelsets/ by the script +pnpm run ingest-ensrainbow \ + --input-file labelsets/my-dataset_0.ensrainbow \ + --data-dir data-my-dataset + +pnpm run serve --data-dir data-my-dataset --port 3223 +``` + +:::note[Script Output Locations] +ENSRainbow download scripts save files to specific subdirectories: +- **`.ensrainbow` files**: `labelsets/` +- **Database archives**: `databases/{schema_version}/` +- **Checksums and licenses**: Same directory as the downloaded file +::: + +## File Naming Conventions + +Follow the naming convention: `{label-set-id}_{label-set-version}.ensrainbow` + +**Examples:** +- `subgraph_0.ensrainbow` - Main ENS data, version 0 +- `subgraph_1.ensrainbow` - Main ENS data, version 1 (incremental update) +- `discovery-a_0.ensrainbow` - Discovery dataset, version 0 +- `ens-test-env_0.ensrainbow` - Test environment data, version 0 + +## Next Steps + +After creating your `.ensrainbow` file: + +1. **[Ingest the data](/ensrainbow/contributing/index#data-ingestion-ingest-ensrainbow)** into a ENSRainbow database +2. **[Validate the database](/ensrainbow/contributing/index#database-validation-validate)** to ensure integrity +3. **[Start the API server](/ensrainbow/contributing/index#api-server-serve)** to serve the data + +For complete CLI reference information, see the [CLI Reference](/ensrainbow/contributing/cli-reference) documentation. + +## Creating and Publishing Custom .ensrainbow Files + +If you want to create, publish, and distribute your own `.ensrainbow` files, follow these steps: + +### 1. 
Create Your Dataset
+
+First, prepare your data in either SQL or CSV (recommended) format, then convert it using the appropriate method:
+
+```bash
+# For CSV data
+pnpm run convert-csv \
+  --input-file my-labels.csv \
+  --output-file my-dataset_0.ensrainbow \
+  --label-set-id my-dataset \
+  --label-set-version 0
+
+# For SQL data
+pnpm run convert \
+  --input-file my-data.sql.gz \
+  --output-file my-dataset_0.ensrainbow \
+  --label-set-id my-dataset \
+  --label-set-version 0
+```
+
+### 2. Validate Your File
+
+Test your `.ensrainbow` file by ingesting it locally:
+
+```bash
+# Ingest your custom dataset
+pnpm run ingest-ensrainbow \
+  --input-file my-dataset_0.ensrainbow \
+  --data-dir data-my-dataset
+
+# Validate the database
+pnpm run validate --data-dir data-my-dataset
+
+# Test the API
+pnpm run serve --data-dir data-my-dataset --port 3223
+```
+
+### 3. Publish Your File
+
+#### Option A: Direct File Sharing
+- Upload your `.ensrainbow` file to a web server or cloud storage
+- Provide a direct download URL
+- Share checksums for integrity verification
+
+#### Option B: Package as Database Archive
+For better performance, package your data as a pre-built database:
+
+```bash
+# Ingest your .ensrainbow file
+pnpm run ingest-ensrainbow \
+  --input-file my-dataset_0.ensrainbow \
+  --data-dir data-my-dataset
+
+# Package the database
+tar -czvf my-dataset_0.tgz ./data-my-dataset
+
+# Calculate checksum
+sha256sum my-dataset_0.tgz > my-dataset_0.tgz.sha256sum
+```
+
+### 4. Document Your Label Set
+
+Create documentation for your custom label set including:
+
+- **Label Set ID**: The identifier users will specify
+- **Description**: What labels are included and their source
+- **Version**: Current version number
+- **Download URLs**: Where to get the files
+- **Checksums**: For integrity verification
+- **Usage Examples**: How to use your dataset
+
+### Example Documentation Format
+
+````markdown
+## Custom Label Set: my-dataset
+
+**Label Set ID**: `my-dataset`
+**Current Version**: `0`
+**Description**: Custom ENS labels from [source description]
+
+### Download
+- Database Archive: `https://example.com/my-dataset_0.tgz`
+- Checksum: `https://example.com/my-dataset_0.tgz.sha256sum`
+
+### Usage
+```bash
+# Using with Docker
+docker run -d \
+  -e DB_SCHEMA_VERSION="3" \
+  -e LABEL_SET_ID="my-dataset" \
+  -e LABEL_SET_VERSION="0" \
+  -p 3223:3223 \
+  ghcr.io/namehash/ensnode/ensrainbow:latest
+```
+````
+
+## Setting Up Your Own Label Set Server
+
+A **Label Set Server** is a storage and hosting service for `.ensrainbow` files and prebuilt database archives. It's not the ENSRainbow API server itself, but rather a way to distribute your custom datasets for others to download and use.
+
+### 1. Choose Your Hosting Platform
+
+You can host your label set files on any web server or cloud storage service:
+
+- **AWS S3**: Industry standard with versioning
+- **Cloudflare R2**: Cost-effective alternative to S3
+- **Simple HTTP server**: For internal/private use
+
+### 2. Organize Your Files
+
+Structure your label set files following ENSRainbow conventions:
+
+```
+my-label-set-server/
+├── labelsets/
+│   ├── my-dataset_0.ensrainbow
+│   ├── my-dataset_0.ensrainbow.sha256sum
+│   ├── my-dataset_1.ensrainbow
+│   └── my-dataset_1.ensrainbow.sha256sum
+└── databases/
+    ├── 3/                          # Schema version
+    │   ├── my-dataset_0.tgz
+    │   ├── my-dataset_0.tgz.sha256sum
+    │   ├── my-dataset_1.tgz
+    │   └── my-dataset_1.tgz.sha256sum
+    └── 4/                          # Future schema version
+```
+
+### 3.
Use Existing Download Scripts + +ENSRainbow provides ready-to-use download scripts that users can configure to download from your label set server: + +#### Download .ensrainbow Files +```bash +# Configure your label set server URL +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# Download .ensrainbow file using the existing script +./scripts/download-ensrainbow-files.sh my-dataset 0 +``` + +#### Download Prebuilt Database Archives +```bash +# Configure your label set server URL +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# Download prebuilt database using the existing script +./scripts/download-prebuilt-database.sh 3 my-dataset 0 +``` + +#### Script Features +The existing scripts automatically handle: +- **Checksum verification** for data integrity +- **Resume downloads** if files already exist and are valid +- **License file downloads** (optional) +- **Progress reporting** for large files +- **Error handling** with cleanup of partial downloads + +### 4. Document Your Label Set Server + +Create a README or documentation page for your label set server: + +```markdown +# My Label Set Server + +This server hosts custom ENS label sets for ENSRainbow. + +## Available Label Sets + +### my-dataset +- **Description**: Custom ENS labels from [source] +- **Versions**: 0, 1 +- **Schema Versions**: 3 +- **Base URL**: `https://my-label-set-server.com` + +### another-dataset +- **Description**: Additional labels from [source] +- **Versions**: 0 +- **Schema Versions**: 3 +- **Base URL**: `https://my-label-set-server.com` +``` + +## Usage + +Users should have the ENSNode repository cloned and be in the `apps/ensrainbow` directory. + +### Option 1: Download .ensrainbow Files + +```bash +# Configure your label set server +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# Download .ensrainbow file +./scripts/download-ensrainbow-files.sh my-dataset 0 + +# Ingest into ENSRainbow +pnpm run ingest-ensrainbow \ + --input-file labelsets/my-dataset_0.ensrainbow \ + --data-dir data-my-dataset + +# Start ENSRainbow server +pnpm run serve --data-dir data-my-dataset --port 3223 +``` + +### Option 2: Download Prebuilt Databases (Faster) + +```bash +# Configure your label set server +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# Download prebuilt database +./scripts/download-prebuilt-database.sh 3 my-dataset 0 + +# Extract database +tar -xzf databases/3/my-dataset_0.tgz -C data-my-dataset --strip-components=1 + +# Start ENSRainbow server +pnpm run serve --data-dir data-my-dataset --port 3223 +``` + +### 5. 
Version Management + +Implement proper versioning for your label sets: + +```bash +# When releasing a new version +LABEL_SET_ID="my-dataset" +NEW_VERSION="1" + +# Create new .ensrainbow file +pnpm run convert-csv \ + --input-file updated-labels.csv \ + --output-file ${LABEL_SET_ID}_${NEW_VERSION}.ensrainbow \ + --label-set-id ${LABEL_SET_ID} \ + --label-set-version ${NEW_VERSION} + +# Create prebuilt database +pnpm run ingest-ensrainbow \ + --input-file ${LABEL_SET_ID}_${NEW_VERSION}.ensrainbow \ + --data-dir data-${LABEL_SET_ID}-${NEW_VERSION} + +tar -czvf ${LABEL_SET_ID}_${NEW_VERSION}.tgz ./data-${LABEL_SET_ID}-${NEW_VERSION} + +# Calculate checksums +sha256sum ${LABEL_SET_ID}_${NEW_VERSION}.ensrainbow > ${LABEL_SET_ID}_${NEW_VERSION}.ensrainbow.sha256sum +sha256sum ${LABEL_SET_ID}_${NEW_VERSION}.tgz > ${LABEL_SET_ID}_${NEW_VERSION}.tgz.sha256sum + +# Upload to your label set server +# (implementation depends on your hosting platform) +``` + +### 6. Testing Your Label Set Server + +Before publishing, test that your label set server works correctly: + +```bash +# Set your test server URL +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# Test downloading .ensrainbow file +./scripts/download-ensrainbow-files.sh my-dataset 0 + +# Verify checksum was validated +# The script will fail if checksums don't match + +# Test downloading prebuilt database +./scripts/download-prebuilt-database.sh 3 my-dataset 0 + +# Verify the database works +pnpm run ingest-ensrainbow \ + --input-file labelsets/my-dataset_0.ensrainbow \ + --data-dir test-data + +pnpm run validate --data-dir test-data +``` + +## Running Your Own ENSRainbow Server + +If you want to run your own ENSRainbow API server (separate from the label set server), see the [Local Development](/ensrainbow/contributing/local-development) guide for instructions on setting up and running ENSRainbow locally or in production. 
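+
+Once a server is running, a quick smoke test is to heal a labelhash you know is in your label set. A sketch using the labelhash of `vitalik` from earlier in this guide (adjust host and port to your deployment; endpoint path per the ENSRainbow API):
+
+```bash
+curl "http://localhost:3223/v1/heal/0xaf2caa1c2ca1d027f1ac823b529d0a67cd144264b2789fa2ea4d63a67c7103cc"
+```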
+ +## Related Documentation + +- **[Data Model](/ensrainbow/concepts/data-model)** - Understanding the `.ensrainbow` file format +- **[Label Sets & Versioning](/ensrainbow/concepts/label-sets-and-versioning)** - Managing label set versions +- **[CLI Reference](/ensrainbow/contributing/cli-reference)** - Complete command documentation +- **[Local Development](/ensrainbow/contributing/local-development)** - Setting up your development environment diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx index 8978ca5a9..e1df686d0 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx @@ -104,15 +104,14 @@ subgraph_0.ensrainbow # labelSetId = "subgraph", version = 0 subgraph_1.ensrainbow # next version with incremental labelhash-to-label mappings added ``` -## Converting Legacy SQL Data +## Creating ENSRainbow Files -If you have a legacy gzipped rainbow table (`ens_names.sql.gz`) from the ENS Subgraph, you can convert it to the `.ensrainbow` format: +ENSRainbow provides two methods for creating `.ensrainbow` files from different data sources: -```bash title="Convert legacy SQL data" -pnpm run convert --input-file path/to/ens_names.sql.gz --output-file subgraph-0.ensrainbow -``` +- **SQL Conversion**: Convert legacy ENS Subgraph data (`ens_names.sql.gz`) using `pnpm run convert` +- **CSV Conversion**: Convert custom datasets from CSV files using `pnpm run convert-csv` -This conversion process allows you to migrate existing rainbow table data that was previously stored in SQL format to ENSRainbow's optimized binary format. The resulting `.ensrainbow` file will be equivalent to the rainbow tables used by the ENS Subgraph, maintaining the same label-to-labelhash mappings while providing better performance and storage efficiency. +For complete instructions, examples, and workflow guidance, see the [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) guide. ## Ingestion Process diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx index 64556f1eb..401a0f986 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx @@ -17,6 +17,7 @@ This guide covers running ENSRainbow locally for development and contributions. 
For focused guidance on specific topics, check out these dedicated pages: + @@ -24,6 +25,7 @@ For focused guidance on specific topics, check out these dedicated pages: :::tip[Choose Your Path] - **New to the project?** Start with [Local Development](/ensrainbow/contributing/local-development) +- **Creating custom datasets?** See [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) - **Need CLI help?** Check the [CLI Reference](/ensrainbow/contributing/cli-reference) - **Building for production?** See [Building Docker Images](/ensrainbow/contributing/building) ::: @@ -41,6 +43,7 @@ Follow these steps to start contributing to ENSRainbow: ## Quick Reference - **Need to build from source?** → [Building Docker Images](/ensrainbow/contributing/building) +- **Creating custom datasets?** → [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) - **Looking for CLI commands?** → [CLI Reference](/ensrainbow/contributing/cli-reference) - **Running into issues?** → [Troubleshooting](/ensrainbow/usage/troubleshooting) - **Want to understand the data flow?** → [Data Model](/ensrainbow/concepts/data-model) @@ -265,7 +268,7 @@ These steps are typically performed by project maintainers for releasing officia ### 1. Prepare `.ensrainbow` Files -This section covers the conversion of source data (like SQL dumps or empty files for initial datasets) into the `.ensrainbow` format. The `time` command is used here to measure the duration of potentially long-running conversion processes. +This section covers the conversion of source data (like SQL dumps or empty files for initial datasets) into the `.ensrainbow` format. For detailed conversion instructions and examples, see the [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) guide. **For the `subgraph` Label Set (main dataset):** This command converts a SQL dump file (`ens_names.sql.gz`) into an `.ensrainbow` file for version 0 of the `subgraph` Label Set. From b9c31b08422a1b71100bcec7ac2940a11bb5e35b Mon Sep 17 00:00:00 2001 From: djstrong Date: Fri, 17 Oct 2025 22:45:34 +0200 Subject: [PATCH 07/28] feat: add filtering capabilities to CSV conversion - Introduced `--existing-db-path` option to filter out existing labels from an ENSRainbow database during CSV conversion. - Enhanced conversion process to skip duplicate labels within the same CSV file. - Updated logging to include statistics on filtered labels. - Added comprehensive tests for filtering functionality and updated documentation to reflect new features. 
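
Example invocation (paths are illustrative):

  pnpm run convert-csv --input-file new-labels.csv \
    --output-file my-dataset_1.ensrainbow \
    --label-set-id my-dataset --label-set-version 1 \
    --existing-db-path data-my-dataset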
--- apps/ensrainbow/src/cli.ts | 6 + .../src/commands/convert-csv-command.test.ts | 189 ++++++++++++++++++ .../src/commands/convert-csv-command.ts | 111 +++++++++- .../ensrainbow/concepts/creating-files.mdx | 86 +++++++- 4 files changed, 379 insertions(+), 13 deletions(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 940692729..d9d38c4f9 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -68,6 +68,7 @@ interface ConvertCsvArgs { "label-set-id": LabelSetId; "label-set-version": LabelSetVersion; "progress-interval"?: number; + "existing-db-path"?: string; } export interface CLIOptions { @@ -259,6 +260,10 @@ export function createCLI(options: CLIOptions = {}) { type: "number", description: "Number of records to process before logging progress", default: 10000, + }) + .option("existing-db-path", { + type: "string", + description: "Path to existing ENSRainbow database to filter out existing labels", }); }, async (argv: ArgumentsCamelCase) => { @@ -268,6 +273,7 @@ export function createCLI(options: CLIOptions = {}) { labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], progressInterval: argv["progress-interval"], + existingDbPath: argv["existing-db-path"], }); }, ) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 58c7af900..9e2569ab2 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -216,6 +216,195 @@ describe("convert-csv-command", () => { }); }); + describe("Filtering functionality", () => { + it("should filter out labels that already exist in the database", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); + const outputFile = join(tempDir, "output_filtered.ensrainbow"); + const dataDir = join(tempDir, "db_filtered"); + + // First, create an initial database with some labels + const initialOutputFile = join(tempDir, "initial.ensrainbow"); + await convertCsvCommand({ + inputFile, + outputFile: initialOutputFile, + labelSetId: "test-filtering" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + // Ingest the initial file + const cli = createCLI({ exitProcess: false }); + await cli.parse([ + "ingest-ensrainbow", + "--input-file", + initialOutputFile, + "--data-dir", + dataDir, + ]); + + // Verify initial database + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + const initialCount = await db.getPrecalculatedRainbowRecordCount(); + expect(initialCount).toBe(11); + await db.close(); + + // Now convert the same CSV file again, but with filtering enabled + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-filtering" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, // Use same version as initial + existingDbPath: dataDir, + }); + + // Verify the filtered output file was created + const outputStats = await stat(outputFile); + expect(outputStats.isFile()).toBe(true); + + // The filtered file should be smaller than the original since it excludes existing labels + const initialStats = await stat(initialOutputFile); + expect(outputStats.size).toBeLessThan(initialStats.size); + + // Verify that the filtered file contains fewer records + const filteredDataDir = join(tempDir, "db_filtered_result"); + await cli.parse([ + "ingest-ensrainbow", + "--input-file", + outputFile, + "--data-dir", + filteredDataDir, + ]); + + const filteredDb = await 
ENSRainbowDB.open(filteredDataDir); + expect(await filteredDb.validate()).toBe(true); + const filteredCount = await filteredDb.getPrecalculatedRainbowRecordCount(); + expect(filteredCount).toBe(0); // All labels should be filtered out since they already exist + await filteredDb.close(); + }); + + it("should filter out duplicate labels within the same conversion", async () => { + // Create a CSV file with duplicate labels + const csvContent = "label1\nlabel2\nlabel1\nlabel3\nlabel2\nlabel4"; + const inputFile = join(tempDir, "duplicates.csv"); + await writeFile(inputFile, csvContent); + + const outputFile = join(tempDir, "output_no_duplicates.ensrainbow"); + + // Convert CSV with duplicate filtering + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-duplicates" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + // Verify the output file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Ingest and verify only unique labels were processed + const dataDir = join(tempDir, "db_no_duplicates"); + const cli = createCLI({ exitProcess: false }); + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + + // Should have 4 unique labels (label1, label2, label3, label4) + const recordsCount = await db.getPrecalculatedRainbowRecordCount(); + expect(recordsCount).toBe(4); + + // Verify specific labels exist + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("label1"))))?.label, + ).toBe("label1"); + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("label2"))))?.label, + ).toBe("label2"); + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("label3"))))?.label, + ).toBe("label3"); + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("label4"))))?.label, + ).toBe("label4"); + + await db.close(); + }); + + it("should handle non-existent database path gracefully", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); + const outputFile = join(tempDir, "output_no_db.ensrainbow"); + const nonExistentDbPath = join(tempDir, "non-existent-db"); + + // Should not throw error even with non-existent database path + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-no-db" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + existingDbPath: nonExistentDbPath, + }), + ).resolves.not.toThrow(); + + // Verify the output file was still created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + }); + + it("should work through CLI with existing database path", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); + const outputFile = join(tempDir, "cli_output_with_db.ensrainbow"); + const dataDir = join(tempDir, "cli_db_with_filtering"); + + // First create a database + const initialOutputFile = join(tempDir, "initial_cli.ensrainbow"); + const cli = createCLI({ exitProcess: false }); + + await cli.parse([ + "convert-csv", + "--input-file", + inputFile, + "--output-file", + initialOutputFile, + "--label-set-id", + "test-cli-filtering", + "--label-set-version", + "0", + ]); + + await cli.parse([ + "ingest-ensrainbow", + "--input-file", + initialOutputFile, + "--data-dir", + dataDir, + ]); + + // Now test CLI with existing 
database path + await cli.parse([ + "convert-csv", + "--input-file", + inputFile, + "--output-file", + outputFile, + "--label-set-id", + "test-cli-filtering", + "--label-set-version", + "1", + "--existing-db-path", + dataDir, + ]); + + // Verify file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + }); + }); + describe("Streaming performance", () => { it("should handle small CSV files efficiently", async () => { const inputFile = join(tempDir, "small_test.csv"); diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 14ae2d4b3..34f64d935 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -9,6 +9,7 @@ import { createReadStream, createWriteStream } from "fs"; import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk"; import { parse } from "@fast-csv/parse"; import { labelhash } from "viem"; +import { ENSRainbowDB } from "../lib/database.js"; import { logger } from "../utils/logger.js"; import { CURRENT_ENSRAINBOW_FILE_FORMAT_VERSION, @@ -21,6 +22,7 @@ export interface ConvertCsvCommandOptions { labelSetId: string; labelSetVersion: number; progressInterval?: number; + existingDbPath?: string; // Path to existing ENSRainbow database to check for existing labels } // Configuration constants @@ -29,6 +31,8 @@ const DEFAULT_PROGRESS_INTERVAL = 10000; interface ConversionStats { totalLines: number; processedRecords: number; + filteredExistingLabels: number; + filteredDuplicates: number; startTime: Date; endTime?: Date; } @@ -74,19 +78,47 @@ function logSummary(stats: ConversionStats) { logger.info("=== Conversion Summary ==="); logger.info(`Total lines processed: ${stats.totalLines}`); logger.info(`Valid records: ${stats.processedRecords}`); + logger.info(`Filtered existing labels: ${stats.filteredExistingLabels}`); + logger.info(`Filtered duplicates: ${stats.filteredDuplicates}`); logger.info(`Duration: ${duration}ms`); } +/** + * Check if a labelhash exists in the ENSRainbow database + */ +async function checkLabelHashExists(db: ENSRainbowDB, labelHashBytes: Buffer): Promise { + try { + const record = await db.getVersionedRainbowRecord(labelHashBytes); + return record !== null; + } catch (error) { + // If there's an error checking, assume it doesn't exist + return false; + } +} + /** * Initialize conversion setup and logging */ -function initializeConversion(options: ConvertCsvCommandOptions) { +async function initializeConversion(options: ConvertCsvCommandOptions) { logger.info("Starting conversion from CSV to protobuf format..."); logger.info(`Input file: ${options.inputFile}`); logger.info(`Output file: ${options.outputFile}`); logger.info(`Label set id: ${options.labelSetId}`); logger.info(`Label set version: ${options.labelSetVersion}`); + // Open existing database if path is provided + let existingDb: ENSRainbowDB | null = null; + if (options.existingDbPath) { + try { + logger.info(`Opening existing database for filtering: ${options.existingDbPath}`); + existingDb = await ENSRainbowDB.open(options.existingDbPath); + logger.info("Successfully opened existing database for label filtering"); + } catch (error) { + logger.warn(`Failed to open existing database at ${options.existingDbPath}: ${error}`); + logger.warn("Proceeding without filtering existing labels"); + } + } + const { RainbowRecordType, RainbowRecordCollectionType } = 
createRainbowProtobufRoot(); const outputStream = setupWriteStream(options.outputFile); @@ -99,7 +131,7 @@ function initializeConversion(options: ConvertCsvCommandOptions) { logger.info("Reading and processing CSV file line by line with streaming..."); - return { RainbowRecordType, outputStream }; + return { RainbowRecordType, outputStream, existingDb }; } /** @@ -131,13 +163,16 @@ function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string /** * Process a single CSV record */ -function processRecord( +async function processRecord( row: string[], expectedColumns: number, RainbowRecordType: any, outputStream: NodeJS.WritableStream, lineNumber: number, -): void { + existingDb: ENSRainbowDB | null, + writtenLabels: Set, + stats: ConversionStats, +): Promise { // Validate column count if (row.length !== expectedColumns) { throw new Error( @@ -146,10 +181,32 @@ function processRecord( } const rainbowRecord = createRainbowRecord(row); + const label = rainbowRecord.label; + const labelHashBytes = rainbowRecord.labelhash; + + // Check if labelhash already exists in the database + if (existingDb) { + const existsInDb = await checkLabelHashExists(existingDb, labelHashBytes); + if (existsInDb) { + stats.filteredExistingLabels++; + return false; // Skip this record + } + } + + // Check if label is a duplicate within this conversion + if (writtenLabels.has(label)) { + stats.filteredDuplicates++; + return false; // Skip this record + } + + // Add label to written set to track duplicates + writtenLabels.add(label); // Create protobuf message and write immediately const recordMessage = RainbowRecordType.fromObject(rainbowRecord); outputStream.write(Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish())); + + return true; // Record was processed } /** @@ -160,16 +217,19 @@ async function processCSVFile( RainbowRecordType: any, outputStream: NodeJS.WritableStream, progressInterval: number, + existingDb: ENSRainbowDB | null, + stats: ConversionStats, ): Promise<{ totalLines: number; processedRecords: number }> { return new Promise((resolve, reject) => { let expectedColumns: number | null = null; let lineNumber = 0; let processedRecords = 0; + const writtenLabels = new Set(); // Track labels written in this conversion const fileStream = createReadStream(inputFile, { encoding: "utf8" }); const csvStream = parse() - .on("data", (row: string[]) => { + .on("data", async (row: string[]) => { lineNumber++; try { @@ -179,12 +239,26 @@ async function processCSVFile( logger.info(`Detected ${expectedColumns} columns using fast-csv`); } - processRecord(row, expectedColumns, RainbowRecordType, outputStream, lineNumber); - processedRecords++; + const wasProcessed = await processRecord( + row, + expectedColumns, + RainbowRecordType, + outputStream, + lineNumber, + existingDb, + writtenLabels, + stats, + ); + + if (wasProcessed) { + processedRecords++; + } // Log progress for large files - if (processedRecords % progressInterval === 0) { - logger.info(`Processed ${processedRecords} records so far...`); + if (lineNumber % progressInterval === 0) { + logger.info( + `Processed ${lineNumber} lines, written ${processedRecords} records so far...`, + ); } } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error); @@ -219,11 +293,16 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom const stats: ConversionStats = { totalLines: 0, processedRecords: 0, + filteredExistingLabels: 0, + filteredDuplicates: 0, startTime: new Date(), }; + let existingDb: ENSRainbowDB | null = null; + try { - const { RainbowRecordType, outputStream } = initializeConversion(options); + const { RainbowRecordType, outputStream, existingDb: db } = await initializeConversion(options); + existingDb = db; const progressInterval = options.progressInterval ?? DEFAULT_PROGRESS_INTERVAL; @@ -233,6 +312,8 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom RainbowRecordType, outputStream, progressInterval, + existingDb, + stats, ); stats.totalLines = totalLines; @@ -248,5 +329,15 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom const errorMessage = error instanceof Error ? error.message : String(error); logger.error("❌ CSV conversion failed:", errorMessage); throw error; + } finally { + // Clean up database connection + if (existingDb) { + try { + await existingDb.close(); + logger.info("Closed existing database connection"); + } catch (error) { + logger.warn(`Failed to close existing database: ${error}`); + } + } } } diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index f2c9c34cf..125e9916a 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -123,7 +123,8 @@ pnpm run convert-csv \ --output-file \ --label-set-id \ --label-set-version \ - [--progress-interval ] + [--progress-interval ] \ + [--existing-db-path ] ``` ### Required Parameters @@ -136,6 +137,7 @@ pnpm run convert-csv \ - `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`) - `--progress-interval`: Progress logging frequency (default: 10000 records) +- `--existing-db-path`: Path to existing ENSRainbow database to filter out existing labels ### CSV Format Support @@ -159,6 +161,42 @@ ens,0x5cee339e13375638553bdf5a6e36ba80fb9f6a4f0783680884d92b558aa471da The converter validates that provided labelhashes match the computed hash for each label. +### Label Filtering + +The CSV converter includes built-in filtering capabilities to prevent duplicate labels: + +#### Filtering Existing Labels +Use `--existing-db-path` to filter out labels that already exist in an existing ENSRainbow database: + +```bash +pnpm run convert-csv \ + --input-file new-labels.csv \ + --output-file incremental_1.ensrainbow \ + --label-set-id my-dataset \ + --label-set-version 1 \ + --existing-db-path data-my-dataset +``` + +This will: +- Check each label against the existing database +- Skip labels that already exist (avoiding duplicates) +- Only write new labels to the output file +- Log filtering statistics in the conversion summary + +#### Filtering Duplicate Labels Within CSV +The converter automatically filters duplicate labels within the same CSV file, keeping only the first occurrence of each label. + +#### Filtering Statistics +The conversion process logs detailed statistics: +``` +=== Conversion Summary === +Total lines processed: 1000 +Valid records: 850 +Filtered existing labels: 100 +Filtered duplicates: 50 +Duration: 150ms +``` + ### Example: Creating Test Dataset ```bash @@ -188,7 +226,9 @@ pnpm run convert-csv \ 2. 
**Streams** CSV parsing using fast-csv for memory efficiency 3. **Validates** column count and data format 4. **Computes** or validates labelhashes as needed -5. **Writes** protobuf messages with the same format as SQL conversion +5. **Filters** existing labels if `--existing-db-path` is provided +6. **Filters** duplicate labels within the same CSV file +7. **Writes** protobuf messages with the same format as SQL conversion ## Common Workflows @@ -256,7 +296,39 @@ pnpm run ingest-ensrainbow \ pnpm run serve --data-dir data-custom --port 3223 ``` -### Workflow 4: Using Custom Label Set Server +### Workflow 4: Creating Incremental Updates + +```bash +# 1. Create initial dataset +pnpm run convert-csv \ + --input-file initial-labels.csv \ + --output-file my-dataset_0.ensrainbow \ + --label-set-id my-dataset \ + --label-set-version 0 + +# 2. Ingest initial data +pnpm run ingest-ensrainbow \ + --input-file my-dataset_0.ensrainbow \ + --data-dir data-my-dataset + +# 3. Create incremental update (filtering existing labels) +pnpm run convert-csv \ + --input-file new-labels.csv \ + --output-file my-dataset_1.ensrainbow \ + --label-set-id my-dataset \ + --label-set-version 1 \ + --existing-db-path data-my-dataset + +# 4. Ingest incremental update +pnpm run ingest-ensrainbow \ + --input-file my-dataset_1.ensrainbow \ + --data-dir data-my-dataset + +# 5. Serve updated data +pnpm run serve --data-dir data-my-dataset --port 3223 +``` + +### Workflow 5: Using Custom Label Set Server ```bash # 1. Configure custom label set server @@ -318,6 +390,14 @@ pnpm run convert-csv \ --label-set-id my-dataset \ --label-set-version 0 +# For CSV data with filtering (if you have an existing database) +pnpm run convert-csv \ + --input-file my-labels.csv \ + --output-file my-dataset_1.ensrainbow \ + --label-set-id my-dataset \ + --label-set-version 1 \ + --existing-db-path data-my-dataset + # For SQL data pnpm run convert \ --input-file my-data.sql.gz \ From e2b9255224621dac9208bb6c6f2ca00b6fbaf75c Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 24 Nov 2025 13:26:01 +0100 Subject: [PATCH 08/28] feat: enhance CSV conversion with Bloom filter and deduplication options - Added new command-line options for CSV conversion: `--silent`, `--disable-dedup`, `--cache-size`, `--use-bloom-filter`, and `--bloom-filter-size`. - Implemented a deduplication database using ClassicLevel with optional Bloom filter for faster processing. - Updated the conversion process to support deduplication and improved memory management. - Enhanced logging for large file processing and added tests for new deduplication features. 
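
Example invocation (values are illustrative):

  pnpm run convert-csv --input-file labels.csv \
    --output-file labels_0.ensrainbow \
    --label-set-id my-labels --label-set-version 0 \
    --use-bloom-filter --bloom-filter-size 50000000 --cache-size 10000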
--- apps/ensrainbow/package.json | 4 +- apps/ensrainbow/src/cli.ts | 47 ++- .../src/commands/convert-csv-command.test.ts | 39 +- .../src/commands/convert-csv-command.ts | 342 +++++++++++++++--- pnpm-lock.yaml | 51 +++ 5 files changed, 427 insertions(+), 56 deletions(-) diff --git a/apps/ensrainbow/package.json b/apps/ensrainbow/package.json index 046cb2e2e..341e0d440 100644 --- a/apps/ensrainbow/package.json +++ b/apps/ensrainbow/package.json @@ -19,7 +19,8 @@ "validate:lite": "tsx src/cli.ts validate --lite", "purge": "tsx src/cli.ts purge", "convert": "tsx src/cli.ts convert", - "test": "vitest", + "convert-csv": "NODE_OPTIONS='--expose-gc --max-old-space-size=4096' tsx src/cli.ts convert-csv", + "test": "NODE_OPTIONS='--max-old-space-size=8192' vitest", "test:coverage": "vitest --coverage", "lint": "biome check --write .", "lint:ci": "biome ci", @@ -32,6 +33,7 @@ "@ensnode/ensrainbow-sdk": "workspace:*", "@ensnode/ensnode-sdk": "workspace:*", "@hono/node-server": "^1.4.1", + "bloom-filters": "^3.0.4", "classic-level": "^1.4.1", "hono": "catalog:", "pino": "catalog:", diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index d9d38c4f9..6e6bb4f32 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -69,6 +69,11 @@ interface ConvertCsvArgs { "label-set-version": LabelSetVersion; "progress-interval"?: number; "existing-db-path"?: string; + "silent"?: boolean; + "disable-dedup"?: boolean; + "cache-size"?: number; + "use-bloom-filter"?: boolean; + "bloom-filter-size"?: number; } export interface CLIOptions { @@ -261,10 +266,35 @@ export function createCLI(options: CLIOptions = {}) { description: "Number of records to process before logging progress", default: 10000, }) - .option("existing-db-path", { - type: "string", - description: "Path to existing ENSRainbow database to filter out existing labels", - }); + .option("existing-db-path", { + type: "string", + description: "Path to existing ENSRainbow database to filter out existing labels", + }) + .option("silent", { + type: "boolean", + description: "Disable progress bar (useful for scripts)", + default: false, + }) + .option("disable-dedup", { + type: "boolean", + description: "Disable deduplication within CSV file (faster but may create duplicates)", + default: false, + }) + .option("cache-size", { + type: "number", + description: "Cache size for deduplication (default: 5000)", + default: 5000, + }) + .option("use-bloom-filter", { + type: "boolean", + description: "Use Bloom filter for faster deduplication (default: false)", + default: false, + }) + .option("bloom-filter-size", { + type: "number", + description: "Expected number of items for Bloom filter (default: 10000000)", + default: 10000000, + }); }, async (argv: ArgumentsCamelCase) => { await convertCsvCommand({ @@ -272,8 +302,13 @@ export function createCLI(options: CLIOptions = {}) { outputFile: argv["output-file"], labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], - progressInterval: argv["progress-interval"], - existingDbPath: argv["existing-db-path"], + progressInterval: argv["progress-interval"], + existingDbPath: argv["existing-db-path"], + silent: argv["silent"], + noDedup: argv["disable-dedup"], + cacheSize: argv["cache-size"], + useBloomFilter: argv["use-bloom-filter"], + bloomFilterSize: argv["bloom-filter-size"], }); }, ) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 9e2569ab2..c6ddadb03 100644 --- 
a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -38,6 +38,7 @@ describe("convert-csv-command", () => { outputFile, labelSetId: "test-csv-one-col" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); // Verify the output file was created @@ -71,6 +72,7 @@ describe("convert-csv-command", () => { outputFile, labelSetId: "test-csv-two-col" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); // Verify the output file was created @@ -119,6 +121,7 @@ describe("convert-csv-command", () => { outputFile, labelSetId: "test-csv-special" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); // Verify output file was created @@ -229,6 +232,7 @@ describe("convert-csv-command", () => { outputFile: initialOutputFile, labelSetId: "test-filtering" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); // Ingest the initial file @@ -255,6 +259,7 @@ describe("convert-csv-command", () => { labelSetId: "test-filtering" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, // Use same version as initial existingDbPath: dataDir, + silent: true, }); // Verify the filtered output file was created @@ -296,6 +301,7 @@ describe("convert-csv-command", () => { outputFile, labelSetId: "test-duplicates" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); // Verify the output file was created @@ -400,10 +406,10 @@ describe("convert-csv-command", () => { // Verify file was created const stats = await stat(outputFile); - expect(stats.isFile()).toBe(true); - expect(stats.size).toBeGreaterThan(0); - }); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); }); +}); describe("Streaming performance", () => { it("should handle small CSV files efficiently", async () => { @@ -426,6 +432,7 @@ describe("convert-csv-command", () => { outputFile, labelSetId: "test-small" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); const conversionTime = Date.now() - startTime; @@ -453,5 +460,31 @@ describe("convert-csv-command", () => { const dbStats = await stat(dataDir); expect(dbStats.isDirectory()).toBe(true); }); + + it("should handle CSV files with many unique labels", async () => { + const inputFile = join(tempDir, "many_labels.csv"); + const outputFile = join(tempDir, "output_many_labels.ensrainbow"); + + // Create a CSV with 50,000 unique labels (tests deduplication with increased memory limit) + const records = []; + for (let i = 0; i < 50_000; i++) { + records.push(`label${i}`); + } + await writeFile(inputFile, records.join("\n")); + + // This should work without memory issues + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-many-labels" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + silent: true, + }); + + // Verify file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + }, 60000); // 60 second timeout for large file test }); }); diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 34f64d935..0e0c8ac0e 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -5,10 +5,15 @@ * Supports 1-column (label only) and 2-column (label,labelhash) formats */ -import { createReadStream, createWriteStream } from "fs"; +import { createReadStream, createWriteStream, 
statSync } from "fs";
+import { rmSync } from "fs";
+import { join } from "path";
 import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk";
 import { parse } from "@fast-csv/parse";
 import { labelhash } from "viem";
+import { ClassicLevel } from "classic-level";
+import ProgressBar from "progress";
+import bloomFilters from "bloom-filters";
 import { ENSRainbowDB } from "../lib/database.js";
 import { logger } from "../utils/logger.js";
 import {
@@ -16,6 +21,129 @@ import {
   createRainbowProtobufRoot,
 } from "../utils/protobuf-schema.js";
 
+/**
+ * Simple deduplication database using ClassicLevel directly.
+ * Tracks labels already written during a conversion without holding them all in RAM.
+ */
+class DeduplicationDB {
+  private pendingWrites: Map<string, string> = new Map();
+  private cache: Map<string, boolean> = new Map();
+  private cacheSize: number;
+  private bloomFilter: InstanceType<typeof bloomFilters.BloomFilter> | null = null;
+
+  constructor(
+    private db: ClassicLevel<string, string>,
+    cacheSize: number = 10000,
+    useBloomFilter: boolean = false,
+    expectedItems: number = 10000000,
+  ) {
+    this.cacheSize = cacheSize;
+
+    if (useBloomFilter) {
+      // Create Bloom filter with a 1% false-positive rate
+      this.bloomFilter = bloomFilters.BloomFilter.create(expectedItems, 0.01);
+      logger.info(
+        `Created Bloom filter for ${expectedItems} items (~${(this.bloomFilter.size / 8 / 1024 / 1024).toFixed(2)} MB)`,
+      );
+    }
+  }
+
+  async has(key: string): Promise<boolean> {
+    // Check cache first
+    if (this.cache.has(key)) {
+      return this.cache.get(key)!;
+    }
+
+    // Check pending writes
+    if (this.pendingWrites.has(key)) {
+      this.cache.set(key, true);
+      return true;
+    }
+
+    // Use Bloom filter if available
+    if (this.bloomFilter) {
+      // If the Bloom filter says "not present", we can skip the LevelDB check
+      if (!this.bloomFilter.has(key)) {
+        this.cache.set(key, false);
+        return false;
+      }
+      // Bloom filter says "maybe present" - need to check LevelDB
+    }
+
+    // Check database
+    try {
+      await this.db.get(key);
+      this.cache.set(key, true);
+      return true;
+    } catch (error) {
+      // A failed read (typically LEVEL_NOT_FOUND) is treated as "key absent"
+      this.cache.set(key, false);
+      return false;
+    }
+  }
+
+  async add(key: string, value: string): Promise<void> {
+    this.pendingWrites.set(key, value);
+    this.cache.set(key, true); // Cache the fact that this key exists
+
+    // Add to Bloom filter if available
+    if (this.bloomFilter) {
+      this.bloomFilter.add(key);
+    }
+
+    // Check cache size periodically (not on every add)
+    this.evictCacheIfNeeded();
+
+    // Flush to database periodically (smaller batch to reduce memory usage)
+    if (this.pendingWrites.size >= 5000) {
+      await this.flush();
+    }
+  }
+
+  private evictCacheIfNeeded(): void {
+    // Limit cache size - only evict when significantly exceeded
+    if (this.cache.size > this.cacheSize * 1.2) {
+      // Remove the oldest 20% of entries (Map iteration follows insertion order)
+      let toRemove = Math.floor(this.cacheSize * 0.2);
+      for (const key of this.cache.keys()) {
+        if (toRemove-- <= 0) break;
+        this.cache.delete(key);
+      }
+    }
+  }
+
+  async flush(): Promise<void> {
+    if (this.pendingWrites.size === 0) return;
+
+    const batch = this.db.batch();
+    for (const [key, value] of this.pendingWrites) {
+      batch.put(key, value);
+    }
+    await batch.write();
+    this.pendingWrites.clear();
+
+    // Hint to garbage collector after large batch (requires --expose-gc)
+    if (global.gc) {
+      global.gc();
+    }
+  }
+
+  async close(): Promise<void> {
+    await this.flush();
+    await this.db.close();
+  }
+}
+
+/**
+ * Sets up a simple progress bar that shows speed without total count.
+ */ +function setupProgressBar(): ProgressBar { + return new ProgressBar( + "Processing CSV [:bar] :current lines - :rate lines/sec", + { + complete: "=", + incomplete: " ", + width: 40, + total: 200000000, // Very large total for big files + }, + ); +} + +/** + * Options for CSV conversion command + */ export interface ConvertCsvCommandOptions { inputFile: string; outputFile: string; @@ -23,6 +151,11 @@ export interface ConvertCsvCommandOptions { labelSetVersion: number; progressInterval?: number; existingDbPath?: string; // Path to existing ENSRainbow database to check for existing labels + silent?: boolean; // Disable progress bar for tests + noDedup?: boolean; // Disable deduplication within CSV file + cacheSize?: number; // Cache size for deduplication (default: 10000) + useBloomFilter?: boolean; // Use Bloom filter for faster deduplication (default: false) + bloomFilterSize?: number; // Expected number of items for Bloom filter (default: 10000000) } // Configuration constants @@ -106,6 +239,20 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { logger.info(`Label set id: ${options.labelSetId}`); logger.info(`Label set version: ${options.labelSetVersion}`); + // Check file size and warn for very large files + try { + const stats = statSync(options.inputFile); + const fileSizeMB = (stats.size / (1024 * 1024)).toFixed(2); + logger.info(`Input file size: ${fileSizeMB} MB`); + + if (stats.size > 1024 * 1024 * 1024) { // > 1GB + logger.warn("⚠️ Processing a very large file. This may take significant time and memory."); + logger.warn("💡 Consider using --existing-db-path to filter out existing labels for better performance."); + } + } catch (error) { + logger.warn(`Could not determine file size: ${error}`); + } + // Open existing database if path is provided let existingDb: ENSRainbowDB | null = null; if (options.existingDbPath) { @@ -143,7 +290,6 @@ function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string if (row.length === 1) { // Single column: compute labelhash using labelhash function const labelHashBytes = labelHashToBytes(labelhash(label)); - console.log(label); return { labelhash: Buffer.from(labelHashBytes), label: label, @@ -161,7 +307,7 @@ function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string } /** - * Process a single CSV record + * Process a single CSV record with LevelDB-based deduplication */ async function processRecord( row: string[], @@ -170,7 +316,7 @@ async function processRecord( outputStream: NodeJS.WritableStream, lineNumber: number, existingDb: ENSRainbowDB | null, - writtenLabels: Set, + dedupDb: DeduplicationDB | null, stats: ConversionStats, ): Promise { // Validate column count @@ -184,7 +330,7 @@ async function processRecord( const label = rainbowRecord.label; const labelHashBytes = rainbowRecord.labelhash; - // Check if labelhash already exists in the database + // Check if labelhash already exists in the existing database if (existingDb) { const existsInDb = await checkLabelHashExists(existingDb, labelHashBytes); if (existsInDb) { @@ -193,14 +339,17 @@ async function processRecord( } } - // Check if label is a duplicate within this conversion - if (writtenLabels.has(label)) { - stats.filteredDuplicates++; - return false; // Skip this record - } + // Check if label is a duplicate within this conversion using LevelDB (if enabled) + if (dedupDb) { + const existsInDedupDb = await dedupDb.has(label); + if (existsInDedupDb) { + stats.filteredDuplicates++; + return false; // Skip this record + } 
- // Add label to written set to track duplicates - writtenLabels.add(label); + // Add label to deduplication database + await dedupDb.add(label, ""); + } // Create protobuf message and write immediately const recordMessage = RainbowRecordType.fromObject(rainbowRecord); @@ -218,49 +367,89 @@ async function processCSVFile( outputStream: NodeJS.WritableStream, progressInterval: number, existingDb: ENSRainbowDB | null, + dedupDb: DeduplicationDB | null, stats: ConversionStats, + progressBar: ProgressBar | null, ): Promise<{ totalLines: number; processedRecords: number }> { - return new Promise((resolve, reject) => { - let expectedColumns: number | null = null; - let lineNumber = 0; - let processedRecords = 0; - const writtenLabels = new Set(); // Track labels written in this conversion + let expectedColumns: number | null = null; + let lineNumber = 0; + let processedRecords = 0; + let lastLoggedLine = 0; // Track last logged line to avoid duplicate logs + const startTime = Date.now(); // Track start time for overall processing + let lastLogTime = Date.now(); // Track time of last log for chunk timing + + // LevelDB-based deduplication: Uses temporary database to avoid RAM limits - const fileStream = createReadStream(inputFile, { encoding: "utf8" }); + const fileStream = createReadStream(inputFile, { encoding: "utf8" }); + + return new Promise((resolve, reject) => { + let pendingCount = 0; + const MAX_PENDING = 100; // Smaller limit to reduce memory const csvStream = parse() - .on("data", async (row: string[]) => { + .on("data", (row: string[]) => { lineNumber++; - try { - // For the first row, detect column count - if (expectedColumns === null) { - expectedColumns = row.length; - logger.info(`Detected ${expectedColumns} columns using fast-csv`); - } + // For the first row, detect column count + if (expectedColumns === null) { + expectedColumns = row.length; + logger.info(`Detected ${expectedColumns} columns using fast-csv`); + } - const wasProcessed = await processRecord( - row, - expectedColumns, - RainbowRecordType, - outputStream, - lineNumber, - existingDb, - writtenLabels, - stats, + // Log progress synchronously when line is read (not in async callback) + // This ensures logs appear at the correct intervals + if (lineNumber % progressInterval === 0 && lineNumber !== lastLoggedLine) { + const currentTime = Date.now(); + const chunkTime = currentTime - lastLogTime; // Time for this 10k chunk + const totalElapsed = currentTime - startTime; // Total time since start + const chunkTimeSeconds = (chunkTime / 1000).toFixed(2); + const totalTimeSeconds = (totalElapsed / 1000).toFixed(2); + const linesPerSecond = ((progressInterval / chunkTime) * 1000).toFixed(0); + + lastLoggedLine = lineNumber; + lastLogTime = currentTime; + + // Note: processedRecords may be slightly behind due to async processing + logger.info( + `Processed ${lineNumber} lines, written ${processedRecords} records | ` + + `Chunk: ${chunkTimeSeconds}s (${linesPerSecond} lines/sec) | ` + + `Total: ${totalTimeSeconds}s` ); + } + + // Backpressure: pause if too many pending + if (pendingCount >= MAX_PENDING) { + csvStream.pause(); + } + pendingCount++; + processRecord( + row, + expectedColumns, + RainbowRecordType, + outputStream, + lineNumber, + existingDb, + dedupDb, + stats, + ).then((wasProcessed) => { if (wasProcessed) { processedRecords++; } - - // Log progress for large files - if (lineNumber % progressInterval === 0) { - logger.info( - `Processed ${lineNumber} lines, written ${processedRecords} records so far...`, - ); + + 
// Update progress bar every 1000 lines + if (lineNumber % 1000 === 0 && progressBar) { + progressBar.tick(1000); + progressBar.curr = lineNumber; + } + + pendingCount--; + + // Resume when under threshold + if (csvStream.isPaused() && pendingCount < MAX_PENDING / 2) { + csvStream.resume(); } - } catch (error) { + }).catch((error) => { const errorMessage = error instanceof Error ? error.message : String(error); csvStream.destroy(); fileStream.destroy(); @@ -269,12 +458,18 @@ async function processCSVFile( `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`, ), ); - } + }); }) .on("error", (error: Error) => { reject(new Error(`CSV parsing error: ${error.message}`)); }) - .on("end", () => { + .on("end", async () => { + // Wait for all pending to complete + while (pendingCount > 0) { + await new Promise(resolve => setTimeout(resolve, 10)); + } + const dedupStatus = dedupDb ? "LevelDB deduplication completed" : "Deduplication disabled"; + logger.info(dedupStatus); resolve({ totalLines: lineNumber, processedRecords }); }); @@ -299,13 +494,38 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom }; let existingDb: ENSRainbowDB | null = null; + let dedupDb: DeduplicationDB | null = null; + let tempDedupDir: string | null = null; try { const { RainbowRecordType, outputStream, existingDb: db } = await initializeConversion(options); existingDb = db; + // Create temporary deduplication database (if not disabled) + if (!options.noDedup) { + tempDedupDir = join(process.cwd(), 'temp-dedup-' + Date.now()); + logger.info(`Creating temporary deduplication database at: ${tempDedupDir}`); + const tempDb = new ClassicLevel(tempDedupDir, { + keyEncoding: 'utf8', + valueEncoding: 'utf8', + createIfMissing: true, + }); + await tempDb.open(); + dedupDb = new DeduplicationDB( + tempDb, + options.cacheSize ?? 10000, + options.useBloomFilter ?? false, + options.bloomFilterSize ?? 10000000 + ); + } else { + logger.info("Deduplication disabled - processing all records"); + } + const progressInterval = options.progressInterval ?? DEFAULT_PROGRESS_INTERVAL; + // Set up progress bar (only if not silent) + const progressBar = options.silent ? null : setupProgressBar(); + // Process the CSV file const { totalLines, processedRecords } = await processCSVFile( options.inputFile, @@ -313,11 +533,21 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom outputStream, progressInterval, existingDb, + dedupDb, stats, + progressBar, ); - stats.totalLines = totalLines; - stats.processedRecords = processedRecords; + stats.totalLines = totalLines; + stats.processedRecords = processedRecords; + + // Log final progress for large files + if (totalLines > 10_000) { + const dedupStatus = options.noDedup ? 
"dedup disabled" : "LevelDB dedup active"; + logger.info( + `✅ Completed processing ${totalLines.toLocaleString()} lines, wrote ${processedRecords.toLocaleString()} records (${dedupStatus})`, + ); + } // Close output stream outputStream.end(); @@ -330,7 +560,17 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom logger.error("❌ CSV conversion failed:", errorMessage); throw error; } finally { - // Clean up database connection + // Clean up deduplication database + if (dedupDb) { + try { + await dedupDb.close(); + logger.info("Closed deduplication database"); + } catch (error) { + logger.warn(`Failed to close deduplication database: ${error}`); + } + } + + // Clean up existing database connection if (existingDb) { try { await existingDb.close(); @@ -339,5 +579,15 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom logger.warn(`Failed to close existing database: ${error}`); } } + + // Remove temporary deduplication database directory + if (tempDedupDir) { + try { + rmSync(tempDedupDir, { recursive: true, force: true }); + logger.info(`Removed temporary deduplication database: ${tempDedupDir}`); + } catch (error) { + logger.warn(`Failed to remove temporary deduplication database: ${error}`); + } + } } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3dea391e0..3ccf052d8 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -462,6 +462,9 @@ importers: '@hono/node-server': specifier: ^1.4.1 version: 1.19.5(hono@4.10.3) + bloom-filters: + specifier: ^3.0.4 + version: 3.0.4 classic-level: specifier: ^1.4.1 version: 1.4.1 @@ -3435,6 +3438,9 @@ packages: '@types/sax@1.2.7': resolution: {integrity: sha512-rO73L89PJxeYM3s3pPPjiPgVVcymqU490g0YO5n5By0k2Erzj6tay/4lr1CHAAU4JyOWd1rpQ8bCf6cZfHU96A==} + '@types/seedrandom@3.0.8': + resolution: {integrity: sha512-TY1eezMU2zH2ozQoAFAQFOPpvP15g+ZgSfTZt31AUUH/Rxtnz3H+A/Sv1Snw2/amp//omibc+AEkTaA8KUeOLQ==} + '@types/tar@6.1.13': resolution: {integrity: sha512-IznnlmU5f4WcGTh2ltRu/Ijpmk8wiWXfF0VA4s+HPjHZgvFggk1YaIkbo5krX/zUCzWF8N/l4+W/LNxnvAJ8nw==} @@ -3783,6 +3789,10 @@ packages: base-64@1.0.0: resolution: {integrity: sha512-kwDPIFCGx0NZHog36dj+tHiwP4QMzsZ3AgMViUBKI0+V5n4U0ufTCUMhnQ04diaRI8EX/QcPfql7zlhZ7j4zgg==} + base64-arraybuffer@1.0.2: + resolution: {integrity: sha512-I3yl4r9QB5ZRY3XuJVEPfc2XhZO6YweFPI+UovAzn+8/hb3oJ6lnysaFcjVpkCPfVWFUDvoZ8kmVDP7WyRtYtQ==} + engines: {node: '>= 0.6.0'} + base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} @@ -3813,6 +3823,10 @@ packages: bintrees@1.0.2: resolution: {integrity: sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==} + bloom-filters@3.0.4: + resolution: {integrity: sha512-BdnPWo2OpYhlvuP2fRzJBdioMCkm7Zp0HCf8NJgF5Mbyqy7VQ/CnTiVWMMyq4EZCBHwj0Kq6098gW2/3RsZsrA==} + engines: {node: '>=12'} + boolbase@1.0.0: resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} @@ -4167,6 +4181,9 @@ packages: csstype@3.2.3: resolution: {integrity: sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==} + cuint@0.2.2: + resolution: {integrity: sha512-d4ZVpCW31eWwCMe1YT3ur7mUDnTXbgwyzaL320DrcRT45rfjYxkt5QWLrmOJ+/UEAI2+fQgKe/fCjR8l4TpRgw==} + cytoscape-cose-bilkent@4.1.0: resolution: {integrity: sha512-wgQlVIUJF13Quxiv5e1gstZ08rnZj2XaLHGoFMYXz7SkNfCDOOteKBE6SYRfA9WxxI/iBc3ajfDoc6hb/MRAHQ==} peerDependencies: @@ -6537,6 +6554,9 @@ packages: 
recma-stringify@1.0.0: resolution: {integrity: sha512-cjwII1MdIIVloKvC9ErQ+OgAtwHBmcZ0Bg4ciz78FtbT8In39aAYbaA7zvxQ61xVMSPE8WxhLwLbhif4Js2C+g==} + reflect-metadata@0.1.14: + resolution: {integrity: sha512-ZhYeb6nRaXCfhnndflDK8qI6ZQ/YcWZCISRAWICW9XYqMUwjZM9Z0DveWX/ABN01oxSHwVxKQmxeYZSsm0jh5A==} + regex-recursion@6.0.2: resolution: {integrity: sha512-0YCaSCq2VRIebiaUviZNs0cBz1kg5kVS2UKUfNIx8YVs1cN3AV7NTctO5FOKBA+UT2BPJIWZauYHPqJODG50cg==} @@ -6708,6 +6728,9 @@ packages: secure-json-parse@4.1.0: resolution: {integrity: sha512-l4KnYfEyqYJxDwlNVyRfO2E4NTHfMKAWdUuA8J0yve2Dz/E/PdBepY03RvyJpssIpRFwJoCD55wA+mEDs6ByWA==} + seedrandom@3.0.5: + resolution: {integrity: sha512-8OwmbklUNzwezjGInmZ+2clQmExQPvomqjL7LFqOYqtmuxRgQYqOD3mHaU+MvZn5FLUeVxVfQjwLZW/n/JFuqg==} + semver-compare@1.0.0: resolution: {integrity: sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow==} @@ -7767,6 +7790,9 @@ packages: xxhash-wasm@1.1.0: resolution: {integrity: sha512-147y/6YNh+tlp6nd/2pWq38i9h6mz/EuQ6njIrmW8D1BS5nCqs0P6DG+m6zTGnNz5I+uhZ0SHxBs9BsPrwcKDA==} + xxhashjs@0.2.2: + resolution: {integrity: sha512-AkTuIuVTET12tpsVIQo+ZU6f/qDmKuRUcjaqR+OIvm+aCBsZ95i7UVY5WJ9TMsSaZ0DA2WxoZ4acu0sPH+OKAw==} + y18n@5.0.8: resolution: {integrity: sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==} engines: {node: '>=10'} @@ -10818,6 +10844,8 @@ snapshots: dependencies: '@types/node': 22.18.13 + '@types/seedrandom@3.0.8': {} + '@types/tar@6.1.13': dependencies: '@types/node': 22.18.13 @@ -11294,6 +11322,8 @@ snapshots: base-64@1.0.0: {} + base64-arraybuffer@1.0.2: {} + base64-js@1.5.1: {} baseline-browser-mapping@2.8.21: {} @@ -11320,6 +11350,17 @@ snapshots: bintrees@1.0.2: {} + bloom-filters@3.0.4: + dependencies: + '@types/seedrandom': 3.0.8 + base64-arraybuffer: 1.0.2 + is-buffer: 2.0.5 + lodash: 4.17.21 + long: 5.3.2 + reflect-metadata: 0.1.14 + seedrandom: 3.0.5 + xxhashjs: 0.2.2 + boolbase@1.0.0: {} boring-avatars@1.11.2: {} @@ -11695,6 +11736,8 @@ snapshots: csstype@3.2.3: {} + cuint@0.2.2: {} + cytoscape-cose-bilkent@4.1.0(cytoscape@3.33.1): dependencies: cose-base: 1.0.3 @@ -14442,6 +14485,8 @@ snapshots: unified: 11.0.5 vfile: 6.0.3 + reflect-metadata@0.1.14: {} + regex-recursion@6.0.2: dependencies: regex-utilities: 2.3.0 @@ -14708,6 +14753,8 @@ snapshots: secure-json-parse@4.1.0: {} + seedrandom@3.0.5: {} + semver-compare@1.0.0: {} semver@6.3.1: {} @@ -15814,6 +15861,10 @@ snapshots: xxhash-wasm@1.1.0: {} + xxhashjs@0.2.2: + dependencies: + cuint: 0.2.2 + y18n@5.0.8: {} yallist@3.1.1: {} From 2c94d417a9d8fc631c2035e9c245a49410fe727b Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 24 Nov 2025 13:27:31 +0100 Subject: [PATCH 09/28] refactor: simplify command options in package.json --- apps/ensrainbow/package.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/ensrainbow/package.json b/apps/ensrainbow/package.json index 341e0d440..7379e93d3 100644 --- a/apps/ensrainbow/package.json +++ b/apps/ensrainbow/package.json @@ -19,8 +19,8 @@ "validate:lite": "tsx src/cli.ts validate --lite", "purge": "tsx src/cli.ts purge", "convert": "tsx src/cli.ts convert", - "convert-csv": "NODE_OPTIONS='--expose-gc --max-old-space-size=4096' tsx src/cli.ts convert-csv", - "test": "NODE_OPTIONS='--max-old-space-size=8192' vitest", + "convert-csv": "tsx src/cli.ts convert-csv", + "test": "vitest", "test:coverage": "vitest --coverage", "lint": "biome check --write .", "lint:ci": "biome ci", From 721a50d4507261fcf4efb93c347c430e2c364a1d 
Mon Sep 17 00:00:00 2001 From: djstrong Date: Thu, 11 Dec 2025 20:56:40 +0100 Subject: [PATCH 10/28] refactor: improve memory management and logging in CSV conversion - Added a function to estimate memory usage of Maps for better tracking. - Reduced default cache size in DeduplicationDB from 10000 to 1000. - Enhanced backpressure handling during CSV writing to prevent memory overflow. - Updated logging to include output backpressure events and improved performance for large files. - Streamlined the CSV processing to operate in a completely sequential manner. --- .../src/commands/convert-csv-command.ts | 245 +++++++++--------- 1 file changed, 128 insertions(+), 117 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 0e0c8ac0e..db7478664 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -21,17 +21,27 @@ import { createRainbowProtobufRoot, } from "../utils/protobuf-schema.js"; +/** + * Estimate memory usage of a Map (rough approximation) + */ +function estimateMapMemory(map: Map): number { + let total = 0; + for (const [key, value] of map) { + // Rough estimate: key size + value size + Map overhead (48 bytes per entry) + total += (key.length * 2) + (typeof value === 'string' ? value.length * 2 : 8) + 48; + } + return total; +} + /** * Simple deduplication database using ClassicLevel directly */ class DeduplicationDB { private pendingWrites: Map = new Map(); - private cache: Map = new Map(); - private cacheSize: number; private bloomFilter: typeof bloomFilters.BloomFilter | null = null; - constructor(private db: ClassicLevel, cacheSize: number = 10000, useBloomFilter: boolean = false, expectedItems: number = 10000000) { - this.cacheSize = cacheSize; + constructor(private db: ClassicLevel, cacheSize: number = 1000, useBloomFilter: boolean = false, expectedItems: number = 10000000) { + // No in-memory cache - LevelDB has its own internal cache if (useBloomFilter) { // Create Bloom filter with 0.1% false positive rate @@ -41,68 +51,41 @@ class DeduplicationDB { } async has(key: string): Promise { - // Check cache first - if (this.cache.has(key)) { - return this.cache.get(key)!; - } - - // Check pending writes + // Check pending writes first (not yet flushed to DB) if (this.pendingWrites.has(key)) { - this.cache.set(key, true); return true; } - // Use Bloom filter if available + // Use Bloom filter if available (skip expensive DB lookup) if (this.bloomFilter) { - // If Bloom filter says "not present", we can skip LevelDB check if (!this.bloomFilter.has(key)) { - this.cache.set(key, false); return false; } - // Bloom filter says "maybe present" - need to check LevelDB } - // Check database + // Check database (LevelDB has its own internal cache) try { await this.db.get(key); - this.cache.set(key, true); return true; } catch (error) { - this.cache.set(key, false); return false; } } async add(key: string, value: string): Promise { this.pendingWrites.set(key, value); - this.cache.set(key, true); // Cache the fact that this key exists // Add to Bloom filter if available if (this.bloomFilter) { this.bloomFilter.add(key); } - // Check cache size periodically (not on every add) - this.evictCacheIfNeeded(); - - // Flush to database periodically (smaller batch to reduce memory usage) - if (this.pendingWrites.size >= 5000) { + // Flush frequently to keep pendingWrites small + if (this.pendingWrites.size >= 1000) { await this.flush(); } } - private 
evictCacheIfNeeded(): void { - // Limit cache size - only evict when significantly exceeded - if (this.cache.size > this.cacheSize * 1.2) { - // Remove oldest 20% of entries - let toRemove = Math.floor(this.cacheSize * 0.2); - for (const key of this.cache.keys()) { - if (toRemove-- <= 0) break; - this.cache.delete(key); - } - } - } - async flush(): Promise { if (this.pendingWrites.size === 0) return; @@ -123,6 +106,15 @@ class DeduplicationDB { await this.flush(); await this.db.close(); } + + getMemoryStats(): { pendingWrites: number; cache: number; pendingWritesMB: number; cacheMB: number } { + return { + pendingWrites: this.pendingWrites.size, + cache: 0, // Cache disabled - using LevelDB's internal cache + pendingWritesMB: estimateMapMemory(this.pendingWrites) / 1024 / 1024, + cacheMB: 0, + }; + } } @@ -159,13 +151,14 @@ export interface ConvertCsvCommandOptions { } // Configuration constants -const DEFAULT_PROGRESS_INTERVAL = 10000; +const DEFAULT_PROGRESS_INTERVAL = 50000; // Increased from 10k to 50k to reduce logging load interface ConversionStats { totalLines: number; processedRecords: number; filteredExistingLabels: number; filteredDuplicates: number; + outputBackpressureEvents: number; startTime: Date; endTime?: Date; } @@ -174,8 +167,12 @@ interface ConversionStats { * Setup output stream for writing protobuf */ function setupWriteStream(outputFile: string) { - // For now, just write directly to file without gzip compression - return createWriteStream(outputFile); + // Use very small highWaterMark (16KB) to trigger backpressure early and frequently + // This prevents unbounded buffer growth when writes are faster than disk I/O + // Smaller buffer = more frequent backpressure = better memory control + return createWriteStream(outputFile, { + highWaterMark: 16 * 1024, // 16KB buffer - very small to catch backpressure early + }); } /** @@ -213,6 +210,7 @@ function logSummary(stats: ConversionStats) { logger.info(`Valid records: ${stats.processedRecords}`); logger.info(`Filtered existing labels: ${stats.filteredExistingLabels}`); logger.info(`Filtered duplicates: ${stats.filteredDuplicates}`); + logger.info(`Output backpressure events: ${stats.outputBackpressureEvents}`); logger.info(`Duration: ${duration}ms`); } @@ -246,8 +244,8 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { logger.info(`Input file size: ${fileSizeMB} MB`); if (stats.size > 1024 * 1024 * 1024) { // > 1GB - logger.warn("⚠️ Processing a very large file. 
This may take significant time and memory."); - logger.warn("💡 Consider using --existing-db-path to filter out existing labels for better performance."); + logger.warn("⚠️ Processing a very large file - using SEQUENTIAL mode."); + logger.warn("💡 Use --existing-db-path to filter existing labels and speed up processing."); } } catch (error) { logger.warn(`Could not determine file size: ${error}`); @@ -351,15 +349,27 @@ async function processRecord( await dedupDb.add(label, ""); } - // Create protobuf message and write immediately + // Create protobuf message and write with backpressure handling const recordMessage = RainbowRecordType.fromObject(rainbowRecord); - outputStream.write(Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish())); + const buffer = Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish()); + + // Check if write returns false (buffer full) - if so, wait for drain + const canContinue = outputStream.write(buffer); + if (!canContinue) { + // Buffer is full - signal backpressure + stats.outputBackpressureEvents++; + // Wait for drain event before continuing + // Note: The CSV stream should be paused by the caller when backpressure is detected + await new Promise((resolve) => { + outputStream.once('drain', resolve); + }); + } return true; // Record was processed } /** - * Process the entire CSV file using fast-csv + * Process the entire CSV file - COMPLETELY SEQUENTIAL (one row at a time) */ async function processCSVFile( inputFile: string, @@ -374,102 +384,97 @@ async function processCSVFile( let expectedColumns: number | null = null; let lineNumber = 0; let processedRecords = 0; - let lastLoggedLine = 0; // Track last logged line to avoid duplicate logs - const startTime = Date.now(); // Track start time for overall processing - let lastLogTime = Date.now(); // Track time of last log for chunk timing - - // LevelDB-based deduplication: Uses temporary database to avoid RAM limits + let lastLoggedLine = 0; + const startTime = Date.now(); + let lastLogTime = Date.now(); const fileStream = createReadStream(inputFile, { encoding: "utf8" }); return new Promise((resolve, reject) => { - let pendingCount = 0; - const MAX_PENDING = 100; // Smaller limit to reduce memory + const csvStream = parse(); // Sequential processing via pause/resume + let isProcessing = false; + + csvStream + .on("data", async (row: string[]) => { + // PAUSE IMMEDIATELY - process one row at a time + csvStream.pause(); + isProcessing = true; - const csvStream = parse() - .on("data", (row: string[]) => { lineNumber++; - // For the first row, detect column count - if (expectedColumns === null) { - expectedColumns = row.length; - logger.info(`Detected ${expectedColumns} columns using fast-csv`); - } + try { + // Detect column count on first row + if (expectedColumns === null) { + expectedColumns = row.length; + logger.info(`Detected ${expectedColumns} columns - SEQUENTIAL processing mode`); + } - // Log progress synchronously when line is read (not in async callback) - // This ensures logs appear at the correct intervals - if (lineNumber % progressInterval === 0 && lineNumber !== lastLoggedLine) { - const currentTime = Date.now(); - const chunkTime = currentTime - lastLogTime; // Time for this 10k chunk - const totalElapsed = currentTime - startTime; // Total time since start - const chunkTimeSeconds = (chunkTime / 1000).toFixed(2); - const totalTimeSeconds = (totalElapsed / 1000).toFixed(2); - const linesPerSecond = ((progressInterval / chunkTime) * 1000).toFixed(0); - - 
lastLoggedLine = lineNumber; - lastLogTime = currentTime; - - // Note: processedRecords may be slightly behind due to async processing - logger.info( - `Processed ${lineNumber} lines, written ${processedRecords} records | ` + - `Chunk: ${chunkTimeSeconds}s (${linesPerSecond} lines/sec) | ` + - `Total: ${totalTimeSeconds}s` - ); - } + // Log progress (less frequently to avoid logger crashes) + if (lineNumber % progressInterval === 0 && lineNumber !== lastLoggedLine) { + const currentTime = Date.now(); + const chunkTime = currentTime - lastLogTime; + const totalElapsed = currentTime - startTime; + const chunkTimeSeconds = (chunkTime / 1000).toFixed(2); + const totalTimeSeconds = (totalElapsed / 1000).toFixed(2); + const linesPerSecond = ((progressInterval / chunkTime) * 1000).toFixed(0); + + lastLoggedLine = lineNumber; + lastLogTime = currentTime; + + const memUsage = process.memoryUsage(); + const memInfo = `RSS=${(memUsage.rss / 1024 / 1024).toFixed(0)}MB, Heap=${(memUsage.heapUsed / 1024 / 1024).toFixed(0)}MB`; + + let dedupInfo = ""; + if (dedupDb) { + const dedupStats = dedupDb.getMemoryStats(); + dedupInfo = ` | Dedup: ${dedupStats.pendingWrites}/${dedupStats.cache}`; + } + + // Use console.log instead of logger to avoid worker thread issues + console.log( + `[${new Date().toISOString()}] Line ${lineNumber}, written ${processedRecords} | ` + + `${linesPerSecond} lines/sec | ${memInfo}${dedupInfo}` + ); + } - // Backpressure: pause if too many pending - if (pendingCount >= MAX_PENDING) { - csvStream.pause(); - } + // Process this one record + const wasProcessed = await processRecord( + row, + expectedColumns, + RainbowRecordType, + outputStream, + lineNumber, + existingDb, + dedupDb, + stats, + ); - pendingCount++; - processRecord( - row, - expectedColumns, - RainbowRecordType, - outputStream, - lineNumber, - existingDb, - dedupDb, - stats, - ).then((wasProcessed) => { if (wasProcessed) { processedRecords++; } - - // Update progress bar every 1000 lines + + // Update progress bar if (lineNumber % 1000 === 0 && progressBar) { progressBar.tick(1000); progressBar.curr = lineNumber; } - - pendingCount--; - - // Resume when under threshold - if (csvStream.isPaused() && pendingCount < MAX_PENDING / 2) { - csvStream.resume(); - } - }).catch((error) => { + + // Done processing - resume for next row + isProcessing = false; + csvStream.resume(); + + } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); csvStream.destroy(); fileStream.destroy(); - reject( - new Error( - `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`, - ), - ); - }); + reject(new Error(`Failed on line ${lineNumber}: ${errorMessage}`)); + } }) .on("error", (error: Error) => { reject(new Error(`CSV parsing error: ${error.message}`)); }) - .on("end", async () => { - // Wait for all pending to complete - while (pendingCount > 0) { - await new Promise(resolve => setTimeout(resolve, 10)); - } - const dedupStatus = dedupDb ? 
"LevelDB deduplication completed" : "Deduplication disabled"; - logger.info(dedupStatus); + .on("end", () => { + logger.info(`Sequential processing complete`); resolve({ totalLines: lineNumber, processedRecords }); }); @@ -490,6 +495,7 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom processedRecords: 0, filteredExistingLabels: 0, filteredDuplicates: 0, + outputBackpressureEvents: 0, startTime: new Date(), }; @@ -509,11 +515,16 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom keyEncoding: 'utf8', valueEncoding: 'utf8', createIfMissing: true, + // Aggressive memory limits + cacheSize: 2 * 1024 * 1024, // 2MB block cache (minimal) + writeBufferSize: 4 * 1024 * 1024, // 4MB write buffer (minimal) + maxOpenFiles: 100, // Limit open files + compression: false, // Disable compression to reduce CPU/memory }); await tempDb.open(); dedupDb = new DeduplicationDB( tempDb, - options.cacheSize ?? 10000, + options.cacheSize ?? 1000, // Reduced default from 10000 to 1000 options.useBloomFilter ?? false, options.bloomFilterSize ?? 10000000 ); From 56bc3563a512da001524cd501368dde8816d3118 Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 15 Dec 2025 15:44:59 +0100 Subject: [PATCH 11/28] refactor: streamline CSV conversion CLI options and improve logging - Removed unused command-line options for deduplication and Bloom filter from the CLI interface. - Updated default progress interval from 10000 to 50000 records for improved performance. - Enhanced logging for file processing and memory management during CSV conversion. - Cleaned up code for better readability and maintainability. --- apps/ensrainbow/src/cli.ts | 56 ++----- .../src/commands/convert-csv-command.test.ts | 15 +- .../src/commands/convert-csv-command.ts | 141 +++++++----------- .../ensrainbow/concepts/creating-files.mdx | 6 +- 4 files changed, 82 insertions(+), 136 deletions(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 6e6bb4f32..de84a0963 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -69,11 +69,7 @@ interface ConvertCsvArgs { "label-set-version": LabelSetVersion; "progress-interval"?: number; "existing-db-path"?: string; - "silent"?: boolean; - "disable-dedup"?: boolean; - "cache-size"?: number; - "use-bloom-filter"?: boolean; - "bloom-filter-size"?: number; + silent?: boolean; } export interface CLIOptions { @@ -264,37 +260,17 @@ export function createCLI(options: CLIOptions = {}) { .option("progress-interval", { type: "number", description: "Number of records to process before logging progress", - default: 10000, + default: 50000, }) - .option("existing-db-path", { - type: "string", - description: "Path to existing ENSRainbow database to filter out existing labels", - }) - .option("silent", { - type: "boolean", - description: "Disable progress bar (useful for scripts)", - default: false, - }) - .option("disable-dedup", { - type: "boolean", - description: "Disable deduplication within CSV file (faster but may create duplicates)", - default: false, - }) - .option("cache-size", { - type: "number", - description: "Cache size for deduplication (default: 5000)", - default: 5000, - }) - .option("use-bloom-filter", { - type: "boolean", - description: "Use Bloom filter for faster deduplication (default: false)", - default: false, - }) - .option("bloom-filter-size", { - type: "number", - description: "Expected number of items for Bloom filter (default: 10000000)", - default: 10000000, - }); + .option("existing-db-path", 
{ + type: "string", + description: "Path to existing ENSRainbow database to filter out existing labels", + }) + .option("silent", { + type: "boolean", + description: "Disable progress bar (useful for scripts)", + default: false, + }); }, async (argv: ArgumentsCamelCase) => { await convertCsvCommand({ @@ -302,13 +278,9 @@ export function createCLI(options: CLIOptions = {}) { outputFile: argv["output-file"], labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], - progressInterval: argv["progress-interval"], - existingDbPath: argv["existing-db-path"], - silent: argv["silent"], - noDedup: argv["disable-dedup"], - cacheSize: argv["cache-size"], - useBloomFilter: argv["use-bloom-filter"], - bloomFilterSize: argv["bloom-filter-size"], + progressInterval: argv["progress-interval"], + existingDbPath: argv["existing-db-path"], + silent: argv["silent"], }); }, ) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index c6ddadb03..4f5b37eb6 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -1,12 +1,15 @@ +import { mkdtemp, rm, stat, writeFile } from "fs/promises"; import { tmpdir } from "os"; import { join } from "path"; -import { mkdtemp, rm, stat, writeFile } from "fs/promises"; + +import { labelhash } from "viem"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { type LabelSetId, type LabelSetVersion, labelHashToBytes } from "@ensnode/ensnode-sdk"; + import { createCLI } from "@/cli"; import { ENSRainbowDB } from "@/lib/database"; -import { type LabelSetId, type LabelSetVersion, labelHashToBytes } from "@ensnode/ensnode-sdk"; -import { labelhash } from "viem"; + import { convertCsvCommand } from "./convert-csv-command"; // Path to test fixtures @@ -406,10 +409,10 @@ describe("convert-csv-command", () => { // Verify file was created const stats = await stat(outputFile); - expect(stats.isFile()).toBe(true); - expect(stats.size).toBeGreaterThan(0); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + }); }); -}); describe("Streaming performance", () => { it("should handle small CSV files efficiently", async () => { diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index db7478664..47d790a69 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -5,15 +5,16 @@ * Supports 1-column (label only) and 2-column (label,labelhash) formats */ -import { createReadStream, createWriteStream, statSync } from "fs"; -import { rmSync } from "fs"; +import { createReadStream, createWriteStream, rmSync, statSync } from "fs"; import { join } from "path"; -import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk"; + import { parse } from "@fast-csv/parse"; -import { labelhash } from "viem"; import { ClassicLevel } from "classic-level"; import ProgressBar from "progress"; -import bloomFilters from "bloom-filters"; +import { labelhash } from "viem"; + +import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk"; + import { ENSRainbowDB } from "../lib/database.js"; import { logger } from "../utils/logger.js"; import { @@ -28,7 +29,7 @@ function estimateMapMemory(map: Map): number { let total = 0; for (const [key, value] of map) { // Rough estimate: key size + value size + Map overhead (48 bytes per entry) - total += 
(key.length * 2) + (typeof value === 'string' ? value.length * 2 : 8) + 48; + total += key.length * 2 + (typeof value === "string" ? value.length * 2 : 8) + 48; } return total; } @@ -38,16 +39,9 @@ function estimateMapMemory(map: Map): number { */ class DeduplicationDB { private pendingWrites: Map = new Map(); - private bloomFilter: typeof bloomFilters.BloomFilter | null = null; - constructor(private db: ClassicLevel, cacheSize: number = 1000, useBloomFilter: boolean = false, expectedItems: number = 10000000) { + constructor(private db: ClassicLevel) { // No in-memory cache - LevelDB has its own internal cache - - if (useBloomFilter) { - // Create Bloom filter with 0.1% false positive rate - this.bloomFilter = bloomFilters.BloomFilter.create(expectedItems, 0.01); - logger.info(`Created Bloom filter for ${expectedItems} items (~${(this.bloomFilter.size / 8 / 1024 / 1024).toFixed(2)} MB)`); - } } async has(key: string): Promise { @@ -56,13 +50,6 @@ class DeduplicationDB { return true; } - // Use Bloom filter if available (skip expensive DB lookup) - if (this.bloomFilter) { - if (!this.bloomFilter.has(key)) { - return false; - } - } - // Check database (LevelDB has its own internal cache) try { await this.db.get(key); @@ -74,12 +61,7 @@ class DeduplicationDB { async add(key: string, value: string): Promise { this.pendingWrites.set(key, value); - - // Add to Bloom filter if available - if (this.bloomFilter) { - this.bloomFilter.add(key); - } - + // Flush frequently to keep pendingWrites small if (this.pendingWrites.size >= 1000) { await this.flush(); @@ -95,7 +77,7 @@ class DeduplicationDB { } await batch.write(); this.pendingWrites.clear(); - + // Hint to garbage collector after large batch if (global.gc) { global.gc(); @@ -107,7 +89,12 @@ class DeduplicationDB { await this.db.close(); } - getMemoryStats(): { pendingWrites: number; cache: number; pendingWritesMB: number; cacheMB: number } { + getMemoryStats(): { + pendingWrites: number; + cache: number; + pendingWritesMB: number; + cacheMB: number; + } { return { pendingWrites: this.pendingWrites.size, cache: 0, // Cache disabled - using LevelDB's internal cache @@ -117,20 +104,16 @@ class DeduplicationDB { } } - /** * Sets up a simple progress bar that shows speed without total count. 
*/ function setupProgressBar(): ProgressBar { - return new ProgressBar( - "Processing CSV [:bar] :current lines - :rate lines/sec", - { - complete: "=", - incomplete: " ", - width: 40, - total: 200000000, // Very large total for big files - }, - ); + return new ProgressBar("Processing CSV [:bar] :current lines - :rate lines/sec", { + complete: "=", + incomplete: " ", + width: 40, + total: 200000000, // Very large total for big files + }); } /** @@ -144,10 +127,6 @@ export interface ConvertCsvCommandOptions { progressInterval?: number; existingDbPath?: string; // Path to existing ENSRainbow database to check for existing labels silent?: boolean; // Disable progress bar for tests - noDedup?: boolean; // Disable deduplication within CSV file - cacheSize?: number; // Cache size for deduplication (default: 10000) - useBloomFilter?: boolean; // Use Bloom filter for faster deduplication (default: false) - bloomFilterSize?: number; // Expected number of items for Bloom filter (default: 10000000) } // Configuration constants @@ -242,8 +221,9 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { const stats = statSync(options.inputFile); const fileSizeMB = (stats.size / (1024 * 1024)).toFixed(2); logger.info(`Input file size: ${fileSizeMB} MB`); - - if (stats.size > 1024 * 1024 * 1024) { // > 1GB + + if (stats.size > 1024 * 1024 * 1024) { + // > 1GB logger.warn("⚠️ Processing a very large file - using SEQUENTIAL mode."); logger.warn("💡 Use --existing-db-path to filter existing labels and speed up processing."); } @@ -352,7 +332,7 @@ async function processRecord( // Create protobuf message and write with backpressure handling const recordMessage = RainbowRecordType.fromObject(rainbowRecord); const buffer = Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish()); - + // Check if write returns false (buffer full) - if so, wait for drain const canContinue = outputStream.write(buffer); if (!canContinue) { @@ -361,7 +341,7 @@ async function processRecord( // Wait for drain event before continuing // Note: The CSV stream should be paused by the caller when backpressure is detected await new Promise((resolve) => { - outputStream.once('drain', resolve); + outputStream.once("drain", resolve); }); } @@ -417,23 +397,23 @@ async function processCSVFile( const chunkTimeSeconds = (chunkTime / 1000).toFixed(2); const totalTimeSeconds = (totalElapsed / 1000).toFixed(2); const linesPerSecond = ((progressInterval / chunkTime) * 1000).toFixed(0); - + lastLoggedLine = lineNumber; lastLogTime = currentTime; - + const memUsage = process.memoryUsage(); const memInfo = `RSS=${(memUsage.rss / 1024 / 1024).toFixed(0)}MB, Heap=${(memUsage.heapUsed / 1024 / 1024).toFixed(0)}MB`; - + let dedupInfo = ""; if (dedupDb) { const dedupStats = dedupDb.getMemoryStats(); dedupInfo = ` | Dedup: ${dedupStats.pendingWrites}/${dedupStats.cache}`; } - + // Use console.log instead of logger to avoid worker thread issues console.log( `[${new Date().toISOString()}] Line ${lineNumber}, written ${processedRecords} | ` + - `${linesPerSecond} lines/sec | ${memInfo}${dedupInfo}` + `${linesPerSecond} lines/sec | ${memInfo}${dedupInfo}`, ); } @@ -462,7 +442,6 @@ async function processCSVFile( // Done processing - resume for next row isProcessing = false; csvStream.resume(); - } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error); csvStream.destroy(); @@ -507,30 +486,21 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom const { RainbowRecordType, outputStream, existingDb: db } = await initializeConversion(options); existingDb = db; - // Create temporary deduplication database (if not disabled) - if (!options.noDedup) { - tempDedupDir = join(process.cwd(), 'temp-dedup-' + Date.now()); - logger.info(`Creating temporary deduplication database at: ${tempDedupDir}`); - const tempDb = new ClassicLevel(tempDedupDir, { - keyEncoding: 'utf8', - valueEncoding: 'utf8', - createIfMissing: true, - // Aggressive memory limits - cacheSize: 2 * 1024 * 1024, // 2MB block cache (minimal) - writeBufferSize: 4 * 1024 * 1024, // 4MB write buffer (minimal) - maxOpenFiles: 100, // Limit open files - compression: false, // Disable compression to reduce CPU/memory - }); - await tempDb.open(); - dedupDb = new DeduplicationDB( - tempDb, - options.cacheSize ?? 1000, // Reduced default from 10000 to 1000 - options.useBloomFilter ?? false, - options.bloomFilterSize ?? 10000000 - ); - } else { - logger.info("Deduplication disabled - processing all records"); - } + // Create temporary deduplication database + tempDedupDir = join(process.cwd(), "temp-dedup-" + Date.now()); + logger.info(`Creating temporary deduplication database at: ${tempDedupDir}`); + const tempDb = new ClassicLevel(tempDedupDir, { + keyEncoding: "utf8", + valueEncoding: "utf8", + createIfMissing: true, + // Aggressive memory limits + cacheSize: 2 * 1024 * 1024, // 2MB block cache (minimal) + writeBufferSize: 4 * 1024 * 1024, // 4MB write buffer (minimal) + maxOpenFiles: 100, // Limit open files + compression: false, // Disable compression to reduce CPU/memory + }); + await tempDb.open(); + dedupDb = new DeduplicationDB(tempDb); const progressInterval = options.progressInterval ?? DEFAULT_PROGRESS_INTERVAL; @@ -549,16 +519,15 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom progressBar, ); - stats.totalLines = totalLines; - stats.processedRecords = processedRecords; + stats.totalLines = totalLines; + stats.processedRecords = processedRecords; - // Log final progress for large files - if (totalLines > 10_000) { - const dedupStatus = options.noDedup ? 
"dedup disabled" : "LevelDB dedup active"; - logger.info( - `✅ Completed processing ${totalLines.toLocaleString()} lines, wrote ${processedRecords.toLocaleString()} records (${dedupStatus})`, - ); - } + // Log final progress for large files + if (totalLines > 10_000) { + logger.info( + `✅ Completed processing ${totalLines.toLocaleString()} lines, wrote ${processedRecords.toLocaleString()} records (LevelDB dedup active)`, + ); + } // Close output stream outputStream.end(); diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index 125e9916a..2d9ec8c10 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -124,7 +124,8 @@ pnpm run convert-csv \ --label-set-id \ --label-set-version \ [--progress-interval ] \ - [--existing-db-path ] + [--existing-db-path ] \ + [--silent] ``` ### Required Parameters @@ -136,8 +137,9 @@ pnpm run convert-csv \ ### Optional Parameters - `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`) -- `--progress-interval`: Progress logging frequency (default: 10000 records) +- `--progress-interval`: Progress logging frequency (default: 50000 records) - `--existing-db-path`: Path to existing ENSRainbow database to filter out existing labels +- `--silent`: Disable progress bar (useful for scripts and automated workflows) ### CSV Format Support From 11992d7abab25e36184d504bd9a82ab660b9e111 Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 15 Dec 2025 16:33:19 +0100 Subject: [PATCH 12/28] fix: improve error handling and logging in CSV conversion tests --- .../src/commands/convert-csv-command.test.ts | 4 +- .../src/commands/convert-csv-command.ts | 47 +++++++++++++++---- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 4f5b37eb6..f3e85f6fa 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -110,7 +110,7 @@ describe("convert-csv-command", () => { labelSetId: "test-csv-invalid" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, }), - ).rejects.toThrow(/CSV conversion failed due to invalid data/); + ).rejects.toThrow(/Failed on line 1: Invalid labelHash/); }); it("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { @@ -167,7 +167,7 @@ describe("convert-csv-command", () => { labelSetId: "test-csv-invalid-hash" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, }), - ).rejects.toThrow(/CSV conversion failed due to invalid data/); + ).rejects.toThrow(/Failed on line 2: Invalid labelHash/); }); }); diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 47d790a69..3a0f14d84 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -274,13 +274,22 @@ function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string }; } else { // Two columns: validate and use provided hash - const providedHash = String(row[1]); + // Trim whitespace from hash (metadata), but preserve label as-is + const providedHash = String(row[1]).trim(); + if (providedHash === "") { + throw new Error("LabelHash cannot be empty"); + } const maybeLabelHash = 
providedHash.startsWith("0x") ? providedHash : `0x${providedHash}`; - const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); - return { - labelhash: Buffer.from(labelHash), - label: label, - }; + try { + const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); + return { + labelhash: Buffer.from(labelHash), + label: label, + }; + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error); + throw new Error(`Invalid labelHash: ${errorMessage}`); + } } } @@ -373,6 +382,14 @@ async function processCSVFile( return new Promise((resolve, reject) => { const csvStream = parse(); // Sequential processing via pause/resume let isProcessing = false; + let streamEnded = false; + + const checkAndResolve = () => { + if (streamEnded && !isProcessing) { + logger.info(`Sequential processing complete`); + resolve({ totalLines: lineNumber, processedRecords }); + } + }; csvStream .on("data", async (row: string[]) => { @@ -383,7 +400,16 @@ async function processCSVFile( lineNumber++; try { - // Detect column count on first row + // Skip empty rows (no columns or all empty strings) + const isEmptyRow = row.length === 0 || row.every((cell) => cell === ""); + if (isEmptyRow) { + isProcessing = false; + csvStream.resume(); + checkAndResolve(); + return; + } + + // Detect column count on first non-empty row if (expectedColumns === null) { expectedColumns = row.length; logger.info(`Detected ${expectedColumns} columns - SEQUENTIAL processing mode`); @@ -442,6 +468,7 @@ async function processCSVFile( // Done processing - resume for next row isProcessing = false; csvStream.resume(); + checkAndResolve(); } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); csvStream.destroy(); @@ -453,8 +480,8 @@ async function processCSVFile( reject(new Error(`CSV parsing error: ${error.message}`)); }) .on("end", () => { - logger.info(`Sequential processing complete`); - resolve({ totalLines: lineNumber, processedRecords }); + streamEnded = true; + checkAndResolve(); }); fileStream @@ -537,7 +564,7 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom logger.info("✅ CSV conversion completed successfully!"); } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error); - logger.error("❌ CSV conversion failed:", errorMessage); + logger.error(`❌ CSV conversion failed: ${errorMessage}`); throw error; } finally { // Clean up deduplication database From 3dea60ecf687863f3de34a3589512720618373e7 Mon Sep 17 00:00:00 2001 From: djstrong Date: Tue, 16 Dec 2025 21:59:20 +0100 Subject: [PATCH 13/28] refactor: update CSV conversion logic and improve deduplication handling --- .../src/commands/convert-csv-command.ts | 44 +++++++++---------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 3a0f14d84..4770a0b1e 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -112,7 +112,7 @@ function setupProgressBar(): ProgressBar { complete: "=", incomplete: " ", width: 40, - total: 200000000, // Very large total for big files + total: 300000000, // Very large total for big files }); } @@ -225,7 +225,6 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { if (stats.size > 1024 * 1024 * 1024) { // > 1GB logger.warn("⚠️ Processing a very large file - using SEQUENTIAL mode."); - logger.warn("💡 Use --existing-db-path to filter existing labels and speed up processing."); } } catch (error) { logger.warn(`Could not determine file size: ${error}`); @@ -303,7 +302,7 @@ async function processRecord( outputStream: NodeJS.WritableStream, lineNumber: number, existingDb: ENSRainbowDB | null, - dedupDb: DeduplicationDB | null, + dedupDb: DeduplicationDB, stats: ConversionStats, ): Promise { // Validate column count @@ -326,18 +325,16 @@ async function processRecord( } } - // Check if label is a duplicate within this conversion using LevelDB (if enabled) - if (dedupDb) { - const existsInDedupDb = await dedupDb.has(label); - if (existsInDedupDb) { - stats.filteredDuplicates++; - return false; // Skip this record - } - - // Add label to deduplication database - await dedupDb.add(label, ""); + // Check if label is a duplicate within this conversion using LevelDB + const existsInDedupDb = await dedupDb.has(label); + if (existsInDedupDb) { + stats.filteredDuplicates++; + return false; // Skip this record } + // Add label to deduplication database + await dedupDb.add(label, ""); + // Create protobuf message and write with backpressure handling const recordMessage = RainbowRecordType.fromObject(rainbowRecord); const buffer = Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish()); @@ -366,7 +363,7 @@ async function processCSVFile( outputStream: NodeJS.WritableStream, progressInterval: number, existingDb: ENSRainbowDB | null, - dedupDb: DeduplicationDB | null, + dedupDb: DeduplicationDB, stats: ConversionStats, progressBar: ProgressBar | null, ): Promise<{ totalLines: number; processedRecords: number }> { @@ -419,9 +416,6 @@ async function processCSVFile( if (lineNumber % progressInterval === 0 && lineNumber !== lastLoggedLine) { const currentTime = Date.now(); const chunkTime = currentTime - lastLogTime; - const totalElapsed = currentTime - startTime; - const chunkTimeSeconds = (chunkTime / 1000).toFixed(2); - const totalTimeSeconds = (totalElapsed / 1000).toFixed(2); const linesPerSecond = ((progressInterval / chunkTime) * 1000).toFixed(0); lastLoggedLine = lineNumber; @@ -430,11 +424,8 @@ async function processCSVFile( const memUsage = process.memoryUsage(); const memInfo = `RSS=${(memUsage.rss / 1024 / 1024).toFixed(0)}MB, 
Heap=${(memUsage.heapUsed / 1024 / 1024).toFixed(0)}MB`; - let dedupInfo = ""; - if (dedupDb) { - const dedupStats = dedupDb.getMemoryStats(); - dedupInfo = ` | Dedup: ${dedupStats.pendingWrites}/${dedupStats.cache}`; - } + const dedupStats = dedupDb.getMemoryStats(); + const dedupInfo = ` | Dedup: ${dedupStats.pendingWrites}/${dedupStats.cache}`; // Use console.log instead of logger to avoid worker thread issues console.log( @@ -496,6 +487,11 @@ async function processCSVFile( * Main CSV conversion command with true streaming using fast-csv */ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Promise { + // Validate that existingDbPath is provided when labelSetVersion > 0 + if (options.labelSetVersion > 0 && !options.existingDbPath) { + throw new Error("existingDbPath must be specified if label set version is higher than 0"); + } + const stats: ConversionStats = { totalLines: 0, processedRecords: 0, @@ -506,7 +502,7 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom }; let existingDb: ENSRainbowDB | null = null; - let dedupDb: DeduplicationDB | null = null; + let dedupDb: DeduplicationDB | undefined; let tempDedupDir: string | null = null; try { @@ -568,7 +564,7 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom throw error; } finally { // Clean up deduplication database - if (dedupDb) { + if (dedupDb !== undefined) { try { await dedupDb.close(); logger.info("Closed deduplication database"); From 42c06a1877c2bff723ba50b343b27f7f7f5ecefb Mon Sep 17 00:00:00 2001 From: djstrong Date: Wed, 17 Sep 2025 16:25:50 +0200 Subject: [PATCH 14/28] Enhance documentation for label sets and versions across various files, adding references to the glossary for clarity. Update environment variable descriptions in `.env.local.example` files and improve comments in scripts to ensure consistent understanding of label set concepts. --- apps/ensindexer/.env.local.example | 3 ++- apps/ensrainbow/.env.local.example | 16 ++-------------- .../scripts/download-ensrainbow-files.sh | 4 ++-- .../scripts/download-prebuilt-database.sh | 4 ++-- .../docs/ensrainbow/concepts/glossary.mdx | 4 ++-- .../concepts/label-sets-and-versioning.mdx | 2 +- .../concepts/typescript-interfaces.mdx | 4 ++-- .../docs/ensrainbow/contributing/index.mdx | 4 ++-- .../docs/ensrainbow/usage/configuration.mdx | 4 ++-- packages/ensnode-sdk/src/ensrainbow/types.ts | 5 +---- 10 files changed, 18 insertions(+), 32 deletions(-) diff --git a/apps/ensindexer/.env.local.example b/apps/ensindexer/.env.local.example index 73439e54b..cef0a6dbd 100644 --- a/apps/ensindexer/.env.local.example +++ b/apps/ensindexer/.env.local.example @@ -213,7 +213,7 @@ ENSRAINBOW_URL=http://localhost:3223 # https://ensnode.io/ensrainbow/usage/available-label-sets/ # # LABEL_SET_ID: The label set identifier that will be used for label healing requests sent to ENSRainbow. -# Each label set id references a collection of rainbow records. +# See https://ensnode.io/ensrainbow/concepts/glossary/ for definition of "label set". # This must match the label set ID configured in your ENSRainbow server. # # For full subgraph backwards compatibility, LABEL_SET_ID must be set to "subgraph" @@ -221,6 +221,7 @@ ENSRAINBOW_URL=http://localhost:3223 LABEL_SET_ID=subgraph # LABEL_SET_VERSION: A non-negative integer representing the version of the label set to request from ENSRainbow. +# See https://ensnode.io/ensrainbow/concepts/glossary/ for definition of "label set". 
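# For illustration only (values are the docs' running example, not part of this file):
#   LABEL_SET_ID=subgraph
#   LABEL_SET_VERSION=0
# With both set, every heal request targets version 0 of the "subgraph" label set,
# keeping heal responses deterministic across time as described above.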
# This "fully pins" ENSIndexer to a deterministic set of ENSRainbow label healing responses across time, # even if the connected ENSRainbow later ingests additional records into the same label set. # This must be less than or equal to the label set version configured in your ENSRainbow server. diff --git a/apps/ensrainbow/.env.local.example b/apps/ensrainbow/.env.local.example index dbc7cb067..4c9bf6386 100644 --- a/apps/ensrainbow/.env.local.example +++ b/apps/ensrainbow/.env.local.example @@ -27,13 +27,7 @@ DB_SCHEMA_VERSION=3 # --- Label Set ID --- # -# Definition: An identifier that categorizes the type of ENSRainbow data (e.g., a -# full production set vs. a minimal test set). -# -# Goal: To provide different "namespaces" or categories of ENSRainbow data. This allows -# ENSRainbow to serve different datasets for different use cases from the -# same Docker image, such as a tiny dataset for testing versus a -# comprehensive one for production. +# See https://ensnode.io/ensrainbow/concepts/glossary/ for definition of "label set". # # How to configure: Choose the Label Set ID that matches your needs. # - `subgraph`: The current production dataset. Use this for current production applications. @@ -44,13 +38,7 @@ LABEL_SET_ID=ens-test-env # --- Label Set Version --- # -# Definition: A non-negative integer representing an incremental update to a -# dataset within a specific `LABEL_SET_ID`. -# -# Goal: To support the deterministic evolution of datasets over time. As new -# ENS names are discovered, they are added in new, incremental versions. -# This allows services like ENSIndexer to achieve reproducible results -# by targeting a specific, immutable version of the data. +# See https://ensnode.io/ensrainbow/concepts/glossary/ for definition of "label set". # # How to configure: # - To get the most up-to-date data, use the highest available version number diff --git a/apps/ensrainbow/scripts/download-ensrainbow-files.sh b/apps/ensrainbow/scripts/download-ensrainbow-files.sh index 00d894399..ac84faa49 100755 --- a/apps/ensrainbow/scripts/download-ensrainbow-files.sh +++ b/apps/ensrainbow/scripts/download-ensrainbow-files.sh @@ -8,8 +8,8 @@ set -euo pipefail # This script downloads a specific ENSRainbow labelset file. # # The labelsets are identified by: -# - LABEL_SET_ID: The identifier for a label set, which is a collection of ENS labelhash-to-label mappings from a specific source. -# - LABEL_SET_VERSION: A non-negative integer representing the version of a label set. +# - LABEL_SET_ID: The identifier for a label set (see https://ensnode.io/ensrainbow/concepts/glossary/ for definition). +# - LABEL_SET_VERSION: A non-negative integer representing the version of a label set (see https://ensnode.io/ensrainbow/concepts/glossary/ for definition). # # This script requires these two identifiers as command-line arguments to # download the correct labelset file (.ensrainbow), its checksum, and a diff --git a/apps/ensrainbow/scripts/download-prebuilt-database.sh b/apps/ensrainbow/scripts/download-prebuilt-database.sh index e2f7a99ce..c2abdd948 100644 --- a/apps/ensrainbow/scripts/download-prebuilt-database.sh +++ b/apps/ensrainbow/scripts/download-prebuilt-database.sh @@ -13,8 +13,8 @@ set -euo pipefail # # The database is versioned using a three-part system: # - DB_SCHEMA_VERSION: The physical layout/structure of the database. -# - LABEL_SET_ID: The identifier for a label set, which is a collection of ENS labelhash-to-label mappings from a specific source. 
-# - LABEL_SET_VERSION: A non-negative integer representing the version of a label set. +# - LABEL_SET_ID: The identifier for a label set (see https://ensnode.io/ensrainbow/concepts/glossary/ for definition). +# - LABEL_SET_VERSION: A non-negative integer representing the version of a label set (see https://ensnode.io/ensrainbow/concepts/glossary/ for definition). # # This script requires these three identifiers as command-line arguments to # download the correct pre-built database archive (.tgz), its checksum, and a diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx index 46b487837..888222fe1 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx @@ -15,9 +15,9 @@ This page defines the **core terminology** used throughout the ENSRainbow codeba | **Labelhash** | `keccak256` hash of the UTF-8 bytes of a label (no pre-normalisation), represented as a **0x-prefixed 64-digit lowercase hex** string (32 bytes). | `0xaf2caa…03cc` | | **Heal** | The act of converting a _labelhash_ back to its original _label_ via a rainbow table lookup. | `heal('0xaf2c…') → 'vitalik'` | | **Rainbow Record** | An entry mapping a `labelhash` ➜ `label`. Persisted as a LevelDB key (labelhash bytes) and UTF-8 value (_see Data Model_). | – | -| **Label Set** | A logical collection of rainbow records that share a common **source** and **versioning** scheme (e.g. subgraph v0). Identified by `labelSetId` & `labelSetVersion`. | id: `subgraph`, version: `0` | +| **Label Set** | A logical collection of rainbow records (ENS labelhash-to-label mappings) that share a common **source** and **versioning** scheme. Each label set represents a dataset snapshot that enables deterministic healing across time. Label sets are identified by a `labelSetId` and `labelSetVersion`. | id: `subgraph`, version: `0` | | **Label Set ID** | String (1-50 chars) consisting of lowercase ASCII letters and hyphens that names a label set. | `subgraph`, `discovery-a` | -| **Label Set Version** | Non-negative integer that monotonically increases when new labelhash-to-label mappings are added to a label set. Each version contains incremental additions since the previous version. Enables deterministic healing across time. | `0`, `1`, `2` | +| **Label Set Version** | Non-negative integer that monotonically increases when new labelhash-to-label mappings are added to a label set. Each version contains incremental additions since the previous version. Version `0` is always the initial dataset. Enables deterministic healing across time by allowing clients to pin to specific versions for reproducible results. | `0`, `1`, `2` | | **Healable Count** | Total number of labels that can currently be healed by the running server. Exposed via `/count`. | `7 892 001` | | **Status Code** | High-level outcome of an API call – either `success` or `error`. | – | | **Error Code** | HTTP-style numeric code describing the error (`400`, `404`, `500`). 
| – | diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx index cc71e1517..4135009b6 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx @@ -9,7 +9,7 @@ keywords: [ensrainbow, versioning, label sets, deterministic] ## Why Label Sets & Versions? -A **label set** is analogous to a _dataset snapshot_. Every time the upstream data (e.g. an on-chain subgraph export) changes, we mint a new **label set version** so that: +A **label set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition) is analogous to a _dataset snapshot_. Every time the upstream data (e.g. an on-chain subgraph export) changes, we mint a new **label set version** so that: ### 1. Deterministic Results Clients that pin _version `N`_ are guaranteed to get the _exact same_ heal response today, tomorrow, and two years from now. diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/typescript-interfaces.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/typescript-interfaces.mdx index adc32470a..333ebcd0f 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/typescript-interfaces.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/typescript-interfaces.mdx @@ -7,7 +7,7 @@ sidebar: keywords: [ensrainbow, typescript, interfaces, types] --- -ENSRainbow's TypeScript APIs expose two companion interfaces that describe **which label sets are available (server-side) or requested (client-side)**. +ENSRainbow's TypeScript APIs expose two companion interfaces that describe **which label sets** (see [Glossary](/ensrainbow/concepts/glossary/) for definition) **are available (server-side) or requested (client-side)**. ## Server Label Set @@ -24,7 +24,7 @@ interface EnsRainbowServerLabelSet { #### Fields -- **`labelSetId`** identifies **which label set** the server is currently serving. +- **`labelSetId`** identifies **which label set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition) the server is currently serving. - **`highestLabelSetVersion`** is the **highest version** available through the server for the label set id. The server will not return labels from a version _greater_ than this value (unless it ingests another incremental label set). ## Client Label Set diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx index 401a0f986..216fd9ad2 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx @@ -185,11 +185,11 @@ When using ENSRainbow with Docker, the following environment variables control w - **Goal**: Ensures compatibility between the ENSRainbow software and the structure of downloaded database files that are prebuilt for startup-time optimizations. - **Configuration**: It is strongly recommended to use the latest available schema version unless you have specific compatibility requirements. -- **`LABEL_SET_ID`**: The identifier for a **Label Set**, which is a collection of ENS labelhash-to-label mappings from a specific source. +- **`LABEL_SET_ID`**: The identifier for a **Label Set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition). 
- **Goal**: To enable the extensible definition of new label sets (e.g., subgraph vs. production vs. test). - **Configuration**: See the [Available Label Sets](/ensrainbow/usage/available-label-sets) page for a complete list of currently available label set IDs and their descriptions. -- **`LABEL_SET_VERSION`**: A non-negative integer representing the version of a **Label Set**. +- **`LABEL_SET_VERSION`**: A non-negative integer representing the version of a **Label Set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition). - **Goal**: To support the deterministic evolution of datasets over time, allowing services to achieve reproducible results. - **Configuration**: Use the highest available version number for the most up-to-date data. Versions are sequential and incremental: - `0` - The initial/base version of the **Label Set**. diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/usage/configuration.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/usage/configuration.mdx index daed73df7..1e773b9e0 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/usage/configuration.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/usage/configuration.mdx @@ -33,12 +33,12 @@ These variables are **only used by shell scripts** for downloading and setting u - **Configuration**: It is strongly recommended to use the latest available schema version unless you have specific compatibility requirements. - **Used by**: Download scripts to fetch the correct database format. -- **`LABEL_SET_ID`**: The identifier for a **label set**, which is a collection of ENS labelhash-to-label mappings from a specific source. +- **`LABEL_SET_ID`**: The identifier for a **label set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition). - **Goal**: To enable the extensible definition of new label sets (e.g., subgraph vs. production vs. test). - **Configuration**: See the [Available Label Sets](/ensrainbow/usage/available-label-sets) page for a complete list of currently available label set IDs and their descriptions. - **Used by**: Download scripts to fetch the correct label set. -- **`LABEL_SET_VERSION`**: A non-negative integer representing the version of a **label set**. +- **`LABEL_SET_VERSION`**: A non-negative integer representing the version of a **label set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition). - **Goal**: To support the deterministic evolution of datasets over time, allowing services to achieve reproducible results. - **Configuration**: Use the highest available version number for the most up-to-date data. Versions are sequential and incremental: - `0` - The initial/base version of the **label set**. diff --git a/packages/ensnode-sdk/src/ensrainbow/types.ts b/packages/ensnode-sdk/src/ensrainbow/types.ts index e2a93019b..d59c1deb9 100644 --- a/packages/ensnode-sdk/src/ensrainbow/types.ts +++ b/packages/ensnode-sdk/src/ensrainbow/types.ts @@ -1,8 +1,5 @@ /** - * A label set ID identifies a set of labels that can be used for deterministic healing. - * A label set allows clients to deterministically heal their state against a server, - * ensuring that both are operating on the same version of data. - * + * A label set ID identifies a label set (see https://ensnode.io/ensrainbow/concepts/glossary/ for definition). * It is guaranteed to be 1 to 50 characters long and contain only lowercase letters (a-z) * and hyphens (-). 
*/ From 73376a7398b61eab941137798f51ec09793d9596 Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 22 Sep 2025 16:11:38 +0200 Subject: [PATCH 15/28] Update environment variable documentation and improve comments in scripts to enhance clarity on label sets and versions. Add glossary references for better understanding across multiple files. --- apps/ensindexer/.env.local.example | 18 +--- apps/ensrainbow/.env.local.example | 32 +------ .../scripts/download-ensrainbow-files.sh | 6 +- .../scripts/download-prebuilt-database.sh | 8 +- .../docs/ensrainbow/concepts/data-model.mdx | 2 +- .../docs/ensrainbow/concepts/glossary.mdx | 89 +++++++++++++++---- .../concepts/label-sets-and-versioning.mdx | 4 +- .../concepts/typescript-interfaces.mdx | 4 +- .../docs/ensrainbow/contributing/index.mdx | 4 +- .../docs/ensrainbow/usage/configuration.mdx | 4 +- packages/ensnode-sdk/src/ensrainbow/types.ts | 2 +- 11 files changed, 99 insertions(+), 74 deletions(-) diff --git a/apps/ensindexer/.env.local.example b/apps/ensindexer/.env.local.example index cef0a6dbd..0f56f8a95 100644 --- a/apps/ensindexer/.env.local.example +++ b/apps/ensindexer/.env.local.example @@ -206,28 +206,18 @@ PLUGINS=subgraph,basenames,lineanames,threedns,protocol-acceleration,registrars, ENSRAINBOW_URL=http://localhost:3223 # Pinned Label Set Configuration for requests to ENSRainbow +# ENSRainbow label set configuration (see https://ensnode.io/ensrainbow/usage/configuration for details) # Required. ENSIndexer must be pinned to a specific label set ID and version to ensure deterministic # indexing results across time. # # For a list of available label sets and their configurations, visit: # https://ensnode.io/ensrainbow/usage/available-label-sets/ # -# LABEL_SET_ID: The label set identifier that will be used for label healing requests sent to ENSRainbow. -# See https://ensnode.io/ensrainbow/concepts/glossary/ for definition of "label set". -# This must match the label set ID configured in your ENSRainbow server. -# -# For full subgraph backwards compatibility, LABEL_SET_ID must be set to "subgraph" -# and LABEL_SET_VERSION must be set to 0. +# LABEL_SET_ID: see https://ensnode.io/ensrainbow/concepts/glossary#label-set-id. LABEL_SET_ID=subgraph -# LABEL_SET_VERSION: A non-negative integer representing the version of the label set to request from ENSRainbow. -# See https://ensnode.io/ensrainbow/concepts/glossary/ for definition of "label set". -# This "fully pins" ENSIndexer to a deterministic set of ENSRainbow label healing responses across time, -# even if the connected ENSRainbow later ingests additional records into the same label set. -# This must be less than or equal to the label set version configured in your ENSRainbow server. -# -# For full subgraph backwards compatibility, LABEL_SET_ID must be set to "subgraph" -# and LABEL_SET_VERSION must be set to 0. +# LABEL_SET_VERSION: see https://ensnode.io/ensrainbow/concepts/ +glossary#label-set-version. LABEL_SET_VERSION=0 # The "primary" ENSIndexer service URL diff --git a/apps/ensrainbow/.env.local.example b/apps/ensrainbow/.env.local.example index 4c9bf6386..7eb88231b 100644 --- a/apps/ensrainbow/.env.local.example +++ b/apps/ensrainbow/.env.local.example @@ -12,37 +12,11 @@ LOG_LEVEL=info # container will download on its first startup. This approach keeps the Docker # image small and allows data to be updated independently of the application code. -# --- Database Schema Version --- -# -# Definition: Specifies the version of the physical database layout. 
It is not -# related to the application's API version. -# -# Goal: Ensures the application's code is compatible with the on-disk database -# format. Think of it like a migration version for a traditional database. -# -# How to configure: You should almost always use the latest available version -# (e.g., `3`) to run the most recent and efficient database structure. Only use -# an older version if you have a specific, legacy system requirement. +# Database schema version (see https://ensnode.io/ensrainbow/usage/configuration for details) DB_SCHEMA_VERSION=3 -# --- Label Set ID --- -# -# See https://ensnode.io/ensrainbow/concepts/glossary/ for definition of "label set". -# -# How to configure: Choose the Label Set ID that matches your needs. -# - `subgraph`: The current production dataset. Use this for current production applications. -# - `ens-test-env`: A very small, lightweight dataset used for testing with the ens-test-env. -# - `searchlight`: (Future) An extended dataset with additional label discoveries. -# - `discovery-a`: (Future) A dataset for dynamically discovered labels. +# Label set ID (see https://ensnode.io/ensrainbow/concepts/glossary#label-set-id) LABEL_SET_ID=ens-test-env -# --- Label Set Version --- -# -# See https://ensnode.io/ensrainbow/concepts/glossary/ for definition of "label set". -# -# How to configure: -# - To get the most up-to-date data, use the highest available version number -# for your chosen `LABEL_SET_ID`. Versions start at `0`. -# - If you need to reproduce a result from an earlier point in time, you would -# use a specific, older version number. +# Label set version (see https://ensnode.io/ensrainbow/concepts/glossary#label-set-version) LABEL_SET_VERSION=0 diff --git a/apps/ensrainbow/scripts/download-ensrainbow-files.sh b/apps/ensrainbow/scripts/download-ensrainbow-files.sh index ac84faa49..bf0c2245f 100755 --- a/apps/ensrainbow/scripts/download-ensrainbow-files.sh +++ b/apps/ensrainbow/scripts/download-ensrainbow-files.sh @@ -8,8 +8,10 @@ set -euo pipefail # This script downloads a specific ENSRainbow labelset file. # # The labelsets are identified by: -# - LABEL_SET_ID: The identifier for a label set (see https://ensnode.io/ensrainbow/concepts/glossary/ for definition). -# - LABEL_SET_VERSION: A non-negative integer representing the version of a label set (see https://ensnode.io/ensrainbow/concepts/glossary/ for definition). +# - LABEL_SET_ID +# - LABEL_SET_VERSION +# +# See https://ensnode.io/ensrainbow/concepts/glossary/ for details. # # This script requires these two identifiers as command-line arguments to # download the correct labelset file (.ensrainbow), its checksum, and a diff --git a/apps/ensrainbow/scripts/download-prebuilt-database.sh b/apps/ensrainbow/scripts/download-prebuilt-database.sh index c2abdd948..af49d08e2 100644 --- a/apps/ensrainbow/scripts/download-prebuilt-database.sh +++ b/apps/ensrainbow/scripts/download-prebuilt-database.sh @@ -12,9 +12,11 @@ set -euo pipefail # saving the end-user from a slow and resource-intensive data ingestion process. # # The database is versioned using a three-part system: -# - DB_SCHEMA_VERSION: The physical layout/structure of the database. -# - LABEL_SET_ID: The identifier for a label set (see https://ensnode.io/ensrainbow/concepts/glossary/ for definition). -# - LABEL_SET_VERSION: A non-negative integer representing the version of a label set (see https://ensnode.io/ensrainbow/concepts/glossary/ for definition). 
+# - DB_SCHEMA_VERSION +# - LABEL_SET_ID +# - LABEL_SET_VERSION +# +# See https://ensnode.io/ensrainbow/concepts/glossary/ for details. # # This script requires these three identifiers as command-line arguments to # download the correct pre-built database archive (.tgz), its checksum, and a diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx index e1df686d0..51eb7aabb 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx @@ -136,6 +136,6 @@ graph LR; ## Related Documentation -- **[Glossary](/ensrainbow/concepts/glossary)** - Key terms like System Key, Ingestion, etc. +- **[Glossary](/ensrainbow/concepts/glossary)** - Key terms like [System Key](/ensrainbow/concepts/glossary#system-key), [Ingestion](/ensrainbow/concepts/glossary#ingestion), etc. - **[Label Sets & Versioning](/ensrainbow/concepts/label-sets-and-versioning)** - Understanding the versioning system - **[TypeScript Interfaces](/ensrainbow/concepts/typescript-interfaces)** - Type definitions for working with the data diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx index 888222fe1..a18c46570 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx @@ -9,22 +9,79 @@ keywords: [ensrainbow, glossary, terminology, definitions] This page defines the **core terminology** used throughout the ENSRainbow codebase and documentation. If you notice an unfamiliar word elsewhere in the docs, come back to this page – it is probably defined here. -| Term | Definition | Example | -|------|------------|---------| -| **Label** | A single component of an ENS name (characters between two dots). Can contain **any** valid UTF-8 string – it may or may not be ENS-normalised. | `vitalik`, `😺`, `example.eth` has labels `example` & `eth` | -| **Labelhash** | `keccak256` hash of the UTF-8 bytes of a label (no pre-normalisation), represented as a **0x-prefixed 64-digit lowercase hex** string (32 bytes). | `0xaf2caa…03cc` | -| **Heal** | The act of converting a _labelhash_ back to its original _label_ via a rainbow table lookup. | `heal('0xaf2c…') → 'vitalik'` | -| **Rainbow Record** | An entry mapping a `labelhash` ➜ `label`. Persisted as a LevelDB key (labelhash bytes) and UTF-8 value (_see Data Model_). | – | -| **Label Set** | A logical collection of rainbow records (ENS labelhash-to-label mappings) that share a common **source** and **versioning** scheme. Each label set represents a dataset snapshot that enables deterministic healing across time. Label sets are identified by a `labelSetId` and `labelSetVersion`. | id: `subgraph`, version: `0` | -| **Label Set ID** | String (1-50 chars) consisting of lowercase ASCII letters and hyphens that names a label set. | `subgraph`, `discovery-a` | -| **Label Set Version** | Non-negative integer that monotonically increases when new labelhash-to-label mappings are added to a label set. Each version contains incremental additions since the previous version. Version `0` is always the initial dataset. Enables deterministic healing across time by allowing clients to pin to specific versions for reproducible results. | `0`, `1`, `2` | -| **Healable Count** | Total number of labels that can currently be healed by the running server. 
Exposed via `/count`. | `7 892 001` | -| **Status Code** | High-level outcome of an API call – either `success` or `error`. | – | -| **Error Code** | HTTP-style numeric code describing the error (`400`, `404`, `500`). | – | -| **Rainbow Table** | A pre-computed set of `labelhash → label` pairs used for healing. | – | -| **Ingestion** | One-off process that streams a `.ensrainbow` snapshot into LevelDB. | `pnpm run ingest subgraph_0.ensrainbow` | -| **System Key** | Special LevelDB key (length ≠ 32 bytes) storing metadata such as schema version, label set id, etc. | `0xff 0xff 0xff 0xfd` | -| **ENS Normalization** | The ENSIP-15 canonicalisation process; ENSRainbow stores labels **as-is**, even if not normalised. | – | +## Label + +A single component of an ENS name (characters between two dots). Can contain **any** valid UTF-8 string – it may or may not be ENS-normalised. + +**Example:** `vitalik`, `😺`, `example.eth` has labels `example` & `eth` + +## Labelhash + +`keccak256` hash of the UTF-8 bytes of a label (no pre-normalisation), represented as a **0x-prefixed 64-digit lowercase hex** string (32 bytes). + +**Example:** `0xaf2caa…03cc` + +## Heal + +The act of converting a _labelhash_ back to its original _label_ via a rainbow table lookup. + +**Example:** `heal('0xaf2c…') → 'vitalik'` + +## Rainbow Record + +An entry mapping a `labelhash` ➜ `label`. Persisted as a LevelDB key (labelhash bytes) and UTF-8 value (_see Data Model_). + +## Label Set + +A logical collection of [rainbow records](#rainbow-record) that share a common **source** and **versioning** scheme. Each label set represents a dataset snapshot that enables deterministic healing across time. Label sets are identified by a `labelSetId` and `labelSetVersion`. + +**Example:** id: `subgraph`, version: `0` + +## Label Set ID + +String (1-50 chars) consisting of lowercase ASCII letters and hyphens that names a label set. + +**Example:** `subgraph`, `discovery-a` + +## Label Set Version + +Non-negative integer that monotonically increases when new labelhash-to-label mappings are added to a label set. Each version contains incremental additions since the previous version. Version `0` is always the initial dataset. Enables deterministic healing across time by allowing clients to pin to specific versions for reproducible results. + +**Example:** `0`, `1`, `2` + +## Healable Count + +Total number of labels that can currently be healed by the running server. Exposed via `/count`. + +**Example:** `7 892 001` + +## Status Code + +High-level outcome of an API call – either `success` or `error`. + +## Error Code + +HTTP-style numeric code describing the error (`400`, `404`, `500`). + +## Rainbow Table + +A pre-computed set of `labelhash → label` pairs used for healing. + +## Ingestion + +One-off process that streams a `.ensrainbow` snapshot into LevelDB. + +**Example:** `pnpm run ingest subgraph_0.ensrainbow` + +## System Key + +Special LevelDB key (length ≠ 32 bytes) storing metadata such as schema version, label set id, etc. + +**Example:** `0xff 0xff 0xff 0xfd` + +## ENS Normalization + +The ENSIP-15 canonicalisation process; ENSRainbow stores labels **as-is**, even if not normalised. 
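+
+## Worked Example
+
+A minimal sketch tying **Label**, **Labelhash**, and **Heal** together. It assumes viem's `labelhash` helper and an ENSRainbow server listening locally on port `3223` with a `/v1/heal` endpoint; the response handling is simplified for illustration.
+
+```ts
+import { labelhash } from "viem";
+
+// keccak256 of the UTF-8 bytes of the label, with no pre-normalisation:
+const hash = labelhash("vitalik");
+// → 0xaf2caa1c2ca1d027f1ac823b529d0a67cd144264b2789fa2ea4d63a67c7103cc
+
+// Heal it back to the original label via a running ENSRainbow server:
+const res = await fetch(`http://localhost:3223/v1/heal/${hash}`);
+console.log(await res.json()); // on success, the healed label is "vitalik"
+```
+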
## Related Documentation diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx index 4135009b6..5c05f3a53 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx @@ -9,7 +9,7 @@ keywords: [ensrainbow, versioning, label sets, deterministic] ## Why Label Sets & Versions? -A **label set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition) is analogous to a _dataset snapshot_. Every time the upstream data (e.g. an on-chain subgraph export) changes, we mint a new **label set version** so that: +A **[label set](/ensrainbow/concepts/glossary#label-set)** is analogous to a _dataset snapshot_. Every time the upstream data (e.g. an on-chain subgraph export) changes, we mint a new **[label set version](/ensrainbow/concepts/glossary#label-set-version)** so that: ### 1. Deterministic Results Clients that pin _version `N`_ are guaranteed to get the _exact same_ heal response today, tomorrow, and two years from now. @@ -51,6 +51,6 @@ This flexibility ensures applications can choose between **staying current** wit ## Related Documentation -- **[Glossary](/ensrainbow/concepts/glossary)** - Key terminology including version-related terms +- **[Glossary](/ensrainbow/concepts/glossary)** - Key terminology including [label set](/ensrainbow/concepts/glossary#label-set) and [label set version](/ensrainbow/concepts/glossary#label-set-version) terms - **[TypeScript Interfaces](/ensrainbow/concepts/typescript-interfaces)** - Type definitions for ENSRainbow's server and client - **[Data Model](/ensrainbow/concepts/data-model)** - How versions are stored in the database diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/typescript-interfaces.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/typescript-interfaces.mdx index 333ebcd0f..ac7e4b95e 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/typescript-interfaces.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/typescript-interfaces.mdx @@ -7,7 +7,7 @@ sidebar: keywords: [ensrainbow, typescript, interfaces, types] --- -ENSRainbow's TypeScript APIs expose two companion interfaces that describe **which label sets** (see [Glossary](/ensrainbow/concepts/glossary/) for definition) **are available (server-side) or requested (client-side)**. +ENSRainbow's TypeScript APIs expose two companion interfaces that describe **which [label sets](/ensrainbow/concepts/glossary#label-set)** **are available (server-side) or requested (client-side)**. ## Server Label Set @@ -24,7 +24,7 @@ interface EnsRainbowServerLabelSet { #### Fields -- **`labelSetId`** identifies **which label set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition) the server is currently serving. +- **`labelSetId`** identifies **which [label set](/ensrainbow/concepts/glossary#label-set)** the server is currently serving. - **`highestLabelSetVersion`** is the **highest version** available through the server for the label set id. The server will not return labels from a version _greater_ than this value (unless it ingests another incremental label set). 
## Client Label Set diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx index 216fd9ad2..5151600cb 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx @@ -185,11 +185,11 @@ When using ENSRainbow with Docker, the following environment variables control w - **Goal**: Ensures compatibility between the ENSRainbow software and the structure of downloaded database files that are prebuilt for startup-time optimizations. - **Configuration**: It is strongly recommended to use the latest available schema version unless you have specific compatibility requirements. -- **`LABEL_SET_ID`**: The identifier for a **Label Set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition). +- **`LABEL_SET_ID`**: See **[Label Set ID](/ensrainbow/concepts/glossary#label-set-id)**. - **Goal**: To enable the extensible definition of new label sets (e.g., subgraph vs. production vs. test). - **Configuration**: See the [Available Label Sets](/ensrainbow/usage/available-label-sets) page for a complete list of currently available label set IDs and their descriptions. -- **`LABEL_SET_VERSION`**: A non-negative integer representing the version of a **Label Set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition). +- **`LABEL_SET_VERSION`**: See **[Label Set Version](/ensrainbow/concepts/glossary#label-set-version)**. - **Goal**: To support the deterministic evolution of datasets over time, allowing services to achieve reproducible results. - **Configuration**: Use the highest available version number for the most up-to-date data. Versions are sequential and incremental: - `0` - The initial/base version of the **Label Set**. diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/usage/configuration.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/usage/configuration.mdx index 1e773b9e0..a69743a00 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/usage/configuration.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/usage/configuration.mdx @@ -33,12 +33,12 @@ These variables are **only used by shell scripts** for downloading and setting u - **Configuration**: It is strongly recommended to use the latest available schema version unless you have specific compatibility requirements. - **Used by**: Download scripts to fetch the correct database format. -- **`LABEL_SET_ID`**: The identifier for a **label set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition). +- **`LABEL_SET_ID`**: See **[Label Set ID](/ensrainbow/concepts/glossary#label-set-id)**. - **Goal**: To enable the extensible definition of new label sets (e.g., subgraph vs. production vs. test). - **Configuration**: See the [Available Label Sets](/ensrainbow/usage/available-label-sets) page for a complete list of currently available label set IDs and their descriptions. - **Used by**: Download scripts to fetch the correct label set. -- **`LABEL_SET_VERSION`**: A non-negative integer representing the version of a **label set** (see [Glossary](/ensrainbow/concepts/glossary/) for definition). +- **`LABEL_SET_VERSION`**: See **[Label Set Version](/ensrainbow/concepts/glossary#label-set-version)**. - **Goal**: To support the deterministic evolution of datasets over time, allowing services to achieve reproducible results. - **Configuration**: Use the highest available version number for the most up-to-date data. 
Versions are sequential and incremental: - `0` - The initial/base version of the **label set**. diff --git a/packages/ensnode-sdk/src/ensrainbow/types.ts b/packages/ensnode-sdk/src/ensrainbow/types.ts index d59c1deb9..d7d8ca32b 100644 --- a/packages/ensnode-sdk/src/ensrainbow/types.ts +++ b/packages/ensnode-sdk/src/ensrainbow/types.ts @@ -1,5 +1,5 @@ /** - * A label set ID identifies a label set (see https://ensnode.io/ensrainbow/concepts/glossary/ for definition). + * A label set ID identifies a label set (see https://ensnode.io/ensrainbow/concepts/glossary#label-set for definition). * It is guaranteed to be 1 to 50 characters long and contain only lowercase letters (a-z) * and hyphens (-). */ From 62d87a54b40f884f2e57b0b206674fa81a85a56a Mon Sep 17 00:00:00 2001 From: djstrong Date: Sat, 27 Sep 2025 22:11:24 +0200 Subject: [PATCH 16/28] Update glossary with environment variable definitions and enhance descriptions in Terraform variables for clarity on label sets and versions. Add references to the glossary for improved understanding. --- .../docs/ensrainbow/concepts/glossary.mdx | 36 +++++++++++++++---- terraform/variables.tf | 8 ++--- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx index a18c46570..f4fe120ed 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx @@ -9,41 +9,41 @@ keywords: [ensrainbow, glossary, terminology, definitions] This page defines the **core terminology** used throughout the ENSRainbow codebase and documentation. If you notice an unfamiliar word elsewhere in the docs, come back to this page – it is probably defined here. -## Label +## Label {#label} A single component of an ENS name (characters between two dots). Can contain **any** valid UTF-8 string – it may or may not be ENS-normalised. **Example:** `vitalik`, `😺`, `example.eth` has labels `example` & `eth` -## Labelhash +## Labelhash {#labelhash} `keccak256` hash of the UTF-8 bytes of a label (no pre-normalisation), represented as a **0x-prefixed 64-digit lowercase hex** string (32 bytes). **Example:** `0xaf2caa…03cc` -## Heal +## Heal {#heal} The act of converting a _labelhash_ back to its original _label_ via a rainbow table lookup. **Example:** `heal('0xaf2c…') → 'vitalik'` -## Rainbow Record +## Rainbow Record {#rainbow-record} An entry mapping a `labelhash` ➜ `label`. Persisted as a LevelDB key (labelhash bytes) and UTF-8 value (_see Data Model_). -## Label Set +## Label Set {#label-set} A logical collection of [rainbow records](#rainbow-record) that share a common **source** and **versioning** scheme. Each label set represents a dataset snapshot that enables deterministic healing across time. Label sets are identified by a `labelSetId` and `labelSetVersion`. **Example:** id: `subgraph`, version: `0` -## Label Set ID +## Label Set ID {#label-set-id} String (1-50 chars) consisting of lowercase ASCII letters and hyphens that names a label set. **Example:** `subgraph`, `discovery-a` -## Label Set Version +## Label Set Version {#label-set-version} Non-negative integer that monotonically increases when new labelhash-to-label mappings are added to a label set. Each version contains incremental additions since the previous version. Version `0` is always the initial dataset. 
Enables deterministic healing across time by allowing clients to pin to specific versions for reproducible results. @@ -83,6 +83,28 @@ Special LevelDB key (length ≠ 32 bytes) storing metadata such as schema versio The ENSIP-15 canonicalisation process; ENSRainbow stores labels **as-is**, even if not normalised. +## Environment Variables + +### LABEL_SET_ID {#label-set-id-env} + +Environment variable that specifies the identifier for a [label set](#label-set). See [Label Set ID](#label-set-id) for the definition of this identifier. + +**Used by:** Download scripts and Docker entrypoint to fetch the correct label set + +### LABEL_SET_VERSION {#label-set-version-env} + +Environment variable that specifies the version of a [label set](#label-set). See [Label Set Version](#label-set-version) for the definition of this version number. + +**Used by:** Download scripts and Docker entrypoint to fetch the correct label set version + +### LOG_LEVEL {#log-level-env} + +Environment variable that controls the verbosity of logging output. + +**Format:** String +**Valid Values:** `fatal`, `error`, `warn`, `info`, `debug`, `trace`, `silent` +**Default:** `info` + ## Related Documentation - **[Label Sets & Versioning](/ensrainbow/concepts/label-sets-and-versioning)** - Understanding the versioning system diff --git a/terraform/variables.tf b/terraform/variables.tf index c3bb1bb6b..29fb78603 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -43,21 +43,21 @@ variable "quicknode_endpoint_name" { # The "fully pinned" label set reference that ENSIndexer will request ENSRainbow use for deterministic label healing across time. This label set reference is "fully pinned" as it requires both the labelSetId and labelSetVersion fields to be defined. variable "ensindexer_label_set_id" { type = string - description = "The label set ID that ENSIndexer will request from ENSRainbow for deterministic label healing (e.g., 'subgraph', 'ens-test-env')" + description = "The label set ID that ENSIndexer will request from ENSRainbow for deterministic label healing. See https://ensnode.io/ensrainbow/concepts/glossary/#label-set-id-env for definition." } variable "ensindexer_label_set_version" { type = string - description = "The label set version that ENSIndexer will request from ENSRainbow for deterministic label healing (e.g., '0', '1')" + description = "The label set version that ENSIndexer will request from ENSRainbow for deterministic label healing. See https://ensnode.io/ensrainbow/concepts/glossary/#label-set-version-env for definition." } # Label set that ENSRainbow will offer to its clients variable "ensrainbow_label_set_id" { type = string - description = "The label set ID that ENSRainbow will offer to its clients (e.g., 'subgraph', 'ens-test-env')" + description = "The label set ID that ENSRainbow will offer to its clients. See https://ensnode.io/ensrainbow/concepts/glossary/#label-set-id-env for definition." } variable "ensrainbow_label_set_version" { type = string - description = "The highest label set version that ENSRainbow will offer to its clients (e.g., '0', '1')" + description = "The highest label set version that ENSRainbow will offer to its clients. See https://ensnode.io/ensrainbow/concepts/glossary/#label-set-version-env for definition." 
} From e954fa188eebcbc2bdf6099560ce3da18852fd01 Mon Sep 17 00:00:00 2001 From: djstrong Date: Sat, 27 Sep 2025 22:21:59 +0200 Subject: [PATCH 17/28] Update terminology documentation to clarify the definition of LabelHash and add references to the ENSRainbow Glossary for comprehensive understanding. --- docs/ensnode.io/src/content/docs/docs/reference/terminology.mdx | 2 +- packages/ensnode-sdk/src/ens/types.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/ensnode.io/src/content/docs/docs/reference/terminology.mdx b/docs/ensnode.io/src/content/docs/docs/reference/terminology.mdx index 72ad23859..62508cdf0 100644 --- a/docs/ensnode.io/src/content/docs/docs/reference/terminology.mdx +++ b/docs/ensnode.io/src/content/docs/docs/reference/terminology.mdx @@ -89,7 +89,7 @@ When rendering a **name**, one must take care to differentiate between an _unkno ### LabelHash, labelhash function -In this terminology reference, we say that the **LabelHash** of a **Label** is the 32-byte hashed result of calling the **`labelhash` function** with that **Label** as input. +In this terminology reference, we say that the **LabelHash** of a **Label** is the result of calling the **`labelhash` function** with that **Label** as input. For the complete technical definition, see the [ENSRainbow Glossary](/docs/ensrainbow/concepts/glossary#labelhash). That is, `0xaf2caa1c2ca1d027f1ac823b529d0a67cd144264b2789fa2ea4d63a67c7103cc` is the **LabelHash** of `vitalik`, which is the result of calling the **`labelhash` function** like so: diff --git a/packages/ensnode-sdk/src/ens/types.ts b/packages/ensnode-sdk/src/ens/types.ts index 6321f2bc5..1ea876da2 100644 --- a/packages/ensnode-sdk/src/ens/types.ts +++ b/packages/ensnode-sdk/src/ens/types.ts @@ -47,6 +47,7 @@ export type NormalizedName = Name & { __brand: "NormalizedName" }; * * @see https://docs.ens.domains/terminology#labelhash * @see https://ensnode.io/docs/reference/terminology#labels-labelhashes-labelhash-function + * @see https://docs.ensnode.io/docs/ensrainbow/concepts/glossary#labelhash */ export type LabelHash = Hex; From b58612b1b5d89b42c15612fbb9909106cc9bf5a0 Mon Sep 17 00:00:00 2001 From: djstrong Date: Sat, 27 Sep 2025 22:22:49 +0200 Subject: [PATCH 18/28] Enhance documentation by adding glossary references for the term "heal" in the ENSRainbow overview and API client comments, improving clarity and user understanding. --- docs/ensnode.io/src/content/docs/ensrainbow/index.mdx | 6 +++--- packages/ensrainbow-sdk/src/client.ts | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/index.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/index.mdx index 7b13badcf..a6cfb44f1 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/index.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/index.mdx @@ -1,6 +1,6 @@ --- title: What is ENSRainbow? -description: ENSRainbow is an open-source service that heals unknown ENS names by converting encoded labelhashes back to human-readable labels. +description: ENSRainbow is an open-source service that [heals](/docs/ensrainbow/concepts/glossary#heal) unknown ENS names by converting encoded labelhashes back to human-readable labels. sidebar: label: Overview order: 1 @@ -18,11 +18,11 @@ The ENS Registry allows subnames to be created onchain without revealing onchain ## How ENSRainbow Helps -ENSRainbow significantly improves "name healing" coverage compared to relying solely on services like the ENS Subgraph. 
Its goal is to heal as many ENS names as possible, minimizing the probability that end-users encounter unknown labels. +ENSRainbow significantly improves "name [healing](/docs/ensrainbow/concepts/glossary#heal)" coverage compared to relying solely on services like the ENS Subgraph. Its goal is to heal as many ENS names as possible, minimizing the probability that end-users encounter unknown labels. Key aspects of ENSRainbow include: -* **Resolving Encoded Labelhashes:** It translates cryptic labelhashes into human-readable labels. +* **Resolving Encoded Labelhashes:** It translates cryptic labelhashes into human-readable labels via [healing](/docs/ensrainbow/concepts/glossary#heal). * **Sidecar to ENSNode:** It integrates with ENSNode to provide deterministic name healing across time. * **Improved Healing Coverage:** Aims to minimize the probability that end-users encounter unknown labels, far exceeding previous capabilities. * **Extensible and Decentralized Data Management:** Uses a label set ID and label set version system for enabling any number of different label sets (collections of rainbow records) to support incremental updates across time. diff --git a/packages/ensrainbow-sdk/src/client.ts b/packages/ensrainbow-sdk/src/client.ts index 7d599b71c..9c6d3d9aa 100644 --- a/packages/ensrainbow-sdk/src/client.ts +++ b/packages/ensrainbow-sdk/src/client.ts @@ -242,7 +242,7 @@ export class EnsRainbowApiClient implements EnsRainbow.ApiClient { } /** - * Attempt to heal a labelHash to its original label. + * Attempt to [heal](/docs/ensrainbow/concepts/glossary#heal) a labelHash to its original label. * * Note on returned labels: ENSRainbow returns labels exactly as they are * represented in source rainbow table data. This means: From f540f70cc72af5c2245041913d9488a2b1b9ccf1 Mon Sep 17 00:00:00 2001 From: djstrong Date: Sat, 27 Sep 2025 22:33:29 +0200 Subject: [PATCH 19/28] Enhance documentation by adding glossary references for "rainbow record" in multiple files, improving clarity and user understanding of the data model and related concepts. --- apps/ensrainbow/src/lib/database.ts | 2 ++ apps/ensrainbow/src/lib/rainbow-record.ts | 2 +- apps/ensrainbow/src/utils/rainbow-record.ts | 3 +++ .../src/content/docs/ensrainbow/concepts/data-model.mdx | 3 +++ 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/apps/ensrainbow/src/lib/database.ts b/apps/ensrainbow/src/lib/database.ts index fcd154063..ec2195db6 100644 --- a/apps/ensrainbow/src/lib/database.ts +++ b/apps/ensrainbow/src/lib/database.ts @@ -84,6 +84,8 @@ export function isRainbowRecordKey(key: ByteArray): boolean { /** * Type representing the ENSRainbow LevelDB database. * + * For user-facing documentation, see the [Data Model documentation](/docs/ensrainbow/concepts/data-model). + * * Schema: * - Keys are binary encoded and represent: * - For rainbow records: The raw bytes of the ENS labelHash. Always a byte length of 32. diff --git a/apps/ensrainbow/src/lib/rainbow-record.ts b/apps/ensrainbow/src/lib/rainbow-record.ts index bbdaf1740..d68fdee06 100644 --- a/apps/ensrainbow/src/lib/rainbow-record.ts +++ b/apps/ensrainbow/src/lib/rainbow-record.ts @@ -3,7 +3,7 @@ import { buildLabelSetVersion, type Label, type LabelSetVersion } from "@ensnode import { getErrorMessage } from "@/utils/error-utils"; /** - * A versioned rainbow record. + * A versioned [rainbow record](/docs/ensrainbow/concepts/glossary#rainbow-record). 
*/ export interface VersionedRainbowRecord { /** The original label string */ diff --git a/apps/ensrainbow/src/utils/rainbow-record.ts b/apps/ensrainbow/src/utils/rainbow-record.ts index 6f8d7e270..4031b73a6 100644 --- a/apps/ensrainbow/src/utils/rainbow-record.ts +++ b/apps/ensrainbow/src/utils/rainbow-record.ts @@ -3,6 +3,9 @@ import type { ByteArray } from "viem"; import type { LabelHash } from "@ensnode/ensnode-sdk"; import { labelHashToBytes } from "@ensnode/ensnode-sdk"; +/** + * A [rainbow record](/docs/ensrainbow/concepts/glossary#rainbow-record) interface. + */ export interface RainbowRecord { labelHash: ByteArray; label: string; diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx index 51eb7aabb..6250bdd96 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx @@ -19,6 +19,8 @@ ENSRainbow stores its rainbow table in a **LevelDB** database. The schema is int ## Versioned Rainbow Record +A [rainbow record](/docs/ensrainbow/concepts/glossary#rainbow-record) with versioning information: + ```ts interface VersionedRainbowRecord { label: string; // original label @@ -58,6 +60,7 @@ Protocol Buffers is Google's language-neutral, platform-neutral extensible mecha The `.ensrainbow` file format uses two main message types: ```protobuf +// Protobuf serialization format for [rainbow records](/docs/ensrainbow/concepts/glossary#rainbow-record) message RainbowRecord { bytes labelhash = 1; // 32-byte labelhash string label = 2; // original label string From 6946cc1e53c5c52ec7073fb503972330e42e7b40 Mon Sep 17 00:00:00 2001 From: djstrong Date: Sat, 27 Sep 2025 23:26:31 +0200 Subject: [PATCH 20/28] Refine glossary documentation by correcting formatting for LABEL_SET_VERSION references and enhancing clarity in environment variable definitions. Update examples for consistency and improve overall readability. --- apps/ensindexer/.env.local.example | 3 +-- .../docs/ensrainbow/concepts/glossary.mdx | 22 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/apps/ensindexer/.env.local.example b/apps/ensindexer/.env.local.example index 0f56f8a95..e6a4eb99f 100644 --- a/apps/ensindexer/.env.local.example +++ b/apps/ensindexer/.env.local.example @@ -216,8 +216,7 @@ ENSRAINBOW_URL=http://localhost:3223 # LABEL_SET_ID: see https://ensnode.io/ensrainbow/concepts/glossary#label-set-id. LABEL_SET_ID=subgraph -# LABEL_SET_VERSION: see https://ensnode.io/ensrainbow/concepts/ -glossary#label-set-version. +# LABEL_SET_VERSION: see https://ensnode.io/ensrainbow/concepts/glossary#label-set-version. LABEL_SET_VERSION=0 # The "primary" ENSIndexer service URL diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx index f4fe120ed..d188c0cd1 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx @@ -9,41 +9,41 @@ keywords: [ensrainbow, glossary, terminology, definitions] This page defines the **core terminology** used throughout the ENSRainbow codebase and documentation. If you notice an unfamiliar word elsewhere in the docs, come back to this page – it is probably defined here. -## Label {#label} +## Label A single component of an ENS name (characters between two dots). 
Can contain **any** valid UTF-8 string – it may or may not be ENS-normalised. **Example:** `vitalik`, `😺`, `example.eth` has labels `example` & `eth` -## Labelhash {#labelhash} +## Labelhash `keccak256` hash of the UTF-8 bytes of a label (no pre-normalisation), represented as a **0x-prefixed 64-digit lowercase hex** string (32 bytes). **Example:** `0xaf2caa…03cc` -## Heal {#heal} +## Heal The act of converting a _labelhash_ back to its original _label_ via a rainbow table lookup. **Example:** `heal('0xaf2c…') → 'vitalik'` -## Rainbow Record {#rainbow-record} +## Rainbow Record An entry mapping a `labelhash` ➜ `label`. Persisted as a LevelDB key (labelhash bytes) and UTF-8 value (_see Data Model_). -## Label Set {#label-set} +## Label Set A logical collection of [rainbow records](#rainbow-record) that share a common **source** and **versioning** scheme. Each label set represents a dataset snapshot that enables deterministic healing across time. Label sets are identified by a `labelSetId` and `labelSetVersion`. **Example:** id: `subgraph`, version: `0` -## Label Set ID {#label-set-id} +## Label Set ID String (1-50 chars) consisting of lowercase ASCII letters and hyphens that names a label set. **Example:** `subgraph`, `discovery-a` -## Label Set Version {#label-set-version} +## Label Set Version Non-negative integer that monotonically increases when new labelhash-to-label mappings are added to a label set. Each version contains incremental additions since the previous version. Version `0` is always the initial dataset. Enables deterministic healing across time by allowing clients to pin to specific versions for reproducible results. @@ -85,19 +85,21 @@ The ENSIP-15 canonicalisation process; ENSRainbow stores labels **as-is**, even ## Environment Variables -### LABEL_SET_ID {#label-set-id-env} + +### LABEL_SET_ID Environment variable that specifies the identifier for a [label set](#label-set). See [Label Set ID](#label-set-id) for the definition of this identifier. **Used by:** Download scripts and Docker entrypoint to fetch the correct label set -### LABEL_SET_VERSION {#label-set-version-env} + +### LABEL_SET_VERSION Environment variable that specifies the version of a [label set](#label-set). See [Label Set Version](#label-set-version) for the definition of this version number. **Used by:** Download scripts and Docker entrypoint to fetch the correct label set version -### LOG_LEVEL {#log-level-env} +### LOG_LEVEL Environment variable that controls the verbosity of logging output. From 25e03d9690ffcff0ea38ef870c14e5d8b14709c3 Mon Sep 17 00:00:00 2001 From: djstrong Date: Sat, 27 Sep 2025 23:37:37 +0200 Subject: [PATCH 21/28] Update glossary and Terraform variable descriptions to correct label set reference links, ensuring consistency and clarity in documentation. Enhance user understanding of environment variables related to label sets. 
--- .../src/content/docs/ensrainbow/concepts/glossary.mdx | 2 -- terraform/variables.tf | 8 ++++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx index d188c0cd1..c5208b832 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx @@ -85,14 +85,12 @@ The ENSIP-15 canonicalisation process; ENSRainbow stores labels **as-is**, even ## Environment Variables - ### LABEL_SET_ID Environment variable that specifies the identifier for a [label set](#label-set). See [Label Set ID](#label-set-id) for the definition of this identifier. **Used by:** Download scripts and Docker entrypoint to fetch the correct label set - ### LABEL_SET_VERSION Environment variable that specifies the version of a [label set](#label-set). See [Label Set Version](#label-set-version) for the definition of this version number. diff --git a/terraform/variables.tf b/terraform/variables.tf index 29fb78603..bf0c19ad9 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -43,21 +43,21 @@ variable "quicknode_endpoint_name" { # The "fully pinned" label set reference that ENSIndexer will request ENSRainbow use for deterministic label healing across time. This label set reference is "fully pinned" as it requires both the labelSetId and labelSetVersion fields to be defined. variable "ensindexer_label_set_id" { type = string - description = "The label set ID that ENSIndexer will request from ENSRainbow for deterministic label healing. See https://ensnode.io/ensrainbow/concepts/glossary/#label-set-id-env for definition." + description = "The label set ID that ENSIndexer will request from ENSRainbow for deterministic label healing. See https://ensnode.io/ensrainbow/concepts/glossary/#label_set_id for definition." } variable "ensindexer_label_set_version" { type = string - description = "The label set version that ENSIndexer will request from ENSRainbow for deterministic label healing. See https://ensnode.io/ensrainbow/concepts/glossary/#label-set-version-env for definition." + description = "The label set version that ENSIndexer will request from ENSRainbow for deterministic label healing. See https://ensnode.io/ensrainbow/concepts/glossary/#label_set_version for definition." } # Label set that ENSRainbow will offer to its clients variable "ensrainbow_label_set_id" { type = string - description = "The label set ID that ENSRainbow will offer to its clients. See https://ensnode.io/ensrainbow/concepts/glossary/#label-set-id-env for definition." + description = "The label set ID that ENSRainbow will offer to its clients. See https://ensnode.io/ensrainbow/concepts/glossary/#label_set_id for definition." } variable "ensrainbow_label_set_version" { type = string - description = "The highest label set version that ENSRainbow will offer to its clients. See https://ensnode.io/ensrainbow/concepts/glossary/#label-set-version-env for definition." + description = "The highest label set version that ENSRainbow will offer to its clients. See https://ensnode.io/ensrainbow/concepts/glossary/#label_set_version for definition." } From ebb03ff736bf8ffe32c6593fc641a36e18d1ca0d Mon Sep 17 00:00:00 2001 From: "kwrobel.eth" Date: Sun, 28 Sep 2025 00:07:04 +0200 Subject: [PATCH 22/28] Fix labelhash glossary URL in types.ts Updated the URL for the labelhash glossary reference. 
--- packages/ensnode-sdk/src/ens/types.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ensnode-sdk/src/ens/types.ts b/packages/ensnode-sdk/src/ens/types.ts index 1ea876da2..c8d244a6d 100644 --- a/packages/ensnode-sdk/src/ens/types.ts +++ b/packages/ensnode-sdk/src/ens/types.ts @@ -47,7 +47,7 @@ export type NormalizedName = Name & { __brand: "NormalizedName" }; * * @see https://docs.ens.domains/terminology#labelhash * @see https://ensnode.io/docs/reference/terminology#labels-labelhashes-labelhash-function - * @see https://docs.ensnode.io/docs/ensrainbow/concepts/glossary#labelhash + * @see https://ensnode.io/ensrainbow/concepts/glossary#labelhash */ export type LabelHash = Hex; From 57836edd1b893e7018f5f194125c7e78f91ac6e2 Mon Sep 17 00:00:00 2001 From: "kwrobel.eth" Date: Mon, 29 Sep 2025 17:54:11 +0200 Subject: [PATCH 23/28] Create fifty-spies-call.md --- .changeset/fifty-spies-call.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .changeset/fifty-spies-call.md diff --git a/.changeset/fifty-spies-call.md b/.changeset/fifty-spies-call.md new file mode 100644 index 000000000..757b6846f --- /dev/null +++ b/.changeset/fifty-spies-call.md @@ -0,0 +1,9 @@ +--- +"ensindexer": patch +"ensrainbow": patch +"@docs/ensnode": patch +"@ensnode/ensnode-sdk": patch +"@ensnode/ensrainbow-sdk": patch +--- + +Refine ENSRainbow Docs From 9d7b12b522fe96b48b07fd7999195ccabc34a0e5 Mon Sep 17 00:00:00 2001 From: djstrong Date: Wed, 17 Dec 2025 12:00:22 +0100 Subject: [PATCH 24/28] Enhance ENSRainbow documentation by adding detailed instructions for creating `.ensrainbow` files from both SQL and CSV sources. Update the description of the `searchlight` label set to reflect its availability and improved dataset features. Revise performance metrics for the `searchlight` dataset and include usage examples for downloading the extended discovery dataset. --- .../docs/ensrainbow/concepts/glossary.mdx | 4 ++- .../docs/ensrainbow/concepts/performance.mdx | 2 +- .../src/content/docs/ensrainbow/faq.mdx | 33 +++++++++++++++---- .../ensrainbow/usage/available-label-sets.mdx | 15 ++++++--- 4 files changed, 40 insertions(+), 14 deletions(-) diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx index c5208b832..f8fdebc85 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/glossary.mdx @@ -41,7 +41,7 @@ A logical collection of [rainbow records](#rainbow-record) that share a common * String (1-50 chars) consisting of lowercase ASCII letters and hyphens that names a label set. -**Example:** `subgraph`, `discovery-a` +**Example:** `subgraph`, `discovery-a`, `searchlight` ## Label Set Version @@ -85,6 +85,8 @@ The ENSIP-15 canonicalisation process; ENSRainbow stores labels **as-is**, even ## Environment Variables +These environment variables are typically set in Docker containers, shell scripts, or system configuration files. See the [Configuration](/ensrainbow/usage/configuration) guide for complete setup instructions. + ### LABEL_SET_ID Environment variable that specifies the identifier for a [label set](#label-set). See [Label Set ID](#label-set-id) for the definition of this identifier. 
diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/performance.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/performance.mdx index c883fd44d..25444ba80 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/performance.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/performance.mdx @@ -10,6 +10,6 @@ keywords: [ensrainbow, performance] |---------|-------------------|--------------|--------------| | `ens-test-env / 0` | 1 MB | 5 MB | < 30 s | | `subgraph / 0` | 3 GB | 7 GB | ~20 min | -| `searchlight / latest` | 8 GB | 18 GB | ~50 min | +| `searchlight / 0` | 13 GB | 29 GB | ~30 min | `*` Times measured on a 4-core CPU & NVMe SSD. diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx index fa0d5704b..5be779975 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx @@ -16,26 +16,45 @@ New label-set versions are generated manually for now: - **`subgraph`**: data from the ENS Subgraph rainbow tables, no plans to update - **`discovery-a`**: may be updated periodically as new labels are dynamically discovered - **`ens-test-env`**: Static test dataset, no plans to update -- **`searchlight`**: Enhanced discovery dataset (coming soon) +- **`searchlight`**: Enhanced discovery dataset with additional label discoveries beyond the subgraph To stay informed about new versions, monitor the [Available Label Sets](/ensrainbow/usage/available-label-sets/) documentation page. ## How can I create my own .ensrainbow file? -Currently, the `convert` command is the **only way** to create new `.ensrainbow` files from scratch. This command converts PostgreSQL rainbow table dumps (`.sql.gz` format) into the binary protobuf format that ENSRainbow uses. +ENSRainbow provides two methods for creating `.ensrainbow` files from different data sources: -**To create a custom .ensrainbow file:** +### Method 1: SQL Conversion +The `convert` command converts PostgreSQL rainbow table dumps (`.sql.gz` format) into the binary protobuf format that ENSRainbow uses. + +**To create a custom .ensrainbow file from SQL:** 1. **Prepare your data** as a PostgreSQL dump file (`.sql.gz`) with ENS labels and labelhashes 2. **Run the convert command:** ```bash - pnpm run convert --input-file your_data.sql.gz --output-file custom.ensrainbow + pnpm run convert \ + --input-file your_data.sql.gz \ + --output-file custom.ensrainbow \ + --label-set-id custom \ + --label-set-version 0 ``` -3. **Specify the label set details** using `--label-set-id` and `--label-set-version` flags -**Note:** You can download existing `.ensrainbow` files using the download scripts, but for creating entirely new files with your own data, the `convert` command is currently the only option available. +### Method 2: CSV Conversion +The `convert-csv` command converts CSV files (with 1 or 2 columns) into `.ensrainbow` format. This is ideal for custom datasets, test data, or external sources. + +**To create a custom .ensrainbow file from CSV:** + +1. **Prepare your data** as a CSV file with labels (and optionally labelhashes) +2. **Run the convert-csv command:** + ```bash + pnpm run convert-csv \ + --input-file your_labels.csv \ + --output-file custom.ensrainbow \ + --label-set-id custom \ + --label-set-version 0 + ``` -See the [CLI Reference](/ensrainbow/contributing/cli-reference/) for detailed command usage. 
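Since the FAQ hunk above names the two accepted CSV shapes without showing them, here is a hedged sketch of both inputs; the two-column `label,labelhash` ordering is an assumption for illustration only (the test fixtures and `convert-csv-command.ts` introduced earlier in this series are authoritative):

```bash
# Single-column CSV: one label per row (header-less format assumed):
cat > labels_1col.csv <<'EOF'
vitalik
ens
my-label
EOF

# Two-column CSV: each label paired with its labelhash. The
# label,labelhash column order is assumed, not confirmed by this patch:
cat > labels_2col.csv <<'EOF'
vitalik,0xaf2caa1c2ca1d027f1ac823b529d0a67cd144264b2789fa2ea4d63a67c7103cc
EOF

# Convert either file with the command documented above:
pnpm run convert-csv \
  --input-file labels_1col.csv \
  --output-file custom.ensrainbow \
  --label-set-id custom \
  --label-set-version 0
```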
+For complete instructions, examples, and workflow guidance, see the [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) guide. See the [CLI Reference](/ensrainbow/contributing/cli-reference/) for detailed command usage. ## Does ENSRainbow normalise labels? No. It returns labels exactly as stored. Your client should perform ENS Normalisation if required. diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/usage/available-label-sets.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/usage/available-label-sets.mdx index 78f808b75..4539b316d 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/usage/available-label-sets.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/usage/available-label-sets.mdx @@ -20,6 +20,7 @@ Use these **currently available** identifiers with the [download script](/ensrai ./scripts/download-ensrainbow-files.sh subgraph 0 # Production dataset ./scripts/download-ensrainbow-files.sh ens-test-env 0 # Test dataset ./scripts/download-ensrainbow-files.sh discovery-a 0 # Discovery dataset +./scripts/download-ensrainbow-files.sh searchlight 0 # Extended discovery dataset ``` ## Available Label Sets @@ -54,15 +55,13 @@ The information below reflects actual availability as of the last check. |---------|--------|-------------| | `0` | ✅ Available | Initial empty dataset for dynamic discoveries | -### Planned/Coming Soon - -#### `searchlight` +#### `searchlight` **Source**: Extended discovery mechanisms -**Description**: Enhanced dataset with additional label discoveries beyond the subgraph. +**Description**: Enhanced dataset with additional label discoveries beyond the subgraph, providing maximum healing coverage. | Version | Status | Description | |---------|--------|-------------| -| `0` | 🚧 Coming Soon | Extended dataset with additional discoveries | +| `0` | ✅ Available | Extended dataset with additional discoveries | ## Usage Examples @@ -84,6 +83,12 @@ cd apps/ensrainbow ./scripts/download-ensrainbow-files.sh discovery-a 0 ``` +### For Maximum Coverage +```bash title="Download extended discovery dataset" +cd apps/ensrainbow +./scripts/download-ensrainbow-files.sh searchlight 0 +``` + ### For API Configuration ```bash title="Environment variables" export LABEL_SET_ID=subgraph From 3cbfaef055f4d78f65b89172464c5874bd6b231f Mon Sep 17 00:00:00 2001 From: djstrong Date: Thu, 8 Jan 2026 23:18:53 +0100 Subject: [PATCH 25/28] Remove unnecessary code block from creating-files documentation --- .../src/content/docs/ensrainbow/concepts/creating-files.mdx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index 67c7f6eb1..598ba690d 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -459,10 +459,6 @@ Create documentation for your custom label set including: ### Usage -``` - - -```bash # Using with Docker docker run -d \ -e DB_SCHEMA_VERSION="3" \ From 1abee906c76315cbecf8b31c26038750d928b976 Mon Sep 17 00:00:00 2001 From: djstrong Date: Fri, 9 Jan 2026 14:13:15 +0100 Subject: [PATCH 26/28] chore: update documentation links to use absolute URLs and remove unused dependencies from package.json --- apps/ensrainbow/package.json | 1 - apps/ensrainbow/src/lib/database.ts | 2 +- apps/ensrainbow/src/lib/rainbow-record.ts | 2 +- apps/ensrainbow/src/utils/rainbow-record.ts | 2 +- 
.../ensrainbow/concepts/creating-files.mdx | 4 +- .../concepts/label-sets-and-versioning.mdx | 2 +- packages/ensrainbow-sdk/src/client.ts | 2 +- pnpm-lock.yaml | 71 +------------------ 8 files changed, 10 insertions(+), 76 deletions(-) diff --git a/apps/ensrainbow/package.json b/apps/ensrainbow/package.json index 0efb9fcec..704a88cf7 100644 --- a/apps/ensrainbow/package.json +++ b/apps/ensrainbow/package.json @@ -33,7 +33,6 @@ "@ensnode/ensrainbow-sdk": "workspace:*", "@ensnode/ensnode-sdk": "workspace:*", "@hono/node-server": "^1.4.1", - "bloom-filters": "^3.0.4", "classic-level": "^1.4.1", "hono": "catalog:", "pino": "catalog:", diff --git a/apps/ensrainbow/src/lib/database.ts b/apps/ensrainbow/src/lib/database.ts index ec2195db6..027d7ff53 100644 --- a/apps/ensrainbow/src/lib/database.ts +++ b/apps/ensrainbow/src/lib/database.ts @@ -84,7 +84,7 @@ export function isRainbowRecordKey(key: ByteArray): boolean { /** * Type representing the ENSRainbow LevelDB database. * - * For user-facing documentation, see the [Data Model documentation](/docs/ensrainbow/concepts/data-model). + * For user-facing documentation, see the [Data Model documentation](https://ensnode.io/ensrainbow/concepts/data-model). * * Schema: * - Keys are binary encoded and represent: diff --git a/apps/ensrainbow/src/lib/rainbow-record.ts b/apps/ensrainbow/src/lib/rainbow-record.ts index d68fdee06..bf5e04f37 100644 --- a/apps/ensrainbow/src/lib/rainbow-record.ts +++ b/apps/ensrainbow/src/lib/rainbow-record.ts @@ -3,7 +3,7 @@ import { buildLabelSetVersion, type Label, type LabelSetVersion } from "@ensnode import { getErrorMessage } from "@/utils/error-utils"; /** - * A versioned [rainbow record](/docs/ensrainbow/concepts/glossary#rainbow-record). + * A versioned [rainbow record](https://ensnode.io/ensrainbow/concepts/glossary#rainbow-record). */ export interface VersionedRainbowRecord { /** The original label string */ diff --git a/apps/ensrainbow/src/utils/rainbow-record.ts b/apps/ensrainbow/src/utils/rainbow-record.ts index 4031b73a6..8a51ec915 100644 --- a/apps/ensrainbow/src/utils/rainbow-record.ts +++ b/apps/ensrainbow/src/utils/rainbow-record.ts @@ -4,7 +4,7 @@ import type { LabelHash } from "@ensnode/ensnode-sdk"; import { labelHashToBytes } from "@ensnode/ensnode-sdk"; /** - * A [rainbow record](/docs/ensrainbow/concepts/glossary#rainbow-record) interface. + * A [rainbow record](https://ensnode.io/ensrainbow/concepts/glossary#rainbow-record) interface. 
*/ export interface RainbowRecord { labelHash: ByteArray; diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index 598ba690d..7435ae665 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -446,7 +446,7 @@ Create documentation for your custom label set including: ### Example Documentation Format -```markdown +````markdown ## Custom Label Set: my-dataset **Label Set ID**: `my-dataset` @@ -460,6 +460,7 @@ Create documentation for your custom label set including: ### Usage # Using with Docker +```bash docker run -d \ -e DB_SCHEMA_VERSION="3" \ -e LABEL_SET_ID="my-dataset" \ @@ -467,6 +468,7 @@ docker run -d \ -p 3223:3223 \ ghcr.io/namehash/ensnode/ensrainbow:latest ``` +```` ## Setting Up Your Own Label Set Server diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx index 5c05f3a53..480b1de17 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/label-sets-and-versioning.mdx @@ -9,7 +9,7 @@ keywords: [ensrainbow, versioning, label sets, deterministic] ## Why Label Sets & Versions? -A **[label set](/ensrainbow/concepts/glossary#label-set)** is analogous to a _dataset snapshot_. Every time the upstream data (e.g. an on-chain subgraph export) changes, we mint a new **[label set version](/ensrainbow/concepts/glossary#label-set-version)** so that: +A **[label set](/ensrainbow/concepts/glossary#label-set)** is analogous to a _dataset snapshot_. Every time upstream data grows (e.g. additional rainbow records created), we create a new **[label set version](/ensrainbow/concepts/glossary#label-set-version)** so that: ### 1. Deterministic Results Clients that pin _version `N`_ are guaranteed to get the _exact same_ heal response today, tomorrow, and two years from now. diff --git a/packages/ensrainbow-sdk/src/client.ts b/packages/ensrainbow-sdk/src/client.ts index 9c6d3d9aa..87182d976 100644 --- a/packages/ensrainbow-sdk/src/client.ts +++ b/packages/ensrainbow-sdk/src/client.ts @@ -242,7 +242,7 @@ export class EnsRainbowApiClient implements EnsRainbow.ApiClient { } /** - * Attempt to [heal](/docs/ensrainbow/concepts/glossary#heal) a labelHash to its original label. + * Attempt to [heal](https://ensnode.io/ensrainbow/concepts/glossary#heal) a labelHash to its original label. * * Note on returned labels: ENSRainbow returns labels exactly as they are * represented in source rainbow table data. 
This means: diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 15b2fce0a..e3c5c59ce 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -495,9 +495,6 @@ importers: '@hono/node-server': specifier: ^1.4.1 version: 1.19.5(hono@4.10.3) - bloom-filters: - specifier: ^3.0.4 - version: 3.0.4 classic-level: specifier: ^1.4.1 version: 1.4.1 @@ -4205,9 +4202,6 @@ packages: '@types/sax@1.2.7': resolution: {integrity: sha512-rO73L89PJxeYM3s3pPPjiPgVVcymqU490g0YO5n5By0k2Erzj6tay/4lr1CHAAU4JyOWd1rpQ8bCf6cZfHU96A==} - '@types/seedrandom@3.0.8': - resolution: {integrity: sha512-TY1eezMU2zH2ozQoAFAQFOPpvP15g+ZgSfTZt31AUUH/Rxtnz3H+A/Sv1Snw2/amp//omibc+AEkTaA8KUeOLQ==} - '@types/tar@6.1.13': resolution: {integrity: sha512-IznnlmU5f4WcGTh2ltRu/Ijpmk8wiWXfF0VA4s+HPjHZgvFggk1YaIkbo5krX/zUCzWF8N/l4+W/LNxnvAJ8nw==} @@ -4571,10 +4565,6 @@ packages: base-64@1.0.0: resolution: {integrity: sha512-kwDPIFCGx0NZHog36dj+tHiwP4QMzsZ3AgMViUBKI0+V5n4U0ufTCUMhnQ04diaRI8EX/QcPfql7zlhZ7j4zgg==} - base64-arraybuffer@1.0.2: - resolution: {integrity: sha512-I3yl4r9QB5ZRY3XuJVEPfc2XhZO6YweFPI+UovAzn+8/hb3oJ6lnysaFcjVpkCPfVWFUDvoZ8kmVDP7WyRtYtQ==} - engines: {node: '>= 0.6.0'} - base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} @@ -4605,10 +4595,6 @@ packages: bintrees@1.0.2: resolution: {integrity: sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==} - bloom-filters@3.0.4: - resolution: {integrity: sha512-BdnPWo2OpYhlvuP2fRzJBdioMCkm7Zp0HCf8NJgF5Mbyqy7VQ/CnTiVWMMyq4EZCBHwj0Kq6098gW2/3RsZsrA==} - engines: {node: '>=12'} - boolbase@1.0.0: resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} @@ -4969,9 +4955,6 @@ packages: csstype@3.2.3: resolution: {integrity: sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==} - cuint@0.2.2: - resolution: {integrity: sha512-d4ZVpCW31eWwCMe1YT3ur7mUDnTXbgwyzaL320DrcRT45rfjYxkt5QWLrmOJ+/UEAI2+fQgKe/fCjR8l4TpRgw==} - cytoscape-cose-bilkent@4.1.0: resolution: {integrity: sha512-wgQlVIUJF13Quxiv5e1gstZ08rnZj2XaLHGoFMYXz7SkNfCDOOteKBE6SYRfA9WxxI/iBc3ajfDoc6hb/MRAHQ==} peerDependencies: @@ -7394,9 +7377,6 @@ packages: recma-stringify@1.0.0: resolution: {integrity: sha512-cjwII1MdIIVloKvC9ErQ+OgAtwHBmcZ0Bg4ciz78FtbT8In39aAYbaA7zvxQ61xVMSPE8WxhLwLbhif4Js2C+g==} - reflect-metadata@0.1.14: - resolution: {integrity: sha512-ZhYeb6nRaXCfhnndflDK8qI6ZQ/YcWZCISRAWICW9XYqMUwjZM9Z0DveWX/ABN01oxSHwVxKQmxeYZSsm0jh5A==} - regex-recursion@6.0.2: resolution: {integrity: sha512-0YCaSCq2VRIebiaUviZNs0cBz1kg5kVS2UKUfNIx8YVs1cN3AV7NTctO5FOKBA+UT2BPJIWZauYHPqJODG50cg==} @@ -7568,9 +7548,6 @@ packages: secure-json-parse@4.1.0: resolution: {integrity: sha512-l4KnYfEyqYJxDwlNVyRfO2E4NTHfMKAWdUuA8J0yve2Dz/E/PdBepY03RvyJpssIpRFwJoCD55wA+mEDs6ByWA==} - seedrandom@3.0.5: - resolution: {integrity: sha512-8OwmbklUNzwezjGInmZ+2clQmExQPvomqjL7LFqOYqtmuxRgQYqOD3mHaU+MvZn5FLUeVxVfQjwLZW/n/JFuqg==} - semver-compare@1.0.0: resolution: {integrity: sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow==} @@ -8649,9 +8626,6 @@ packages: xxhash-wasm@1.1.0: resolution: {integrity: sha512-147y/6YNh+tlp6nd/2pWq38i9h6mz/EuQ6njIrmW8D1BS5nCqs0P6DG+m6zTGnNz5I+uhZ0SHxBs9BsPrwcKDA==} - xxhashjs@0.2.2: - resolution: {integrity: sha512-AkTuIuVTET12tpsVIQo+ZU6f/qDmKuRUcjaqR+OIvm+aCBsZ95i7UVY5WJ9TMsSaZ0DA2WxoZ4acu0sPH+OKAw==} - y18n@5.0.8: resolution: {integrity: 
sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==} engines: {node: '>=10'} @@ -12613,8 +12587,6 @@ snapshots: dependencies: '@types/node': 22.18.13 - '@types/seedrandom@3.0.8': {} - '@types/tar@6.1.13': dependencies: '@types/node': 22.18.13 @@ -12661,22 +12633,6 @@ snapshots: chai: 6.2.0 tinyrainbow: 3.0.3 - '@vitest/mocker@4.0.5(vite@7.1.12(@types/node@20.19.24)(jiti@2.6.1)(lightningcss@1.30.2)(tsx@4.20.6)(yaml@2.8.1))': - dependencies: - '@vitest/spy': 4.0.5 - estree-walker: 3.0.3 - magic-string: 0.30.21 - optionalDependencies: - vite: 7.1.12(@types/node@20.19.24)(jiti@2.6.1)(lightningcss@1.30.2)(tsx@4.20.6)(yaml@2.8.1) - - '@vitest/mocker@4.0.5(vite@7.1.12(@types/node@22.18.13)(jiti@2.6.1)(lightningcss@1.30.2)(tsx@4.20.6)(yaml@2.8.1))': - dependencies: - '@vitest/spy': 4.0.5 - estree-walker: 3.0.3 - magic-string: 0.30.21 - optionalDependencies: - vite: 7.1.12(@types/node@22.18.13)(jiti@2.6.1)(lightningcss@1.30.2)(tsx@4.20.6)(yaml@2.8.1) - '@vitest/mocker@4.0.5(vite@7.1.12(@types/node@24.10.4)(jiti@2.6.1)(lightningcss@1.30.2)(tsx@4.20.6)(yaml@2.8.1))': dependencies: '@vitest/spy': 4.0.5 @@ -13128,8 +13084,6 @@ snapshots: base-64@1.0.0: {} - base64-arraybuffer@1.0.2: {} - base64-js@1.5.1: {} baseline-browser-mapping@2.8.21: {} @@ -13156,17 +13110,6 @@ snapshots: bintrees@1.0.2: {} - bloom-filters@3.0.4: - dependencies: - '@types/seedrandom': 3.0.8 - base64-arraybuffer: 1.0.2 - is-buffer: 2.0.5 - lodash: 4.17.21 - long: 5.3.2 - reflect-metadata: 0.1.14 - seedrandom: 3.0.5 - xxhashjs: 0.2.2 - boolbase@1.0.0: {} boring-avatars@1.11.2: {} @@ -13547,8 +13490,6 @@ snapshots: csstype@3.2.3: {} - cuint@0.2.2: {} - cytoscape-cose-bilkent@4.1.0(cytoscape@3.33.1): dependencies: cose-base: 1.0.3 @@ -16457,8 +16398,6 @@ snapshots: unified: 11.0.5 vfile: 6.0.3 - reflect-metadata@0.1.14: {} - regex-recursion@6.0.2: dependencies: regex-utilities: 2.3.0 @@ -16725,8 +16664,6 @@ snapshots: secure-json-parse@4.1.0: {} - seedrandom@3.0.5: {} - semver-compare@1.0.0: {} semver@6.3.1: {} @@ -17639,7 +17576,7 @@ snapshots: vitest@4.0.5(@types/debug@4.1.12)(@types/node@20.19.24)(jiti@2.6.1)(jsdom@27.0.1(postcss@8.5.6))(lightningcss@1.30.2)(tsx@4.20.6)(yaml@2.8.1): dependencies: '@vitest/expect': 4.0.5 - '@vitest/mocker': 4.0.5(vite@7.1.12(@types/node@20.19.24)(jiti@2.6.1)(lightningcss@1.30.2)(tsx@4.20.6)(yaml@2.8.1)) + '@vitest/mocker': 4.0.5(vite@7.1.12(@types/node@24.10.4)(jiti@2.6.1)(lightningcss@1.30.2)(tsx@4.20.6)(yaml@2.8.1)) '@vitest/pretty-format': 4.0.5 '@vitest/runner': 4.0.5 '@vitest/snapshot': 4.0.5 @@ -17679,7 +17616,7 @@ snapshots: vitest@4.0.5(@types/debug@4.1.12)(@types/node@22.18.13)(jiti@2.6.1)(jsdom@27.0.1(postcss@8.5.6))(lightningcss@1.30.2)(tsx@4.20.6)(yaml@2.8.1): dependencies: '@vitest/expect': 4.0.5 - '@vitest/mocker': 4.0.5(vite@7.1.12(@types/node@22.18.13)(jiti@2.6.1)(lightningcss@1.30.2)(tsx@4.20.6)(yaml@2.8.1)) + '@vitest/mocker': 4.0.5(vite@7.1.12(@types/node@24.10.4)(jiti@2.6.1)(lightningcss@1.30.2)(tsx@4.20.6)(yaml@2.8.1)) '@vitest/pretty-format': 4.0.5 '@vitest/runner': 4.0.5 '@vitest/snapshot': 4.0.5 @@ -17960,10 +17897,6 @@ snapshots: xxhash-wasm@1.1.0: {} - xxhashjs@0.2.2: - dependencies: - cuint: 0.2.2 - y18n@5.0.8: {} yallist@3.1.1: {} From dee381ca4ff4584d9a505f909012ca6be5ebc742 Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 12 Jan 2026 14:40:53 +0100 Subject: [PATCH 27/28] docs: add comprehensive section on unknown labels in ENS and link to new concept page --- .../docs/ensrainbow/concepts/index.mdx | 13 +- 
.../ensrainbow/concepts/unknown-labels.mdx | 229 ++++++++++++++++++ .../src/content/docs/ensrainbow/index.mdx | 2 + 3 files changed, 241 insertions(+), 3 deletions(-) create mode 100644 docs/ensnode.io/src/content/docs/ensrainbow/concepts/unknown-labels.mdx diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/index.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/index.mdx index 7d5835a0f..9d24aa2d6 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/index.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/index.mdx @@ -18,6 +18,12 @@ This section covers the fundamental concepts needed to understand and work with href="/ensrainbow/concepts/glossary/" /> + + Date: Mon, 12 Jan 2026 15:30:10 +0100 Subject: [PATCH 28/28] Create lucky-eagles-hammer.md --- .changeset/lucky-eagles-hammer.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/lucky-eagles-hammer.md diff --git a/.changeset/lucky-eagles-hammer.md b/.changeset/lucky-eagles-hammer.md new file mode 100644 index 000000000..1c8815fc7 --- /dev/null +++ b/.changeset/lucky-eagles-hammer.md @@ -0,0 +1,5 @@ +--- +"@docs/ensnode": patch +--- + +Document Introductory ENSRainbow Topics
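Taken together, the series leaves ENSRainbow serving whichever label set was ingested; a hedged end-to-end check against a local instance might look like the following. The `/v1/heal/{labelhash}` route and the response shapes are assumptions drawn from the ENSRainbow docs rather than stated in these patches:

```bash
# Heal a labelhash that the pinned label set contains
# (0xaf2c... is labelhash("vitalik")):
curl "http://localhost:3223/v1/heal/0xaf2caa1c2ca1d027f1ac823b529d0a67cd144264b2789fa2ea4d63a67c7103cc"
# Expected (assumed) shape: {"status":"success","label":"vitalik"}

# A labelhash absent from the label set remains an unknown label and
# cannot be healed; this error shape is likewise an assumption:
curl "http://localhost:3223/v1/heal/0x0000000000000000000000000000000000000000000000000000000000000000"
# Expected (assumed) shape: {"status":"error","error":"Label not found"}
```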