diff --git a/docs/api-reference.md b/docs/api-reference.md index b9d8983d..cd611ed9 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -1,7 +1,7 @@ # Concept-RAG API Reference -**Schema Version:** v7 (December 2025) -**Tools:** 10 MCP tools +**Schema Version:** v8 (December 2025) +**Tools:** 12 MCP tools This document provides JSON input and output schemas for all MCP tools. For tool selection guidance, decision trees, and usage patterns, see [tool-selection-guide.md](tool-selection-guide.md). @@ -32,7 +32,8 @@ Search document summaries and metadata to discover relevant documents. ```json [ { - "source": "string", + "catalog_id": 0, + "title": "string", "summary": "string", "score": "string", "expanded_terms": ["string"] @@ -42,7 +43,8 @@ Search document summaries and metadata to discover relevant documents. | Field | Type | Description | |-------|------|-------------| -| `source` | string | Full file path to document | +| `catalog_id` | number | Document ID for subsequent tool calls | +| `title` | string | Document title | | `summary` | string | Document summary text | | `score` | string | Combined hybrid score (0.000-1.000) | | `expanded_terms` | string[] | Expanded query terms | @@ -88,8 +90,11 @@ Search across all document chunks using hybrid search. ```json [ { + "catalog_id": 0, + "title": "string", "text": "string", - "source": "string", + "page_number": 0, + "concepts": ["string"], "score": "string", "expanded_terms": ["string"] } @@ -98,8 +103,11 @@ Search across all document chunks using hybrid search. 
| Field | Type | Description | |-------|------|-------------| +| `catalog_id` | number | Document ID for subsequent tool calls | +| `title` | string | Document title | | `text` | string | Chunk content | -| `source` | string | Source document path | +| `page_number` | number | Page number in document | +| `concepts` | string[] | Concept names in chunk | | `score` | string | Combined hybrid score (0.000-1.000) | | `expanded_terms` | string[] | Expanded query terms | @@ -127,25 +135,24 @@ Search within a single known document. ```json { "text": "string", - "source": "string" + "catalog_id": 0 } ``` | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| | `text` | string | ✅ | — | Search query | -| `source` | string | ✅ | — | Full file path of document | +| `catalog_id` | number | ✅ | — | Document ID from `catalog_search` | -> **Debug Output:** Enable via `DEBUG_SEARCH=true` environment variable. +> **Note:** First use `catalog_search` to find the document and get its `catalog_id`. #### Output Schema ```json [ { - "text": "string", - "source": "string", "title": "string", + "text": "string", "concepts": ["string"], "concept_ids": [0] } @@ -154,13 +161,12 @@ Search within a single known document. | Field | Type | Description | |-------|------|-------------| -| `text` | string | Chunk content | -| `source` | string | Source document path | | `title` | string | Document title | +| `text` | string | Chunk content | | `concepts` | string[] | Concept names in chunk | | `concept_ids` | number[] | Concept IDs | -**Limits:** 5 chunks max (fixed limit for single-document search). +**Limits:** Top chunks from the document (fixed limit for single-document search). --- @@ -175,14 +181,14 @@ Find chunks associated with a concept, organized hierarchically. 
```json { "concept": "string", - "source_filter": "string" + "title_filter": "string" } ``` | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| | `concept` | string | ✅ | — | Concept to search for | -| `source_filter` | string | ❌ | — | Filter by source path | +| `title_filter` | string | ❌ | — | Filter by document title | **Result Filtering:** Returns all matching sources and chunks (no fixed limit). @@ -195,12 +201,14 @@ Find chunks associated with a concept, organized hierarchically. "concept": "string", "concept_id": 0, "summary": "string", + "image_ids": [0], "related_concepts": ["string"], "synonyms": ["string"], "broader_terms": ["string"], "narrower_terms": ["string"], "sources": [ { + "catalog_id": 0, "title": "string", "pages": [0], "match_type": "primary|related", @@ -209,8 +217,9 @@ Find chunks associated with a concept, organized hierarchically. ], "chunks": [ { - "text": "string", + "catalog_id": 0, "title": "string", + "text": "string", "page": 0, "concept_density": "string", "concepts": ["string"] @@ -220,7 +229,8 @@ Find chunks associated with a concept, organized hierarchically. "total_documents": 0, "total_chunks": 0, "sources_returned": 0, - "chunks_returned": 0 + "chunks_returned": 0, + "images_found": 0 }, "score": "string" } @@ -231,18 +241,23 @@ Find chunks associated with a concept, organized hierarchically. 
| `concept` | string | Matched concept name | | `concept_id` | number | Concept identifier | | `summary` | string | Concept summary | +| `image_ids` | number[] | Visual IDs for `get_visuals` | | `related_concepts` | string[] | Related concepts | | `synonyms` | string[] | Alternative names | | `broader_terms` | string[] | More general concepts | | `narrower_terms` | string[] | More specific concepts | +| `sources[].catalog_id` | number | Document ID | | `sources[].title` | string | Document title | | `sources[].pages` | number[] | Page numbers | | `sources[].match_type` | string | `"primary"` or `"related"` | | `sources[].via_concept` | string? | Linking concept if related | +| `chunks[].catalog_id` | number | Document ID | +| `chunks[].title` | string | Document title | | `chunks[].text` | string | Chunk content | | `chunks[].page` | number | Page number | | `chunks[].concept_density` | string | Prominence (0.000-1.000) | | `stats` | object | Search statistics | +| `stats.images_found` | number | Count of associated visuals | | `score` | string | Combined hybrid score (0.000-1.000) | #### Additional Fields with Debug Enabled @@ -578,6 +593,70 @@ Find concepts in a category's documents. --- +## Visual Content + +### get_visuals + +Retrieve visual content (diagrams, charts, tables, figures) from documents. 
+ +#### Input Schema + +```json +{ + "ids": [0], + "catalog_id": 0, + "visual_type": "diagram|flowchart|chart|table|figure", + "concept": "string", + "limit": 20 +} +``` + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `ids` | number[] | ❌ | — | Retrieve specific visuals by ID (from `concept_search` `image_ids`) | +| `catalog_id` | number | ❌ | — | Filter by document ID | +| `visual_type` | string | ❌ | — | Filter by type | +| `concept` | string | ❌ | — | Filter by associated concept | +| `limit` | number | ❌ | `20` | Maximum results | + +> **Note:** Use `ids` to fetch visuals returned by `concept_search` `image_ids`. Use `catalog_id` to browse all visuals in a document. + +#### Output Schema + +```json +{ + "visuals": [ + { + "id": 0, + "catalog_id": 0, + "catalog_title": "string", + "visual_type": "string", + "page_number": 0, + "description": "string", + "image_path": "string", + "concepts": ["string"] + } + ], + "total_returned": 0, + "filters_applied": {} +} +``` + +| Field | Type | Description | +|-------|------|-------------| +| `visuals[].id` | number | Visual ID | +| `visuals[].catalog_id` | number | Document ID | +| `visuals[].catalog_title` | string | Document title | +| `visuals[].visual_type` | string | Type: diagram, flowchart, chart, table, figure | +| `visuals[].page_number` | number | Page in document | +| `visuals[].description` | string | Semantic description | +| `visuals[].image_path` | string | Path to image file | +| `visuals[].concepts` | string[] | Associated concept names | +| `total_returned` | number | Count of visuals returned | +| `filters_applied` | object | Applied filter parameters | + +--- + ## Error Schema All tools return errors in this format: @@ -630,3 +709,4 @@ All tools return errors in this format: | `category_search` | 30-130ms | | `list_categories` | 10-50ms | | `list_concepts_in_category` | 30-100ms | +| `get_visuals` | 20-100ms | diff --git 
a/docs/architecture/adr0056-diagram-awareness.md b/docs/architecture/adr0056-diagram-awareness.md new file mode 100644 index 00000000..c5705711 --- /dev/null +++ b/docs/architecture/adr0056-diagram-awareness.md @@ -0,0 +1,172 @@ +# ADR0056: Diagram Awareness + +## Status + +Accepted + +## Context + +Concept-RAG currently processes PDF and EPUB documents to extract text-based chunks and concepts. However, many technical documents contain valuable visual content (diagrams, flowcharts, charts, figures) that convey information not captured in text. This visual information is lost during ingestion. + +**Current state:** +- Documents are chunked as text segments only +- Diagrams are either ignored or produce garbled OCR artifacts +- Search results cannot surface or leverage visual content +- Users cannot find documents based on diagram content + +**Desired state:** +- Diagrams with semantic meaning are detected and extracted during ingestion +- Visual content is stored as searchable "visual tokens" +- Search results can be enriched with relevant diagrams +- Visual inference enables concept discovery from diagrams + +## Decision + +We will add diagram awareness to Concept-RAG using a Vision LLM approach with the following design decisions: + +### 1. Vision LLM for Semantic Understanding (Not CLIP) + +**Decision:** Use Vision LLM (GPT-4V/Claude 3 via OpenRouter) for diagram classification and description. + +**Rationale:** +- CLIP was trained on natural images and struggles with technical diagrams (UML, flowcharts, architecture diagrams) +- CLIP cannot extract semantic meaning—only visual similarity +- CLIP produces embeddings in a different vector space (512-768 dim) incompatible with our 384-dim text embeddings +- Vision LLMs can classify diagram types, understand relationships, and extract concepts + +### 2. Store Only Semantic Diagrams + +**Decision:** Only store diagrams with semantic meaning. 
Photos, screenshots, logos, and decorative images are detected but NOT stored. + +**Rationale:** +- The goal is to aid text comprehension, not store images +- Photos and decorative images add no semantic value to search +- Reduces storage bloat and search noise +- Classification gate filters content before storage + +**Visual types stored:** +- `diagram`: flowcharts, UML, architecture, state machines, sequence diagrams +- `chart`: bar, line, pie, scatter, histogram +- `table`: structured tabular data +- `figure`: technical illustrations with labels + +**NOT stored:** +- Photos, screenshots, decorative images, logos, icons + +### 3. Grayscale Storage with Color Analysis + +**Decision:** Store extracted images as grayscale PNG files. Vision LLM receives full-color image during analysis. + +**Rationale:** +- ~66% storage reduction (3 channels → 1 channel) +- Most technical diagrams are already black/white +- Semantic meaning is captured in the text description +- Color information (e.g., "the red error path") is encoded in the LLM-generated description +- Stored images are primarily for human reference/verification + +### 4. New `visuals` Table (Not Extending Chunks) + +**Decision:** Create a new `visuals` table rather than extending the existing `chunks` table. + +**Rationale:** +- Clean separation of concerns—chunks are for text, visuals are for images +- Different indexing requirements +- Avoids schema pollution in the chunks table +- Visuals link to chunks via `chunk_ids` array for context + +### 5. External Image Storage with DB References + +**Decision:** Store images as external PNG files with database references. 
+ +**Rationale:** +- Aligns with existing pattern (documents stored externally, referenced in catalog) +- Avoids significant database size increase +- Efficient for image serving if needed +- Simple file system operations for cleanup + +**File structure:** +``` +~/.concept_rag/ +├── visuals.lance/ # New table +└── images/ # New folder + └── {catalog_id}/ + └── p{page}_v{index}.png +``` + +### 6. Non-Destructive Database Migration + +**Decision:** Add visuals capability via migration script that creates new table without modifying existing tables. + +**Rationale:** +- Production databases should not be disrupted +- Existing catalog, chunks, concepts, categories tables remain unchanged +- Incremental adoption—visuals can be extracted for existing documents later +- Safe rollback by simply dropping the new table + +## Consequences + +### Positive +- Diagrams become searchable via semantic descriptions +- Concepts can be extracted from visual content +- Search results enriched with relevant diagrams +- Non-destructive migration preserves existing data +- Grayscale storage reduces footprint by ~66% + +### Negative +- Vision LLM API costs (~$0.01-0.03 per image) +- Additional processing time during ingestion +- External dependency on Vision LLM availability +- Two-step classification + description increases API calls + +### Neutral +- New `visuals` table adds minimal database complexity +- Images stored externally (consistent with document storage pattern) +- Requires Python for layout detection (optional, can use pure JS alternatives) + +## Schema + +``` +visuals table: +├── id: number # Hash-based ID +├── catalog_id: number # FK to catalog +├── catalog_title: string # Derived +├── image_path: string # Path to grayscale PNG +├── description: string # LLM-generated semantic description +├── vector: Float32Array # 384-dim embedding of description +├── visual_type: string # diagram|chart|table|figure +├── page_number: number # Page in source document +├── bounding_box: 
string # JSON: {x, y, width, height} +├── concept_ids: number[] # Concepts from description +├── concept_names: string[] # Derived +└── chunk_ids: number[] # Nearby text chunks +``` + +## Implementation + +Three scripts for incremental adoption: + +1. **`add-visuals-table.ts`**: Migration script to add empty visuals table +2. **`extract-visuals.ts`**: Extract diagrams from documents +3. **`describe-visuals.ts`**: Generate semantic descriptions + +## Alternatives Considered + +### CLIP Embeddings +- **Rejected:** Incompatible embedding space, poor diagram understanding, no concept extraction + +### Store All Visuals +- **Rejected:** Photos/decorative images add noise, increase storage without semantic value + +### Color Image Storage +- **Rejected:** 3x storage cost, minimal benefit since meaning captured in description + +### Extend Chunks Table +- **Rejected:** Schema pollution, different indexing needs, chunks designed for text + +## References + +- [Issue #51: Add diagram awareness](https://github.com/m2ux/concept-rag/issues/51) +- [ADR0009: Three Table Architecture](./adr0009-three-table-architecture.md) +- [ADR0046: Document Type Classification](./adr0046-document-type-classification.md) + + diff --git a/docs/tool-selection-guide.md b/docs/tool-selection-guide.md index b0c4c2e3..65e80e7a 100644 --- a/docs/tool-selection-guide.md +++ b/docs/tool-selection-guide.md @@ -6,7 +6,7 @@ This guide helps AI agents and developers select the appropriate MCP tool for th ## Overview -Concept-RAG provides **11 MCP tools** organized into five categories: +Concept-RAG provides **12 MCP tools** organized into six categories: | Category | Tools | Purpose | |----------|-------|---------| @@ -15,6 +15,7 @@ Concept-RAG provides **11 MCP tools** organized into five categories: | **Content Search** | `broad_chunks_search`, `chunks_search` | Search within document content | | **Concept Analysis** | `concept_search`, `extract_concepts`, `source_concepts`, `concept_sources` | Analyze 
and track concepts | | **Category Browsing** | `category_search`, `list_categories`, `list_concepts_in_category` | Browse by domain/category | +| **Visual Content** | `get_visuals` | Retrieve diagrams, charts, tables, figures | --- @@ -42,7 +43,7 @@ START: User asks a question │ └─ YES → Use `concept_search` (highest precision) │ ├─ Do they already know the SPECIFIC DOCUMENT they want to search within? -│ ├─ YES → Use `chunks_search` (requires source path) +│ ├─ YES → Use `chunks_search` (requires catalog_id from catalog_search) │ └─ NO → Continue... │ ├─ Are they searching for SPECIFIC PHRASES, KEYWORDS, or asking NATURAL LANGUAGE QUESTIONS? @@ -101,14 +102,14 @@ START: User asks a question ### chunks_search ✅ You know which document contains the information -✅ Following up from `catalog_search` results with a specific source +✅ Following up from `catalog_search` results with a specific `catalog_id` ✅ Focused analysis of one document's content -✅ Have the exact source path from a previous search +✅ Have the `catalog_id` from a previous search ❌ Don't know which document to search (use `catalog_search` first) ❌ Need to search across multiple documents (use `broad_chunks_search`) ❌ Tracking concepts across entire library (use `concept_search`) -❌ Don't have the exact source path +❌ Don't have the `catalog_id` --- @@ -204,6 +205,26 @@ START: User asks a question --- +### get_visuals + +✅ Fetching visuals by ID (from `concept_search` `image_ids`) +✅ Looking for diagrams, charts, or figures that illustrate a concept +✅ Finding visual representations from a specific document +✅ Browsing available diagrams by type (diagram, flowchart, chart, table, figure) + +❌ Text-based search (use `broad_chunks_search` or `chunks_search`) +❌ Finding documents by title (use `catalog_search`) +❌ Searching for concepts in text (use `concept_search`) + +**Parameters:** +- `ids`: Retrieve specific visuals by ID (from `concept_search` `image_ids`) +- `catalog_id`: Filter by document +- 
`visual_type`: Filter by type (diagram, flowchart, chart, table, figure) +- `concept`: Filter by associated concept +- `limit`: Maximum results (default: 20) + +--- + ## Common Workflows ### 1. Explore Your Library @@ -215,9 +236,9 @@ category_search → browse documents in each area ``` ### 2. Research a Topic ``` -catalog_search → find relevant documents +catalog_search → find relevant documents (get catalog_id) ↓ -chunks_search → dive into specific document +chunks_search (catalog_id) → dive into specific document ↓ extract_concepts → understand document's conceptual structure ``` @@ -240,6 +261,22 @@ category_search → browse documents in domain list_concepts_in_category → understand domain vocabulary ``` +### 5. Enrich Search with Diagrams +``` +concept_search → find concept (includes image_ids) + ↓ +get_visuals (ids: <image_ids>) → fetch diagrams for the concept + ↓ +Combine text + visuals for comprehensive understanding +``` + +### 6. Browse Diagrams in a Document +``` +catalog_search → find the document (get catalog_id) + ↓ +get_visuals (catalog_id: <catalog_id>) → list all diagrams in document +``` + --- ## Tool Selection Validation Test Cases @@ -254,7 +291,7 @@ list_concepts_in_category → understand domain vocabulary | "What concepts are in distributed systems?" | `list_concepts_in_category` | Concepts within category | | "How do teams collaborate?" 
| `broad_chunks_search` | Natural language question | | "strategic planning frameworks" | `broad_chunks_search` | Multi-word phrase | -| "Search Sun Tzu for deception" | `chunks_search` | Known document | +| "Search Sun Tzu for deception" | `chunks_search` | Known document (use catalog_id) | | "Extract concepts from Art of War" | `extract_concepts` | Explicit extraction request | | "documents about healthcare" | `catalog_search` | Document discovery | | "organizational learning" | `concept_search` | Conceptual term | @@ -264,6 +301,9 @@ list_concepts_in_category → understand domain vocabulary | "Find sources for TDD, DI, and CI" | `source_concepts` | Multi-concept source lookup | | "List sources for each concept separately" | `concept_sources` | Per-concept bibliographies | | "What books cover the most of these topics?" | `source_concepts` | Overlap analysis | +| "Show me diagrams about architecture" | `get_visuals` | Visual content by concept | +| "What diagrams are in this book?" | `get_visuals` | Visual content by document | +| "Find flowcharts" | `get_visuals` | Visual content by type | --- diff --git a/package-lock.json b/package-lock.json index 4ba3b7eb..66d96022 100644 --- a/package-lock.json +++ b/package-lock.json @@ -20,7 +20,11 @@ "ini": "^6.0.0", "minimist": "^1.2.8", "pdf-parse": "^1.1.1", +<<<<<<< HEAD + "sharp": "^0.34.5" +======= "sharp": "^0.33.5" +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 }, "bin": { "concept-rag": "dist/conceptual_index.js" @@ -28,7 +32,11 @@ "devDependencies": { "@types/minimist": "^1.2.5", "@types/node": "^22.10.7", +<<<<<<< HEAD + "@types/sharp": "^0.31.1", +======= "@types/sharp": "^0.32.0", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "@vitest/coverage-v8": "^4.0.13", "@vitest/ui": "^4.0.9", "dependency-cruiser": "^17.3.1", @@ -542,10 +550,26 @@ "node": ">=18" } }, +<<<<<<< HEAD + "node_modules/@img/colour": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@img/colour/-/colour-1.0.0.tgz", + 
"integrity": "sha512-A5P/LfWGFSl6nsckYtjw9da+19jB8hkJ6ACTGcDfEJ0aE+l2n2El7dsVM7UVHZQ9s2lmYMWlrS21YLy2IR1LUw==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, + "node_modules/@img/sharp-darwin-arm64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.5.tgz", + "integrity": "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w==", +======= "node_modules/@img/sharp-darwin-arm64": { "version": "0.33.5", "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.33.5.tgz", "integrity": "sha512-UT4p+iz/2H4twwAoLCqfA9UH5pI6DggwKEGuaPy7nCVQ8ZsiY5PIcrRvD1DzuY3qYL07NtIQcWnBSY/heikIFQ==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "arm64" ], @@ -561,6 +585,15 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { +<<<<<<< HEAD + "@img/sharp-libvips-darwin-arm64": "1.2.4" + } + }, + "node_modules/@img/sharp-darwin-x64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.5.tgz", + "integrity": "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw==", +======= "@img/sharp-libvips-darwin-arm64": "1.0.4" } }, @@ -568,6 +601,7 @@ "version": "0.33.5", "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.33.5.tgz", "integrity": "sha512-fyHac4jIc1ANYGRDxtiqelIbdWkIuQaI84Mv45KvGRRxSAa7o7d1ZKAOBaYbnepLC1WqxfpimdeWfvqqSGwR2Q==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "x64" ], @@ -583,6 +617,15 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { +<<<<<<< HEAD + "@img/sharp-libvips-darwin-x64": "1.2.4" + } + }, + "node_modules/@img/sharp-libvips-darwin-arm64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.4.tgz", + "integrity": 
"sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g==", +======= "@img/sharp-libvips-darwin-x64": "1.0.4" } }, @@ -590,6 +633,7 @@ "version": "1.0.4", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.0.4.tgz", "integrity": "sha512-XblONe153h0O2zuFfTAbQYAX2JhYmDHeWikp1LM9Hul9gVPjFY427k6dFEcOL72O01QxQsWi761svJ/ev9xEDg==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "arm64" ], @@ -603,9 +647,15 @@ } }, "node_modules/@img/sharp-libvips-darwin-x64": { +<<<<<<< HEAD + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.4.tgz", + "integrity": "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg==", +======= "version": "1.0.4", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.0.4.tgz", "integrity": "sha512-xnGR8YuZYfJGmWPvmlunFaWJsb9T/AO2ykoP3Fz/0X5XV2aoYBPkX6xqCQvUTKKiLddarLaxpzNe+b1hjeWHAQ==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "x64" ], @@ -619,9 +669,15 @@ } }, "node_modules/@img/sharp-libvips-linux-arm": { +<<<<<<< HEAD + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.4.tgz", + "integrity": "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==", +======= "version": "1.0.5", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.0.5.tgz", "integrity": "sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "arm" ], @@ -635,9 +691,15 @@ } }, "node_modules/@img/sharp-libvips-linux-arm64": { +<<<<<<< HEAD + "version": "1.2.4", + "resolved": 
"https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.4.tgz", + "integrity": "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==", +======= "version": "1.0.4", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.0.4.tgz", "integrity": "sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "arm64" ], @@ -650,10 +712,49 @@ "url": "https://opencollective.com/libvips" } }, +<<<<<<< HEAD + "node_modules/@img/sharp-libvips-linux-ppc64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.2.4.tgz", + "integrity": "sha512-FMuvGijLDYG6lW+b/UvyilUWu5Ayu+3r2d1S8notiGCIyYU/76eig1UfMmkZ7vwgOrzKzlQbFSuQfgm7GYUPpA==", + "cpu": [ + "ppc64" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-riscv64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-riscv64/-/sharp-libvips-linux-riscv64-1.2.4.tgz", + "integrity": "sha512-oVDbcR4zUC0ce82teubSm+x6ETixtKZBh/qbREIOcI3cULzDyb18Sr/Wcyx7NRQeQzOiHTNbZFF1UwPS2scyGA==", + "cpu": [ + "riscv64" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-s390x": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.2.4.tgz", + "integrity": "sha512-qmp9VrzgPgMoGZyPvrQHqk02uyjA0/QrTO26Tqk6l4ZV0MPWIW6LTkqOIov+J1yEu7MbFQaDpwdwJKhbJvuRxQ==", +======= "node_modules/@img/sharp-libvips-linux-s390x": { "version": "1.0.4", "resolved": 
"https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.0.4.tgz", "integrity": "sha512-u7Wz6ntiSSgGSGcjZ55im6uvTrOxSIS8/dgoVMoiGE9I6JAfU50yH5BoDlYA1tcuGS7g/QNtetJnxA6QEsCVTA==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "s390x" ], @@ -667,9 +768,15 @@ } }, "node_modules/@img/sharp-libvips-linux-x64": { +<<<<<<< HEAD + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.4.tgz", + "integrity": "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==", +======= "version": "1.0.4", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.0.4.tgz", "integrity": "sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "x64" ], @@ -683,9 +790,15 @@ } }, "node_modules/@img/sharp-libvips-linuxmusl-arm64": { +<<<<<<< HEAD + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.4.tgz", + "integrity": "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==", +======= "version": "1.0.4", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.0.4.tgz", "integrity": "sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "arm64" ], @@ -699,9 +812,15 @@ } }, "node_modules/@img/sharp-libvips-linuxmusl-x64": { +<<<<<<< HEAD + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.4.tgz", + "integrity": "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==", +======= "version": "1.0.4", "resolved": 
"https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.0.4.tgz", "integrity": "sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "x64" ], @@ -715,9 +834,15 @@ } }, "node_modules/@img/sharp-linux-arm": { +<<<<<<< HEAD + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.5.tgz", + "integrity": "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==", +======= "version": "0.33.5", "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.33.5.tgz", "integrity": "sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "arm" ], @@ -733,6 +858,15 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { +<<<<<<< HEAD + "@img/sharp-libvips-linux-arm": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-arm64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.5.tgz", + "integrity": "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==", +======= "@img/sharp-libvips-linux-arm": "1.0.5" } }, @@ -740,6 +874,7 @@ "version": "0.33.5", "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.33.5.tgz", "integrity": "sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "arm64" ], @@ -755,6 +890,59 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { +<<<<<<< HEAD + "@img/sharp-libvips-linux-arm64": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-ppc64": { + "version": "0.34.5", + "resolved": 
"https://registry.npmjs.org/@img/sharp-linux-ppc64/-/sharp-linux-ppc64-0.34.5.tgz", + "integrity": "sha512-7zznwNaqW6YtsfrGGDA6BRkISKAAE1Jo0QdpNYXNMHu2+0dTrPflTLNkpc8l7MUP5M16ZJcUvysVWWrMefZquA==", + "cpu": [ + "ppc64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-ppc64": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-riscv64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-riscv64/-/sharp-linux-riscv64-0.34.5.tgz", + "integrity": "sha512-51gJuLPTKa7piYPaVs8GmByo7/U7/7TZOq+cnXJIHZKavIRHAP77e3N2HEl3dgiqdD/w0yUfiJnII77PuDDFdw==", + "cpu": [ + "riscv64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-riscv64": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-s390x": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.5.tgz", + "integrity": "sha512-nQtCk0PdKfho3eC5MrbQoigJ2gd1CgddUMkabUj+rBevs8tZ2cULOx46E7oyX+04WGfABgIwmMC0VqieTiR4jg==", +======= "@img/sharp-libvips-linux-arm64": "1.0.4" } }, @@ -762,6 +950,7 @@ "version": "0.33.5", "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.33.5.tgz", "integrity": "sha512-y/5PCd+mP4CA/sPDKl2961b+C9d+vPAveS33s6Z3zfASk2j5upL6fXVPZi7ztePZ5CuH+1kW8JtvxgbuXHRa4Q==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "s390x" ], @@ -777,6 +966,15 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { +<<<<<<< HEAD + "@img/sharp-libvips-linux-s390x": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-x64": { + "version": "0.34.5", + "resolved": 
"https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.5.tgz", + "integrity": "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==", +======= "@img/sharp-libvips-linux-s390x": "1.0.4" } }, @@ -784,6 +982,7 @@ "version": "0.33.5", "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.33.5.tgz", "integrity": "sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "x64" ], @@ -799,6 +998,15 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { +<<<<<<< HEAD + "@img/sharp-libvips-linux-x64": "1.2.4" + } + }, + "node_modules/@img/sharp-linuxmusl-arm64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.5.tgz", + "integrity": "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==", +======= "@img/sharp-libvips-linux-x64": "1.0.4" } }, @@ -806,6 +1014,7 @@ "version": "0.33.5", "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.33.5.tgz", "integrity": "sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "arm64" ], @@ -821,6 +1030,15 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { +<<<<<<< HEAD + "@img/sharp-libvips-linuxmusl-arm64": "1.2.4" + } + }, + "node_modules/@img/sharp-linuxmusl-x64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.5.tgz", + "integrity": "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==", +======= "@img/sharp-libvips-linuxmusl-arm64": "1.0.4" } }, @@ -828,6 +1046,7 @@ "version": "0.33.5", "resolved": 
"https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.33.5.tgz", "integrity": "sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "x64" ], @@ -843,6 +1062,15 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { +<<<<<<< HEAD + "@img/sharp-libvips-linuxmusl-x64": "1.2.4" + } + }, + "node_modules/@img/sharp-wasm32": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.5.tgz", + "integrity": "sha512-OdWTEiVkY2PHwqkbBI8frFxQQFekHaSSkUIJkwzclWZe64O1X4UlUjqqqLaPbUpMOQk6FBu/HtlGXNblIs0huw==", +======= "@img/sharp-libvips-linuxmusl-x64": "1.0.4" } }, @@ -850,13 +1078,18 @@ "version": "0.33.5", "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.33.5.tgz", "integrity": "sha512-ykUW4LVGaMcU9lu9thv85CbRMAwfeadCJHRsg2GmeRa/cJxsVY9Rbd57JcMxBkKHag5U/x7TSBpScF4U8ElVzg==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "wasm32" ], "license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT", "optional": true, "dependencies": { +<<<<<<< HEAD + "@emnapi/runtime": "^1.7.0" +======= "@emnapi/runtime": "^1.2.0" +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 }, "engines": { "node": "^18.17.0 || ^20.3.0 || >=21.0.0" @@ -865,10 +1098,36 @@ "url": "https://opencollective.com/libvips" } }, +<<<<<<< HEAD + "node_modules/@img/sharp-win32-arm64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.5.tgz", + "integrity": "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0 AND LGPL-3.0-or-later", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + 
"node_modules/@img/sharp-win32-ia32": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.5.tgz", + "integrity": "sha512-FV9m/7NmeCmSHDD5j4+4pNI8Cp3aW+JvLoXcTUo0IqyjSfAZJ8dIUmijx1qaJsIiU+Hosw6xM5KijAWRJCSgNg==", +======= "node_modules/@img/sharp-win32-ia32": { "version": "0.33.5", "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.33.5.tgz", "integrity": "sha512-T36PblLaTwuVJ/zw/LaH0PdZkRz5rd3SmMHX8GSmR7vtNSP5Z6bQkExdSK7xGWyxLw4sUknBuugTelgw2faBbQ==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "ia32" ], @@ -885,9 +1144,15 @@ } }, "node_modules/@img/sharp-win32-x64": { +<<<<<<< HEAD + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.5.tgz", + "integrity": "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw==", +======= "version": "0.33.5", "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.33.5.tgz", "integrity": "sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg==", +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 "cpu": [ "x64" ], @@ -2285,6 +2550,15 @@ "license": "MIT" }, "node_modules/@types/sharp": { +<<<<<<< HEAD + "version": "0.31.1", + "resolved": "https://registry.npmjs.org/@types/sharp/-/sharp-0.31.1.tgz", + "integrity": "sha512-5nWwamN9ZFHXaYEincMSuza8nNfOof8nmO+mcI+Agx1uMUk4/pQnNIcix+9rLPXzKrm1pS34+6WRDbDV0Jn7ag==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" +======= "version": "0.32.0", "resolved": "https://registry.npmjs.org/@types/sharp/-/sharp-0.32.0.tgz", "integrity": "sha512-OOi3kL+FZDnPhVzsfD37J88FNeZh6gQsGcLc95NbeURRGvmSjeXiDcyWzF2o3yh/gQAUn2uhh/e+CPCa5nwAxw==", @@ -2293,6 +2567,7 @@ "license": "MIT", "dependencies": { "sharp": "*" +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 } }, "node_modules/@types/uuid": { @@ -6408,6 +6683,17 
@@ "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==" }, "node_modules/sharp": { +<<<<<<< HEAD + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.5.tgz", + "integrity": "sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg==", + "hasInstallScript": true, + "license": "Apache-2.0", + "dependencies": { + "@img/colour": "^1.0.0", + "detect-libc": "^2.1.2", + "semver": "^7.7.3" +======= "version": "0.33.5", "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.33.5.tgz", "integrity": "sha512-haPVm1EkS9pgvHrQ/F3Xy+hgcuMV0Wm9vfIBSiwZ05k+xgb0PkBQpGsAA/oWdDobNaZTH5ppvHtzCFbnSEwHVw==", @@ -6417,6 +6703,7 @@ "color": "^4.2.3", "detect-libc": "^2.0.3", "semver": "^7.6.3" +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 }, "engines": { "node": "^18.17.0 || ^20.3.0 || >=21.0.0" @@ -6425,6 +6712,32 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { +<<<<<<< HEAD + "@img/sharp-darwin-arm64": "0.34.5", + "@img/sharp-darwin-x64": "0.34.5", + "@img/sharp-libvips-darwin-arm64": "1.2.4", + "@img/sharp-libvips-darwin-x64": "1.2.4", + "@img/sharp-libvips-linux-arm": "1.2.4", + "@img/sharp-libvips-linux-arm64": "1.2.4", + "@img/sharp-libvips-linux-ppc64": "1.2.4", + "@img/sharp-libvips-linux-riscv64": "1.2.4", + "@img/sharp-libvips-linux-s390x": "1.2.4", + "@img/sharp-libvips-linux-x64": "1.2.4", + "@img/sharp-libvips-linuxmusl-arm64": "1.2.4", + "@img/sharp-libvips-linuxmusl-x64": "1.2.4", + "@img/sharp-linux-arm": "0.34.5", + "@img/sharp-linux-arm64": "0.34.5", + "@img/sharp-linux-ppc64": "0.34.5", + "@img/sharp-linux-riscv64": "0.34.5", + "@img/sharp-linux-s390x": "0.34.5", + "@img/sharp-linux-x64": "0.34.5", + "@img/sharp-linuxmusl-arm64": "0.34.5", + "@img/sharp-linuxmusl-x64": "0.34.5", + "@img/sharp-wasm32": "0.34.5", + "@img/sharp-win32-arm64": "0.34.5", + "@img/sharp-win32-ia32": "0.34.5", + "@img/sharp-win32-x64": 
"0.34.5" +======= "@img/sharp-darwin-arm64": "0.33.5", "@img/sharp-darwin-x64": "0.33.5", "@img/sharp-libvips-darwin-arm64": "1.0.4", @@ -6444,6 +6757,7 @@ "@img/sharp-wasm32": "0.33.5", "@img/sharp-win32-ia32": "0.33.5", "@img/sharp-win32-x64": "0.33.5" +>>>>>>> a9237931427eb544ef35a83853dd6f4a2e159636 } }, "node_modules/sharp/node_modules/detect-libc": { diff --git a/package.json b/package.json index 5e54f5bc..bd39b1d8 100644 --- a/package.json +++ b/package.json @@ -48,12 +48,11 @@ "ini": "^6.0.0", "minimist": "^1.2.8", "pdf-parse": "^1.1.1", - "sharp": "^0.33.5" + "sharp": "^0.34.5" }, "devDependencies": { "@types/minimist": "^1.2.5", "@types/node": "^22.10.7", - "@types/sharp": "^0.32.0", "@vitest/coverage-v8": "^4.0.13", "@vitest/ui": "^4.0.9", "dependency-cruiser": "^17.3.1", diff --git a/prompts/visual-classification.txt b/prompts/visual-classification.txt new file mode 100644 index 00000000..ff8390ec --- /dev/null +++ b/prompts/visual-classification.txt @@ -0,0 +1,26 @@ +Analyze this image from a technical document. + +Classify it as ONE of: +- diagram: flowcharts, UML, architecture diagrams, state machines, sequence diagrams, dependency graphs +- flowchart: process flows, decision trees, workflow diagrams +- chart: bar charts, line graphs, pie charts, scatter plots, histograms +- table: structured tabular data, matrices +- figure: technical illustrations with labels, annotated diagrams +- skip: photographs, screenshots, decorative images, logos, icons, cover images, AND any of the following: + +MUST classify as "skip": +- Scanned pages or page fragments containing mostly text +- Images that are primarily text with only small graphical elements +- Horizontal or vertical strips/slices of pages +- Images with extreme aspect ratios (very wide and short, or very tall and narrow) +- Low quality or blurry scans +- Pages from OCR-scanned documents + +IMPORTANT: Only classify as diagram/flowchart/chart/table/figure if: +1. 
The image has clear semantic technical meaning +2. The PRIMARY content is the diagram/chart, not surrounding text +3. The image appears to be an intentional figure, not a page scan artifact + +Respond with ONLY a JSON object: +{"type": "", "confidence": <0-1>, "reason": ""} + diff --git a/prompts/visual-description.txt b/prompts/visual-description.txt new file mode 100644 index 00000000..4215cd21 --- /dev/null +++ b/prompts/visual-description.txt @@ -0,0 +1,20 @@ +Describe this diagram from a technical document. + +Focus on the SEMANTIC MEANING, not visual appearance: +1. What system, process, or concept does this diagram represent? +2. What are the key components or entities shown? +3. What relationships or flows are depicted? +4. What technical concepts does this illustrate? + +Provide: +1. A concise description (2-4 sentences) capturing the semantic meaning +2. Classification as: diagram, flowchart, chart, table, or figure +3. Key technical concepts illustrated (3-8 concepts) + +Respond with ONLY a JSON object: +{ + "description": "", + "type": "", + "concepts": ["concept1", "concept2", ...] +} + diff --git a/scripts/add-visuals-table.ts b/scripts/add-visuals-table.ts new file mode 100644 index 00000000..dbca55c3 --- /dev/null +++ b/scripts/add-visuals-table.ts @@ -0,0 +1,179 @@ +/** + * Migration script to add visuals table to existing database + * + * This script safely augments a production database by: + * 1. Creating the `visuals` table with proper schema + * 2. 
Creating the `images/` directory for storing extracted diagrams + * + * **Non-destructive:** Does NOT modify existing tables (catalog, chunks, concepts, categories) + * + * Usage: + * npx tsx scripts/add-visuals-table.ts [--dbpath ] + * + * Options: + * --dbpath Path to database directory (default: ~/.concept_rag) + * --force Recreate visuals table if it already exists + * + * Examples: + * npx tsx scripts/add-visuals-table.ts + * npx tsx scripts/add-visuals-table.ts --dbpath /path/to/db + * npx tsx scripts/add-visuals-table.ts --force + */ + +import * as lancedb from '@lancedb/lancedb'; +import * as path from 'path'; +import * as os from 'os'; +import * as fs from 'fs'; +import minimist from 'minimist'; + +// Parse command line arguments +const args = minimist(process.argv.slice(2)); +const dbPath = args.dbpath || path.join(os.homedir(), '.concept_rag'); +const force = args.force || false; + +/** + * Create an empty row with proper schema for the visuals table. + * LanceDB infers schema from the first row inserted. + * + * Note: + * - LanceDB prefers regular number arrays for vectors, not Float32Array. + * - Empty arrays cannot be used for type inference, so we use [0] placeholder. 
+ */ +function createSchemaRow(): Record { + // Create a 384-dim zero vector as a regular array + const zeroVector = new Array(384).fill(0); + + return { + id: 0, + catalog_id: 0, + catalog_title: '', + image_path: '', + description: '', + vector: zeroVector, + visual_type: 'diagram', + page_number: 0, + bounding_box: '', + // Use [0] placeholder for type inference (will be deleted) + concept_ids: [0], + concept_names: [''], + chunk_ids: [0] + }; +} + +async function migrate() { + console.log('🎨 Diagram Awareness Migration'); + console.log('================================\n'); + + // Verify database exists + if (!fs.existsSync(dbPath)) { + console.error(`❌ Database not found at: ${dbPath}`); + console.error(' Run seeding first to create the database.'); + process.exit(1); + } + + console.log(`📦 Connecting to database: ${dbPath}`); + const db = await lancedb.connect(dbPath); + + // List existing tables + const existingTables = await db.tableNames(); + console.log(`✅ Existing tables: ${existingTables.join(', ')}`); + + // Verify core tables exist + const requiredTables = ['catalog', 'chunks', 'concepts', 'categories']; + const missingTables = requiredTables.filter(t => !existingTables.includes(t)); + + if (missingTables.length > 0) { + console.error(`\n❌ Missing required tables: ${missingTables.join(', ')}`); + console.error(' This database appears incomplete. Run seeding first.'); + process.exit(1); + } + + // Check if visuals table already exists + if (existingTables.includes('visuals')) { + if (force) { + console.log('\n⚠️ Visuals table exists. 
--force specified, dropping and recreating...'); + await db.dropTable('visuals'); + } else { + console.log('\n✅ Visuals table already exists.'); + console.log(' Use --force to drop and recreate.'); + + // Show current stats + const visuals = await db.openTable('visuals'); + const count = await visuals.countRows(); + console.log(` Current row count: ${count}`); + + // Verify images directory + const imagesDir = path.join(dbPath, 'images'); + if (fs.existsSync(imagesDir)) { + console.log(` Images directory exists: ${imagesDir}`); + } + + process.exit(0); + } + } + + // Create images directory + const imagesDir = path.join(dbPath, 'images'); + console.log(`\n📁 Creating images directory: ${imagesDir}`); + + if (!fs.existsSync(imagesDir)) { + fs.mkdirSync(imagesDir, { recursive: true }); + console.log(' ✅ Created'); + } else { + console.log(' ✅ Already exists'); + } + + // Create visuals table with schema + console.log('\n📊 Creating visuals table...'); + + // Create with schema row, then delete it + const schemaRow = createSchemaRow(); + const visualsTable = await db.createTable('visuals', [schemaRow]); + + // Delete the schema row (id = 0) + await visualsTable.delete('id = 0'); + + console.log(' ✅ Visuals table created'); + + // Verify schema + const schema = await visualsTable.schema(); + console.log('\n📋 Table schema:'); + for (const field of schema.fields) { + console.log(` - ${field.name}: ${field.type}`); + } + + // Final stats + console.log('\n================================'); + console.log('✅ Migration complete!\n'); + + console.log('📊 Database summary:'); + for (const tableName of [...requiredTables, 'visuals']) { + const table = await db.openTable(tableName); + const count = await table.countRows(); + const marker = tableName === 'visuals' ? 
' ★ NEW' : ''; + console.log(` ${tableName}: ${count} rows${marker}`); + } + + console.log('\n📁 Storage structure:'); + console.log(` ${dbPath}/`); + console.log(' ├── catalog.lance/'); + console.log(' ├── chunks.lance/'); + console.log(' ├── concepts.lance/'); + console.log(' ├── categories.lance/'); + console.log(' ├── visuals.lance/ ★ NEW'); + console.log(' └── images/ ★ NEW'); + + console.log('\n🎯 Next steps:'); + console.log(' 1. Run extract-visuals.ts to extract diagrams from documents'); + console.log(' 2. Run describe-visuals.ts to generate semantic descriptions'); +} + +migrate().catch(err => { + console.error('\n❌ Migration failed:', err.message); + if (err.stack) { + console.error('\nStack trace:'); + console.error(err.stack); + } + process.exit(1); +}); + diff --git a/scripts/describe-visuals.ts b/scripts/describe-visuals.ts new file mode 100644 index 00000000..fe29df30 --- /dev/null +++ b/scripts/describe-visuals.ts @@ -0,0 +1,366 @@ +/** + * Describe Visuals Script + * + * Generates semantic descriptions for extracted visuals using Vision LLM. 
+ * Updates the visuals table with: + * - Semantic description + * - Updated embeddings + * - Extracted concepts + * - Linked chunk IDs + * + * Usage: + * npx tsx scripts/describe-visuals.ts [options] + * + * Options: + * --dbpath Database path (default: ~/.concept_rag) + * --catalog-id Describe visuals for specific catalog ID + * --limit Limit number of visuals to process + * --redescribe Re-describe visuals that already have descriptions + * --model Vision model to use (default: anthropic/claude-sonnet-4) + * --dry-run Show what would be processed without calling API + * --cleanup Remove stale visual records with missing image files + * + * Examples: + * npx tsx scripts/describe-visuals.ts + * npx tsx scripts/describe-visuals.ts --catalog-id 12345678 + * npx tsx scripts/describe-visuals.ts --redescribe --limit 10 + */ + +import * as lancedb from '@lancedb/lancedb'; +import * as path from 'path'; +import * as os from 'os'; +import * as fs from 'fs'; +import minimist from 'minimist'; +import { createVisionLLMService } from '../src/infrastructure/visual-extraction/vision-llm-service.js'; +import { SimpleEmbeddingService } from '../src/infrastructure/embeddings/simple-embedding-service.js'; +import { hashToId } from '../src/infrastructure/utils/hash.js'; + +// Parse command line arguments +const args = minimist(process.argv.slice(2)); +const dbPath = args.dbpath || path.join(os.homedir(), '.concept_rag'); +const catalogIdFilter = args['catalog-id'] ? parseInt(args['catalog-id'], 10) : undefined; +const limit = args.limit ? parseInt(args.limit, 10) : undefined; +const redescribe = args.redescribe || false; +const visionModel = args.model as string | undefined; +const dryRun = args['dry-run'] || false; +const cleanupStale = args.cleanup || false; + +// Rate limiting: Vision API calls per second +const RATE_LIMIT_DELAY_MS = 2000; + +/** + * Sleep for a specified number of milliseconds. 
+ */ +function sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +/** + * Extract simple concepts from a description. + * Uses keyword extraction for MVP - can be enhanced with LLM later. + */ +function extractConceptsFromDescription(description: string): string[] { + // Common technical terms to look for + const technicalPatterns = [ + /dependency injection/gi, + /microservices?/gi, + /architecture/gi, + /design patterns?/gi, + /data flow/gi, + /state machine/gi, + /sequence diagram/gi, + /class diagram/gi, + /flowchart/gi, + /workflow/gi, + /api/gi, + /database/gi, + /components?/gi, + /modules?/gi, + /layers?/gi, + /interfaces?/gi, + /services?/gi, + /controllers?/gi, + /repositories?/gi, + /entities/gi, + /domain/gi, + /infrastructure/gi, + /presentation/gi, + /business logic/gi, + /use cases?/gi, + /clean architecture/gi, + /hexagonal/gi, + /onion/gi, + /mvc/gi, + /mvvm/gi, + /solid/gi, + /dry/gi, + /kiss/gi, + ]; + + const concepts = new Set(); + + for (const pattern of technicalPatterns) { + const matches = description.match(pattern); + if (matches) { + for (const match of matches) { + concepts.add(match.toLowerCase()); + } + } + } + + return Array.from(concepts).slice(0, 10); // Limit to 10 concepts +} + +async function main() { + console.log('📝 Visual Description Generator'); + console.log('================================\n'); + + const apiKey = process.env.OPENROUTER_API_KEY; + if (!apiKey && !dryRun) { + console.error('❌ OPENROUTER_API_KEY environment variable is required'); + console.error(' Get an API key from https://openrouter.ai/'); + process.exit(1); + } + + // Verify database exists + if (!fs.existsSync(dbPath)) { + console.error(`❌ Database not found at: ${dbPath}`); + process.exit(1); + } + + // Connect to database + console.log(`📦 Connecting to database: ${dbPath}`); + const db = await lancedb.connect(dbPath); + + // Verify tables exist + const tables = await db.tableNames(); + if 
(!tables.includes('visuals')) { + console.error('❌ Visuals table not found. Run add-visuals-table.ts first.'); + process.exit(1); + } + if (!tables.includes('concepts')) { + console.error('❌ Concepts table not found.'); + process.exit(1); + } + if (!tables.includes('chunks')) { + console.error('❌ Chunks table not found.'); + process.exit(1); + } + + const visuals = await db.openTable('visuals'); + const concepts = await db.openTable('concepts'); + const chunks = await db.openTable('chunks'); + + // Cleanup stale records if requested + if (cleanupStale) { + console.log('\n🧹 Cleaning up stale visual records...'); + const allVisuals = await visuals.query().limit(100000).toArray(); + let removedCount = 0; + + for (const visual of allVisuals) { + const imagePath = path.join(dbPath, visual.image_path); + if (!fs.existsSync(imagePath)) { + await visuals.delete(`id = ${visual.id}`); + removedCount++; + } + } + + if (removedCount > 0) { + console.log(` Removed ${removedCount} stale records`); + } else { + console.log(' No stale records found'); + } + + const visualCount = await visuals.countRows(); + console.log(` Visuals table now has ${visualCount} rows`); + } + + // Get visuals to process + let visualEntries: any[] = []; + + if (catalogIdFilter) { + const entries = await visuals.query().where(`catalog_id = ${catalogIdFilter}`).toArray(); + visualEntries = entries; + } else { + const allEntries = await visuals.query().limit(10000).toArray(); + visualEntries = allEntries; + } + + // Filter by description status + if (!redescribe) { + visualEntries = visualEntries.filter((v: any) => + !v.description || + v.description.includes('pending description') || + v.description.includes('description unavailable') + ); + } + + if (limit && visualEntries.length > limit) { + visualEntries = visualEntries.slice(0, limit); + } + + console.log(`🖼️ Found ${visualEntries.length} visuals to process`); + + if (visualEntries.length === 0) { + console.log(' No visuals need description.'); + 
process.exit(0); + } + + if (dryRun) { + console.log('\n🔍 Dry run mode - showing what would be processed:\n'); + for (const entry of visualEntries.slice(0, 10)) { + console.log(` 📷 Visual ${entry.id}`); + console.log(` Page: ${entry.page_number}, Type: ${entry.visual_type}`); + console.log(` Image: ${entry.image_path}`); + } + if (visualEntries.length > 10) { + console.log(` ... and ${visualEntries.length - 10} more`); + } + console.log('\n Run without --dry-run to generate descriptions.'); + process.exit(0); + } + + // Create services + const visionService = createVisionLLMService({ + apiKey, + model: visionModel + }); + const embeddingService = new SimpleEmbeddingService(); + + // Build concept name lookup + console.log('\n📚 Loading concept index...'); + const conceptEntries = await concepts.query().limit(100000).toArray(); + const conceptNameToId = new Map(); + for (const c of conceptEntries) { + if (c.name) { + conceptNameToId.set(c.name.toLowerCase(), c.id); + } + } + console.log(` Loaded ${conceptNameToId.size} concepts`); + + // Build chunk lookup by catalog_id and page + console.log('📄 Loading chunk index...'); + const chunkEntries = await chunks.query().limit(100000).toArray(); + const chunksByPage = new Map(); // "catalogId-page" -> chunk IDs + for (const chunk of chunkEntries) { + if (chunk.catalog_id && chunk.page_number) { + const key = `${chunk.catalog_id}-${chunk.page_number}`; + if (!chunksByPage.has(key)) { + chunksByPage.set(key, []); + } + chunksByPage.get(key)!.push(chunk.id); + } + } + console.log(` Indexed chunks for ${chunksByPage.size} pages`); + + let processed = 0; + let errors = 0; + let skippedMissing = 0; + + // Process each visual + for (let i = 0; i < visualEntries.length; i++) { + const visual = visualEntries[i]; + const imagePath = path.join(dbPath, visual.image_path); + + // Check image exists - silently skip missing images (stale records) + if (!fs.existsSync(imagePath)) { + skippedMissing++; + continue; + } + + 
console.log(`\n[${i + 1}/${visualEntries.length}] 📷 Visual ${visual.id}`); + console.log(` Page ${visual.page_number}, Type: ${visual.visual_type}`); + + try { + // Generate description + process.stdout.write(' 🔍 Generating description...'); + const descResult = await visionService.describeVisual(imagePath); + console.log(' ✅'); + + // Extract concepts from description + const extractedConcepts = [ + ...descResult.concepts, + ...extractConceptsFromDescription(descResult.description) + ]; + const uniqueConcepts = [...new Set(extractedConcepts.map(c => c.toLowerCase()))]; + + // Map concept names to IDs + const conceptIds: number[] = []; + const conceptNames: string[] = []; + for (const conceptName of uniqueConcepts) { + const conceptId = conceptNameToId.get(conceptName); + if (conceptId) { + conceptIds.push(conceptId); + conceptNames.push(conceptName); + } + } + + // Find chunks on same page + const pageKey = `${visual.catalog_id}-${visual.page_number}`; + const chunkIds = chunksByPage.get(pageKey) || []; + + // Generate embedding for description + const vector = embeddingService.generateEmbedding(descResult.description); + + // Update visual record + // LanceDB doesn't support update, so we delete and re-add + await visuals.delete(`id = ${visual.id}`); + + await visuals.add([{ + id: visual.id, + catalog_id: visual.catalog_id, + catalog_title: visual.catalog_title, + image_path: visual.image_path, + description: descResult.description, + vector, + visual_type: descResult.type, + page_number: visual.page_number, + bounding_box: visual.bounding_box || '', + concept_ids: conceptIds.length > 0 ? conceptIds : [0], + concept_names: conceptNames.length > 0 ? conceptNames : [''], + chunk_ids: chunkIds.length > 0 ? chunkIds : [0] + }]); + + console.log(` 📝 Description: ${descResult.description.substring(0, 80)}...`); + console.log(` 🏷️ Concepts: ${conceptNames.length > 0 ? 
conceptNames.join(', ') : 'none'}`); + console.log(` 📄 Linked chunks: ${chunkIds.length}`); + + processed++; + + // Rate limiting + if (i < visualEntries.length - 1) { + await sleep(RATE_LIMIT_DELAY_MS); + } + + } catch (error: any) { + console.log(` ❌ Error: ${error.message}`); + errors++; + } + } + + // Final summary + console.log('\n================================'); + console.log('✅ Description generation complete!\n'); + console.log('📊 Summary:'); + console.log(` Visuals processed: ${processed}`); + if (skippedMissing > 0) { + console.log(` Skipped (stale records): ${skippedMissing}`); + } + if (errors > 0) { + console.log(` Errors: ${errors}`); + } + + // Verify visuals table + const visualCount = await visuals.countRows(); + console.log(`\n Visuals table: ${visualCount} rows`); +} + +main().catch(err => { + console.error('\n❌ Description generation failed:', err.message); + if (err.stack) { + console.error('\nStack trace:'); + console.error(err.stack); + } + process.exit(1); +}); + diff --git a/scripts/extract-visuals.ts b/scripts/extract-visuals.ts new file mode 100644 index 00000000..9dae9cf2 --- /dev/null +++ b/scripts/extract-visuals.ts @@ -0,0 +1,339 @@ +/** + * Extract Visuals Script + * + * Extracts diagrams from PDF and EPUB documents in the catalog and stores them + * as grayscale images with metadata in the visuals table. + * + * Uses LOCAL classification model - no API key required for extraction! + * Only diagrams with semantic meaning are stored: + * - Flowcharts, UML, architecture diagrams + * - Charts and graphs + * - Tables + * - Technical figures + * + * Photos, screenshots, and decorative images are filtered out. 
+ * + * Supported formats: + * - PDF: Native and scanned documents + * - EPUB: Electronic book format with embedded images + * + * Usage: + * npx tsx scripts/extract-visuals.ts [options] + * + * Options: + * --dbpath Database path (default: ~/.concept_rag) + * --source Extract from specific document (partial match on title) + * --catalog-id Extract from specific catalog ID + * --limit Limit number of documents to process + * --dpi Rendering DPI for PDFs (default: 150) + * --dry-run Show what would be extracted without saving + * --resume Skip documents that already have visuals in the database + * --force-type Force document type: native, scanned, or mixed (PDF only) + * --min-score Minimum classification score (0-1, default: 0.5) + * + * Examples: + * npx tsx scripts/extract-visuals.ts + * npx tsx scripts/extract-visuals.ts --source "Clean Architecture" + * npx tsx scripts/extract-visuals.ts --source "Design It" # EPUB + * npx tsx scripts/extract-visuals.ts --catalog-id 12345678 + * npx tsx scripts/extract-visuals.ts --limit 5 --dry-run + * npx tsx scripts/extract-visuals.ts --force-type scanned + * + * Prerequisites: + * - poppler-utils (pdftoppm, pdfimages) - for PDF processing + * - Python 3.8+ with LayoutParser (run: cd scripts/python && ./setup.sh) + */ + +import * as lancedb from '@lancedb/lancedb'; +import * as path from 'path'; +import * as os from 'os'; +import * as fs from 'fs'; +import minimist from 'minimist'; +import { VisualExtractor } from '../src/infrastructure/visual-extraction/visual-extractor.js'; +import { isPdfToolsAvailable } from '../src/infrastructure/visual-extraction/pdf-page-renderer.js'; +import { isLocalClassifierAvailable } from '../src/infrastructure/visual-extraction/local-classifier.js'; +import { hashToId } from '../src/infrastructure/utils/hash.js'; +import { serializeBoundingBox } from '../src/domain/models/visual.js'; +import { SimpleEmbeddingService } from '../src/infrastructure/embeddings/simple-embedding-service.js'; +import 
type { DocumentType } from '../src/infrastructure/visual-extraction/document-analyzer.js'; + +// Parse command line arguments +const args = minimist(process.argv.slice(2)); +const dbPath = args.dbpath || path.join(os.homedir(), '.concept_rag'); +const sourceFilter = args.source as string | undefined; +const catalogIdFilter = args['catalog-id'] ? parseInt(args['catalog-id'], 10) : undefined; +const limit = args.limit ? parseInt(args.limit, 10) : undefined; +const renderDpi = args.dpi ? parseInt(args.dpi, 10) : 150; +const dryRun = args['dry-run'] || false; +const resumeMode = args.resume || false; +const forceType = args['force-type'] as DocumentType | undefined; +const minScore = args['min-score'] ? parseFloat(args['min-score']) : 0.5; + +async function main() { + console.log('🖼️ Visual Extraction (Local Classification)'); + console.log('=============================================\n'); + + // Check prerequisites + if (!isPdfToolsAvailable()) { + console.error('❌ PDF tools not found. Install poppler-utils:'); + console.error(' Ubuntu/Debian: sudo apt install poppler-utils'); + console.error(' macOS: brew install poppler'); + process.exit(1); + } + + // Check local classifier (warn but don't fail - native PDFs work without it) + const hasLocalClassifier = isLocalClassifierAvailable(); + if (!hasLocalClassifier) { + console.log('⚠️ Local classifier not available (scanned PDFs may not work)'); + console.log(' To enable: cd scripts/python && ./setup.sh\n'); + } else { + console.log('✅ Local classifier available (no API key needed)\n'); + } + + // Verify database exists + if (!fs.existsSync(dbPath)) { + console.error(`❌ Database not found at: ${dbPath}`); + process.exit(1); + } + + // Connect to database + console.log(`📦 Connecting to database: ${dbPath}`); + const db = await lancedb.connect(dbPath); + + // Verify tables exist + const tables = await db.tableNames(); + if (!tables.includes('catalog')) { + console.error('❌ Catalog table not found'); + process.exit(1); + 
} + if (!tables.includes('visuals')) { + console.error('❌ Visuals table not found. Run add-visuals-table.ts first.'); + process.exit(1); + } + + const catalog = await db.openTable('catalog'); + const visuals = await db.openTable('visuals'); + + // Get catalog entries to process + let catalogEntries: any[] = []; + + if (catalogIdFilter) { + const entries = await catalog.query().where(`id = ${catalogIdFilter}`).toArray(); + catalogEntries = entries; + } else { + const allEntries = await catalog.query().limit(10000).toArray(); + + if (sourceFilter) { + const filterLower = sourceFilter.toLowerCase(); + catalogEntries = allEntries.filter((e: any) => + (e.title || '').toLowerCase().includes(filterLower) || + (e.source || '').toLowerCase().includes(filterLower) + ); + } else { + catalogEntries = allEntries; + } + } + + if (limit && catalogEntries.length > limit) { + catalogEntries = catalogEntries.slice(0, limit); + } + + // In resume mode, filter out documents that already have visuals + let skippedCount = 0; + if (resumeMode) { + console.log('🔄 Resume mode: checking for already-processed documents...'); + const existingVisuals = await visuals.query().select(['catalog_id']).limit(100000).toArray(); + const processedCatalogIds = new Set(existingVisuals.map((v: any) => v.catalog_id)); + + const originalCount = catalogEntries.length; + catalogEntries = catalogEntries.filter((e: any) => !processedCatalogIds.has(e.id)); + skippedCount = originalCount - catalogEntries.length; + + if (skippedCount > 0) { + console.log(` ⏭️ Skipping ${skippedCount} documents with existing visuals`); + } + } + + console.log(`📚 Found ${catalogEntries.length} documents to process`); + + if (catalogEntries.length === 0) { + console.log(' No documents matched the filter criteria.'); + if (resumeMode && skippedCount > 0) { + console.log(` (${skippedCount} documents already have visuals)`); + } + process.exit(0); + } + + if (dryRun) { + console.log('\n🔍 Dry run mode - showing what would be 
processed:\n'); + for (const entry of catalogEntries) { + console.log(` 📄 ${entry.title || 'Untitled'}`); + console.log(` Source: ${entry.source || 'Unknown'}`); + console.log(` ID: ${entry.id}`); + } + console.log('\n Run without --dry-run to extract visuals.'); + process.exit(0); + } + + // Create extractor and embedding service + const extractor = new VisualExtractor(dbPath, { + config: { renderDpi } + }); + const embeddingService = new SimpleEmbeddingService(); + + let totalVisuals = 0; + let totalFiltered = 0; + let totalPreFiltered = 0; + let totalErrors = 0; + let nativeCount = 0; + let scannedCount = 0; + let epubCount = 0; + + // Process each document + for (let i = 0; i < catalogEntries.length; i++) { + const entry = catalogEntries[i]; + const title = entry.title || 'Untitled'; + const source = entry.source || ''; + const catalogId = entry.id; + + console.log(`\n[${i + 1}/${catalogEntries.length}] 📄 ${title}`); + + // Check if source file exists and is a supported format (PDF or EPUB) + const ext = source ? source.toLowerCase().slice(source.lastIndexOf('.')) : ''; + const supportedFormats = ['.pdf', '.epub']; + + if (!source || !supportedFormats.includes(ext)) { + console.log(` ⏭️ Skipping (unsupported format: ${ext || 'no extension'})`); + continue; + } + + if (!fs.existsSync(source)) { + console.log(` ⚠️ Source file not found: ${source}`); + continue; + } + + // For PDF-only checks + const isPdf = ext === '.pdf'; + const isEpub = ext === '.epub'; + + // Build document info for intuitive folder naming + const documentInfo = { + title, + author: entry.author || undefined, + year: entry.year || undefined, + id: catalogId + }; + + // Extract visuals using unified extract() method + const result = await extractor.extract(source, catalogId, documentInfo, { + forceDocumentType: isPdf ? 
forceType : undefined, // Force type only applies to PDFs + minClassificationScore: minScore, + onProgress: (stage, current, total, message) => { + const stageIcon = stage === 'rendering' ? '📷' : + stage === 'classifying' ? '🔍' : + stage === 'extracting' ? '✂️' : '🏷️'; + process.stdout.write(`\r ${stageIcon} ${stage}: ${current}/${total} ${message || ''}`.padEnd(80)); + } + }); + + // Clear progress line + process.stdout.write('\r' + ' '.repeat(80) + '\r'); + + // Track document types + if (result.documentFormat === 'epub') { + epubCount++; + } else if (result.documentType === 'scanned') { + scannedCount++; + } else { + nativeCount++; + } + + // Report results + const formatLabel = result.documentFormat === 'epub' ? 'epub' : result.documentType; + console.log(` 📁 Folder: ${result.folderSlug} (${formatLabel})`); + const filterSummary = result.imagesPreFiltered > 0 + ? `Pre-filtered: ${result.imagesPreFiltered} page-sized, Classified: ${result.imagesFiltered} skip` + : `Filtered: ${result.imagesFiltered} non-semantic`; + console.log(` ✅ Extracted: ${result.visuals.length} visuals, ${filterSummary}`); + + if (result.errors.length > 0) { + console.log(` ⚠️ Errors: ${result.errors.length}`); + for (const error of result.errors.slice(0, 3)) { + console.log(` - ${error}`); + } + if (result.errors.length > 3) { + console.log(` ... 
and ${result.errors.length - 3} more`); + } + } + + // Add visuals to database + for (const visual of result.visuals) { + // Generate ID + const visualId = hashToId(`${catalogId}-${visual.pageNumber}-${visual.visualIndex}`); + + // Create placeholder description (will be filled by describe-visuals.ts) + const description = `Visual on page ${visual.pageNumber} (pending description)`; + const vector = embeddingService.generateEmbedding(description); + + const visualRecord = { + id: visualId, + catalog_id: catalogId, + catalog_title: title, + image_path: visual.imagePath, + description, + vector, + visual_type: visual.type, + page_number: visual.pageNumber, + bounding_box: serializeBoundingBox(visual.boundingBox), + concept_ids: [0], // Placeholder + concept_names: [''], // Placeholder + chunk_ids: [0] // Placeholder - will be linked later + }; + + try { + await visuals.add([visualRecord]); + } catch (addError: any) { + console.log(` ⚠️ Failed to add visual: ${addError.message}`); + totalErrors++; + } + } + + totalVisuals += result.visuals.length; + totalFiltered += result.imagesFiltered; + totalPreFiltered += result.imagesPreFiltered; + totalErrors += result.errors.length; + } + + // Final summary + console.log('\n============================================='); + console.log('✅ Extraction complete!\n'); + console.log('📊 Summary:'); + console.log(` Documents processed: ${catalogEntries.length}`); + console.log(` Formats: ${nativeCount} PDF native, ${scannedCount} PDF scanned, ${epubCount} EPUB`); + console.log(` Visuals extracted: ${totalVisuals}`); + if (totalPreFiltered > 0) { + console.log(` Page-sized images pre-filtered: ${totalPreFiltered}`); + } + console.log(` Non-semantic filtered: ${totalFiltered}`); + console.log(` API calls made: 0 (local classification)`); + if (totalErrors > 0) { + console.log(` Errors: ${totalErrors}`); + } + + // Verify visuals table + const visualCount = await visuals.countRows(); + console.log(`\n Visuals table: ${visualCount} 
rows`);
+
+  console.log('\n🎯 Next steps:');
+  console.log('   Run describe-visuals.ts to generate semantic descriptions');
+  console.log('   (This step requires OPENROUTER_API_KEY)');
+}
+
+main().catch(err => {
+  console.error('\n❌ Extraction failed:', err.message);
+  if (err.stack) {
+    console.error('\nStack trace:');
+    console.error(err.stack);
+  }
+  process.exit(1);
+});
diff --git a/scripts/python/classify_visual.py b/scripts/python/classify_visual.py
new file mode 100644
index 00000000..89571025
--- /dev/null
+++ b/scripts/python/classify_visual.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+"""
+Classify images using LayoutParser local model.
+
+Supports two modes:
+1. CLASSIFY: Is this image a diagram/table/skip? (for native PDF images)
+2. DETECT: Find diagram regions within a page image (for scanned PDFs)
+
+Usage:
+    # Classify a single image (native PDF)
+    python classify_visual.py classify <image_path> [--min-score 0.5]
+
+    # Detect regions in a page image (scanned PDF)
+    python classify_visual.py detect <image_path> [--min-score 0.5]
+
+Output:
+    JSON with classification result or detected regions
+"""
+
+import sys
+import json
+import argparse
+import os
+
+# Suppress torch warnings
+os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
+import warnings
+warnings.filterwarnings('ignore', category=UserWarning)
+
+try:
+    import layoutparser as lp
+    from PIL import Image
+    LAYOUTPARSER_AVAILABLE = True
+except ImportError:
+    LAYOUTPARSER_AVAILABLE = False
+
+# Load pre-trained model (cached after first load)
+MODEL = None
+
+def get_model():
+    """Get or initialize the LayoutParser model."""
+    global MODEL
+    if MODEL is None:
+        if not LAYOUTPARSER_AVAILABLE:
+            raise RuntimeError(
+                "LayoutParser not installed. 
Run:\n" + " cd scripts/python && python -m venv venv && source venv/bin/activate\n" + " pip install -r requirements.txt\n" + " pip install 'git+https://github.com/facebookresearch/detectron2.git'" + ) + + # PubLayNet model - trained on 330k+ scientific documents + # Detects: Text, Title, List, Table, Figure + + # Check for local model weights to avoid Dropbox URL parsing issues + import os + home = os.path.expanduser("~") + local_weights = os.path.join(home, ".torch/iopath_cache/s/dgy9c10wykk4lq4/model_final.pth") + local_config = os.path.join(home, ".torch/iopath_cache/s/f3b12qc4hc0yh4m/config.yml") + + if os.path.exists(local_weights) and os.path.exists(local_config): + # Use local files directly + MODEL = lp.Detectron2LayoutModel( + config_path=local_config, + model_path=local_weights, + extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.3], + label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"} + ) + else: + # Fall back to LayoutParser's default download + MODEL = lp.Detectron2LayoutModel( + config_path='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config', + extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.3], + label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"} + ) + return MODEL + + +def classify_image(image_path: str, min_score: float = 0.5) -> dict: + """ + Classify a single image (from pdfimages extraction). + + Determines if the image is primarily a Figure or Table. + Returns the dominant element type, or 'skip' if no figure/table detected. 
+ + Args: + image_path: Path to the image file + min_score: Minimum confidence score (0-1) + + Returns: + dict with keys: type, score, skip + """ + image = Image.open(image_path).convert("RGB") + model = get_model() + + layout = model.detect(image) + + # Find the largest/highest-confidence figure or table + best_match = None + best_score = 0 + image_area = image.width * image.height + + for block in layout: + if block.score >= min_score and block.type in ["Figure", "Table"]: + # Score combines confidence and relative area + block_area = block.block.width * block.block.height + combined_score = block.score * (block_area / image_area) + + if combined_score > best_score: + best_score = combined_score + best_match = block + + if best_match: + # Map to visual types used by concept-rag + visual_type = "figure" if best_match.type == "Figure" else "table" + return { + "type": visual_type, + "score": round(best_match.score, 3), + "skip": False + } + else: + return { + "type": "skip", + "score": 0, + "skip": True + } + + +def detect_regions(image_path: str, min_score: float = 0.5) -> list: + """ + Detect all figure/table regions in a page image (for scanned PDFs). + + Returns bounding boxes for each detected region that can be cropped. 
+ + Args: + image_path: Path to the page image + min_score: Minimum confidence score (0-1) + + Returns: + List of dicts with keys: type, score, bbox + """ + image = Image.open(image_path).convert("RGB") + model = get_model() + + layout = model.detect(image) + + results = [] + for block in layout: + if block.score >= min_score and block.type in ["Figure", "Table"]: + # Map to visual types used by concept-rag + visual_type = "figure" if block.type == "Figure" else "table" + + results.append({ + "type": visual_type, + "score": round(block.score, 3), + "bbox": { + "x": int(block.block.x_1), + "y": int(block.block.y_1), + "width": int(block.block.width), + "height": int(block.block.height) + } + }) + + # Sort by position (top to bottom, left to right) + results.sort(key=lambda r: (r["bbox"]["y"], r["bbox"]["x"])) + + return results + + +def main(): + parser = argparse.ArgumentParser( + description="Classify document images using local layout detection model" + ) + parser.add_argument( + "mode", + choices=["classify", "detect"], + help="classify: single image classification, detect: find regions in page" + ) + parser.add_argument( + "image_path", + help="Path to image file" + ) + parser.add_argument( + "--min-score", + type=float, + default=0.5, + help="Minimum confidence score (0-1, default: 0.5)" + ) + + args = parser.parse_args() + + # Verify image exists + if not os.path.exists(args.image_path): + print(json.dumps({"error": f"Image not found: {args.image_path}"})) + sys.exit(1) + + try: + if args.mode == "classify": + result = classify_image(args.image_path, args.min_score) + else: + result = detect_regions(args.image_path, args.min_score) + + print(json.dumps(result)) + except Exception as e: + print(json.dumps({"error": str(e)})) + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/scripts/python/requirements.txt b/scripts/python/requirements.txt new file mode 100644 index 00000000..4bb8678e --- /dev/null +++ b/scripts/python/requirements.txt @@ 
-0,0 +1,10 @@ +# Layout detection dependencies +layoutparser==0.3.4 +torch>=2.0.0 +torchvision>=0.15.0 +Pillow>=9.0.0 +opencv-python>=4.8.0 + +# Detectron2 must be installed separately: +# pip install 'git+https://github.com/facebookresearch/detectron2.git' + diff --git a/scripts/python/setup.sh b/scripts/python/setup.sh new file mode 100755 index 00000000..184c1b6a --- /dev/null +++ b/scripts/python/setup.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Setup script for Python layout detection environment + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +echo "🐍 Setting up Python environment for layout detection..." + +# Check Python version +PYTHON_VERSION=$(python3 --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2) +echo " Python version: $PYTHON_VERSION" + +# Create virtual environment if it doesn't exist +if [ ! -d "venv" ]; then + echo "📦 Creating virtual environment..." + python3 -m venv venv +fi + +# Activate virtual environment +source venv/bin/activate + +# Upgrade pip +echo "📥 Upgrading pip..." +pip install --upgrade pip + +# Install requirements +echo "📥 Installing requirements..." +pip install -r requirements.txt + +# Install Detectron2 +echo "📥 Installing Detectron2 (this may take a few minutes)..." +pip install 'git+https://github.com/facebookresearch/detectron2.git' + +# Verify installation +echo "✅ Verifying installation..." +python -c "import layoutparser as lp; print(' LayoutParser:', lp.__version__)" +python -c "import detectron2; print(' Detectron2: installed')" + +echo "" +echo "✅ Setup complete!" 
+echo "" +echo "To use the classifier:" +echo " source scripts/python/venv/bin/activate" +echo " python scripts/python/classify_visual.py classify " +echo "" +echo "Or from TypeScript (auto-detects venv):" +echo " import { classifyImage } from './local-classifier.js'" + diff --git a/scripts/seed-test-visuals.ts b/scripts/seed-test-visuals.ts new file mode 100644 index 00000000..716822e9 --- /dev/null +++ b/scripts/seed-test-visuals.ts @@ -0,0 +1,236 @@ +/** + * Seed Test Visuals Script + * + * Populates the test database with sample visual data for testing + * the get_visuals MCP tool and visual enrichment features. + * + * Usage: + * npx tsx scripts/seed-test-visuals.ts + */ + +import * as lancedb from '@lancedb/lancedb'; +import * as path from 'path'; +import * as fs from 'fs'; +import { SimpleEmbeddingService } from '../src/infrastructure/embeddings/simple-embedding-service.js'; + +const TEST_DB_PATH = path.join(process.cwd(), 'db/test'); +const IMAGES_DIR = path.join(TEST_DB_PATH, 'images'); + +// Sample visuals to create - linked to actual catalog entries and concepts +const SAMPLE_VISUALS = [ + { + catalogId: 3155035939, // 1-s2.0-S2096720925000132-main + catalogTitle: 'Blockchain Interoperability Survey', + description: 'Architecture diagram showing the layered blockchain interoperability stack with cross-chain communication protocols, consensus mechanisms, and transaction routing components.', + visualType: 'diagram', + pageNumber: 5, + concepts: ['blockchain', 'interoperability', 'cross-chain', 'consensus', 'architecture'] + }, + { + catalogId: 495016259, // 1711.03936v2 + catalogTitle: 'Deep Learning Paper', + description: 'Neural network architecture flowchart depicting the forward propagation through convolutional layers, pooling operations, and fully connected layers for image classification.', + visualType: 'flowchart', + pageNumber: 3, + concepts: ['neural network', 'deep learning', 'convolutional', 'architecture'] + }, + { + catalogId: 3213084581, 
// 2006.15918v1 + catalogTitle: 'Distributed Systems Research', + description: 'Sequence diagram illustrating the consensus protocol message flow between distributed nodes, showing propose, prepare, commit, and acknowledge phases.', + visualType: 'diagram', + pageNumber: 8, + concepts: ['distributed systems', 'consensus protocol', 'message passing'] + }, + { + catalogId: 3974015912, // 2204.11193v1 + catalogTitle: 'Machine Learning Framework', + description: 'Performance comparison bar chart showing training time, inference latency, and memory usage across different model architectures and hardware configurations.', + visualType: 'chart', + pageNumber: 12, + concepts: ['performance', 'machine learning', 'benchmark', 'optimization'] + }, + { + catalogId: 4104765478, // 2302.12125v2 + catalogTitle: 'Smart Contract Security', + description: 'State machine diagram representing smart contract lifecycle states including deployed, active, paused, and terminated with transition conditions.', + visualType: 'diagram', + pageNumber: 6, + concepts: ['smart contract', 'state machine', 'security', 'lifecycle'] + }, + { + catalogId: 2697195125, // 2303.10844v2 + catalogTitle: 'Cryptographic Protocols', + description: 'Table comparing cryptographic hash functions including SHA-256, SHA-3, and BLAKE2 across security level, performance, and use cases.', + visualType: 'table', + pageNumber: 4, + concepts: ['cryptography', 'hash function', 'security'] + }, + { + catalogId: 2157974058, // 2993600.2993611 + catalogTitle: 'API Design Patterns', + description: 'UML class diagram showing the repository pattern implementation with interfaces, concrete implementations, and dependency injection relationships.', + visualType: 'diagram', + pageNumber: 7, + concepts: ['design patterns', 'repository pattern', 'dependency injection', 'uml'] + }, + { + catalogId: 837451997, // 3696429 + catalogTitle: 'Database Systems', + description: 'Entity-relationship diagram showing database schema with users, 
transactions, blocks, and smart contracts entities and their relationships.', + visualType: 'figure', + pageNumber: 10, + concepts: ['database', 'entity relationship', 'schema', 'data modeling'] + } +]; + +// Simple hash function for generating IDs +function hashToId(input: string): number { + let hash = 0; + for (let i = 0; i < input.length; i++) { + const char = input.charCodeAt(i); + hash = ((hash << 5) - hash) + char; + hash = hash & hash; + } + return Math.abs(hash); +} + +async function main() { + console.log('🎨 Seeding Test Visuals'); + console.log('========================\n'); + + // Verify database exists + if (!fs.existsSync(TEST_DB_PATH)) { + console.error(`❌ Test database not found at: ${TEST_DB_PATH}`); + process.exit(1); + } + + // Connect to database + console.log(`📦 Connecting to database: ${TEST_DB_PATH}`); + const db = await lancedb.connect(TEST_DB_PATH); + + // Verify tables exist + const tables = await db.tableNames(); + if (!tables.includes('visuals')) { + console.error('❌ Visuals table not found. 
Run add-visuals-table.ts first.');
+    process.exit(1);
+  }
+
+  const visuals = await db.openTable('visuals');
+  const concepts = await db.openTable('concepts');
+  const chunks = await db.openTable('chunks');
+
+  // Build concept name to ID lookup
+  console.log('📚 Building concept index...');
+  const conceptEntries = await concepts.query().limit(10000).toArray();
+  const conceptNameToId = new Map<string, number>();
+  for (const c of conceptEntries) {
+    if (c.name) {
+      conceptNameToId.set(c.name.toLowerCase(), c.id);
+    }
+  }
+  console.log(`   Found ${conceptNameToId.size} concepts`);
+
+  // Build chunk lookup by catalog_id
+  console.log('📄 Building chunk index...');
+  const chunkEntries = await chunks.query().limit(10000).toArray();
+  const chunksByCatalog = new Map<number, number[]>();
+  for (const chunk of chunkEntries) {
+    if (chunk.catalog_id) {
+      if (!chunksByCatalog.has(chunk.catalog_id)) {
+        chunksByCatalog.set(chunk.catalog_id, []);
+      }
+      chunksByCatalog.get(chunk.catalog_id)!.push(chunk.id);
+    }
+  }
+  console.log(`   Indexed chunks for ${chunksByCatalog.size} documents`);
+
+  // Create embedding service
+  const embeddingService = new SimpleEmbeddingService();
+
+  // Ensure images directory exists
+  if (!fs.existsSync(IMAGES_DIR)) {
+    fs.mkdirSync(IMAGES_DIR, { recursive: true });
+  }
+
+  // Clear existing visuals
+  const existingCount = await visuals.countRows();
+  if (existingCount > 0) {
+    console.log(`\n🗑️ Clearing ${existingCount} existing visuals...`);
+    // Delete all by querying all IDs and deleting
+    const existing = await visuals.query().limit(10000).toArray();
+    for (const v of existing) {
+      await visuals.delete(`id = ${v.id}`);
+    }
+  }
+
+  console.log('\n📷 Creating sample visuals...\n');
+
+  const visualRows: any[] = [];
+
+  for (const sample of SAMPLE_VISUALS) {
+    // Generate unique ID
+    const id = hashToId(`${sample.catalogId}-${sample.pageNumber}-${sample.visualType}`);
+
+    // Map concept names to IDs
+    const conceptIds: number[] = [];
+    const conceptNames: string[] = [];
+    for 
(const conceptName of sample.concepts) { + const conceptId = conceptNameToId.get(conceptName.toLowerCase()); + if (conceptId) { + conceptIds.push(conceptId); + conceptNames.push(conceptName); + } else { + // Include concept name even if not in DB + conceptNames.push(conceptName); + } + } + + // Get chunk IDs for this catalog + const chunkIds = chunksByCatalog.get(sample.catalogId)?.slice(0, 5) || []; + + // Generate embedding for description + const vector = embeddingService.generateEmbedding(sample.description); + + // Create placeholder image path (we won't create actual images for tests) + const imagePath = `images/${sample.catalogId}/p${sample.pageNumber}_v1.png`; + + console.log(` ✅ ${sample.visualType}: "${sample.description.substring(0, 50)}..."`); + console.log(` Concepts: ${conceptNames.join(', ')}`); + console.log(` Chunks linked: ${chunkIds.length}`); + + visualRows.push({ + id, + catalog_id: sample.catalogId, + catalog_title: sample.catalogTitle, + image_path: imagePath, + description: sample.description, + vector, + visual_type: sample.visualType, + page_number: sample.pageNumber, + bounding_box: JSON.stringify({ x: 50, y: 100, width: 400, height: 300 }), + concept_ids: conceptIds.length > 0 ? conceptIds : [0], + concept_names: conceptNames.length > 0 ? conceptNames : [''], + chunk_ids: chunkIds.length > 0 ? 
chunkIds : [0] + }); + } + + // Add all visuals + await visuals.add(visualRows); + + // Verify + const finalCount = await visuals.countRows(); + + console.log('\n========================'); + console.log('✅ Seeding complete!\n'); + console.log('📊 Summary:'); + console.log(` Visuals added: ${visualRows.length}`); + console.log(` Total in table: ${finalCount}`); + console.log(` Types: diagram, flowchart, chart, table, figure`); +} + +main().catch(err => { + console.error('\n❌ Seeding failed:', err.message); + process.exit(1); +}); + diff --git a/scripts/test-get-visuals.ts b/scripts/test-get-visuals.ts new file mode 100644 index 00000000..530fc204 --- /dev/null +++ b/scripts/test-get-visuals.ts @@ -0,0 +1,63 @@ +/** + * Test get_visuals functionality with test database + */ + +import * as lancedb from '@lancedb/lancedb'; +import * as path from 'path'; +import { LanceDBVisualRepository } from '../src/infrastructure/lancedb/repositories/lancedb-visual-repository.js'; + +const TEST_DB_PATH = path.join(process.cwd(), 'db/test'); + +async function main() { + console.log('🧪 Testing get_visuals functionality\n'); + + const db = await lancedb.connect(TEST_DB_PATH); + const visualsTable = await db.openTable('visuals'); + const repo = new LanceDBVisualRepository(visualsTable); + + // Test 1: Find by concept name + console.log('=== Test 1: Find by concept name (blockchain) ==='); + const blockchainVisuals = await repo.findByConceptName('blockchain', 10); + console.log(`Found ${blockchainVisuals.length} visuals`); + blockchainVisuals.forEach(v => { + console.log(` - [${v.visualType}] ${v.description.substring(0, 60)}...`); + console.log(` Concepts: ${v.conceptNames?.join(', ')}`); + }); + + // Test 2: Find by visual type + console.log('\n=== Test 2: Find by visual type (diagram) ==='); + const diagrams = await repo.findByType('diagram', 10); + console.log(`Found ${diagrams.length} diagrams`); + diagrams.forEach(v => { + console.log(` - Page ${v.pageNumber}: 
${v.description.substring(0, 50)}...`); + }); + + // Test 3: Find by concept (architecture) + console.log('\n=== Test 3: Find by concept (architecture) ==='); + const archVisuals = await repo.findByConceptName('architecture', 10); + console.log(`Found ${archVisuals.length} visuals`); + archVisuals.forEach(v => { + console.log(` - [${v.visualType}] ${v.description.substring(0, 50)}...`); + }); + + // Test 4: Find by catalog ID + console.log('\n=== Test 4: Find by catalog ID (3155035939) ==='); + const catalogVisuals = await repo.findByCatalogId(3155035939, 10); + console.log(`Found ${catalogVisuals.length} visuals for catalog`); + catalogVisuals.forEach(v => { + console.log(` - [${v.visualType}] Page ${v.pageNumber}`); + }); + + // Test 5: Total count + console.log('\n=== Test 5: Total count ==='); + const count = await repo.count(); + console.log(`Total visuals: ${count}`); + + console.log('\n✅ All tests passed!'); +} + +main().catch(err => { + console.error('❌ Test failed:', err); + process.exit(1); +}); + diff --git a/scripts/test-slugify.ts b/scripts/test-slugify.ts new file mode 100644 index 00000000..13453804 --- /dev/null +++ b/scripts/test-slugify.ts @@ -0,0 +1,89 @@ +/** + * Test script for slugify utilities + */ + +import { + slugifyDocument, + extractAuthorSurname, + extractShortTitle, + extractYear, + formatVisualFilename +} from '../src/infrastructure/utils/slugify.js'; + +// Test cases +const tests = [ + { + input: { title: 'Clean Architecture', author: 'Robert C. 
Martin', year: 2017 }, + expected: 'martin_clean-architecture_2017' + }, + { + // Subtitles after : are removed by design + input: { title: 'Design Patterns: Elements of Reusable Object-Oriented Software', author: 'Gamma, Erich et al.', year: 1994 }, + expected: 'gamma_design-patterns_1994' + }, + { + input: { title: 'The Art of War', author: 'Sun Tzu' }, + expected: 'tzu_art-of-war_undated' + }, + { + // Subtitles after : are removed by design + input: { title: 'Bitcoin: A Peer-to-Peer Electronic Cash System', author: 'Satoshi Nakamoto', year: '2008' }, + expected: 'nakamoto_bitcoin_2008' + }, + { + input: { title: 'Cosmos Blockchain Overview', year: 2023 }, + expected: 'unknown_cosmos-blockchain-overview_2023' + }, + { + // Test with first name last name format + input: { title: 'Domain-Driven Design', author: 'Eric Evans', year: 2003 }, + expected: 'evans_domain-driven-design_2003' + } +]; + +console.log('Testing slugifyDocument:\n'); +let passed = 0; +let failed = 0; + +for (const test of tests) { + const result = slugifyDocument(test.input); + const pass = result === test.expected; + if (pass) { + console.log(` ✅ ${test.input.title}`); + console.log(` → ${result}`); + passed++; + } else { + console.log(` ❌ ${test.input.title}`); + console.log(` Expected: ${test.expected}`); + console.log(` Got: ${result}`); + failed++; + } +} + +console.log('\nTesting formatVisualFilename:\n'); +const fnTests = [ + { page: 1, index: 0, expected: 'p001_v0.png' }, + { page: 42, index: 2, expected: 'p042_v2.png' }, + { page: 100, index: 0, expected: 'p100_v0.png' }, +]; + +for (const test of fnTests) { + const result = formatVisualFilename(test.page, test.index); + const pass = result === test.expected; + if (pass) { + console.log(` ✅ Page ${test.page}, index ${test.index} → ${result}`); + passed++; + } else { + console.log(` ❌ Page ${test.page}, index ${test.index}`); + console.log(` Expected: ${test.expected}`); + console.log(` Got: ${result}`); + failed++; + } +} + 
+console.log(`\n${passed} passed, ${failed} failed`); + +if (failed > 0) { + process.exit(1); +} + diff --git a/scripts/update-image-metadata.ts b/scripts/update-image-metadata.ts new file mode 100644 index 00000000..33c8c1d7 --- /dev/null +++ b/scripts/update-image-metadata.ts @@ -0,0 +1,211 @@ +/** + * Update Image Metadata Script + * + * Adds embedded metadata (EXIF) to existing extracted images. + * This script reads metadata from the visuals table and embeds it + * into the corresponding PNG files. + * + * Metadata embedded: + * - Title (document title) + * - Author + * - Year + * - Page number + * - Image index + * - Catalog ID + * + * Usage: + * npx tsx scripts/update-image-metadata.ts [options] + * + * Options: + * --dbpath Database path (default: ~/.concept_rag) + * --catalog-id Update images for specific catalog ID only + * --dry-run Show what would be updated without making changes + * --limit Limit number of images to process + */ + +import * as lancedb from '@lancedb/lancedb'; +import * as path from 'path'; +import * as os from 'os'; +import * as fs from 'fs'; +import minimist from 'minimist'; +import { embedMetadataInPng, type ImageEmbeddedMetadata } from '../src/infrastructure/visual-extraction/image-processor.js'; + +// Parse command line arguments +const args = minimist(process.argv.slice(2)); +const dbPath = args.dbpath || path.join(os.homedir(), '.concept_rag'); +const catalogIdFilter = args['catalog-id'] ? parseInt(args['catalog-id'], 10) : undefined; +const dryRun = args['dry-run'] || false; +const limit = args.limit ? 
parseInt(args.limit, 10) : undefined; + +interface VisualRecord { + id: number; + catalog_id: number; + catalog_title: string; + image_path: string; + page_number: number; +} + +interface CatalogRecord { + id: number; + title: string; + author?: string; + year?: number; + source?: string; +} + +async function main() { + console.log('🖼️ Update Image Metadata'); + console.log('=========================\n'); + + // Verify database exists + if (!fs.existsSync(dbPath)) { + console.error(`❌ Database not found at: ${dbPath}`); + process.exit(1); + } + + // Connect to database + console.log(`📦 Connecting to database: ${dbPath}`); + const db = await lancedb.connect(dbPath); + + // Verify tables exist + const tables = await db.tableNames(); + if (!tables.includes('visuals')) { + console.error('❌ Visuals table not found'); + process.exit(1); + } + if (!tables.includes('catalog')) { + console.error('❌ Catalog table not found'); + process.exit(1); + } + + const visualsTable = await db.openTable('visuals'); + const catalogTable = await db.openTable('catalog'); + + // Get visuals to update + let visuals: VisualRecord[]; + if (catalogIdFilter) { + visuals = await visualsTable.query() + .where(`catalog_id = ${catalogIdFilter}`) + .select(['id', 'catalog_id', 'catalog_title', 'image_path', 'page_number']) + .limit(limit || 100000) + .toArray() as VisualRecord[]; + } else { + visuals = await visualsTable.query() + .select(['id', 'catalog_id', 'catalog_title', 'image_path', 'page_number']) + .limit(limit || 100000) + .toArray() as VisualRecord[]; + } + + console.log(`📚 Found ${visuals.length} images to update\n`); + + if (visuals.length === 0) { + console.log(' No images found matching criteria.'); + process.exit(0); + } + + if (dryRun) { + console.log('🔍 Dry run mode - showing what would be updated:\n'); + } + + // Build catalog lookup for author/year info + const catalogIds = [...new Set(visuals.map(v => v.catalog_id))]; + const catalogLookup = new Map(); + + for (const catId of 
catalogIds) { + const entries = await catalogTable.query() + .where(`id = ${catId}`) + .select(['id', 'title', 'author', 'year', 'source']) + .limit(1) + .toArray() as CatalogRecord[]; + + if (entries.length > 0) { + catalogLookup.set(catId, entries[0]); + } + } + + let updated = 0; + let skipped = 0; + let errors = 0; + + for (let i = 0; i < visuals.length; i++) { + const visual = visuals[i]; + const catalog = catalogLookup.get(visual.catalog_id); + + // Build full image path + const imagePath = path.join(dbPath, visual.image_path); + + // Parse image index from filename (e.g., p42_v0.png -> 0) + const filename = path.basename(visual.image_path); + const indexMatch = filename.match(/v(\d+)\.png$/); + const imageIndex = indexMatch ? parseInt(indexMatch[1], 10) : 0; + + // Progress indicator + const progress = `[${i + 1}/${visuals.length}]`; + + if (!fs.existsSync(imagePath)) { + console.log(`${progress} ⚠️ Skipping (file not found): ${visual.image_path}`); + skipped++; + continue; + } + + // Build metadata + const metadata: ImageEmbeddedMetadata = { + title: catalog?.title || visual.catalog_title, + author: catalog?.author, + year: catalog?.year, + pageNumber: visual.page_number, + imageIndex, + catalogId: visual.catalog_id, + source: catalog?.source + }; + + if (dryRun) { + console.log(`${progress} Would update: ${visual.image_path}`); + console.log(` Title: ${metadata.title}`); + console.log(` Author: ${metadata.author || 'N/A'}`); + console.log(` Year: ${metadata.year || 'N/A'}`); + console.log(` Page: ${metadata.pageNumber}, Index: ${metadata.imageIndex}`); + updated++; + } else { + try { + await embedMetadataInPng(imagePath, metadata); + updated++; + + // Show progress every 10 images or for first/last + if (i === 0 || i === visuals.length - 1 || (i + 1) % 10 === 0) { + console.log(`${progress} ✅ Updated: ${visual.image_path}`); + } + } catch (error: any) { + console.log(`${progress} ❌ Error: ${visual.image_path} - ${error.message}`); + errors++; + } + } + } + 
+ // Summary + console.log('\n========================='); + console.log('✅ Metadata update complete!\n'); + console.log('📊 Summary:'); + console.log(` Images processed: ${visuals.length}`); + console.log(` Successfully updated: ${updated}`); + if (skipped > 0) { + console.log(` Skipped (not found): ${skipped}`); + } + if (errors > 0) { + console.log(` Errors: ${errors}`); + } + + if (dryRun) { + console.log('\n Run without --dry-run to apply changes.'); + } +} + +main().catch(err => { + console.error('\n❌ Script failed:', err.message); + if (err.stack) { + console.error('\nStack trace:'); + console.error(err.stack); + } + process.exit(1); +}); + diff --git a/src/__tests__/e2e/visual-search.e2e.test.ts b/src/__tests__/e2e/visual-search.e2e.test.ts new file mode 100644 index 00000000..8a2f288f --- /dev/null +++ b/src/__tests__/e2e/visual-search.e2e.test.ts @@ -0,0 +1,434 @@ +/** + * E2E Test: Visual Search Integration + * + * Tests the visual/image search functionality against the test database: + * 1. GetVisualsTool retrieves visuals by various filters + * 2. ConceptSearchTool returns image_ids for associated visuals + * 3. Workflow: concept_search → get_visuals via image_ids + * 4. 
Workflow: catalog_search → get_visuals via catalog_id + * + * Requires: db/test with visuals.lance table and images/ directory + */ + +import { describe, it, expect, beforeAll, afterAll } from 'vitest'; +import { ApplicationContainer } from '../../application/container.js'; +import * as path from 'path'; + +// Test database path +const TEST_DB_PATH = path.resolve(process.cwd(), 'db/test'); + +describe('E2E: Visual Search Integration', () => { + let container: ApplicationContainer; + let getVisualsTool: any; + let conceptSearchTool: any; + let catalogSearchTool: any; + + beforeAll(async () => { + container = new ApplicationContainer(); + await container.initialize(TEST_DB_PATH); + + getVisualsTool = container.getTool('get_visuals'); + conceptSearchTool = container.getTool('concept_search'); + catalogSearchTool = container.getTool('catalog_search'); + }, 30000); + + afterAll(async () => { + if (container) { + await container.close(); + } + }); + + describe('GetVisualsTool Basic Operations', () => { + it('should retrieve visuals with default limit', async () => { + const result = await getVisualsTool.execute({}); + + expect(result.isError).toBe(false); + const response = JSON.parse(result.content[0].text); + + expect(response.visuals).toBeDefined(); + expect(Array.isArray(response.visuals)).toBe(true); + expect(response.total_returned).toBeGreaterThanOrEqual(0); + }); + + it('should retrieve visuals by visual_type', async () => { + const result = await getVisualsTool.execute({ visual_type: 'diagram' }); + + expect(result.isError).toBe(false); + const response = JSON.parse(result.content[0].text); + + expect(response.visuals).toBeDefined(); + expect(response.filters_applied.visual_type).toBe('diagram'); + + // All returned visuals should be diagrams + response.visuals.forEach((v: any) => { + expect(v.visual_type).toBe('diagram'); + }); + }); + + it('should respect limit parameter', async () => { + const result = await getVisualsTool.execute({ limit: 3 }); + + 
expect(result.isError).toBe(false); + const response = JSON.parse(result.content[0].text); + + expect(response.visuals.length).toBeLessThanOrEqual(3); + }); + + it('should return visual with expected schema', async () => { + const result = await getVisualsTool.execute({ limit: 1 }); + + expect(result.isError).toBe(false); + const response = JSON.parse(result.content[0].text); + + if (response.visuals.length > 0) { + const visual = response.visuals[0]; + + // Verify schema fields + expect(visual).toHaveProperty('id'); + expect(visual).toHaveProperty('catalog_id'); + expect(visual).toHaveProperty('catalog_title'); + expect(visual).toHaveProperty('visual_type'); + expect(visual).toHaveProperty('page_number'); + expect(visual).toHaveProperty('description'); + expect(visual).toHaveProperty('image_path'); + expect(visual).toHaveProperty('concepts'); + + // Verify types + expect(typeof visual.id).toBe('number'); + expect(typeof visual.catalog_id).toBe('number'); + expect(typeof visual.image_path).toBe('string'); + expect(Array.isArray(visual.concepts)).toBe(true); + + // Should NOT have chunk_ids (removed from schema) + expect(visual).not.toHaveProperty('chunk_ids'); + } + }); + }); + + describe('GetVisualsTool by IDs', () => { + it('should retrieve visuals by specific IDs', async () => { + // First get some visuals to get their IDs + const initial = await getVisualsTool.execute({ limit: 5 }); + const initialResponse = JSON.parse(initial.content[0].text); + + if (initialResponse.visuals.length >= 2) { + const ids = initialResponse.visuals.slice(0, 2).map((v: any) => v.id); + + // Now fetch by IDs + const result = await getVisualsTool.execute({ ids }); + + expect(result.isError).toBe(false); + const response = JSON.parse(result.content[0].text); + + expect(response.visuals.length).toBe(2); + expect(response.filters_applied.ids).toEqual(ids); + + // Verify the returned IDs match + const returnedIds = response.visuals.map((v: any) => v.id); + 
expect(returnedIds).toContain(ids[0]); + expect(returnedIds).toContain(ids[1]); + } + }); + }); + + describe('GetVisualsTool by Catalog ID', () => { + it('should retrieve visuals by catalog_id', async () => { + // First get a visual to find a catalog_id + const initial = await getVisualsTool.execute({ limit: 1 }); + const initialResponse = JSON.parse(initial.content[0].text); + + if (initialResponse.visuals.length > 0) { + const catalogId = initialResponse.visuals[0].catalog_id; + + // Now fetch by catalog_id + const result = await getVisualsTool.execute({ catalog_id: catalogId }); + + expect(result.isError).toBe(false); + const response = JSON.parse(result.content[0].text); + + expect(response.filters_applied.catalog_id).toBe(catalogId); + + // All visuals should be from the same document + response.visuals.forEach((v: any) => { + expect(v.catalog_id).toBe(catalogId); + }); + } + }); + }); + + describe('ConceptSearchTool with image_ids', () => { + it('should return image_ids in concept search results', async () => { + // Search for a concept that likely has associated visuals + const result = await conceptSearchTool.execute({ concept: 'architecture' }); + + expect(result.isError).toBe(false); + const response = JSON.parse(result.content[0].text); + + // Verify image_ids is present in the response + expect(response).toHaveProperty('image_ids'); + expect(Array.isArray(response.image_ids)).toBe(true); + + // Verify stats includes images_found + expect(response.stats).toHaveProperty('images_found'); + expect(typeof response.stats.images_found).toBe('number'); + }); + + it('should return catalog_id in sources array', async () => { + const result = await conceptSearchTool.execute({ concept: 'architecture' }); + + expect(result.isError).toBe(false); + const response = JSON.parse(result.content[0].text); + + if (response.sources && response.sources.length > 0) { + const source = response.sources[0]; + expect(source).toHaveProperty('catalog_id'); + expect(typeof 
source.catalog_id).toBe('number'); + expect(source).toHaveProperty('title'); + } + }); + + it('should return catalog_id in chunks array', async () => { + const result = await conceptSearchTool.execute({ concept: 'architecture' }); + + expect(result.isError).toBe(false); + const response = JSON.parse(result.content[0].text); + + if (response.chunks && response.chunks.length > 0) { + const chunk = response.chunks[0]; + expect(chunk).toHaveProperty('catalog_id'); + expect(typeof chunk.catalog_id).toBe('number'); + expect(chunk).toHaveProperty('title'); + } + }); + }); + + describe('CatalogSearchTool with catalog_id', () => { + it('should return catalog_id in search results', async () => { + const result = await catalogSearchTool.execute({ text: 'clean architecture' }); + + expect(result.isError).toBe(false); + const response = JSON.parse(result.content[0].text); + + if (response.length > 0) { + const doc = response[0]; + expect(doc).toHaveProperty('catalog_id'); + expect(typeof doc.catalog_id).toBe('number'); + expect(doc).toHaveProperty('title'); + + // Should NOT have 'source' (replaced with title) + expect(doc).not.toHaveProperty('source'); + } + }); + }); + + describe('Workflow: concept_search → get_visuals', () => { + it('should enable visual retrieval via image_ids from concept search', async () => { + // Step 1: Search for a concept + const conceptResult = await conceptSearchTool.execute({ concept: 'diagram' }); + const conceptResponse = JSON.parse(conceptResult.content[0].text); + + if (conceptResponse.image_ids && conceptResponse.image_ids.length > 0) { + // Step 2: Retrieve visuals by IDs + const visualResult = await getVisualsTool.execute({ + ids: conceptResponse.image_ids.slice(0, 5) + }); + + expect(visualResult.isError).toBe(false); + const visualResponse = JSON.parse(visualResult.content[0].text); + + expect(visualResponse.visuals.length).toBeGreaterThan(0); + + // Verify we got the visuals we asked for + const requestedIds = 
conceptResponse.image_ids.slice(0, 5); + const returnedIds = visualResponse.visuals.map((v: any) => v.id); + + requestedIds.forEach((id: number) => { + expect(returnedIds).toContain(id); + }); + } + }); + }); + + describe('Workflow: catalog_search → get_visuals', () => { + it('should enable visual retrieval via catalog_id from catalog search', async () => { + // Step 1: Search catalog + const catalogResult = await catalogSearchTool.execute({ text: 'architecture' }); + const catalogResponse = JSON.parse(catalogResult.content[0].text); + + if (catalogResponse.length > 0) { + const catalogId = catalogResponse[0].catalog_id; + + // Step 2: Retrieve visuals by catalog_id + const visualResult = await getVisualsTool.execute({ catalog_id: catalogId }); + + expect(visualResult.isError).toBe(false); + const visualResponse = JSON.parse(visualResult.content[0].text); + + // All returned visuals should be from the same document + visualResponse.visuals.forEach((v: any) => { + expect(v.catalog_id).toBe(catalogId); + }); + } + }); + }); + + describe('Visual Schema Compliance', () => { + it('should not include deprecated fields', async () => { + const result = await getVisualsTool.execute({ limit: 5 }); + const response = JSON.parse(result.content[0].text); + + response.visuals.forEach((v: any) => { + // chunk_ids was removed from schema + expect(v).not.toHaveProperty('chunk_ids'); + }); + }); + + it('should include all required fields', async () => { + const result = await getVisualsTool.execute({ limit: 5 }); + const response = JSON.parse(result.content[0].text); + + const requiredFields = [ + 'id', 'catalog_id', 'catalog_title', 'visual_type', + 'page_number', 'description', 'image_path', 'concepts' + ]; + + response.visuals.forEach((v: any) => { + requiredFields.forEach(field => { + expect(v).toHaveProperty(field); + }); + }); + }); + }); + + describe('Semantic Relevance Validation', () => { + it('should return images with descriptions relevant to the searched concept', async 
() => { + // Search for "architecture" concept + const conceptResult = await conceptSearchTool.execute({ concept: 'architecture' }); + const conceptResponse = JSON.parse(conceptResult.content[0].text); + + if (conceptResponse.image_ids && conceptResponse.image_ids.length > 0) { + // Retrieve associated images + const visualResult = await getVisualsTool.execute({ + ids: conceptResponse.image_ids.slice(0, 10) + }); + const visualResponse = JSON.parse(visualResult.content[0].text); + + // Define terms that would indicate relevance to "architecture" + const relevantTerms = [ + 'architecture', 'layer', 'component', 'module', 'system', + 'design', 'pattern', 'structure', 'diagram', 'flow', + 'dependency', 'interface', 'service', 'class', 'model', + 'clean', 'hexagonal', 'onion', 'domain', 'application' + ]; + + // Check that at least some images have relevant descriptions + const imagesWithRelevantDescriptions = visualResponse.visuals.filter((v: any) => { + const description = (v.description || '').toLowerCase(); + const concepts = (v.concepts || []).map((c: string) => c.toLowerCase()); + const allText = description + ' ' + concepts.join(' '); + + return relevantTerms.some(term => allText.includes(term)); + }); + + // At least 50% of returned images should have relevant descriptions + const relevanceRatio = imagesWithRelevantDescriptions.length / visualResponse.visuals.length; + expect(relevanceRatio).toBeGreaterThanOrEqual(0.5); + + console.error(` 📊 Relevance: ${imagesWithRelevantDescriptions.length}/${visualResponse.visuals.length} images (${(relevanceRatio * 100).toFixed(0)}%) have architecture-related content`); + } + }); + + it('should return images with concepts matching the search term', async () => { + // Search for "dependency" concept + const conceptResult = await conceptSearchTool.execute({ concept: 'dependency' }); + const conceptResponse = JSON.parse(conceptResult.content[0].text); + + if (conceptResponse.image_ids && conceptResponse.image_ids.length > 0) { 
+ const visualResult = await getVisualsTool.execute({ + ids: conceptResponse.image_ids.slice(0, 10) + }); + const visualResponse = JSON.parse(visualResult.content[0].text); + + // Check that images have the searched concept or related terms + const relatedTerms = ['dependency', 'injection', 'inversion', 'coupling', 'interface']; + + const imagesWithMatchingConcepts = visualResponse.visuals.filter((v: any) => { + const concepts = (v.concepts || []).map((c: string) => c.toLowerCase()); + const description = (v.description || '').toLowerCase(); + + return relatedTerms.some(term => + concepts.some((c: string) => c.includes(term)) || + description.includes(term) + ); + }); + + // Log the match results + console.error(` 📊 Concept match: ${imagesWithMatchingConcepts.length}/${visualResponse.visuals.length} images match "dependency" or related terms`); + + // At least one image should match + if (visualResponse.visuals.length > 0) { + expect(imagesWithMatchingConcepts.length).toBeGreaterThan(0); + } + } + }); + + it('should return images that have the searched concept in their concept list', async () => { + // Search for a concept and verify images have that concept associated + const conceptResult = await conceptSearchTool.execute({ concept: 'software' }); + const conceptResponse = JSON.parse(conceptResult.content[0].text); + + if (conceptResponse.image_ids && conceptResponse.image_ids.length > 0) { + // Retrieve associated images + const visualResult = await getVisualsTool.execute({ + ids: conceptResponse.image_ids.slice(0, 10) + }); + const visualResponse = JSON.parse(visualResult.content[0].text); + + // Verify images have the searched concept or related terms in their concepts/description + const relatedTerms = ['software', 'application', 'system', 'program', 'code']; + + const imagesWithMatchingConcept = visualResponse.visuals.filter((v: any) => { + const concepts = (v.concepts || []).map((c: string) => c.toLowerCase()); + const description = (v.description || 
'').toLowerCase(); + + return relatedTerms.some(term => + concepts.some((c: string) => c.includes(term)) || + description.includes(term) + ); + }); + + console.error(` 📊 Concept association: ${imagesWithMatchingConcept.length}/${visualResponse.visuals.length} images have "software" or related concepts`); + + // Images associated with the concept should have relevant content + if (visualResponse.visuals.length > 0) { + const matchRatio = imagesWithMatchingConcept.length / visualResponse.visuals.length; + expect(matchRatio).toBeGreaterThanOrEqual(0.5); // At least half should match + } + } + }); + + it('should return diagram-type visuals with meaningful descriptions', async () => { + // Get diagrams specifically + const result = await getVisualsTool.execute({ visual_type: 'diagram', limit: 10 }); + const response = JSON.parse(result.content[0].text); + + if (response.visuals.length > 0) { + // Diagrams should have substantive descriptions (not just "No description") + const diagramsWithMeaningfulDescriptions = response.visuals.filter((v: any) => { + const desc = v.description || ''; + return desc.length > 20 && + desc !== 'No description available' && + !desc.startsWith('Error'); + }); + + const meaningfulRatio = diagramsWithMeaningfulDescriptions.length / response.visuals.length; + + console.error(` 📊 Description quality: ${diagramsWithMeaningfulDescriptions.length}/${response.visuals.length} diagrams (${(meaningfulRatio * 100).toFixed(0)}%) have meaningful descriptions`); + + // At least 70% should have meaningful descriptions + expect(meaningfulRatio).toBeGreaterThanOrEqual(0.7); + } + }); + }); +}); + diff --git a/src/application/config/configuration.ts b/src/application/config/configuration.ts index b820c4e5..7d0af903 100644 --- a/src/application/config/configuration.ts +++ b/src/application/config/configuration.ts @@ -140,8 +140,9 @@ export class Configuration implements IConfiguration { return { baseUrl: this.env.get('OPENROUTER_BASE_URL', 
'https://openrouter.ai/api/v1'), apiKey: this.env.get('OPENROUTER_API_KEY'), - summaryModel: this.env.get('OPENROUTER_SUMMARY_MODEL', 'x-ai/grok-4-fast'), - conceptModel: this.env.get('OPENROUTER_CONCEPT_MODEL', 'anthropic/claude-sonnet-4.5'), + summaryModel: this.env.get('OPENROUTER_SUMMARY_MODEL', 'x-ai/grok-4.1-fast'), + conceptModel: this.env.get('OPENROUTER_CONCEPT_MODEL', 'google/gemini-3-flash-preview'), + visionModel: this.env.get('OPENROUTER_VISION_MODEL', 'qwen/qwen2.5-vl-72b-instruct'), ...this.overrides?.llm }; } diff --git a/src/application/config/types.ts b/src/application/config/types.ts index b73da6dc..86819b1c 100644 --- a/src/application/config/types.ts +++ b/src/application/config/types.ts @@ -36,6 +36,9 @@ export interface LLMConfig { /** Model for concept extraction (comprehensive) */ conceptModel: string; + + /** Model for visual classification and description (vision-capable) */ + visionModel: string; } /** diff --git a/src/application/container.ts b/src/application/container.ts index 794f174b..3a248b5b 100644 --- a/src/application/container.ts +++ b/src/application/container.ts @@ -22,6 +22,8 @@ import { CategorySearchTool } from '../tools/operations/category-search-tool.js' import { ListCategoriesTool } from '../tools/operations/list-categories-tool.js'; import { ListConceptsInCategoryTool } from '../tools/operations/list-concepts-in-category-tool.js'; import { GetGuidanceTool } from '../tools/operations/get-guidance-tool.js'; +import { GetVisualsTool } from '../tools/operations/get-visuals-tool.js'; +import { LanceDBVisualRepository } from '../infrastructure/lancedb/repositories/lancedb-visual-repository.js'; import { BaseTool } from '../tools/base/tool.js'; import { EmbeddingCache, SearchResultCache } from '../infrastructure/cache/index.js'; import { LanceDBCategoryRepository } from '../infrastructure/lancedb/repositories/lancedb-category-repository.js'; @@ -137,6 +139,15 @@ export class ApplicationContainer { console.error('⚠️ Categories 
table not found (skipping category features)'); } + // 3b. Open visuals table if it exists (optional for diagram awareness) + let visualsTable = null; + try { + visualsTable = await this.dbConnection.openTable('visuals'); + console.error('✅ Visuals table found'); + } catch (err) { + console.error('⚠️ Visuals table not found (skipping visual features)'); + } + // 3b. Create performance caches (for embeddings and search results only) this.embeddingCache = new EmbeddingCache(10000); // Cache up to 10k embeddings this.searchResultCache = new SearchResultCache(1000, 5 * 60 * 1000); // 1k searches, 5min TTL @@ -177,8 +188,14 @@ export class ApplicationContainer { ); console.error('✅ ConceptSearchService initialized (hybrid search enabled)'); + // 7b. Create visual repository if visuals table exists (needed for concept_search too) + let visualRepo: LanceDBVisualRepository | undefined; + if (visualsTable) { + visualRepo = new LanceDBVisualRepository(visualsTable); + } + // 7. Create tools (with domain services) - this.tools.set('concept_search', new ConceptSearchTool(conceptSearchService)); + this.tools.set('concept_search', new ConceptSearchTool(conceptSearchService, visualRepo)); this.tools.set('catalog_search', new ConceptualCatalogSearchTool(catalogSearchService)); this.tools.set('chunks_search', new ConceptualChunksSearchTool(chunkSearchService, catalogRepo)); this.tools.set('broad_chunks_search', new ConceptualBroadChunksSearchTool(chunkSearchService)); @@ -195,6 +212,12 @@ export class ApplicationContainer { console.error(`✅ Category tools registered (3 tools)`); } + // 7c. 
Register visual tools if visuals table exists + if (visualRepo) { + this.tools.set('get_visuals', new GetVisualsTool(visualRepo, catalogRepo)); + console.error(`✅ Visual tools registered (1 tool)`); + } + console.error(`✅ Container initialized with ${this.tools.size} tool(s)`); } diff --git a/src/domain/interfaces/repositories/index.ts b/src/domain/interfaces/repositories/index.ts index 6ebfcae9..f09423f6 100644 --- a/src/domain/interfaces/repositories/index.ts +++ b/src/domain/interfaces/repositories/index.ts @@ -1,3 +1,4 @@ export * from './chunk-repository.js'; export * from './concept-repository.js'; export * from './catalog-repository.js'; +export * from './visual-repository.js'; diff --git a/src/domain/interfaces/repositories/visual-repository.ts b/src/domain/interfaces/repositories/visual-repository.ts new file mode 100644 index 00000000..6d93f253 --- /dev/null +++ b/src/domain/interfaces/repositories/visual-repository.ts @@ -0,0 +1,261 @@ +import type { Visual } from '../../models/visual.js'; +import type { Option } from '../../functional/option.js'; + +/** + * Repository interface for accessing visual data from the vector database. 
+ * + * Visuals are diagrams, charts, tables, and figures extracted from documents, + * enriched with: + * - LLM-generated semantic descriptions + * - Vector embeddings for semantic search + * - Extracted concepts for conceptual navigation + * - Links to nearby text chunks for context + * + * **Design Pattern**: Repository Pattern + * - Abstracts data access behind domain interface + * - Enables testability via test doubles + * - Follows Dependency Inversion Principle + * + * @example + * ```typescript + * // Find visuals from a specific document + * const visuals = await visualRepo.findByCatalogId(catalogId, 20); + * console.log(`Found ${visuals.length} diagrams`); + * + * // Get specific visuals by ID + * const selected = await visualRepo.findByIds([123, 456, 789]); + * ``` + * + * @see {@link Visual} for the data model + */ +export interface VisualRepository { + /** + * Find a visual by its unique ID. + * + * @param id - The visual ID (hash-based integer) + * @returns Promise resolving to Option containing the visual if found + * + * @example + * ```typescript + * const visualOpt = await visualRepo.findById(3847293847); + * if (isSome(visualOpt)) { + * console.log(`Description: ${visualOpt.value.description}`); + * } + * ``` + */ + findById(id: number): Promise>; + + /** + * Find multiple visuals by their IDs. + * + * Efficient batch lookup for retrieving multiple visuals at once. + * Returns visuals in the same order as the input IDs. + * Missing IDs are skipped (no error thrown). + * + * @param ids - Array of visual IDs to retrieve + * @returns Promise resolving to array of found visuals + * + * @example + * ```typescript + * const visuals = await visualRepo.findByIds([123, 456, 789]); + * visuals.forEach(v => console.log(v.description)); + * ``` + */ + findByIds(ids: number[]): Promise; + + /** + * Find visuals from a specific catalog entry (document). 
+ * + * @param catalogId - The catalog entry ID (hash-based integer) + * @param limit - Maximum number of visuals to return + * @returns Promise resolving to visuals from the specified document + * + * @example + * ```typescript + * const visuals = await visualRepo.findByCatalogId(12345678, 50); + * console.log(`Document has ${visuals.length} diagrams`); + * ``` + */ + findByCatalogId(catalogId: number, limit: number): Promise; + + /** + * Find visuals by type across all documents. + * + * @param visualType - The type of visual to find + * @param limit - Maximum number of visuals to return + * @returns Promise resolving to visuals of the specified type + * + * @example + * ```typescript + * const charts = await visualRepo.findByType('chart', 20); + * console.log(`Found ${charts.length} charts`); + * ``` + */ + findByType(visualType: string, limit: number): Promise; + + /** + * Find visuals on a specific page of a document. + * + * @param catalogId - The catalog entry ID + * @param pageNumber - The page number (1-indexed) + * @returns Promise resolving to visuals on the specified page + * + * @example + * ```typescript + * const pageVisuals = await visualRepo.findByPage(12345678, 42); + * console.log(`Page 42 has ${pageVisuals.length} diagrams`); + * ``` + */ + findByPage(catalogId: number, pageNumber: number): Promise; + + /** + * Find visuals associated with a specific concept. + * + * Retrieves visuals that have the specified concept in their concept_ids. + * Useful for visual exploration of concepts. 
+ * + * @param conceptId - The concept ID to search for + * @param limit - Maximum number of visuals to return + * @returns Promise resolving to visuals containing the concept + * + * @example + * ```typescript + * const visuals = await visualRepo.findByConceptId(conceptId, 10); + * console.log(`Concept appears in ${visuals.length} diagrams`); + * ``` + */ + findByConceptId(conceptId: number, limit: number): Promise; + + /** + * Find visuals associated with a concept by name. + * + * Searches the concept_names derived field for matching concepts. + * Uses case-insensitive partial matching. + * + * @param conceptName - The concept name to search for + * @param limit - Maximum number of visuals to return + * @returns Promise resolving to visuals containing the concept + * + * @example + * ```typescript + * const visuals = await visualRepo.findByConceptName('dependency injection', 10); + * ``` + */ + findByConceptName(conceptName: string, limit: number): Promise; + + /** + * Find visuals near specific text chunks. + * + * Retrieves visuals that have any of the specified chunk IDs in their chunk_ids. + * Useful for enriching search results with relevant diagrams. + * + * @param chunkIds - Array of chunk IDs to find associated visuals + * @param limit - Maximum number of visuals to return + * @returns Promise resolving to visuals associated with the chunks + * + * @example + * ```typescript + * // Enrich chunk search results with relevant visuals + * const visualIds = await visualRepo.findByChunkIds( + * chunks.map(c => c.id), + * 10 + * ); + * ``` + */ + findByChunkIds(chunkIds: number[], limit: number): Promise; + + /** + * Search visuals by semantic similarity to a query. + * + * Uses vector search on the description embeddings to find + * visuals semantically similar to the query. 
+ * + * @param queryVector - The query embedding vector (384-dim) + * @param limit - Maximum number of visuals to return + * @returns Promise resolving to visuals ranked by similarity + * + * @example + * ```typescript + * const queryVector = embeddingService.embed('architecture diagram'); + * const visuals = await visualRepo.searchByVector(queryVector, 10); + * ``` + */ + searchByVector(queryVector: number[], limit: number): Promise; + + /** + * Count the total number of visuals in the repository. + * + * @returns Promise resolving to total visual count + * + * @example + * ```typescript + * const total = await visualRepo.count(); + * console.log(`Database contains ${total} diagrams`); + * ``` + */ + count(): Promise; + + /** + * Add a new visual to the repository. + * + * @param visual - The visual to add + * @returns Promise resolving when the visual is added + * + * @example + * ```typescript + * await visualRepo.add({ + * id: hashToId(...), + * catalogId: 12345678, + * catalogTitle: 'Clean Architecture', + * imagePath: 'images/12345678/p42_v1.png', + * description: 'Architecture diagram...', + * visualType: 'diagram', + * pageNumber: 42 + * }); + * ``` + */ + add(visual: Visual): Promise; + + /** + * Add multiple visuals to the repository in batch. + * + * More efficient than calling add() multiple times. + * + * @param visuals - Array of visuals to add + * @returns Promise resolving when all visuals are added + */ + addBatch(visuals: Visual[]): Promise; + + /** + * Update an existing visual in the repository. + * + * Typically used to add description, vector, and concepts + * after initial extraction. + * + * @param visual - The visual with updated fields + * @returns Promise resolving when the visual is updated + */ + update(visual: Visual): Promise; + + /** + * Delete a visual by ID. + * + * Note: This does NOT delete the image file - that must be done separately. 
+ * + * @param id - The visual ID to delete + * @returns Promise resolving when the visual is deleted + */ + delete(id: number): Promise; + + /** + * Delete all visuals for a specific catalog entry. + * + * Useful when re-extracting visuals for a document. + * Note: This does NOT delete image files - that must be done separately. + * + * @param catalogId - The catalog entry ID + * @returns Promise resolving to the number of visuals deleted + */ + deleteByCatalogId(catalogId: number): Promise; +} + diff --git a/src/domain/models/index.ts b/src/domain/models/index.ts index c04e2e88..8d73ed36 100644 --- a/src/domain/models/index.ts +++ b/src/domain/models/index.ts @@ -1,4 +1,5 @@ export * from './chunk.js'; export * from './concept.js'; export * from './search-result.js'; +export * from './visual.js'; export * from '../exceptions.js'; diff --git a/src/domain/models/visual.ts b/src/domain/models/visual.ts new file mode 100644 index 00000000..fe9db44e --- /dev/null +++ b/src/domain/models/visual.ts @@ -0,0 +1,143 @@ +/** + * Domain model representing a visual (diagram, chart, table, figure) extracted from a document. + * + * A visual is an image extracted from a document that has semantic meaning: + * - Flowcharts, UML diagrams, architecture diagrams + * - Charts and graphs (bar, line, pie, etc.) + * - Tables with structured data + * - Technical figures with labels + * + * Photos, screenshots, and decorative images are NOT stored as visuals. 
+ * + * Each visual is enriched with: + * - LLM-generated semantic description + * - Vector embeddings for semantic search + * - Extracted concepts for conceptual navigation + * - Links to nearby text chunks for context + * + * @example + * ```typescript + * const visual: Visual = { + * id: 3847293847, + * catalogId: 12345678, + * catalogTitle: 'Clean Architecture', + * imagePath: 'images/12345678/p42_v1.png', + * description: 'Architecture diagram showing dependency inversion...', + * visualType: 'diagram', + * pageNumber: 42, + * conceptIds: [11111111, 22222222], + * conceptNames: ['dependency inversion', 'clean architecture'], + * chunkIds: [33333333, 44444444] + * }; + * ``` + */ +export interface Visual { + /** Unique identifier for the visual (hash-based integer from catalog_id + page + index) */ + id: number; + + /** Parent document ID (hash-based integer, matches catalog.id) */ + catalogId: number; + + /** + * Document title from catalog - DERIVED field for display. + * Populated from catalog.title during extraction. + */ + catalogTitle: string; + + /** + * Path to the extracted image file, relative to database directory. + * Format: `images/{catalog_id}/p{page}_v{index}.png` + * Images are stored as grayscale PNG for storage efficiency. + */ + imagePath: string; + + /** + * LLM-generated semantic description of the visual. + * Captures the meaning, components, and relationships depicted. + * Used for generating embeddings and extracting concepts. + */ + description: string; + + /** 384-dimensional vector embedding of the description for semantic search */ + vector?: number[]; + + /** + * Classification of the visual type. 
+ * - diagram: flowcharts, UML, architecture, state machines + * - flowchart: process flows, decision trees + * - chart: bar, line, pie, scatter, histogram + * - table: structured tabular data + * - figure: technical illustrations with labels + */ + visualType: VisualType; + + /** Page number within source document (1-indexed) */ + pageNumber: number; + + /** + * Bounding box of the visual on the page. + * JSON string format: `{"x": 0, "y": 0, "width": 100, "height": 100}` + * Coordinates are in pixels relative to the page. + */ + boundingBox?: string; + + /** Hash-based concept IDs extracted from the description */ + conceptIds?: number[]; + + /** + * Denormalized concept names - DERIVED field for display. + * Regenerated from concept_ids → concepts.name lookup. + */ + conceptNames?: string[]; + + /** + * IDs of text chunks near this visual on the same page. + * Provides context for understanding the visual. + */ + chunkIds?: number[]; +} + +/** + * Visual type classification. + * Only visuals with semantic meaning are stored. + */ +export type VisualType = + | 'diagram' // flowcharts, UML, architecture, state machines + | 'flowchart' // process flows, decision trees + | 'chart' // bar, line, pie, scatter, histogram + | 'table' // structured tabular data + | 'figure'; // technical illustrations with labels + +/** + * Bounding box for a visual on a page. + */ +export interface BoundingBox { + /** X coordinate (left edge) in pixels */ + x: number; + /** Y coordinate (top edge) in pixels */ + y: number; + /** Width in pixels */ + width: number; + /** Height in pixels */ + height: number; +} + +/** + * Parse a bounding box from JSON string. + */ +export function parseBoundingBox(json: string | undefined): BoundingBox | undefined { + if (!json) return undefined; + try { + return JSON.parse(json) as BoundingBox; + } catch { + return undefined; + } +} + +/** + * Serialize a bounding box to JSON string. 
+ */ +export function serializeBoundingBox(box: BoundingBox): string { + return JSON.stringify(box); +} + diff --git a/src/domain/services/concept-search-service.ts b/src/domain/services/concept-search-service.ts index 769eac58..861b39a6 100644 --- a/src/domain/services/concept-search-service.ts +++ b/src/domain/services/concept-search-service.ts @@ -136,8 +136,8 @@ export interface ConceptSearchParams { /** Maximum sources (default: 5) */ maxSources?: number; - /** Optional source filter */ - sourceFilter?: string; + /** Optional: Filter results to documents containing this text in their title */ + titleFilter?: string; } /** diff --git a/src/infrastructure/lancedb/repositories/lancedb-visual-repository.ts b/src/infrastructure/lancedb/repositories/lancedb-visual-repository.ts new file mode 100644 index 00000000..2c88d366 --- /dev/null +++ b/src/infrastructure/lancedb/repositories/lancedb-visual-repository.ts @@ -0,0 +1,387 @@ +import * as lancedb from "@lancedb/lancedb"; +import type { VisualRepository } from '../../../domain/interfaces/repositories/visual-repository.js'; +import type { Visual, VisualType } from '../../../domain/models/visual.js'; +import type { Option } from '../../../domain/functional/option.js'; +import { Some, None } from '../../../domain/functional/option.js'; +import { DatabaseError } from '../../../domain/exceptions/index.js'; + +/** + * LanceDB implementation of VisualRepository + * + * Stores and retrieves visual content (diagrams, charts, tables, figures) + * extracted from documents. Uses vector search for semantic queries. 
+ * + * **Schema:** + * - id: number (hash-based) + * - catalog_id: number (FK to catalog) + * - catalog_title: string (derived) + * - image_path: string (relative path to grayscale PNG) + * - description: string (LLM-generated) + * - vector: Float32Array (384-dim embedding) + * - visual_type: string (diagram|flowchart|chart|table|figure) + * - page_number: number + * - bounding_box: string (JSON) + * - concept_ids: number[] + * - concept_names: string[] (derived) + * - chunk_ids: number[] + */ +export class LanceDBVisualRepository implements VisualRepository { + constructor(private visualsTable: lancedb.Table) {} + + async findById(id: number): Promise> { + try { + const results = await this.visualsTable + .query() + .where(`id = ${id}`) + .limit(1) + .toArray(); + + if (results.length === 0) { + return None(); + } + + return Some(this.mapRowToVisual(results[0])); + } catch (error) { + throw new DatabaseError( + `Failed to find visual by ID ${id}`, + 'query', + error as Error + ); + } + } + + async findByIds(ids: number[]): Promise { + if (ids.length === 0) { + return []; + } + + try { + // Build OR condition for multiple IDs + const idConditions = ids.map(id => `id = ${id}`).join(' OR '); + + const results = await this.visualsTable + .query() + .where(idConditions) + .limit(ids.length) + .toArray(); + + return results.map(row => this.mapRowToVisual(row)); + } catch (error) { + throw new DatabaseError( + `Failed to find visuals by IDs`, + 'query', + error as Error + ); + } + } + + async findByCatalogId(catalogId: number, limit: number): Promise { + try { + const results = await this.visualsTable + .query() + .where(`catalog_id = ${catalogId}`) + .limit(limit) + .toArray(); + + return results.map(row => this.mapRowToVisual(row)); + } catch (error) { + throw new DatabaseError( + `Failed to find visuals for catalog ID ${catalogId}`, + 'query', + error as Error + ); + } + } + + async findByType(visualType: string, limit: number): Promise { + try { + const results = 
await this.visualsTable + .query() + .where(`visual_type = '${visualType}'`) + .limit(limit) + .toArray(); + + return results.map(row => this.mapRowToVisual(row)); + } catch (error) { + throw new DatabaseError( + `Failed to find visuals of type ${visualType}`, + 'query', + error as Error + ); + } + } + + async findByPage(catalogId: number, pageNumber: number): Promise { + try { + const results = await this.visualsTable + .query() + .where(`catalog_id = ${catalogId} AND page_number = ${pageNumber}`) + .toArray(); + + return results.map(row => this.mapRowToVisual(row)); + } catch (error) { + throw new DatabaseError( + `Failed to find visuals on page ${pageNumber} of catalog ${catalogId}`, + 'query', + error as Error + ); + } + } + + async findByConceptId(conceptId: number, limit: number): Promise { + try { + // Query all visuals and filter in memory (LanceDB array_contains support varies) + const results = await this.visualsTable + .query() + .limit(10000) + .toArray(); + + const matches = results + .filter(row => { + const conceptIds = this.parseArrayField(row.concept_ids); + return conceptIds.includes(conceptId); + }) + .slice(0, limit); + + return matches.map(row => this.mapRowToVisual(row)); + } catch (error) { + throw new DatabaseError( + `Failed to find visuals for concept ID ${conceptId}`, + 'query', + error as Error + ); + } + } + + async findByConceptName(conceptName: string, limit: number): Promise { + try { + // Query all visuals and filter by concept name in memory + const results = await this.visualsTable + .query() + .limit(10000) + .toArray(); + + const searchName = conceptName.toLowerCase(); + + const matches = results + .filter(row => { + const conceptNames = this.parseArrayField(row.concept_names); + return conceptNames.some(name => + name.toLowerCase().includes(searchName) + ); + }) + .slice(0, limit); + + return matches.map(row => this.mapRowToVisual(row)); + } catch (error) { + throw new DatabaseError( + `Failed to find visuals for concept name 
"${conceptName}"`, + 'query', + error as Error + ); + } + } + + async findByChunkIds(chunkIds: number[], limit: number): Promise { + if (chunkIds.length === 0) { + return []; + } + + try { + // Query all visuals and filter in memory + const results = await this.visualsTable + .query() + .limit(10000) + .toArray(); + + const chunkIdSet = new Set(chunkIds); + + const matches = results + .filter(row => { + const visualChunkIds = this.parseArrayField(row.chunk_ids); + return visualChunkIds.some(id => chunkIdSet.has(id)); + }) + .slice(0, limit); + + return matches.map(row => this.mapRowToVisual(row)); + } catch (error) { + throw new DatabaseError( + `Failed to find visuals for chunk IDs`, + 'query', + error as Error + ); + } + } + + async searchByVector(queryVector: number[], limit: number): Promise { + try { + const results = await this.visualsTable + .vectorSearch(queryVector) + .limit(limit) + .toArray(); + + return results.map(row => this.mapRowToVisual(row)); + } catch (error) { + throw new DatabaseError( + `Failed to search visuals by vector`, + 'vector_search', + error as Error + ); + } + } + + async count(): Promise { + try { + return await this.visualsTable.countRows(); + } catch (error) { + throw new DatabaseError( + `Failed to count visuals`, + 'query', + error as Error + ); + } + } + + async add(visual: Visual): Promise { + try { + const row = this.mapVisualToRow(visual); + await this.visualsTable.add([row]); + } catch (error) { + throw new DatabaseError( + `Failed to add visual ${visual.id}`, + 'insert', + error as Error + ); + } + } + + async addBatch(visuals: Visual[]): Promise { + if (visuals.length === 0) { + return; + } + + try { + const rows = visuals.map(v => this.mapVisualToRow(v)); + await this.visualsTable.add(rows); + } catch (error) { + throw new DatabaseError( + `Failed to add ${visuals.length} visuals`, + 'insert', + error as Error + ); + } + } + + async update(visual: Visual): Promise { + try { + // LanceDB doesn't have native update - 
delete and re-add + await this.delete(visual.id); + await this.add(visual); + } catch (error) { + throw new DatabaseError( + `Failed to update visual ${visual.id}`, + 'update', + error as Error + ); + } + } + + async delete(id: number): Promise { + try { + await this.visualsTable.delete(`id = ${id}`); + } catch (error) { + throw new DatabaseError( + `Failed to delete visual ${id}`, + 'delete', + error as Error + ); + } + } + + async deleteByCatalogId(catalogId: number): Promise { + try { + // Count before delete + const count = await this.visualsTable + .query() + .where(`catalog_id = ${catalogId}`) + .toArray(); + + const deleteCount = count.length; + + if (deleteCount > 0) { + await this.visualsTable.delete(`catalog_id = ${catalogId}`); + } + + return deleteCount; + } catch (error) { + throw new DatabaseError( + `Failed to delete visuals for catalog ${catalogId}`, + 'delete', + error as Error + ); + } + } + + // Helper methods + + /** + * Parse array field from various formats (Arrow Vector, native array, JSON string) + */ + private parseArrayField(field: unknown): T[] { + if (!field) return []; + if (Array.isArray(field)) return field; + if (typeof field === 'object' && field !== null && 'toArray' in field) { + // Arrow Vector + return Array.from((field as { toArray(): T[] }).toArray()); + } + if (typeof field === 'string') { + try { + return JSON.parse(field); + } catch { + return []; + } + } + return []; + } + + /** + * Map a database row to a Visual domain model. + */ + private mapRowToVisual(row: any): Visual { + return { + id: typeof row.id === 'number' ? row.id : parseInt(row.id) || 0, + catalogId: row.catalog_id || 0, + catalogTitle: row.catalog_title || '', + imagePath: row.image_path || '', + description: row.description || '', + vector: row.vector ? 
Array.from(row.vector) : undefined, + visualType: (row.visual_type || 'diagram') as VisualType, + pageNumber: row.page_number || 0, + boundingBox: row.bounding_box, + conceptIds: this.parseArrayField(row.concept_ids), + conceptNames: this.parseArrayField(row.concept_names), + chunkIds: this.parseArrayField(row.chunk_ids) + }; + } + + /** + * Map a Visual domain model to a database row. + */ + private mapVisualToRow(visual: Visual): Record { + return { + id: visual.id, + catalog_id: visual.catalogId, + catalog_title: visual.catalogTitle, + image_path: visual.imagePath, + description: visual.description, + vector: visual.vector ? new Float32Array(visual.vector) : new Float32Array(384), + visual_type: visual.visualType, + page_number: visual.pageNumber, + bounding_box: visual.boundingBox || '', + concept_ids: visual.conceptIds || [], + concept_names: visual.conceptNames || [], + chunk_ids: visual.chunkIds || [] + }; + } +} + diff --git a/src/infrastructure/utils/slugify.ts b/src/infrastructure/utils/slugify.ts new file mode 100644 index 00000000..8610ff8b --- /dev/null +++ b/src/infrastructure/utils/slugify.ts @@ -0,0 +1,198 @@ +/** + * Slugify Utilities + * + * Functions for creating human-readable, filesystem-safe identifiers + * from document metadata. + */ + +export interface DocumentInfo { + title: string; + author?: string; + year?: number | string; + id?: number | string; // Fallback for uniqueness +} + +/** + * Creates a human-readable folder name from document metadata. 
+ * + * Format: {author-surname}_{short-title}_{year} + * + * Examples: + * - "martin_clean-architecture_2017" + * - "gamma_design-patterns-elements_1994" + * - "unknown_cosmos-blockchain_2023" + * + * @param doc Document metadata + * @returns Filesystem-safe folder name + */ +export function slugifyDocument(doc: DocumentInfo): string { + const author = extractAuthorSurname(doc.author); + const title = extractShortTitle(doc.title); + const year = extractYear(doc.year); + + return `${author}_${title}_${year}`; +} + +/** + * Extracts the first author's surname, normalized for filesystem use. + * + * @param author Full author string (e.g., "Robert C. Martin", "Gamma, Erich et al.") + * @returns Lowercase surname, max 15 chars + */ +export function extractAuthorSurname(author?: string): string { + if (!author || author.trim() === '') { + return 'unknown'; + } + + // Handle "Surname, FirstName" format + if (author.includes(',')) { + const surname = author.split(',')[0].trim(); + return normalizeForFilesystem(surname, 15); + } + + // Handle "FirstName Surname" format - take last word before any "et al." + const cleaned = author + .replace(/\s+et\s+al\.?/i, '') + .replace(/\s+and\s+.*/i, '') + .trim(); + + const parts = cleaned.split(/\s+/); + const surname = parts[parts.length - 1]; + + return normalizeForFilesystem(surname, 15); +} + +/** + * Extracts a short, readable title slug. 
+ * + * @param title Full document title + * @returns Kebab-case title, max 30 chars, 4 significant words + */ +export function extractShortTitle(title: string): string { + if (!title || title.trim() === '') { + return 'untitled'; + } + + const shortTitle = title + // Remove subtitles after : ; – — + .replace(/[:;–—].*/g, '') + // Remove edition markers + .replace(/\(\d+(?:st|nd|rd|th)?\s*(?:ed\.?|edition)\)/gi, '') + .replace(/,?\s*\d+(?:st|nd|rd|th)?\s*(?:ed\.?|edition)/gi, '') + // Remove leading articles + .replace(/^(the|a|an)\s+/i, '') + .trim(); + + // Convert to words, filter, and join + const words = shortTitle + .toLowerCase() + .replace(/[^a-z0-9\s]/g, ' ') + .split(/\s+/) + .filter(w => w.length > 0) + .slice(0, 4); // First 4 significant words + + const slug = words.join('-'); + + // Truncate to 30 chars at word boundary + if (slug.length <= 30) { + return slug || 'untitled'; + } + + const truncated = slug.slice(0, 30); + const lastDash = truncated.lastIndexOf('-'); + return lastDash > 10 ? truncated.slice(0, lastDash) : truncated; +} + +/** + * Extracts year from various formats. + * + * @param year Year value (number, string, or undefined) + * @returns 4-digit year string or "undated" + */ +export function extractYear(year?: number | string): string { + if (!year) { + return 'undated'; + } + + const yearStr = String(year); + + // Extract 4-digit year from string + const match = yearStr.match(/\b(19|20)\d{2}\b/); + if (match) { + return match[0]; + } + + // If it's already a valid year number + const yearNum = parseInt(yearStr, 10); + if (yearNum >= 1900 && yearNum <= 2100) { + return String(yearNum); + } + + return 'undated'; +} + +/** + * Normalizes a string for safe filesystem use. 
+ * + * @param str Input string + * @param maxLength Maximum length + * @returns Lowercase, alphanumeric string + */ +export function normalizeForFilesystem(str: string, maxLength: number): string { + return str + .toLowerCase() + .replace(/[^a-z0-9]/g, '') + .slice(0, maxLength) || 'unknown'; +} + +/** + * Creates a unique folder name, appending ID suffix if needed. + * + * @param doc Document metadata + * @param existingNames Set of already-used folder names + * @returns Unique folder name + */ +export function slugifyDocumentUnique( + doc: DocumentInfo, + existingNames: Set +): string { + const baseSlug = slugifyDocument(doc); + + if (!existingNames.has(baseSlug)) { + return baseSlug; + } + + // Append short ID suffix for uniqueness + if (doc.id) { + const idSuffix = String(doc.id).slice(-6); + const uniqueSlug = `${baseSlug}_${idSuffix}`; + if (!existingNames.has(uniqueSlug)) { + return uniqueSlug; + } + } + + // Fallback: append counter + let counter = 2; + while (existingNames.has(`${baseSlug}_${counter}`)) { + counter++; + } + return `${baseSlug}_${counter}`; +} + +/** + * Formats visual filename within a document folder. 
+ * + * @param pageNumber Page number in document + * @param visualIndex Index of visual on that page (0-based) + * @param extension File extension (default: 'png') + * @returns Filename like "p042_v0.png" + */ +export function formatVisualFilename( + pageNumber: number, + visualIndex: number = 0, + extension: string = 'png' +): string { + const page = String(pageNumber).padStart(3, '0'); + return `p${page}_v${visualIndex}.${extension}`; +} + diff --git a/src/infrastructure/visual-extraction/document-analyzer.ts b/src/infrastructure/visual-extraction/document-analyzer.ts new file mode 100644 index 00000000..0094dbc1 --- /dev/null +++ b/src/infrastructure/visual-extraction/document-analyzer.ts @@ -0,0 +1,190 @@ +/** + * Document Analyzer + * + * Analyzes PDF documents to determine their type: + * - native: Contains embedded image objects (diagrams, charts) + * - scanned: Pages are stored as full-page images (OCR scanned) + * - mixed: Contains both native and scanned content + * + * This determines the extraction strategy: + * - native → pdfimages + classify + * - scanned → render pages + detect regions + crop + */ + +import * as fs from 'fs'; +import { + extractPdfImages, + getPdfPageDimensions, + analyzeImageVsPageSize, + getPdfPageCount +} from './pdf-page-renderer.js'; + +/** + * Document type classification. + */ +export type DocumentType = 'native' | 'scanned' | 'mixed'; + +/** + * Result of document analysis. + */ +export interface DocumentAnalysisResult { + /** Document type */ + type: DocumentType; + /** Total number of pages */ + pageCount: number; + /** Number of embedded images found */ + imageCount: number; + /** Number of page-sized images (indicates scanning) */ + pageSizedImages: number; + /** Ratio of page-sized images to total images */ + scanRatio: number; + /** Confidence in the classification (0-1) */ + confidence: number; +} + +/** + * Options for document analysis. 
+ */ +export interface AnalysisOptions { + /** Maximum number of images to sample (default: 20) */ + sampleSize?: number; + /** Threshold for classifying as scanned (default: 0.6) */ + scannedThreshold?: number; + /** Threshold for classifying as mixed (default: 0.2) */ + mixedThreshold?: number; +} + +/** + * Analyze a PDF to determine if it's native or scanned. + * + * Samples embedded images and checks if they match page dimensions. + * Documents with mostly page-sized images are classified as scanned. + * + * @param pdfPath - Path to the PDF file + * @param options - Analysis options + * @returns Analysis result with document type and confidence + */ +export async function analyzeDocumentType( + pdfPath: string, + options: AnalysisOptions = {} +): Promise { + const { + sampleSize = 20, + scannedThreshold = 0.6, + mixedThreshold = 0.2 + } = options; + + // Verify PDF exists + if (!fs.existsSync(pdfPath)) { + throw new Error(`PDF not found: ${pdfPath}`); + } + + // Get page count and dimensions + const pageCount = getPdfPageCount(pdfPath); + const pageDimensions = getPdfPageDimensions(pdfPath); + + // Create lookup map for page dimensions + const pageDimMap = new Map(); + for (const dim of pageDimensions) { + pageDimMap.set(dim.pageNumber, { width: dim.width, height: dim.height }); + } + + // Extract embedded images (sample only) + let extractionResult; + try { + extractionResult = await extractPdfImages(pdfPath, { + minWidth: 50, // Lower threshold to catch more images + minHeight: 50 + }); + } catch (err) { + // If extraction fails, assume it might be scanned + return { + type: 'scanned', + pageCount, + imageCount: 0, + pageSizedImages: 0, + scanRatio: 1, + confidence: 0.5 + }; + } + + const totalImages = extractionResult.images.length; + + // No embedded images = definitely scanned + if (totalImages === 0) { + return { + type: 'scanned', + pageCount, + imageCount: 0, + pageSizedImages: 0, + scanRatio: 1, + confidence: 0.9 + }; + } + + // Sample images for 
analysis + const samplesToCheck = Math.min(totalImages, sampleSize); + const sampleImages = extractionResult.images.slice(0, samplesToCheck); + + // Count page-sized images + let pageSizedCount = 0; + + for (const img of sampleImages) { + const pageDim = pageDimMap.get(img.pageNumber); + + if (pageDim) { + const analysis = analyzeImageVsPageSize( + img.width, + img.height, + pageDim.width, + pageDim.height + ); + + // Consider it page-sized if it covers significant area + if (analysis.shouldSkip && analysis.areaCoverage > 0.7) { + pageSizedCount++; + } + } + } + + // Calculate scan ratio + const scanRatio = pageSizedCount / samplesToCheck; + + // Determine document type + let type: DocumentType; + let confidence: number; + + if (scanRatio >= scannedThreshold) { + type = 'scanned'; + confidence = Math.min(0.5 + scanRatio * 0.5, 0.95); + } else if (scanRatio >= mixedThreshold) { + type = 'mixed'; + confidence = 0.6 + (0.3 * (1 - Math.abs(scanRatio - 0.4) / 0.4)); + } else { + type = 'native'; + confidence = Math.min(0.5 + (1 - scanRatio) * 0.5, 0.95); + } + + return { + type, + pageCount, + imageCount: totalImages, + pageSizedImages: pageSizedCount, + scanRatio, + confidence + }; +} + +/** + * Quick check if a document is likely scanned. + * + * Faster than full analysis, just checks first few images. + * + * @param pdfPath - Path to the PDF file + * @returns true if document appears to be scanned + */ +export async function isLikelyScanned(pdfPath: string): Promise { + const result = await analyzeDocumentType(pdfPath, { sampleSize: 5 }); + return result.type === 'scanned'; +} + diff --git a/src/infrastructure/visual-extraction/epub-image-extractor.ts b/src/infrastructure/visual-extraction/epub-image-extractor.ts new file mode 100644 index 00000000..61711f72 --- /dev/null +++ b/src/infrastructure/visual-extraction/epub-image-extractor.ts @@ -0,0 +1,518 @@ +/** + * EPUB Image Extractor + * + * Extracts images from EPUB files for visual classification and storage. 
+ * + * EPUB Structure: + * - EPUB files are ZIP archives containing XHTML content + images + * - Images are listed in the OPF manifest with media-type 'image/*' + * - Images are referenced from XHTML chapters via tags + * + * Extraction Strategy: + * 1. Parse EPUB using 'epub' package + * 2. Extract all images from manifest + * 3. Map images to chapters by parsing XHTML for references + * 4. Apply pre-filters (cover, icons, decorative) + * 5. Return candidate images for classification + */ + +import EPub from 'epub'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import sharp from 'sharp'; + +/** + * An image extracted from an EPUB file. + */ +export interface EpubImage { + /** Image ID from manifest */ + manifestId: string; + /** Image path within EPUB (e.g., "images/figure1.png") */ + href: string; + /** MIME type (e.g., "image/png") */ + mimeType: string; + /** Chapter index where image is first referenced (0-based), -1 if not referenced */ + chapterIndex: number; + /** Chapter title if available */ + chapterTitle?: string; + /** Image index within chapter (0-based) */ + imageIndex: number; + /** Path to temp file containing the image */ + tempPath: string; + /** Image dimensions */ + width: number; + height: number; +} + +/** + * Pre-filter result for an image. + */ +export interface PreFilterResult { + /** Whether to skip this image */ + skip: boolean; + /** Reason for skipping */ + reason?: 'cover' | 'tooSmall' | 'decorative' | 'unsupportedFormat'; +} + +/** + * Result of EPUB image extraction. 
+ */ +export interface EpubImageExtractionResult { + /** Total images in manifest */ + totalImages: number; + /** Images extracted (passed pre-filters) */ + extractedImages: EpubImage[]; + /** Temp directory containing extracted images */ + tempDir: string; + /** Images skipped by pre-filter */ + skipped: { + cover: number; + tooSmall: number; + decorative: number; + unsupportedFormat: number; + }; + /** Errors encountered */ + errors: string[]; +} + +/** + * Options for EPUB image extraction. + */ +export interface EpubExtractionOptions { + /** Minimum image width in pixels (default: 100) */ + minWidth?: number; + /** Minimum image height in pixels (default: 100) */ + minHeight?: number; + /** Skip cover image detection (default: false) */ + skipCoverDetection?: boolean; +} + +/** + * EPUB Image Extractor + * + * Extracts and filters images from EPUB files for visual classification. + */ +export class EpubImageExtractor { + + /** + * Check if a file is an EPUB. + */ + static isEpub(filePath: string): boolean { + return filePath.toLowerCase().endsWith('.epub'); + } + + /** + * Extract all candidate images from an EPUB file. 
+ * + * @param epubPath - Path to the EPUB file + * @param options - Extraction options + * @returns Extraction result with candidate images + */ + async extract( + epubPath: string, + options: EpubExtractionOptions = {} + ): Promise { + const { + minWidth = 100, + minHeight = 100, + skipCoverDetection = false + } = options; + + const result: EpubImageExtractionResult = { + totalImages: 0, + extractedImages: [], + tempDir: '', + skipped: { + cover: 0, + tooSmall: 0, + decorative: 0, + unsupportedFormat: 0 + }, + errors: [] + }; + + // Create temp directory for extracted images + result.tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'epub-images-')); + + try { + // Parse EPUB + const epub = await this.parseEpub(epubPath); + + // Get all images from manifest + const manifestImages = this.getManifestImages(epub); + result.totalImages = manifestImages.length; + + if (manifestImages.length === 0) { + return result; + } + + // Build image-to-chapter mapping + const chapterMap = await this.buildImageChapterMap(epub); + + // Track image index per chapter + const chapterImageCounts = new Map(); + + // Process each image + for (const manifestItem of manifestImages) { + try { + // Get image data + const imageData = await this.getImageData(epub, manifestItem.id); + + if (!imageData || imageData.length === 0) { + result.errors.push(`Empty image data: ${manifestItem.href}`); + continue; + } + + // Save to temp file + const ext = this.getExtensionFromMimeType(manifestItem.mediaType); + if (!ext) { + result.skipped.unsupportedFormat++; + continue; + } + + const tempPath = path.join(result.tempDir, `${manifestItem.id}${ext}`); + fs.writeFileSync(tempPath, imageData); + + // Get image dimensions + let width = 0, height = 0; + try { + const metadata = await sharp(tempPath).metadata(); + width = metadata.width || 0; + height = metadata.height || 0; + } catch { + result.errors.push(`Failed to read dimensions: ${manifestItem.href}`); + fs.unlinkSync(tempPath); + continue; + } + + // 
Get chapter info + const chapterIndex = chapterMap.get(manifestItem.id) ?? -1; + const currentIndex = chapterImageCounts.get(chapterIndex) || 0; + chapterImageCounts.set(chapterIndex, currentIndex + 1); + + const epubImage: EpubImage = { + manifestId: manifestItem.id, + href: manifestItem.href, + mimeType: manifestItem.mediaType, + chapterIndex, + imageIndex: currentIndex, + tempPath, + width, + height + }; + + // Apply pre-filters + const preFilter = this.shouldSkipImage( + epubImage, + manifestImages, + { minWidth, minHeight, skipCoverDetection } + ); + + if (preFilter.skip) { + if (preFilter.reason === 'cover') result.skipped.cover++; + else if (preFilter.reason === 'tooSmall') result.skipped.tooSmall++; + else if (preFilter.reason === 'decorative') result.skipped.decorative++; + + // Clean up temp file for skipped images + fs.unlinkSync(tempPath); + continue; + } + + result.extractedImages.push(epubImage); + + } catch (err: any) { + result.errors.push(`Failed to extract ${manifestItem.href}: ${err.message}`); + } + } + + } catch (err: any) { + result.errors.push(`EPUB parsing failed: ${err.message}`); + } + + return result; + } + + /** + * Clean up temporary files from extraction. + */ + cleanup(result: EpubImageExtractionResult): void { + if (result.tempDir && fs.existsSync(result.tempDir)) { + try { + const files = fs.readdirSync(result.tempDir); + for (const file of files) { + try { + fs.unlinkSync(path.join(result.tempDir, file)); + } catch { + // Ignore individual file errors + } + } + fs.rmdirSync(result.tempDir); + } catch { + // Ignore cleanup errors + } + } + } + + /** + * Parse EPUB file and return parsed instance. 
+ */ + private parseEpub(epubPath: string): Promise { + return new Promise((resolve, reject) => { + const epub = new EPub(epubPath); + + epub.on('error', (err: Error) => { + reject(new Error(`Failed to parse EPUB: ${err.message}`)); + }); + + epub.on('end', () => { + resolve(epub); + }); + + epub.parse(); + }); + } + + /** + * Get all image items from the EPUB manifest. + */ + private getManifestImages(epub: EPub): Array<{ id: string; href: string; mediaType: string }> { + const images: Array<{ id: string; href: string; mediaType: string }> = []; + + const manifest = epub.manifest as Record; + + for (const [id, item] of Object.entries(manifest)) { + const mediaType = item['media-type'] || ''; + if (mediaType.startsWith('image/')) { + images.push({ + id, + href: item.href || id, + mediaType + }); + } + } + + return images; + } + + /** + * Build mapping from image manifest ID to chapter index. + */ + private async buildImageChapterMap(epub: EPub): Promise> { + const imageChapterMap = new Map(); + + // epub.flow contains chapters in reading order + const chapters = epub.flow || []; + + for (let i = 0; i < chapters.length; i++) { + const chapter = chapters[i]; + + try { + // Get chapter content to find image references + const chapterContent = await this.getChapterContent(epub, chapter.id); + + // Find all image references in the chapter + const imageRefs = this.extractImageReferences(chapterContent); + + for (const ref of imageRefs) { + // Normalize the reference to match manifest IDs + const manifestId = this.findManifestIdForReference(epub, ref); + + if (manifestId && !imageChapterMap.has(manifestId)) { + imageChapterMap.set(manifestId, i); + } + } + } catch { + // Skip chapters that can't be read + } + } + + return imageChapterMap; + } + + /** + * Get chapter content as raw HTML. 
+ */ + private getChapterContent(epub: EPub, chapterId: string): Promise { + return new Promise((resolve, reject) => { + epub.getChapter(chapterId, (err: Error | null, content: string) => { + if (err) { + reject(err); + } else { + resolve(content); + } + }); + }); + } + + /** + * Extract image references from HTML content. + */ + private extractImageReferences(html: string): string[] { + const refs: string[] = []; + + // Match tags + const imgRegex = /]+src=["']([^"']+)["']/gi; + let match; + + while ((match = imgRegex.exec(html)) !== null) { + refs.push(match[1]); + } + + // Also match xlink:href for SVG images + const xlinkRegex = /xlink:href=["']([^"']+)["']/gi; + while ((match = xlinkRegex.exec(html)) !== null) { + refs.push(match[1]); + } + + return refs; + } + + /** + * Find manifest ID for an image reference. + */ + private findManifestIdForReference(epub: EPub, ref: string): string | undefined { + const manifest = epub.manifest as Record; + + // Normalize the reference (remove path prefixes, decode URI) + const normalizedRef = this.normalizeImagePath(ref); + + for (const [id, item] of Object.entries(manifest)) { + const mediaType = item['media-type'] || ''; + if (!mediaType.startsWith('image/')) continue; + + const normalizedHref = this.normalizeImagePath(item.href || ''); + + // Check for exact match or filename match + if (normalizedHref === normalizedRef || + normalizedHref.endsWith(normalizedRef) || + normalizedRef.endsWith(normalizedHref)) { + return id; + } + } + + return undefined; + } + + /** + * Normalize image path for comparison. + */ + private normalizeImagePath(pathStr: string): string { + // Decode URI components + let normalized = decodeURIComponent(pathStr); + + // Remove leading path components like ../ + normalized = normalized.replace(/^\.\.\/+/g, ''); + + // Remove leading OEBPS/ or similar + normalized = normalized.replace(/^(OEBPS|OPS|Content)\//i, ''); + + return normalized.toLowerCase(); + } + + /** + * Get image data from EPUB. 
+ */ + private getImageData(epub: EPub, imageId: string): Promise { + return new Promise((resolve, reject) => { + epub.getImage(imageId, (err: Error | null, data: Buffer) => { + if (err) { + reject(err); + } else { + resolve(data); + } + }); + }); + } + + /** + * Get file extension from MIME type. + */ + private getExtensionFromMimeType(mimeType: string): string | null { + const mimeMap: Record = { + 'image/png': '.png', + 'image/jpeg': '.jpg', + 'image/jpg': '.jpg', + 'image/gif': '.gif', + 'image/webp': '.webp', + 'image/svg+xml': '.svg', + 'image/bmp': '.bmp' + }; + + return mimeMap[mimeType.toLowerCase()] || null; + } + + /** + * Determine if an image should be skipped. + */ + private shouldSkipImage( + image: EpubImage, + allImages: Array<{ id: string; href: string; mediaType: string }>, + options: { minWidth: number; minHeight: number; skipCoverDetection: boolean } + ): PreFilterResult { + const { minWidth, minHeight, skipCoverDetection } = options; + + // 1. Skip if too small + if (image.width < minWidth || image.height < minHeight) { + return { skip: true, reason: 'tooSmall' }; + } + + // 2. Skip cover images (unless disabled) + if (!skipCoverDetection && this.isCoverImage(image, allImages)) { + return { skip: true, reason: 'cover' }; + } + + // 3. Skip decorative images (filename patterns) + if (this.isDecorativeImage(image)) { + return { skip: true, reason: 'decorative' }; + } + + return { skip: false }; + } + + /** + * Detect if an image is likely a cover image. 
+ */ + private isCoverImage( + image: EpubImage, + allImages: Array<{ id: string; href: string; mediaType: string }> + ): boolean { + const href = image.href.toLowerCase(); + const id = image.manifestId.toLowerCase(); + + // Check filename/ID patterns + const coverPatterns = ['cover', 'title', 'front', 'titlepage']; + if (coverPatterns.some(p => href.includes(p) || id.includes(p))) { + return true; + } + + // Check if it's the first image and significantly larger than others + // (covers are typically portrait and larger than content images) + if (allImages.length > 0 && allImages[0].id === image.manifestId) { + const isPortrait = image.height > image.width; + const isLarge = image.width > 400 && image.height > 600; + if (isPortrait && isLarge) { + return true; + } + } + + return false; + } + + /** + * Detect if an image is decorative. + */ + private isDecorativeImage(image: EpubImage): boolean { + const href = image.href.toLowerCase(); + + // Check filename patterns for decorative elements + const decorativePatterns = [ + 'divider', 'ornament', 'separator', 'border', 'line', + 'bullet', 'icon', 'arrow', 'button', 'logo', + 'spacer', 'dingbat', 'decoration', 'flourish' + ]; + + return decorativePatterns.some(p => href.includes(p)); + } +} + diff --git a/src/infrastructure/visual-extraction/image-processor.ts b/src/infrastructure/visual-extraction/image-processor.ts new file mode 100644 index 00000000..f4f76c67 --- /dev/null +++ b/src/infrastructure/visual-extraction/image-processor.ts @@ -0,0 +1,305 @@ +/** + * Image Processor + * + * Handles image processing operations for visual extraction: + * - Cropping regions from page images + * - Converting to grayscale + * - Saving as optimized PNG + * - Embedding metadata in PNG tEXt chunks + * + * Uses sharp for high-performance image processing. 
+ */ + +import sharp from 'sharp'; +import * as fs from 'fs'; +import * as path from 'path'; +import type { BoundingBox } from './types.js'; + +/** + * Metadata to embed in PNG images. + */ +export interface ImageEmbeddedMetadata { + title?: string; + author?: string; + year?: number | string; + pageNumber: number; + imageIndex: number; + catalogId: number; + source?: string; +} + +/** + * Image metadata from sharp. + */ +export interface ImageMetadata { + width: number; + height: number; + format: string; + channels: number; +} + +/** + * Get image metadata. + * + * @param imagePath - Path to the image file + * @returns Image metadata + */ +export async function getImageMetadata(imagePath: string): Promise { + const metadata = await sharp(imagePath).metadata(); + return { + width: metadata.width || 0, + height: metadata.height || 0, + format: metadata.format || 'unknown', + channels: metadata.channels || 0 + }; +} + +/** + * Crop a region from an image and convert to grayscale. + * + * @param sourcePath - Path to the source image + * @param outputPath - Path to save the cropped image + * @param boundingBox - Normalized bounding box (0-1 coordinates) + * @param options - Processing options + * @returns Metadata of the cropped image + */ +export async function cropAndGrayscale( + sourcePath: string, + outputPath: string, + boundingBox: BoundingBox, + options: { + pngCompression?: number; // 0-9, higher = smaller file + } = {} +): Promise { + const { pngCompression = 6 } = options; + + // Get source image dimensions + const metadata = await getImageMetadata(sourcePath); + + // Convert normalized coordinates to pixels + const left = Math.round(boundingBox.x * metadata.width); + const top = Math.round(boundingBox.y * metadata.height); + const width = Math.round(boundingBox.width * metadata.width); + const height = Math.round(boundingBox.height * metadata.height); + + // Ensure valid crop dimensions + const cropWidth = Math.max(1, Math.min(width, metadata.width - 
left)); + const cropHeight = Math.max(1, Math.min(height, metadata.height - top)); + + // Ensure output directory exists + const outputDir = path.dirname(outputPath); + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + + // Crop, convert to grayscale, and save + await sharp(sourcePath) + .extract({ + left: Math.max(0, left), + top: Math.max(0, top), + width: cropWidth, + height: cropHeight + }) + .grayscale() + .png({ compressionLevel: pngCompression }) + .toFile(outputPath); + + // Return metadata of the output image + return getImageMetadata(outputPath); +} + +/** + * Build PNG tEXt chunks from embedded metadata. + * + * PNG tEXt chunks are key-value pairs stored in the image file. + * Standard keys: Title, Author, Description, Copyright, Creation Time, Software + * Custom keys are also supported. + * + * @param metadata - Metadata to embed + * @returns Object with tEXt chunk key-value pairs + */ +function buildPngTextChunks(metadata: ImageEmbeddedMetadata): Record { + const chunks: Record = {}; + + if (metadata.title) { + chunks['Title'] = metadata.title; + } + if (metadata.author) { + chunks['Author'] = metadata.author; + } + if (metadata.year) { + chunks['Creation Time'] = String(metadata.year); + } + if (metadata.source) { + chunks['Source'] = metadata.source; + } + + // Custom metadata fields + chunks['Page'] = String(metadata.pageNumber); + chunks['ImageIndex'] = String(metadata.imageIndex); + chunks['CatalogId'] = String(metadata.catalogId); + chunks['Software'] = 'concept-rag visual extractor'; + + return chunks; +} + +/** + * Convert a full page image to grayscale and save. + * + * Used when extracting the entire page as a visual. 
+ * + * @param sourcePath - Path to the source image + * @param outputPath - Path to save the grayscale image + * @param options - Processing options + * @returns Metadata of the output image + */ +export async function convertToGrayscale( + sourcePath: string, + outputPath: string, + options: { + pngCompression?: number; + maxWidth?: number; // Resize if larger than this + embeddedMetadata?: ImageEmbeddedMetadata; + } = {} +): Promise { + const { pngCompression = 6, maxWidth, embeddedMetadata } = options; + + // Ensure output directory exists + const outputDir = path.dirname(outputPath); + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + + let pipeline = sharp(sourcePath).grayscale(); + + // Resize if maxWidth specified and image is larger + if (maxWidth) { + const metadata = await getImageMetadata(sourcePath); + if (metadata.width > maxWidth) { + pipeline = pipeline.resize(maxWidth, null, { withoutEnlargement: true }); + } + } + + // Build PNG options with optional text chunks + const pngOptions: sharp.PngOptions = { compressionLevel: pngCompression }; + + if (embeddedMetadata) { + // Build text chunks for reference (used in embedMetadataInPng) + void buildPngTextChunks(embeddedMetadata); + // Sharp doesn't directly support tEXt chunks in png(), so we use withMetadata + // and write a separate function for full metadata embedding + } + + await pipeline + .png(pngOptions) + .toFile(outputPath); + + // If metadata was requested, re-process to embed it + if (embeddedMetadata) { + await embedMetadataInPng(outputPath, embeddedMetadata); + } + + return getImageMetadata(outputPath); +} + +/** + * Embed metadata into an existing PNG file. + * + * Uses sharp to read and rewrite the image with metadata. + * This is a two-pass operation: read, then write with metadata. 
/**
 * Embed metadata into an existing PNG file.
 *
 * Uses sharp to read and rewrite the image with metadata.
 * This is a two-pass operation: read, then write with metadata.
 *
 * @param imagePath - Path to the PNG file
 * @param metadata - Metadata to embed
 */
export async function embedMetadataInPng(
  imagePath: string,
  metadata: ImageEmbeddedMetadata
): Promise<void> {
  // Read the existing image
  const imageBuffer = await fs.promises.readFile(imagePath);

  // Build EXIF-compatible metadata
  // Sharp supports a subset of EXIF fields via withMetadata
  const exifData: sharp.WriteableMetadata = {};

  // Build comment string with all metadata
  const metadataLines = [
    metadata.title ? `Title: ${metadata.title}` : null,
    metadata.author ? `Author: ${metadata.author}` : null,
    metadata.year ? `Year: ${metadata.year}` : null,
    `Page: ${metadata.pageNumber}`,
    `Image Index: ${metadata.imageIndex}`,
    `Catalog ID: ${metadata.catalogId}`,
    metadata.source ? `Source: ${metadata.source}` : null,
    'Software: concept-rag visual extractor'
  ].filter(Boolean).join('\n');

  // Sharp's PNG support for metadata is limited
  // Use EXIF comment field which is preserved in PNG via iTXt/tEXt
  // Build IFD0 with only defined values to satisfy TypeScript
  const ifd0: Record<string, string> = {
    ImageDescription: metadataLines,
    Software: 'concept-rag visual extractor',
  };
  if (metadata.author) ifd0.Artist = metadata.author;
  if (metadata.title) ifd0.Copyright = `From: ${metadata.title}`;

  exifData.exif = { IFD0: ifd0 };

  // Write to a temp file, then atomically replace the original. Remove the
  // temp file on failure so a crashed write doesn't leak `.tmp` artifacts.
  const tmpPath = imagePath + '.tmp';
  try {
    await sharp(imageBuffer)
      .withMetadata(exifData)
      .png({ compressionLevel: 6 })
      .toFile(tmpPath);
    await fs.promises.rename(tmpPath, imagePath);
  } catch (err) {
    await fs.promises.rm(tmpPath, { force: true }).catch(() => {});
    throw err;
  }
}

/**
 * Get the file size of an image in bytes.
 *
 * @param imagePath - Path to the image file
 * @returns File size in bytes
 */
export function getImageFileSize(imagePath: string): number {
  const stats = fs.statSync(imagePath);
  return stats.size;
}

/**
 * Check if an image meets minimum size requirements.
 *
 * @param imagePath - Path to the image file
 * @param minWidth - Minimum width in pixels
 * @param minHeight - Minimum height in pixels
 * @returns True if image meets requirements
 */
export async function meetsMinimumSize(
  imagePath: string,
  minWidth: number,
  minHeight: number
): Promise<boolean> {
  const metadata = await getImageMetadata(imagePath);
  return metadata.width >= minWidth && metadata.height >= minHeight;
}

/**
 * Load an image as a base64 string for sending to Vision LLM.
 *
 * @param imagePath - Path to the image file
 * @returns Base64-encoded image with data URL prefix
 */
export async function loadImageAsBase64(imagePath: string): Promise<string> {
  const buffer = await fs.promises.readFile(imagePath);
  const base64 = buffer.toString('base64');

  // Determine MIME type from extension. Previously only png/jpg were
  // recognized, so .gif/.webp files were mislabeled as image/png.
  const mimeByExt: Record<string, string> = {
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp'
  };
  const ext = path.extname(imagePath).toLowerCase();
  const mimeType = mimeByExt[ext] ?? 'image/png';

  return `data:${mimeType};base64,${base64}`;
}

diff --git a/src/infrastructure/visual-extraction/index.ts b/src/infrastructure/visual-extraction/index.ts
new file mode 100644
index 00000000..fe6aac6b
--- /dev/null
+++ b/src/infrastructure/visual-extraction/index.ts
@@ -0,0 +1,41 @@
/**
 * Visual Extraction Module
 *
 * Provides visual extraction capabilities for PDF documents:
 * - Automatic document type detection (native vs scanned)
 * - Local classification using LayoutParser (no API cost)
 * - PDF page rendering and region detection
 * - Grayscale image extraction and storage
 * - Vision LLM for semantic description generation (separate step)
 *
 * Only diagrams with semantic meaning are stored.
 * Photos, screenshots, and decorative images are filtered out.
 */
+ */ + +// Main extractor +export { VisualExtractor, type VisualExtractionResult, type VisualExtractionOptions, type DocumentFormat } from './visual-extractor.js'; + +// EPUB image extractor +export { EpubImageExtractor, type EpubImage, type EpubImageExtractionResult, type EpubExtractionOptions } from './epub-image-extractor.js'; + +// Local classifier (no API cost) +export { classifyImage, detectRegions, isLocalClassifierAvailable, type ClassificationResult, type DetectedRegion, type ClassifierOptions } from './local-classifier.js'; + +// Document analysis +export { analyzeDocumentType, isLikelyScanned, type DocumentType, type DocumentAnalysisResult, type AnalysisOptions } from './document-analyzer.js'; + +// Region cropping +export { cropRegion, cropRegions, type CropOptions, type CropResult } from './region-cropper.js'; + +// Vision LLM (for descriptions only) +export { VisionLLMService, createVisionLLMService, type VisionLLMConfig, type DescriptionResult } from './vision-llm-service.js'; + +// PDF utilities +export { renderPdfPages, cleanupRenderedPages, getPdfPageCount, isPdfToolsAvailable, extractPdfImages, cleanupExtractedImages, getPdfPageDimensions, analyzeImageVsPageSize, type RenderResult, type ImageExtractionResult, type ExtractedImage, type PdfPageDimensions, type PageSizeAnalysis } from './pdf-page-renderer.js'; + +// Image processing +export { cropAndGrayscale, convertToGrayscale, getImageMetadata, loadImageAsBase64, getImageFileSize, meetsMinimumSize, embedMetadataInPng, type ImageMetadata, type ImageEmbeddedMetadata } from './image-processor.js'; + +// Types +export { type BoundingBox, type DetectedVisual, type ExtractedVisual, type PageDetectionResult, type VisualExtractionConfig, type VisualExtractionProgressCallback, DEFAULT_VISUAL_EXTRACTION_CONFIG } from './types.js'; + diff --git a/src/infrastructure/visual-extraction/local-classifier.ts b/src/infrastructure/visual-extraction/local-classifier.ts new file mode 100644 index 00000000..0db765d7 
--- /dev/null
+++ b/src/infrastructure/visual-extraction/local-classifier.ts
@@ -0,0 +1,257 @@
/**
 * Local Classifier
 *
 * TypeScript wrapper for the Python LayoutParser-based classifier.
 * Provides local image classification without requiring Vision LLM API calls.
 *
 * Two modes:
 * - classify: Determine if an image is a figure/table/skip (for native PDFs)
 * - detect: Find figure/table regions within a page image (for scanned PDFs)
 */

import { spawn } from 'child_process';
import * as path from 'path';
import * as fs from 'fs';
import { fileURLToPath } from 'url';

// ESM has no __dirname; derive it from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Result of classifying a single image.
 */
export interface ClassificationResult {
  /** Visual type: figure, table, or skip */
  type: 'figure' | 'table' | 'skip';
  /** Confidence score (0-1) */
  score: number;
  /** Whether to skip this image */
  skip: boolean;
  /** Error message if classification failed */
  error?: string;
}

/**
 * A detected region within a page image.
 */
export interface DetectedRegion {
  /** Visual type: figure or table */
  type: 'figure' | 'table';
  /** Confidence score (0-1) */
  score: number;
  /** Bounding box in pixels */
  bbox: {
    x: number;
    y: number;
    width: number;
    height: number;
  };
}

/**
 * Options for classification/detection.
 */
export interface ClassifierOptions {
  /** Minimum confidence score (0-1, default: 0.5) */
  minScore?: number;
  /** Timeout in milliseconds (default: 30000) */
  timeout?: number;
}

// Paths to the Python script and its virtual environment. These names are
// referenced by the functions below; keep them stable.
const SCRIPT_PATH = path.resolve(__dirname, '../../../scripts/python/classify_visual.py');
const VENV_PYTHON_LINUX = path.resolve(__dirname, '../../../scripts/python/venv/bin/python3');
const VENV_PYTHON_WIN = path.resolve(__dirname, '../../../scripts/python/venv/Scripts/python.exe');
+ * Prefers the virtual environment if it exists. + */ +function getPythonPath(): string { + // Check for Linux/Mac venv + if (fs.existsSync(VENV_PYTHON_LINUX)) { + return VENV_PYTHON_LINUX; + } + // Check for Windows venv + if (fs.existsSync(VENV_PYTHON_WIN)) { + return VENV_PYTHON_WIN; + } + // Fall back to system Python + return 'python3'; +} + +/** + * Check if the local classifier is available. + * Returns true if Python script and dependencies are set up. + */ +export function isLocalClassifierAvailable(): boolean { + // Check if script exists + if (!fs.existsSync(SCRIPT_PATH)) { + return false; + } + // Check if venv exists (indicates dependencies are installed) + return fs.existsSync(VENV_PYTHON_LINUX) || fs.existsSync(VENV_PYTHON_WIN); +} + +/** + * Run the Python classification script. + */ +async function runPythonScript(args: string[], timeout: number = 30000): Promise { + return new Promise((resolve, reject) => { + const pythonPath = getPythonPath(); + + const childProcess = spawn(pythonPath, [SCRIPT_PATH, ...args], { + env: { ...process.env, PYTHONUNBUFFERED: '1' } + }); + + let stdout = ''; + let stderr = ''; + + const timeoutId = setTimeout(() => { + childProcess.kill(); + reject(new Error(`Classification timed out after ${timeout}ms`)); + }, timeout); + + childProcess.stdout.on('data', (data: Buffer) => { stdout += data.toString(); }); + childProcess.stderr.on('data', (data: Buffer) => { stderr += data.toString(); }); + + childProcess.on('close', (code: number | null) => { + clearTimeout(timeoutId); + + if (code === 0) { + resolve(stdout.trim()); + } else { + // Try to parse error from stdout (script outputs JSON errors) + try { + const result = JSON.parse(stdout.trim()); + if (result.error) { + reject(new Error(result.error)); + return; + } + } catch { + // Not JSON, use stderr + } + reject(new Error(`Classification failed (code ${code}): ${stderr || stdout}`)); + } + }); + + childProcess.on('error', (err: Error) => { + clearTimeout(timeoutId); + 
reject(new Error(`Failed to start Python: ${err.message}`)); + }); + }); +} + +/** + * Classify a single image using the local model. + * + * Determines if the image is primarily a figure, table, or should be skipped. + * Used for native PDF images extracted via pdfimages. + * + * @param imagePath - Path to the image file + * @param options - Classification options + * @returns Classification result + */ +export async function classifyImage( + imagePath: string, + options: ClassifierOptions = {} +): Promise { + const { minScore = 0.5, timeout = 30000 } = options; + + // Verify image exists + if (!fs.existsSync(imagePath)) { + return { + type: 'skip', + score: 0, + skip: true, + error: `Image not found: ${imagePath}` + }; + } + + try { + const output = await runPythonScript( + ['classify', imagePath, '--min-score', minScore.toString()], + timeout + ); + + const result = JSON.parse(output); + + if (result.error) { + return { + type: 'skip', + score: 0, + skip: true, + error: result.error + }; + } + + return result as ClassificationResult; + } catch (err: any) { + return { + type: 'skip', + score: 0, + skip: true, + error: err.message + }; + } +} + +/** + * Detect diagram regions within a page image. + * + * Returns bounding boxes for all detected figures and tables. + * Used for scanned PDFs where each page is a single image. 
/**
 * Detect diagram regions within a page image.
 *
 * Returns bounding boxes for all detected figures and tables.
 * Used for scanned PDFs where each page is a single image.
 *
 * @param imagePath - Path to the page image
 * @param options - Detection options
 * @returns Array of detected regions with bounding boxes
 * @throws If the image is missing or the classifier reports an error
 */
export async function detectRegions(
  imagePath: string,
  options: ClassifierOptions = {}
): Promise<DetectedRegion[]> {
  const { minScore = 0.5, timeout = 60000 } = options;

  // Verify image exists
  if (!fs.existsSync(imagePath)) {
    throw new Error(`Image not found: ${imagePath}`);
  }

  const output = await runPythonScript(
    ['detect', imagePath, '--min-score', minScore.toString()],
    timeout
  );

  const result = JSON.parse(output);

  if (result.error) {
    throw new Error(result.error);
  }

  return result as DetectedRegion[];
}

/**
 * Batch classify multiple images.
 *
 * Processes images sequentially (model is cached between calls).
 * More efficient than calling classifyImage() in a loop.
 *
 * @param imagePaths - Array of image paths
 * @param options - Classification options
 * @returns Array of classification results (same order as input)
 */
export async function classifyImages(
  imagePaths: string[],
  options: ClassifierOptions = {}
): Promise<ClassificationResult[]> {
  const results: ClassificationResult[] = [];

  for (const imagePath of imagePaths) {
    const result = await classifyImage(imagePath, options);
    results.push(result);
  }

  return results;
}

diff --git a/src/infrastructure/visual-extraction/pdf-page-renderer.ts b/src/infrastructure/visual-extraction/pdf-page-renderer.ts
new file mode 100644
index 00000000..89526e10
--- /dev/null
+++ b/src/infrastructure/visual-extraction/pdf-page-renderer.ts
@@ -0,0 +1,570 @@
/**
 * PDF Page Renderer
 *
 * Renders PDF pages to PNG images using pdftoppm (from poppler-utils).
 * This is the same approach used by the OCR module.
 */
+ * + * Requirements: + * - Ubuntu/Debian: sudo apt install poppler-utils + * - macOS: brew install poppler + */ + +import { spawn, execSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +/** + * Result of rendering PDF pages. + */ +export interface RenderResult { + /** Directory containing the rendered page images */ + outputDir: string; + /** Paths to rendered page images (sorted by page number) */ + pageImages: string[]; + /** Total number of pages in the PDF */ + pageCount: number; +} + +/** + * Check if poppler-utils (pdftoppm) is available. + */ +export function isPdfToolsAvailable(): boolean { + try { + execSync('which pdftoppm', { stdio: 'ignore' }); + return true; + } catch { + return false; + } +} + +/** + * Get the number of pages in a PDF file. + * + * @param pdfPath - Path to the PDF file + * @returns Number of pages, or 1 if cannot be determined + */ +export function getPdfPageCount(pdfPath: string): number { + try { + const output = execSync(`pdfinfo "${pdfPath}" 2>/dev/null | grep "^Pages:" | awk '{print $2}'`, { + encoding: 'utf-8', + timeout: 30000 + }); + const count = parseInt(output.trim(), 10); + return isNaN(count) ? 1 : count; + } catch { + return 1; + } +} + +/** + * PDF page dimensions. + */ +export interface PdfPageDimensions { + /** Page number (1-indexed) */ + pageNumber: number; + /** Width in points (72 points = 1 inch) */ + width: number; + /** Height in points */ + height: number; +} + +/** + * Get page dimensions for all pages in a PDF. + * + * Uses pdfinfo to extract MediaBox dimensions. 
+ * + * @param pdfPath - Path to the PDF file + * @returns Array of page dimensions + */ +export function getPdfPageDimensions(pdfPath: string): PdfPageDimensions[] { + const dimensions: PdfPageDimensions[] = []; + + try { + // Use pdfinfo with -f and -l to get per-page info + const pageCount = getPdfPageCount(pdfPath); + + // Get page sizes using pdfinfo -f first -l last + const output = execSync( + `pdfinfo -f 1 -l ${pageCount} "${pdfPath}" 2>/dev/null | grep "Page.*size:"`, + { encoding: 'utf-8', timeout: 30000 } + ); + + // Parse lines like "Page 1 size: 612 x 792 pts (letter)" + const lines = output.trim().split('\n'); + for (const line of lines) { + const match = line.match(/Page\s+(\d+)\s+size:\s+([\d.]+)\s+x\s+([\d.]+)/); + if (match) { + dimensions.push({ + pageNumber: parseInt(match[1], 10), + width: parseFloat(match[2]), + height: parseFloat(match[3]) + }); + } + } + } catch { + // Fallback: try to get just the first page size + try { + const output = execSync( + `pdfinfo "${pdfPath}" 2>/dev/null | grep "Page size:"`, + { encoding: 'utf-8', timeout: 10000 } + ); + const match = output.match(/Page size:\s+([\d.]+)\s+x\s+([\d.]+)/); + if (match) { + const width = parseFloat(match[1]); + const height = parseFloat(match[2]); + const pageCount = getPdfPageCount(pdfPath); + // Assume all pages are same size + for (let i = 1; i <= pageCount; i++) { + dimensions.push({ pageNumber: i, width, height }); + } + } + } catch { + // Ignore fallback errors + } + } + + return dimensions; +} + +/** + * Result of page-size analysis. + */ +export interface PageSizeAnalysis { + /** Whether image should be skipped (too close to page size) */ + shouldSkip: boolean; + /** Reason for skipping */ + reason?: string; + /** Coverage percentage (0-1) of the page area */ + areaCoverage: number; +} + +/** + * Check if an image is likely a full page scan. + * + * Compares image dimensions against page dimensions to detect + * page-sized images (common in OCR-scanned documents). 
+ * + * @param imageWidth - Image width in pixels + * @param imageHeight - Image height in pixels + * @param pageWidth - Page width in points + * @param pageHeight - Page height in points + * @param dpi - Assumed rendering DPI (default 150) + * @returns Analysis result + */ +export function analyzeImageVsPageSize( + imageWidth: number, + imageHeight: number, + pageWidth: number, + pageHeight: number, + dpi: number = 150 +): PageSizeAnalysis { + // Convert page dimensions from points to pixels at the given DPI + // 72 points = 1 inch + const pageWidthPx = (pageWidth / 72) * dpi; + const pageHeightPx = (pageHeight / 72) * dpi; + + // Calculate how much of the page this image covers + const widthRatio = imageWidth / pageWidthPx; + const heightRatio = imageHeight / pageHeightPx; + const areaCoverage = widthRatio * heightRatio; + + // Skip if image covers >70% of page (likely a page scan) + if (areaCoverage > 0.7) { + return { + shouldSkip: true, + reason: `Image covers ${(areaCoverage * 100).toFixed(0)}% of page (likely full-page scan)`, + areaCoverage + }; + } + + // Skip if image dimensions match page dimensions closely + // (within 5% on both dimensions = likely the full page) + if (widthRatio > 0.95 && heightRatio > 0.95) { + return { + shouldSkip: true, + reason: 'Image matches page dimensions (full-page scan)', + areaCoverage + }; + } + + // Skip horizontal strips that span the page width (headers/footers) + if (widthRatio > 0.9 && heightRatio < 0.15) { + return { + shouldSkip: true, + reason: 'Horizontal page-width strip (header/footer)', + areaCoverage + }; + } + + return { + shouldSkip: false, + areaCoverage + }; +} + +/** + * Render a PDF file's pages to PNG images. + * + * Uses pdftoppm from poppler-utils for high-quality rendering. + * Images are saved to a temporary directory. 
+ * + * @param pdfPath - Path to the PDF file + * @param options - Rendering options + * @returns Promise resolving to render result + */ +export async function renderPdfPages( + pdfPath: string, + options: { + dpi?: number; + outputDir?: string; + pages?: number[]; // Specific pages to render (1-indexed), or all if undefined + onProgress?: (current: number, total: number) => void; + timeout?: number; + } = {} +): Promise { + const { + dpi = 150, + outputDir = path.join(os.tmpdir(), `pdf-render-${Date.now()}`), + pages, + onProgress, + timeout = 600000 + } = options; + + // Verify tools are available + if (!isPdfToolsAvailable()) { + throw new Error( + 'pdftoppm not found. Install poppler-utils:\n' + + ' Ubuntu/Debian: sudo apt install poppler-utils\n' + + ' macOS: brew install poppler' + ); + } + + // Verify PDF exists + if (!fs.existsSync(pdfPath)) { + throw new Error(`PDF file not found: ${pdfPath}`); + } + + // Create output directory + fs.mkdirSync(outputDir, { recursive: true }); + + const pageCount = getPdfPageCount(pdfPath); + const outputPrefix = path.join(outputDir, 'page'); + + // Build pdftoppm command + const args = [ + '-png', + '-r', dpi.toString() + ]; + + // Add page range if specific pages requested + if (pages && pages.length > 0) { + const minPage = Math.min(...pages); + const maxPage = Math.max(...pages); + args.push('-f', minPage.toString(), '-l', maxPage.toString()); + } + + args.push(pdfPath, outputPrefix); + + // Run pdftoppm + await new Promise((resolve, reject) => { + const process = spawn('pdftoppm', args); + + let stderr = ''; + + process.stderr.on('data', (data) => { + stderr += data.toString(); + }); + + const timeoutId = setTimeout(() => { + process.kill(); + reject(new Error(`PDF rendering timed out after ${timeout}ms`)); + }, timeout); + + process.on('close', (code) => { + clearTimeout(timeoutId); + if (code === 0) { + resolve(); + } else { + reject(new Error(`pdftoppm failed with code ${code}: ${stderr}`)); + } + }); + + 
process.on('error', (err) => { + clearTimeout(timeoutId); + reject(err); + }); + }); + + // Collect rendered page images + const files = fs.readdirSync(outputDir) + .filter(f => f.startsWith('page-') && f.endsWith('.png')) + .sort((a, b) => { + // Extract page number from filename (page-01.png, page-02.png, etc.) + const numA = parseInt(a.match(/page-(\d+)\.png/)?.[1] || '0', 10); + const numB = parseInt(b.match(/page-(\d+)\.png/)?.[1] || '0', 10); + return numA - numB; + }); + + const pageImages = files.map(f => path.join(outputDir, f)); + + // Report progress + if (onProgress) { + onProgress(pageImages.length, pageCount); + } + + return { + outputDir, + pageImages, + pageCount + }; +} + +/** + * Clean up rendered page images. + * + * @param renderResult - Result from renderPdfPages + */ +export function cleanupRenderedPages(renderResult: RenderResult): void { + try { + // Delete all files in the output directory + for (const imagePath of renderResult.pageImages) { + if (fs.existsSync(imagePath)) { + fs.unlinkSync(imagePath); + } + } + // Remove the directory if empty + if (fs.existsSync(renderResult.outputDir)) { + const remaining = fs.readdirSync(renderResult.outputDir); + if (remaining.length === 0) { + fs.rmdirSync(renderResult.outputDir); + } + } + } catch { + // Ignore cleanup errors + } +} + +/** + * Result of extracting embedded images from PDF. + */ +export interface ImageExtractionResult { + /** Directory containing extracted images */ + outputDir: string; + /** Extracted images with page info */ + images: ExtractedImage[]; +} + +/** + * Extracted image metadata. + */ +export interface ExtractedImage { + /** Path to the image file */ + imagePath: string; + /** Page number (1-indexed) */ + pageNumber: number; + /** Image index on the page (0-indexed) */ + imageIndex: number; + /** Image width in pixels */ + width: number; + /** Image height in pixels */ + height: number; +} + +/** + * Check if pdfimages is available. 
+ */ +export function isPdfImagesAvailable(): boolean { + try { + execSync('which pdfimages', { stdio: 'ignore' }); + return true; + } catch { + return false; + } +} + +/** + * Extract embedded images from a PDF file using pdfimages. + * + * This extracts the actual image objects embedded in the PDF, + * not rendered pages. Much more accurate for finding diagrams. + * + * @param pdfPath - Path to the PDF file + * @param options - Extraction options + * @returns Promise resolving to extraction result + */ +export async function extractPdfImages( + pdfPath: string, + options: { + outputDir?: string; + minWidth?: number; + minHeight?: number; + timeout?: number; + } = {} +): Promise { + const { + outputDir = path.join(os.tmpdir(), `pdf-images-${Date.now()}`), + minWidth = 100, + minHeight = 100, + timeout = 300000 + } = options; + + // Verify pdfimages is available + if (!isPdfImagesAvailable()) { + throw new Error( + 'pdfimages not found. Install poppler-utils:\n' + + ' Ubuntu/Debian: sudo apt install poppler-utils\n' + + ' macOS: brew install poppler' + ); + } + + // Verify PDF exists + if (!fs.existsSync(pdfPath)) { + throw new Error(`PDF file not found: ${pdfPath}`); + } + + // Create output directory + fs.mkdirSync(outputDir, { recursive: true }); + + const outputPrefix = path.join(outputDir, 'img'); + + // First, get image list with metadata using -list + let imageList = ''; + try { + imageList = execSync(`pdfimages -list "${pdfPath}" 2>/dev/null`, { + encoding: 'utf-8', + timeout: 30000 + }); + } catch { + // pdfimages -list may fail on some PDFs, continue with extraction + } + + // Parse image list to get page numbers + const pageMap = new Map(); // image index -> page number + if (imageList) { + const lines = imageList.split('\n').slice(2); // Skip header + for (const line of lines) { + const parts = line.trim().split(/\s+/); + if (parts.length >= 2) { + const page = parseInt(parts[0], 10); + const imgNum = parseInt(parts[1], 10); + if (!isNaN(page) && 
!isNaN(imgNum)) { + pageMap.set(imgNum.toString().padStart(3, '0'), page); + } + } + } + } + + // Extract images as PNG + await new Promise((resolve, reject) => { + const process = spawn('pdfimages', ['-png', pdfPath, outputPrefix]); + + let stderr = ''; + + process.stderr.on('data', (data) => { + stderr += data.toString(); + }); + + const timeoutId = setTimeout(() => { + process.kill(); + reject(new Error(`Image extraction timed out after ${timeout}ms`)); + }, timeout); + + process.on('close', (code) => { + clearTimeout(timeoutId); + if (code === 0) { + resolve(); + } else { + reject(new Error(`pdfimages failed with code ${code}: ${stderr}`)); + } + }); + + process.on('error', (err) => { + clearTimeout(timeoutId); + reject(err); + }); + }); + + // Collect extracted images and filter by size + const files = fs.readdirSync(outputDir) + .filter(f => f.startsWith('img-') && f.endsWith('.png')) + .sort(); + + const images: ExtractedImage[] = []; + const pageImageCounts = new Map(); // Track image index per page + + for (const file of files) { + const imagePath = path.join(outputDir, file); + + // Get image dimensions + let width = 0, height = 0; + try { + const result = execSync(`identify -format "%w %h" "${imagePath}"`, { + encoding: 'utf-8', + timeout: 5000 + }); + const [w, h] = result.trim().split(' '); + width = parseInt(w, 10); + height = parseInt(h, 10); + } catch { + // Skip images we can't read + continue; + } + + // Filter by minimum size + if (width < minWidth || height < minHeight) { + fs.unlinkSync(imagePath); // Clean up small images + continue; + } + + // Extract image number from filename (img-000.png, img-001.png, etc.) 
+ const match = file.match(/img-(\d+)\.png/); + const imgNumStr = match?.[1] || '000'; + + // Get page number from the list output, or default to 1 + let pageNumber = pageMap.get(imgNumStr) || 1; + + // Track image index per page + const currentIndex = pageImageCounts.get(pageNumber) || 0; + pageImageCounts.set(pageNumber, currentIndex + 1); + + images.push({ + imagePath, + pageNumber, + imageIndex: currentIndex, + width, + height + }); + } + + return { + outputDir, + images + }; +} + +/** + * Clean up extracted images. + * + * @param result - Result from extractPdfImages + */ +export function cleanupExtractedImages(result: ImageExtractionResult): void { + try { + for (const img of result.images) { + if (fs.existsSync(img.imagePath)) { + fs.unlinkSync(img.imagePath); + } + } + // Clean any remaining files + if (fs.existsSync(result.outputDir)) { + const remaining = fs.readdirSync(result.outputDir); + for (const f of remaining) { + fs.unlinkSync(path.join(result.outputDir, f)); + } + fs.rmdirSync(result.outputDir); + } + } catch { + // Ignore cleanup errors + } +} + diff --git a/src/infrastructure/visual-extraction/region-cropper.ts b/src/infrastructure/visual-extraction/region-cropper.ts new file mode 100644 index 00000000..d64f7a67 --- /dev/null +++ b/src/infrastructure/visual-extraction/region-cropper.ts @@ -0,0 +1,205 @@ +/** + * Region Cropper + * + * Crops detected regions from page images. + * Used for extracting diagrams from scanned PDF pages. + */ + +import sharp from 'sharp'; +import * as fs from 'fs'; +import * as path from 'path'; +import type { DetectedRegion } from './local-classifier.js'; +import type { ImageEmbeddedMetadata } from './image-processor.js'; + +/** + * Options for cropping a region. 
+ */ +export interface CropOptions { + /** Output path for the cropped image */ + outputPath: string; + /** Padding around the region in pixels (default: 10) */ + padding?: number; + /** Maximum width for output (will scale down if larger) */ + maxWidth?: number; + /** Convert to grayscale (default: true) */ + grayscale?: boolean; + /** PNG compression level 0-9 (default: 6) */ + pngCompression?: number; + /** Metadata to embed in the image */ + embeddedMetadata?: ImageEmbeddedMetadata; +} + +/** + * Result of cropping a region. + */ +export interface CropResult { + /** Path to the cropped image */ + outputPath: string; + /** Width of cropped image in pixels */ + width: number; + /** Height of cropped image in pixels */ + height: number; + /** Original region that was cropped */ + region: DetectedRegion; +} + +/** + * Crop a detected region from a page image. + * + * Extracts the specified bounding box, optionally converts to grayscale, + * and saves with embedded metadata. + * + * @param pageImagePath - Path to the full page image + * @param region - Detected region with bounding box + * @param options - Crop options + * @returns Crop result with output dimensions + */ +export async function cropRegion( + pageImagePath: string, + region: DetectedRegion, + options: CropOptions +): Promise { + const { + outputPath, + padding = 10, + maxWidth = 1200, + grayscale = true, + pngCompression = 6, + embeddedMetadata + } = options; + + // Verify source image exists + if (!fs.existsSync(pageImagePath)) { + throw new Error(`Page image not found: ${pageImagePath}`); + } + + // Get source image dimensions + const metadata = await sharp(pageImagePath).metadata(); + const sourceWidth = metadata.width || 0; + const sourceHeight = metadata.height || 0; + + // Calculate crop region with padding, bounded by image dimensions + const x = Math.max(0, region.bbox.x - padding); + const y = Math.max(0, region.bbox.y - padding); + const width = Math.min(region.bbox.width + padding * 2, 
sourceWidth - x); + const height = Math.min(region.bbox.height + padding * 2, sourceHeight - y); + + // Ensure output directory exists + const outputDir = path.dirname(outputPath); + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + + // Build the sharp pipeline + let pipeline = sharp(pageImagePath) + .extract({ left: x, top: y, width, height }); + + // Convert to grayscale if requested + if (grayscale) { + pipeline = pipeline.grayscale(); + } + + // Scale down if too wide + if (width > maxWidth) { + pipeline = pipeline.resize(maxWidth, null, { + withoutEnlargement: true, + fit: 'inside' + }); + } + + // Add metadata if provided + if (embeddedMetadata) { + const exifData: Record = {}; + + if (embeddedMetadata.title) { + exifData['ImageDescription'] = embeddedMetadata.title; + } + if (embeddedMetadata.author) { + exifData['Artist'] = embeddedMetadata.author; + } + if (embeddedMetadata.year !== undefined) { + exifData['Copyright'] = `${embeddedMetadata.year}`; + } + + // Build custom metadata string + const customParts: string[] = []; + if (embeddedMetadata.pageNumber !== undefined) { + customParts.push(`page:${embeddedMetadata.pageNumber}`); + } + if (embeddedMetadata.imageIndex !== undefined) { + customParts.push(`index:${embeddedMetadata.imageIndex}`); + } + if (embeddedMetadata.catalogId !== undefined) { + customParts.push(`catalog:${embeddedMetadata.catalogId}`); + } + + if (customParts.length > 0) { + exifData['Software'] = `concept-rag ${customParts.join(' ')}`; + } + + pipeline = pipeline.withMetadata({ + exif: { + IFD0: exifData + } + }); + } + + // Save as PNG + await pipeline + .png({ compressionLevel: pngCompression }) + .toFile(outputPath); + + // Get output dimensions + const outputMetadata = await sharp(outputPath).metadata(); + + return { + outputPath, + width: outputMetadata.width || width, + height: outputMetadata.height || height, + region + }; +} + +/** + * Crop multiple regions from a single page image. 
+ * + * More efficient than calling cropRegion() in a loop as it + * only reads the source image once. + * + * @param pageImagePath - Path to the full page image + * @param regions - Array of detected regions + * @param outputDir - Directory to save cropped images + * @param filenamePrefix - Prefix for output filenames (e.g., "p001") + * @param options - Crop options (outputPath is ignored) + * @returns Array of crop results + */ +export async function cropRegions( + pageImagePath: string, + regions: DetectedRegion[], + outputDir: string, + filenamePrefix: string, + options: Omit = {} +): Promise { + const results: CropResult[] = []; + + // Ensure output directory exists + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + + for (let i = 0; i < regions.length; i++) { + const region = regions[i]; + const filename = `${filenamePrefix}_v${i}.png`; + const outputPath = path.join(outputDir, filename); + + const result = await cropRegion(pageImagePath, region, { + ...options, + outputPath + }); + + results.push(result); + } + + return results; +} + diff --git a/src/infrastructure/visual-extraction/types.ts b/src/infrastructure/visual-extraction/types.ts new file mode 100644 index 00000000..f16a4aa5 --- /dev/null +++ b/src/infrastructure/visual-extraction/types.ts @@ -0,0 +1,109 @@ +/** + * Visual Extraction Types + * + * Shared types for the visual extraction pipeline. + */ + +import type { VisualType } from '../../domain/models/visual.js'; + +/** + * Bounding box for a detected visual region on a page. + */ +export interface BoundingBox { + /** X coordinate (left edge) as fraction of page width (0-1) */ + x: number; + /** Y coordinate (top edge) as fraction of page height (0-1) */ + y: number; + /** Width as fraction of page width (0-1) */ + width: number; + /** Height as fraction of page height (0-1) */ + height: number; +} + +/** + * A detected visual region on a page. 
+ */ +export interface DetectedVisual { + /** Classification of the visual */ + type: VisualType | 'skip'; + /** Bounding box (normalized 0-1 coordinates) */ + boundingBox: BoundingBox; + /** Confidence score (0-1) */ + confidence: number; + /** Brief description from detection (not full semantic description) */ + caption?: string; +} + +/** + * Result of visual detection on a single page. + */ +export interface PageDetectionResult { + /** Page number (1-indexed) */ + pageNumber: number; + /** Path to the rendered page image */ + pageImagePath: string; + /** Detected visuals on this page */ + visuals: DetectedVisual[]; +} + +/** + * Result of extracting a visual region. + */ +export interface ExtractedVisual { + /** Page number (1-indexed) for PDFs, or 0 for EPUBs */ + pageNumber: number; + /** Chapter index (0-indexed) for EPUBs, undefined for PDFs */ + chapterIndex?: number; + /** Chapter title for EPUBs */ + chapterTitle?: string; + /** Index of this visual on the page/chapter (0-indexed) */ + visualIndex: number; + /** Classification of the visual */ + type: VisualType; + /** Path to the saved image file */ + imagePath: string; + /** Bounding box used for extraction */ + boundingBox: BoundingBox; + /** Width in pixels */ + width: number; + /** Height in pixels */ + height: number; +} + +/** + * Configuration for visual extraction. + */ +export interface VisualExtractionConfig { + /** Minimum width in pixels for a visual to be extracted */ + minWidth: number; + /** Minimum height in pixels for a visual to be extracted */ + minHeight: number; + /** Maximum number of visuals to extract per page */ + maxVisualsPerPage: number; + /** DPI for PDF page rendering (higher = more detail, larger files) */ + renderDpi: number; + /** PNG compression quality (0-9, higher = smaller file, slower) */ + pngCompression: number; +} + +/** + * Default configuration for visual extraction. 
/**
 * Default configuration for visual extraction.
 */
export const DEFAULT_VISUAL_EXTRACTION_CONFIG: VisualExtractionConfig = {
  minWidth: 100,
  minHeight: 100,
  maxVisualsPerPage: 10,
  renderDpi: 150,
  pngCompression: 6
};

/**
 * Progress callback for visual extraction operations.
 */
export type VisualExtractionProgressCallback = (
  stage: 'rendering' | 'detecting' | 'extracting' | 'classifying',
  current: number,
  total: number,
  message?: string
) => void;

diff --git a/src/infrastructure/visual-extraction/vision-llm-service.ts b/src/infrastructure/visual-extraction/vision-llm-service.ts
new file mode 100644
index 00000000..9d5e3376
--- /dev/null
+++ b/src/infrastructure/visual-extraction/vision-llm-service.ts
@@ -0,0 +1,298 @@
/**
 * Vision LLM Service
 *
 * Provides Vision LLM integration via OpenRouter for:
 * - Visual classification (diagram vs photo)
 * - Semantic description generation
 *
 * Supports models with vision capabilities:
 * - anthropic/claude-3-5-haiku-20241022 (default - fast and cost-effective)
 * - anthropic/claude-sonnet-4
 * - openai/gpt-4o
 * - google/gemini-2.0-flash-001
 */

import { loadImageAsBase64 } from './image-processor.js';
import type { VisualType } from '../../domain/models/visual.js';
import type { DetectedVisual } from './types.js';

/**
 * Configuration for Vision LLM service.
 */
export interface VisionLLMConfig {
  /** OpenRouter API key (required) */
  apiKey: string;
  /** Model identifier; falls back to the configured default */
  model?: string;
  /** API base URL; defaults to OpenRouter */
  baseUrl?: string;
  /** Per-request timeout in milliseconds */
  timeoutMs?: number;
  /** Maximum retry attempts after the initial request */
  maxRetries?: number;
}

/**
 * Classification result from Vision LLM.
 */
export interface ClassificationResult {
  /** Visual type or 'skip' if not a diagram */
  type: VisualType | 'skip';
  /** Confidence score (0-1) */
  confidence: number;
  /** Brief explanation */
  reason?: string;
}
+ */ +export interface DescriptionResult { + /** Semantic description of the visual */ + description: string; + /** Visual type classification */ + type: VisualType; + /** Key concepts identified in the visual */ + concepts: string[]; +} + +/** + * Detection result for visuals on a page. + */ +export interface PageVisualDetectionResult { + /** Detected visuals with bounding boxes */ + visuals: DetectedVisual[]; + /** Whether the page contains any visuals */ + hasVisuals: boolean; +} + +import { Configuration } from '../../application/config/index.js'; + +const DEFAULT_BASE_URL = 'https://openrouter.ai/api/v1'; +const DEFAULT_TIMEOUT_MS = 60000; + +/** + * Classification prompt for determining if an image is a diagram. + */ +const CLASSIFICATION_PROMPT = `Analyze this image from a technical document. + +Classify it as ONE of: +- diagram: flowcharts, UML, architecture diagrams, state machines, sequence diagrams, dependency graphs +- flowchart: process flows, decision trees, workflow diagrams +- chart: bar charts, line graphs, pie charts, scatter plots, histograms +- table: structured tabular data, matrices +- figure: technical illustrations with labels, annotated diagrams +- skip: photographs, screenshots, decorative images, logos, icons, cover images + +IMPORTANT: Only classify as diagram/flowchart/chart/table/figure if it has semantic technical meaning. +Photos, decorative elements, and non-technical images should be classified as "skip". + +Respond with ONLY a JSON object: +{"type": "", "confidence": <0-1>, "reason": ""}`; + +/** + * Description prompt for generating semantic description of a visual. + */ +const DESCRIPTION_PROMPT = `Describe this diagram from a technical document. + +Focus on the SEMANTIC MEANING, not visual appearance: +1. What system, process, or concept does this diagram represent? +2. What are the key components or entities shown? +3. What relationships or flows are depicted? +4. What technical concepts does this illustrate? + +Provide: +1. 
A concise description (2-4 sentences) capturing the semantic meaning +2. Classification as: diagram, flowchart, chart, table, or figure +3. Key technical concepts illustrated (3-8 concepts) + +Respond with ONLY a JSON object: +{ + "description": "", + "type": "", + "concepts": ["concept1", "concept2", ...] +}`; + +/** + * Vision LLM Service for visual classification and description. + */ +export class VisionLLMService { + private config: Required; + + constructor(config: VisionLLMConfig) { + if (!config.apiKey) { + throw new Error('Vision LLM API key is required'); + } + + // Get default model from configuration + const appConfig = Configuration.getInstance(); + const defaultModel = appConfig.llm.visionModel; + + this.config = { + apiKey: config.apiKey, + model: config.model || defaultModel, + baseUrl: config.baseUrl || DEFAULT_BASE_URL, + timeoutMs: config.timeoutMs || DEFAULT_TIMEOUT_MS, + maxRetries: config.maxRetries || 2 + }; + } + + /** + * Classify an image as diagram or skip. + * + * @param imagePath - Path to the image file + * @returns Classification result + */ + async classifyImage(imagePath: string): Promise { + const imageBase64 = await loadImageAsBase64(imagePath); + + const response = await this.callVisionLLM(CLASSIFICATION_PROMPT, imageBase64); + + try { + // Extract JSON from response (may have markdown code blocks) + const jsonMatch = response.match(/\{[\s\S]*\}/); + if (!jsonMatch) { + // Only log if there was an actual response (not empty/rate-limited) + if (response.trim()) { + console.warn('Failed to parse classification response:', response); + } + return { type: 'skip', confidence: 0.5, reason: 'Parse error' }; + } + + const result = JSON.parse(jsonMatch[0]); + + // Validate type + const validTypes = ['diagram', 'flowchart', 'chart', 'table', 'figure', 'skip']; + const type = validTypes.includes(result.type) ? result.type : 'skip'; + + return { + type: type as VisualType | 'skip', + confidence: typeof result.confidence === 'number' ? 
result.confidence : 0.5, + reason: result.reason + }; + } catch (error) { + // Silently skip - parse errors are expected for non-semantic images + return { type: 'skip', confidence: 0.5, reason: 'Parse error' }; + } + } + + /** + * Generate semantic description of a visual. + * + * @param imagePath - Path to the image file + * @returns Description result + */ + async describeVisual(imagePath: string): Promise { + const imageBase64 = await loadImageAsBase64(imagePath); + + const response = await this.callVisionLLM(DESCRIPTION_PROMPT, imageBase64); + + try { + // Extract JSON from response + const jsonMatch = response.match(/\{[\s\S]*\}/); + if (!jsonMatch) { + throw new Error('No JSON found in response'); + } + + const result = JSON.parse(jsonMatch[0]); + + // Validate and normalize + const validTypes = ['diagram', 'flowchart', 'chart', 'table', 'figure']; + const type = validTypes.includes(result.type) ? result.type : 'diagram'; + + return { + description: result.description || 'Visual content from document', + type: type as VisualType, + concepts: Array.isArray(result.concepts) ? result.concepts : [] + }; + } catch (error) { + console.warn('Failed to parse description response:', error); + return { + description: 'Visual content from document (description unavailable)', + type: 'diagram', + concepts: [] + }; + } + } + + /** + * Call the Vision LLM API. 
+ * + * @param prompt - Text prompt + * @param imageBase64 - Base64-encoded image with data URL prefix + * @returns Response text + */ + private async callVisionLLM(prompt: string, imageBase64: string): Promise { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), this.config.timeoutMs); + + try { + const response = await fetch(`${this.config.baseUrl}/chat/completions`, { + method: 'POST', + headers: { + 'Authorization': `Bearer ${this.config.apiKey}`, + 'Content-Type': 'application/json', + 'HTTP-Referer': 'https://github.com/m2ux/concept-rag', + 'X-Title': 'Concept-RAG Visual Extraction' + }, + body: JSON.stringify({ + model: this.config.model, + messages: [ + { + role: 'user', + content: [ + { + type: 'text', + text: prompt + }, + { + type: 'image_url', + image_url: { + url: imageBase64 + } + } + ] + } + ], + temperature: 0.3, + max_tokens: 1024 + }), + signal: controller.signal + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Vision LLM API error: ${response.status} - ${errorText}`); + } + + const data = await response.json() as { + choices: Array<{ message: { content: string } }>; + }; + + return data.choices[0]?.message?.content || ''; + } finally { + clearTimeout(timeoutId); + } + } +} + +/** + * Create a Vision LLM service from environment/configuration. 
+ */ +export function createVisionLLMService( + options: { + apiKey?: string; + model?: string; + } = {} +): VisionLLMService { + const config = Configuration.getInstance(); + const apiKey = options.apiKey || config.llm.apiKey; + + if (!apiKey) { + throw new Error( + 'OPENROUTER_API_KEY environment variable is required for Vision LLM.\n' + + 'Get an API key from https://openrouter.ai/' + ); + } + + return new VisionLLMService({ + apiKey, + model: options.model // Will use config default if undefined + }); +} + diff --git a/src/infrastructure/visual-extraction/visual-extractor.ts b/src/infrastructure/visual-extraction/visual-extractor.ts new file mode 100644 index 00000000..d97c3c98 --- /dev/null +++ b/src/infrastructure/visual-extraction/visual-extractor.ts @@ -0,0 +1,837 @@ +/** + * Visual Extractor + * + * Orchestrates the visual extraction pipeline: + * 1. Analyze document type (native vs scanned) + * 2. Extract/render images + * 3. Classify using LOCAL model (no API cost) + * 4. Save semantic diagrams as grayscale + * + * Classification is done locally using LayoutParser. + * Vision LLM is only used for description generation (separate step). 
+ */ + +import * as fs from 'fs'; +import * as path from 'path'; +import { + extractPdfImages, + cleanupExtractedImages, + cleanupRenderedPages, + isPdfImagesAvailable, + isPdfToolsAvailable, + getPdfPageDimensions, + analyzeImageVsPageSize, + renderPdfPages, + type ExtractedImage, + type PdfPageDimensions +} from './pdf-page-renderer.js'; +import { convertToGrayscale, getImageMetadata, type ImageEmbeddedMetadata } from './image-processor.js'; +import { classifyImage, detectRegions, isLocalClassifierAvailable } from './local-classifier.js'; +import { analyzeDocumentType, type DocumentType } from './document-analyzer.js'; +import { cropRegion } from './region-cropper.js'; +import { EpubImageExtractor, type EpubImage } from './epub-image-extractor.js'; +import type { ExtractedVisual, VisualExtractionConfig, VisualExtractionProgressCallback } from './types.js'; +import { DEFAULT_VISUAL_EXTRACTION_CONFIG } from './types.js'; +import type { VisualType } from '../../domain/models/visual.js'; +import { slugifyDocument, formatVisualFilename, type DocumentInfo } from '../utils/slugify.js'; + +/** Supported document formats for visual extraction */ +export type DocumentFormat = 'pdf' | 'epub'; + +/** Batch size for parallel classification */ +const CLASSIFICATION_BATCH_SIZE = 5; + +/** + * Result of visual extraction for a document. 
+ */ +export interface VisualExtractionResult { + /** Catalog ID of the source document */ + catalogId: number; + /** Path to source document */ + sourcePath: string; + /** Human-readable folder slug (e.g., "martin_clean-architecture_2017") */ + folderSlug: string; + /** Document format (pdf or epub) */ + documentFormat: DocumentFormat; + /** Document type detected (for PDFs: native/scanned, for EPUBs: always 'native') */ + documentType: DocumentType; + /** Extracted visuals */ + visuals: ExtractedVisual[]; + /** Pages/chapters processed */ + pagesProcessed: number; + /** Pages/chapters skipped (no visuals) */ + pagesSkipped: number; + /** Images classified as non-semantic (not stored) */ + imagesFiltered: number; + /** Images skipped by pre-filter (page-sized for PDF, cover/decorative for EPUB) */ + imagesPreFiltered: number; + /** Errors encountered */ + errors: string[]; +} + +/** + * Options for visual extraction. + */ +export interface VisualExtractionOptions { + /** Configuration overrides */ + config?: Partial; + /** Progress callback */ + onProgress?: VisualExtractionProgressCallback; + /** Specific pages to process (1-indexed), or all if undefined */ + pages?: number[]; + /** Force document type instead of auto-detecting */ + forceDocumentType?: DocumentType; + /** Minimum confidence score for classification (0-1, default: 0.5) */ + minClassificationScore?: number; +} + +/** + * Visual Extractor for extracting diagrams from PDF documents. + * + * Uses local classification model for filtering (no API cost). + * Supports both native PDFs (embedded images) and scanned PDFs (page images). + */ +export class VisualExtractor { + private config: VisualExtractionConfig; + private imagesDir: string; + + /** + * Create a new VisualExtractor. 
+ * + * @param dbPath - Path to the database directory (for images folder) + * @param options - Extraction options + */ + constructor( + dbPath: string, + options: { + config?: Partial; + } = {} + ) { + this.config = { + ...DEFAULT_VISUAL_EXTRACTION_CONFIG, + ...options.config + }; + + this.imagesDir = path.join(dbPath, 'images'); + + // Ensure images directory exists + if (!fs.existsSync(this.imagesDir)) { + fs.mkdirSync(this.imagesDir, { recursive: true }); + } + } + + /** + * Extract visuals from a PDF document. + * + * Automatically detects document type and uses appropriate strategy: + * - Native PDF: Extract embedded images → classify → save + * - Scanned PDF: Render pages → detect regions → crop → save + * + * @param pdfPath - Path to the PDF file + * @param catalogId - Catalog ID for the document + * @param documentInfo - Document metadata for folder naming + * @param options - Extraction options + * @returns Extraction result + */ + async extractFromPdf( + pdfPath: string, + catalogId: number, + documentInfo: DocumentInfo, + options: VisualExtractionOptions = {} + ): Promise { + const { onProgress, forceDocumentType, minClassificationScore = 0.5 } = options; + + // Generate human-readable folder slug + const folderSlug = slugifyDocument({ ...documentInfo, id: catalogId }); + + // Initialize result + const result: VisualExtractionResult = { + catalogId, + sourcePath: pdfPath, + folderSlug, + documentFormat: 'pdf', + documentType: 'native', + visuals: [], + pagesProcessed: 0, + pagesSkipped: 0, + imagesFiltered: 0, + imagesPreFiltered: 0, + errors: [] + }; + + // Verify PDF tools are available + if (!isPdfImagesAvailable()) { + result.errors.push('pdfimages not found. 
Install poppler-utils.'); + return result; + } + + // Create document-specific images directory + const catalogImagesDir = path.join(this.imagesDir, folderSlug); + if (!fs.existsSync(catalogImagesDir)) { + fs.mkdirSync(catalogImagesDir, { recursive: true }); + } + + try { + // Step 0: Determine document type + if (onProgress) { + onProgress('extracting', 0, 1, 'Analyzing document type...'); + } + + let documentType: DocumentType; + if (forceDocumentType) { + documentType = forceDocumentType; + } else { + const analysis = await analyzeDocumentType(pdfPath); + documentType = analysis.type; + } + result.documentType = documentType; + + if (onProgress) { + onProgress('extracting', 0, 1, `Document type: ${documentType}`); + } + + // Route to appropriate extraction method + if (documentType === 'scanned') { + // Skip extraction for scanned documents - OCR text detection is unreliable + if (onProgress) { + onProgress('extracting', 1, 1, 'Skipping scanned document'); + } + result.pagesSkipped = 1; + } else { + await this.extractFromNativePdf( + pdfPath, catalogId, documentInfo, catalogImagesDir, result, + { onProgress, minScore: minClassificationScore } + ); + + // If ALL images were page-sized (pre-filtered), this is likely a scanned PDF + // packaged as native - skip it rather than attempting region detection + if (result.imagesPreFiltered > 0 && + result.visuals.length === 0 && + result.imagesFiltered === 0) { + if (onProgress) { + onProgress('extracting', 1, 1, 'Skipping (all images page-sized, likely scanned)'); + } + result.documentType = 'scanned'; + } + } + + } catch (error: any) { + result.errors.push(`Extraction failed: ${error.message}`); + } + + return result; + } + + /** + * Extract visuals from a native PDF (embedded image objects). + * + * Uses pdfimages to extract embedded images, pre-filters page-sized images, + * then classifies remaining images using local model. 
+ */ + private async extractFromNativePdf( + pdfPath: string, + catalogId: number, + documentInfo: DocumentInfo, + outputDir: string, + result: VisualExtractionResult, + options: { onProgress?: VisualExtractionProgressCallback; minScore: number } + ): Promise { + const { onProgress, minScore } = options; + const folderSlug = result.folderSlug; + + let extractionResult; + try { + // Get PDF page dimensions for pre-filtering + const pageDimensions = getPdfPageDimensions(pdfPath); + const pageDimMap = new Map(); + for (const dim of pageDimensions) { + pageDimMap.set(dim.pageNumber, dim); + } + + // Extract embedded images + if (onProgress) { + onProgress('extracting', 0, 1, 'Extracting embedded images...'); + } + + extractionResult = await extractPdfImages(pdfPath, { + minWidth: this.config.minWidth, + minHeight: this.config.minHeight + }); + + const totalImages = extractionResult.images.length; + + if (totalImages === 0) { + result.pagesSkipped = 1; + return; + } + + if (onProgress) { + onProgress('extracting', 1, 1, `Found ${totalImages} embedded images`); + } + + // Pre-filter page-sized images + const candidateImages: ExtractedImage[] = []; + + for (const img of extractionResult.images) { + const pageDim = pageDimMap.get(img.pageNumber); + + if (pageDim) { + const analysis = analyzeImageVsPageSize( + img.width, + img.height, + pageDim.width, + pageDim.height + ); + + if (analysis.shouldSkip) { + result.imagesPreFiltered++; + continue; + } + } + + candidateImages.push(img); + } + + if (onProgress && result.imagesPreFiltered > 0) { + onProgress('extracting', 1, 1, + `Pre-filtered ${result.imagesPreFiltered} page-sized, ${candidateImages.length} candidates`); + } + + // Classify candidates using local model + const totalCandidates = candidateImages.length; + + for (let batchStart = 0; batchStart < totalCandidates; batchStart += CLASSIFICATION_BATCH_SIZE) { + const batchEnd = Math.min(batchStart + CLASSIFICATION_BATCH_SIZE, totalCandidates); + const batch = 
candidateImages.slice(batchStart, batchEnd); + + if (onProgress) { + onProgress('classifying', batchStart + 1, totalCandidates, + `Classifying ${batchStart + 1}-${batchEnd} of ${totalCandidates}`); + } + + // Process batch in parallel using LOCAL classifier + const batchResults = await Promise.all( + batch.map(async (img) => { + try { + const classification = await classifyImage(img.imagePath, { minScore }); + return { img, classification, error: null }; + } catch (err: any) { + return { img, classification: null, error: err.message }; + } + }) + ); + + // Process batch results + for (const { img, classification, error } of batchResults) { + if (error) { + result.errors.push(`Image p${img.pageNumber}_v${img.imageIndex}: ${error}`); + continue; + } + + if (!classification || classification.skip) { + result.imagesFiltered++; + continue; + } + + // Save as grayscale with embedded metadata + await this.saveExtractedImage( + img.imagePath, + img.pageNumber, + img.imageIndex, + classification.type as VisualType, + catalogId, + documentInfo, + outputDir, + folderSlug, + result + ); + } + } + + } finally { + // Clean up temp files + if (extractionResult) { + cleanupExtractedImages(extractionResult); + } + } + } + + /** + * Extract visuals from a scanned PDF (pages stored as images). + * + * Renders each page, detects diagram regions using local model, + * then crops and saves each detected region. + */ + private async extractFromScannedPdf( + pdfPath: string, + catalogId: number, + documentInfo: DocumentInfo, + outputDir: string, + result: VisualExtractionResult, + options: { onProgress?: VisualExtractionProgressCallback; minScore: number } + ): Promise { + const { onProgress, minScore } = options; + const folderSlug = result.folderSlug; + + // Check if local classifier is available + if (!isLocalClassifierAvailable()) { + result.errors.push( + 'Local classifier not available. 
Run: cd scripts/python && ./setup.sh' + ); + return; + } + + // Check if pdftoppm is available + if (!isPdfToolsAvailable()) { + result.errors.push('pdftoppm not found. Install poppler-utils.'); + return; + } + + let renderResult; + try { + // Render PDF pages to images + if (onProgress) { + onProgress('extracting', 0, 1, 'Rendering PDF pages...'); + } + + renderResult = await renderPdfPages(pdfPath, { + dpi: this.config.renderDpi || 150 + }); + + const totalPages = renderResult.pageImages.length; + + if (totalPages === 0) { + result.pagesSkipped = 1; + return; + } + + if (onProgress) { + onProgress('extracting', 1, 1, `Rendered ${totalPages} pages`); + } + + // Process each page + for (let i = 0; i < totalPages; i++) { + const pageImage = renderResult.pageImages[i]; + const pageNumber = i + 1; + + if (onProgress) { + onProgress('classifying', pageNumber, totalPages, + `Detecting regions on page ${pageNumber}`); + } + + try { + // Detect diagram regions in this page + const regions = await detectRegions(pageImage, { minScore }); + + if (regions.length === 0) { + result.pagesSkipped++; + continue; + } + + // Crop and save each detected region + for (let j = 0; j < regions.length; j++) { + const region = regions[j]; + const outputFilename = formatVisualFilename(pageNumber, j); + const outputPath = path.join(outputDir, outputFilename); + + // Build embedded metadata + const embeddedMetadata: ImageEmbeddedMetadata = { + title: documentInfo.title, + author: documentInfo.author, + year: documentInfo.year, + pageNumber, + imageIndex: j, + catalogId + }; + + try { + const cropResult = await cropRegion(pageImage, region, { + outputPath, + grayscale: true, + maxWidth: 1200, + pngCompression: this.config.pngCompression, + embeddedMetadata + }); + + const extractedVisual: ExtractedVisual = { + pageNumber, + visualIndex: j, + type: region.type as VisualType, + imagePath: path.join('images', folderSlug, outputFilename), + boundingBox: region.bbox, + width: cropResult.width, + 
height: cropResult.height + }; + + result.visuals.push(extractedVisual); + result.pagesProcessed++; + + } catch (cropError: any) { + result.errors.push(`Crop p${pageNumber}_v${j}: ${cropError.message}`); + } + } + + } catch (detectError: any) { + result.errors.push(`Page ${pageNumber}: ${detectError.message}`); + result.pagesSkipped++; + } + } + + } finally { + // Clean up rendered pages + if (renderResult) { + cleanupRenderedPages(renderResult); + } + } + } + + /** + * Save an extracted image with grayscale conversion and metadata. + */ + private async saveExtractedImage( + sourcePath: string, + pageNumber: number, + imageIndex: number, + visualType: VisualType, + catalogId: number, + documentInfo: DocumentInfo, + outputDir: string, + folderSlug: string, + result: VisualExtractionResult + ): Promise { + const outputFilename = formatVisualFilename(pageNumber, imageIndex); + const outputPath = path.join(outputDir, outputFilename); + + // Build embedded metadata + const embeddedMetadata: ImageEmbeddedMetadata = { + title: documentInfo.title, + author: documentInfo.author, + year: documentInfo.year, + pageNumber, + imageIndex, + catalogId + }; + + try { + await convertToGrayscale(sourcePath, outputPath, { + pngCompression: this.config.pngCompression, + maxWidth: 1200, + embeddedMetadata + }); + + const outputMetadata = await getImageMetadata(outputPath); + + const extractedVisual: ExtractedVisual = { + pageNumber, + visualIndex: imageIndex, + type: visualType, + imagePath: path.join('images', folderSlug, outputFilename), + boundingBox: { x: 0, y: 0, width: 1, height: 1 }, + width: outputMetadata.width, + height: outputMetadata.height + }; + + result.visuals.push(extractedVisual); + result.pagesProcessed++; + + } catch (saveError: any) { + result.errors.push(`Save p${pageNumber}_v${imageIndex}: ${saveError.message}`); + } + } + + /** + * Get the path to a stored visual image. 
+ * + * @param folderSlug - Document folder slug (e.g., "martin_clean-architecture_2017") + * @param pageNumber - Page number (1-indexed) + * @param visualIndex - Visual index on the page (0-indexed) + * @returns Full path to the image file + */ + getVisualPath(folderSlug: string, pageNumber: number, visualIndex: number): string { + const filename = formatVisualFilename(pageNumber, visualIndex); + return path.join(this.imagesDir, folderSlug, filename); + } + + /** + * Delete all extracted visuals for a document. + * + * @param folderSlug - Document folder slug + * @returns Number of files deleted + */ + async deleteVisualsForDocument(folderSlug: string): Promise { + const docDir = path.join(this.imagesDir, folderSlug); + + if (!fs.existsSync(docDir)) { + return 0; + } + + const files = fs.readdirSync(docDir); + let deleted = 0; + + for (const file of files) { + try { + fs.unlinkSync(path.join(docDir, file)); + deleted++; + } catch { + // Ignore individual file errors + } + } + + // Try to remove the directory if empty + try { + const remaining = fs.readdirSync(docDir); + if (remaining.length === 0) { + fs.rmdirSync(docDir); + } + } catch { + // Ignore directory removal errors + } + + return deleted; + } + + /** + * List all document folders in the images directory. + * + * @returns Array of folder slugs + */ + listDocumentFolders(): string[] { + if (!fs.existsSync(this.imagesDir)) { + return []; + } + + return fs.readdirSync(this.imagesDir, { withFileTypes: true }) + .filter(dirent => dirent.isDirectory()) + .map(dirent => dirent.name); + } + + /** + * Extract visuals from a document (auto-detects format). + * + * Routes to appropriate extraction method based on file extension. 
+ * + * @param filePath - Path to the document file (PDF or EPUB) + * @param catalogId - Catalog ID for the document + * @param documentInfo - Document metadata for folder naming + * @param options - Extraction options + * @returns Extraction result + */ + async extract( + filePath: string, + catalogId: number, + documentInfo: DocumentInfo, + options: VisualExtractionOptions = {} + ): Promise { + const ext = path.extname(filePath).toLowerCase(); + + if (ext === '.pdf') { + return this.extractFromPdf(filePath, catalogId, documentInfo, options); + } else if (ext === '.epub') { + return this.extractFromEpub(filePath, catalogId, documentInfo, options); + } else { + throw new Error(`Unsupported document format: ${ext}. Supported formats: .pdf, .epub`); + } + } + + /** + * Extract visuals from an EPUB document. + * + * Extracts images from EPUB, classifies them using local model, + * and saves semantic diagrams as grayscale images. + * + * @param epubPath - Path to the EPUB file + * @param catalogId - Catalog ID for the document + * @param documentInfo - Document metadata for folder naming + * @param options - Extraction options + * @returns Extraction result + */ + async extractFromEpub( + epubPath: string, + catalogId: number, + documentInfo: DocumentInfo, + options: VisualExtractionOptions = {} + ): Promise { + const { onProgress, minClassificationScore = 0.5 } = options; + + // Generate human-readable folder slug + const folderSlug = slugifyDocument({ ...documentInfo, id: catalogId }); + + // Initialize result + const result: VisualExtractionResult = { + catalogId, + sourcePath: epubPath, + folderSlug, + documentFormat: 'epub', + documentType: 'native', // EPUBs are always "native" + visuals: [], + pagesProcessed: 0, + pagesSkipped: 0, + imagesFiltered: 0, + imagesPreFiltered: 0, + errors: [] + }; + + // Create document-specific images directory + const catalogImagesDir = path.join(this.imagesDir, folderSlug); + if (!fs.existsSync(catalogImagesDir)) { + 
fs.mkdirSync(catalogImagesDir, { recursive: true }); + } + + const epubExtractor = new EpubImageExtractor(); + let extractionResult; + + try { + // Step 1: Extract images from EPUB + if (onProgress) { + onProgress('extracting', 0, 1, 'Extracting images from EPUB...'); + } + + extractionResult = await epubExtractor.extract(epubPath, { + minWidth: this.config.minWidth, + minHeight: this.config.minHeight + }); + + // Track pre-filtered images + result.imagesPreFiltered = + extractionResult.skipped.cover + + extractionResult.skipped.tooSmall + + extractionResult.skipped.decorative + + extractionResult.skipped.unsupportedFormat; + + const totalImages = extractionResult.extractedImages.length; + + if (totalImages === 0) { + if (onProgress) { + onProgress('extracting', 1, 1, 'No candidate images found'); + } + result.pagesSkipped = 1; + return result; + } + + if (onProgress) { + onProgress('extracting', 1, 1, + `Found ${totalImages} candidate images (${result.imagesPreFiltered} pre-filtered)`); + } + + // Step 2: Classify candidates using local model + for (let batchStart = 0; batchStart < totalImages; batchStart += CLASSIFICATION_BATCH_SIZE) { + const batchEnd = Math.min(batchStart + CLASSIFICATION_BATCH_SIZE, totalImages); + const batch = extractionResult.extractedImages.slice(batchStart, batchEnd); + + if (onProgress) { + onProgress('classifying', batchStart + 1, totalImages, + `Classifying ${batchStart + 1}-${batchEnd} of ${totalImages}`); + } + + // Process batch in parallel using LOCAL classifier + const batchResults = await Promise.all( + batch.map(async (img) => { + try { + const classification = await classifyImage(img.tempPath, { minScore: minClassificationScore }); + return { img, classification, error: null }; + } catch (err: any) { + return { img, classification: null, error: err.message }; + } + }) + ); + + // Process batch results + for (const { img, classification, error } of batchResults) { + if (error) { + result.errors.push(`Image ${img.manifestId}: 
${error}`); + continue; + } + + if (!classification || classification.skip) { + result.imagesFiltered++; + continue; + } + + // Save as grayscale with embedded metadata + await this.saveEpubImage( + img, + classification.type as VisualType, + catalogId, + documentInfo, + catalogImagesDir, + folderSlug, + result + ); + } + } + + // Add extraction errors + if (extractionResult.errors.length > 0) { + result.errors.push(...extractionResult.errors); + } + + } catch (error: any) { + result.errors.push(`EPUB extraction failed: ${error.message}`); + } finally { + // Clean up temp files + if (extractionResult) { + epubExtractor.cleanup(extractionResult); + } + } + + return result; + } + + /** + * Save an extracted EPUB image with grayscale conversion and metadata. + */ + private async saveEpubImage( + epubImage: EpubImage, + visualType: VisualType, + catalogId: number, + documentInfo: DocumentInfo, + outputDir: string, + folderSlug: string, + result: VisualExtractionResult + ): Promise { + // Use chapter index for naming (since EPUBs don't have pages) + // Add 1 to make it 1-indexed like PDF pages + const chapterNum = epubImage.chapterIndex >= 0 ? 
epubImage.chapterIndex + 1 : 0; + const outputFilename = formatVisualFilename(chapterNum, epubImage.imageIndex); + const outputPath = path.join(outputDir, outputFilename); + + // Build embedded metadata + const embeddedMetadata: ImageEmbeddedMetadata = { + title: documentInfo.title, + author: documentInfo.author, + year: documentInfo.year, + pageNumber: chapterNum, // Use chapter as "page" + imageIndex: epubImage.imageIndex, + catalogId, + source: epubImage.href + }; + + try { + await convertToGrayscale(epubImage.tempPath, outputPath, { + pngCompression: this.config.pngCompression, + maxWidth: 1200, + embeddedMetadata + }); + + const outputMetadata = await getImageMetadata(outputPath); + + const extractedVisual: ExtractedVisual = { + pageNumber: chapterNum, // Store chapter as page number for compatibility + chapterIndex: epubImage.chapterIndex >= 0 ? epubImage.chapterIndex : undefined, + chapterTitle: epubImage.chapterTitle, + visualIndex: epubImage.imageIndex, + type: visualType, + imagePath: path.join('images', folderSlug, outputFilename), + boundingBox: { x: 0, y: 0, width: 1, height: 1 }, + width: outputMetadata.width, + height: outputMetadata.height + }; + + result.visuals.push(extractedVisual); + result.pagesProcessed++; + + } catch (saveError: any) { + result.errors.push(`Save ${epubImage.manifestId}: ${saveError.message}`); + } + } +} diff --git a/src/tools/operations/concept_search.ts b/src/tools/operations/concept_search.ts index a7d3dba6..9c670aa7 100644 --- a/src/tools/operations/concept_search.ts +++ b/src/tools/operations/concept_search.ts @@ -1,13 +1,14 @@ import { BaseTool, ToolParams } from "../base/tool.js"; import { ConceptSearchService, ConceptSearchResult, EnrichedChunk, SourceWithPages } from "../../domain/services/concept-search-service.js"; import { Configuration } from "../../application/config/index.js"; +import type { VisualRepository } from "../../domain/interfaces/repositories/visual-repository.js"; export interface ConceptSearchParams 
extends ToolParams { /** The concept to search for */ concept: string; - /** Optional source path filter */ - source_filter?: string; + /** Optional document title filter */ + title_filter?: string; } /** @@ -23,7 +24,8 @@ export interface ConceptSearchParams extends ToolParams { */ export class ConceptSearchTool extends BaseTool { constructor( - private conceptSearchService: ConceptSearchService + private conceptSearchService: ConceptSearchService, + private visualRepo?: VisualRepository ) { super(); } @@ -58,9 +60,9 @@ Debug output can be enabled via DEBUG_SEARCH=true environment variable.`; type: "string", description: "The concept to search for - use conceptual terms not exact phrases (e.g., 'innovation' not 'innovation process')", }, - source_filter: { + title_filter: { type: "string", - description: "Optional: Filter results to documents containing this text in their source path" + description: "Optional: Filter results to documents containing this text in their title" } }, required: ["concept"], @@ -94,14 +96,25 @@ Debug output can be enabled via DEBUG_SEARCH=true environment variable.`; maxSources: 1000, // Effectively unlimited maxChunks: 3000, // Effectively unlimited (~3 per source) chunksPerSource: 10, - sourceFilter: params.source_filter + titleFilter: params.title_filter }); + // Get associated visual IDs for this concept + let imageIds: number[] = []; + if (this.visualRepo) { + try { + const visuals = await this.visualRepo.findByConceptName(params.concept, 100); + imageIds = visuals.map(v => v.id); + } catch { + // Visual lookup is optional - don't fail the search + } + } + // Format for MCP response const debugSearch = Configuration.getInstance().logging.debugSearch; - const formatted = this.formatResult(result, debugSearch); + const formatted = this.formatResult(result, imageIds, debugSearch); - console.error(`✅ Found: ${result.totalDocuments} documents, ${result.chunks.length} chunks across ${result.sources.length} sources`); + console.error(`✅ 
Found: ${result.totalDocuments} documents, ${result.chunks.length} chunks, ${imageIds.length} images across ${result.sources.length} sources`); return { content: [ @@ -130,9 +143,10 @@ Debug output can be enabled via DEBUG_SEARCH=true environment variable.`; /** * Format hierarchical result for LLM consumption. */ - private formatResult(result: ConceptSearchResult, debug?: boolean) { + private formatResult(result: ConceptSearchResult, imageIds: number[], debug?: boolean) { // Format sources with page context and match type const sources = result.sources.map((s: SourceWithPages) => ({ + catalog_id: s.catalogId, title: s.title, pages: s.pageNumbers, match_type: s.matchType, // 'primary' or 'related' @@ -148,8 +162,9 @@ Debug output can be enabled via DEBUG_SEARCH=true environment variable.`; : []; return { - text: e.chunk.text, + catalog_id: e.chunk.catalogId, title: e.chunk.catalogTitle || e.documentTitle || '', + text: e.chunk.text, page: e.pageNumber, concept_density: e.conceptDensity.toFixed(3), concepts: conceptNames @@ -161,6 +176,9 @@ Debug output can be enabled via DEBUG_SEARCH=true environment variable.`; concept_id: result.conceptId, summary: result.summary, + // Associated visuals + image_ids: imageIds, + // Semantic relationships related_concepts: result.relatedConcepts, synonyms: result.synonyms, @@ -178,7 +196,8 @@ Debug output can be enabled via DEBUG_SEARCH=true environment variable.`; total_documents: result.totalDocuments, total_chunks: result.totalChunks, sources_returned: result.sources.length, - chunks_returned: result.chunks.length + chunks_returned: result.chunks.length, + images_found: imageIds.length }, // Hybrid score always shown diff --git a/src/tools/operations/conceptual_broad_chunks_search.ts b/src/tools/operations/conceptual_broad_chunks_search.ts index 5c6e2881..042077c7 100644 --- a/src/tools/operations/conceptual_broad_chunks_search.ts +++ b/src/tools/operations/conceptual_broad_chunks_search.ts @@ -116,9 +116,18 @@ Debug output can 
be enabled via DEBUG_SEARCH=true environment variable.`; const clusteredResults = filterByScoreGap(positiveResults) as SearchResult[]; // Format results for MCP response - const formattedResults = clusteredResults.map((r) => ({ + const formattedResults = clusteredResults.map((r) => { + // Extract concept names + const conceptNames = (r.conceptNames && r.conceptNames.length > 0 && r.conceptNames[0] !== '') + ? r.conceptNames + : []; + + return { + catalog_id: r.catalogId, + title: r.catalogTitle || 'Untitled', text: r.text, - source: r.source, + page_number: r.pageNumber, + concepts: conceptNames, score: r.hybridScore.toFixed(3), // Hybrid score always shown ...(debugSearch && { score_components: { // Component breakdown only in debug mode @@ -129,7 +138,8 @@ Debug output can be enabled via DEBUG_SEARCH=true environment variable.`; } }), expanded_terms: r.expandedTerms - })); + }; + }); return { content: [ diff --git a/src/tools/operations/conceptual_catalog_search.ts b/src/tools/operations/conceptual_catalog_search.ts index fd34db7b..86eb8838 100644 --- a/src/tools/operations/conceptual_catalog_search.ts +++ b/src/tools/operations/conceptual_catalog_search.ts @@ -114,7 +114,8 @@ Debug output can be enabled via DEBUG_SEARCH=true environment variable.`; // Format results for MCP response const formattedResults = clusteredResults.map((r) => ({ - source: r.source, + catalog_id: r.catalogId, + title: r.catalogTitle || r.source || 'Untitled', summary: r.text, // Full summary (not truncated) score: r.hybridScore.toFixed(3), // Hybrid score always shown ...(debugSearch && { diff --git a/src/tools/operations/conceptual_chunks_search.ts b/src/tools/operations/conceptual_chunks_search.ts index 588177b4..11a7873e 100644 --- a/src/tools/operations/conceptual_chunks_search.ts +++ b/src/tools/operations/conceptual_chunks_search.ts @@ -8,7 +8,7 @@ import { Configuration } from "../../application/config/index.js"; export interface ConceptualChunksSearchParams extends ToolParams { 
text: string; - source: string; + catalog_id: number; } /** @@ -26,11 +26,11 @@ export class ConceptualChunksSearchTool extends BaseTool { @@ -146,9 +157,8 @@ Debug output can be enabled via DEBUG_SEARCH=true environment variable.`; : []; return { + title: r.catalogTitle || catalogTitle, text: r.text, - source: catalogSource, // From catalog lookup - title: r.catalogTitle || '', concepts: conceptNames, concept_ids: r.conceptIds || [], }; diff --git a/src/tools/operations/get-visuals-tool.ts b/src/tools/operations/get-visuals-tool.ts new file mode 100644 index 00000000..a916fa11 --- /dev/null +++ b/src/tools/operations/get-visuals-tool.ts @@ -0,0 +1,168 @@ +/** + * Get Visuals MCP Tool + * + * Retrieves visual content (diagrams, charts, tables, figures) from documents. + * Enables semantic search over diagram descriptions and filtering by type. + */ + +import { BaseTool, ToolParams } from '../base/tool.js'; +import type { VisualRepository } from '../../domain/interfaces/repositories/visual-repository.js'; +import type { CatalogRepository } from '../../domain/interfaces/repositories/catalog-repository.js'; +import type { Visual, VisualType } from '../../domain/models/visual.js'; + +export interface GetVisualsParams extends ToolParams { + /** Retrieve visuals by specific IDs (from concept_search image_ids) */ + ids?: number[]; + /** Filter by catalog ID */ + catalog_id?: number; + /** Filter by visual type */ + visual_type?: VisualType; + /** Filter by concept name */ + concept?: string; + /** Maximum number of visuals to return */ + limit?: number; +} + +/** + * MCP tool for retrieving visuals (diagrams, charts, tables, figures) from documents. 
+ * + * USE THIS TOOL WHEN: + * - Looking for diagrams, charts, or figures that illustrate a concept + * - Finding visual representations associated with specific documents + * - Retrieving visual context for text content + * + * DO NOT USE for: + * - Text-based search (use chunks_search or broad_chunks_search instead) + * - Finding documents by title (use catalog_search instead) + * - Searching for concepts in text (use concept_search instead) + * + * RETURNS: Array of visuals with descriptions, types, page numbers, + * concept associations, and image paths. + */ +export class GetVisualsTool extends BaseTool { + + constructor( + private visualRepo: VisualRepository, + private catalogRepo: CatalogRepository + ) { + super(); + } + + name = "get_visuals"; + description = `Retrieve visual content (diagrams, charts, tables, figures) from documents. + +USE THIS TOOL WHEN: +- Fetching visuals by ID (from concept_search image_ids) +- Looking for diagrams, charts, or figures that illustrate a concept +- Finding visual representations associated with specific documents + +DO NOT USE for: +- Text-based search (use chunks_search or broad_chunks_search instead) +- Finding documents by title (use catalog_search instead) +- Searching for concepts in text (use concept_search instead) + +RETURNS: Array of visuals with descriptions, types, page numbers, +concept associations, and image paths. 
Visual types include: +diagram, flowchart, chart, table, figure.`; + + inputSchema = { + type: "object" as const, + properties: { + ids: { + type: "array", + items: { type: "number" }, + description: "Retrieve specific visuals by their IDs (from concept_search image_ids)", + }, + catalog_id: { + type: "number", + description: "Filter visuals by catalog (document) ID", + }, + visual_type: { + type: "string", + enum: ["diagram", "flowchart", "chart", "table", "figure"], + description: "Filter by visual type: diagram, flowchart, chart, table, or figure", + }, + concept: { + type: "string", + description: "Filter by concept name associated with the visual", + }, + limit: { + type: "number", + description: "Maximum number of visuals to return (default: 20)", + default: 20 + } + }, + required: [], + }; + + async execute(params: GetVisualsParams) { + try { + const limit = params.limit ?? 20; + let visuals: Visual[]; + + // Apply filters in order of specificity + if (params.ids && params.ids.length > 0) { + // Retrieve specific visuals by IDs (most direct access) + console.error(`🔍 Retrieving ${params.ids.length} visuals by ID`); + visuals = await this.visualRepo.findByIds(params.ids); + } else if (params.concept) { + // Search by concept + console.error(`🔍 Searching visuals for concept: "${params.concept}"`); + visuals = await this.visualRepo.findByConceptName(params.concept, limit); + } else if (params.catalog_id) { + // Filter by catalog + console.error(`🔍 Searching visuals for catalog ID: ${params.catalog_id}`); + visuals = await this.visualRepo.findByCatalogId(params.catalog_id, limit); + } else if (params.visual_type) { + // Filter by visual type + console.error(`🔍 Searching visuals of type: ${params.visual_type}`); + visuals = await this.visualRepo.findByType(params.visual_type, limit); + } else { + // Get all visuals with limit - use findByType with any type to get all + console.error(`🔍 Retrieving up to ${limit} visuals`); + visuals = await 
this.visualRepo.findByType('diagram', limit); + } + + // Apply limit (unless fetching by IDs) + if (!params.ids) { + visuals = visuals.slice(0, limit); + } + + // Format response + const formattedVisuals = visuals.map((v: Visual) => ({ + id: v.id, + catalog_id: v.catalogId, + catalog_title: v.catalogTitle, + visual_type: v.visualType, + page_number: v.pageNumber, + description: v.description || 'No description available', + image_path: v.imagePath, + concepts: v.conceptNames || [] + })); + + const response = { + visuals: formattedVisuals, + total_returned: formattedVisuals.length, + filters_applied: { + ...(params.ids && { ids: params.ids }), + ...(params.catalog_id && { catalog_id: params.catalog_id }), + ...(params.visual_type && { visual_type: params.visual_type }), + ...(params.concept && { concept: params.concept }) + } + }; + + console.error(`✅ Found ${formattedVisuals.length} visuals`); + + return { + content: [{ + type: "text" as const, + text: JSON.stringify(response, null, 2) + }], + isError: false + }; + } catch (error) { + return this.handleError(error); + } + } +} +