From 61dd69a7e10f4e75f731335d069ac7878841aa34 Mon Sep 17 00:00:00 2001 From: Mykola Pereyma Date: Fri, 17 Apr 2026 11:32:17 -0700 Subject: [PATCH 01/10] Standardize local-dev Docker setup, scripts, and docs Docker: - Standardize container names (neo4j-local, pgvector-local, jupyter-local) - Use same service names in standard and dev compose for DNS resolution - Separate dev ports (7477/7690/5434/8890) to avoid conflicts with hybrid - Remove mysql from dev compose - Move Dockerfile to jupyter/Dockerfile.dev - Consolidate 7 redundant scripts into start-containers.sh/ps1 with --dev (switches compose file) and --reset (cleans extracted/) Docs: - Add AWS Prerequisites and Configure Environment to README Quick Start - Update docker_services.md with new container names and dev comparison - Fix troubleshooting.md: wrong connection strings, container names, volume names, mount paths throughout - Fix README: stale model name, mount path reference - Add SYSTEM_PROMPT_ARN/USER_PROMPT_ARN to .env.template --- examples/lexical-graph-local-dev/README.md | 47 +++++---- .../lexical-graph-local-dev/docker/build.sh | 6 -- .../docker/create_network_and_up.cmd | 38 ------- .../docker/create_network_and_up.sh | 46 --------- .../docker/dev-reset.sh | 21 ---- .../docker/dev-start.sh | 8 -- .../docker/docker-compose-dev.yml | 70 +++++-------- .../docker/docker-compose.arm.yml | 67 +++++++------ .../docker/docker-compose.yml | 65 ++++++------ .../Dockerfile.dev} | 0 .../docker/start-containers.bat | 64 ------------ .../docker/start-containers.ps1 | 58 +++++------ .../docker/start-containers.sh | 28 ++++-- .../lexical-graph-local-dev/docker/start.sh | 11 --- .../docs/docker_services.md | 99 +++++-------------- .../docs/troubleshooting.md | 52 +++++----- .../notebooks/.env.template | 23 +++-- 17 files changed, 234 insertions(+), 469 deletions(-) delete mode 100644 examples/lexical-graph-local-dev/docker/build.sh delete mode 100644 examples/lexical-graph-local-dev/docker/create_network_and_up.cmd delete mode 100755 examples/lexical-graph-local-dev/docker/create_network_and_up.sh delete mode 100755 examples/lexical-graph-local-dev/docker/dev-reset.sh delete mode 100755 examples/lexical-graph-local-dev/docker/dev-start.sh rename examples/lexical-graph-local-dev/docker/{Dockerfile.jupyter => jupyter/Dockerfile.dev} (100%) delete mode 100644 examples/lexical-graph-local-dev/docker/start-containers.bat delete mode 100755 examples/lexical-graph-local-dev/docker/start.sh diff --git a/examples/lexical-graph-local-dev/README.md b/examples/lexical-graph-local-dev/README.md index d7636073..fd495af1 100644 --- a/examples/lexical-graph-local-dev/README.md +++ b/examples/lexical-graph-local-dev/README.md @@ -11,13 +11,30 @@ This example provides a complete local development environment for the GraphRAG - [**00-Setup**](./notebooks/00-Setup.ipynb) – Environment setup, package installation, and development mode configuration - [**01-Combined-Extract-and-Build**](./notebooks/01-Combined-Extract-and-Build.ipynb) – Complete extraction and building pipeline using `LexicalGraphIndex.extract_and_build()` - [**02-Querying**](./notebooks/02-Querying.ipynb) – Graph querying examples using `LexicalGraphQueryEngine` with various retrievers -- [**03-Querying with prompting**](./notebooks/03-Querying%20with%20prompting.ipynb) – Advanced querying with custom prompts and prompt providers +- [**03-Querying-with-Prompting**](./notebooks/03-Querying-with-Prompting.ipynb) – Advanced querying with custom prompts and prompt providers - 
[**04-Advanced-Configuration-Examples**](./notebooks/04-Advanced-Configuration-Examples.ipynb) – Advanced reader configurations and metadata handling - [**05-S3-Directory-Reader-Provider**](./notebooks/05-S3-Directory-Reader-Provider.ipynb) – S3-based document reading and processing ## Quick Start -### 1. Start the Environment +### 1. AWS Prerequisites + +Before starting, ensure you have: +- [AWS CLI configured with credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html) — verify with `aws sts get-caller-identity` +- Access to Amazon Bedrock models: + - `us.anthropic.claude-sonnet-4-6` (extraction, response, evaluation) + - `cohere.embed-english-v3` (embeddings) + +### 2. Configure Environment + +```bash +cd notebooks +cp .env.template .env +``` + +Review `.env` — defaults work for local Docker services. Set `S3_BUCKET_NAME` if using S3 features (notebooks 03, 05). + +### 3. Start the Environment **Standard (x86/Intel):** ```bash @@ -37,7 +54,7 @@ cd docker ./start-containers.sh --dev --mac # Enable live code editing ``` -### 2. Access Jupyter Lab +### 4. Access Jupyter Lab Open your browser to: **http://localhost:8889** @@ -45,7 +62,7 @@ Open your browser to: **http://localhost:8889** - Navigate to the `work` folder to find notebooks - All dependencies are pre-installed -### 3. Run the Setup Notebook +### 5. Run the Setup Notebook Start with `00-Setup.ipynb` to configure your environment and verify all services are working. @@ -57,9 +74,6 @@ Start with `00-Setup.ipynb` to configure your environment and verify all service |--------|----------|-------------| | `start-containers.sh` | Unix/Linux/Mac | Main startup script with all options | | `start-containers.ps1` | Windows PowerShell | PowerShell version with same functionality | -| `start-containers.bat` | Windows CMD | Command prompt version | -| `dev-start.sh` | Unix/Linux/Mac | Development mode startup | -| `dev-reset.sh` | Unix/Linux/Mac | Reset development environment | ### Script Options @@ -86,9 +100,6 @@ Start with `00-Setup.ipynb` to configure your environment and verify all service # Windows PowerShell .\start-containers.ps1 -Mac -Dev - -# Windows Command Prompt -start-containers.bat --mac --dev ``` ## Services @@ -99,7 +110,7 @@ After startup, the following services are available: |---------|-----|-------------|---------| | **Jupyter Lab** | http://localhost:8889 | None required | Interactive development | | **Neo4j Browser** | http://localhost:7476 | neo4j/password | Graph database management | -| **PostgreSQL** | localhost:5432 | graphrag/graphragpass | Vector storage | +| **PostgreSQL** | localhost:5432 | postgres/password | Vector storage | ## Development Mode @@ -195,8 +206,8 @@ Key environment variables (configured in `docker/.env`): ```bash # Database connections (Docker internal names) -VECTOR_STORE="postgresql://graphrag:graphragpass@postgres:5432/graphrag_db" -GRAPH_STORE="bolt://neo4j:password@neo4j:7687" +VECTOR_STORE="postgresql://postgres:password@pgvector-local:5432/graphrag" +GRAPH_STORE="bolt://neo4j:password@neo4j-local:7687" # AWS Configuration (optional) AWS_REGION="us-east-1" @@ -204,7 +215,7 @@ AWS_PROFILE="your-profile" # Model Configuration EMBEDDINGS_MODEL="cohere.embed-english-v3" -EXTRACTION_MODEL="us.anthropic.claude-3-7-sonnet-20250219-v1:0" +EXTRACTION_MODEL="us.anthropic.claude-sonnet-4-6" ``` ## Troubleshooting @@ -218,12 +229,12 @@ EXTRACTION_MODEL="us.anthropic.claude-3-7-sonnet-20250219-v1:0" - PostgreSQL: 5432 **Container networking:** -- Use container 
names in connection strings (e.g., `neo4j:7687`, not `localhost:7687`) +- Use container names in connection strings (e.g., `neo4j-local:7687`, not `localhost:7687`) - The `.env` file uses Docker internal networking **Development mode:** - Restart Jupyter kernel after enabling hot-reload -- Check that lexical-graph source is mounted at `/home/jovyan/lexical-graph-src` +- Check that lexical-graph source is mounted at `/home/jovyan/lexical-graph` ### Reset Environment @@ -240,7 +251,7 @@ docker-compose down -v ## AWS Foundation Model Access (Optional) For AWS Bedrock integration, ensure your AWS account has access to: -- `anthropic.claude-3-7-sonnet-20250219-v1:0` +- `us.anthropic.claude-sonnet-4-6` - `cohere.embed-english-v3` Enable model access via the [Bedrock model access console](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html). @@ -255,7 +266,7 @@ If you have existing FalkorDB configurations: GRAPH_STORE="falkordb://localhost:6379" # New Neo4j - GRAPH_STORE="bolt://neo4j:password@neo4j:7687" + GRAPH_STORE="bolt://neo4j:password@neo4j-local:7687" ``` 2. **Update imports** in your code: diff --git a/examples/lexical-graph-local-dev/docker/build.sh b/examples/lexical-graph-local-dev/docker/build.sh deleted file mode 100644 index 2ceabb9c..00000000 --- a/examples/lexical-graph-local-dev/docker/build.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -echo "Building and starting containers..." -docker compose up -d --build - -echo "Build and startup complete." diff --git a/examples/lexical-graph-local-dev/docker/create_network_and_up.cmd b/examples/lexical-graph-local-dev/docker/create_network_and_up.cmd deleted file mode 100644 index e9b1a874..00000000 --- a/examples/lexical-graph-local-dev/docker/create_network_and_up.cmd +++ /dev/null @@ -1,38 +0,0 @@ -@echo off - -echo Setting up Lexical-Graph Local Development Environment... - -REM Check for --reset flag -IF "%1"=="--reset" ( - echo Resetting all data... - docker compose down -v - docker volume prune -f - rmdir /s /q data 2>nul - echo All data deleted. -) - -REM Create data directories -echo Creating data directories... -mkdir data\neo4j 2>nul - -REM Check if Docker network exists -docker network inspect graphrag_network >nul 2>&1 - -IF %ERRORLEVEL% NEQ 0 ( - echo Creating Docker network: graphrag_network - docker network create graphrag_network -) ELSE ( - echo Docker network 'graphrag_network' already exists -) - -REM Start services -echo Starting Docker containers... -docker compose up -d - -echo Waiting for services to be ready... -timeout /t 10 /nobreak >nul - -echo Lexical-Graph environment is ready! -echo Jupyter Lab: http://localhost:8889 (token: lexical-graph) -echo Neo4j: http://localhost:7476 -echo PostgreSQL: localhost:5433 \ No newline at end of file diff --git a/examples/lexical-graph-local-dev/docker/create_network_and_up.sh b/examples/lexical-graph-local-dev/docker/create_network_and_up.sh deleted file mode 100755 index e081d7b8..00000000 --- a/examples/lexical-graph-local-dev/docker/create_network_and_up.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -set -e - -echo "Setting up Lexical-Graph Local Development Environment..." - -# Check for --reset flag -if [ "$1" = "--reset" ]; then - echo "Resetting all data..." - docker compose down -v - docker volume prune -f - rm -rf data - echo "All data deleted." -fi - -# Create data directories -echo "Creating data directories..." -mkdir -p data/neo4j -chmod -R 755 data/ - -# Check if network exists -if ! 
docker network inspect graphrag_network >/dev/null 2>&1; then - echo "Creating Docker network: graphrag_network" - docker network create graphrag_network -else - echo "Docker network 'graphrag_network' already exists" -fi - -# Start containers -echo "Starting Docker containers..." -docker compose up -d - -echo "Waiting for services to be ready..." -sleep 10 - -# Check service health -echo "Checking service health..." -if curl -f http://localhost:8889 >/dev/null 2>&1; then - echo " Jupyter is ready at http://localhost:8889 (token: lexical-graph)" -else - echo " Jupyter may still be starting up" -fi - -echo " Lexical-Graph environment is ready!" -echo " Jupyter Lab: http://localhost:8889 (token: lexical-graph)" -echo " Neo4j: http://localhost:7476" -echo " PostgreSQL: localhost:5433" \ No newline at end of file diff --git a/examples/lexical-graph-local-dev/docker/dev-reset.sh b/examples/lexical-graph-local-dev/docker/dev-reset.sh deleted file mode 100755 index 62a85684..00000000 --- a/examples/lexical-graph-local-dev/docker/dev-reset.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -echo "Stopping and removing development containers, volumes, and networks..." -docker compose -f docker-compose-dev.yml up -d --build --force-recreate - -echo "Ensuring development containers are removed..." -docker rm -f lg-neo4j-dev lg-pgvector-db-dev lg-jupyter-dev 2>/dev/null - -echo "Removing development volumes..." -docker volume rm -f lg_pgvector_data_dev lg_neo4j_data_dev lg_neo4j_logs_dev lg_jupyter_data_dev 2>/dev/null - -echo "Clearing extracted directory..." -rm -rf extracted - -echo "Rebuilding development containers..." -docker compose -f docker-compose-dev.yml up -d --force-recreate - -echo "Development environment reset complete." -echo "" -echo "Jupyter Lab is available at: http://localhost:8889 (no password required)" -echo "Source code is mounted for live development" \ No newline at end of file diff --git a/examples/lexical-graph-local-dev/docker/dev-start.sh b/examples/lexical-graph-local-dev/docker/dev-start.sh deleted file mode 100755 index 06d68273..00000000 --- a/examples/lexical-graph-local-dev/docker/dev-start.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -echo "Building and starting development containers..." -docker compose -f docker-compose-dev.yml up -d --build -echo "Development environment startup complete." 
-echo "" -echo "Jupyter Lab is available at: http://localhost:8889 (no password required)" -echo "Source code is mounted for live development" \ No newline at end of file diff --git a/examples/lexical-graph-local-dev/docker/docker-compose-dev.yml b/examples/lexical-graph-local-dev/docker/docker-compose-dev.yml index df33fa1a..3be71567 100644 --- a/examples/lexical-graph-local-dev/docker/docker-compose-dev.yml +++ b/examples/lexical-graph-local-dev/docker/docker-compose-dev.yml @@ -1,80 +1,62 @@ services: - lg-neo4j: + neo4j-local: image: neo4j:5.25-community - container_name: lg-neo4j-dev + container_name: neo4j-local-dev ports: - - "7476:7474" # HTTP - - "7689:7687" # Bolt + - "7477:7474" # HTTP + - "7690:7687" # Bolt environment: - NEO4J_AUTH=${NEO4J_USER:-neo4j}/${NEO4J_PASSWORD:-password} - NEO4J_PLUGINS=["apoc"] volumes: - - lg_neo4j_data_dev:/data - - lg_neo4j_logs_dev:/logs + - neo4j_local_data_dev:/data + - neo4j_local_logs_dev:/logs networks: - - lg_graphrag_network_dev + - graphrag_local_network_dev - lg-postgres: + pgvector-local: image: pgvector/pgvector:0.6.2-pg16 - container_name: lg-pgvector-db-dev + container_name: pgvector-local-dev ports: - - "5433:5432" + - "5434:5432" environment: - POSTGRES_USER=${POSTGRES_USER:-postgres} - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-password} - POSTGRES_DB=${POSTGRES_DB:-graphrag} volumes: - - lg_pgvector_data_dev:/var/lib/postgresql/data + - pgvector_local_data_dev:/var/lib/postgresql/data - ./postgres/schema.sql:/docker-entrypoint-initdb.d/schema.sql networks: - - lg_graphrag_network_dev + - graphrag_local_network_dev - lg-mysql-dev: - image: mysql:8.4 - container_name: lg-mysql-dev - ports: - - "3307:3306" # Avoid conflict with host MySQL - environment: - - MYSQL_ROOT_PASSWORD=${MYSQL_ROOT_PASSWORD:-graphragroot} - - MYSQL_DATABASE=${MYSQL_DATABASE:-graphrag_db} - - MYSQL_USER=${MYSQL_USER:-graphrag} - - MYSQL_PASSWORD=${MYSQL_PASSWORD:-graphragpass} - volumes: - - lg_mysql_data_dev:/var/lib/mysql - - ./mysql/schema.sql:/docker-entrypoint-initdb.d/schema.sql # Optional SQL init - networks: - - lg_graphrag_network_dev - - lg-jupyter-dev: + jupyter-local-dev: build: - context: . 
- dockerfile: Dockerfile.jupyter - container_name: lg-jupyter-dev + context: ./jupyter + dockerfile: Dockerfile.dev + container_name: jupyter-local-dev ports: - - "8889:8888" + - "8890:8888" environment: - JUPYTER_ENABLE_LAB=yes volumes: - ../notebooks:/home/jovyan/notebooks - ../../../lexical-graph:/home/jovyan/lexical-graph - ../../../lexical-graph-contrib:/home/jovyan/lexical-graph-contrib - - lg_jupyter_data_dev:/home/jovyan/work + - jupyter_local_data_dev:/home/jovyan/work - ~/.aws:/home/jovyan/.aws networks: - - lg_graphrag_network_dev + - graphrag_local_network_dev depends_on: - - lg-postgres - - lg-neo4j - - lg-mysql-dev + - pgvector-local + - neo4j-local command: start-notebook.sh --NotebookApp.token='' --NotebookApp.password='' networks: - lg_graphrag_network_dev: + graphrag_local_network_dev: driver: bridge volumes: - lg_neo4j_data_dev: - lg_neo4j_logs_dev: - lg_pgvector_data_dev: - lg_jupyter_data_dev: - lg_mysql_data_dev: + neo4j_local_data_dev: + neo4j_local_logs_dev: + pgvector_local_data_dev: + jupyter_local_data_dev: diff --git a/examples/lexical-graph-local-dev/docker/docker-compose.arm.yml b/examples/lexical-graph-local-dev/docker/docker-compose.arm.yml index de6f26a8..013cfa1f 100644 --- a/examples/lexical-graph-local-dev/docker/docker-compose.arm.yml +++ b/examples/lexical-graph-local-dev/docker/docker-compose.arm.yml @@ -1,7 +1,7 @@ services: - neo4j: + neo4j-local: image: neo4j:5.25-community - container_name: neo4j + container_name: neo4j-local ports: - "7476:7474" # HTTP - "7689:7687" # Bolt @@ -9,17 +9,33 @@ services: - NEO4J_AUTH=${NEO4J_USER:-neo4j}/${NEO4J_PASSWORD:-password} - NEO4J_PLUGINS=["apoc"] volumes: - - neo4j_data:/data - - neo4j_logs:/logs + - neo4j_local_data:/data + - neo4j_local_logs:/logs networks: - - graphrag_network + - graphrag_local_network platform: linux/arm64 - jupyter: + pgvector-local: + image: pgvector/pgvector:0.6.2-pg16 + container_name: pgvector-local + ports: + - "5432:5432" + environment: + - POSTGRES_USER=${POSTGRES_USER:-postgres} + - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-password} + - POSTGRES_DB=${POSTGRES_DB:-graphrag} + volumes: + - pgvector_local_data:/var/lib/postgresql/data + - ./postgres/schema.sql:/docker-entrypoint-initdb.d/schema.sql + networks: + - graphrag_local_network + platform: linux/arm64 + + jupyter-local: build: context: ./jupyter dockerfile: Dockerfile - container_name: jupyter-notebook + container_name: jupyter-local ports: - "8889:8888" environment: @@ -27,38 +43,21 @@ services: - JUPYTER_TOKEN='' volumes: - ../notebooks:/home/jovyan/work - - jupyter_data:/home/jovyan/.jupyter - - ${HOME}/.aws:/home/jovyan/.aws - - ${LEXICAL_GRAPH_DEV_MOUNT:-/dev/null:/tmp/unused} + - jupyter_local_data:/home/jovyan/.jupyter + - ~/.aws:/home/jovyan/.aws networks: - - graphrag_network + - graphrag_local_network depends_on: - - neo4j - - postgres - platform: linux/arm64 - - postgres: - image: pgvector/pgvector:0.6.2-pg16 - container_name: pgvector-db - ports: - - "5432:5432" - environment: - - POSTGRES_USER=${POSTGRES_USER:-postgres} - - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-password} - - POSTGRES_DB=${POSTGRES_DB:-graphrag} - volumes: - - pgvector_data:/var/lib/postgresql/data - - ./postgres/schema.sql:/docker-entrypoint-initdb.d/schema.sql - networks: - - graphrag_network + - neo4j-local + - pgvector-local platform: linux/arm64 networks: - graphrag_network: + graphrag_local_network: driver: bridge volumes: - neo4j_data: - neo4j_logs: - pgvector_data: - jupyter_data: \ No newline at end of file + neo4j_local_data: + 
neo4j_local_logs: + pgvector_local_data: + jupyter_local_data: diff --git a/examples/lexical-graph-local-dev/docker/docker-compose.yml b/examples/lexical-graph-local-dev/docker/docker-compose.yml index f25fd0c1..725f98a9 100644 --- a/examples/lexical-graph-local-dev/docker/docker-compose.yml +++ b/examples/lexical-graph-local-dev/docker/docker-compose.yml @@ -1,7 +1,7 @@ services: - neo4j: + neo4j-local: image: neo4j:5.25-community - container_name: neo4j + container_name: neo4j-local ports: - "7476:7474" # HTTP - "7689:7687" # Bolt @@ -9,16 +9,31 @@ services: - NEO4J_AUTH=${NEO4J_USER:-neo4j}/${NEO4J_PASSWORD:-password} - NEO4J_PLUGINS=["apoc"] volumes: - - neo4j_data:/data - - neo4j_logs:/logs + - neo4j_local_data:/data + - neo4j_local_logs:/logs networks: - - graphrag_network + - graphrag_local_network - jupyter: + pgvector-local: + image: pgvector/pgvector:0.6.2-pg16 + container_name: pgvector-local + ports: + - "5432:5432" + environment: + - POSTGRES_USER=${POSTGRES_USER:-postgres} + - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-password} + - POSTGRES_DB=${POSTGRES_DB:-graphrag} + volumes: + - pgvector_local_data:/var/lib/postgresql/data + - ./postgres/schema.sql:/docker-entrypoint-initdb.d/schema.sql + networks: + - graphrag_local_network + + jupyter-local: build: context: ./jupyter dockerfile: Dockerfile - container_name: jupyter-notebook + container_name: jupyter-local ports: - "8889:8888" environment: @@ -26,36 +41,20 @@ services: - JUPYTER_TOKEN='' volumes: - ../notebooks:/home/jovyan/work - - jupyter_data:/home/jovyan/.jupyter - - ${HOME}/.aws:/home/jovyan/.aws - - ${LEXICAL_GRAPH_DEV_MOUNT:-/dev/null:/tmp/unused} + - jupyter_local_data:/home/jovyan/.jupyter + - ~/.aws:/home/jovyan/.aws networks: - - graphrag_network + - graphrag_local_network depends_on: - - neo4j - - postgres - - postgres: - image: pgvector/pgvector:0.6.2-pg16 - container_name: pgvector-db - ports: - - "5432:5432" - environment: - - POSTGRES_USER=${POSTGRES_USER:-postgres} - - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-password} - - POSTGRES_DB=${POSTGRES_DB:-graphrag} - volumes: - - pgvector_data:/var/lib/postgresql/data - - ./postgres/schema.sql:/docker-entrypoint-initdb.d/schema.sql - networks: - - graphrag_network + - neo4j-local + - pgvector-local networks: - graphrag_network: + graphrag_local_network: driver: bridge volumes: - neo4j_data: - neo4j_logs: - pgvector_data: - jupyter_data: + neo4j_local_data: + neo4j_local_logs: + pgvector_local_data: + jupyter_local_data: diff --git a/examples/lexical-graph-local-dev/docker/Dockerfile.jupyter b/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile.dev similarity index 100% rename from examples/lexical-graph-local-dev/docker/Dockerfile.jupyter rename to examples/lexical-graph-local-dev/docker/jupyter/Dockerfile.dev diff --git a/examples/lexical-graph-local-dev/docker/start-containers.bat b/examples/lexical-graph-local-dev/docker/start-containers.bat deleted file mode 100644 index c4a1c5bb..00000000 --- a/examples/lexical-graph-local-dev/docker/start-containers.bat +++ /dev/null @@ -1,64 +0,0 @@ -@echo off -setlocal enabledelayedexpansion - -REM Default to standard docker-compose file -set COMPOSE_FILE=docker-compose.yml -set DEV_MODE=false -set RESET_MODE=false - -REM Check for flags -for %%i in (%*) do ( - if "%%i"=="--mac" ( - set COMPOSE_FILE=docker-compose.arm.yml - echo Using ARM/Mac-specific configuration - ) - if "%%i"=="--dev" ( - set DEV_MODE=true - echo Enabling development mode with hot-code-injection - ) - if "%%i"=="--reset" ( - set RESET_MODE=true - 
echo Reset mode enabled - will rebuild containers and reset data - ) -) - -if "%RESET_MODE%"=="true" ( - echo Resetting containers and data... - docker compose -f %COMPOSE_FILE% down -v - echo Building and starting containers... - set BUILD_FLAG=--build -) else ( - echo Starting containers (preserving data)... - set BUILD_FLAG= -) - -if "%DEV_MODE%"=="true" ( - set LEXICAL_GRAPH_DEV_MOUNT=../../../lexical-graph:/home/jovyan/lexical-graph-src - echo Development mode: Mounting lexical-graph source code -) - -docker compose -f %COMPOSE_FILE% up -d %BUILD_FLAG% - -echo. -if "%RESET_MODE%"=="true" ( - echo Reset and startup complete! -) else ( - echo Startup complete! -) -echo. -echo Services available at: -echo Jupyter Lab: http://localhost:8889 (no password required) -echo Neo4j Browser: http://localhost:7476 (neo4j/password) -echo. -echo IMPORTANT: All notebook execution must happen in Jupyter Lab. -echo Open http://localhost:8889 to access the development environment. -echo Navigate to the 'work' folder to find the notebooks. -if "%DEV_MODE%"=="true" ( - echo. - echo Development mode enabled - lexical-graph source code mounted for hot-code-injection - echo Changes to lexical-graph source will be reflected immediately in notebooks -) -if "%RESET_MODE%"=="false" ( - echo. - echo Data preserved from previous runs. Use --reset to start fresh. -) \ No newline at end of file diff --git a/examples/lexical-graph-local-dev/docker/start-containers.ps1 b/examples/lexical-graph-local-dev/docker/start-containers.ps1 index 6e5ce8cd..bf5342c6 100644 --- a/examples/lexical-graph-local-dev/docker/start-containers.ps1 +++ b/examples/lexical-graph-local-dev/docker/start-containers.ps1 @@ -4,68 +4,64 @@ param( [switch]$Reset ) -# Default to standard docker-compose file -$COMPOSE_FILE = "docker-compose.yml" -$DEV_MODE = $false -$RESET_MODE = $false +$ComposeFile = "docker-compose.yml" -# Check for flags if ($Mac) { - $COMPOSE_FILE = "docker-compose.arm.yml" + $ComposeFile = "docker-compose.arm.yml" Write-Host "Using ARM/Mac-specific configuration" } if ($Dev) { - $DEV_MODE = $true - Write-Host "Enabling development mode with hot-code-injection" + $ComposeFile = "docker-compose-dev.yml" + Write-Host "Development mode: Using docker-compose-dev.yml with hot-code-injection" } if ($Reset) { - $RESET_MODE = $true - Write-Host "Reset mode enabled - will rebuild containers and reset data" -} - -if ($RESET_MODE) { Write-Host "Resetting containers and data..." - docker compose -f $COMPOSE_FILE down -v + docker compose -f $ComposeFile down -v + Remove-Item -Recurse -Force extracted -ErrorAction SilentlyContinue Write-Host "Building and starting containers..." - $BUILD_FLAG = "--build" + $BuildFlag = "--build" } else { Write-Host "Starting containers (preserving data)..." - $BUILD_FLAG = "" -} - -if ($DEV_MODE) { - $env:LEXICAL_GRAPH_DEV_MOUNT = "../../../lexical-graph:/home/jovyan/lexical-graph-src" - Write-Host "Development mode: Mounting lexical-graph source code" + $BuildFlag = "" } -if ($BUILD_FLAG) { - docker compose -f $COMPOSE_FILE up -d --build +if ($BuildFlag) { + docker compose -f $ComposeFile up -d --build } else { - docker compose -f $COMPOSE_FILE up -d + docker compose -f $ComposeFile up -d } Write-Host "" -if ($RESET_MODE) { +if ($Reset) { Write-Host "Reset and startup complete!" } else { Write-Host "Startup complete!" 
} Write-Host "" Write-Host "Services available at:" -Write-Host " Jupyter Lab: http://localhost:8889 (no password required)" -Write-Host " Neo4j Browser: http://localhost:7476 (neo4j/password)" +if ($Dev) { + Write-Host " Jupyter Lab: http://localhost:8890 (no password required)" + Write-Host " Neo4j Browser: http://localhost:7477 (neo4j/password)" +} else { + Write-Host " Jupyter Lab: http://localhost:8889 (no password required)" + Write-Host " Neo4j Browser: http://localhost:7476 (neo4j/password)" +} Write-Host "" Write-Host "IMPORTANT: All notebook execution must happen in Jupyter Lab." -Write-Host " Open http://localhost:8889 to access the development environment." +if ($Dev) { + Write-Host " Open http://localhost:8890 to access the development environment." +} else { + Write-Host " Open http://localhost:8889 to access the development environment." +} Write-Host " Navigate to the 'work' folder to find the notebooks." -if ($DEV_MODE) { +if ($Dev) { Write-Host "" Write-Host "Development mode enabled - lexical-graph source code mounted for hot-code-injection" Write-Host " Changes to lexical-graph source will be reflected immediately in notebooks" } -if (-not $RESET_MODE) { +if (-not $Reset) { Write-Host "" Write-Host "Data preserved from previous runs. Use -Reset to start fresh." -} \ No newline at end of file +} diff --git a/examples/lexical-graph-local-dev/docker/start-containers.sh b/examples/lexical-graph-local-dev/docker/start-containers.sh index 899883c9..0311e50a 100755 --- a/examples/lexical-graph-local-dev/docker/start-containers.sh +++ b/examples/lexical-graph-local-dev/docker/start-containers.sh @@ -21,9 +21,15 @@ for arg in "$@"; do esac done +if [ "$DEV_MODE" = true ]; then + COMPOSE_FILE="docker-compose-dev.yml" + echo "Development mode: Using docker-compose-dev.yml with hot-code-injection" +fi + if [ "$RESET_MODE" = true ]; then echo "Resetting containers and data..." docker compose -f $COMPOSE_FILE down -v + rm -rf extracted echo "Building and starting containers..." BUILD_FLAG="--build" else @@ -31,11 +37,6 @@ else BUILD_FLAG="" fi -if [ "$DEV_MODE" = true ]; then - export LEXICAL_GRAPH_DEV_MOUNT="../../../lexical-graph:/home/jovyan/lexical-graph-src" - echo "Development mode: Mounting lexical-graph source code" -fi - docker compose -f $COMPOSE_FILE up -d $BUILD_FLAG echo "" @@ -46,11 +47,20 @@ else fi echo "" echo "Services available at:" -echo " Jupyter Lab: http://localhost:8889 (no password required)" -echo " Neo4j Browser: http://localhost:7476 (neo4j/password)" +if [ "$DEV_MODE" = true ]; then + echo " Jupyter Lab: http://localhost:8890 (no password required)" + echo " Neo4j Browser: http://localhost:7477 (neo4j/password)" +else + echo " Jupyter Lab: http://localhost:8889 (no password required)" + echo " Neo4j Browser: http://localhost:7476 (neo4j/password)" +fi echo "" echo "IMPORTANT: All notebook execution must happen in Jupyter Lab." -echo " Open http://localhost:8889 to access the development environment." +if [ "$DEV_MODE" = true ]; then + echo " Open http://localhost:8890 to access the development environment." +else + echo " Open http://localhost:8889 to access the development environment." +fi echo " Navigate to the 'work' folder to find the notebooks." if [ "$DEV_MODE" = true ]; then echo "" @@ -60,4 +70,4 @@ fi if [ "$RESET_MODE" = false ]; then echo "" echo "Data preserved from previous runs. Use --reset to start fresh." 
-fi \ No newline at end of file +fi diff --git a/examples/lexical-graph-local-dev/docker/start.sh b/examples/lexical-graph-local-dev/docker/start.sh deleted file mode 100755 index 26467d21..00000000 --- a/examples/lexical-graph-local-dev/docker/start.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo "Building and starting containers..." -docker compose up -d --build -echo "Build and startup complete." -echo "" -echo "Jupyter Lab is available at: http://localhost:8889" -echo "Waiting for Jupyter to start..." -sleep 5 -echo "Jupyter token:" -docker logs lg-jupyter 2>&1 | grep -E "(token=|127.0.0.1:8888)" | tail -1 \ No newline at end of file diff --git a/examples/lexical-graph-local-dev/docs/docker_services.md b/examples/lexical-graph-local-dev/docs/docker_services.md index edfc4a3f..80966e2c 100644 --- a/examples/lexical-graph-local-dev/docs/docker_services.md +++ b/examples/lexical-graph-local-dev/docs/docker_services.md @@ -6,21 +6,21 @@ This document describes the services defined in the `docker-compose.yml` file us ## Services -### 1. `neo4j` +### 1. `neo4j-local` - **Image**: `neo4j:5.25-community` - **Description**: Neo4j graph database for storing the lexical graph structure - **Ports**: - `7476:7474`: Neo4j Browser web interface - - `7687:7687`: Bolt protocol for database connections + - `7689:7687`: Bolt protocol for database connections - **Environment Variables**: - `NEO4J_AUTH`: Authentication (neo4j/password) - `NEO4J_PLUGINS`: APOC plugin enabled for advanced procedures - **Volumes**: - - `neo4j_data`: Persists graph database - - `neo4j_logs`: Neo4j log files -- **Network**: Connected to `lg_graphrag_network` + - `neo4j_local_data`: Persists graph database + - `neo4j_local_logs`: Neo4j log files +- **Network**: Connected to `graphrag_local_network` -### 2. `jupyter-notebook` +### 2. `jupyter-local` - **Build**: Custom Jupyter image with GraphRAG dependencies - **Description**: Jupyter Lab environment for interactive development - **Ports**: @@ -29,12 +29,11 @@ This document describes the services defined in the `docker-compose.yml` file us - `JUPYTER_ENABLE_LAB`: Enables Jupyter Lab interface - **Volumes**: - `../notebooks:/home/jovyan/work`: Notebook files - - `../../../lexical-graph:/home/jovyan/lexical-graph-src`: Source code (dev mode only) - - `jupyter_data`: Jupyter user data and configurations -- **Network**: Connected to `lg_graphrag_network` -- **Depends On**: `postgres`, `neo4j` + - `~/.aws:/home/jovyan/.aws`: AWS credentials +- **Network**: Connected to `graphrag_local_network` +- **Depends On**: `pgvector-local`, `neo4j-local` -### 3. `postgres` +### 3. `pgvector-local` - **Image**: `pgvector/pgvector:0.6.2-pg16` - **Description**: PostgreSQL 16 with pgvector extension for vector embeddings - **Ports**: @@ -44,21 +43,26 @@ This document describes the services defined in the `docker-compose.yml` file us - `POSTGRES_PASSWORD`: Database password (from .env) - `POSTGRES_DB`: Database name (from .env) - **Volumes**: - - `pgvector_data`: Data persistence + - `pgvector_local_data`: Data persistence - `./postgres/schema.sql`: Database initialization script -- **Network**: Connected to `lg_graphrag_network` +- **Network**: Connected to `graphrag_local_network` --- ## Development Mode Services -When using `--dev` flag, additional volume mounts are enabled: +The `docker-compose-dev.yml` provides a development variant with hot-code-injection support. 
Key differences from standard mode: -### Enhanced Jupyter Service (Dev Mode) -- **Additional Volume**: `../../../lexical-graph:/home/jovyan/lexical-graph-src` -- **Hot-Code-Injection**: Changes to lexical-graph source reflected immediately -- **Editable Installation**: Package installed in development mode -- **Auto-Reload**: Jupyter notebooks automatically reload modules +| Aspect | Standard (`docker-compose.yml`) | Dev (`docker-compose-dev.yml`) | +|--------|--------------------------------|-------------------------------| +| Neo4j ports | 7476, 7689 | 7477, 7690 | +| Jupyter port | 8889 | 8890 | +| PostgreSQL port | 5432 | 5434 | +| Jupyter Dockerfile | `jupyter/Dockerfile` (full) | `jupyter/Dockerfile.dev` (minimal) | +| Notebook mount | `/home/jovyan/work` | `/home/jovyan/notebooks` | +| Source mounts | None | lexical-graph, lexical-graph-contrib | + +Start dev mode with: `./start-containers.sh --dev` --- @@ -77,47 +81,6 @@ CREATE EXTENSION IF NOT EXISTS pg_trgm SCHEMA public; CREATE SCHEMA IF NOT EXISTS graphrag; ``` -These extensions are required for: -- **pgvector**: Vector similarity search and embeddings storage -- **pg_trgm**: Trigram-based fuzzy text matching -- **graphrag schema**: Organized storage for GraphRAG-specific data - ---- - -## Networks - -- **lg_graphrag_network**: Dedicated Docker bridge network for service communication - ---- - -## Volumes - -- **neo4j_data**: Persists Neo4j graph database and configurations -- **neo4j_logs**: Neo4j application and query logs -- **pgvector_data**: PostgreSQL data including vector embeddings and indexes -- **jupyter_data**: Jupyter user data, notebooks, and configurations - ---- - -## Environment Variables - -Services use environment variables from `docker/.env`: - -```bash -# Database Configuration -POSTGRES_USER=graphrag -POSTGRES_PASSWORD=graphragpass -POSTGRES_DB=graphrag_db - -# Neo4j Configuration -NEO4J_USER=neo4j -NEO4J_PASSWORD=password - -# Connection Strings (for notebooks) -VECTOR_STORE="postgresql://graphrag:graphragpass@postgres:5432/graphrag_db" -GRAPH_STORE="bolt://neo4j:password@neo4j:7687" -``` - --- ## Service Communication @@ -126,20 +89,14 @@ Services communicate using Docker internal networking: | From Service | To Service | Connection String | |--------------|------------|-------------------| -| Jupyter | Neo4j | `bolt://neo4j:password@neo4j:7687` | -| Jupyter | PostgreSQL | `postgresql://graphrag:graphragpass@postgres:5432/graphrag_db` | +| Jupyter | Neo4j | `bolt://neo4j:password@neo4j-local:7687` | +| Jupyter | PostgreSQL | `postgresql://postgres:password@pgvector-local:5432/graphrag` | --- ## Data Persistence -All services use Docker volumes for data persistence: - -- **Database data** survives container restarts -- **Jupyter configurations** persist between sessions -- **Neo4j graph data** maintained across deployments - -To reset all data, use: +All services use Docker volumes for data persistence. To reset all data: ```bash ./start-containers.sh --reset ``` @@ -154,6 +111,4 @@ After startup, services are available at: |---------|-----|-------------|---------| | **Jupyter Lab** | http://localhost:8889 | None required | Interactive development | | **Neo4j Browser** | http://localhost:7476 | neo4j/password | Graph database management | -| **PostgreSQL** | localhost:5432 | graphrag/graphragpass | Vector database (internal) | - -All development happens in Jupyter Lab, which provides pre-configured access to both databases. 
\ No newline at end of file +| **PostgreSQL** | localhost:5432 | postgres/password | Vector database | diff --git a/examples/lexical-graph-local-dev/docs/troubleshooting.md b/examples/lexical-graph-local-dev/docs/troubleshooting.md index e41627d8..96fc9058 100644 --- a/examples/lexical-graph-local-dev/docs/troubleshooting.md +++ b/examples/lexical-graph-local-dev/docs/troubleshooting.md @@ -84,7 +84,7 @@ sudo chown -R $USER:$USER notebooks/ **Issue: Development mode not detected** ```python -dev_mode = os.path.exists('/home/jovyan/lexical-graph-src') +dev_mode = os.path.exists('/home/jovyan/lexical-graph') print(dev_mode) # False ``` @@ -116,7 +116,7 @@ ls -la ../../../lexical-graph # Should exist # 3. Verify editable installation import graphrag_toolkit print(graphrag_toolkit.__file__) -# Should show: /home/jovyan/lexical-graph-src/... +# Should show: /home/jovyan/lexical-graph/... ``` ### Installation Issues @@ -129,11 +129,11 @@ ERROR: Could not install packages due to an EnvironmentError **Solution:** ```python # In Jupyter, try manual installation -!pip install -e /home/jovyan/lexical-graph-src --user +!pip install -e /home/jovyan/lexical-graph --user # Or reinstall from scratch !pip uninstall graphrag-lexical-graph -y -!pip install -e /home/jovyan/lexical-graph-src +!pip install -e /home/jovyan/lexical-graph # Restart kernel after installation ``` @@ -154,11 +154,11 @@ ServiceUnavailable: Failed to establish connection # Check connection string in notebook import os print(os.environ.get('GRAPH_STORE')) -# Should be: bolt://neo4j:password@neo4j:7687 +# Should be: bolt://neo4j:password@neo4j-local:7687 # Test connection from neo4j import GraphDatabase -driver = GraphDatabase.driver("bolt://neo4j:7687", auth=("neo4j", "password")) +driver = GraphDatabase.driver("bolt://neo4j-local:7687", auth=("neo4j", "password")) with driver.session() as session: result = session.run("RETURN 1") print(result.single()[0]) # Should print: 1 @@ -175,10 +175,10 @@ http://localhost:7476 not loading docker ps | grep neo4j # Check logs -docker logs neo4j +docker logs neo4j-local # Restart if needed -docker restart neo4j +docker restart neo4j-local ``` ### PostgreSQL Connection Problems @@ -193,15 +193,15 @@ psycopg2.OperationalError: could not connect to server # Check connection string import os print(os.environ.get('VECTOR_STORE')) -# Should be: postgresql://graphrag:graphragpass@postgres:5432/graphrag_db +# Should be: postgresql://postgres:password@pgvector-local:5432/graphrag # Test connection import psycopg2 conn = psycopg2.connect( - host="postgres", - database="graphrag_db", - user="graphrag", - password="graphragpass" + host="pgvector-local", + database="graphrag", + user="postgres", + password="password" ) print("PostgreSQL connection successful") conn.close() @@ -332,13 +332,13 @@ http://localhost:8889 loads but shows nothing **Solution:** ```bash # Check Jupyter logs -docker logs jupyter-notebook +docker logs jupyter-local # Try different browser or incognito mode # Clear browser cache # Restart Jupyter container -docker restart jupyter-notebook +docker restart jupyter-local ``` ### Kernel Issues @@ -348,13 +348,13 @@ docker restart jupyter-notebook **Solution:** ```bash # Check Jupyter container resources -docker stats jupyter-notebook +docker stats jupyter-local # Restart with more memory (if needed) # Edit docker-compose.yml to add memory limits # Clear Jupyter cache -docker exec -it jupyter-notebook rm -rf /home/jovyan/.jupyter/runtime/* +docker exec -it jupyter-local rm -rf 
/home/jovyan/.jupyter/runtime/* ``` --- @@ -398,10 +398,10 @@ Connection refused when connecting between containers **Solution:** ```bash # Check network status -docker network ls | grep lg_graphrag +docker network ls | grep graphrag_local # Inspect network -docker network inspect lg_graphrag_network +docker network inspect graphrag_local_network # Ensure all containers are on same network docker ps --format "table {{.Names}}\t{{.Networks}}" @@ -467,12 +467,12 @@ Reset only specific components: ```bash # Reset only databases (keep Jupyter) -docker stop neo4j postgres -docker rm neo4j postgres -docker volume rm neo4j_data pgvector_data +docker stop neo4j-local pgvector-local +docker rm neo4j-local pgvector-local +docker volume rm neo4j_local_data pgvector_local_data # Restart databases -docker-compose up -d neo4j postgres +docker-compose up -d neo4j-local pgvector-local ``` --- @@ -485,9 +485,9 @@ When reporting issues, collect relevant logs: ```bash # Container logs -docker logs neo4j > neo4j.log -docker logs postgres > postgres.log -docker logs jupyter-notebook > jupyter.log +docker logs neo4j-local > neo4j.log +docker logs pgvector-local > postgres.log +docker logs jupyter-local > jupyter.log # System information docker version > system_info.txt diff --git a/examples/lexical-graph-local-dev/notebooks/.env.template b/examples/lexical-graph-local-dev/notebooks/.env.template index 11ff462b..f83478a1 100644 --- a/examples/lexical-graph-local-dev/notebooks/.env.template +++ b/examples/lexical-graph-local-dev/notebooks/.env.template @@ -2,14 +2,14 @@ # Based on docker-compose.yml services # AWS Configuration -AWS_PROFILE=default +# AWS_PROFILE=default AWS_REGION=us-east-1 # Graph Database -GRAPH_STORE=bolt://neo4j:password@neo4j:7687 +GRAPH_STORE=bolt://neo4j:password@neo4j-local:7687 # Vector Database (PostgreSQL) -VECTOR_STORE=postgresql://postgres:password@postgres:5432/graphrag +VECTOR_STORE=postgresql://postgres:password@pgvector-local:5432/graphrag POSTGRES_USER=postgres POSTGRES_PASSWORD=password POSTGRES_DB=graphrag @@ -37,13 +37,20 @@ ENABLE_CACHE=False # Include domain labels in entity identifiers INCLUDE_DOMAIN_LABELS=False -# S3 Buckets -LOCAL_EXTRACT_S3=ccms-lexical-graph -PROMPT_S3=ccms-lexical-graph +# S3 Storage (optional — set your bucket name if using S3) +S3_BUCKET_NAME= +PROMPT_PREFIX=prompts + +# Bedrock Managed Prompts (optional — set ARNs from create_custom_prompt.sh output) +# SYSTEM_PROMPT_ARN= +# USER_PROMPT_ARN= # GitLab Registry Credentials -GITLAB_PYPI_TOKEN="your-gitlab-token-here" -GITLAB_USERNAME="your-gitlab-username" +GITLAB_PYPI_TOKEN=your-gitlab-token-here +GITLAB_USERNAME=your-gitlab-username + +# Suppress Neo4j warnings +NEO4J_LOG_LEVEL=ERROR # Tenant Configuration TENANT_ID=default From 5f603fa66e427dda0eda4408609295cad0789276 Mon Sep 17 00:00:00 2001 From: Mykola Pereyma Date: Fri, 17 Apr 2026 11:32:28 -0700 Subject: [PATCH 02/10] Fix notebooks and add AWS setup scripts Notebooks: - Rename '03-Querying with prompting' to '03-Querying-with-Prompting' - Replace hardcoded aws_profile='padmin', bucket='ccms-prompts', aws_region='ap-south-1' with env vars matching hybrid-dev pattern - Use BedrockPromptProviderConfig() with env vars (no hardcoded ARNs) - Replace hardcoded S3 bucket in 04-Advanced-Configuration-Examples with f-string using os.environ.get('S3_BUCKET_NAME') AWS scripts (copied from hybrid-dev): - setup-bedrock-batch.sh/ps1: S3 bucket, DynamoDB, IAM roles - create_custom_prompt.sh/ps1: Bedrock managed prompts - create_prompt_role.sh/ps1: IAM role 
for Bedrock
- Prompt JSON files and IAM policy
---
 .../aws/bedrock-prompt-policy.json            |  14 +
 .../aws/create_custom_prompt.ps1              |  33 ++
 .../aws/create_custom_prompt.sh               |  30 ++
 .../aws/create_prompt_role.ps1                |  67 ++++
 .../aws/create_prompt_role.sh                 |  68 ++++
 .../aws/setup-bedrock-batch-doc.md            | 134 ++++++++
 .../aws/setup-bedrock-batch.ps1               | 246 ++++++++++++++
 .../aws/setup-bedrock-batch.sh                | 312 ++++++++++++++++++
 .../aws/system_prompt.json                    |  33 ++
 .../aws/user_prompt.json                      |  33 ++
 ...ipynb => 03-Querying-with-Prompting.ipynb} |  21 +-
 .../04-Advanced-Configuration-Examples.ipynb  |   8 +-
 12 files changed, 983 insertions(+), 16 deletions(-)
 create mode 100644 examples/lexical-graph-local-dev/aws/bedrock-prompt-policy.json
 create mode 100644 examples/lexical-graph-local-dev/aws/create_custom_prompt.ps1
 create mode 100755 examples/lexical-graph-local-dev/aws/create_custom_prompt.sh
 create mode 100644 examples/lexical-graph-local-dev/aws/create_prompt_role.ps1
 create mode 100755 examples/lexical-graph-local-dev/aws/create_prompt_role.sh
 create mode 100644 examples/lexical-graph-local-dev/aws/setup-bedrock-batch-doc.md
 create mode 100644 examples/lexical-graph-local-dev/aws/setup-bedrock-batch.ps1
 create mode 100755 examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh
 create mode 100644 examples/lexical-graph-local-dev/aws/system_prompt.json
 create mode 100644 examples/lexical-graph-local-dev/aws/user_prompt.json
 rename examples/lexical-graph-local-dev/notebooks/{03-Querying with prompting.ipynb => 03-Querying-with-Prompting.ipynb} (92%)

diff --git a/examples/lexical-graph-local-dev/aws/bedrock-prompt-policy.json b/examples/lexical-graph-local-dev/aws/bedrock-prompt-policy.json
new file mode 100644
index 00000000..29bd2dbd
--- /dev/null
+++ b/examples/lexical-graph-local-dev/aws/bedrock-prompt-policy.json
@@ -0,0 +1,14 @@
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": [
+        "bedrock:CreatePrompt",
+        "bedrock:GetPrompt",
+        "bedrock:ListPrompts"
+      ],
+      "Resource": "*"
+    }
+  ]
+}
diff --git a/examples/lexical-graph-local-dev/aws/create_custom_prompt.ps1 b/examples/lexical-graph-local-dev/aws/create_custom_prompt.ps1
new file mode 100644
index 00000000..f2b5d97d
--- /dev/null
+++ b/examples/lexical-graph-local-dev/aws/create_custom_prompt.ps1
@@ -0,0 +1,33 @@
+# Usage:
+# .\create_custom_prompt.ps1 <prompt_json> <region> [aws_profile]
+
+param(
+    [Parameter(Mandatory = $true)]
+    [string]$PromptJson,
+
+    [Parameter(Mandatory = $true)]
+    [string]$Region,
+
+    [string]$AwsProfile
+)
+
+if (-not (Test-Path $PromptJson)) {
+    Write-Host "Error: JSON file '$PromptJson' not found."
+    exit 1
+}
+
+Write-Host "Creating prompt from JSON file: $PromptJson"
+
+$cmd = @(
+    "aws", "bedrock-agent", "create-prompt",
+    "--region", $Region,
+    "--cli-input-json", "file://$PromptJson"
+)
+
+if ($AwsProfile) {
+    $cmd += @("--profile", $AwsProfile)
+}
+
+& $cmd[0] $cmd[1..($cmd.Count - 1)]  # run the assembled aws command ('& $cmd' on an array is not a valid invocation)
+
+Write-Host "Prompt created successfully."
diff --git a/examples/lexical-graph-local-dev/aws/create_custom_prompt.sh b/examples/lexical-graph-local-dev/aws/create_custom_prompt.sh
new file mode 100755
index 00000000..34f04184
--- /dev/null
+++ b/examples/lexical-graph-local-dev/aws/create_custom_prompt.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Usage:
+# ./create_custom_prompt.sh <prompt_json> <region> [aws_profile]
+
+set -e
+
+PROMPT_JSON="$1"
+REGION="$2"
+AWS_PROFILE="$3"
+
+if [[ -z "$PROMPT_JSON" || -z "$REGION" ]]; then
+    echo "Usage: $0 <prompt_json> <region> [aws_profile]"
+    exit 1
+fi
+
+if [[ ! -f "$PROMPT_JSON" ]]; then
+    echo "Error: JSON file '$PROMPT_JSON' not found."
+ exit 1 +fi + +# Build AWS CLI command +CMD=(aws bedrock-agent create-prompt --region "$REGION" --cli-input-json file://"$PROMPT_JSON") +if [[ -n "$AWS_PROFILE" ]]; then + CMD+=(--profile "$AWS_PROFILE") +fi + +echo "Creating prompt from JSON file: $PROMPT_JSON" +"${CMD[@]}" +echo "Prompt created successfully." diff --git a/examples/lexical-graph-local-dev/aws/create_prompt_role.ps1 b/examples/lexical-graph-local-dev/aws/create_prompt_role.ps1 new file mode 100644 index 00000000..5ac2c3a4 --- /dev/null +++ b/examples/lexical-graph-local-dev/aws/create_prompt_role.ps1 @@ -0,0 +1,67 @@ +# Usage: +# .\create_prompt_role.ps1 -RoleName "my-bedrock-prompt-role" -Profile "my-aws-profile" + +param ( + [Parameter(Mandatory = $true)] + [string]$RoleName, + + [string]$Profile +) + +if (-not $RoleName) { + Write-Host "Error: --role-name is required" + exit 1 +} + +$profileArgs = @() +if ($Profile) { + $profileArgs = @("--profile", $Profile) +} + +# Define the trust policy +$trustPolicy = @" +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "bedrock.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +"@ + +# Write to temporary trust policy file +$tempTrustPolicyFile = "trust-policy-temp.json" +$trustPolicy | Set-Content -Encoding UTF8 $tempTrustPolicyFile + +# Create the IAM role +Write-Host "Creating IAM role '$RoleName' for Bedrock..." +aws iam create-role ` + --role-name $RoleName ` + --assume-role-policy-document file://$tempTrustPolicyFile ` + @profileArgs + +# Attach inline policy (assumes bedrock-prompt-policy.json is in same directory) +Write-Host "Attaching inline policy (BedrockPromptMinimalPolicy)..." +aws iam put-role-policy ` + --role-name $RoleName ` + --policy-name "BedrockPromptMinimalPolicy" ` + --policy-document file://bedrock-prompt-policy.json ` + @profileArgs + +# Get the role ARN +$roleArn = aws iam get-role ` + --role-name $RoleName ` + --query "Role.Arn" ` + --output text ` + @profileArgs + +Write-Host "`nDone. Role ARN:" +Write-Host $roleArn + +# Cleanup +Remove-Item $tem diff --git a/examples/lexical-graph-local-dev/aws/create_prompt_role.sh b/examples/lexical-graph-local-dev/aws/create_prompt_role.sh new file mode 100755 index 00000000..7ea59065 --- /dev/null +++ b/examples/lexical-graph-local-dev/aws/create_prompt_role.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Usage: +# ./create_prompt_role.sh --role-name my-bedrock-prompt-role --profile my-aws-profile + +set -e + +# Default values +ROLE_NAME="" +PROFILE_OPTION="" + +# Parse arguments +while [[ "$#" -gt 0 ]]; do + case $1 in + --role-name) + ROLE_NAME="$2" + shift + ;; + --profile) + PROFILE_OPTION="--profile $2" + shift + ;; + *) + echo "Unknown parameter passed: $1" + exit 1 + ;; + esac + shift +done + +if [[ -z "$ROLE_NAME" ]]; then + echo "Error: --role-name is required" + exit 1 +fi + +TRUST_POLICY=$(cat <` for a specific AWS credentials profile, or set the `AWS_PROFILE` environment variable. + +2. **Retrieves AWS Account and Region Info** + Using the AWS profile, the script resolves: + - `ACCOUNT_ID` + - `REGION` + - (Optional) Current SSO role being used + +3. **Creates an S3 Bucket** + Creates a bucket named `graphrag-toolkit-` for uploading input/output files used in batch jobs. + +4. **Creates an IAM Role for Bedrock (Execution Role)** + - Name: `bedrock-batch-inference-role` + - Trusts the `bedrock.amazonaws.com` service + - Permissions: + Allows access to the newly created S3 bucket. + +5. 
**Creates an IAM Identity Policy**
+   - Name: `bedrock-batch-identity-policy`
+   - Grants permission to:
+     - Create, List, Get, and Stop Bedrock model invocation jobs
+     - Pass the execution role to Bedrock
+
+6. **Attaches Policies to Role/User**
+   - Attaches the role permissions to the `bedrock-batch-inference-role`
+   - Prints instructions to attach the identity policy manually depending on credential type
+
+7. **Cleanup**
+   Temporary policy files are deleted from the local directory.
+
+---
+
+## Output Resources
+
+| Resource | Description |
+|---------|-------------|
+| S3 Bucket | `graphrag-toolkit-<account-id>` |
+| IAM Role | `bedrock-batch-inference-role` |
+| IAM Role Policy | Grants S3 access for batch inference |
+| IAM Identity Policy | Grants permission to submit and manage Bedrock batch jobs |
+
+---
+
+## Usage
+
+```bash
+bash setup-bedrock-batch.sh [your-profile]
+```
+
+If no profile is specified, the AWS CLI uses its default credential chain (environment variables, instance profile, etc.).
+
+---
+
+## Manual IAM Setup Required (SSO Users)
+
+If you're using AWS SSO, the script will print:
+```
+NOTE: You are using AWS SSO with role: <your-sso-role>
+To complete setup, you need to:
+1. Go to AWS IAM Identity Center
+2. Find your Permission Set
+3. Add the identity policy (arn:aws:iam::<account-id>:policy/bedrock-batch-identity-policy) to your Permission Set
+```
+
+If you're using static credentials, you must manually attach the identity policy to the user/role.
+
+---
+
+## Related Policies
+
+### Trust Policy (Role)
+```json
+{
+  "Principal": {
+    "Service": "bedrock.amazonaws.com"
+  },
+  "Condition": {
+    "StringEquals": {
+      "aws:SourceAccount": "<account-id>"
+    },
+    "ArnEquals": {
+      "aws:SourceArn": "arn:aws:bedrock:<region>:<account-id>:model-invocation-job/*"
+    }
+  }
+}
+```
+
+### Role Policy (S3 Access)
+```json
+{
+  "Action": ["s3:GetObject", "s3:ListBucket", "s3:PutObject"],
+  "Resource": [
+    "arn:aws:s3:::graphrag-toolkit-<account-id>",
+    "arn:aws:s3:::graphrag-toolkit-<account-id>/*"
+  ]
+}
+```
+
+### Identity Policy (Bedrock Access)
+```json
+{
+  "Action": [
+    "bedrock:CreateModelInvocationJob",
+    "bedrock:GetModelInvocationJob",
+    "bedrock:ListModelInvocationJobs",
+    "bedrock:StopModelInvocationJob",
+    "iam:PassRole"
+  ]
+}
+```
+
+---
+
+## Prerequisites
+
+- AWS CLI installed
+- AWS credentials configured for the profile (via SSO or `aws configure`)
+- Sufficient permissions to:
+  - Create IAM roles and policies
+  - Create S3 buckets
diff --git a/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.ps1 b/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.ps1
new file mode 100644
index 00000000..7560f58f
--- /dev/null
+++ b/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.ps1
@@ -0,0 +1,246 @@
+# Usage: .\setup-bedrock-batch.ps1 [-Profile <profile-name>]
+param(
+    [string]$Profile = ""
+)
+
+# Build conditional profile args for splatting
+$ProfileArgs = @()
+if ($Profile) {
+    $ProfileArgs = @("--profile", $Profile)
+}
+
+function Check-AwsCredentials {
+    if (-not (aws sts get-caller-identity @ProfileArgs 2>$null)) {
+        Write-Host "Error: No valid AWS credentials found"
+        if ($Profile) {
+            Write-Host "If using AWS SSO, run: aws sso login --profile $Profile"
+            Write-Host "If using traditional credentials, run: aws configure --profile $Profile"
+        } else {
+            Write-Host "If using AWS SSO, run: aws sso login"
+            Write-Host "If using traditional credentials, run: aws configure"
+        }
+        exit 1
+    }
+}
+
+function Get-AccountDetails {
+    $global:AccountId = aws sts get-caller-identity @ProfileArgs --query Account --output text
+    if (-not 
$AccountId) { + Write-Host "Error: Could not determine AWS Account ID" + exit 1 + } + + $global:Region = aws configure get region @ProfileArgs + if (-not $Region) { + Write-Host "Error: Could not determine AWS Region" + exit 1 + } + + $global:CurrentRole = aws sts get-caller-identity @ProfileArgs --query Arn --output text | Select-String -Pattern 'AWSReservedSSO_[^/]+' | ForEach-Object { $_.Matches.Value } +} + +Check-AwsCredentials +Get-AccountDetails + +$ApplicationId = "graphrag-toolkit" +$BucketName = "graphrag-toolkit-$AccountId" +$RoleName = "bedrock-batch-inference-role" +$PolicyName = "bedrock-batch-inference-policy" +$ModelId = "anthropic.claude-v2" +$TableName = "graphrag-toolkit-batch-table" + +# Create S3 bucket +Write-Host "Creating S3 bucket $BucketName..." +if (-not (aws s3api head-bucket --bucket $BucketName @ProfileArgs -ErrorAction SilentlyContinue)) { + if ($Region -eq "us-east-1") { + aws s3api create-bucket --bucket $BucketName --region $Region @ProfileArgs + } else { + aws s3api create-bucket --bucket $BucketName --region $Region --create-bucket-configuration LocationConstraint=$Region @ProfileArgs + } + Write-Host "Bucket created successfully" +} else { + Write-Host "Bucket $BucketName already exists" +} + +# Create DynamoDB table +Write-Host "Creating DynamoDB table $TableName..." +if (-not (aws dynamodb describe-table --table-name $TableName @ProfileArgs -ErrorAction SilentlyContinue)) { + aws dynamodb create-table ` + --table-name $TableName ` + --attribute-definitions ` + AttributeName=collection_id,AttributeType=S ` + AttributeName=completion_date,AttributeType=S ` + AttributeName=reader_type,AttributeType=S ` + --key-schema ` + AttributeName=collection_id,KeyType=HASH ` + AttributeName=completion_date,KeyType=RANGE ` + --billing-mode PAY_PER_REQUEST ` + --global-secondary-indexes "[{`"IndexName`": `"reader_type-index`", `"KeySchema`": [{`"AttributeName`": `"reader_type`", `"KeyType`": `"HASH`"}, {`"AttributeName`": `"completion_date`", `"KeyType`": `"RANGE`"}], `"Projection`": {`"ProjectionType`": `"ALL`"}}]" ` + --region $Region ` + @ProfileArgs + + Write-Host "Waiting for DynamoDB table to become active..." 
+ aws dynamodb wait table-exists --table-name $TableName --region $Region @ProfileArgs + Write-Host "DynamoDB table created successfully" +} else { + Write-Host "DynamoDB table $TableName already exists" +} + +# Write IAM policy JSON files +@" +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "bedrock.amazonaws.com" + }, + "Action": "sts:AssumeRole", + "Condition": { + "StringEquals": { + "aws:SourceAccount": "$AccountId" + }, + "ArnEquals": { + "aws:SourceArn": "arn:aws:bedrock:$Region:$AccountId:model-invocation-job/*" + } + } + } + ] +} +"@ | Set-Content -Encoding UTF8 trust-policy.json + +@" +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["bedrock:InvokeModel"], + "Resource": "arn:aws:bedrock:${Region}::foundation-model/*" + }, + { + "Effect": "Allow", + "Action": ["s3:GetObject", "s3:ListBucket", "s3:PutObject"], + "Resource": [ + "arn:aws:s3:::$BucketName", + "arn:aws:s3:::$BucketName/*" + ], + "Condition": { + "StringEquals": { + "aws:ResourceAccount": ["$AccountId"] + } + } + }, + { + "Effect": "Allow", + "Action": ["dynamodb:PutItem", "dynamodb:Query", "dynamodb:Scan"], + "Resource": "arn:aws:dynamodb:$Region:$AccountId:table/$TableName", + "Condition": { + "StringEquals": { + "aws:ResourceAccount": ["$AccountId"] + } + } + } + ] +} +"@ | Set-Content -Encoding UTF8 role-permissions-policy.json + +@" +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "bedrock:CreateModelInvocationJob", + "bedrock:GetModelInvocationJob", + "bedrock:ListModelInvocationJobs", + "bedrock:StopModelInvocationJob" + ], + "Resource": [ + "arn:aws:bedrock:$Region::foundation-model/$ModelId", + "arn:aws:bedrock:$Region:$AccountId:model-invocation-job/*" + ] + }, + { + "Effect": "Allow", + "Action": ["iam:PassRole"], + "Resource": "arn:aws:iam::$AccountId:role/$RoleName" + }, + { + "Effect": "Allow", + "Action": ["dynamodb:PutItem", "dynamodb:Query", "dynamodb:Scan"], + "Resource": "arn:aws:dynamodb:$Region:$AccountId:table/$TableName" + } + ] +} +"@ | Set-Content -Encoding UTF8 identity-permissions-policy.json + +# Create IAM role and attach policy +Write-Host "Creating IAM role $RoleName..." 
+if (-not (aws iam get-role --role-name $RoleName @ProfileArgs -ErrorAction SilentlyContinue)) {
+    aws iam create-role --role-name $RoleName --assume-role-policy-document file://trust-policy.json @ProfileArgs
+    Write-Host "Role created successfully"
+} else {
+    Write-Host "Role $RoleName already exists"
+}
+
+# ${} delimiters prevent PowerShell from reading ':' as a scope qualifier in the ARN
+$PolicyArn = "arn:aws:iam::${AccountId}:policy/$PolicyName"
+if (-not (aws iam get-policy --policy-arn $PolicyArn @ProfileArgs -ErrorAction SilentlyContinue)) {
+    aws iam create-policy --policy-name $PolicyName --policy-document file://role-permissions-policy.json @ProfileArgs
+    Write-Host "Policy created successfully"
+} else {
+    Write-Host "Policy $PolicyName already exists"
+}
+
+aws iam attach-role-policy --role-name $RoleName --policy-arn $PolicyArn @ProfileArgs
+
+# Create identity policy
+$IdentityPolicyName = "bedrock-batch-identity-policy"
+$IdentityPolicyArn = "arn:aws:iam::${AccountId}:policy/$IdentityPolicyName"
+if (-not (aws iam get-policy --policy-arn $IdentityPolicyArn @ProfileArgs -ErrorAction SilentlyContinue)) {
+    aws iam create-policy --policy-name $IdentityPolicyName --policy-document file://identity-permissions-policy.json @ProfileArgs
+    Write-Host "Identity policy created successfully"
+} else {
+    Write-Host "Identity policy $IdentityPolicyName already exists"
+}
+
+# Clean up temp files
+Remove-Item trust-policy.json, role-permissions-policy.json, identity-permissions-policy.json -Force
+
+# Upload S3 prompt files for S3PromptProvider (used by notebook 04)
+Write-Host "Uploading prompt files to S3..."
+$ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path
+
+python3 -c @"
+import json, sys
+with open(sys.argv[1]) as f:
+    data = json.load(f)
+print(data['variants'][0]['templateConfiguration']['text']['text'], end='')
+"@ "$ScriptDir/system_prompt.json" | aws s3 cp - "s3://$BucketName/prompts/system_prompt.txt" --content-type text/plain --region $Region @ProfileArgs
+
+python3 -c @"
+import json, sys
+with open(sys.argv[1]) as f:
+    data = json.load(f)
+print(data['variants'][0]['templateConfiguration']['text']['text'], end='')
+"@ "$ScriptDir/user_prompt.json" | aws s3 cp - "s3://$BucketName/prompts/user_prompt.txt" --content-type text/plain --region $Region @ProfileArgs
+
+Write-Host "Prompt files uploaded to s3://$BucketName/prompts/"
+
+# Summary
+Write-Host "`nSetup complete!"
+Write-Host "Bucket: $BucketName"
+Write-Host "DynamoDB Table: arn:aws:dynamodb:${Region}:${AccountId}:table/$TableName"
+Write-Host "Role ARN: arn:aws:iam::${AccountId}:role/$RoleName"
+Write-Host "Policy ARN: $PolicyArn"
+Write-Host "Identity Policy ARN: $IdentityPolicyArn"
+
+if ($CurrentRole) {
+    Write-Host "`nNOTE: You are using AWS SSO with role: $CurrentRole"
+    Write-Host "To complete setup, go to IAM Identity Center and attach the identity policy to the Permission Set."
+} else {
+    Write-Host "`nNOTE: You are using traditional IAM credentials."
+    Write-Host "Ensure the identity policy is attached to your IAM user or role." 
+} diff --git a/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh b/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh new file mode 100755 index 00000000..b3c024f5 --- /dev/null +++ b/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh @@ -0,0 +1,312 @@ +#!/bin/bash + +# Script to set up S3 bucket, IAM role, policies, and DynamoDB table for GraphRAG +# Usage: ./setup-graphrag.sh [profile_name] + +PROFILE="${1:-""}" + +# Build conditional profile flag +PROFILE_ARGS="" +if [ -n "$PROFILE" ]; then + PROFILE_ARGS="--profile ${PROFILE}" +fi + +# Check if AWS credentials are available +check_aws_credentials() { + if ! aws sts get-caller-identity ${PROFILE_ARGS} &>/dev/null; then + echo "Error: No valid AWS credentials found" + if [ -n "$PROFILE" ]; then + echo "If using AWS SSO, please run 'aws sso login --profile ${PROFILE}'" + echo "If using traditional credentials, please configure AWS CLI with 'aws configure --profile ${PROFILE}'" + else + echo "If using AWS SSO, please run 'aws sso login'" + echo "If using traditional credentials, please configure AWS CLI with 'aws configure'" + fi + exit 1 + fi +} + +# Get account details safely +get_account_details() { + ACCOUNT_ID=$(aws sts get-caller-identity ${PROFILE_ARGS} --query Account --output text) + if [ -z "$ACCOUNT_ID" ]; then + echo "Error: Could not determine AWS Account ID" + exit 1 + fi + + REGION=$(aws configure get region ${PROFILE_ARGS}) + if [ -z "$REGION" ]; then + echo "Error: Could not determine AWS Region" + exit 1 + fi + + # For SSO users, get the role name they're using + CURRENT_ROLE=$(aws sts get-caller-identity ${PROFILE_ARGS} --query Arn --output text | grep -o 'AWSReservedSSO_[^/]*' || echo "") +} + +# Configuration variables +check_aws_credentials +get_account_details + +APPLICATION_ID="graphrag-toolkit" +BUCKET_NAME="graphrag-toolkit-${ACCOUNT_ID}" # Using account ID to ensure uniqueness +ROLE_NAME="bedrock-batch-inference-role" +POLICY_NAME="bedrock-batch-inference-policy" +MODEL_ID="anthropic.claude-v2" # Example model ID, adjust as needed +TABLE_NAME="graphrag-toolkit-batch-table" + +# Create S3 bucket with error handling +echo "Creating S3 bucket ${BUCKET_NAME}..." +if ! aws s3api head-bucket --bucket "${BUCKET_NAME}" ${PROFILE_ARGS} 2>/dev/null; then + if [[ "${REGION}" == "us-east-1" ]]; then + aws s3api create-bucket \ + --bucket "${BUCKET_NAME}" \ + --region "${REGION}" \ + ${PROFILE_ARGS} || exit 1 + else + aws s3api create-bucket \ + --bucket "${BUCKET_NAME}" \ + --region "${REGION}" \ + --create-bucket-configuration LocationConstraint="${REGION}" \ + ${PROFILE_ARGS} || exit 1 + fi + echo "Bucket created successfully" +else + echo "Bucket ${BUCKET_NAME} already exists" +fi + +# Create DynamoDB table with error handling +echo "Creating DynamoDB table ${TABLE_NAME}..." +if ! 
aws dynamodb describe-table --table-name "${TABLE_NAME}" ${PROFILE_ARGS} &>/dev/null; then + aws dynamodb create-table \ + --table-name "${TABLE_NAME}" \ + --attribute-definitions \ + AttributeName=collection_id,AttributeType=S \ + AttributeName=completion_date,AttributeType=S \ + AttributeName=reader_type,AttributeType=S \ + --key-schema \ + AttributeName=collection_id,KeyType=HASH \ + AttributeName=completion_date,KeyType=RANGE \ + --billing-mode PAY_PER_REQUEST \ + --global-secondary-indexes \ + "[{ + \"IndexName\": \"reader_type-index\", + \"KeySchema\": [ + {\"AttributeName\": \"reader_type\", \"KeyType\": \"HASH\"}, + {\"AttributeName\": \"completion_date\", \"KeyType\": \"RANGE\"} + ], + \"Projection\": {\"ProjectionType\": \"ALL\"} + }]" \ + --region "${REGION}" \ + ${PROFILE_ARGS} || exit 1 + echo "Waiting for DynamoDB table to become active..." + aws dynamodb wait table-exists \ + --table-name "${TABLE_NAME}" \ + --region "${REGION}" \ + ${PROFILE_ARGS} || exit 1 + echo "DynamoDB table created successfully" +else + echo "DynamoDB table ${TABLE_NAME} already exists" +fi + +# Create trust policy for the service role +echo "Creating trust policy..." +cat << EOF > trust-policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "bedrock.amazonaws.com" + }, + "Action": "sts:AssumeRole", + "Condition": { + "StringEquals": { + "aws:SourceAccount": "${ACCOUNT_ID}" + }, + "ArnEquals": { + "aws:SourceArn": "arn:aws:bedrock:${REGION}:${ACCOUNT_ID}:model-invocation-job/*" + } + } + } + ] +} +EOF + +# Create service role permissions policy +echo "Creating service role permissions policy..." +cat << EOF > role-permissions-policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "bedrock:InvokeModel" + ], + "Resource": "arn:aws:bedrock:${REGION}::foundation-model/*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:ListBucket", + "s3:PutObject" + ], + "Resource": [ + "arn:aws:s3:::${BUCKET_NAME}", + "arn:aws:s3:::${BUCKET_NAME}/*" + ], + "Condition": { + "StringEquals": { + "aws:ResourceAccount": ["${ACCOUNT_ID}"] + } + } + }, + { + "Effect": "Allow", + "Action": [ + "dynamodb:PutItem", + "dynamodb:Query", + "dynamodb:Scan" + ], + "Resource": "arn:aws:dynamodb:${REGION}:${ACCOUNT_ID}:table/${TABLE_NAME}", + "Condition": { + "StringEquals": { + "aws:ResourceAccount": ["${ACCOUNT_ID}"] + } + } + } + ] +} +EOF + +# Create IAM identity permissions policy +echo "Creating identity permissions policy..." +cat << EOF > identity-permissions-policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "bedrock:CreateModelInvocationJob", + "bedrock:GetModelInvocationJob", + "bedrock:ListModelInvocationJobs", + "bedrock:StopModelInvocationJob" + ], + "Resource": [ + "arn:aws:bedrock:${REGION}::foundation-model/${MODEL_ID}", + "arn:aws:bedrock:${REGION}:${ACCOUNT_ID}:model-invocation-job/*" + ] + }, + { + "Effect": "Allow", + "Action": [ + "iam:PassRole" + ], + "Resource": "arn:aws:iam::${ACCOUNT_ID}:role/${ROLE_NAME}" + }, + { + "Effect": "Allow", + "Action": [ + "dynamodb:PutItem", + "dynamodb:Query", + "dynamodb:Scan" + ], + "Resource": "arn:aws:dynamodb:${REGION}:${ACCOUNT_ID}:table/${TABLE_NAME}" + } + ] +} +EOF + +# Create the IAM role with error handling +echo "Creating IAM role ${ROLE_NAME}..." +if ! 
aws iam get-role --role-name "${ROLE_NAME}" ${PROFILE_ARGS} &>/dev/null; then + aws iam create-role \ + --role-name "${ROLE_NAME}" \ + --assume-role-policy-document file://trust-policy.json \ + ${PROFILE_ARGS} || exit 1 + echo "Role created successfully" +else + echo "Role ${ROLE_NAME} already exists" +fi + +# Create and attach the service role policy +echo "Creating and attaching service role policy..." +POLICY_ARN="arn:aws:iam::${ACCOUNT_ID}:policy/${POLICY_NAME}" + +if ! aws iam get-policy --policy-arn "${POLICY_ARN}" ${PROFILE_ARGS} &>/dev/null; then + aws iam create-policy \ + --policy-name "${POLICY_NAME}" \ + --policy-document file://role-permissions-policy.json \ + ${PROFILE_ARGS} || exit 1 + echo "Policy created successfully" +else + echo "Policy ${POLICY_NAME} already exists" +fi + +# Attach policy to role +aws iam attach-role-policy \ + --role-name "${ROLE_NAME}" \ + --policy-arn "${POLICY_ARN}" \ + ${PROFILE_ARGS} || exit 1 + +# Create the identity permissions policy +IDENTITY_POLICY_NAME="bedrock-batch-identity-policy" +IDENTITY_POLICY_ARN="arn:aws:iam::${ACCOUNT_ID}:policy/${IDENTITY_POLICY_NAME}" + +if ! aws iam get-policy --policy-arn "${IDENTITY_POLICY_ARN}" ${PROFILE_ARGS} &>/dev/null; then + aws iam create-policy \ + --policy-name "${IDENTITY_POLICY_NAME}" \ + --policy-document file://identity-permissions-policy.json \ + ${PROFILE_ARGS} || exit 1 + echo "Identity policy created successfully" +else + echo "Identity policy ${IDENTITY_POLICY_NAME} already exists" +fi + +# Clean up temporary files +rm -f trust-policy.json role-permissions-policy.json identity-permissions-policy.json + +# Upload S3 prompt files for S3PromptProvider (used by notebook 04) +echo "Uploading prompt files to S3..." +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +# Extract prompt text from JSON and upload as .txt +python3 -c " +import json, sys +with open(sys.argv[1]) as f: + data = json.load(f) +print(data['variants'][0]['templateConfiguration']['text']['text'], end='') +" "${SCRIPT_DIR}/system_prompt.json" | aws s3 cp - "s3://${BUCKET_NAME}/prompts/system_prompt.txt" --content-type text/plain ${PROFILE_ARGS} + +python3 -c " +import json, sys +with open(sys.argv[1]) as f: + data = json.load(f) +print(data['variants'][0]['templateConfiguration']['text']['text'], end='') +" "${SCRIPT_DIR}/user_prompt.json" | aws s3 cp - "s3://${BUCKET_NAME}/prompts/user_prompt.txt" --content-type text/plain ${PROFILE_ARGS} + +echo "Prompt files uploaded to s3://${BUCKET_NAME}/prompts/" + +echo "Setup complete!" +echo "Bucket name: ${BUCKET_NAME}" +echo "DynamoDB Table ARN: arn:aws:dynamodb:${REGION}:${ACCOUNT_ID}:table/${TABLE_NAME}" +echo "Role ARN: arn:aws:iam::${ACCOUNT_ID}:role/${ROLE_NAME}" +echo "Service Role Policy ARN: ${POLICY_ARN}" +echo "Identity Policy ARN: ${IDENTITY_POLICY_ARN}" + +if [ -n "$CURRENT_ROLE" ]; then + echo "" + echo "NOTE: You are using AWS SSO with role: ${CURRENT_ROLE}" + echo "To complete setup, you need to:" + echo "1. Go to AWS IAM Identity Center" + echo "2. Find your Permission Set" + echo "3. 
Add the identity policy (${IDENTITY_POLICY_ARN}) to your Permission Set" +else + echo "" + echo "NOTE: You are using traditional IAM credentials" + echo "Make sure to attach the identity policy to your IAM user or role" +fi \ No newline at end of file diff --git a/examples/lexical-graph-local-dev/aws/system_prompt.json b/examples/lexical-graph-local-dev/aws/system_prompt.json new file mode 100644 index 00000000..2859f98e --- /dev/null +++ b/examples/lexical-graph-local-dev/aws/system_prompt.json @@ -0,0 +1,33 @@ +{ + "name": "system_prompt", + "description": "System prompt for answering user questions based on search results", + "defaultVariant": "default", + "variants": [ + { + "name": "default", + "templateType": "TEXT", + "templateConfiguration": { + "text": { + "text": "You are a question answering agent. I will provide you with a set of search results. The user will provide you with a question. Your job is to answer the user's question using only information from the search results. If the search results are empty, do not attempt to answer the question.\n\n\n{search_results}\n\n\n## Instructions\n - Think carefully about the question, the source and relevancy of each of the search results, and the logical connections between different search results before answering.\n - Ensure you answer each part of the question.\n - Reference information from the search results in your answer by adding the 'source' in square brackets at the end of relevant sentences.\n - Do NOT directly quote the search results in your answer.\n - If the question is a yes/no question, start with either 'Yes' or 'No'.\n - If the search results are empty, do not attempt to answer the question.\n\nBased on the search results, answer the following question as concisely as possible:" + } + }, + "inferenceConfiguration": { + "text": { + "maxTokens": 800, + "temperature": 0.3, + "topP": 0.9 + } + }, + "metadata": [ + { + "key": "creator", + "value": "script" + }, + { + "key": "project", + "value": "GraphRAG" + } + ] + } + ] +} diff --git a/examples/lexical-graph-local-dev/aws/user_prompt.json b/examples/lexical-graph-local-dev/aws/user_prompt.json new file mode 100644 index 00000000..d2464c14 --- /dev/null +++ b/examples/lexical-graph-local-dev/aws/user_prompt.json @@ -0,0 +1,33 @@ +{ + "name": "user_prompt", + "description": "Prompt for user questions", + "defaultVariant": "default", + "variants": [ + { + "name": "default", + "templateType": "TEXT", + "templateConfiguration": { + "text": { + "text": "\n{query}\n" + } + }, + "inferenceConfiguration": { + "text": { + "maxTokens": 500, + "temperature": 0.7, + "topP": 0.9 + } + }, + "metadata": [ + { + "key": "creator", + "value": "script" + }, + { + "key": "project", + "value": "GraphRAG" + } + ] + } + ] +} diff --git a/examples/lexical-graph-local-dev/notebooks/03-Querying with prompting.ipynb b/examples/lexical-graph-local-dev/notebooks/03-Querying-with-Prompting.ipynb similarity index 92% rename from examples/lexical-graph-local-dev/notebooks/03-Querying with prompting.ipynb rename to examples/lexical-graph-local-dev/notebooks/03-Querying-with-Prompting.ipynb index d8ff2164..289254a8 100644 --- a/examples/lexical-graph-local-dev/notebooks/03-Querying with prompting.ipynb +++ b/examples/lexical-graph-local-dev/notebooks/03-Querying-with-Prompting.ipynb @@ -118,6 +118,8 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", + "\n", "from graphrag_toolkit.lexical_graph import LexicalGraphQueryEngine\n", "from graphrag_toolkit.lexical_graph.prompts.s3_prompt_provider 
import S3PromptProvider\n", "from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import S3PromptProviderConfig\n", @@ -125,10 +127,8 @@ "# Setup S3 prompt provider\n", "prompt_provider = S3PromptProvider(\n", " S3PromptProviderConfig(\n", - " bucket=\"ccms-prompts\",\n", - " prefix=\"prompts\",\n", - " aws_region=\"ap-south-1\",\n", - " aws_profile=\"padmin\",\n", + " bucket=os.environ['S3_BUCKET_NAME'],\n", + " prefix=os.environ.get('PROMPT_PREFIX', 'prompts'),\n", " system_prompt_file=\"system_prompt.txt\",\n", " user_prompt_file=\"user_prompt.txt\"\n", " )\n", @@ -163,17 +163,14 @@ "outputs": [], "source": [ "from graphrag_toolkit.lexical_graph import LexicalGraphQueryEngine\n", + "from graphrag_toolkit.lexical_graph.prompts.bedrock_prompt_provider import BedrockPromptProvider\n", "from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import BedrockPromptProviderConfig\n", "\n", "# Setup Bedrock prompt provider\n", - "prompt_provider = BedrockPromptProviderConfig(\n", - " aws_region=\"us-east-1\",\n", - " aws_profile=\"padmin\",\n", - " system_prompt_arn=\"KEOXPXUM00\",\n", - " user_prompt_arn=\"TSF4PI4A6C\",\n", - " system_prompt_version=\"1\",\n", - " user_prompt_version=\"1\"\n", - ").build()\n", + "# Requires SYSTEM_PROMPT_ARN and USER_PROMPT_ARN in .env\n", + "prompt_provider = BedrockPromptProvider(\n", + " BedrockPromptProviderConfig()\n", + ")\n", "\n", "# Create query engine with Bedrock prompts\n", "query_engine = LexicalGraphQueryEngine.for_traversal_based_search(\n", diff --git a/examples/lexical-graph-local-dev/notebooks/04-Advanced-Configuration-Examples.ipynb b/examples/lexical-graph-local-dev/notebooks/04-Advanced-Configuration-Examples.ipynb index f369d640..eba6354f 100644 --- a/examples/lexical-graph-local-dev/notebooks/04-Advanced-Configuration-Examples.ipynb +++ b/examples/lexical-graph-local-dev/notebooks/04-Advanced-Configuration-Examples.ipynb @@ -104,8 +104,8 @@ " 'artifacts/sample.csv',\n", " 'artifacts/sample.md',\n", " # S3 files\n", - " 's3://config-test-bucket-188967239867/artifacts/sample.json',\n", - " 's3://config-test-bucket-188967239867/artifacts/sample.xlsx'\n", + " f's3://{os.environ.get(\"S3_BUCKET_NAME\", \"\")}/artifacts/sample.json',\n", + " f's3://{os.environ.get(\"S3_BUCKET_NAME\", \"\")}/artifacts/sample.xlsx'\n", "]\n", "\n", "all_docs = []\n", @@ -256,8 +256,8 @@ " 'artifacts/sample.csv',\n", " 'artifacts/sample.md',\n", " # S3 files\n", - " 's3://config-test-bucket-188967239867/artifacts/sample.json',\n", - " 's3://config-test-bucket-188967239867/artifacts/sample.xlsx'\n", + " f's3://{os.environ.get(\"S3_BUCKET_NAME\", \"\")}/artifacts/sample.json',\n", + " f's3://{os.environ.get(\"S3_BUCKET_NAME\", \"\")}/artifacts/sample.xlsx'\n", "]\n", "\n", "all_docs = []\n", From c6577ca18a79c9bd126014715dbb2f6b23a73c9a Mon Sep 17 00:00:00 2001 From: Mykola Pereyma Date: Fri, 17 Apr 2026 11:32:37 -0700 Subject: [PATCH 03/10] Fix BedrockPromptProviderConfig and WikipediaReaderProvider bugs BedrockPromptProviderConfig._resolve_prompt_arn(): - aws_region defaults to None, producing invalid ARN 'arn:aws:bedrock:None:...' 
- Fix: fall back to session.region_name when aws_region is not set - Add unit test for region fallback behavior WikipediaReaderProvider._init_reader(): - WikipediaReader imported inside __init__ but used in _init_reader where it was out of scope, causing NameError - Fix: store class as self._reader_cls for lazy initialization - Add 4 unit tests (import error, lazy init, single creation, empty input) --- .../aws/setup-bedrock-batch.sh | 19 ++++- .../docker/start-containers.sh | 3 + .../providers/wikipedia_reader_provider.py | 3 +- .../prompts/prompt_provider_config.py | 3 +- .../test_wikipedia_reader_provider.py | 78 +++++++++++++++++++ .../prompts/test_prompt_provider_config.py | 18 +++++ 6 files changed, 119 insertions(+), 5 deletions(-) create mode 100644 lexical-graph/tests/unit/indexing/load/readers/providers/test_wikipedia_reader_provider.py diff --git a/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh b/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh index b3c024f5..bdca2ba3 100755 --- a/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh +++ b/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh @@ -306,7 +306,20 @@ if [ -n "$CURRENT_ROLE" ]; then echo "2. Find your Permission Set" echo "3. Add the identity policy (${IDENTITY_POLICY_ARN}) to your Permission Set" else - echo "" - echo "NOTE: You are using traditional IAM credentials" - echo "Make sure to attach the identity policy to your IAM user or role" + # Auto-attach identity policy to the caller's IAM role + CALLER_ARN=$(aws sts get-caller-identity ${PROFILE_ARGS} --query Arn --output text) + CALLER_ROLE=$(echo "$CALLER_ARN" | sed 's|.*assumed-role/||;s|.*role/||' | cut -d/ -f1) + if [ -n "$CALLER_ROLE" ]; then + echo "" + echo "Attaching identity policy to your IAM role: ${CALLER_ROLE}..." + aws iam attach-role-policy \ + --role-name "${CALLER_ROLE}" \ + --policy-arn "${IDENTITY_POLICY_ARN}" \ + ${PROFILE_ARGS} && echo "Identity policy attached successfully" \ + || echo "WARNING: Could not attach identity policy. Attach it manually to your IAM role." + else + echo "" + echo "NOTE: You are using traditional IAM credentials" + echo "Make sure to attach the identity policy to your IAM user or role" + fi fi \ No newline at end of file diff --git a/examples/lexical-graph-local-dev/docker/start-containers.sh b/examples/lexical-graph-local-dev/docker/start-containers.sh index 0311e50a..ca37eab6 100755 --- a/examples/lexical-graph-local-dev/docker/start-containers.sh +++ b/examples/lexical-graph-local-dev/docker/start-containers.sh @@ -30,6 +30,9 @@ if [ "$RESET_MODE" = true ]; then echo "Resetting containers and data..." docker compose -f $COMPOSE_FILE down -v rm -rf extracted + if [ "$DEV_MODE" = false ]; then + echo "NOTE: This resets standard mode containers. Use --dev --reset to reset dev containers." + fi echo "Building and starting containers..." 
BUILD_FLAG="--build" else diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/load/readers/providers/wikipedia_reader_provider.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/load/readers/providers/wikipedia_reader_provider.py index 6449e5a4..de4485ba 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/load/readers/providers/wikipedia_reader_provider.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/load/readers/providers/wikipedia_reader_provider.py @@ -26,13 +26,14 @@ def __init__(self, config: WikipediaReaderConfig): self.lang = config.lang self.metadata_fn = config.metadata_fn self._reader = None + self._reader_cls = WikipediaReader logger.debug(f"Initialized WikipediaReaderProvider with lang={config.lang}") def _init_reader(self): """Lazily initialize WikipediaReader if not already created.""" if self._reader is None: - self._reader = WikipediaReader() + self._reader = self._reader_cls() def read(self, input_source: Union[str, List[str]]) -> List[Document]: """Read Wikipedia documents with metadata handling and title correction.""" diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config.py index 5f9f6759..596fd6ae 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config.py @@ -159,8 +159,9 @@ def _resolve_prompt_arn(self, identifier: str) -> str: # Example ARN: arn:aws-us-gov:sts::123456789012:assumed-role/... partition = caller_arn.split(":")[1] account_id = self.sts.get_caller_identity()["Account"] + region = self.aws_region or self.session.region_name - return f"arn:{partition}:bedrock:{self.aws_region}:{account_id}:prompt/{identifier}" + return f"arn:{partition}:bedrock:{region}:{account_id}:prompt/{identifier}" def build(self) -> PromptProvider: from graphrag_toolkit.lexical_graph.prompts.bedrock_prompt_provider import BedrockPromptProvider diff --git a/lexical-graph/tests/unit/indexing/load/readers/providers/test_wikipedia_reader_provider.py b/lexical-graph/tests/unit/indexing/load/readers/providers/test_wikipedia_reader_provider.py new file mode 100644 index 00000000..c6496375 --- /dev/null +++ b/lexical-graph/tests/unit/indexing/load/readers/providers/test_wikipedia_reader_provider.py @@ -0,0 +1,78 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest +from unittest.mock import Mock, patch, MagicMock + + +def test_raises_exception_if_dependencies_not_installed(): + with patch.dict('sys.modules', {'llama_index.readers.wikipedia': None}): + from importlib import reload + import graphrag_toolkit.lexical_graph.indexing.load.readers.providers.wikipedia_reader_provider as mod + reload(mod) + + from graphrag_toolkit.lexical_graph.indexing.load.readers.reader_provider_config import WikipediaReaderConfig + + with pytest.raises(ImportError) as exc_info: + mod.WikipediaReaderProvider(WikipediaReaderConfig()) + + assert "llama-index-readers-wikipedia" in str(exc_info.value) + + +def test_init_reader_creates_reader_instance(): + """Test that _init_reader uses the stored class from __init__.""" + from graphrag_toolkit.lexical_graph.indexing.load.readers.reader_provider_config import WikipediaReaderConfig + + mock_reader_cls = Mock() + mock_reader_instance = Mock() + mock_reader_cls.return_value = mock_reader_instance + + mock_module = MagicMock() + mock_module.WikipediaReader = mock_reader_cls + + with patch.dict('sys.modules', {'llama_index.readers.wikipedia': mock_module}): + from importlib import reload + import graphrag_toolkit.lexical_graph.indexing.load.readers.providers.wikipedia_reader_provider as mod + reload(mod) + + provider = mod.WikipediaReaderProvider(WikipediaReaderConfig()) + assert provider._reader is None + + provider._init_reader() + assert provider._reader is mock_reader_instance + mock_reader_cls.assert_called_once() + + +def test_init_reader_only_creates_once(): + """Test that _init_reader is lazy and only creates the reader once.""" + from graphrag_toolkit.lexical_graph.indexing.load.readers.reader_provider_config import WikipediaReaderConfig + + mock_reader_cls = Mock() + mock_module = MagicMock() + mock_module.WikipediaReader = mock_reader_cls + + with patch.dict('sys.modules', {'llama_index.readers.wikipedia': mock_module}): + from importlib import reload + import graphrag_toolkit.lexical_graph.indexing.load.readers.providers.wikipedia_reader_provider as mod + reload(mod) + + provider = mod.WikipediaReaderProvider(WikipediaReaderConfig()) + provider._init_reader() + provider._init_reader() + mock_reader_cls.assert_called_once() + + +def test_read_raises_on_empty_input(): + """Test that read raises ValueError on empty input.""" + from graphrag_toolkit.lexical_graph.indexing.load.readers.reader_provider_config import WikipediaReaderConfig + + mock_module = MagicMock() + with patch.dict('sys.modules', {'llama_index.readers.wikipedia': mock_module}): + from importlib import reload + import graphrag_toolkit.lexical_graph.indexing.load.readers.providers.wikipedia_reader_provider as mod + reload(mod) + + provider = mod.WikipediaReaderProvider(WikipediaReaderConfig()) + + with pytest.raises(ValueError, match="cannot be None or empty"): + provider.read("") diff --git a/lexical-graph/tests/unit/prompts/test_prompt_provider_config.py b/lexical-graph/tests/unit/prompts/test_prompt_provider_config.py index 6b33d6e0..11bd9d6f 100644 --- a/lexical-graph/tests/unit/prompts/test_prompt_provider_config.py +++ b/lexical-graph/tests/unit/prompts/test_prompt_provider_config.py @@ -48,6 +48,24 @@ def test_resolve_prompt_arn_with_identifier(self, mock_sts): resolved = config._resolve_prompt_arn("my-prompt-id") assert resolved == "arn:aws:bedrock:us-west-2:123456789012:prompt/my-prompt-id" + @patch.object(BedrockPromptProviderConfig, 'session', new_callable=lambda: property(lambda self: 
Mock(region_name='us-east-1'))) + @patch.object(BedrockPromptProviderConfig, 'sts') + def test_resolve_prompt_arn_without_region(self, mock_sts, mock_session): + """Test ARN resolution falls back to session region when aws_region is None.""" + mock_sts.get_caller_identity.return_value = { + "Arn": "arn:aws:sts::123456789012:assumed-role/test-role", + "Account": "123456789012" + } + + config = BedrockPromptProviderConfig( + system_prompt_arn="my-prompt-id", + user_prompt_arn="user-prompt-id", + ) + assert config.aws_region is None + + resolved = config._resolve_prompt_arn("my-prompt-id") + assert resolved == "arn:aws:bedrock:us-east-1:123456789012:prompt/my-prompt-id" + @patch('graphrag_toolkit.lexical_graph.prompts.bedrock_prompt_provider.BedrockPromptProvider') def test_build(self, mock_provider_class): """Test build method creates BedrockPromptProvider.""" From 69324f80ced08984ee035c939569e90e5704f4c6 Mon Sep 17 00:00:00 2001 From: Mykola Pereyma Date: Mon, 20 Apr 2026 15:42:47 -0700 Subject: [PATCH 04/10] fix: add missing Docker dependencies and fix notebook titles - Dockerfile: add llama-index-readers-s3 (required by notebook 05-S3-Directory-Reader) - Dockerfile.dev: add neo4j driver and build-essential (required for dev mode) - 02-Querying.ipynb: fix title spacing - 03-Querying-with-Prompting.ipynb: fix title casing --- examples/lexical-graph-local-dev/docker/jupyter/Dockerfile | 1 + .../lexical-graph-local-dev/docker/jupyter/Dockerfile.dev | 6 ++++++ .../lexical-graph-local-dev/notebooks/02-Querying.ipynb | 2 +- .../notebooks/03-Querying-with-Prompting.ipynb | 2 +- 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile b/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile index 2895ebed..84e34b26 100644 --- a/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile +++ b/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile @@ -45,6 +45,7 @@ RUN pip install --no-cache-dir \ llama-index-readers-file \ llama-index-readers-github \ llama-index-readers-json \ + llama-index-readers-s3 \ llama-index-readers-structured-data \ pymupdf \ youtube-transcript-api \ diff --git a/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile.dev b/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile.dev index 3cc68a3d..e8f713ca 100644 --- a/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile.dev +++ b/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile.dev @@ -19,5 +19,11 @@ ENV PIP_CACHE_DIR=/tmp/pip-cache USER $NB_UID +# Neo4j driver (lazy import, required by all notebooks via Neo4jGraphStoreFactory) +RUN pip install --no-cache-dir neo4j +# Build tools for packages requiring C compilation (e.g. 
lru-dict) +USER root +RUN apt-get update && apt-get install -y --no-install-recommends build-essential && rm -rf /var/lib/apt/lists/* +USER jovyan diff --git a/examples/lexical-graph-local-dev/notebooks/02-Querying.ipynb b/examples/lexical-graph-local-dev/notebooks/02-Querying.ipynb index e19be19c..7d61532a 100644 --- a/examples/lexical-graph-local-dev/notebooks/02-Querying.ipynb +++ b/examples/lexical-graph-local-dev/notebooks/02-Querying.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "id": "3bfeb79c9431a3c6", "metadata": {}, - "source": "# 02 -Querying" + "source": "# 02 - Querying" }, { "cell_type": "code", diff --git a/examples/lexical-graph-local-dev/notebooks/03-Querying-with-Prompting.ipynb b/examples/lexical-graph-local-dev/notebooks/03-Querying-with-Prompting.ipynb index 289254a8..5fea8837 100644 --- a/examples/lexical-graph-local-dev/notebooks/03-Querying-with-Prompting.ipynb +++ b/examples/lexical-graph-local-dev/notebooks/03-Querying-with-Prompting.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "id": "3bfeb79c9431a3c6", "metadata": {}, - "source": "# 03 - Querying with prompting" + "source": "# 03 - Querying with Prompting" }, { "cell_type": "code", From 15019bbe7624f998ed1156b3ca32a65420031668 Mon Sep 17 00:00:00 2001 From: Mykola Pereyma Date: Mon, 20 Apr 2026 17:21:53 -0700 Subject: [PATCH 05/10] feat: add automated notebook test runner for local-dev - scripts/test-local-dev-notebooks.sh: full lifecycle test runner (env setup, AWS resources, Docker, notebook execution, cleanup) - scripts/run_notebooks.py: cell-by-cell executor with skip lists (GitHub, PPTX, JSON/Wikipedia long-running cells) - .kiro/steering/test-local-dev-notebooks.md: Kiro steering file Configurable via env vars: SKIP_GITHUB, SKIP_PPTX, SKIP_LONG_RUNNING, CLEANUP, DOCKER_MODE, REPORT_DIR --- .kiro/steering/test-local-dev-notebooks.md | 48 +++ .../scripts/run_notebooks.py | 189 ++++++++++++ .../scripts/test-local-dev-notebooks.sh | 292 ++++++++++++++++++ 3 files changed, 529 insertions(+) create mode 100644 .kiro/steering/test-local-dev-notebooks.md create mode 100644 examples/lexical-graph-local-dev/scripts/run_notebooks.py create mode 100755 examples/lexical-graph-local-dev/scripts/test-local-dev-notebooks.sh diff --git a/.kiro/steering/test-local-dev-notebooks.md b/.kiro/steering/test-local-dev-notebooks.md new file mode 100644 index 00000000..c97ac2c1 --- /dev/null +++ b/.kiro/steering/test-local-dev-notebooks.md @@ -0,0 +1,48 @@ +--- +inclusion: manual +--- +# Test Local-Dev Notebooks + +## When to Use +When asked to test, validate, or run the lexical-graph-local-dev notebooks. + +## How to Run +```bash +cd examples/lexical-graph-local-dev +bash scripts/test-local-dev-notebooks.sh +``` + +## Configuration +Environment variables to customize behavior: +- `SKIP_GITHUB=true|false` (default: true) — skip GitHub reader cells (no token available) +- `SKIP_PPTX=true|false` (default: true) — skip PPTX reader cells (600s timeout) +- `SKIP_LONG_RUNNING=true|false` (default: true) — skip JSON/Wikipedia reader cells (extract_and_build timeout) +- `CLEANUP=true|false` (default: true) — cleanup all resources after test +- `DOCKER_MODE=standard|dev` (default: standard) +- `REPORT_DIR=path` (default: examples/lexical-graph-local-dev/test-reports/) + +## Prerequisites +- AWS CLI configured with valid credentials +- Docker running +- Bedrock model access enabled (Claude Sonnet, Cohere Embed English v3) + +## What It Does +1. Detects platform (ARM/x86) +2. 
Creates .env from template with auto-detected AWS account/region +3. Creates AWS resources (S3 bucket, Bedrock managed prompts) +4. Starts Docker containers (Neo4j, pgvector, Jupyter) +5. Executes all notebook cells (skipping GitHub and PPTX as configured) +6. Generates per-cell execution report (JSON + markdown) +7. Cleans up all resources (Docker, S3, Bedrock prompts, local .env) + +## Notebooks Tested +- 00-Setup.ipynb — Environment setup, package installation, reader dependencies +- 01-Combined-Extract-and-Build.ipynb — Reader providers (web, PDF, YouTube, docx, markdown, JSON, Wikipedia, CSV, directory) +- 02-Querying.ipynb — TraversalBasedRetriever queries +- 03-Querying-with-Prompting.ipynb — Custom prompts (file, S3, Bedrock managed) +- 04-Advanced-Configuration-Examples.ipynb — Batch processing, custom metadata functions +- 05-S3-Directory-Reader-Provider.ipynb — S3 directory reader with prefix filtering and metadata + +## Expected Results +- 90+ cells SUCCESS, 8 SKIPPED (GitHub + PPTX + JSON + Wikipedia), 0 FAILED +- Reports in test-reports/ directory diff --git a/examples/lexical-graph-local-dev/scripts/run_notebooks.py b/examples/lexical-graph-local-dev/scripts/run_notebooks.py new file mode 100644 index 00000000..88d41502 --- /dev/null +++ b/examples/lexical-graph-local-dev/scripts/run_notebooks.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +"""Execute local-dev notebooks cell-by-cell with skip logic and per-cell reporting. + +Runs inside the Jupyter container. Produces JSON and markdown reports. +""" + +import argparse +import json +import os +import sys +import time + +import nbformat +from nbclient import NotebookClient +from nbclient.exceptions import CellExecutionError + +ALL_NOTEBOOKS = [ + "00-Setup.ipynb", + "01-Combined-Extract-and-Build.ipynb", + "02-Querying.ipynb", + "03-Querying-with-Prompting.ipynb", + "04-Advanced-Configuration-Examples.ipynb", + "05-S3-Directory-Reader-Provider.ipynb", +] + +# (notebook_index, cell_index) -> reason +GITHUB_SKIPS = {(1, 14): "GitHub markdown header", (1, 15): "GitHub reader - no token"} +PPTX_SKIPS = {(1, 16): "PPTX markdown header", (1, 17): "PPTX reader - 600s timeout"} +LONG_RUNNING_SKIPS = { + (1, 20): "JSON markdown header", + (1, 21): "JSON reader - extract_and_build timeout", + (1, 22): "Wikipedia markdown header", + (1, 23): "Wikipedia reader - extract_and_build timeout", +} + + +def load_env(env_path): + if not os.path.exists(env_path): + return + with open(env_path) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#") and "=" in line: + key, _, value = line.partition("=") + os.environ[key.strip()] = value.strip() + + +def extract_output(cell): + parts = [] + for o in cell.get("outputs", []): + if o.get("output_type") == "stream": + parts.append(o.get("text", "")) + elif o.get("output_type") == "execute_result": + data = o.get("data", {}) + if "text/plain" in data: + parts.append(data["text/plain"]) + elif o.get("output_type") == "error": + parts.append("\n".join(o.get("traceback", [])[-3:])) + text = "".join(parts).strip() + lines = text.split("\n")[:20] + return "\n".join(lines) if lines and lines[0] else "(no output)" + + +def run_notebook(nb_idx, nb_name, work_dir, skip_cells): + results = [] + nb_path = os.path.join(work_dir, nb_name) + nb = nbformat.read(nb_path, as_version=4) + client = NotebookClient( + nb, timeout=600, kernel_name="python3", + resources={"metadata": {"path": work_dir}}, + ) + print(f"\n{'=' * 60}\nNOTEBOOK {nb_idx}: {nb_name}\n{'=' * 60}", flush=True) + + with 
client.setup_kernel(): + for cell_idx, cell in enumerate(nb.cells): + key = (nb_idx, cell_idx) + if key in skip_cells: + reason = skip_cells[key] + print(f" Cell {cell_idx} [{cell.cell_type}]: SKIPPED ({reason})", flush=True) + results.append(dict( + notebook=nb_name, cell_index=cell_idx, cell_type=cell.cell_type, + status="SKIPPED", output_summary=f"Skipped: {reason}", + exec_time_s=0, error=None, source_preview=cell.source[:150], + )) + continue + + if cell.cell_type != "code": + results.append(dict( + notebook=nb_name, cell_index=cell_idx, cell_type=cell.cell_type, + status="SUCCESS", output_summary="Markdown cell", + exec_time_s=0, error=None, source_preview=cell.source[:150], + )) + continue + + start = time.time() + error_detail = None + try: + client.execute_cell(cell, cell_idx) + status = "SUCCESS" + except CellExecutionError as e: + status = "FAILED" + error_detail = str(e)[-800:] + except Exception as e: + status = "FAILED" + error_detail = f"{type(e).__name__}: {str(e)[:500]}" + elapsed = round(time.time() - start, 2) + + output_summary = extract_output(cell) + print(f" Cell {cell_idx} [code]: {status} ({elapsed}s)", flush=True) + if status == "FAILED": + print(f" ERROR: {(error_detail or 'unknown')[:200]}", flush=True) + + results.append(dict( + notebook=nb_name, cell_index=cell_idx, cell_type="code", + status=status, output_summary=output_summary, + exec_time_s=elapsed, error=error_detail, + source_preview=cell.source[:150], + )) + return results + + +def write_markdown_report(report, path): + with open(path, "w") as f: + success = sum(1 for r in report if r["status"] == "SUCCESS") + failed = sum(1 for r in report if r["status"] == "FAILED") + skipped = sum(1 for r in report if r["status"] == "SKIPPED") + f.write("# Notebook Execution Report\n\n") + f.write(f"| Metric | Count |\n|--------|-------|\n") + f.write(f"| Total cells | {len(report)} |\n") + f.write(f"| SUCCESS | {success} |\n| FAILED | {failed} |\n| SKIPPED | {skipped} |\n\n") + + current_nb = None + for r in report: + if r["notebook"] != current_nb: + current_nb = r["notebook"] + f.write(f"## {current_nb}\n\n") + f.write("| Cell | Type | Status | Time | Output Summary |\n") + f.write("|------|------|--------|------|----------------|\n") + summary = r["output_summary"].replace("\n", " ")[:100] + f.write(f"| {r['cell_index']} | {r['cell_type']} | {r['status']} | {r['exec_time_s']}s | {summary} |\n") + if r["error"]: + f.write(f"\n**Error (Cell {r['cell_index']}):** `{r['error'][:200]}`\n\n") + f.write("\n") + + +def main(): + parser = argparse.ArgumentParser(description="Run local-dev notebooks") + parser.add_argument("--work-dir", default="/home/jovyan/work") + parser.add_argument("--output-dir", default="/home/jovyan/work") + parser.add_argument("--skip-github", default="true", choices=["true", "false"]) + parser.add_argument("--skip-pptx", default="true", choices=["true", "false"]) + parser.add_argument("--skip-long-running", default="true", choices=["true", "false"]) + parser.add_argument("--notebooks", nargs="*", help="Specific notebooks to run") + args = parser.parse_args() + + load_env(os.path.join(args.work_dir, ".env")) + + notebooks = args.notebooks or ALL_NOTEBOOKS + skip_cells = {} + if args.skip_github == "true": + skip_cells.update(GITHUB_SKIPS) + if args.skip_pptx == "true": + skip_cells.update(PPTX_SKIPS) + if args.skip_long_running == "true": + skip_cells.update(LONG_RUNNING_SKIPS) + + report = [] + for nb_idx, nb_name in enumerate(ALL_NOTEBOOKS): + if nb_name not in notebooks: + continue + 
report.extend(run_notebook(nb_idx, nb_name, args.work_dir, skip_cells)) + + # Write reports + json_path = os.path.join(args.output_dir, "execution_report.json") + md_path = os.path.join(args.output_dir, "execution_report.md") + with open(json_path, "w") as f: + json.dump(report, f, indent=2) + write_markdown_report(report, md_path) + + failed = sum(1 for r in report if r["status"] == "FAILED") + success = sum(1 for r in report if r["status"] == "SUCCESS") + skipped = sum(1 for r in report if r["status"] == "SKIPPED") + print(f"\n\nDone. {len(report)} cells: {success} SUCCESS, {failed} FAILED, {skipped} SKIPPED") + print(f"Reports: {json_path}, {md_path}") + sys.exit(1 if failed > 0 else 0) + + +if __name__ == "__main__": + main() diff --git a/examples/lexical-graph-local-dev/scripts/test-local-dev-notebooks.sh b/examples/lexical-graph-local-dev/scripts/test-local-dev-notebooks.sh new file mode 100755 index 00000000..23f62886 --- /dev/null +++ b/examples/lexical-graph-local-dev/scripts/test-local-dev-notebooks.sh @@ -0,0 +1,292 @@ +#!/usr/bin/env bash +set -euo pipefail + +# ============================================================================= +# test-local-dev-notebooks.sh +# +# Full lifecycle test runner for lexical-graph-local-dev notebooks. +# Handles: env setup → AWS resources → Docker → notebook execution → report → cleanup +# ============================================================================= + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +NOTEBOOKS_DIR="$PROJECT_DIR/notebooks" +DOCKER_DIR="$PROJECT_DIR/docker" +AWS_DIR="$PROJECT_DIR/aws" +REPORT_DIR="${REPORT_DIR:-$PROJECT_DIR/test-reports}" + +# Configurable flags +SKIP_GITHUB="${SKIP_GITHUB:-true}" +SKIP_PPTX="${SKIP_PPTX:-true}" +SKIP_LONG_RUNNING="${SKIP_LONG_RUNNING:-true}" +CLEANUP="${CLEANUP:-true}" +DOCKER_MODE="${DOCKER_MODE:-standard}" + +# State tracking for cleanup +AWS_ACCOUNT="" +AWS_REGION="" +S3_BUCKET="" +DOCKER_STARTED=false +AWS_RESOURCES_CREATED=false +BEDROCK_PROMPTS_CREATED=false +ENV_CREATED=false +SYSTEM_PROMPT_ID="" +USER_PROMPT_ID="" +NOTEBOOK_EXIT_CODE=0 + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +log() { echo -e "${BLUE}[$(date +%H:%M:%S)]${NC} $*"; } +ok() { echo -e "${GREEN}[✓]${NC} $*"; } +warn() { echo -e "${YELLOW}[!]${NC} $*"; } +err() { echo -e "${RED}[✗]${NC} $*"; } + +timer_start() { TIMER_START=$(date +%s); } +timer_end() { echo "$(($(date +%s) - TIMER_START))s"; } + +# ============================================================================= +# Phase 1: Platform detection +# ============================================================================= +detect_platform() { + log "Detecting platform..." + local arch + arch=$(uname -m) + if [[ "$arch" == "arm64" || "$arch" == "aarch64" ]]; then + COMPOSE_FILE="docker-compose.arm.yml" + DOCKER_FLAGS="--mac" + ok "ARM platform detected (compose: $COMPOSE_FILE)" + else + COMPOSE_FILE="docker-compose.yml" + DOCKER_FLAGS="" + ok "x86 platform detected (compose: $COMPOSE_FILE)" + fi + if [[ "$DOCKER_MODE" == "dev" ]]; then + DOCKER_FLAGS="$DOCKER_FLAGS --dev" + fi +} + +# ============================================================================= +# Phase 2: Environment setup +# ============================================================================= +setup_env() { + log "Setting up environment..." 
+ timer_start + + AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text) + AWS_REGION=$(aws configure get region 2>/dev/null || echo "us-east-1") + S3_BUCKET="graphrag-toolkit-${AWS_ACCOUNT}" + + cp "$NOTEBOOKS_DIR/.env.template" "$NOTEBOOKS_DIR/.env" + ENV_CREATED=true + + # Patch .env with detected values + if [[ "$(uname)" == "Darwin" ]]; then + sed -i '' "s/^AWS_REGION=.*/AWS_REGION=${AWS_REGION}/" "$NOTEBOOKS_DIR/.env" + sed -i '' "s/^S3_BUCKET_NAME=.*/S3_BUCKET_NAME=${S3_BUCKET}/" "$NOTEBOOKS_DIR/.env" + else + sed -i "s/^AWS_REGION=.*/AWS_REGION=${AWS_REGION}/" "$NOTEBOOKS_DIR/.env" + sed -i "s/^S3_BUCKET_NAME=.*/S3_BUCKET_NAME=${S3_BUCKET}/" "$NOTEBOOKS_DIR/.env" + fi + + ok "Environment configured (account=$AWS_ACCOUNT, region=$AWS_REGION, bucket=$S3_BUCKET) [$(timer_end)]" +} + +# ============================================================================= +# Phase 3: AWS resources +# ============================================================================= +setup_aws() { + log "Creating AWS resources..." + timer_start + + # S3 bucket + log "Creating S3 bucket: $S3_BUCKET" + if [[ "$AWS_REGION" == "us-east-1" ]]; then + aws s3api create-bucket --bucket "$S3_BUCKET" --region "$AWS_REGION" 2>/dev/null || true + else + aws s3api create-bucket --bucket "$S3_BUCKET" --region "$AWS_REGION" \ + --create-bucket-configuration LocationConstraint="$AWS_REGION" 2>/dev/null || true + fi + AWS_RESOURCES_CREATED=true + + # Upload prompt files to S3 + python3 -c " +import json, sys +with open(sys.argv[1]) as f: + data = json.load(f) +print(data['variants'][0]['templateConfiguration']['text']['text'], end='') +" "$AWS_DIR/system_prompt.json" | aws s3 cp - "s3://$S3_BUCKET/prompts/system_prompt.txt" --region "$AWS_REGION" + + python3 -c " +import json, sys +with open(sys.argv[1]) as f: + data = json.load(f) +print(data['variants'][0]['templateConfiguration']['text']['text'], end='') +" "$AWS_DIR/user_prompt.json" | aws s3 cp - "s3://$S3_BUCKET/prompts/user_prompt.txt" --region "$AWS_REGION" + + ok "S3 bucket created and prompts uploaded" + + # Bedrock managed prompts + if [[ -f "$AWS_DIR/system_prompt.json" && -f "$AWS_DIR/user_prompt.json" ]]; then + local sys_response usr_response + sys_response=$(aws bedrock-agent create-prompt --region "$AWS_REGION" --cli-input-json file://"$AWS_DIR/system_prompt.json" 2>&1) || true + usr_response=$(aws bedrock-agent create-prompt --region "$AWS_REGION" --cli-input-json file://"$AWS_DIR/user_prompt.json" 2>&1) || true + + SYSTEM_PROMPT_ID=$(echo "$sys_response" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])" 2>/dev/null) || true + USER_PROMPT_ID=$(echo "$usr_response" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])" 2>/dev/null) || true + + if [[ -n "$SYSTEM_PROMPT_ID" && -n "$USER_PROMPT_ID" ]]; then + local sys_arn="arn:aws:bedrock:${AWS_REGION}:${AWS_ACCOUNT}:prompt/${SYSTEM_PROMPT_ID}" + local usr_arn="arn:aws:bedrock:${AWS_REGION}:${AWS_ACCOUNT}:prompt/${USER_PROMPT_ID}" + echo "SYSTEM_PROMPT_ARN=$sys_arn" >> "$NOTEBOOKS_DIR/.env" + echo "USER_PROMPT_ARN=$usr_arn" >> "$NOTEBOOKS_DIR/.env" + BEDROCK_PROMPTS_CREATED=true + ok "Bedrock prompts created (system=$SYSTEM_PROMPT_ID, user=$USER_PROMPT_ID)" + else + warn "Could not extract Bedrock prompt IDs — prompt-based cells may fail" + fi + fi + + ok "AWS resources created [$(timer_end)]" +} + +# ============================================================================= +# Phase 4: Docker +# 
============================================================================= +start_docker() { + log "Starting Docker containers ($DOCKER_MODE mode)..." + timer_start + + (cd "$DOCKER_DIR" && ./start-containers.sh $DOCKER_FLAGS) + DOCKER_STARTED=true + + wait_for_containers + ok "Docker containers running [$(timer_end)]" +} + +wait_for_containers() { + local max_wait=120 + local waited=0 + while [[ $waited -lt $max_wait ]]; do + local count + count=$(docker ps --filter "name=neo4j-local" --filter "name=pgvector-local" --filter "name=jupyter-local" --format "{{.Names}}" | wc -l | tr -d ' ') + if [[ "$count" -ge 3 ]]; then + # Also verify jupyter is responsive + if docker exec jupyter-local python3 -c "print('ready')" 2>/dev/null; then + return 0 + fi + fi + sleep 5 + waited=$((waited + 5)) + done + err "Containers did not start within ${max_wait}s" + return 1 +} + +# ============================================================================= +# Phase 5: Execute notebooks +# ============================================================================= +run_notebooks() { + log "Executing notebooks..." + timer_start + mkdir -p "$REPORT_DIR" + + # Copy runner script into container + docker cp "$SCRIPT_DIR/run_notebooks.py" jupyter-local:/home/jovyan/work/run_notebooks.py + + # Execute + docker exec jupyter-local \ + python3 /home/jovyan/work/run_notebooks.py \ + --skip-github="$SKIP_GITHUB" \ + --skip-pptx="$SKIP_PPTX" \ + --skip-long-running="$SKIP_LONG_RUNNING" \ + || NOTEBOOK_EXIT_CODE=$? + + # Collect reports + docker cp jupyter-local:/home/jovyan/work/execution_report.json "$REPORT_DIR/" 2>/dev/null || true + docker cp jupyter-local:/home/jovyan/work/execution_report.md "$REPORT_DIR/" 2>/dev/null || true + + if [[ $NOTEBOOK_EXIT_CODE -eq 0 ]]; then + ok "All notebooks passed [$(timer_end)]" + else + err "Some notebooks failed (exit code $NOTEBOOK_EXIT_CODE) [$(timer_end)]" + fi +} + +# ============================================================================= +# Phase 6: Cleanup +# ============================================================================= +cleanup() { + if [[ "$CLEANUP" != "true" ]]; then + warn "Cleanup skipped (CLEANUP=$CLEANUP)" + return 0 + fi + log "Cleaning up resources..." 
+ + # Docker + if [[ "$DOCKER_STARTED" == "true" ]]; then + (cd "$DOCKER_DIR" && docker compose -f "$COMPOSE_FILE" down -v 2>/dev/null) || true + ok "Docker containers removed" + fi + + # S3 + if [[ "$AWS_RESOURCES_CREATED" == "true" && -n "$S3_BUCKET" ]]; then + aws s3 rb "s3://$S3_BUCKET" --force --region "$AWS_REGION" 2>/dev/null || true + ok "S3 bucket deleted" + fi + + # Bedrock prompts + if [[ "$BEDROCK_PROMPTS_CREATED" == "true" ]]; then + [[ -n "$SYSTEM_PROMPT_ID" ]] && aws bedrock-agent delete-prompt --prompt-identifier "$SYSTEM_PROMPT_ID" --region "$AWS_REGION" 2>/dev/null || true + [[ -n "$USER_PROMPT_ID" ]] && aws bedrock-agent delete-prompt --prompt-identifier "$USER_PROMPT_ID" --region "$AWS_REGION" 2>/dev/null || true + ok "Bedrock prompts deleted" + fi + + # Local .env + if [[ "$ENV_CREATED" == "true" ]]; then + rm -f "$NOTEBOOKS_DIR/.env" + ok "Local .env removed" + fi + + ok "Cleanup complete" +} + +# ============================================================================= +# Main +# ============================================================================= +main() { + echo "" + echo "============================================================" + echo " lexical-graph-local-dev Notebook Test Runner" + echo "============================================================" + echo " Mode: $DOCKER_MODE | GitHub: skip=$SKIP_GITHUB | PPTX: skip=$SKIP_PPTX | Long-running: skip=$SKIP_LONG_RUNNING" + echo " Cleanup: $CLEANUP | Reports: $REPORT_DIR" + echo "============================================================" + echo "" + + trap cleanup EXIT + + detect_platform + setup_env + setup_aws + start_docker + run_notebooks + + echo "" + echo "============================================================" + if [[ $NOTEBOOK_EXIT_CODE -eq 0 ]]; then + ok "ALL TESTS PASSED" + else + err "SOME TESTS FAILED" + fi + echo " Reports: $REPORT_DIR/execution_report.{json,md}" + echo "============================================================" + + exit $NOTEBOOK_EXIT_CODE +} + +main "$@" From 7aeca85fcbdef49a94ad3ea30aed87e054908b1e Mon Sep 17 00:00:00 2001 From: Mykola Pereyma Date: Tue, 21 Apr 2026 11:19:40 -0700 Subject: [PATCH 06/10] docs: remove hardcoded cell counts from steering file --- .kiro/steering/test-local-dev-notebooks.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.kiro/steering/test-local-dev-notebooks.md b/.kiro/steering/test-local-dev-notebooks.md index c97ac2c1..b19d1e4a 100644 --- a/.kiro/steering/test-local-dev-notebooks.md +++ b/.kiro/steering/test-local-dev-notebooks.md @@ -44,5 +44,6 @@ Environment variables to customize behavior: - 05-S3-Directory-Reader-Provider.ipynb — S3 directory reader with prefix filtering and metadata ## Expected Results -- 90+ cells SUCCESS, 8 SKIPPED (GitHub + PPTX + JSON + Wikipedia), 0 FAILED -- Reports in test-reports/ directory +- All executed cells SUCCESS, 0 FAILED +- Skipped cells depend on configuration (GitHub, PPTX, long-running by default) +- Reports in test-reports/ directory (execution_report.json + execution_report.md) From b036a8f6dfbe56cb57a36ffbeb6b855d50eb6081 Mon Sep 17 00:00:00 2001 From: Mykola Pereyma Date: Fri, 1 May 2026 18:45:14 -0700 Subject: [PATCH 07/10] refactor: modernize Docker infrastructure - Replace ARM-specific compose file with DOCKER_PLATFORM env var - Add explicit project names to compose files - Add disable_check_xsrf flag to dev compose - Rename notebook mount from /home/jovyan/work to /home/jovyan/notebooks - Add all required dependencies to Dockerfile.dev (psycopg2, 
pgvector, nltk) - Remove PowerShell start-containers script --- .../docker/docker-compose-dev.yml | 7 +- .../docker/docker-compose.arm.yml | 63 ----------------- .../docker/docker-compose.yml | 3 +- .../docker/jupyter/Dockerfile.dev | 28 ++++++-- .../docker/start-containers.ps1 | 67 ------------------- .../docker/start-containers.sh | 6 +- 6 files changed, 29 insertions(+), 145 deletions(-) delete mode 100644 examples/lexical-graph-local-dev/docker/docker-compose.arm.yml delete mode 100644 examples/lexical-graph-local-dev/docker/start-containers.ps1 diff --git a/examples/lexical-graph-local-dev/docker/docker-compose-dev.yml b/examples/lexical-graph-local-dev/docker/docker-compose-dev.yml index 3be71567..c8f9435a 100644 --- a/examples/lexical-graph-local-dev/docker/docker-compose-dev.yml +++ b/examples/lexical-graph-local-dev/docker/docker-compose-dev.yml @@ -1,3 +1,4 @@ +name: local-dev services: neo4j-local: image: neo4j:5.25-community @@ -29,7 +30,7 @@ services: networks: - graphrag_local_network_dev - jupyter-local-dev: + jupyter-local: build: context: ./jupyter dockerfile: Dockerfile.dev @@ -42,14 +43,13 @@ services: - ../notebooks:/home/jovyan/notebooks - ../../../lexical-graph:/home/jovyan/lexical-graph - ../../../lexical-graph-contrib:/home/jovyan/lexical-graph-contrib - - jupyter_local_data_dev:/home/jovyan/work - ~/.aws:/home/jovyan/.aws networks: - graphrag_local_network_dev depends_on: - pgvector-local - neo4j-local - command: start-notebook.sh --NotebookApp.token='' --NotebookApp.password='' + command: start-notebook.sh --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.disable_check_xsrf=True networks: graphrag_local_network_dev: @@ -59,4 +59,3 @@ volumes: neo4j_local_data_dev: neo4j_local_logs_dev: pgvector_local_data_dev: - jupyter_local_data_dev: diff --git a/examples/lexical-graph-local-dev/docker/docker-compose.arm.yml b/examples/lexical-graph-local-dev/docker/docker-compose.arm.yml deleted file mode 100644 index 013cfa1f..00000000 --- a/examples/lexical-graph-local-dev/docker/docker-compose.arm.yml +++ /dev/null @@ -1,63 +0,0 @@ -services: - neo4j-local: - image: neo4j:5.25-community - container_name: neo4j-local - ports: - - "7476:7474" # HTTP - - "7689:7687" # Bolt - environment: - - NEO4J_AUTH=${NEO4J_USER:-neo4j}/${NEO4J_PASSWORD:-password} - - NEO4J_PLUGINS=["apoc"] - volumes: - - neo4j_local_data:/data - - neo4j_local_logs:/logs - networks: - - graphrag_local_network - platform: linux/arm64 - - pgvector-local: - image: pgvector/pgvector:0.6.2-pg16 - container_name: pgvector-local - ports: - - "5432:5432" - environment: - - POSTGRES_USER=${POSTGRES_USER:-postgres} - - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-password} - - POSTGRES_DB=${POSTGRES_DB:-graphrag} - volumes: - - pgvector_local_data:/var/lib/postgresql/data - - ./postgres/schema.sql:/docker-entrypoint-initdb.d/schema.sql - networks: - - graphrag_local_network - platform: linux/arm64 - - jupyter-local: - build: - context: ./jupyter - dockerfile: Dockerfile - container_name: jupyter-local - ports: - - "8889:8888" - environment: - - JUPYTER_ENABLE_LAB=yes - - JUPYTER_TOKEN='' - volumes: - - ../notebooks:/home/jovyan/work - - jupyter_local_data:/home/jovyan/.jupyter - - ~/.aws:/home/jovyan/.aws - networks: - - graphrag_local_network - depends_on: - - neo4j-local - - pgvector-local - platform: linux/arm64 - -networks: - graphrag_local_network: - driver: bridge - -volumes: - neo4j_local_data: - neo4j_local_logs: - pgvector_local_data: - jupyter_local_data: diff --git 
a/examples/lexical-graph-local-dev/docker/docker-compose.yml b/examples/lexical-graph-local-dev/docker/docker-compose.yml index 725f98a9..9f3cb46c 100644 --- a/examples/lexical-graph-local-dev/docker/docker-compose.yml +++ b/examples/lexical-graph-local-dev/docker/docker-compose.yml @@ -1,3 +1,4 @@ +name: local-standard services: neo4j-local: image: neo4j:5.25-community @@ -40,7 +41,7 @@ services: - JUPYTER_ENABLE_LAB=yes - JUPYTER_TOKEN='' volumes: - - ../notebooks:/home/jovyan/work + - ../notebooks:/home/jovyan/notebooks - jupyter_local_data:/home/jovyan/.jupyter - ~/.aws:/home/jovyan/.aws networks: diff --git a/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile.dev b/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile.dev index e8f713ca..ac1a6a1f 100644 --- a/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile.dev +++ b/examples/lexical-graph-local-dev/docker/jupyter/Dockerfile.dev @@ -5,11 +5,8 @@ USER root # Install mamba in base environment, upgrade pip, and preinstall build tools RUN conda install -n base -c conda-forge mamba -y && \ mamba update -n base -c defaults conda -y && \ - # Clean broken 'backports' if it exists rm -rf /opt/conda/lib/python3.11/site-packages/backports* && \ - # Install build tools and correct backport pip install --upgrade pip setuptools wheel build backports.tarfile && \ - # Optional: configure clean pip cache location to suppress permission warnings mkdir -p /tmp/pip-cache && chmod 777 /tmp/pip-cache && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -20,10 +17,31 @@ ENV PIP_CACHE_DIR=/tmp/pip-cache USER $NB_UID # Neo4j driver (lazy import, required by all notebooks via Neo4jGraphStoreFactory) -RUN pip install --no-cache-dir neo4j +# psycopg2-binary + pgvector (required for PGVector store) +RUN pip install --no-cache-dir neo4j psycopg2-binary pgvector + +# NLTK (imported in 00-Setup before any pip install cell) +RUN pip install --no-cache-dir nltk && \ + python -c "import nltk; nltk.download('punkt', quiet=True); nltk.download('stopwords', quiet=True)" + +# Core packages (required before notebook pip install cells run) +RUN pip install --no-cache-dir \ + nest_asyncio \ + python-dotenv \ + matplotlib \ + plotly + +# LlamaIndex readers (hard imports in lexical-graph source) +RUN pip install --no-cache-dir \ + llama-index-readers-web \ + llama-index-readers-file \ + llama-index-readers-github \ + llama-index-readers-json \ + llama-index-readers-structured-data \ + llama-index-readers-s3 \ + pymupdf # Build tools for packages requiring C compilation (e.g. lru-dict) USER root RUN apt-get update && apt-get install -y --no-install-recommends build-essential && rm -rf /var/lib/apt/lists/* USER jovyan - diff --git a/examples/lexical-graph-local-dev/docker/start-containers.ps1 b/examples/lexical-graph-local-dev/docker/start-containers.ps1 deleted file mode 100644 index bf5342c6..00000000 --- a/examples/lexical-graph-local-dev/docker/start-containers.ps1 +++ /dev/null @@ -1,67 +0,0 @@ -param( - [switch]$Mac, - [switch]$Dev, - [switch]$Reset -) - -$ComposeFile = "docker-compose.yml" - -if ($Mac) { - $ComposeFile = "docker-compose.arm.yml" - Write-Host "Using ARM/Mac-specific configuration" -} - -if ($Dev) { - $ComposeFile = "docker-compose-dev.yml" - Write-Host "Development mode: Using docker-compose-dev.yml with hot-code-injection" -} - -if ($Reset) { - Write-Host "Resetting containers and data..." 
- docker compose -f $ComposeFile down -v - Remove-Item -Recurse -Force extracted -ErrorAction SilentlyContinue - Write-Host "Building and starting containers..." - $BuildFlag = "--build" -} else { - Write-Host "Starting containers (preserving data)..." - $BuildFlag = "" -} - -if ($BuildFlag) { - docker compose -f $ComposeFile up -d --build -} else { - docker compose -f $ComposeFile up -d -} - -Write-Host "" -if ($Reset) { - Write-Host "Reset and startup complete!" -} else { - Write-Host "Startup complete!" -} -Write-Host "" -Write-Host "Services available at:" -if ($Dev) { - Write-Host " Jupyter Lab: http://localhost:8890 (no password required)" - Write-Host " Neo4j Browser: http://localhost:7477 (neo4j/password)" -} else { - Write-Host " Jupyter Lab: http://localhost:8889 (no password required)" - Write-Host " Neo4j Browser: http://localhost:7476 (neo4j/password)" -} -Write-Host "" -Write-Host "IMPORTANT: All notebook execution must happen in Jupyter Lab." -if ($Dev) { - Write-Host " Open http://localhost:8890 to access the development environment." -} else { - Write-Host " Open http://localhost:8889 to access the development environment." -} -Write-Host " Navigate to the 'work' folder to find the notebooks." -if ($Dev) { - Write-Host "" - Write-Host "Development mode enabled - lexical-graph source code mounted for hot-code-injection" - Write-Host " Changes to lexical-graph source will be reflected immediately in notebooks" -} -if (-not $Reset) { - Write-Host "" - Write-Host "Data preserved from previous runs. Use -Reset to start fresh." -} diff --git a/examples/lexical-graph-local-dev/docker/start-containers.sh b/examples/lexical-graph-local-dev/docker/start-containers.sh index ca37eab6..61ae6ef4 100755 --- a/examples/lexical-graph-local-dev/docker/start-containers.sh +++ b/examples/lexical-graph-local-dev/docker/start-containers.sh @@ -6,10 +6,6 @@ RESET_MODE=false for arg in "$@"; do case $arg in - --mac) - COMPOSE_FILE="docker-compose.arm.yml" - echo "Using ARM/Mac-specific configuration" - ;; --dev) DEV_MODE=true echo "Enabling development mode with hot-code-injection" @@ -64,7 +60,7 @@ if [ "$DEV_MODE" = true ]; then else echo " Open http://localhost:8889 to access the development environment." fi -echo " Navigate to the 'work' folder to find the notebooks." +echo " Navigate to the 'notebooks' folder to find the notebooks." 
if [ "$DEV_MODE" = true ]; then echo "" echo "Development mode enabled - lexical-graph source code mounted for hot-code-injection" From 74a204cbde275d55d1063e47827e1356d14b4c42 Mon Sep 17 00:00:00 2001 From: Mykola Pereyma Date: Fri, 1 May 2026 18:45:24 -0700 Subject: [PATCH 08/10] fix: notebook correctness and env var guards - 00-Setup: fix dev mode path (/home/jovyan/lexical-graph-src -> /home/jovyan/lexical-graph) - 01-Extract: fix GitHub reader (env token, correct repo owner awslabs) - 03-Prompting: add S3_BUCKET_NAME and SYSTEM_PROMPT_ARN/USER_PROMPT_ARN guards - 04-Advanced: add S3 preflight checks, remove duplicate import and empty cell - 05-S3-Reader: add region-aware bucket creation, fix print typo - .env.template: add GITHUB_TOKEN, consolidate S3 bucket config --- .../notebooks/.env.template | 7 +- .../notebooks/00-Setup.ipynb | 78 +++++++++---------- .../01-Combined-Extract-and-Build.ipynb | 10 +-- .../03-Querying-with-Prompting.ipynb | 69 +++++++++------- .../04-Advanced-Configuration-Examples.ipynb | 39 ++++++---- .../05-S3-Directory-Reader-Provider.ipynb | 14 ++-- 6 files changed, 121 insertions(+), 96 deletions(-) diff --git a/examples/lexical-graph-local-dev/notebooks/.env.template b/examples/lexical-graph-local-dev/notebooks/.env.template index f83478a1..1324436d 100644 --- a/examples/lexical-graph-local-dev/notebooks/.env.template +++ b/examples/lexical-graph-local-dev/notebooks/.env.template @@ -5,6 +5,9 @@ # AWS_PROFILE=default AWS_REGION=us-east-1 +# GitHub (optional — for GitHub reader in notebook 01) +# GITHUB_TOKEN= + # Graph Database GRAPH_STORE=bolt://neo4j:password@neo4j-local:7687 @@ -45,10 +48,6 @@ PROMPT_PREFIX=prompts # SYSTEM_PROMPT_ARN= # USER_PROMPT_ARN= -# GitLab Registry Credentials -GITLAB_PYPI_TOKEN=your-gitlab-token-here -GITLAB_USERNAME=your-gitlab-username - # Suppress Neo4j warnings NEO4J_LOG_LEVEL=ERROR diff --git a/examples/lexical-graph-local-dev/notebooks/00-Setup.ipynb b/examples/lexical-graph-local-dev/notebooks/00-Setup.ipynb index 71a5a822..98d30431 100644 --- a/examples/lexical-graph-local-dev/notebooks/00-Setup.ipynb +++ b/examples/lexical-graph-local-dev/notebooks/00-Setup.ipynb @@ -31,7 +31,7 @@ "import sys\n", "\n", "# Check if lexical-graph source is mounted\n", - "lexical_graph_path = '/home/jovyan/lexical-graph-src'\n", + "lexical_graph_path = '/home/jovyan/lexical-graph'\n", "dev_mode = os.path.exists(lexical_graph_path)\n", "\n", "if dev_mode:\n", @@ -71,40 +71,6 @@ " print('Development mode - will install from mounted source')" ] }, - { - "cell_type": "markdown", - "id": "fix_nltk", - "metadata": {}, - "source": [ - "## Fix NLTK Data\n", - "\n", - "Download required NLTK data to prevent processing errors:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "nltk_fix", - "metadata": {}, - "outputs": [], - "source": [ - "import nltk\n", - "import ssl\n", - "\n", - "# Handle SSL certificate issues\n", - "try:\n", - " _create_unverified_https_context = ssl._create_unverified_context\n", - "except AttributeError:\n", - " pass\n", - "else:\n", - " ssl._create_default_https_context = _create_unverified_https_context\n", - "\n", - "# Download required NLTK data\n", - "nltk.download('punkt', quiet=True)\n", - "nltk.download('stopwords', quiet=True)\n", - "print('NLTK data downloaded successfully')" - ] - }, { "cell_type": "markdown", "id": "hot_reload", @@ -114,7 +80,7 @@ "\n", "If in development mode, set up hot-reloading for lexical-graph modules:\n", "\n", - "**IMPORTANT**: After running this cell in development 
mode, you must restart the kernel (Kernel → Restart Kernel) before continuing to the next cells.\n", + "**IMPORTANT**: After running this cell in development mode, you must restart the kernel (Kernel \u2192 Restart Kernel) before continuing to the next cells.\n", "\n", "**NOTE**: This installation process can sometimes fail or hang. If it doesn't complete within 2-3 minutes, interrupt the kernel and try running this cell again." ] @@ -137,7 +103,7 @@ " print('Please wait - installation in progress', end='', flush=True)\n", " \n", " # Run pip install with real-time feedback\n", - " process = subprocess.Popen(['pip', 'install', '-e', '/home/jovyan/lexical-graph-src'], \n", + " process = subprocess.Popen(['pip', 'install', '-e', '/home/jovyan/lexical-graph'], \n", " stdout=subprocess.PIPE, stderr=subprocess.STDOUT, \n", " universal_newlines=True, bufsize=1)\n", " \n", @@ -178,11 +144,45 @@ " print('Tip: Use %autoreload 2 in cells where you want fresh imports')\n", " print('')\n", " print('IMPORTANT: You must restart the kernel now for the editable installation to take effect.')\n", - " print(' Go to Kernel → Restart Kernel, then continue with the remaining cells.')\n", + " print(' Go to Kernel \u2192 Restart Kernel, then continue with the remaining cells.')\n", "else:\n", " print('Hot-reload not available in standard mode')" ] }, + { + "cell_type": "markdown", + "id": "fix_nltk", + "metadata": {}, + "source": [ + "## Fix NLTK Data\n", + "\n", + "Download required NLTK data to prevent processing errors:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nltk_fix", + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "import ssl\n", + "\n", + "# Handle SSL certificate issues\n", + "try:\n", + " _create_unverified_https_context = ssl._create_unverified_context\n", + "except AttributeError:\n", + " pass\n", + "else:\n", + " ssl._create_default_https_context = _create_unverified_https_context\n", + "\n", + "# Download required NLTK data\n", + "nltk.download('punkt', quiet=True)\n", + "nltk.download('stopwords', quiet=True)\n", + "print('NLTK data downloaded successfully')" + ] + }, { "cell_type": "markdown", "id": "setup_env", @@ -206,7 +206,7 @@ "import sys\n", "\n", "# Re-check dev mode after kernel restart\n", - "lexical_graph_path = '/home/jovyan/lexical-graph-src'\n", + "lexical_graph_path = '/home/jovyan/lexical-graph'\n", "dev_mode = os.path.exists(lexical_graph_path)\n", "\n", "from graphrag_toolkit.lexical_graph.storage.graph.neo4j_graph_store_factory import Neo4jGraphStoreFactory\n", diff --git a/examples/lexical-graph-local-dev/notebooks/01-Combined-Extract-and-Build.ipynb b/examples/lexical-graph-local-dev/notebooks/01-Combined-Extract-and-Build.ipynb index 2f904a71..33ecaf93 100644 --- a/examples/lexical-graph-local-dev/notebooks/01-Combined-Extract-and-Build.ipynb +++ b/examples/lexical-graph-local-dev/notebooks/01-Combined-Extract-and-Build.ipynb @@ -353,20 +353,20 @@ "from llama_index.readers.github import GithubRepositoryReader, GithubClient\n", "\n", "# GitHub token - replace with your actual token\n", - "github_token = \"\" # Replace with your GitHub token\n", + "github_token = os.environ.get('GITHUB_TOKEN', '')\n", "\n", - "if github_token and github_token != \"ghp_your_token_here\":\n", + "if github_token:\n", " print(\"Using authenticated GitHub access with token.\")\n", "else:\n", - " print(\"No valid GITHUB_TOKEN found — using unauthenticated access. 
You may be rate-limited.\")\n", + " print(\"No valid GITHUB_TOKEN found \u2014 using unauthenticated access. You may be rate-limited.\")\n", " print(\"To add a token:\")\n", - " print(\"Replace 'ghp_your_token_here' with your actual GitHub personal access token\")\n", + " print(\"Set GITHUB_TOKEN in notebooks/.env to use authenticated access\")\n", "\n", "# Create GitHub client and reader\n", "github_client = GithubClient(github_token=github_token, verbose=True)\n", "reader = GithubRepositoryReader(\n", " github_client=github_client,\n", - " owner=\"evanerwee\",\n", + " owner=\"awslabs\",\n", " repo=\"graphrag-toolkit\",\n", " use_parser=False,\n", " verbose=False,\n", diff --git a/examples/lexical-graph-local-dev/notebooks/03-Querying-with-Prompting.ipynb b/examples/lexical-graph-local-dev/notebooks/03-Querying-with-Prompting.ipynb index 5fea8837..92ba133b 100644 --- a/examples/lexical-graph-local-dev/notebooks/03-Querying-with-Prompting.ipynb +++ b/examples/lexical-graph-local-dev/notebooks/03-Querying-with-Prompting.ipynb @@ -125,26 +125,32 @@ "from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import S3PromptProviderConfig\n", "\n", "# Setup S3 prompt provider\n", - "prompt_provider = S3PromptProvider(\n", - " S3PromptProviderConfig(\n", - " bucket=os.environ['S3_BUCKET_NAME'],\n", - " prefix=os.environ.get('PROMPT_PREFIX', 'prompts'),\n", - " system_prompt_file=\"system_prompt.txt\",\n", - " user_prompt_file=\"user_prompt.txt\"\n", + "s3_bucket = os.environ.get('S3_BUCKET_NAME', '')\n", + "\n", + "if not s3_bucket:\n", + " print('⚠️ S3_BUCKET_NAME not set in .env — skipping S3 prompt provider demo.')\n", + " print('Set S3_BUCKET_NAME in notebooks/.env and upload prompt files to use this feature.')\n", + "else:\n", + " prompt_provider = S3PromptProvider(\n", + " S3PromptProviderConfig(\n", + " bucket=s3_bucket,\n", + " prefix=os.environ.get('PROMPT_PREFIX', 'prompts'),\n", + " system_prompt_file=\"system_prompt.txt\",\n", + " user_prompt_file=\"user_prompt.txt\"\n", + " )\n", " )\n", - ")\n", "\n", - "# Create query engine with S3 prompts\n", - "query_engine = LexicalGraphQueryEngine.for_traversal_based_search(\n", - " graph_store, \n", - " vector_store,\n", - " streaming=True,\n", - " prompt_provider=prompt_provider\n", - ")\n", + " # Create query engine with S3 prompts\n", + " query_engine = LexicalGraphQueryEngine.for_traversal_based_search(\n", + " graph_store, \n", + " vector_store,\n", + " streaming=True,\n", + " prompt_provider=prompt_provider\n", + " )\n", "\n", - "response = query_engine.query(\"What are the similarities and differences between Neptune Database and Neptune Analytics?\")\n", + " response = query_engine.query(\"What are the similarities and differences between Neptune Database and Neptune Analytics?\")\n", "\n", - "print(response.print_response_stream())\n" + " print(response.print_response_stream())\n" ] }, { @@ -168,21 +174,28 @@ "\n", "# Setup Bedrock prompt provider\n", "# Requires SYSTEM_PROMPT_ARN and USER_PROMPT_ARN in .env\n", - "prompt_provider = BedrockPromptProvider(\n", - " BedrockPromptProviderConfig()\n", - ")\n", + "system_prompt_arn = os.environ.get('SYSTEM_PROMPT_ARN', '')\n", + "user_prompt_arn = os.environ.get('USER_PROMPT_ARN', '')\n", + "\n", + "if not system_prompt_arn or not user_prompt_arn:\n", + " print('⚠️ SYSTEM_PROMPT_ARN and/or USER_PROMPT_ARN not set in .env — skipping Bedrock prompt provider demo.')\n", + " print('Run the aws/create_custom_prompt.sh script to create managed prompts, then add the ARNs to .env.')\n", 
+ "else:\n", + " prompt_provider = BedrockPromptProvider(\n", + " BedrockPromptProviderConfig()\n", + " )\n", "\n", - "# Create query engine with Bedrock prompts\n", - "query_engine = LexicalGraphQueryEngine.for_traversal_based_search(\n", - " graph_store, \n", - " vector_store,\n", - " streaming=True,\n", - " prompt_provider=prompt_provider\n", - ")\n", + " # Create query engine with Bedrock prompts\n", + " query_engine = LexicalGraphQueryEngine.for_traversal_based_search(\n", + " graph_store, \n", + " vector_store,\n", + " streaming=True,\n", + " prompt_provider=prompt_provider\n", + " )\n", "\n", - "response = query_engine.query(\"What are the similarities and differences between Neptune Database and Neptune Analytics?\")\n", + " response = query_engine.query(\"What are the similarities and differences between Neptune Database and Neptune Analytics?\")\n", "\n", - "print(response.print_response_stream())\n" + " print(response.print_response_stream())\n" ] } ], diff --git a/examples/lexical-graph-local-dev/notebooks/04-Advanced-Configuration-Examples.ipynb b/examples/lexical-graph-local-dev/notebooks/04-Advanced-Configuration-Examples.ipynb index eba6354f..19b975e9 100644 --- a/examples/lexical-graph-local-dev/notebooks/04-Advanced-Configuration-Examples.ipynb +++ b/examples/lexical-graph-local-dev/notebooks/04-Advanced-Configuration-Examples.ipynb @@ -98,16 +98,25 @@ " ))\n", "}\n", "\n", + "# Preflight check for S3\n", + "s3_bucket = os.environ.get('S3_BUCKET_NAME', '')\n", + "if not s3_bucket:\n", + " print('\u26a0\ufe0f S3_BUCKET_NAME not set \u2014 S3 files will be skipped. Only local files will be processed.')\n", + "\n", "# Define file sources (mix of local and S3)\n", "file_sources = [\n", " # Local files\n", " 'artifacts/sample.csv',\n", " 'artifacts/sample.md',\n", - " # S3 files\n", - " f's3://{os.environ.get(\"S3_BUCKET_NAME\", \"\")}/artifacts/sample.json',\n", - " f's3://{os.environ.get(\"S3_BUCKET_NAME\", \"\")}/artifacts/sample.xlsx'\n", "]\n", "\n", + "# Add S3 files only if bucket is configured\n", + "if s3_bucket:\n", + " file_sources.extend([\n", + " f's3://{s3_bucket}/artifacts/sample.json',\n", + " f's3://{s3_bucket}/artifacts/sample.xlsx'\n", + " ])\n", + "\n", "all_docs = []\n", "\n", "for file_path in file_sources:\n", @@ -168,7 +177,6 @@ "source": [ "import datetime\n", "from pathlib import Path\n", - "from pathlib import Path\n", "from graphrag_toolkit.lexical_graph.indexing.load.readers import (\n", " StructuredDataReaderProvider, StructuredDataReaderConfig,\n", " MarkdownReaderProvider, MarkdownReaderConfig\n", @@ -250,16 +258,25 @@ "metadata": {}, "outputs": [], "source": [ + "# Preflight check for S3\n", + "s3_bucket = os.environ.get('S3_BUCKET_NAME', '')\n", + "if not s3_bucket:\n", + " print('\u26a0\ufe0f S3_BUCKET_NAME not set \u2014 S3 files will be skipped. 
Only local files will be processed.')\n", + "\n", "# Define file sources (mix of local and S3)\n", "file_sources = [\n", " # Local files\n", " 'artifacts/sample.csv',\n", " 'artifacts/sample.md',\n", - " # S3 files\n", - " f's3://{os.environ.get(\"S3_BUCKET_NAME\", \"\")}/artifacts/sample.json',\n", - " f's3://{os.environ.get(\"S3_BUCKET_NAME\", \"\")}/artifacts/sample.xlsx'\n", "]\n", "\n", + "# Add S3 files only if bucket is configured\n", + "if s3_bucket:\n", + " file_sources.extend([\n", + " f's3://{s3_bucket}/artifacts/sample.json',\n", + " f's3://{s3_bucket}/artifacts/sample.xlsx'\n", + " ])\n", + "\n", "all_docs = []\n", "\n", "for file_path in file_sources:\n", @@ -324,14 +341,6 @@ "- Try different YouTube videos with captions for transcript extraction\n", "- Experiment with custom metadata functions for your specific use cases" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "89a3eb2e-2ebd-4770-b54d-c02a7cdd2c1e", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/examples/lexical-graph-local-dev/notebooks/05-S3-Directory-Reader-Provider.ipynb b/examples/lexical-graph-local-dev/notebooks/05-S3-Directory-Reader-Provider.ipynb index 4a1b6c5e..74f68973 100644 --- a/examples/lexical-graph-local-dev/notebooks/05-S3-Directory-Reader-Provider.ipynb +++ b/examples/lexical-graph-local-dev/notebooks/05-S3-Directory-Reader-Provider.ipynb @@ -145,13 +145,17 @@ "# Only proceed if AWS is configured\n", "if aws_configured:\n", " # Generate unique bucket name\n", - " bucket_name = f\"graphrag-test-{uuid.uuid4().hex[:8]}\"\n", + " bucket_name = os.environ.get('S3_BUCKET_NAME', f\"graphrag-test-{uuid.uuid4().hex[:8]}\")\n", " s3_client = boto3.client('s3')\n", " \n", " try:\n", - " # Create bucket\n", + " # Create bucket (region-aware)\n", " print(f\"Creating test bucket: {bucket_name}\")\n", - " s3_client.create_bucket(Bucket=bucket_name)\n", + " region = os.environ.get('AWS_REGION', 'us-east-1')\n", + " if region == 'us-east-1':\n", + " s3_client.create_bucket(Bucket=bucket_name)\n", + " else:\n", + " s3_client.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': region})\n", " \n", " # Create test documents\n", " test_documents = {\n", @@ -350,7 +354,7 @@ " content_type = doc.metadata.get('content_type', 'unknown')\n", " content_types[content_type] = content_types.get(content_type, 0) + 1\n", "\n", - " print(\"\\Content types summary:\")\n", + " print(\"\\nContent types summary:\")\n", " for content_type, count in content_types.items():\n", " print(f\" - {content_type}: {count} documents\")\n", "\n", @@ -455,7 +459,7 @@ " docs = []\n", "\n", "else:\n", - " print(\"Skipping advanced metadata load — no bucket name provided\")\n", + " print(\"Skipping advanced metadata load \u2014 no bucket name provided\")\n", " docs = []\n" ] }, From da0010976af51232543c751c7d5a4561ec4e0194 Mon Sep 17 00:00:00 2001 From: Mykola Pereyma Date: Fri, 1 May 2026 18:45:32 -0700 Subject: [PATCH 09/10] chore: clean up AWS scripts and remove .kiro steering - Remove PowerShell AWS scripts (keep bash only) - Make IAM role name configurable via BATCH_ROLE_NAME env var - Remove .kiro steering file from PR (moved to separate branch) - Update .gitignore for test artifacts --- .gitignore | 3 + .kiro/steering/test-local-dev-notebooks.md | 49 ---- .../aws/create_custom_prompt.ps1 | 33 --- .../aws/create_prompt_role.ps1 | 67 ----- .../aws/setup-bedrock-batch.ps1 | 246 ------------------ .../aws/setup-bedrock-batch.sh | 2 +- 6 files changed, 4 
insertions(+), 396 deletions(-) delete mode 100644 .kiro/steering/test-local-dev-notebooks.md delete mode 100644 examples/lexical-graph-local-dev/aws/create_custom_prompt.ps1 delete mode 100644 examples/lexical-graph-local-dev/aws/create_prompt_role.ps1 delete mode 100644 examples/lexical-graph-local-dev/aws/setup-bedrock-batch.ps1 diff --git a/.gitignore b/.gitignore index aaf05800..4f66883a 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,9 @@ build.log temp/ test-results/ test-logs/ +execution_report.json +execution_report.md screenlog.* extracted/ examples/lexical-graph-hybrid-dev/notebooks/output.log +examples/lexical-graph-local-dev/notebooks/run_notebooks.py diff --git a/.kiro/steering/test-local-dev-notebooks.md b/.kiro/steering/test-local-dev-notebooks.md deleted file mode 100644 index b19d1e4a..00000000 --- a/.kiro/steering/test-local-dev-notebooks.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -inclusion: manual ---- -# Test Local-Dev Notebooks - -## When to Use -When asked to test, validate, or run the lexical-graph-local-dev notebooks. - -## How to Run -```bash -cd examples/lexical-graph-local-dev -bash scripts/test-local-dev-notebooks.sh -``` - -## Configuration -Environment variables to customize behavior: -- `SKIP_GITHUB=true|false` (default: true) — skip GitHub reader cells (no token available) -- `SKIP_PPTX=true|false` (default: true) — skip PPTX reader cells (600s timeout) -- `SKIP_LONG_RUNNING=true|false` (default: true) — skip JSON/Wikipedia reader cells (extract_and_build timeout) -- `CLEANUP=true|false` (default: true) — cleanup all resources after test -- `DOCKER_MODE=standard|dev` (default: standard) -- `REPORT_DIR=path` (default: examples/lexical-graph-local-dev/test-reports/) - -## Prerequisites -- AWS CLI configured with valid credentials -- Docker running -- Bedrock model access enabled (Claude Sonnet, Cohere Embed English v3) - -## What It Does -1. Detects platform (ARM/x86) -2. Creates .env from template with auto-detected AWS account/region -3. Creates AWS resources (S3 bucket, Bedrock managed prompts) -4. Starts Docker containers (Neo4j, pgvector, Jupyter) -5. Executes all notebook cells (skipping GitHub and PPTX as configured) -6. Generates per-cell execution report (JSON + markdown) -7. 
Cleans up all resources (Docker, S3, Bedrock prompts, local .env) - -## Notebooks Tested -- 00-Setup.ipynb — Environment setup, package installation, reader dependencies -- 01-Combined-Extract-and-Build.ipynb — Reader providers (web, PDF, YouTube, docx, markdown, JSON, Wikipedia, CSV, directory) -- 02-Querying.ipynb — TraversalBasedRetriever queries -- 03-Querying-with-Prompting.ipynb — Custom prompts (file, S3, Bedrock managed) -- 04-Advanced-Configuration-Examples.ipynb — Batch processing, custom metadata functions -- 05-S3-Directory-Reader-Provider.ipynb — S3 directory reader with prefix filtering and metadata - -## Expected Results -- All executed cells SUCCESS, 0 FAILED -- Skipped cells depend on configuration (GitHub, PPTX, long-running by default) -- Reports in test-reports/ directory (execution_report.json + execution_report.md) diff --git a/examples/lexical-graph-local-dev/aws/create_custom_prompt.ps1 b/examples/lexical-graph-local-dev/aws/create_custom_prompt.ps1 deleted file mode 100644 index f2b5d97d..00000000 --- a/examples/lexical-graph-local-dev/aws/create_custom_prompt.ps1 +++ /dev/null @@ -1,33 +0,0 @@ -# Usage: -# .\create_custom_prompt.ps1 [aws_profile] - -param( - [Parameter(Mandatory = $true)] - [string]$PromptJson, - - [Parameter(Mandatory = $true)] - [string]$Region, - - [string]$AwsProfile -) - -if (-not (Test-Path $PromptJson)) { - Write-Host "Error: JSON file '$PromptJson' not found." - exit 1 -} - -Write-Host "Creating prompt from JSON file: $PromptJson" - -$cmd = @( - "aws", "bedrock-agent", "create-prompt", - "--region", $Region, - "--cli-input-json", "file://$PromptJson" -) - -if ($AwsProfile) { - $cmd += @("--profile", $AwsProfile) -} - -& $cmd - -Write-Host "Prompt created successfully." diff --git a/examples/lexical-graph-local-dev/aws/create_prompt_role.ps1 b/examples/lexical-graph-local-dev/aws/create_prompt_role.ps1 deleted file mode 100644 index 5ac2c3a4..00000000 --- a/examples/lexical-graph-local-dev/aws/create_prompt_role.ps1 +++ /dev/null @@ -1,67 +0,0 @@ -# Usage: -# .\create_prompt_role.ps1 -RoleName "my-bedrock-prompt-role" -Profile "my-aws-profile" - -param ( - [Parameter(Mandatory = $true)] - [string]$RoleName, - - [string]$Profile -) - -if (-not $RoleName) { - Write-Host "Error: --role-name is required" - exit 1 -} - -$profileArgs = @() -if ($Profile) { - $profileArgs = @("--profile", $Profile) -} - -# Define the trust policy -$trustPolicy = @" -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Service": "bedrock.amazonaws.com" - }, - "Action": "sts:AssumeRole" - } - ] -} -"@ - -# Write to temporary trust policy file -$tempTrustPolicyFile = "trust-policy-temp.json" -$trustPolicy | Set-Content -Encoding UTF8 $tempTrustPolicyFile - -# Create the IAM role -Write-Host "Creating IAM role '$RoleName' for Bedrock..." -aws iam create-role ` - --role-name $RoleName ` - --assume-role-policy-document file://$tempTrustPolicyFile ` - @profileArgs - -# Attach inline policy (assumes bedrock-prompt-policy.json is in same directory) -Write-Host "Attaching inline policy (BedrockPromptMinimalPolicy)..." -aws iam put-role-policy ` - --role-name $RoleName ` - --policy-name "BedrockPromptMinimalPolicy" ` - --policy-document file://bedrock-prompt-policy.json ` - @profileArgs - -# Get the role ARN -$roleArn = aws iam get-role ` - --role-name $RoleName ` - --query "Role.Arn" ` - --output text ` - @profileArgs - -Write-Host "`nDone. 
Role ARN:" -Write-Host $roleArn - -# Cleanup -Remove-Item $tem diff --git a/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.ps1 b/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.ps1 deleted file mode 100644 index 7560f58f..00000000 --- a/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.ps1 +++ /dev/null @@ -1,246 +0,0 @@ -# Usage: .\setup-graphrag.ps1 [-Profile ] -param( - [string]$Profile = "" -) - -# Build conditional profile args for splatting -$ProfileArgs = @() -if ($Profile) { - $ProfileArgs = @("--profile", $Profile) -} - -function Check-AwsCredentials { - if (-not (aws sts get-caller-identity @ProfileArgs -ErrorAction SilentlyContinue)) { - Write-Host "Error: No valid AWS credentials found" - if ($Profile) { - Write-Host "If using AWS SSO, run: aws sso login --profile $Profile" - Write-Host "If using traditional credentials, run: aws configure --profile $Profile" - } else { - Write-Host "If using AWS SSO, run: aws sso login" - Write-Host "If using traditional credentials, run: aws configure" - } - exit 1 - } -} - -function Get-AccountDetails { - $global:AccountId = aws sts get-caller-identity @ProfileArgs --query Account --output text - if (-not $AccountId) { - Write-Host "Error: Could not determine AWS Account ID" - exit 1 - } - - $global:Region = aws configure get region @ProfileArgs - if (-not $Region) { - Write-Host "Error: Could not determine AWS Region" - exit 1 - } - - $global:CurrentRole = aws sts get-caller-identity @ProfileArgs --query Arn --output text | Select-String -Pattern 'AWSReservedSSO_[^/]+' | ForEach-Object { $_.Matches.Value } -} - -Check-AwsCredentials -Get-AccountDetails - -$ApplicationId = "graphrag-toolkit" -$BucketName = "graphrag-toolkit-$AccountId" -$RoleName = "bedrock-batch-inference-role" -$PolicyName = "bedrock-batch-inference-policy" -$ModelId = "anthropic.claude-v2" -$TableName = "graphrag-toolkit-batch-table" - -# Create S3 bucket -Write-Host "Creating S3 bucket $BucketName..." -if (-not (aws s3api head-bucket --bucket $BucketName @ProfileArgs -ErrorAction SilentlyContinue)) { - if ($Region -eq "us-east-1") { - aws s3api create-bucket --bucket $BucketName --region $Region @ProfileArgs - } else { - aws s3api create-bucket --bucket $BucketName --region $Region --create-bucket-configuration LocationConstraint=$Region @ProfileArgs - } - Write-Host "Bucket created successfully" -} else { - Write-Host "Bucket $BucketName already exists" -} - -# Create DynamoDB table -Write-Host "Creating DynamoDB table $TableName..." -if (-not (aws dynamodb describe-table --table-name $TableName @ProfileArgs -ErrorAction SilentlyContinue)) { - aws dynamodb create-table ` - --table-name $TableName ` - --attribute-definitions ` - AttributeName=collection_id,AttributeType=S ` - AttributeName=completion_date,AttributeType=S ` - AttributeName=reader_type,AttributeType=S ` - --key-schema ` - AttributeName=collection_id,KeyType=HASH ` - AttributeName=completion_date,KeyType=RANGE ` - --billing-mode PAY_PER_REQUEST ` - --global-secondary-indexes "[{`"IndexName`": `"reader_type-index`", `"KeySchema`": [{`"AttributeName`": `"reader_type`", `"KeyType`": `"HASH`"}, {`"AttributeName`": `"completion_date`", `"KeyType`": `"RANGE`"}], `"Projection`": {`"ProjectionType`": `"ALL`"}}]" ` - --region $Region ` - @ProfileArgs - - Write-Host "Waiting for DynamoDB table to become active..." 
- aws dynamodb wait table-exists --table-name $TableName --region $Region @ProfileArgs - Write-Host "DynamoDB table created successfully" -} else { - Write-Host "DynamoDB table $TableName already exists" -} - -# Write IAM policy JSON files -@" -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Service": "bedrock.amazonaws.com" - }, - "Action": "sts:AssumeRole", - "Condition": { - "StringEquals": { - "aws:SourceAccount": "$AccountId" - }, - "ArnEquals": { - "aws:SourceArn": "arn:aws:bedrock:$Region:$AccountId:model-invocation-job/*" - } - } - } - ] -} -"@ | Set-Content -Encoding UTF8 trust-policy.json - -@" -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": ["bedrock:InvokeModel"], - "Resource": "arn:aws:bedrock:${Region}::foundation-model/*" - }, - { - "Effect": "Allow", - "Action": ["s3:GetObject", "s3:ListBucket", "s3:PutObject"], - "Resource": [ - "arn:aws:s3:::$BucketName", - "arn:aws:s3:::$BucketName/*" - ], - "Condition": { - "StringEquals": { - "aws:ResourceAccount": ["$AccountId"] - } - } - }, - { - "Effect": "Allow", - "Action": ["dynamodb:PutItem", "dynamodb:Query", "dynamodb:Scan"], - "Resource": "arn:aws:dynamodb:$Region:$AccountId:table/$TableName", - "Condition": { - "StringEquals": { - "aws:ResourceAccount": ["$AccountId"] - } - } - } - ] -} -"@ | Set-Content -Encoding UTF8 role-permissions-policy.json - -@" -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "bedrock:CreateModelInvocationJob", - "bedrock:GetModelInvocationJob", - "bedrock:ListModelInvocationJobs", - "bedrock:StopModelInvocationJob" - ], - "Resource": [ - "arn:aws:bedrock:$Region::foundation-model/$ModelId", - "arn:aws:bedrock:$Region:$AccountId:model-invocation-job/*" - ] - }, - { - "Effect": "Allow", - "Action": ["iam:PassRole"], - "Resource": "arn:aws:iam::$AccountId:role/$RoleName" - }, - { - "Effect": "Allow", - "Action": ["dynamodb:PutItem", "dynamodb:Query", "dynamodb:Scan"], - "Resource": "arn:aws:dynamodb:$Region:$AccountId:table/$TableName" - } - ] -} -"@ | Set-Content -Encoding UTF8 identity-permissions-policy.json - -# Create IAM role and attach policy -Write-Host "Creating IAM role $RoleName..." 
-if (-not (aws iam get-role --role-name $RoleName @ProfileArgs -ErrorAction SilentlyContinue)) { - aws iam create-role --role-name $RoleName --assume-role-policy-document file://trust-policy.json @ProfileArgs - Write-Host "Role created successfully" -} else { - Write-Host "Role $RoleName already exists" -} - -$PolicyArn = "arn:aws:iam::$AccountId:policy/$PolicyName" -if (-not (aws iam get-policy --policy-arn $PolicyArn @ProfileArgs -ErrorAction SilentlyContinue)) { - aws iam create-policy --policy-name $PolicyName --policy-document file://role-permissions-policy.json @ProfileArgs - Write-Host "Policy created successfully" -} else { - Write-Host "Policy $PolicyName already exists" -} - -aws iam attach-role-policy --role-name $RoleName --policy-arn $PolicyArn @ProfileArgs - -# Create identity policy -$IdentityPolicyName = "bedrock-batch-identity-policy" -$IdentityPolicyArn = "arn:aws:iam::$AccountId:policy/$IdentityPolicyName" -if (-not (aws iam get-policy --policy-arn $IdentityPolicyArn @ProfileArgs -ErrorAction SilentlyContinue)) { - aws iam create-policy --policy-name $IdentityPolicyName --policy-document file://identity-permissions-policy.json @ProfileArgs - Write-Host "Identity policy created successfully" -} else { - Write-Host "Identity policy $IdentityPolicyName already exists" -} - -# Clean up temp files -Remove-Item trust-policy.json, role-permissions-policy.json, identity-permissions-policy.json -Force - -# Upload S3 prompt files for S3PromptProvider (used by notebook 04) -Write-Host "Uploading prompt files to S3..." -$ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path - -python3 -c @" -import json, sys -with open(sys.argv[1]) as f: - data = json.load(f) -print(data['variants'][0]['templateConfiguration']['text']['text'], end='') -"@ "$ScriptDir/system_prompt.json" | aws s3 cp - "s3://$BucketName/prompts/system_prompt.txt" --content-type text/plain --region $Region @ProfileArgs - -python3 -c @" -import json, sys -with open(sys.argv[1]) as f: - data = json.load(f) -print(data['variants'][0]['templateConfiguration']['text']['text'], end='') -"@ "$ScriptDir/user_prompt.json" | aws s3 cp - "s3://$BucketName/prompts/user_prompt.txt" --content-type text/plain --region $Region @ProfileArgs - -Write-Host "Prompt files uploaded to s3://$BucketName/prompts/" - -# Summary -Write-Host "`nSetup complete!" -Write-Host "Bucket: $BucketName" -Write-Host "DynamoDB Table: arn:aws:dynamodb:$Region:$AccountId:table/$TableName" -Write-Host "Role ARN: arn:aws:iam::$AccountId:role/$RoleName" -Write-Host "Policy ARN: $PolicyArn" -Write-Host "Identity Policy ARN: $IdentityPolicyArn" - -if ($CurrentRole) { - Write-Host "`nNOTE: You are using AWS SSO with role: $CurrentRole" - Write-Host "To complete setup, go to IAM Identity Center and attach the identity policy to the Permission Set." -} else { - Write-Host "`nNOTE: You are using traditional IAM credentials." - Write-Host "Ensure the identity policy is attached to your IAM user or role." 
-} diff --git a/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh b/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh index bdca2ba3..0e93226f 100755 --- a/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh +++ b/examples/lexical-graph-local-dev/aws/setup-bedrock-batch.sh @@ -50,7 +50,7 @@ get_account_details APPLICATION_ID="graphrag-toolkit" BUCKET_NAME="graphrag-toolkit-${ACCOUNT_ID}" # Using account ID to ensure uniqueness -ROLE_NAME="bedrock-batch-inference-role" +ROLE_NAME="${BATCH_ROLE_NAME:-bedrock-batch-inference-role}" POLICY_NAME="bedrock-batch-inference-policy" MODEL_ID="anthropic.claude-v2" # Example model ID, adjust as needed TABLE_NAME="graphrag-toolkit-batch-table" From 774484d1b9aef1d8107f19cbb40bc28f744b0782 Mon Sep 17 00:00:00 2001 From: Mykola Pereyma Date: Fri, 1 May 2026 18:45:45 -0700 Subject: [PATCH 10/10] docs: update README, docs, and test scripts - README: fix .env path, Bolt port, S3 notebook list, AWS requirements, test config table (SKIP_CUDA -> actual flags), add automated testing section - development_mode.md: fix mount path, Jupyter port, notebook directory - docker_services.md: add dev mode ports - troubleshooting.md: update docker compose v2 syntax - Rename scripts/ to tests/ for clarity - Test script: fix compose file refs, add --reset default, standard mode only --- examples/lexical-graph-local-dev/README.md | 84 ++++++++++--------- .../docs/development_mode.md | 26 +++--- .../docs/docker_services.md | 14 ++-- .../docs/troubleshooting.md | 20 ++--- .../{scripts => tests}/run_notebooks.py | 9 +- .../test-local-dev-notebooks.sh | 39 ++++----- 6 files changed, 99 insertions(+), 93 deletions(-) rename examples/lexical-graph-local-dev/{scripts => tests}/run_notebooks.py (95%) rename examples/lexical-graph-local-dev/{scripts => tests}/test-local-dev-notebooks.sh (89%) diff --git a/examples/lexical-graph-local-dev/README.md b/examples/lexical-graph-local-dev/README.md index fd495af1..bb1f2f92 100644 --- a/examples/lexical-graph-local-dev/README.md +++ b/examples/lexical-graph-local-dev/README.md @@ -17,6 +17,8 @@ This example provides a complete local development environment for the GraphRAG ## Quick Start +> All commands below should be executed from the `lexical-graph-local-dev/` directory. + ### 1. AWS Prerequisites Before starting, ensure you have: @@ -28,38 +30,31 @@ Before starting, ensure you have: ### 2. Configure Environment ```bash -cd notebooks -cp .env.template .env +cp notebooks/.env.template notebooks/.env ``` -Review `.env` — defaults work for local Docker services. Set `S3_BUCKET_NAME` if using S3 features (notebooks 03, 05). +Review `notebooks/.env` — defaults work for local Docker services. Set `S3_BUCKET_NAME` if using S3 features (notebooks 03, 04, 05). ### 3. Start the Environment -**Standard (x86/Intel):** +**Standard:** ```bash cd docker ./start-containers.sh ``` -**Mac/ARM (Apple Silicon):** -```bash -cd docker -./start-containers.sh --mac -``` - **Development Mode (Hot-Code-Injection):** ```bash cd docker -./start-containers.sh --dev --mac # Enable live code editing +./start-containers.sh --dev ``` ### 4. Access Jupyter Lab -Open your browser to: **http://localhost:8889** +Open your browser to: **http://localhost:8889** (or **http://localhost:8890** for dev mode) - No password required -- Navigate to the `work` folder to find notebooks +- Navigate to the `notebooks` folder to find notebooks - All dependencies are pre-installed ### 5. 
Run the Setup Notebook @@ -73,13 +68,11 @@ Start with `00-Setup.ipynb` to configure your environment and verify all service | Script | Platform | Description | |--------|----------|-------------| | `start-containers.sh` | Unix/Linux/Mac | Main startup script with all options | -| `start-containers.ps1` | Windows PowerShell | PowerShell version with same functionality | ### Script Options | Flag | Description | |------|-------------| -| `--mac` | Use ARM/Apple Silicon optimized containers | | `--dev` | Enable development mode with hot-code-injection | | `--reset` | Reset all data and rebuild containers | @@ -89,35 +82,32 @@ Start with `00-Setup.ipynb` to configure your environment and verify all service # Standard startup ./start-containers.sh -# Apple Silicon Mac -./start-containers.sh --mac - # Development mode with hot-reload -./start-containers.sh --dev --mac +./start-containers.sh --dev # Reset everything and start fresh -./start-containers.sh --reset --mac +./start-containers.sh --reset -# Windows PowerShell -.\start-containers.ps1 -Mac -Dev +# Reset with dev mode +./start-containers.sh --dev --reset ``` ## Services After startup, the following services are available: -| Service | URL | Credentials | Purpose | -|---------|-----|-------------|---------| -| **Jupyter Lab** | http://localhost:8889 | None required | Interactive development | -| **Neo4j Browser** | http://localhost:7476 | neo4j/password | Graph database management | -| **PostgreSQL** | localhost:5432 | postgres/password | Vector storage | +| Service | Standard URL | Dev URL | Credentials | Purpose | +|---------|-------------|---------|-------------|---------| +| **Jupyter Lab** | http://localhost:8889 | http://localhost:8890 | None required | Interactive development | +| **Neo4j Browser** | http://localhost:7476 | http://localhost:7477 | neo4j/password | Graph database management | +| **PostgreSQL** | localhost:5432 | localhost:5434 | postgres/password | Vector storage | ## Development Mode Development mode enables hot-code-injection for active lexical-graph development: ```bash -./start-containers.sh --dev --mac +./start-containers.sh --dev ``` **Features:** @@ -141,7 +131,7 @@ Development mode enables hot-code-injection for active lexical-graph development **To reset all data:** ```bash -./start-containers.sh --reset --mac +./start-containers.sh --reset ``` ## Database Configuration @@ -202,7 +192,7 @@ docs = reader.read('s3://my-bucket/documents/file.pdf') ## Environment Variables -Key environment variables (configured in `docker/.env`): +Key environment variables (configured in `notebooks/.env`): ```bash # Database connections (Docker internal names) @@ -218,6 +208,28 @@ EMBEDDINGS_MODEL="cohere.embed-english-v3" EXTRACTION_MODEL="us.anthropic.claude-sonnet-4-6" ``` +## Automated Testing + +Run all notebooks end-to-end with a single command: + +```bash +bash tests/test-local-dev-notebooks.sh +``` + +This handles the full lifecycle: environment setup, Docker containers, notebook execution, reporting, and cleanup. 
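+
+For example, to include the GitHub reader cells and keep all resources around for inspection afterwards, override the defaults inline (a sketch using the variables listed below; the GitHub cells also assume `GITHUB_TOKEN` is set in `notebooks/.env`):
+
+```bash
+SKIP_GITHUB=false CLEANUP=false bash tests/test-local-dev-notebooks.sh
+```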
+ +Configuration options (environment variables): + +| Variable | Default | Description | +|----------|---------|-------------| +| `SKIP_GITHUB` | `true` | Skip GitHub reader cells (requires token) | +| `SKIP_PPTX` | `true` | Skip PPTX reader cells (slow, requires torch) | +| `SKIP_LONG_RUNNING` | `true` | Skip JSON/Wikipedia extract_and_build cells | +| `CLEANUP` | `true` | Clean up all resources after run | +| `REPORT_DIR` | `test-results/` | Output directory for reports | + +Reports are generated in `test-results/` (execution_report.json + execution_report.md). + ## Troubleshooting ### Common Issues @@ -225,7 +237,7 @@ EXTRACTION_MODEL="us.anthropic.claude-sonnet-4-6" **Port conflicts:** - Jupyter: 8889 (not 8888) - Neo4j HTTP: 7476 (not 7474) -- Neo4j Bolt: 7687 +- Neo4j Bolt: 7689 (not 7687) - PostgreSQL: 5432 **Container networking:** @@ -242,10 +254,10 @@ If you encounter persistent issues: ```bash # Stop and remove everything -docker-compose down -v +docker compose down -v # Start fresh -./start-containers.sh --reset --mac +./start-containers.sh --reset ``` ## AWS Foundation Model Access (Optional) @@ -276,8 +288,4 @@ If you have existing FalkorDB configurations: GraphStoreFactory.register(Neo4jGraphStoreFactory) ``` -3. **Migrate data** if needed (contact support for migration tools) - ---- - -This local development environment provides everything needed to develop, test, and experiment with GraphRAG lexical-graph functionality without requiring AWS infrastructure. \ No newline at end of file +3. **Migrate data** if needed (contact support for migration tools) \ No newline at end of file diff --git a/examples/lexical-graph-local-dev/docs/development_mode.md b/examples/lexical-graph-local-dev/docs/development_mode.md index b08a39be..df0eb446 100644 --- a/examples/lexical-graph-local-dev/docs/development_mode.md +++ b/examples/lexical-graph-local-dev/docs/development_mode.md @@ -21,13 +21,7 @@ cd docker ### Apple Silicon Mac ```bash cd docker -./start-containers.sh --dev --mac -``` - -### Windows PowerShell -```powershell -cd docker -.\start-containers.ps1 -Dev -Mac +./start-containers.sh --dev ``` --- @@ -35,7 +29,7 @@ cd docker ## What Development Mode Does ### 1. Source Code Mounting -- Mounts `../../../lexical-graph` to `/home/jovyan/lexical-graph-src` in Jupyter container +- Mounts `../../../lexical-graph` to `/home/jovyan/lexical-graph` in Jupyter container - Provides direct access to lexical-graph source code - Changes to source files are immediately visible @@ -55,11 +49,11 @@ cd docker ### 1. Start Development Environment ```bash -./start-containers.sh --dev --mac +./start-containers.sh --dev ``` ### 2. Access Jupyter Lab -Open http://localhost:8889 in your browser +Open http://localhost:8890 in your browser ### 3. Run Setup Notebook Execute `00-Setup.ipynb` which will: @@ -71,7 +65,7 @@ Execute `00-Setup.ipynb` which will: ### 4. 
Verify Development Mode ```python import os -dev_mode = os.path.exists('/home/jovyan/lexical-graph-src') +dev_mode = os.path.exists('/home/jovyan/lexical-graph') print(f"Development mode: {dev_mode}") # Check if auto-reload is active @@ -103,8 +97,8 @@ lexical-graph-local-dev/ In Jupyter container: ``` /home/jovyan/ -├── work/ # notebooks/ mounted here -└── lexical-graph-src/ # lexical-graph/ mounted here (dev mode only) +├── notebooks/ # notebooks/ mounted here +└── lexical-graph/ # lexical-graph/ mounted here (dev mode only) ├── src/ │ └── graphrag_toolkit/ └── pyproject.toml @@ -187,13 +181,13 @@ ls -la ../../../lexical-graph # Check installation mode import graphrag_toolkit print(graphrag_toolkit.__file__) -# Should show path to /home/jovyan/lexical-graph-src/... +# Should show path to /home/jovyan/lexical-graph/... ``` ### Issue: Import Errors ```python # Reinstall in editable mode -!pip install -e /home/jovyan/lexical-graph-src +!pip install -e /home/jovyan/lexical-graph # Restart kernel after installation ``` @@ -254,7 +248,7 @@ Use proper shutdown to avoid container state issues: ## Contributing Workflow 1. **Fork and Clone**: Fork graphrag-toolkit, clone locally -2. **Start Dev Mode**: `./start-containers.sh --dev --mac` +2. **Start Dev Mode**: `./start-containers.sh --dev` 3. **Make Changes**: Edit source code in your IDE 4. **Test in Jupyter**: Verify changes work in notebooks 5. **Commit and Push**: Standard git workflow diff --git a/examples/lexical-graph-local-dev/docs/docker_services.md b/examples/lexical-graph-local-dev/docs/docker_services.md index 80966e2c..a297d2a1 100644 --- a/examples/lexical-graph-local-dev/docs/docker_services.md +++ b/examples/lexical-graph-local-dev/docs/docker_services.md @@ -28,7 +28,7 @@ This document describes the services defined in the `docker-compose.yml` file us - **Environment Variables**: - `JUPYTER_ENABLE_LAB`: Enables Jupyter Lab interface - **Volumes**: - - `../notebooks:/home/jovyan/work`: Notebook files + - `../notebooks:/home/jovyan/notebooks`: Notebook files - `~/.aws:/home/jovyan/.aws`: AWS credentials - **Network**: Connected to `graphrag_local_network` - **Depends On**: `pgvector-local`, `neo4j-local` @@ -59,7 +59,7 @@ The `docker-compose-dev.yml` provides a development variant with hot-code-inject | Jupyter port | 8889 | 8890 | | PostgreSQL port | 5432 | 5434 | | Jupyter Dockerfile | `jupyter/Dockerfile` (full) | `jupyter/Dockerfile.dev` (minimal) | -| Notebook mount | `/home/jovyan/work` | `/home/jovyan/notebooks` | +| Notebook mount | `/home/jovyan/notebooks` | `/home/jovyan/notebooks` | | Source mounts | None | lexical-graph, lexical-graph-contrib | Start dev mode with: `./start-containers.sh --dev` @@ -107,8 +107,8 @@ All services use Docker volumes for data persistence. 
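As a quick check before wiping anything, you can list the named volumes the compose files create (a sketch; the exact names follow the volume definitions in the compose files, e.g. `neo4j_local_data`, `pgvector_local_data`):

```bash
docker volume ls --filter name=local
```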
To reset all data: After startup, services are available at: -| Service | URL | Credentials | Purpose | -|---------|-----|-------------|---------| -| **Jupyter Lab** | http://localhost:8889 | None required | Interactive development | -| **Neo4j Browser** | http://localhost:7476 | neo4j/password | Graph database management | -| **PostgreSQL** | localhost:5432 | postgres/password | Vector database | +| Service | Standard URL | Dev URL | Credentials | Purpose | +|---------|-------------|---------|-------------|---------| +| **Jupyter Lab** | http://localhost:8889 | http://localhost:8890 | None required | Interactive development | +| **Neo4j Browser** | http://localhost:7476 | http://localhost:7477 | neo4j/password | Graph database management | +| **PostgreSQL** | localhost:5432 | localhost:5434 | postgres/password | Vector database | diff --git a/examples/lexical-graph-local-dev/docs/troubleshooting.md b/examples/lexical-graph-local-dev/docs/troubleshooting.md index 96fc9058..395f002e 100644 --- a/examples/lexical-graph-local-dev/docs/troubleshooting.md +++ b/examples/lexical-graph-local-dev/docs/troubleshooting.md @@ -40,7 +40,7 @@ docker volume prune -f # Docker Desktop → Restart # Try again -./start-containers.sh --mac +./start-containers.sh ``` ### Volume and Data Issues @@ -56,15 +56,15 @@ Neo4j database empty after restart docker volume ls | grep neo4j # Ensure proper shutdown -docker-compose down # Don't use -v flag +docker compose down # Don't use -v flag # Restart normally -./start-containers.sh --mac +./start-containers.sh ``` **Issue: Permission denied errors** ``` -Permission denied: '/home/jovyan/work' +Permission denied: '/home/jovyan/notebooks' ``` **Solution:** @@ -73,7 +73,7 @@ Permission denied: '/home/jovyan/work' sudo chown -R $USER:$USER notebooks/ # Rebuild containers -./start-containers.sh --reset --mac +./start-containers.sh --reset ``` --- @@ -95,7 +95,7 @@ print(dev_mode) # False ls -la ../../../lexical-graph # Should exist # Start with --dev flag -./start-containers.sh --dev --mac +./start-containers.sh --dev ``` ### Hot-Reload Not Working @@ -443,7 +443,7 @@ When all else fails, perform a complete reset: ```bash # 1. Stop everything -docker-compose down -v --remove-orphans +docker compose down -v --remove-orphans # 2. Clean up Docker docker system prune -f @@ -458,7 +458,7 @@ rm -rf notebooks/extracted/ rm -rf notebooks/output/ # 5. 
Restart fresh -./start-containers.sh --reset --mac +./start-containers.sh --reset ``` ### Selective Reset @@ -472,7 +472,7 @@ docker rm neo4j-local pgvector-local docker volume rm neo4j_local_data pgvector_local_data # Restart databases -docker-compose up -d neo4j-local pgvector-local +docker compose up -d neo4j-local pgvector-local ``` --- @@ -491,7 +491,7 @@ docker logs jupyter-local > jupyter.log # System information docker version > system_info.txt -docker-compose version >> system_info.txt +docker compose version >> system_info.txt uname -a >> system_info.txt ``` diff --git a/examples/lexical-graph-local-dev/scripts/run_notebooks.py b/examples/lexical-graph-local-dev/tests/run_notebooks.py similarity index 95% rename from examples/lexical-graph-local-dev/scripts/run_notebooks.py rename to examples/lexical-graph-local-dev/tests/run_notebooks.py index 88d41502..d16fc6ba 100644 --- a/examples/lexical-graph-local-dev/scripts/run_notebooks.py +++ b/examples/lexical-graph-local-dev/tests/run_notebooks.py @@ -42,7 +42,10 @@ def load_env(env_path): line = line.strip() if line and not line.startswith("#") and "=" in line: key, _, value = line.partition("=") - os.environ[key.strip()] = value.strip() + value = value.strip() + if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'): + value = value[1:-1] + os.environ[key.strip()] = value def extract_output(cell): @@ -145,8 +148,8 @@ def write_markdown_report(report, path): def main(): parser = argparse.ArgumentParser(description="Run local-dev notebooks") - parser.add_argument("--work-dir", default="/home/jovyan/work") - parser.add_argument("--output-dir", default="/home/jovyan/work") + parser.add_argument("--work-dir", default="/home/jovyan/notebooks") + parser.add_argument("--output-dir", default="/home/jovyan/notebooks") parser.add_argument("--skip-github", default="true", choices=["true", "false"]) parser.add_argument("--skip-pptx", default="true", choices=["true", "false"]) parser.add_argument("--skip-long-running", default="true", choices=["true", "false"]) diff --git a/examples/lexical-graph-local-dev/scripts/test-local-dev-notebooks.sh b/examples/lexical-graph-local-dev/tests/test-local-dev-notebooks.sh similarity index 89% rename from examples/lexical-graph-local-dev/scripts/test-local-dev-notebooks.sh rename to examples/lexical-graph-local-dev/tests/test-local-dev-notebooks.sh index 23f62886..3e76ef50 100755 --- a/examples/lexical-graph-local-dev/scripts/test-local-dev-notebooks.sh +++ b/examples/lexical-graph-local-dev/tests/test-local-dev-notebooks.sh @@ -13,14 +13,14 @@ PROJECT_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" NOTEBOOKS_DIR="$PROJECT_DIR/notebooks" DOCKER_DIR="$PROJECT_DIR/docker" AWS_DIR="$PROJECT_DIR/aws" -REPORT_DIR="${REPORT_DIR:-$PROJECT_DIR/test-reports}" +REPORT_DIR="${REPORT_DIR:-$PROJECT_DIR/test-results}" # Configurable flags SKIP_GITHUB="${SKIP_GITHUB:-true}" SKIP_PPTX="${SKIP_PPTX:-true}" SKIP_LONG_RUNNING="${SKIP_LONG_RUNNING:-true}" CLEANUP="${CLEANUP:-true}" -DOCKER_MODE="${DOCKER_MODE:-standard}" +DOCKER_MODE="standard" # State tracking for cleanup AWS_ACCOUNT="" @@ -57,17 +57,11 @@ detect_platform() { local arch arch=$(uname -m) if [[ "$arch" == "arm64" || "$arch" == "aarch64" ]]; then - COMPOSE_FILE="docker-compose.arm.yml" - DOCKER_FLAGS="--mac" - ok "ARM platform detected (compose: $COMPOSE_FILE)" + ok "ARM platform detected" else - COMPOSE_FILE="docker-compose.yml" - DOCKER_FLAGS="" - ok "x86 platform detected (compose: $COMPOSE_FILE)" - fi - if [[ "$DOCKER_MODE" == "dev" ]]; then - DOCKER_FLAGS="$DOCKER_FLAGS --dev" + ok "x86 platform detected" fi + DOCKER_FLAGS="--reset" } # ============================================================================= @@ -161,6 +155,11 @@ start_docker() { log "Starting Docker containers ($DOCKER_MODE mode)..." timer_start + # Container names (standard mode) + JUPYTER_CONTAINER="jupyter-local" + NEO4J_CONTAINER="neo4j-local" + PGVECTOR_CONTAINER="pgvector-local" + (cd "$DOCKER_DIR" && ./start-containers.sh $DOCKER_FLAGS) DOCKER_STARTED=true @@ -173,10 +172,10 @@ wait_for_containers() { local waited=0 while [[ $waited -lt $max_wait ]]; do local count - count=$(docker ps --filter "name=neo4j-local" --filter "name=pgvector-local" --filter "name=jupyter-local" --format "{{.Names}}" | wc -l | tr -d ' ') + count=$(docker ps --filter "name=$NEO4J_CONTAINER" --filter "name=$PGVECTOR_CONTAINER" --filter "name=$JUPYTER_CONTAINER" --format "{{.Names}}" | wc -l | tr -d ' ') if [[ "$count" -ge 3 ]]; then # Also verify jupyter is responsive - if docker exec jupyter-local python3 -c "print('ready')" 2>/dev/null; then + if docker exec "$JUPYTER_CONTAINER" python3 -c "print('ready')" 2>/dev/null; then return 0 fi fi @@ -195,20 +194,22 @@ run_notebooks() { timer_start mkdir -p "$REPORT_DIR" + JUPYTER_WORK_DIR="/home/jovyan/notebooks" + # Copy runner script into container - docker cp "$SCRIPT_DIR/run_notebooks.py" jupyter-local:/home/jovyan/work/run_notebooks.py + docker cp "$SCRIPT_DIR/run_notebooks.py" "$JUPYTER_CONTAINER:$JUPYTER_WORK_DIR/run_notebooks.py" # Execute - docker exec jupyter-local \ - python3 /home/jovyan/work/run_notebooks.py \ + docker exec "$JUPYTER_CONTAINER" \ + python3 "$JUPYTER_WORK_DIR/run_notebooks.py" \ --skip-github="$SKIP_GITHUB" \ --skip-pptx="$SKIP_PPTX" \ --skip-long-running="$SKIP_LONG_RUNNING" \ || NOTEBOOK_EXIT_CODE=$? # Collect reports - docker cp jupyter-local:/home/jovyan/work/execution_report.json "$REPORT_DIR/" 2>/dev/null || true - docker cp jupyter-local:/home/jovyan/work/execution_report.md "$REPORT_DIR/" 2>/dev/null || true + docker cp "$JUPYTER_CONTAINER:$JUPYTER_WORK_DIR/execution_report.json" "$REPORT_DIR/" 2>/dev/null || true + docker cp "$JUPYTER_CONTAINER:$JUPYTER_WORK_DIR/execution_report.md" "$REPORT_DIR/" 2>/dev/null || true if [[ $NOTEBOOK_EXIT_CODE -eq 0 ]]; then ok "All notebooks passed [$(timer_end)]" @@ -229,7 +230,7 @@ cleanup() { # Docker if [[ "$DOCKER_STARTED" == "true" ]]; then - (cd "$DOCKER_DIR" && docker compose -f "$COMPOSE_FILE" down -v 2>/dev/null) || true + (cd "$DOCKER_DIR" && docker compose -f "docker-compose.yml" down -v 2>/dev/null) || true ok "Docker containers removed" fi
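    # Note: `down -v` also removes the named Neo4j and pgvector volumes declared
    # in the compose file, so the next test run starts from an empty graph store
    # and vector store.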