ryanznie · ryanznie · Jan 15, 2026 · Jan 15, 2026 · Jan 15, 2026 · Jan 15, 2026
diff --git a/.dockerignore b/.dockerignore
@@ -46,13 +46,11 @@ data/
 !data/.gitignore
 
 # Models (will be mounted as volume or downloaded)
-# Keep the model directory structure but exclude large files
-models/**/*.bin
-models/**/*.safetensors
-models/**/*.onnx
+models/
 
 # Logs
 *.log
+wandb/
 
 # OS
 .DS_Store
@@ -62,6 +60,8 @@ Thumbs.db
 Dockerfile
 docker-compose.yml
 .dockerignore
+triton_model_repo/
+artifacts/
 
 # CI/CD
 .github/

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -29,7 +29,7 @@ jobs:
       run: |
         uv venv
         source .venv/bin/activate
-        uv pip install -e .
+        uv pip install -e ".[dev]"
 
     - name: Run tests with pytest
       run: |
@@ -72,7 +72,7 @@ jobs:
       run: |
         uv venv
         source .venv/bin/activate
-        uv pip install -e .
+        uv pip install -e ".[dev]"
         uv pip install ruff
 
     - name: Check code formatting with ruff

diff --git a/Dockerfile b/Dockerfile
@@ -19,16 +19,12 @@ COPY pyproject.toml uv.lock* ./
 
 # Install Python dependencies directly (not editable to avoid README requirement)
 # Note: We'll use CPU-only PyTorch for Docker to reduce image size
+# Split the install into separate RUN steps to avoid failures when committing one huge layer
+RUN uv pip install --system --no-cache \
+    torch --index-url https://download.pytorch.org/whl/cpu
+
 RUN uv pip install --system --no-cache \
-    torch --index-url https://download.pytorch.org/whl/cpu && \
-    uv pip install --system --no-cache \
     transformers \
-    peft \
-    accelerate \
-    datasets \
-    evaluate \
-    scikit-learn \
-    seqeval \
     fastapi \
     uvicorn[standard] \
     gradio \
@@ -37,7 +33,8 @@ RUN uv pip install --system --no-cache \
     pandas \
     tqdm \
     onnx \
-    onnxruntime
+    onnxruntime \
+    tritonclient[http]
 
 # Copy application code
 COPY . .

diff --git a/README.md b/README.md
@@ -24,6 +24,7 @@ invoice-ner/
 ├── pyproject.toml              # Python project configuration & dependencies
 ├── setup.sh                    # Development environment setup script
 ├── .env.example                # Environment variables template
+├── uv.lock                     # Lock file for reproducible installs
 │
 ├── data/                       # Dataset and labeling tools
 │   ├── app.py                  # Streamlit labeling application
@@ -35,28 +36,36 @@ invoice-ner/
 │   └── test_labels.json        # Test data labels
 │
 ├── models/                     # Model files and checkpoints
+│   ├── artifacts/              # Exported models (ONNX, etc.)
 │   └── layoutlmv3-lora-invoice-number/  # Fine-tuned LoRA adapter
 │       ├── adapter_config.json
 │       ├── adapter_model.safetensors
 │       └── ...
 │
+├── triton_model_repo/          # Triton Inference Server model repository
+│   └── ...
+│
 ├── notebooks/                  # Jupyter notebooks for experimentation
 │   ├── 01_heuristics.ipynb     # Heuristic-based extraction
 │   ├── 02_labeling.ipynb       # Data labeling analysis
 │   ├── 03_inference.ipynb      # Model inference testing
-│   └── 04_postprocess.ipynb    # Post-processing experiments
+│   ├── 04_postprocess.ipynb    # Post-processing experiments
+│   └── 05_evaluations.ipynb    # Evaluation metrics and analysis
 │
 ├── benchmarks/                 # Benchmarking suite
 │   ├── models/                 # Model wrappers (Gemini, ONNX, etc.)
+│   ├── benchmark_results/      # Benchmark run results
 │   ├── benchmark.py            # Main benchmark script
 │   └── README.md               # Benchmarking documentation
 │
 ├── scripts/                    # Utility scripts
 │   ├── preprocess.py           # Data preprocessing utilities
-│   └── train.py
+│   ├── export_to_onnx.py       # ONNX export script
+│   ├── setup_triton_repo.py    # Triton repo setup script
+│   └── train.py                # Model training script
 │
 ├── src/                        # Core application modules
-│   ├── __init__.py              # Package initialization
+│   ├── __init__.py
 │   ├── api.py                   # FastAPI endpoints
 │   ├── gradio_ui.py             # Gradio interface
 │   ├── inference.py             # Model inference logic

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -17,8 +17,12 @@ services:
       - .env
     environment:
       - PYTHONUNBUFFERED=1
-      - DEVICE=${DEVICE:-cpu}
+      - DEVICE=cpu
       - LOG_LEVEL=${LOG_LEVEL:-info}
+      - INFERENCE_BACKEND=triton
+      - TRITON_URL=tritonserver:8000
+    depends_on:
+      - tritonserver
     restart: unless-stopped
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:7860/health"]
@@ -36,6 +40,22 @@ services:
           cpus: '${DOCKER_CPU_RESERVATION:-2}'
           memory: ${DOCKER_MEMORY_RESERVATION:-4G}
 
+  tritonserver:
+    image: nvcr.io/nvidia/tritonserver:23.10-py3
+    container_name: triton-server
+    ports:
+      - "8000:8000"
+      - "8001:8001"
+      - "8002:8002"
+    volumes:
+      - ./triton_model_repo:/models
+    command: ["tritonserver", "--model-repository=/models"]
+    deploy:
+      resources:
+        limits:
+          cpus: '${TRITON_CPU_LIMIT:-4}'
+          memory: ${TRITON_MEMORY_LIMIT:-8G}
+
 networks:
   default:
     name: invoice-ner-network
diff --git a/docs/DEV_SETUP.md b/docs/DEV_SETUP.md
@@ -279,17 +279,34 @@ pre-commit autoupdate
 
 ### Docker Build
 
+The Dockerfile has been optimized for inference, excluding heavy training dependencies (`peft`, `datasets`, etc.) to reduce image size.
+
 ```bash
 # Build image
 docker build -t invoice-ner:latest .
 
 # Build with specific tag
 docker build -t invoice-ner:v1.0.0 .
 
-# Build with no cache
+# Build with no cache (useful if you changed dependencies)
 docker build --no-cache -t invoice-ner:latest .
 ```
 
+### Running with Docker Compose
+
+Docker Compose is the recommended way to run the application, as it starts the model server (Triton) and the application service together.
+
+```bash
+# Start all services (detached mode)
+docker-compose up -d --build
+
+# View logs
+docker-compose logs -f
+
+# Stop all services
+docker-compose down
+```
+
 ### Production Deployment Considerations
 
 1. **Use a reverse proxy** (nginx, Traefik) for SSL/TLS termination
@@ -309,7 +326,7 @@ FROM python:3.10-slim as builder
 WORKDIR /app
 COPY pyproject.toml uv.lock ./
 RUN pip install uv && \
-    uv pip install --system --no-cache torch && \
+    uv pip install --system --no-cache torch --index-url https://download.pytorch.org/whl/cpu && \
     uv pip install --system --no-cache -e .
 
 # Runtime stage

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,21 +9,11 @@ license = { text = "MIT" }
 keywords = ["invoice", "ner", "layoutlmv3", "document-ai", "ocr"]
 
 dependencies = [
-    "dvc>=3.63.0",
-    "ipykernel>=6.30.1",
     "pandas>=2.3.2",
     "pillow>=11.3.0",
-    "pre-commit>=4.3.0",
-    "streamlit>=1.49.1",
     "tqdm>=4.67.1",
     "torch",
     "transformers",
-    "datasets",
-    "peft",
-    "accelerate",
-    "evaluate",
-    "scikit-learn",
-    "seqeval>=1.2.2",
     "numpy>=1.24.0",
     # Web app dependencies
     "fastapi>=0.104.0",
@@ -36,6 +26,20 @@ dependencies = [
     "onnxruntime-tools>=1.7.0",
     "onnxconverter-common>=1.14.0",
     "tritonclient[http]>=2.41.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "dvc>=3.63.0",
+    "ipykernel>=6.30.1",
+    "pre-commit>=4.3.0",
+    "streamlit>=1.49.1",
+    "datasets",
+    "peft",
+    "accelerate",
+    "evaluate",
+    "scikit-learn",
+    "seqeval>=1.2.2",
     # Testing dependencies
     "pytest>=8.0.0",
     "pytest-cov>=4.1.0",