Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,7 @@ test.*
# Frontend build output
frontend/dist/
frontend/node_modules/

# data dir for training, validation, and testing
data/
config.toml
2 changes: 2 additions & 0 deletions overlays/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
fastapi
fastapi-cli
httpx
huggingface-hub
mypy
orjson
polars
Expand All @@ -41,6 +42,7 @@
sqlalchemy
tenacity
textual
tiktoken
tinytuya
typer
websockets
Expand Down
11 changes: 11 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ dependencies = [
"alembic",
"apprise",
"apscheduler",
"huggingface-hub",
"httpx",
"python-multipart",
"polars",
Expand All @@ -26,6 +27,11 @@ dependencies = [
[project.scripts]
database = "python.database_cli:app"
van-inventory = "python.van_inventory.main:serve"
prompt-bench = "python.prompt_bench.main:cli"
prompt-bench-download = "python.prompt_bench.downloader:cli"
finetune = "python.prompt_bench.finetune:cli"
finetune-container = "python.prompt_bench.finetune_container:cli"
build-finetune-dataset = "python.prompt_bench.tools.build_finetune_dataset:cli"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify the actual module path that defines `cli()` for build_finetune_dataset.

set -euo pipefail

echo "== entrypoint in pyproject.toml =="
rg -n 'build-finetune-dataset' pyproject.toml -C2

echo
echo "== candidate files =="
fd -a build_finetune_dataset.py

echo
echo "== cli() definitions =="
rg -n --glob '**/build_finetune_dataset.py' '^\s*def\s+cli\s*\(' -C2

Repository: RichieCahill/dotfiles

Length of output: 790


Fix broken console entrypoint for build-finetune-dataset.

Line 34 declares build-finetune-dataset = "python.prompt_bench.build_finetune_dataset:cli", but the cli() function exists only at python/prompt_bench/tools/build_finetune_dataset.py (module path python.prompt_bench.tools.build_finetune_dataset). No shim module exists at python/prompt_bench/build_finetune_dataset.py, so this entrypoint will fail at runtime. Either create a shim that re-exports the cli, or update the entrypoint to reference python.prompt_bench.tools.build_finetune_dataset:cli.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pyproject.toml` at line 34, The console entrypoint "build-finetune-dataset"
points to python.prompt_bench.build_finetune_dataset:cli but the actual cli()
lives in python.prompt_bench.tools.build_finetune_dataset; fix by either
creating a shim module python/prompt_bench/build_finetune_dataset.py that
imports and re-exports cli from python.prompt_bench.tools.build_finetune_dataset
(e.g., from .tools.build_finetune_dataset import cli) or by editing
pyproject.toml to change the entrypoint to
python.prompt_bench.tools.build_finetune_dataset:cli so the
build-finetune-dataset entry references the actual module containing cli().


[dependency-groups]
dev = [
Expand Down Expand Up @@ -81,6 +87,11 @@ lint.ignore = [
"python/eval_warnings/**" = [
"S607", # (perm) gh and git are expected on PATH in the runner environment
]
"python/prompt_bench/**" = [
"FBT002", # (perm) typer requires boolean defaults for --flag/--no-flag options
"PLR0913", # (perm) typer CLIs naturally have many parameters
"S607", # (perm) docker and nvidia-smi are expected on PATH
]
"python/alembic/**" = [
"INP001", # (perm) this creates LSP issues for alembic
]
Expand Down
25 changes: 25 additions & 0 deletions python/prompt_bench/Dockerfile.finetune
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Unsloth fine-tuning container for Qwen 3.5 4B on RTX 3090.
#
# Build:
# docker build -f python/prompt_bench/Dockerfile.finetune -t bill-finetune .
#
# Run:
# docker run --rm --device=nvidia.com/gpu=all --ipc=host \
# -v $(pwd)/output:/workspace/output \
# -v $(pwd)/output/finetune_dataset.jsonl:/workspace/dataset.jsonl:ro \
# -v /zfs/models/hf:/models \
# bill-finetune \
# --dataset /workspace/dataset.jsonl \
# --output-dir /workspace/output/qwen-bill-summarizer

FROM ghcr.io/unslothai/unsloth:latest

RUN pip install --no-cache-dir typer

# Security hardening: run the training process as a non-root user.
# A fixed high UID keeps ownership of bind-mounted output dirs predictable.
RUN useradd --create-home --uid 10001 appuser \
    && mkdir -p /workspace \
    && chown -R appuser:appuser /workspace

WORKDIR /workspace
COPY --chown=appuser:appuser python/prompt_bench/finetune.py python/prompt_bench/finetune.py
COPY --chown=appuser:appuser python/prompt_bench/summarization_prompts.py python/prompt_bench/summarization_prompts.py
COPY --chown=appuser:appuser python/prompt_bench/__init__.py python/prompt_bench/__init__.py
COPY --chown=appuser:appuser python/__init__.py python/__init__.py
USER appuser

ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"]
Comment on lines +15 to +25
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Run the container as non-root.

There is no USER directive, so the container executes as root. This is a security hardening gap and already matches the static-analysis finding.

💡 Proposed fix
 FROM ghcr.io/unslothai/unsloth:latest
 
 RUN pip install --no-cache-dir typer
+RUN useradd --create-home --uid 10001 appuser && mkdir -p /workspace && chown -R appuser:appuser /workspace
 
 WORKDIR /workspace
 COPY python/prompt_bench/finetune.py python/prompt_bench/finetune.py
 COPY python/prompt_bench/summarization_prompts.py python/prompt_bench/summarization_prompts.py
 COPY python/prompt_bench/__init__.py python/prompt_bench/__init__.py
 COPY python/__init__.py python/__init__.py
+USER appuser
 
 ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"]
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
FROM ghcr.io/unslothai/unsloth:latest
RUN pip install --no-cache-dir typer
WORKDIR /workspace
COPY python/prompt_bench/finetune.py python/prompt_bench/finetune.py
COPY python/prompt_bench/summarization_prompts.py python/prompt_bench/summarization_prompts.py
COPY python/prompt_bench/__init__.py python/prompt_bench/__init__.py
COPY python/__init__.py python/__init__.py
ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"]
FROM ghcr.io/unslothai/unsloth:latest
RUN pip install --no-cache-dir typer
RUN useradd --create-home --uid 10001 appuser && mkdir -p /workspace && chown -R appuser:appuser /workspace
WORKDIR /workspace
COPY python/prompt_bench/finetune.py python/prompt_bench/finetune.py
COPY python/prompt_bench/summarization_prompts.py python/prompt_bench/summarization_prompts.py
COPY python/prompt_bench/__init__.py python/prompt_bench/__init__.py
COPY python/__init__.py python/__init__.py
USER appuser
ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"]
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@python/prompt_bench/Dockerfile.finetune` around lines 15 - 25, The Dockerfile
runs as root (no USER directive) which is insecure; modify the
Dockerfile.finetune to create a non-root user and switch to it before the
ENTRYPOINT: add steps after setting WORKDIR to create a user/group (e.g.,
uid/gid 1000 or a stable UID), chown /workspace and any copied files (referenced
by WORKDIR and the copied python/prompt_bench/* and python/__init__.py
artifacts) to that user, and add a USER instruction so the process launched by
ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"] runs as the non-root
user. Ensure ownership changes cover all copied files and happen before
switching users.

1 change: 1 addition & 0 deletions python/prompt_bench/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Prompt benchmarking system for evaluating LLMs via vLLM."""
233 changes: 233 additions & 0 deletions python/prompt_bench/batch_bill_summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
"""Submit an OpenAI Batch API bill-summarization job over compressed text.

Reads the first N bills from a CSV with a `text_content` column, compresses
each via `bill_token_compression.compress_bill_text`, builds a JSONL file of
summarization requests, and submits it as an asynchronous Batch API job
against `/v1/chat/completions`. Also writes a CSV of per-bill pre/post-
compression token counts.
"""

from __future__ import annotations

import csv
import hashlib
import json
import logging
import re
import sys
from os import getenv
from pathlib import Path
from typing import Annotated

import httpx
import typer
from tiktoken import Encoding, get_encoding

from python.prompt_bench.bill_token_compression import compress_bill_text
from python.prompt_bench.summarization_prompts import SUMMARIZATION_SYSTEM_PROMPT, SUMMARIZATION_USER_TEMPLATE

logger = logging.getLogger(__name__)

OPENAI_API_BASE = "https://api.openai.com/v1"


def load_bills(csv_path: Path, count: int = 0) -> list[tuple[str, str]]:
    """Load bills from *csv_path* as (unique_id, text_content) pairs.

    Rows with empty or whitespace-only ``text_content`` are skipped.  The
    unique id is ``"{bill_id}-{version_code}"`` when a version code exists,
    otherwise just the bill id.  A ``count`` of 0 or less returns every
    qualifying row.
    """
    # Bill texts routinely exceed csv's default field size limit.
    csv.field_size_limit(sys.maxsize)
    collected: list[tuple[str, str]] = []
    with csv_path.open(newline="", encoding="utf-8") as handle:
        for row in csv.DictReader(handle):
            text = (row.get("text_content") or "").strip()
            if not text:
                continue
            base_id = row.get("bill_id") or row.get("id") or f"row-{len(collected)}"
            version = row.get("version_code") or ""
            collected.append((f"{base_id}-{version}" if version else base_id, text))
            if 0 < count <= len(collected):
                break
    return collected


def safe_filename(value: str) -> str:
    """Collapse every run of disallowed characters in *value* into ``_``.

    Keeps letters, digits, ``.``, ``_``, and ``-``; strips edge underscores.
    Falls back to ``"unnamed"`` if nothing survives, so the result is always
    usable as a filename or batch custom_id.
    """
    sanitized = re.sub(r"[^A-Za-z0-9._-]+", "_", value).strip("_")
    return sanitized if sanitized else "unnamed"


def build_request(custom_id: str, model: str, bill_text: str) -> dict:
    """Assemble a single Batch API request line for one compressed bill."""
    messages = [
        {"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
        {"role": "user", "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text)},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": model, "messages": messages},
    }


def write_jsonl(path: Path, lines: list[dict]) -> None:
    """Serialize each dict in *lines* as one JSON object per line of *path*."""
    # ensure_ascii=False keeps non-ASCII bill text readable in the JSONL.
    serialized = "".join(json.dumps(entry, ensure_ascii=False) + "\n" for entry in lines)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)


def upload_file(client: httpx.Client, path: Path) -> str:
    """POST *path* to the OpenAI Files API with purpose=batch; return its file id."""
    url = f"{OPENAI_API_BASE}/files"
    with path.open("rb") as handle:
        payload = {"file": (path.name, handle, "application/jsonl")}
        response = client.post(url, files=payload, data={"purpose": "batch"})
    response.raise_for_status()
    return response.json()["id"]


def prepare_requests(
    bills: list[tuple[str, str]],
    *,
    model: str,
    encoder: Encoding,
) -> tuple[list[dict], list[dict]]:
    """Build (request_lines, token_rows) from bills.

    Each bill is compressed before being turned into a request line.
    Each `token_rows` entry has chars + token counts for one bill so the caller
    can write a per-bill CSV.
    """
    request_lines: list[dict] = []
    token_rows: list[dict] = []
    for bill_id, text_content in bills:
        raw_token_count = len(encoder.encode(text_content))
        compressed_text = compress_bill_text(text_content)
        compressed_token_count = len(encoder.encode(compressed_text))
        token_rows.append(
            {
                "bill_id": bill_id,
                "raw_chars": len(text_content),
                "compressed_chars": len(compressed_text),
                "raw_tokens": raw_token_count,
                "compressed_tokens": compressed_token_count,
                "token_ratio": (compressed_token_count / raw_token_count) if raw_token_count else None,
            },
        )
        # safe_filename() is lossy: two distinct bill ids can collapse to the
        # same sanitized string, and downstream joins completions by custom_id,
        # so one pair would silently overwrite another.  Appending a short,
        # stable digest of the raw id keeps custom_ids unique.
        digest = hashlib.sha256(bill_id.encode("utf-8")).hexdigest()[:8]
        safe_id = f"{safe_filename(bill_id)}-{digest}"
        request_lines.append(build_request(safe_id, model, compressed_text))
    return request_lines, token_rows


def write_token_csv(path: Path, token_rows: list[dict]) -> tuple[int, int]:
    """Dump per-bill token stats to *path*; return (raw_total, compressed_total)."""
    columns = ["bill_id", "raw_chars", "compressed_chars", "raw_tokens", "compressed_tokens", "token_ratio"]
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for row in token_rows:
            writer.writerow(row)
    totals = (
        sum(row["raw_tokens"] for row in token_rows),
        sum(row["compressed_tokens"] for row in token_rows),
    )
    return totals


def create_batch(client: httpx.Client, input_file_id: str, description: str) -> dict:
    """Create a 24h-window batch job against /v1/chat/completions.

    Returns the full batch-creation response payload from the API.
    """
    payload = {
        "input_file_id": input_file_id,
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
        "metadata": {"description": description},
    }
    response = client.post(f"{OPENAI_API_BASE}/batches", json=payload)
    response.raise_for_status()
    return response.json()


def main(
    csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write JSONL + metadata")] = Path(
        "output/openai_batch",
    ),
    model: Annotated[str, typer.Option(help="OpenAI model id")] = "gpt-5-mini",
    count: Annotated[int, typer.Option(help="Max bills to process, 0 = all")] = 0,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
    """Submit an OpenAI Batch job of compressed bill summaries.

    Pipeline: load bills -> compress + count tokens -> write token CSV ->
    write requests JSONL -> upload file -> create batch -> write metadata.

    Raises:
        typer.BadParameter: if no API key env var is set or the CSV is missing.
    """
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")

    # CLOSEDAI_TOKEN takes precedence; OPENAI_API_KEY is the conventional fallback.
    api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
    if not api_key:
        message = "Neither CLOSEDAI_TOKEN nor OPENAI_API_KEY is set"
        raise typer.BadParameter(message)
    if not csv_path.is_file():
        message = f"CSV not found: {csv_path}"
        raise typer.BadParameter(message)

    output_dir.mkdir(parents=True, exist_ok=True)

    # count <= 0 means "all bills"; logging the literal 0 would be misleading.
    logger.info("Loading %s bills from %s", count if count > 0 else "all", csv_path)
    bills = load_bills(csv_path, count)
    if 0 < count and len(bills) < count:
        logger.warning("Only %d bills available (requested %d)", len(bills), count)

    encoder = get_encoding("o200k_base")
    request_lines, token_rows = prepare_requests(bills, model=model, encoder=encoder)

    token_csv_path = output_dir / "token_counts.csv"
    raw_tokens_total, compressed_tokens_total = write_token_csv(token_csv_path, token_rows)
    logger.info(
        "Token counts: raw=%d compressed=%d ratio=%.3f -> %s",
        raw_tokens_total,
        compressed_tokens_total,
        (compressed_tokens_total / raw_tokens_total) if raw_tokens_total else 0.0,
        token_csv_path,
    )

    jsonl_path = output_dir / "requests.jsonl"
    write_jsonl(jsonl_path, request_lines)
    logger.info("Wrote %s (%d bills)", jsonl_path, len(request_lines))

    # Long timeout: the JSONL upload can be large.
    headers = {"Authorization": f"Bearer {api_key}"}
    with httpx.Client(headers=headers, timeout=httpx.Timeout(300.0)) as client:
        logger.info("Uploading JSONL")
        file_id = upload_file(client, jsonl_path)
        logger.info("Uploaded: %s", file_id)

        logger.info("Creating batch")
        batch = create_batch(client, file_id, f"compressed bill summaries x{len(request_lines)} ({model})")
        logger.info("Batch created: %s", batch["id"])

    metadata = {
        "model": model,
        "count": len(bills),
        "jsonl": str(jsonl_path),
        "input_file_id": file_id,
        "batch_id": batch["id"],
        "raw_tokens_total": raw_tokens_total,
        "compressed_tokens_total": compressed_tokens_total,
        "batch": batch,
    }
    metadata_path = output_dir / "batch.json"
    # Explicit encoding keeps the metadata file portable across platforms.
    metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    logger.info("Wrote metadata to %s", metadata_path)


def cli() -> None:
    """Typer entry point: wraps `main` as a single-command CLI (see pyproject scripts)."""
    typer.run(main)


if __name__ == "__main__":
cli()
Loading
Loading