Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 46 additions & 18 deletions src/file_manipulator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
from src.filler import Filler
from src.llm import LLM
from src.validator import validate_incident
from commonforms import prepare_form


Expand All @@ -17,31 +18,58 @@ def create_template(self, pdf_path: str):
prepare_form(pdf_path, template_path)
return template_path

def fill_form(self, user_input: str, fields: list, pdf_form_path: str):
def fill_form(self, user_input: str, fields: list, pdf_form_path: str) -> str:
"""
It receives the raw data, runs the PDF filling logic,
and returns the path to the newly created file.
Orchestrates the full extract → validate → fill pipeline.

Steps:
1. Run LLM extraction to convert raw incident text into a
structured field dict.
2. Validate the extracted dict before touching any PDF.
Raises ``ValueError`` if required incident fields are
missing or empty.
3. Pass validated data to the Filler to produce the output PDF.

Args:
user_input: Free-form incident description (voice transcript or
typed text).
fields: Template field schema passed to the LLM as extraction
targets.
pdf_form_path: Path to the fillable PDF template.

Returns:
Path to the filled output PDF.

Raises:
FileNotFoundError: If the PDF template does not exist.
ValueError: If extracted incident data fails validation.
"""
print("[1] Received request from frontend.")
print(f"[2] PDF template path: {pdf_form_path}")

if not os.path.exists(pdf_form_path):
print(f"Error: PDF template not found at {pdf_form_path}")
return None # Or raise an exception
raise FileNotFoundError(
f"PDF template not found: {pdf_form_path}"
)

print("[3] Running LLM extraction...")
self.llm._target_fields = fields
self.llm._transcript_text = user_input
extracted = self.llm.main_loop().get_data()

print("[3] Starting extraction and PDF filling process...")
try:
self.llm._target_fields = fields
self.llm._transcript_text = user_input
output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm)
print("[4] Validating extracted incident data...")
errors = validate_incident(extracted)
if errors:
raise ValueError(
"Extracted incident data failed validation:\n"
+ "\n".join(f" - {e}" for e in errors)
)

print("\n----------------------------------")
print("✅ Process Complete.")
print(f"Output saved to: {output_name}")
print("[5] Filling PDF with validated data...")
output_name = self.filler.fill_form(pdf_form=pdf_form_path, data=extracted)

return output_name
print("\n----------------------------------")
print("✅ Process Complete.")
print(f"Output saved to: {output_name}")

except Exception as e:
print(f"An error occurred during PDF generation: {e}")
# Re-raise the exception so the frontend can handle it
raise e
return output_name
32 changes: 18 additions & 14 deletions src/filler.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,29 @@
from pdfrw import PdfReader, PdfWriter
from src.llm import LLM
from datetime import datetime


class Filler:
def __init__(self):
pass

def fill_form(self, pdf_form: str, llm: LLM):
def fill_form(self, pdf_form: str, data: dict) -> str:
"""
Fill a PDF form with values from user_input using LLM.
Fields are filled in the visual order (top-to-bottom, left-to-right).
Fill a PDF form with pre-extracted, validated field values.

Separation of concerns: this class is responsible only for writing
data to a PDF. LLM extraction and validation are handled upstream
by FileManipulator before this method is called.

Fields are written in visual order (top-to-bottom, left-to-right)
to match the annotation layout of the source PDF.

Args:
pdf_form: Absolute or relative path to the fillable PDF template.
data: Pre-extracted and validated field values. Values are written
positionally in the order they appear in the dict.

Returns:
Path to the newly written, filled PDF file.
"""
output_pdf = (
pdf_form[:-4]
Expand All @@ -19,16 +32,10 @@ def fill_form(self, pdf_form: str, llm: LLM):
+ "_filled.pdf"
)

# Generate dictionary of answers from your original function
t2j = llm.main_loop()
textbox_answers = t2j.get_data() # This is a dictionary

answers_list = list(textbox_answers.values())
answers_list = list(data.values())

# Read PDF
pdf = PdfReader(pdf_form)

# Loop through pages
for page in pdf.pages:
if page.Annots:
sorted_annots = sorted(
Expand All @@ -43,10 +50,7 @@ def fill_form(self, pdf_form: str, llm: LLM):
annot.AP = None
i += 1
else:
# Stop if we run out of answers
break

PdfWriter().write(output_pdf, pdf)

# Your main.py expects this function to return the path
return output_pdf
57 changes: 57 additions & 0 deletions src/validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from typing import Any

# Minimum required fields for a valid incident report.
# Extend this tuple to enforce additional fields across the pipeline.
INCIDENT_REQUIRED_FIELDS: tuple[str, ...] = ("incident_type", "location", "time")


def validate_incident(
    data: Any,
    required_fields: tuple[str, ...] = INCIDENT_REQUIRED_FIELDS,
) -> list[str]:
    """
    Validates structured incident data at the pipeline input boundary.

    This gate runs before extracted data is written to any PDF form, ensuring
    that the minimum required incident fields are present and meaningful.

    This is an *input-side* validator — it checks logical completeness of the
    incident dict produced by LLM extraction. It is distinct from LLM output
    schema validation (see issue #114), which verifies type correctness and
    hallucination confidence of individual extracted values.

    Pipeline position::

        raw text → LLM.main_loop() → [validate_incident()] → Filler → PDF

    A required field counts as *empty* when its value is ``None``, a string
    that is blank after stripping whitespace, or an empty built-in container
    (``list``, ``tuple``, ``dict``, ``set``, ``frozenset``). Falsy scalars
    such as ``0`` or ``False`` are treated as real values.

    Args:
        data: The incident data dict to validate. Must be a plain ``dict``.
        required_fields: Tuple of field names that must be present and
            non-empty. Defaults to ``INCIDENT_REQUIRED_FIELDS``.

    Returns:
        A list of human-readable validation error strings, in the order of
        ``required_fields``. Returns an empty list when all checks pass.

    Examples:
        >>> validate_incident({"incident_type": "Fire", "location": "", "time": None})
        ['Field cannot be empty: location', 'Field cannot be empty: time']

        >>> validate_incident({"incident_type": "Fire", "location": "HQ", "time": "09:00"})
        []

        >>> validate_incident({"incident_type": [], "location": "HQ", "time": "09:00"})
        ['Field cannot be empty: incident_type']
    """
    if not isinstance(data, dict):
        return ["Input data must be a dictionary."]

    errors: list[str] = []

    for field in required_fields:
        if field not in data:
            errors.append(f"Missing required field: {field}")
            continue

        value = data[field]
        if value is None:
            is_empty = True
        elif isinstance(value, str):
            # Whitespace-only text carries no information.
            is_empty = not value.strip()
        elif isinstance(value, (list, tuple, dict, set, frozenset)):
            # An empty container is just as uninformative as an empty string.
            is_empty = not value
        else:
            # Numbers, booleans and other objects are real values, even when
            # falsy (e.g. 0 or False must not be rejected).
            is_empty = False

        if is_empty:
            errors.append(f"Field cannot be empty: {field}")

    return errors
Loading