fireform-core · geoffkats · Mar 20, 2026
diff --git a/src/file_manipulator.py b/src/file_manipulator.py
@@ -1,6 +1,7 @@
 import os
 from src.filler import Filler
 from src.llm import LLM
+from src.validator import validate_incident
 from commonforms import prepare_form
 
 
@@ -17,31 +18,58 @@ def create_template(self, pdf_path: str):
         prepare_form(pdf_path, template_path)
         return template_path
 
-    def fill_form(self, user_input: str, fields: list, pdf_form_path: str):
+    def fill_form(self, user_input: str, fields: list, pdf_form_path: str) -> str:
         """
-        It receives the raw data, runs the PDF filling logic,
-        and returns the path to the newly created file.
+        Orchestrate the extract → validate → fill pipeline for one PDF form.
+
+        Steps:
+            1. Check that ``pdf_form_path`` exists.
+            2. Hand ``fields`` and ``user_input`` to the LLM and read the
+               extracted data from ``main_loop().get_data()``.
+            3. Run ``validate_incident`` on that data; abort on any error.
+            4. Pass the data to the Filler, which writes the output PDF.
+
+        Args:
+            user_input: Raw incident text, stored as the LLM transcript.
+            fields: Extraction targets, stored as the LLM target fields.
+            pdf_form_path: Path to the fillable PDF template.
+
+        Returns:
+            The output path returned by ``self.filler.fill_form``.
+
+        Raises:
+            FileNotFoundError: If ``pdf_form_path`` does not exist.
+            ValueError: If ``validate_incident`` reports any errors.
+
+        NOTE(review): validation uses the validator's default required
+        fields, not ``fields`` — confirm every template supplies them.
         """
         print("[1] Received request from frontend.")
         print(f"[2] PDF template path: {pdf_form_path}")
 
         if not os.path.exists(pdf_form_path):
-            print(f"Error: PDF template not found at {pdf_form_path}")
-            return None  # Or raise an exception
+            raise FileNotFoundError(
+                f"PDF template not found: {pdf_form_path}"
+            )
+
+        print("[3] Running LLM extraction...")
+        self.llm._target_fields = fields
+        self.llm._transcript_text = user_input
+        extracted = self.llm.main_loop().get_data()
 
-        print("[3] Starting extraction and PDF filling process...")
-        try:
-            self.llm._target_fields = fields
-            self.llm._transcript_text = user_input
-            output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm)
+        print("[4] Validating extracted incident data...")
+        errors = validate_incident(extracted)
+        if errors:
+            raise ValueError(
+                "Extracted incident data failed validation:\n"
+                + "\n".join(f"  - {e}" for e in errors)
+            )
 
-            print("\n----------------------------------")
-            print("✅ Process Complete.")
-            print(f"Output saved to: {output_name}")
+        print("[5] Filling PDF with validated data...")
+        output_name = self.filler.fill_form(pdf_form=pdf_form_path, data=extracted)
 
-            return output_name
+        print("\n----------------------------------")
+        print("✅ Process Complete.")
+        print(f"Output saved to: {output_name}")
 
-        except Exception as e:
-            print(f"An error occurred during PDF generation: {e}")
-            # Re-raise the exception so the frontend can handle it
-            raise e
+        return output_name
diff --git a/src/filler.py b/src/filler.py
@@ -1,16 +1,29 @@
 from pdfrw import PdfReader, PdfWriter
-from src.llm import LLM
 from datetime import datetime
 
 
 class Filler:
     def __init__(self):
         pass
 
-    def fill_form(self, pdf_form: str, llm: LLM):
+    def fill_form(self, pdf_form: str, data: dict) -> str:
         """
-        Fill a PDF form with values from user_input using LLM.
-        Fields are filled in the visual order (top-to-bottom, left-to-right).
+        Fill a PDF form with pre-extracted, validated field values.
+
+        Separation of concerns: this class is responsible only for writing
+        data to a PDF. LLM extraction and validation are handled upstream
+        by FileManipulator before this method is called.
+
+        Fields are written in visual order (top-to-bottom, left-to-right)
+        to match the annotation layout of the source PDF.
+
+        Args:
+            pdf_form: Absolute or relative path to the fillable PDF template.
+            data: Pre-extracted and validated field values. Values are written
+                  positionally in the order they appear in the dict.
+
+        Returns:
+            Path to the newly written, filled PDF file.
         """
         output_pdf = (
             pdf_form[:-4]
@@ -19,16 +32,10 @@ def fill_form(self, pdf_form: str, llm: LLM):
             + "_filled.pdf"
         )
 
-        # Generate dictionary of answers from your original function
-        t2j = llm.main_loop()
-        textbox_answers = t2j.get_data()  # This is a dictionary
-
-        answers_list = list(textbox_answers.values())
+        answers_list = list(data.values())
 
-        # Read PDF
         pdf = PdfReader(pdf_form)
 
-        # Loop through pages
         for page in pdf.pages:
             if page.Annots:
                 sorted_annots = sorted(
@@ -43,10 +50,7 @@ def fill_form(self, pdf_form: str, llm: LLM):
                             annot.AP = None
                             i += 1
                         else:
-                            # Stop if we run out of answers
                             break
 
         PdfWriter().write(output_pdf, pdf)
-
-        # Your main.py expects this function to return the path
         return output_pdf
diff --git a/src/validator.py b/src/validator.py
@@ -0,0 +1,57 @@
+from typing import Any
+
+# Minimum required fields for a valid incident report.
+# Extend this tuple to enforce additional fields across the pipeline.
+INCIDENT_REQUIRED_FIELDS: tuple[str, ...] = ("incident_type", "location", "time")
+
+
+def validate_incident(
+    data: Any,
+    required_fields: tuple[str, ...] = INCIDENT_REQUIRED_FIELDS,
+) -> list[str]:
+    """
+    Check that an incident dict carries every required field with a value.
+
+    Meant to run on extracted data before it is written to a PDF form.
+    Only logical completeness is checked: each required key must exist and
+    must not map to ``None`` or to an empty / whitespace-only string.
+    Type/schema validation of LLM output is a separate concern (issue #114).
+
+    Pipeline position::
+
+        raw text → LLM.main_loop() → [validate_incident()] → Filler → PDF
+
+    Args:
+        data: Object to validate. Anything that is not a ``dict`` instance
+            is rejected with a single error message.
+        required_fields: Names that must be present and non-empty.
+            Defaults to ``INCIDENT_REQUIRED_FIELDS``.
+
+    Returns:
+        Human-readable error strings, one per failing field, in the order
+        of ``required_fields``. The list is empty when every check passes.
+
+    Examples:
+        >>> validate_incident({"incident_type": "Fire", "location": "", "time": None})
+        ['Field cannot be empty: location', 'Field cannot be empty: time']
+
+        >>> validate_incident({"incident_type": "Fire", "location": "HQ", "time": "09:00"})
+        []
+
+        >>> validate_incident(["incident_type", "location", "time"])
+        ['Input data must be a dictionary.']
+    """
+    if not isinstance(data, dict):
+        return ["Input data must be a dictionary."]
+
+    problems: list[str] = []
+    for name in required_fields:
+        if name in data:
+            entry = data[name]
+            is_blank_text = isinstance(entry, str) and entry.strip() == ""
+            # Only None and blank strings count as empty; other types pass.
+            if entry is None or is_blank_text:
+                problems.append(f"Field cannot be empty: {name}")
+        else:
+            problems.append(f"Missing required field: {name}")
+    return problems