From ce967d58d0fe742cdb6193788ca6e8bb075cc1be Mon Sep 17 00:00:00 2001
From: geoffkats <katogeoffreyg@gmail.com>
Date: Fri, 20 Mar 2026 21:27:48 +0300
Subject: [PATCH] feat: add input-side incident validation before PDF fill

---
 src/file_manipulator.py |  64 +++++++++----
 src/filler.py           |  32 ++++---
 src/validator.py        |  57 ++++++++++++
 tests/test_validator.py | 196 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 317 insertions(+), 32 deletions(-)
 create mode 100644 src/validator.py
 create mode 100644 tests/test_validator.py

diff --git a/src/file_manipulator.py b/src/file_manipulator.py
index b7815cc..7d34a80 100644
--- a/src/file_manipulator.py
+++ b/src/file_manipulator.py
@@ -1,6 +1,7 @@
 import os
 from src.filler import Filler
 from src.llm import LLM
+from src.validator import validate_incident
 from commonforms import prepare_form
 
 
@@ -17,31 +18,58 @@ def create_template(self, pdf_path: str):
         prepare_form(pdf_path, template_path)
         return template_path
 
-    def fill_form(self, user_input: str, fields: list, pdf_form_path: str):
+    def fill_form(self, user_input: str, fields: list, pdf_form_path: str) -> str:
         """
-        It receives the raw data, runs the PDF filling logic,
-        and returns the path to the newly created file.
+        Orchestrates the full extract → validate → fill pipeline.
+
+        Steps:
+            1. Run LLM extraction to convert raw incident text into a
+               structured field dict.
+            2. Validate the extracted dict before touching any PDF.
+               Raises ``ValueError`` if required incident fields are
+               missing or empty.
+            3. Pass validated data to the Filler to produce the output PDF.
+
+        Args:
+            user_input: Free-form incident description (voice transcript or
+                        typed text).
+            fields:     Template field schema passed to the LLM as extraction
+                        targets.
+            pdf_form_path: Path to the fillable PDF template.
+
+        Returns:
+            Path to the filled output PDF.
+
+        Raises:
+            FileNotFoundError: If the PDF template does not exist.
+            ValueError: If extracted incident data fails validation.
         """
         print("[1] Received request from frontend.")
         print(f"[2] PDF template path: {pdf_form_path}")
 
         if not os.path.exists(pdf_form_path):
-            print(f"Error: PDF template not found at {pdf_form_path}")
-            return None  # Or raise an exception
+            raise FileNotFoundError(
+                f"PDF template not found: {pdf_form_path}"
+            )
+
+        print("[3] Running LLM extraction...")
+        self.llm._target_fields = fields
+        self.llm._transcript_text = user_input
+        extracted = self.llm.main_loop().get_data()
 
-        print("[3] Starting extraction and PDF filling process...")
-        try:
-            self.llm._target_fields = fields
-            self.llm._transcript_text = user_input
-            output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm)
+        print("[4] Validating extracted incident data...")
+        errors = validate_incident(extracted)
+        if errors:
+            raise ValueError(
+                "Extracted incident data failed validation:\n"
+                + "\n".join(f"  - {e}" for e in errors)
+            )
 
-            print("\n----------------------------------")
-            print("✅ Process Complete.")
-            print(f"Output saved to: {output_name}")
+        print("[5] Filling PDF with validated data...")
+        output_name = self.filler.fill_form(pdf_form=pdf_form_path, data=extracted)
 
-            return output_name
+        print("\n----------------------------------")
+        print("✅ Process Complete.")
+        print(f"Output saved to: {output_name}")
 
-        except Exception as e:
-            print(f"An error occurred during PDF generation: {e}")
-            # Re-raise the exception so the frontend can handle it
-            raise e
+        return output_name
diff --git a/src/filler.py b/src/filler.py
index e31e535..b6ea53f 100644
--- a/src/filler.py
+++ b/src/filler.py
@@ -1,5 +1,4 @@
 from pdfrw import PdfReader, PdfWriter
-from src.llm import LLM
 from datetime import datetime
 
 
@@ -7,10 +6,24 @@ class Filler:
     def __init__(self):
         pass
 
-    def fill_form(self, pdf_form: str, llm: LLM):
+    def fill_form(self, pdf_form: str, data: dict) -> str:
         """
-        Fill a PDF form with values from user_input using LLM.
-        Fields are filled in the visual order (top-to-bottom, left-to-right).
+        Fill a PDF form with pre-extracted, validated field values.
+
+        Separation of concerns: this class is responsible only for writing
+        data to a PDF. LLM extraction and validation are handled upstream
+        by FileManipulator before this method is called.
+
+        Fields are written in visual order (top-to-bottom, left-to-right)
+        to match the annotation layout of the source PDF.
+
+        Args:
+            pdf_form: Absolute or relative path to the fillable PDF template.
+            data: Pre-extracted and validated field values. Values are written
+                  positionally in the order they appear in the dict.
+
+        Returns:
+            Path to the newly written, filled PDF file.
         """
         output_pdf = (
             pdf_form[:-4]
@@ -19,16 +32,10 @@ def fill_form(self, pdf_form: str, llm: LLM):
             + "_filled.pdf"
         )
 
-        # Generate dictionary of answers from your original function
-        t2j = llm.main_loop()
-        textbox_answers = t2j.get_data()  # This is a dictionary
-
-        answers_list = list(textbox_answers.values())
+        answers_list = list(data.values())
 
-        # Read PDF
         pdf = PdfReader(pdf_form)
 
-        # Loop through pages
         for page in pdf.pages:
             if page.Annots:
                 sorted_annots = sorted(
@@ -43,10 +50,7 @@ def fill_form(self, pdf_form: str, llm: LLM):
                             annot.AP = None
                             i += 1
                         else:
-                            # Stop if we run out of answers
                             break
 
         PdfWriter().write(output_pdf, pdf)
-
-        # Your main.py expects this function to return the path
         return output_pdf
diff --git a/src/validator.py b/src/validator.py
new file mode 100644
index 0000000..4e3b596
--- /dev/null
+++ b/src/validator.py
@@ -0,0 +1,57 @@
+from typing import Any
+
+# Minimum required fields for a valid incident report.
+# Extend this tuple to enforce additional fields across the pipeline.
+INCIDENT_REQUIRED_FIELDS: tuple[str, ...] = ("incident_type", "location", "time")
+
+
+def validate_incident(
+    data: Any,
+    required_fields: tuple[str, ...] = INCIDENT_REQUIRED_FIELDS,
+) -> list[str]:
+    """
+    Validates structured incident data at the pipeline input boundary.
+
+    This gate runs before extracted data is written to any PDF form, ensuring
+    that the minimum required incident fields are present and meaningful.
+
+    This is an *input-side* validator — it checks logical completeness of the
+    incident dict produced by LLM extraction. It is distinct from LLM output
+    schema validation (see issue #114), which verifies type correctness and
+    hallucination confidence of individual extracted values.
+
+    Pipeline position::
+
+        raw text → LLM.main_loop() → [validate_incident()] → Filler → PDF
+
+    Args:
+        data: The incident data dict to validate. Must be a plain ``dict``.
+        required_fields: Tuple of field names that must be present and
+            non-empty. Defaults to ``INCIDENT_REQUIRED_FIELDS``.
+
+    Returns:
+        A list of human-readable validation error strings.
+        Returns an empty list when all checks pass.
+
+    Examples:
+        >>> validate_incident({"incident_type": "Fire", "location": "", "time": None})
+        ['Field cannot be empty: location', 'Field cannot be empty: time']
+
+        >>> validate_incident({"incident_type": "Fire", "location": "HQ", "time": "09:00"})
+        []
+    """
+    if not isinstance(data, dict):
+        return ["Input data must be a dictionary."]
+
+    errors: list[str] = []
+
+    for field in required_fields:
+        if field not in data:
+            errors.append(f"Missing required field: {field}")
+            continue
+
+        value = data[field]
+        if value is None or (isinstance(value, str) and not value.strip()):
+            errors.append(f"Field cannot be empty: {field}")
+
+    return errors
diff --git a/tests/test_validator.py b/tests/test_validator.py
new file mode 100644
index 0000000..a91db60
--- /dev/null
+++ b/tests/test_validator.py
@@ -0,0 +1,196 @@
+"""
+Tests for src/validator.py — validate_incident()
+
+Coverage matrix:
+  - Invalid input type (str, list, None, int)
+  - Valid input: minimal, with extra fields
+  - Missing single required field (each of the 3)
+  - Missing all required fields
+  - Empty string value
+  - None value
+  - Whitespace-only string (space, tab, newline)
+  - Multiple empty fields simultaneously
+  - Mixed: missing + empty in same call
+  - Numeric (non-string, non-None) value — should pass
+  - Custom required_fields override
+  - Empty required_fields tuple — must always pass
+"""
+
+import pytest
+from src.validator import validate_incident, INCIDENT_REQUIRED_FIELDS
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def valid_incident() -> dict:
+    return {
+        "incident_type": "Fire",
+        "location": "123 Main St",
+        "time": "2026-03-20T09:15:00",
+    }
+
+
+# ---------------------------------------------------------------------------
+# 1. Input type enforcement
+# ---------------------------------------------------------------------------
+
+class TestInputTypeEnforcement:
+    def test_rejects_plain_string(self):
+        assert validate_incident("incident summary text") == [
+            "Input data must be a dictionary."
+        ]
+
+    def test_rejects_list(self):
+        assert validate_incident([{"incident_type": "Fire"}]) == [
+            "Input data must be a dictionary."
+        ]
+
+    def test_rejects_none(self):
+        assert validate_incident(None) == ["Input data must be a dictionary."]
+
+    def test_rejects_integer(self):
+        assert validate_incident(42) == ["Input data must be a dictionary."]
+
+    def test_rejects_float(self):
+        assert validate_incident(3.14) == ["Input data must be a dictionary."]
+
+    def test_rejects_bool(self):
+        # bool is a subclass of int in Python — still not a dict
+        assert validate_incident(True) == ["Input data must be a dictionary."]
+
+
+# ---------------------------------------------------------------------------
+# 2. Valid input
+# ---------------------------------------------------------------------------
+
+class TestValidInput:
+    def test_minimal_valid_incident(self, valid_incident):
+        assert validate_incident(valid_incident) == []
+
+    def test_valid_with_extra_fields(self, valid_incident):
+        valid_incident["reporting_officer"] = "Jane Doe"
+        valid_incident["unit"] = "Engine 7"
+        assert validate_incident(valid_incident) == []
+
+    def test_numeric_time_value_passes(self, valid_incident):
+        """Non-string, non-None values are not empty — they pass."""
+        valid_incident["time"] = 1711000000
+        assert validate_incident(valid_incident) == []
+
+    def test_empty_dict_with_no_required_fields(self):
+        """Overriding required_fields to empty means everything passes."""
+        assert validate_incident({}, required_fields=()) == []
+
+
+# ---------------------------------------------------------------------------
+# 3. Missing required fields
+# ---------------------------------------------------------------------------
+
+class TestMissingFields:
+    def test_missing_incident_type(self):
+        data = {"location": "123 Main St", "time": "2026-03-20T09:15:00"}
+        assert validate_incident(data) == ["Missing required field: incident_type"]
+
+    def test_missing_location(self):
+        data = {"incident_type": "Fire", "time": "2026-03-20T09:15:00"}
+        assert validate_incident(data) == ["Missing required field: location"]
+
+    def test_missing_time(self):
+        data = {"incident_type": "Fire", "location": "123 Main St"}
+        assert validate_incident(data) == ["Missing required field: time"]
+
+    def test_missing_all_three_fields(self):
+        assert validate_incident({}) == [
+            "Missing required field: incident_type",
+            "Missing required field: location",
+            "Missing required field: time",
+        ]
+
+    def test_error_order_matches_required_fields_order(self):
+        """Error list order must be deterministic — matches INCIDENT_REQUIRED_FIELDS."""
+        errors = validate_incident({})
+        expected = [f"Missing required field: {f}" for f in INCIDENT_REQUIRED_FIELDS]
+        assert errors == expected
+
+
+# ---------------------------------------------------------------------------
+# 4. Present but empty values
+# ---------------------------------------------------------------------------
+
+class TestEmptyValues:
+    def test_empty_string_location(self, valid_incident):
+        valid_incident["location"] = ""
+        assert validate_incident(valid_incident) == ["Field cannot be empty: location"]
+
+    def test_none_time(self, valid_incident):
+        valid_incident["time"] = None
+        assert validate_incident(valid_incident) == ["Field cannot be empty: time"]
+
+    def test_whitespace_only_incident_type(self, valid_incident):
+        valid_incident["incident_type"] = "   "
+        assert validate_incident(valid_incident) == [
+            "Field cannot be empty: incident_type"
+        ]
+
+    def test_tab_character_location(self, valid_incident):
+        valid_incident["location"] = "\t"
+        assert validate_incident(valid_incident) == ["Field cannot be empty: location"]
+
+    def test_newline_only_time(self, valid_incident):
+        valid_incident["time"] = "\n"
+        assert validate_incident(valid_incident) == ["Field cannot be empty: time"]
+
+    def test_mixed_whitespace(self, valid_incident):
+        valid_incident["location"] = " \t\n "
+        assert validate_incident(valid_incident) == ["Field cannot be empty: location"]
+
+
+# ---------------------------------------------------------------------------
+# 5. Multiple errors in one call
+# ---------------------------------------------------------------------------
+
+class TestMultipleErrors:
+    def test_two_fields_empty(self, valid_incident):
+        valid_incident["location"] = ""
+        valid_incident["time"] = None
+        assert validate_incident(valid_incident) == [
+            "Field cannot be empty: location",
+            "Field cannot be empty: time",
+        ]
+
+    def test_one_missing_one_empty(self):
+        data = {"incident_type": "Fire", "location": ""}
+        errors = validate_incident(data)
+        assert "Field cannot be empty: location" in errors
+        assert "Missing required field: time" in errors
+
+    def test_all_fields_empty(self):
+        data = {"incident_type": "", "location": None, "time": "  "}
+        assert validate_incident(data) == [
+            "Field cannot be empty: incident_type",
+            "Field cannot be empty: location",
+            "Field cannot be empty: time",
+        ]
+
+
+# ---------------------------------------------------------------------------
+# 6. Custom required_fields override
+# ---------------------------------------------------------------------------
+
+class TestCustomRequiredFields:
+    def test_custom_single_field_missing(self):
+        data = {"name": "John Doe"}
+        errors = validate_incident(data, required_fields=("name", "badge_number"))
+        assert errors == ["Missing required field: badge_number"]
+
+    def test_custom_field_empty(self):
+        data = {"report_id": "  "}
+        errors = validate_incident(data, required_fields=("report_id",))
+        assert errors == ["Field cannot be empty: report_id"]
+
+    def test_custom_fields_all_valid(self):
+        data = {"report_id": "RPT-001", "unit": "Engine 7"}
+        assert validate_incident(data, required_fields=("report_id", "unit")) == []