From ce967d58d0fe742cdb6193788ca6e8bb075cc1be Mon Sep 17 00:00:00 2001 From: geoffkats Date: Fri, 20 Mar 2026 21:27:48 +0300 Subject: [PATCH] feat: add input-side incident validation before PDF fill --- src/file_manipulator.py | 64 +++++++++---- src/filler.py | 32 ++++--- src/validator.py | 57 ++++++++++++ tests/test_validator.py | 196 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 317 insertions(+), 32 deletions(-) create mode 100644 src/validator.py create mode 100644 tests/test_validator.py diff --git a/src/file_manipulator.py b/src/file_manipulator.py index b7815cc..7d34a80 100644 --- a/src/file_manipulator.py +++ b/src/file_manipulator.py @@ -1,6 +1,7 @@ import os from src.filler import Filler from src.llm import LLM +from src.validator import validate_incident from commonforms import prepare_form @@ -17,31 +18,58 @@ def create_template(self, pdf_path: str): prepare_form(pdf_path, template_path) return template_path - def fill_form(self, user_input: str, fields: list, pdf_form_path: str): + def fill_form(self, user_input: str, fields: list, pdf_form_path: str) -> str: """ - It receives the raw data, runs the PDF filling logic, - and returns the path to the newly created file. + Orchestrates the full extract → validate → fill pipeline. + + Steps: + 1. Run LLM extraction to convert raw incident text into a + structured field dict. + 2. Validate the extracted dict before touching any PDF. + Raises ``ValueError`` if required incident fields are + missing or empty. + 3. Pass validated data to the Filler to produce the output PDF. + + Args: + user_input: Free-form incident description (voice transcript or + typed text). + fields: Template field schema passed to the LLM as extraction + targets. + pdf_form_path: Path to the fillable PDF template. + + Returns: + Path to the filled output PDF. + + Raises: + FileNotFoundError: If the PDF template does not exist. + ValueError: If extracted incident data fails validation. """ print("[1] Received request from frontend.") print(f"[2] PDF template path: {pdf_form_path}") if not os.path.exists(pdf_form_path): - print(f"Error: PDF template not found at {pdf_form_path}") - return None # Or raise an exception + raise FileNotFoundError( + f"PDF template not found: {pdf_form_path}" + ) + + print("[3] Running LLM extraction...") + self.llm._target_fields = fields + self.llm._transcript_text = user_input + extracted = self.llm.main_loop().get_data() - print("[3] Starting extraction and PDF filling process...") - try: - self.llm._target_fields = fields - self.llm._transcript_text = user_input - output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm) + print("[4] Validating extracted incident data...") + errors = validate_incident(extracted) + if errors: + raise ValueError( + "Extracted incident data failed validation:\n" + + "\n".join(f" - {e}" for e in errors) + ) - print("\n----------------------------------") - print("✅ Process Complete.") - print(f"Output saved to: {output_name}") + print("[5] Filling PDF with validated data...") + output_name = self.filler.fill_form(pdf_form=pdf_form_path, data=extracted) - return output_name + print("\n----------------------------------") + print("✅ Process Complete.") + print(f"Output saved to: {output_name}") - except Exception as e: - print(f"An error occurred during PDF generation: {e}") - # Re-raise the exception so the frontend can handle it - raise e + return output_name diff --git a/src/filler.py b/src/filler.py index e31e535..b6ea53f 100644 --- a/src/filler.py +++ b/src/filler.py @@ -1,5 +1,4 @@ from pdfrw import PdfReader, PdfWriter -from src.llm import LLM from datetime import datetime @@ -7,10 +6,24 @@ class Filler: def __init__(self): pass - def fill_form(self, pdf_form: str, llm: LLM): + def fill_form(self, pdf_form: str, data: dict) -> str: """ - Fill a PDF form with values from user_input using LLM. - Fields are filled in the visual order (top-to-bottom, left-to-right). + Fill a PDF form with pre-extracted, validated field values. + + Separation of concerns: this class is responsible only for writing + data to a PDF. LLM extraction and validation are handled upstream + by FileManipulator before this method is called. + + Fields are written in visual order (top-to-bottom, left-to-right) + to match the annotation layout of the source PDF. + + Args: + pdf_form: Absolute or relative path to the fillable PDF template. + data: Pre-extracted and validated field values. Values are written + positionally in the order they appear in the dict. + + Returns: + Path to the newly written, filled PDF file. """ output_pdf = ( pdf_form[:-4] @@ -19,16 +32,10 @@ def fill_form(self, pdf_form: str, llm: LLM): + "_filled.pdf" ) - # Generate dictionary of answers from your original function - t2j = llm.main_loop() - textbox_answers = t2j.get_data() # This is a dictionary - - answers_list = list(textbox_answers.values()) + answers_list = list(data.values()) - # Read PDF pdf = PdfReader(pdf_form) - # Loop through pages for page in pdf.pages: if page.Annots: sorted_annots = sorted( @@ -43,10 +50,7 @@ def fill_form(self, pdf_form: str, llm: LLM): annot.AP = None i += 1 else: - # Stop if we run out of answers break PdfWriter().write(output_pdf, pdf) - - # Your main.py expects this function to return the path return output_pdf diff --git a/src/validator.py b/src/validator.py new file mode 100644 index 0000000..4e3b596 --- /dev/null +++ b/src/validator.py @@ -0,0 +1,57 @@ +from typing import Any + +# Minimum required fields for a valid incident report. +# Extend this tuple to enforce additional fields across the pipeline. +INCIDENT_REQUIRED_FIELDS: tuple[str, ...] = ("incident_type", "location", "time") + + +def validate_incident( + data: Any, + required_fields: tuple[str, ...] = INCIDENT_REQUIRED_FIELDS, +) -> list[str]: + """ + Validates structured incident data at the pipeline input boundary. + + This gate runs before extracted data is written to any PDF form, ensuring + that the minimum required incident fields are present and meaningful. + + This is an *input-side* validator — it checks logical completeness of the + incident dict produced by LLM extraction. It is distinct from LLM output + schema validation (see issue #114), which verifies type correctness and + hallucination confidence of individual extracted values. + + Pipeline position:: + + raw text → LLM.main_loop() → [validate_incident()] → Filler → PDF + + Args: + data: The incident data dict to validate. Must be a plain ``dict``. + required_fields: Tuple of field names that must be present and + non-empty. Defaults to ``INCIDENT_REQUIRED_FIELDS``. + + Returns: + A list of human-readable validation error strings. + Returns an empty list when all checks pass. + + Examples: + >>> validate_incident({"incident_type": "Fire", "location": "", "time": None}) + ['Field cannot be empty: location', 'Field cannot be empty: time'] + + >>> validate_incident({"incident_type": "Fire", "location": "HQ", "time": "09:00"}) + [] + """ + if not isinstance(data, dict): + return ["Input data must be a dictionary."] + + errors: list[str] = [] + + for field in required_fields: + if field not in data: + errors.append(f"Missing required field: {field}") + continue + + value = data[field] + if value is None or (isinstance(value, str) and not value.strip()): + errors.append(f"Field cannot be empty: {field}") + + return errors diff --git a/tests/test_validator.py b/tests/test_validator.py new file mode 100644 index 0000000..a91db60 --- /dev/null +++ b/tests/test_validator.py @@ -0,0 +1,196 @@ +""" +Tests for src/validator.py — validate_incident() + +Coverage matrix: + - Invalid input type (str, list, None, int) + - Valid input: minimal, with extra fields + - Missing single required field (each of the 3) + - Missing all required fields + - Empty string value + - None value + - Whitespace-only string (space, tab, newline) + - Multiple empty fields simultaneously + - Mixed: missing + empty in same call + - Numeric (non-string, non-None) value — should pass + - Custom required_fields override + - Empty required_fields tuple — must always pass +""" + +import pytest +from src.validator import validate_incident, INCIDENT_REQUIRED_FIELDS + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture +def valid_incident() -> dict: + return { + "incident_type": "Fire", + "location": "123 Main St", + "time": "2026-03-20T09:15:00", + } + + +# --------------------------------------------------------------------------- +# 1. Input type enforcement +# --------------------------------------------------------------------------- + +class TestInputTypeEnforcement: + def test_rejects_plain_string(self): + assert validate_incident("incident summary text") == [ + "Input data must be a dictionary." + ] + + def test_rejects_list(self): + assert validate_incident([{"incident_type": "Fire"}]) == [ + "Input data must be a dictionary." + ] + + def test_rejects_none(self): + assert validate_incident(None) == ["Input data must be a dictionary."] + + def test_rejects_integer(self): + assert validate_incident(42) == ["Input data must be a dictionary."] + + def test_rejects_float(self): + assert validate_incident(3.14) == ["Input data must be a dictionary."] + + def test_rejects_bool(self): + # bool is a subclass of int in Python — still not a dict + assert validate_incident(True) == ["Input data must be a dictionary."] + + +# --------------------------------------------------------------------------- +# 2. Valid input +# --------------------------------------------------------------------------- + +class TestValidInput: + def test_minimal_valid_incident(self, valid_incident): + assert validate_incident(valid_incident) == [] + + def test_valid_with_extra_fields(self, valid_incident): + valid_incident["reporting_officer"] = "Jane Doe" + valid_incident["unit"] = "Engine 7" + assert validate_incident(valid_incident) == [] + + def test_numeric_time_value_passes(self, valid_incident): + """Non-string, non-None values are not empty — they pass.""" + valid_incident["time"] = 1711000000 + assert validate_incident(valid_incident) == [] + + def test_empty_dict_with_no_required_fields(self): + """Overriding required_fields to empty means everything passes.""" + assert validate_incident({}, required_fields=()) == [] + + +# --------------------------------------------------------------------------- +# 3. Missing required fields +# --------------------------------------------------------------------------- + +class TestMissingFields: + def test_missing_incident_type(self): + data = {"location": "123 Main St", "time": "2026-03-20T09:15:00"} + assert validate_incident(data) == ["Missing required field: incident_type"] + + def test_missing_location(self): + data = {"incident_type": "Fire", "time": "2026-03-20T09:15:00"} + assert validate_incident(data) == ["Missing required field: location"] + + def test_missing_time(self): + data = {"incident_type": "Fire", "location": "123 Main St"} + assert validate_incident(data) == ["Missing required field: time"] + + def test_missing_all_three_fields(self): + assert validate_incident({}) == [ + "Missing required field: incident_type", + "Missing required field: location", + "Missing required field: time", + ] + + def test_error_order_matches_required_fields_order(self): + """Error list order must be deterministic — matches INCIDENT_REQUIRED_FIELDS.""" + errors = validate_incident({}) + expected = [f"Missing required field: {f}" for f in INCIDENT_REQUIRED_FIELDS] + assert errors == expected + + +# --------------------------------------------------------------------------- +# 4. Present but empty values +# --------------------------------------------------------------------------- + +class TestEmptyValues: + def test_empty_string_location(self, valid_incident): + valid_incident["location"] = "" + assert validate_incident(valid_incident) == ["Field cannot be empty: location"] + + def test_none_time(self, valid_incident): + valid_incident["time"] = None + assert validate_incident(valid_incident) == ["Field cannot be empty: time"] + + def test_whitespace_only_incident_type(self, valid_incident): + valid_incident["incident_type"] = " " + assert validate_incident(valid_incident) == [ + "Field cannot be empty: incident_type" + ] + + def test_tab_character_location(self, valid_incident): + valid_incident["location"] = "\t" + assert validate_incident(valid_incident) == ["Field cannot be empty: location"] + + def test_newline_only_time(self, valid_incident): + valid_incident["time"] = "\n" + assert validate_incident(valid_incident) == ["Field cannot be empty: time"] + + def test_mixed_whitespace(self, valid_incident): + valid_incident["location"] = " \t\n " + assert validate_incident(valid_incident) == ["Field cannot be empty: location"] + + +# --------------------------------------------------------------------------- +# 5. Multiple errors in one call +# --------------------------------------------------------------------------- + +class TestMultipleErrors: + def test_two_fields_empty(self, valid_incident): + valid_incident["location"] = "" + valid_incident["time"] = None + assert validate_incident(valid_incident) == [ + "Field cannot be empty: location", + "Field cannot be empty: time", + ] + + def test_one_missing_one_empty(self): + data = {"incident_type": "Fire", "location": ""} + errors = validate_incident(data) + assert "Field cannot be empty: location" in errors + assert "Missing required field: time" in errors + + def test_all_fields_empty(self): + data = {"incident_type": "", "location": None, "time": " "} + assert validate_incident(data) == [ + "Field cannot be empty: incident_type", + "Field cannot be empty: location", + "Field cannot be empty: time", + ] + + +# --------------------------------------------------------------------------- +# 6. Custom required_fields override +# --------------------------------------------------------------------------- + +class TestCustomRequiredFields: + def test_custom_single_field_missing(self): + data = {"name": "John Doe"} + errors = validate_incident(data, required_fields=("name", "badge_number")) + assert errors == ["Missing required field: badge_number"] + + def test_custom_field_empty(self): + data = {"report_id": " "} + errors = validate_incident(data, required_fields=("report_id",)) + assert errors == ["Field cannot be empty: report_id"] + + def test_custom_fields_all_valid(self): + data = {"report_id": "RPT-001", "unit": "Engine 7"} + assert validate_incident(data, required_fields=("report_id", "unit")) == []