"""Report pointers whose PDF retrieval URLs appear improperly truncated (NRL-1509).

Intended to be run as a script: python-fire exposes ``_find_malformed_urls``
as the command-line entry point.
"""

from datetime import datetime, timedelta, timezone
from typing import Any

import boto3
import fire

from nrlf.consumer.fhir.r4.model import DocumentReference
from nrlf.core.logger import logger
from nrlf.core.validators import DocumentReferenceValidator

dynamodb = boto3.client("dynamodb")
paginator = dynamodb.get_paginator("scan")

# Only let errors through the shared logger while the report is running.
logger.setLevel("ERROR")

SCAN_PAGE_SIZE = 50
PROGRESS_DOT_INTERVAL = 1000
PROGRESS_SUMMARY_INTERVAL = 100000


def _validate_url(document: str) -> None:
    """
    Raise if the pointer document has a malformed PDF URL or fails validation.

    Per NRL-1509 improperly truncated pointer URLs end with '/Binary/[ODSCODE]/'
    Properly formed full pointer URLs from these custodians end with '/Binary/[ODSCODE]/[unique-id-string]'

    Parameters:
    - document: The DocumentReference serialised as a JSON string.

    Raises:
    - RuntimeError: if a PDF attachment URL is empty or ends with '/', or if
      DocumentReferenceValidator reports the document as invalid.
    - Whatever DocumentReference.model_validate_json raises for unparsable input.
    """
    docref = DocumentReference.model_validate_json(document)

    for content in docref.content:
        if content.attachment.contentType != "application/pdf":
            continue
        url = content.attachment.url
        # An empty/missing URL is reported as malformed too, rather than
        # failing with an opaque IndexError/TypeError from indexing url[-1].
        if not url or str(url).endswith("/"):
            raise RuntimeError("Malformed URL found: " + str(url))

    validator = DocumentReferenceValidator()
    result = validator.validate(data=docref)
    if not result.is_valid:
        raise RuntimeError("Failed to validate document: " + str(result.issues))


def _normalise_ods_codes(codes: Any) -> set[str]:
    """
    Coerce the supplied custodian ODS codes into a set of non-empty strings.

    The command line may hand over a single bare string instead of a set; a
    plain ``in`` test against a string would be a substring match, so the
    value is normalised first. A string is split on commas.
    """
    if isinstance(codes, str):
        codes = codes.split(",")
    elif not isinstance(codes, (set, frozenset, list, tuple)):
        # A lone scalar (e.g. a code that was parsed as a number).
        codes = [codes]

    stripped = (str(code).strip() for code in codes)
    return {code for code in stripped if code}


def _find_malformed_urls(
    table_name: str, custodian_ods_codes: set[str]
) -> dict[str, float | int]:
    """
    Find pointers in the given table that have malformed URLs.
    This is required for NRL-1509 because some suppliers suspect their pointer retrieval URLs may have been improperly truncated.

    The table is scanned page by page; every pointer whose id prefix (the text
    before the first '-') is one of the given custodians is validated, and the
    failures are written to ./malformed_pointers.txt, one per line.

    Parameters:
    - table_name: The name of the pointers table to use.
    - custodian_ods_codes: Set of ODS codes whose pointers may have malformed URLs.
      A single code or a comma-separated string is also accepted.

    Returns:
    - A dict with the number of malformed pointers found ("malformed_pointers"),
      the number of items scanned ("scanned_count") and the elapsed time in
      seconds ("took-secs").
    """
    custodians = _normalise_ods_codes(custodian_ods_codes)

    print(f"Finding invalid URLs in table {table_name}....")  # noqa

    params: dict[str, Any] = {
        "TableName": table_name,
        "PaginationConfig": {"PageSize": SCAN_PAGE_SIZE},
    }

    malformed_pointers: list[dict[str, Any]] = []
    total_scanned_count = 0

    # Progress is reported when a threshold is crossed rather than on an exact
    # multiple, so it keeps working if a page reports fewer scanned items than
    # the requested page size.
    next_dot_at = PROGRESS_DOT_INTERVAL
    next_summary_at = PROGRESS_SUMMARY_INTERVAL

    start_time = datetime.now(tz=timezone.utc)

    for page in paginator.paginate(**params):
        for item in page["Items"]:
            pointer_id = item.get("id", {}).get("S", "")
            # partition never raises, so an item with a missing or unexpected
            # id is skipped instead of aborting the whole scan.
            custodian, _, _ = pointer_id.partition("-")
            # only need to check pointers created by the specified custodians
            if custodian not in custodians:
                continue

            patient_number = item.get("nhs_number", {}).get("S", "")
            document = item.get("document", {}).get("S", "")
            try:
                _validate_url(document)
            except Exception as exc:
                # Deliberately broad: any parse/validation failure is recorded
                # against the pointer rather than stopping the report.
                malformed_pointers.append(
                    {"id": pointer_id, "patient_number": patient_number, "err": exc}
                )

        total_scanned_count += page["ScannedCount"]

        if total_scanned_count >= next_dot_at:
            print(".", end="", flush=True)  # noqa
            next_dot_at = (
                total_scanned_count // PROGRESS_DOT_INTERVAL + 1
            ) * PROGRESS_DOT_INTERVAL

        if total_scanned_count >= next_summary_at:
            print(  # noqa
                f"scanned={total_scanned_count} invalid={len(malformed_pointers)}"
            )
            next_summary_at = (
                total_scanned_count // PROGRESS_SUMMARY_INTERVAL + 1
            ) * PROGRESS_SUMMARY_INTERVAL

    end_time = datetime.now(tz=timezone.utc)
    elapsed: timedelta = end_time - start_time

    print(" Done")  # noqa

    print("Writing malformed_pointers to file ./malformed_pointers.txt ...")  # noqa
    with open("malformed_pointers.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{info}\n" for info in malformed_pointers)

    return {
        "malformed_pointers": len(malformed_pointers),
        "scanned_count": total_scanned_count,
        "took-secs": elapsed.total_seconds(),
    }


if __name__ == "__main__":
    fire.Fire(_find_malformed_urls)