From 3a9a7075880b2fbf4ccac7f81cadb19120106965 Mon Sep 17 00:00:00 2001 From: Kate Bobyn Date: Wed, 9 Jul 2025 00:07:35 +0100 Subject: [PATCH 1/2] NRL-1509 add devOps script to identify pointers with malformed URLs for specified custodians --- reports/find_malformed_urls.py | 98 ++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 reports/find_malformed_urls.py diff --git a/reports/find_malformed_urls.py b/reports/find_malformed_urls.py new file mode 100644 index 000000000..5b57a7bb2 --- /dev/null +++ b/reports/find_malformed_urls.py @@ -0,0 +1,98 @@ +from datetime import datetime, timedelta, timezone +from typing import Any + +import boto3 +import fire + +from nrlf.consumer.fhir.r4.model import DocumentReference +from nrlf.core.logger import logger +from nrlf.core.validators import DocumentReferenceValidator + +dynamodb = boto3.client("dynamodb") +paginator = dynamodb.get_paginator("scan") + +logger.setLevel("ERROR") + + +def _validate_url(document: str): + """ + Per NRL-1509 improperly truncated pointer URLs end with '/Binary/[ODSCODE]/' + Properly formed full pointer URLs from these custodians end with '/Binary/[ODSCODE]/[unique-id-string]' + """ + docref = DocumentReference.model_validate_json(document) + + validator = DocumentReferenceValidator() + result = validator.validate(data=docref) + for i, content in enumerate(result.content): + if content.attachment.contentType == "application/pdf": + url = content.attachment.url + if url[-1] == "/": + raise RuntimeError("Malformed URL found: " + str(url)) + + if not result.is_valid: + raise RuntimeError("Failed to validate document: " + str(result.issues)) + + +def _find_malformed_urls( + table_name: str, custodian_ods_codes: set[str] +) -> dict[str, float | int]: + """ + Find pointers in the given table that have malformed URLs. + This is required for NRL-1509 because some suppliers suspect their pointer retrieval URLs may have been improperly truncated. + Parameters: + - table_name: The name of the pointers table to use. + - custodian_ods_codes: Set of ODS codes whose pointers may have malformed URLs. + """ + + print(f"Finding invalid URLs in table {table_name}....") # noqa + + params: dict[str, Any] = { + "TableName": table_name, + "PaginationConfig": {"PageSize": 50}, + } + + malformed_pointers = [] + total_scanned_count = 0 + + start_time = datetime.now(tz=timezone.utc) + + for page in paginator.paginate(**params): + for item in page["Items"]: + pointer_id = item.get("id", {}).get("S") + custodian, _ = pointer_id.split("-", 1) + # only need to check pointers created by the specified custodians + if custodian in custodian_ods_codes: + document = item.get("document", {}).get("S", "") + try: + _validate_url(document) + except Exception as exc: + malformed_pointers.append((pointer_id, exc)) + + total_scanned_count += page["ScannedCount"] + + if total_scanned_count % 1000 == 0: + print(".", end="", flush=True) # noqa + + if total_scanned_count % 100000 == 0: + print( # noqa + f"scanned={total_scanned_count} invalid={len(malformed_pointers)}" + ) + + end_time = datetime.now(tz=timezone.utc) + + print(" Done") # noqa + + print("Writing malformed_pointers to file ./malformed_pointers.txt ...") # noqa + with open("malformed_pointers.txt", "w") as f: + for _id, err in malformed_pointers: + f.write(f"{_id}: {err}\n") + + return { + "malformed_pointers": len(malformed_pointers), + "scanned_count": total_scanned_count, + "took-secs": timedelta.total_seconds(end_time - start_time), + } + + +if __name__ == "__main__": + fire.Fire(_find_malformed_urls) From 52401b78b7051f7ad75c75eaaec775bbc53954c9 Mon Sep 17 00:00:00 2001 From: Matt Dean Date: Tue, 15 Jul 2025 09:08:07 +0100 Subject: [PATCH 2/2] [NRL-1509] Add patient number to invalid pointer report --- reports/find_malformed_urls.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/reports/find_malformed_urls.py b/reports/find_malformed_urls.py index 5b57a7bb2..633ec6cb5 100644 --- a/reports/find_malformed_urls.py +++ b/reports/find_malformed_urls.py @@ -21,14 +21,14 @@ def _validate_url(document: str): """ docref = DocumentReference.model_validate_json(document) - validator = DocumentReferenceValidator() - result = validator.validate(data=docref) - for i, content in enumerate(result.content): + for content in docref.content: if content.attachment.contentType == "application/pdf": url = content.attachment.url if url[-1] == "/": raise RuntimeError("Malformed URL found: " + str(url)) + validator = DocumentReferenceValidator() + result = validator.validate(data=docref) if not result.is_valid: raise RuntimeError("Failed to validate document: " + str(result.issues)) @@ -51,22 +51,25 @@ def _find_malformed_urls( "PaginationConfig": {"PageSize": 50}, } - malformed_pointers = [] + malformed_pointers: list[dict[str, Any]] = [] total_scanned_count = 0 start_time = datetime.now(tz=timezone.utc) for page in paginator.paginate(**params): for item in page["Items"]: - pointer_id = item.get("id", {}).get("S") + pointer_id = item.get("id", {}).get("S", "") custodian, _ = pointer_id.split("-", 1) # only need to check pointers created by the specified custodians if custodian in custodian_ods_codes: + patient_number = item.get("nhs_number", {}).get("S", "") document = item.get("document", {}).get("S", "") try: _validate_url(document) except Exception as exc: - malformed_pointers.append((pointer_id, exc)) + malformed_pointers.append( + {"id": pointer_id, "patient_number": patient_number, "err": exc} + ) total_scanned_count += page["ScannedCount"] @@ -84,8 +87,8 @@ def _find_malformed_urls( print("Writing malformed_pointers to file ./malformed_pointers.txt ...") # noqa with open("malformed_pointers.txt", "w") as f: - for _id, err in malformed_pointers: - f.write(f"{_id}: {err}\n") + for malformed_pointer_info in malformed_pointers: + f.write(f"{malformed_pointer_info}\n") return { "malformed_pointers": len(malformed_pointers),