Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions reports/find_malformed_urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from datetime import datetime, timedelta, timezone
from typing import Any

import boto3
import fire

from nrlf.consumer.fhir.r4.model import DocumentReference
from nrlf.core.logger import logger
from nrlf.core.validators import DocumentReferenceValidator

# Module-level DynamoDB client, created once at import time and shared by the whole script.
# NOTE(review): region/credentials are presumably resolved by boto3 from the environment — confirm.
dynamodb = boto3.client("dynamodb")
# Paginator for the DynamoDB "scan" operation; used below to walk the whole pointers table page by page.
paginator = dynamodb.get_paginator("scan")

# Restrict the shared NRLF logger to ERROR and above.
# NOTE(review): presumably to keep validator logging from flooding the progress output — confirm.
logger.setLevel("ERROR")


def _validate_url(document: str):
    """
    Check a pointer document for a truncated retrieval URL, then validate it.

    Per NRL-1509 improperly truncated pointer URLs end with '/Binary/[ODSCODE]/'
    Properly formed full pointer URLs from these custodians end with '/Binary/[ODSCODE]/[unique-id-string]'

    Parameters:
    - document: The DocumentReference serialised as a JSON string.

    Raises:
    - RuntimeError: if any 'application/pdf' attachment has a missing or empty URL,
      or a URL ending in '/'; or if DocumentReferenceValidator reports the
      document as not valid.
    Any exception raised by DocumentReference.model_validate_json propagates unchanged.
    """
    docref = DocumentReference.model_validate_json(document)

    for content in docref.content:
        # Only PDF attachments are inspected for the truncation pattern.
        if content.attachment.contentType != "application/pdf":
            continue

        url = content.attachment.url
        # A missing/empty URL is reported explicitly instead of letting an
        # index lookup fail with an opaque IndexError/TypeError.
        if not url or str(url).endswith("/"):
            raise RuntimeError("Malformed URL found: " + str(url))

    # The URL check passed; run the full document validation as well.
    validator = DocumentReferenceValidator()
    result = validator.validate(data=docref)
    if not result.is_valid:
        raise RuntimeError("Failed to validate document: " + str(result.issues))


def _find_malformed_urls(
    table_name: str, custodian_ods_codes: set[str]
) -> dict[str, float | int]:
    """
    Find pointers in the given table that have malformed URLs.
    This is required for NRL-1509 because some suppliers suspect their pointer retrieval URLs may have been improperly truncated.

    The whole table is scanned. Each pointer whose id starts with one of the
    given ODS codes (the part of the id before the first '-') is passed to
    _validate_url; every pointer for which that call raises is recorded and
    written, one per line, to ./malformed_pointers.txt.

    Parameters:
    - table_name: The name of the pointers table to use.
    - custodian_ods_codes: Set of ODS codes whose pointers may have malformed URLs.
      A single code or a comma-separated string of codes is also accepted.

    Returns:
    - A dict with the number of malformed pointers found ("malformed_pointers"),
      the number of items scanned ("scanned_count") and the elapsed time in
      seconds ("took-secs").
    """
    # A single value given on the command line reaches this function as a plain
    # string rather than a set. Without normalising it, `custodian in codes`
    # would be a substring test and match the wrong custodians.
    if isinstance(custodian_ods_codes, str):
        ods_codes = {
            code.strip() for code in custodian_ods_codes.split(",") if code.strip()
        }
    else:
        ods_codes = set(custodian_ods_codes)

    print(f"Finding invalid URLs in table {table_name}....")  # noqa

    params: dict[str, Any] = {
        "TableName": table_name,
        "PaginationConfig": {"PageSize": 50},
    }

    # Progress output intervals, in scanned items.
    dot_every = 1000
    summary_every = 100000

    malformed_pointers: list[dict[str, Any]] = []
    total_scanned_count = 0

    start_time = datetime.now(tz=timezone.utc)

    for page in paginator.paginate(**params):
        for item in page["Items"]:
            pointer_id = item.get("id", {}).get("S", "")
            # partition never raises, so an item with a missing id or an id
            # without '-' is skipped/handled instead of aborting the scan.
            custodian, _, _ = pointer_id.partition("-")
            # only need to check pointers created by the specified custodians
            if custodian not in ods_codes:
                continue

            patient_number = item.get("nhs_number", {}).get("S", "")
            document = item.get("document", {}).get("S", "")
            try:
                _validate_url(document)
            except Exception as exc:
                # Deliberately broad: any failure to parse or validate the
                # document is recorded against the pointer, and the scan continues.
                malformed_pointers.append(
                    {"id": pointer_id, "patient_number": patient_number, "err": exc}
                )

        previous_scanned_count = total_scanned_count
        total_scanned_count += page["ScannedCount"]

        # Report when a threshold is crossed rather than hit exactly: a page can
        # hold fewer items than the page size, after which the running total
        # may never be an exact multiple of the interval again.
        if total_scanned_count // dot_every > previous_scanned_count // dot_every:
            print(".", end="", flush=True)  # noqa

        if (
            total_scanned_count // summary_every
            > previous_scanned_count // summary_every
        ):
            print(  # noqa
                f"scanned={total_scanned_count} invalid={len(malformed_pointers)}"
            )

    end_time = datetime.now(tz=timezone.utc)

    print(" Done")  # noqa

    print("Writing malformed_pointers to file ./malformed_pointers.txt ...")  # noqa
    with open("malformed_pointers.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{info}\n" for info in malformed_pointers)

    return {
        "malformed_pointers": len(malformed_pointers),
        "scanned_count": total_scanned_count,
        "took-secs": (end_time - start_time).total_seconds(),
    }


if __name__ == "__main__":
    # Expose _find_malformed_urls as a command-line interface via python-fire;
    # its parameters become the command-line arguments.
    fire.Fire(_find_malformed_urls)