def _fix_invalid_pointers(table_name: str) -> dict[str, Any]:
    """Scan ``table_name`` and correct one known-bad type display in place.

    Every scanned item whose ``created_on`` is earlier than 2025-01-20 (UTC)
    has its ``document`` parsed as a ``DocumentReference``. When the first
    type coding's display is ``"Mental Health Crisis plan"`` it is rewritten
    to ``"Mental health crisis plan"`` and the document is saved back under
    the key ``pk = sk = "D#<pointer id>"``.

    Args:
        table_name: Name of the DynamoDB table to scan and update.

    Returns:
        A dict with ``fixed_pointers`` (ids that were rewritten),
        ``scanned_count`` (sum of the pages' ``ScannedCount``) and
        ``fix-took-secs`` (wall-clock duration in seconds).

    Side effects:
        Updates items in the table, prints progress to stdout and, when at
        least one pointer was fixed, writes the ids to ``./fixed_pointers.txt``.
    """
    print(f"Finding and fixing invalid pointers in table {table_name}....")

    cutoff = datetime(2025, 1, 20, tzinfo=timezone.utc)
    params: dict[str, Any] = {
        "TableName": table_name,
        "PaginationConfig": {"PageSize": 50},
        "FilterExpression": "created_on < :date",
        "ExpressionAttributeValues": {
            ":date": {"S": "2025-01-20T00:00:00.000000+0000"}
        },
    }

    # Build the table handle once instead of once per updated item.
    table = resource.Table(table_name)
    fixed_pointers: list[str] = []
    total_scanned_count = 0

    start_time = datetime.now(tz=timezone.utc)

    for page in paginator.paginate(**params):
        for item in page["Items"]:
            pointer_id = item.get("id", {}).get("S")
            try:
                # Parsed inside the try so that one missing or malformed
                # timestamp is reported and skipped rather than aborting the
                # whole scan.
                created_on = datetime.strptime(
                    item.get("created_on", {}).get("S"),
                    "%Y-%m-%dT%H:%M:%S.%f%z",
                )
                # Client-side repeat of the scan filter, kept as a safeguard.
                if created_on > cutoff:
                    continue

                document = item.get("document", {}).get("S", "")
                docref = DocumentReference.model_validate_json(document)
                coding = docref.type.coding[0]
                if coding.display != "Mental Health Crisis plan":
                    continue

                coding.display = "Mental health crisis plan"
                # NOTE(review): serialised with default options, exactly as
                # the deprecated ``.json()`` did; confirm whether stored
                # documents are expected to omit ``None`` fields.
                table.update_item(
                    Key={"pk": f"D#{pointer_id}", "sk": f"D#{pointer_id}"},
                    UpdateExpression="SET document = :d",
                    ExpressionAttributeValues={":d": docref.model_dump_json()},
                )
                fixed_pointers.append(pointer_id)
                # One marker per 100 fixes; checked only right after a fix so
                # it is not repeated while the count stands still (or is 0).
                if len(fixed_pointers) % 100 == 0:
                    print("x", end="", flush=True)
            except Exception as exc:
                print(f"Failed to fix document {pointer_id}: {exc}")

        previous_count = total_scanned_count
        total_scanned_count += page["ScannedCount"]

        # Report on crossing a boundary rather than landing exactly on it:
        # a page's ScannedCount is not guaranteed to be a fixed size.
        if total_scanned_count // 1000 > previous_count // 1000:
            print(".", end="", flush=True)

        if total_scanned_count // 100000 > previous_count // 100000:
            print(f"scanned={total_scanned_count} fixed={len(fixed_pointers)}")

    end_time = datetime.now(tz=timezone.utc)

    print(f" Done. Fixed {len(fixed_pointers)} invalid pointers")

    if fixed_pointers:
        print("Writing fixed pointers IDs to file ./fixed_pointers.txt ...")
        with open("fixed_pointers.txt", "w") as f:
            f.writelines(f"{_id}\n" for _id in fixed_pointers)

    return {
        "fixed_pointers": fixed_pointers,
        "scanned_count": total_scanned_count,
        "fix-took-secs": (end_time - start_time).total_seconds(),
    }


def _load_pointer_ids(file_path: str) -> list[str]:
    """Read pointer ids from a text file, one per line.

    Only the text before the first ``":"`` on each line is used. Surrounding
    whitespace (including the line terminator) is stripped and blank lines
    are ignored, so a plain id-per-line file yields usable ids.
    """
    with open(file_path, "r") as f:
        candidates = [line.split(":")[0].strip() for line in f]
    return [pointer_id for pointer_id in candidates if pointer_id]


def _apply_known_fixes(docref: Any) -> bool:
    """Apply the known content corrections to ``docref`` in place.

    Only the first ``content`` entry is inspected. Returns True when at least
    one field was changed, so the caller can persist the document once.
    """
    content = docref.content[0]
    attachment = content.attachment
    changed = False

    # A content type that starts with "application/pdf" but carries extra
    # trailing text is reduced to the bare type.
    if (
        attachment.contentType.startswith("application/pdf")
        and attachment.contentType != "application/pdf"
    ):
        attachment.contentType = "application/pdf"
        changed = True

    # Attachments addressed with ssp:// are forced to application/pdf.
    if (
        attachment.url.startswith("ssp://")
        and attachment.contentType != "application/pdf"
    ):
        attachment.contentType = "application/pdf"
        changed = True

    # NOTE(review): the original called str.replace here and discarded the
    # result, so this rewrite never took effect and was never saved. It is
    # now applied and persisted; confirm the https -> ssp rewrite is wanted
    # before running against live data.
    if (
        attachment.contentType == "application/pdf"
        and attachment.url.endswith("pdf")
        and attachment.url.startswith("https://")
    ):
        attachment.url = attachment.url.replace("https://", "ssp://", 1)
        changed = True

    fmt = content.format
    if (
        fmt.code == "urn:nhs-ic:record-contact"
        or fmt.display == "Contact details (HTTP Unsecured)"
    ):
        fmt.code = "urn:nhs-ic:unstructured"
        fmt.display = "Unstructured Document"
        changed = True

    return changed


def _fix_invalid_pointers_from_file(table_name: str, file_path: str) -> dict[str, Any]:
    """Fetch each pointer listed in ``file_path`` and apply the known fixes.

    Each id is looked up with the key ``pk = sk = "D#<pointer id>"``. Its
    document is parsed as a ``DocumentReference``, corrected by
    ``_apply_known_fixes`` and, if anything changed, written back with a
    single update. A pointer is therefore recorded at most once, even when
    several corrections apply to it.

    An earlier pass that rewrote the NRL-FormatCode-1 code system URL was
    commented out in the original script and is not performed here.

    Args:
        table_name: Name of the DynamoDB table holding the pointers.
        file_path: Text file with one pointer id per line; anything after
            the first ``":"`` on a line is ignored.

    Returns:
        A dict with ``fixed_pointers``, ``total_fixed_count`` and
        ``fix-took-secs`` (wall-clock duration in seconds).

    Side effects:
        Updates items in the table, prints progress to stdout and, when at
        least one pointer was fixed, writes the ids to
        ``./fixed_pointers_from_file.txt``.
    """
    print(f"Fixing invalid pointers from file {file_path} in table {table_name}....")

    pointer_ids = _load_pointer_ids(file_path)
    table = resource.Table(table_name)
    fixed_pointers: list[str] = []

    start_time = datetime.now(tz=timezone.utc)

    for pointer_id in pointer_ids:
        try:
            response = dynamodb.get_item(
                TableName=table_name,
                Key={
                    "pk": {"S": f"D#{pointer_id}"},
                    "sk": {"S": f"D#{pointer_id}"},
                },
            )
            item = response.get("Item")
            if not item:
                print(f"Pointer {pointer_id} not found.")
                continue

            document = item.get("document", {}).get("S", "")
            docref = DocumentReference.model_validate_json(document)
            if not _apply_known_fixes(docref):
                continue

            table.update_item(
                Key={"pk": f"D#{pointer_id}", "sk": f"D#{pointer_id}"},
                UpdateExpression="SET document = :d",
                ExpressionAttributeValues={":d": docref.model_dump_json()},
            )
            fixed_pointers.append(pointer_id)
            # One marker per 100 fixes, emitted only when the count advances.
            if len(fixed_pointers) % 100 == 0:
                print("x", end="", flush=True)
        except Exception as exc:
            print(f"Failed to fix document {pointer_id}: {exc}")

    end_time = datetime.now(tz=timezone.utc)

    print(f" Done. Fixed {len(fixed_pointers)} invalid pointers")

    if fixed_pointers:
        print("Writing fixed pointers IDs to file ./fixed_pointers_from_file.txt ...")
        with open("fixed_pointers_from_file.txt", "w") as f:
            f.writelines(f"{_id}\n" for _id in fixed_pointers)

    return {
        "fixed_pointers": fixed_pointers,
        "total_fixed_count": len(fixed_pointers),
        "fix-took-secs": (end_time - start_time).total_seconds(),
    }
def _verify_pointers_from_file(table_name: str, file_path: str) -> dict[str, Any]:
    """Re-validate the pointers listed in ``file_path`` against the table.

    Each id is looked up with the key ``pk = sk = "D#<pointer id>"`` and its
    ``document`` is passed to ``_validate_document``. Ids that are not found
    are reported and counted as neither verified nor invalid. Any exception
    raised while fetching or validating a pointer puts it in the invalid list.

    Args:
        table_name: Name of the DynamoDB table holding the pointers.
        file_path: Text file with one pointer id per line; anything after
            the first ``":"`` on a line is ignored.

    Returns:
        A dict with ``verified_pointers``, ``invalid_pointers``,
        ``total_verified_count`` and ``verify-took-secs`` (wall-clock
        duration in seconds).

    Side effects:
        Prints progress to stdout and, when any pointer failed, writes the
        failing ids to ``./invalid_pointers_from_file.txt``.
    """
    print(f"Verifying pointers from file {file_path} in table {table_name}....")

    # Strip each id: without it, a line with no ":" keeps its trailing
    # newline and the lookup key never matches. Blank lines are skipped.
    with open(file_path, "r") as f:
        pointer_ids = [
            pointer_id
            for line in f
            if (pointer_id := line.split(":")[0].strip())
        ]

    verified_pointers: list[str] = []
    invalid_pointers: list[str] = []

    start_time = datetime.now(tz=timezone.utc)

    for pointer_id in pointer_ids:
        try:
            response = dynamodb.get_item(
                TableName=table_name,
                Key={
                    "pk": {"S": f"D#{pointer_id}"},
                    "sk": {"S": f"D#{pointer_id}"},
                },
            )
            item = response.get("Item")
            if not item:
                print(f"Pointer {pointer_id} not found.")
                continue

            document = item.get("document", {}).get("S", "")
            _validate_document(document)
        except Exception as exc:
            print(f"Failed to verify document {pointer_id}: {exc}")
            invalid_pointers.append(pointer_id)
        else:
            verified_pointers.append(pointer_id)
            # One marker per 100 verified pointers, emitted only when the
            # count advances (not for every failure while it stands still).
            if len(verified_pointers) % 100 == 0:
                print("x", end="", flush=True)

    end_time = datetime.now(tz=timezone.utc)

    print(f" Done. Verified {len(verified_pointers)} pointers")

    if invalid_pointers:
        print("Writing invalid pointers IDs to file ./invalid_pointers_from_file.txt ...")
        with open("invalid_pointers_from_file.txt", "w") as f:
            f.writelines(f"{_id}\n" for _id in invalid_pointers)

    return {
        "verified_pointers": verified_pointers,
        "invalid_pointers": invalid_pointers,
        "total_verified_count": len(verified_pointers),
        "verify-took-secs": (end_time - start_time).total_seconds(),
    }


if __name__ == "__main__":
    # Expose each operation as a named sub-command of the script.
    fire.Fire({
        "find_and_delete_invalid_pointers": _find_and_delete_invalid_pointers,
        "fix_invalid_pointers": _fix_invalid_pointers,
        "find_invalid_pointers": _find_invalid_pointers,
        "fix_invalid_pointers_from_file": _fix_invalid_pointers_from_file,
        "verify_pointers_from_file": _verify_pointers_from_file,
    })
"%Y-%m-%dT%H:%M:%S.%f%z") - if created_on > datetime(2025, 1, 20, tzinfo=timezone.utc): - continue document = item.get("document", {}).get("S", "") try: _validate_document(document)