From eb0236061fcbf5ac89165672c49d07c017f6b623 Mon Sep 17 00:00:00 2001 From: Darren Siegel Date: Fri, 19 Dec 2025 09:48:34 -0500 Subject: [PATCH 1/2] check two days ago fix --- dataset/keys.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/dataset/keys.py b/dataset/keys.py index a3e9ee2..86c6455 100644 --- a/dataset/keys.py +++ b/dataset/keys.py @@ -15,9 +15,12 @@ def list_keys_from_inventory(section_ids, action, inventoried_bucket_name, bucke s3_client = boto3.client('s3') try: - # Read the JSON contents of the most created inventory manifest file + # Read the JSON contents of the most recent inventory manifest file manifest_json = get_most_recent_manifest(inventoried_bucket_name, bucket_name) + if manifest_json is None: + raise FileNotFoundError("No inventory manifest found in the last two days") + # Iterate over the Parquet files in the manifest, fetching and reading each one # to get the list of keys, and filtering them based on the section IDs and action all = [] @@ -28,6 +31,9 @@ def list_keys_from_inventory(section_ids, action, inventoried_bucket_name, bucke return all + except FileNotFoundError: + # Bubble up so the job fails loudly instead of silently returning no data + raise except Exception as e: print(e) return [] @@ -38,9 +44,11 @@ def list_keys_from_inventory(section_ids, action, inventoried_bucket_name, bucke # The time lag is due to the fact that the inventory files in AWS are generated once a day. def get_most_recent_manifest(inventoried_bucket_name, bucket_name): - for i in range(1, 2): + attempted_keys = [] + for i in range(1, 3): # yesterday, then two days ago day = (datetime.datetime.utcnow() - datetime.timedelta(days=i)).strftime('%Y-%m-%dT01-00Z') manifest_key = f'{inventoried_bucket_name}/{bucket_name}/{day}/manifest.json' + attempted_keys.append(manifest_key) try: s3_client = boto3.client('s3') @@ -53,6 +61,8 @@ def get_most_recent_manifest(inventoried_bucket_name, bucket_name): print(e) continue + raise FileNotFoundError(f"No inventory manifest found for keys: {attempted_keys}") + def fetch_parquet(section_ids, action, s3_client, bucket_name, key): @@ -113,4 +123,3 @@ def list_keys(bucket_name, section_id, action): break return files - From ac850d2cf653a284238131b1303fb9897729740e Mon Sep 17 00:00:00 2001 From: Darren Siegel Date: Fri, 19 Dec 2025 09:59:03 -0500 Subject: [PATCH 2/2] update test --- tests/test_keys.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/test_keys.py b/tests/test_keys.py index ff3800d..a9d6253 100644 --- a/tests/test_keys.py +++ b/tests/test_keys.py @@ -53,13 +53,11 @@ def test_list_keys_from_inventory_exception_handling(self, mock_boto_client): mock_s3_client.get_object.side_effect = Exception("S3 Error") mock_boto_client.return_value = mock_s3_client - result = list_keys_from_inventory( - self.section_ids, self.action, - self.inventory_bucket_name, self.bucket_name - ) - - # Should return empty list on exception - self.assertEqual(result, []) + with self.assertRaises(FileNotFoundError): + list_keys_from_inventory( + self.section_ids, self.action, + self.inventory_bucket_name, self.bucket_name + ) @patch('boto3.client') def test_get_most_recent_manifest_yesterday(self, mock_boto_client): @@ -275,4 +273,4 @@ def test_list_keys_no_contents(self, mock_boto_client): self.assertEqual(result, []) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main()