Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions dataset/keys.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@ def list_keys_from_inventory(section_ids, action, inventoried_bucket_name, bucke
s3_client = boto3.client('s3')

try:
# Read the JSON contents of the most created inventory manifest file
# Read the JSON contents of the most recent inventory manifest file
manifest_json = get_most_recent_manifest(inventoried_bucket_name, bucket_name)

if manifest_json is None:
raise FileNotFoundError("No inventory manifest found in the last two days")

# Iterate over the Parquet files in the manifest, fetching and reading each one
# to get the list of keys, and filtering them based on the section IDs and action
all = []
Expand All @@ -28,6 +31,9 @@ def list_keys_from_inventory(section_ids, action, inventoried_bucket_name, bucke

return all

except FileNotFoundError:
# Bubble up so the job fails loudly instead of silently returning no data
raise
except Exception as e:
print(e)
return []
Expand All @@ -38,9 +44,11 @@ def list_keys_from_inventory(section_ids, action, inventoried_bucket_name, bucke
# The time lag exists because AWS generates the inventory files only once a day.
def get_most_recent_manifest(inventoried_bucket_name, bucket_name):

for i in range(1, 2):
attempted_keys = []
for i in range(1, 3): # yesterday, then two days ago
day = (datetime.datetime.utcnow() - datetime.timedelta(days=i)).strftime('%Y-%m-%dT01-00Z')
manifest_key = f'{inventoried_bucket_name}/{bucket_name}/{day}/manifest.json'
attempted_keys.append(manifest_key)

try:
s3_client = boto3.client('s3')
Expand All @@ -53,6 +61,8 @@ def get_most_recent_manifest(inventoried_bucket_name, bucket_name):
print(e)
continue

raise FileNotFoundError(f"No inventory manifest found for keys: {attempted_keys}")


def fetch_parquet(section_ids, action, s3_client, bucket_name, key):

Expand Down Expand Up @@ -113,4 +123,3 @@ def list_keys(bucket_name, section_id, action):
break

return files

14 changes: 6 additions & 8 deletions tests/test_keys.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,11 @@ def test_list_keys_from_inventory_exception_handling(self, mock_boto_client):
mock_s3_client.get_object.side_effect = Exception("S3 Error")
mock_boto_client.return_value = mock_s3_client

result = list_keys_from_inventory(
self.section_ids, self.action,
self.inventory_bucket_name, self.bucket_name
)

# Should return empty list on exception
self.assertEqual(result, [])
with self.assertRaises(FileNotFoundError):
list_keys_from_inventory(
self.section_ids, self.action,
self.inventory_bucket_name, self.bucket_name
)

@patch('boto3.client')
def test_get_most_recent_manifest_yesterday(self, mock_boto_client):
Expand Down Expand Up @@ -275,4 +273,4 @@ def test_list_keys_no_contents(self, mock_boto_client):
self.assertEqual(result, [])

if __name__ == '__main__':
unittest.main()
unittest.main()