From 9e995926721d7e1923da21ed620b3a2422e5d390 Mon Sep 17 00:00:00 2001
From: "Abuzar Mahmood (aider)"
Date: Sat, 8 Feb 2025 17:58:20 -0500
Subject: [PATCH] feat: Implement auto-renaming for file uploads to prevent S3 filename conflicts

---
 src/_tests/s3_test.py | 36 +++++++++++++++++++++++++++++-------
 src/pages/Upload.py   | 14 +++++++++++++-
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/src/_tests/s3_test.py b/src/_tests/s3_test.py
index 0407027..8baa77a 100644
--- a/src/_tests/s3_test.py
+++ b/src/_tests/s3_test.py
@@ -2,18 +2,40 @@
 import os
 import streamlit as st
 import pandas as pd
+import pytest
+import time
+from pages.Upload import get_unique_filename
 
 s3 = s3fs.S3FileSystem(
     anon=False,
     key = st.secrets["S3_KEY"],
     secret = st.secrets["S3_SECRET"]
     )
 
-s3.ls('s3://ocr-database-s3')
-wanted_files = [x for x in os.listdir('.') if x.endswith('.csv')][0]
+def test_get_unique_filename():
+    # Setup test environment
+    test_path = f's3://{st.secrets["S3_BUCKET_NAME"]}/test'
+    test_filename = 'test_file.pdf'
+
+    # Create a test file
+    with s3.open(os.path.join(test_path, test_filename), 'wb') as f:
+        f.write(b'test content')
+
+    # Test unique filename generation
+    unique_name = get_unique_filename(s3, test_path, test_filename)
+    assert unique_name != test_filename
+    assert unique_name.startswith('test_file_')
+    assert unique_name.endswith('.pdf')
+
+    # Cleanup
+    s3.rm(os.path.join(test_path, test_filename))
 
-# Upload file
-s3.put(wanted_files, 's3://ocr-database-s3/' + wanted_files)
-
-# Load cvs to dataframe
-df = pd.read_csv(s3.open('s3://ocr-database-s3/' + wanted_files))
+def test_file_upload():
+    wanted_files = [x for x in os.listdir('.') if x.endswith('.csv')][0]
+
+    # Upload file
+    s3.put(wanted_files, 's3://ocr-database-s3/' + wanted_files)
+
+    # Load csv to dataframe
+    df = pd.read_csv(s3.open('s3://ocr-database-s3/' + wanted_files))
+    assert not df.empty
diff --git a/src/pages/Upload.py b/src/pages/Upload.py
index cd57a92..1027889 100644
--- a/src/pages/Upload.py
+++ b/src/pages/Upload.py
@@ -6,6 +6,17 @@
 from datetime import datetime
 from PyPDF2 import PdfWriter, PdfReader
 import s3fs
+import time
+
+def get_unique_filename(s3, path, filename):
+    """Generate a unique filename if a conflict is detected"""
+    base, ext = os.path.splitext(filename)
+    counter = 1
+    new_filename = filename
+    while s3.exists(os.path.join(path, new_filename)):
+        new_filename = f"{base}_{int(time.time())}_{counter}{ext}"
+        counter += 1
+    return new_filename
 
 s3 = s3fs.S3FileSystem(
     anon=False,
@@ -56,9 +67,10 @@
         output = PdfWriter()
         output.add_page(page)
 
+        base_filename = get_unique_filename(s3, save_path, uploaded_file.name)
         save_page_path = os.path.join(
             save_path,
-            uploaded_file.name.split('.')[0] + '_' + str(i) + '.pdf'
+            base_filename.split('.')[0] + '_' + str(i) + '.pdf'
             )
 
         # with open(save_page_path, 'wb') as f: