From b06ad1d71c7a5c064a29ca333e81c3dc448a73d1 Mon Sep 17 00:00:00 2001 From: Ramesh Maddegoda <94033485+ramesh-maddegoda@users.noreply.github.com> Date: Thu, 26 Feb 2026 11:38:32 -0800 Subject: [PATCH 1/2] PODAAC-7294: Support large file uploads (4GB+) with optimized file handling --- mcc/checker.wsgi | 9 ++-- mcc/web/file_utils.py | 119 +++++++++++++++++++++++++++--------------- mcc/web/form_utils.py | 27 +++++++--- mcc/web/server.py | 21 ++++++++ 4 files changed, 125 insertions(+), 51 deletions(-) diff --git a/mcc/checker.wsgi b/mcc/checker.wsgi index bd826eb..3b2f9b0 100644 --- a/mcc/checker.wsgi +++ b/mcc/checker.wsgi @@ -11,10 +11,13 @@ sys.path.append(os.path.dirname(__file__)) def application(environ, start_response): - os.environ['ApiMaxFileSize'] = environ.get('ApiMaxFileSize', '10737418240') - os.environ['UiMaxFileSize'] = environ.get('UiMaxFileSize', '4295000000') + # Increased API file size limit to 5GB (5 * 1024^3 = 5,368,709,120 bytes) + os.environ['ApiMaxFileSize'] = environ.get('ApiMaxFileSize', '5368709120') + # Increased UI file size limit to 5GB (5 * 1024^3 = 5,368,709,120 bytes) + os.environ['UiMaxFileSize'] = environ.get('UiMaxFileSize', '5368709120') os.environ['HomepageURL'] = environ.get('HomepageURL', '"https://mcc.podaac.earthdatacloud.nasa.gov"') - os.environ['TempFileLocation'] = environ.get('TempFileLocation', tempfile.gettempdir()) + # Use /tmp for temporary file storage to ensure enough space for 4GB+ files + os.environ['TempFileLocation'] = environ.get('TempFileLocation', '/tmp') os.environ['Venue'] = environ.get('Venue', 'OPS') os.environ['CF_STANDARD_NAME_TABLE'] = environ.get('CF_STANDARD_NAME_TABLE', '') diff --git a/mcc/web/file_utils.py b/mcc/web/file_utils.py index 2a1d4fb..cefcf27 100644 --- a/mcc/web/file_utils.py +++ b/mcc/web/file_utils.py @@ -23,26 +23,33 @@ logger = logging.getLogger(__name__) -def hash_file(infile, hasher=None, blocksize=65536): +def hash_file(infile, hasher=None, blocksize=8388608): # 8MB buffer (8 * 
1024 * 1024) """ - Incrementally generate file hashes (suitable for large files). + Incrementally generate file hashes (suitable for very large files). + Uses a large buffer size (8MB) for better performance with multi-GB files. @param infile a python file-like object @param hasher a hasher function from hashlib library (e.g. md5, sha256, ...) - @param blocksize an integer of bytes to read into the hash at a time + @param blocksize an integer of bytes to read into the hash at a time (default: 8MB) @return a hexdigest of the hash based on the hasher """ if hasher is None: hasher = md5() + # Save current file position + current_pos = infile.tell() + + # Go to beginning of file + infile.seek(0) + + # Read and hash in chunks buf = infile.read(blocksize) - while len(buf) > 0: hasher.update(buf) buf = infile.read(blocksize) - # Reset file buffer back to starting position - infile.seek(0) + # Reset file buffer back to original position + infile.seek(current_pos) return hasher.hexdigest() @@ -67,22 +74,8 @@ def format_byte_size(n_bytes): def decompress_file(infile, upload_filename): """ Opens gzip and bz2 files and returns the uncompressed data. - Uncompresses in pieces andand cuts off when limit is reached in order to - avoid 'zip bombs'. - - Solution suggested by Mark Adler: - http://stackoverflow.com/questions/13622706/how-to-protect-myself-from-a-gzip-or-bzip2-bomb - - The gist is that we decompress the file in chunks, counting the uncompressed - size of each and adding it to the total. If the filesize is OK, then - decompress the file and return data. This seems a little redundant, but - uncompressing the actual file in chunks and storing the data creates - crashing issues in the Docker container. - - TODO: Bz2 doesn't have the same ability as zlib to read in chunks, so a bomb is - just going to cause a memory error. So, given the extra time it takes to - decompress a bz2, I'm just assembling the chunks on the fly instead of - unzipping a second time. 
Investigate a workaround for this at some point. + Optimized for handling large files (4GB+) efficiently. + Uncompresses in pieces and cuts off when limit is reached to avoid 'zip bombs'. @param infile file object containing the data to decompress @param upload_filename name of the file to decompress @@ -90,8 +83,10 @@ def decompress_file(infile, upload_filename): """ app.logger.info("Decompressing file %s", upload_filename) - decompressed_file = tempfile.NamedTemporaryFile() - extension = os.path.splitext(upload_filename)[-1] + # Use the configured temporary directory for large file processing + temp_dir = app.config.get('TEMP_FILE_DIR', tempfile.gettempdir()) + decompressed_file = tempfile.NamedTemporaryFile(dir=temp_dir) + extension = os.path.splitext(upload_filename)[-1].lower() # Determine the correct open function based on the type of decompression if extension == ".gz": @@ -101,12 +96,16 @@ def decompress_file(infile, upload_filename): else: raise ValueError(f"Unknown file extension ({extension}) for decompression") + # Use a larger buffer size for better performance with large files + buffer_size = 8 * 1024 * 1024 # 8MB buffer + try: data_length = 0 reader = open_fn(infile) + # Stream decompression with a larger buffer while data_length < app.config['MAX_CONTENT_LENGTH']: - buf = reader.read(1024) + buf = reader.read(buffer_size) if len(buf) == 0: break @@ -114,29 +113,34 @@ def decompress_file(infile, upload_filename): data_length += len(buf) decompressed_file.write(buf) else: + # If we exit the loop without breaking, the file is too large decompressed_file.close() return abort( 400, f"The decompressed file size is too large. " f"Max decompressed file size is: {format_byte_size(app.config['MAX_CONTENT_LENGTH'])}. 
" f"Filename: {upload_filename}" ) + + # Close the reader to free resources + reader.close() + except (BadGzipFile, OSError, ValueError, TypeError, IOError, EOFError) as err: decompressed_file.close() raise ValueError( f'Failed to decompress {extension} file {upload_filename}, reason: {str(err)}.' ) except MemoryError: - # Noticed that some bzip bombs cause memory errors. There doesn't - # seem to be a ton of great ways around this. + # Handle memory errors from bzip bombs decompressed_file.close() return abort( - 400, f"The decompressed file size is too large. " + 400, f"The decompressed file size is too large or caused a memory error. " f"Max decompressed file size is: {format_byte_size(app.config['MAX_CONTENT_LENGTH'])}. " f"Filename: {upload_filename}" ) # Roll file pointer back to beginning of buffer now that decompression is complete decompressed_file.seek(0) + app.logger.info(f"Successfully decompressed {upload_filename} to size {format_byte_size(data_length)}") return decompressed_file @@ -144,24 +148,55 @@ def decompress_file(infile, upload_filename): def get_dataset_from_file(uploaded_file): """ Derives a netcdf4.Dataset object from the provided file upload dictionary. + Optimized for handling large files (4GB+) efficiently. - @param uploaded_file open file handle to the data to convert. + @param uploaded_file open file handle or path to the data to convert. """ app.logger.info("Attempting to get dataset from uploaded file %s", uploaded_file) - - datafile_name = uploaded_file.filename + + # Handle both file paths and file objects + if isinstance(uploaded_file, str): + # It's a file path + datafile_name = os.path.basename(uploaded_file) + is_path = True + else: + # It's a file object + datafile_name = uploaded_file.filename + is_path = False + check_valid_filename(datafile_name) - - # uploaded_file is an instance of werkzeug.FileStorage, which only allows us - # to read the file contents but not reference a location on disk. 
- # Copy the uploaded file to a named temporary file, so we can provide a disk - # location when initializing the netCDF.Dataset object below. - - # Since the CF suite checks the Dataset's filename for compliance, - # we need to make sure the original is incorporated into the name of the - # temporary file used to initialize the Dataset object below. - datafile = tempfile.NamedTemporaryFile(suffix=f"_{datafile_name}") - datafile.write(uploaded_file.read()) + + # For large files, we want to avoid loading the entire file into memory + # Instead, we'll use a temporary file and stream the data in chunks + + # Use the configured temporary directory for large file processing + temp_dir = app.config.get('TEMP_FILE_DIR', tempfile.gettempdir()) + + # Create a named temporary file in the specified directory + datafile = tempfile.NamedTemporaryFile(dir=temp_dir, suffix=f"_{datafile_name}") + + # Stream the file in chunks to avoid memory issues + if is_path: + # If it's a path, open the file and copy it + with open(uploaded_file, 'rb') as src_file: + # Use a larger buffer size (8MB) for faster copying of large files + buffer_size = 8 * 1024 * 1024 # 8MB buffer + while True: + buffer = src_file.read(buffer_size) + if not buffer: + break + datafile.write(buffer) + else: + # If it's a file object, stream from it + # Use a larger buffer size (8MB) for faster copying of large files + buffer_size = 8 * 1024 * 1024 # 8MB buffer + while True: + buffer = uploaded_file.read(buffer_size) + if not buffer: + break + datafile.write(buffer) + + # Reset file pointer to beginning datafile.seek(0) # Calculate hash and file size now, prior to any potential file decompression diff --git a/mcc/web/form_utils.py b/mcc/web/form_utils.py index 1c1f401..670e5fc 100644 --- a/mcc/web/form_utils.py +++ b/mcc/web/form_utils.py @@ -47,6 +47,7 @@ def get_tests(form_dict, checker_map): def parse_post_arguments(form_dict, files, checker_map): """ Parse a POST request from either an HTML page form or a 
cURL-like request. + Optimized for handling large files (4GB+) efficiently. Aborts response if no tests or no files. @param form_dict ImmutableMultiDict from flask or a regular @@ -57,10 +58,13 @@ def parse_post_arguments(form_dict, files, checker_map): @param files a dict with a flask file-like object @param checker_map a dict of checker short names to initialized checkers @return a dict with 'file', 'checkers', 'response' or abort() - # TODO determine if this is an additional place to verify upload size constraint """ + from flask import current_app + app = current_app + ret = {} + # Get the selected checkers checkers = get_tests(form_dict, checker_map) if not checkers: @@ -68,12 +72,23 @@ def parse_post_arguments(form_dict, files, checker_map): 400, "You need to choose at least one metadata convention to test your file against." ) - if 'file-upload' not in files: + # Check for file upload + # Try multiple possible field names for file uploads + uploaded_file = None + for field in ['file-upload', 'file', 'upload', 'fileUpload']: + if field in files and files[field]: + uploaded_file = files[field] + app.logger.info(f"Found file in field: {field}") + break + + if not uploaded_file: return abort(400, "Your request was empty. Please make sure you've specified a file.") - elif not files['file-upload']: - return abort(400, "There was a problem uploading your file. 
Please try again.") - - ret['file'] = files['file-upload'] + + # Log file information + app.logger.info(f"File received: {uploaded_file.filename}") + + # Return the file object directly - we'll handle streaming in get_dataset_from_file + ret['file'] = uploaded_file ret['checkers'] = checkers ret['response'] = form_dict.get('response', 'html').lower() diff --git a/mcc/web/server.py b/mcc/web/server.py index ea08835..112e5f2 100644 --- a/mcc/web/server.py +++ b/mcc/web/server.py @@ -7,7 +7,9 @@ """ +import os import time +import logging from os import environ from os.path import join @@ -21,6 +23,11 @@ from .form_utils import parse_post_arguments from .json_utils import CustomJSONEncoder +# Configure logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + app = Flask(__name__) # Style options for whitespace in templated HTML code. @@ -42,6 +49,20 @@ # Venue that MCC is deployed to (SIT, UAT, or OPS) app.config['Venue'] = str(environ['Venue']) +# Temporary directory for file processing +app.config['TEMP_FILE_DIR'] = environ.get('TempFileLocation', '/tmp') + +# Ensure the temporary directory exists and is writable +if not os.path.exists(app.config['TEMP_FILE_DIR']): + try: + os.makedirs(app.config['TEMP_FILE_DIR'], exist_ok=True) + logger.info(f"Created temporary directory: {app.config['TEMP_FILE_DIR']}") + except Exception as e: + logger.warning(f"Error creating temporary directory {app.config['TEMP_FILE_DIR']}: {str(e)}") + +if not os.access(app.config['TEMP_FILE_DIR'], os.W_OK): + logger.warning(f"Temporary directory {app.config['TEMP_FILE_DIR']} is not writable") + # Mapping of checker short names to CheckSuite implementations. # This could be easily kept up-to-date by inspection of a module, but # prefer the explicit writing of current checkers. 
From e3c04040e14489e94daa4742758560e073e39af2 Mon Sep 17 00:00:00 2001 From: Ramesh Maddegoda <94033485+ramesh-maddegoda@users.noreply.github.com> Date: Thu, 26 Feb 2026 13:17:40 -0800 Subject: [PATCH 2/2] PODAAC-7294: Add progress reporting and timeout options to API documentation --- mcc/web/templates/about_api.html | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mcc/web/templates/about_api.html b/mcc/web/templates/about_api.html index c4639af..e697395 100644 --- a/mcc/web/templates/about_api.html +++ b/mcc/web/templates/about_api.html @@ -191,9 +191,31 @@

POST Examples

Note that you need to append an '@' to the beginning of the filename in the CURL request when using the file-upload option.

+

Basic Examples

curl -L -F ACDD=on -F ACDD-version=1.3 -F file-upload=@/home/user/granule.nc -F response=json {{ homepage_url }}/check
 curl -L -F CF=on -F CF-version=1.7 -F file-upload=@/home/user/granule.nc -F response=html {{ homepage_url }}/check
 curl -L -F GDS2=on -F GDS2-parameter=L4 -F file-upload=@/home/user/granule.nc -F response=pdf {{ homepage_url }}/check
+ +

With Progress Reporting and Extended Timeouts

+

+ For large files, you may want to display transfer progress and raise the connection and total-transfer timeouts:

+
curl -L --progress-bar --connect-timeout 300 --max-time 3600 --keepalive-time 60 \
+    -F ACDD=on -F ACDD-version=1.3 -F file-upload=@/home/user/granule.nc -F response=json {{ homepage_url }}/check
+ +

With Detailed Progress

+
curl -L -v --progress-meter --connect-timeout 300 --max-time 3600 --keepalive-time 60 \
+    -F CF=on -F CF-version=1.7 -F file-upload=@/home/user/granule.nc -F response=html {{ homepage_url }}/check
+ +

Options Explained

+