From b06ad1d71c7a5c064a29ca333e81c3dc448a73d1 Mon Sep 17 00:00:00 2001 From: Ramesh Maddegoda <94033485+ramesh-maddegoda@users.noreply.github.com> Date: Thu, 26 Feb 2026 11:38:32 -0800 Subject: [PATCH 1/2] PODAAC-7294: Support large file uploads (4GB+) with optimized file handling --- mcc/checker.wsgi | 9 ++-- mcc/web/file_utils.py | 119 +++++++++++++++++++++++++++--------------- mcc/web/form_utils.py | 27 +++++++--- mcc/web/server.py | 21 ++++++++ 4 files changed, 125 insertions(+), 51 deletions(-) diff --git a/mcc/checker.wsgi b/mcc/checker.wsgi index bd826eb..3b2f9b0 100644 --- a/mcc/checker.wsgi +++ b/mcc/checker.wsgi @@ -11,10 +11,13 @@ sys.path.append(os.path.dirname(__file__)) def application(environ, start_response): - os.environ['ApiMaxFileSize'] = environ.get('ApiMaxFileSize', '10737418240') - os.environ['UiMaxFileSize'] = environ.get('UiMaxFileSize', '4295000000') + # Increased API file size limit to 5GB (5 * 1024^3 = 5,368,709,120 bytes) + os.environ['ApiMaxFileSize'] = environ.get('ApiMaxFileSize', '5368709120') + # Increased UI file size limit to 5GB (5 * 1024^3 = 5,368,709,120 bytes) + os.environ['UiMaxFileSize'] = environ.get('UiMaxFileSize', '5368709120') os.environ['HomepageURL'] = environ.get('HomepageURL', '"https://mcc.podaac.earthdatacloud.nasa.gov"') - os.environ['TempFileLocation'] = environ.get('TempFileLocation', tempfile.gettempdir()) + # Use /tmp for temporary file storage to ensure enough space for 4GB+ files + os.environ['TempFileLocation'] = environ.get('TempFileLocation', '/tmp') os.environ['Venue'] = environ.get('Venue', 'OPS') os.environ['CF_STANDARD_NAME_TABLE'] = environ.get('CF_STANDARD_NAME_TABLE', '') diff --git a/mcc/web/file_utils.py b/mcc/web/file_utils.py index 2a1d4fb..cefcf27 100644 --- a/mcc/web/file_utils.py +++ b/mcc/web/file_utils.py @@ -23,26 +23,33 @@ logger = logging.getLogger(__name__) -def hash_file(infile, hasher=None, blocksize=65536): +def hash_file(infile, hasher=None, blocksize=8388608): # 8MB buffer (8 * 
1024 * 1024) """ - Incrementally generate file hashes (suitable for large files). + Incrementally generate file hashes (suitable for very large files). + Uses a large buffer size (8MB) for better performance with multi-GB files. @param infile a python file-like object @param hasher a hasher function from hashlib library (e.g. md5, sha256, ...) - @param blocksize an integer of bytes to read into the hash at a time + @param blocksize an integer of bytes to read into the hash at a time (default: 8MB) @return a hexdigest of the hash based on the hasher """ if hasher is None: hasher = md5() + # Save current file position + current_pos = infile.tell() + + # Go to beginning of file + infile.seek(0) + + # Read and hash in chunks buf = infile.read(blocksize) - while len(buf) > 0: hasher.update(buf) buf = infile.read(blocksize) - # Reset file buffer back to starting position - infile.seek(0) + # Reset file buffer back to original position + infile.seek(current_pos) return hasher.hexdigest() @@ -67,22 +74,8 @@ def format_byte_size(n_bytes): def decompress_file(infile, upload_filename): """ Opens gzip and bz2 files and returns the uncompressed data. - Uncompresses in pieces andand cuts off when limit is reached in order to - avoid 'zip bombs'. - - Solution suggested by Mark Adler: - http://stackoverflow.com/questions/13622706/how-to-protect-myself-from-a-gzip-or-bzip2-bomb - - The gist is that we decompress the file in chunks, counting the uncompressed - size of each and adding it to the total. If the filesize is OK, then - decompress the file and return data. This seems a little redundant, but - uncompressing the actual file in chunks and storing the data creates - crashing issues in the Docker container. - - TODO: Bz2 doesn't have the same ability as zlib to read in chunks, so a bomb is - just going to cause a memory error. So, given the extra time it takes to - decompress a bz2, I'm just assembling the chunks on the fly instead of - unzipping a second time. 
Investigate a workaround for this at some point. + Optimized for handling large files (4GB+) efficiently. + Uncompresses in pieces and cuts off when limit is reached to avoid 'zip bombs'. @param infile file object containing the data to decompress @param upload_filename name of the file to decompress @@ -90,8 +83,10 @@ def decompress_file(infile, upload_filename): """ app.logger.info("Decompressing file %s", upload_filename) - decompressed_file = tempfile.NamedTemporaryFile() - extension = os.path.splitext(upload_filename)[-1] + # Use the configured temporary directory for large file processing + temp_dir = app.config.get('TEMP_FILE_DIR', tempfile.gettempdir()) + decompressed_file = tempfile.NamedTemporaryFile(dir=temp_dir) + extension = os.path.splitext(upload_filename)[-1].lower() # Determine the correct open function based on the type of decompression if extension == ".gz": @@ -101,12 +96,16 @@ def decompress_file(infile, upload_filename): else: raise ValueError(f"Unknown file extension ({extension}) for decompression") + # Use a larger buffer size for better performance with large files + buffer_size = 8 * 1024 * 1024 # 8MB buffer + try: data_length = 0 reader = open_fn(infile) + # Stream decompression with a larger buffer while data_length < app.config['MAX_CONTENT_LENGTH']: - buf = reader.read(1024) + buf = reader.read(buffer_size) if len(buf) == 0: break @@ -114,29 +113,34 @@ def decompress_file(infile, upload_filename): data_length += len(buf) decompressed_file.write(buf) else: + # If we exit the loop without breaking, the file is too large decompressed_file.close() return abort( 400, f"The decompressed file size is too large. " f"Max decompressed file size is: {format_byte_size(app.config['MAX_CONTENT_LENGTH'])}. 
" f"Filename: {upload_filename}" ) + + # Close the reader to free resources + reader.close() + except (BadGzipFile, OSError, ValueError, TypeError, IOError, EOFError) as err: decompressed_file.close() raise ValueError( f'Failed to decompress {extension} file {upload_filename}, reason: {str(err)}.' ) except MemoryError: - # Noticed that some bzip bombs cause memory errors. There doesn't - # seem to be a ton of great ways around this. + # Handle memory errors from bzip bombs decompressed_file.close() return abort( - 400, f"The decompressed file size is too large. " + 400, f"The decompressed file size is too large or caused a memory error. " f"Max decompressed file size is: {format_byte_size(app.config['MAX_CONTENT_LENGTH'])}. " f"Filename: {upload_filename}" ) # Roll file pointer back to beginning of buffer now that decompression is complete decompressed_file.seek(0) + app.logger.info(f"Successfully decompressed {upload_filename} to size {format_byte_size(data_length)}") return decompressed_file @@ -144,24 +148,55 @@ def decompress_file(infile, upload_filename): def get_dataset_from_file(uploaded_file): """ Derives a netcdf4.Dataset object from the provided file upload dictionary. + Optimized for handling large files (4GB+) efficiently. - @param uploaded_file open file handle to the data to convert. + @param uploaded_file open file handle or path to the data to convert. """ app.logger.info("Attempting to get dataset from uploaded file %s", uploaded_file) - - datafile_name = uploaded_file.filename + + # Handle both file paths and file objects + if isinstance(uploaded_file, str): + # It's a file path + datafile_name = os.path.basename(uploaded_file) + is_path = True + else: + # It's a file object + datafile_name = uploaded_file.filename + is_path = False + check_valid_filename(datafile_name) - - # uploaded_file is an instance of werkzeug.FileStorage, which only allows us - # to read the file contents but not reference a location on disk. 
- # Copy the uploaded file to a named temporary file, so we can provide a disk - # location when initializing the netCDF.Dataset object below. - - # Since the CF suite checks the Dataset's filename for compliance, - # we need to make sure the original is incorporated into the name of the - # temporary file used to initialize the Dataset object below. - datafile = tempfile.NamedTemporaryFile(suffix=f"_{datafile_name}") - datafile.write(uploaded_file.read()) + + # For large files, we want to avoid loading the entire file into memory + # Instead, we'll use a temporary file and stream the data in chunks + + # Use the configured temporary directory for large file processing + temp_dir = app.config.get('TEMP_FILE_DIR', tempfile.gettempdir()) + + # Create a named temporary file in the specified directory + datafile = tempfile.NamedTemporaryFile(dir=temp_dir, suffix=f"_{datafile_name}") + + # Stream the file in chunks to avoid memory issues + if is_path: + # If it's a path, open the file and copy it + with open(uploaded_file, 'rb') as src_file: + # Use a larger buffer size (8MB) for faster copying of large files + buffer_size = 8 * 1024 * 1024 # 8MB buffer + while True: + buffer = src_file.read(buffer_size) + if not buffer: + break + datafile.write(buffer) + else: + # If it's a file object, stream from it + # Use a larger buffer size (8MB) for faster copying of large files + buffer_size = 8 * 1024 * 1024 # 8MB buffer + while True: + buffer = uploaded_file.read(buffer_size) + if not buffer: + break + datafile.write(buffer) + + # Reset file pointer to beginning datafile.seek(0) # Calculate hash and file size now, prior to any potential file decompression diff --git a/mcc/web/form_utils.py b/mcc/web/form_utils.py index 1c1f401..670e5fc 100644 --- a/mcc/web/form_utils.py +++ b/mcc/web/form_utils.py @@ -47,6 +47,7 @@ def get_tests(form_dict, checker_map): def parse_post_arguments(form_dict, files, checker_map): """ Parse a POST request from either an HTML page form or a 
cURL-like request. + Optimized for handling large files (4GB+) efficiently. Aborts response if no tests or no files. @param form_dict ImmutableMultiDict from flask or a regular @@ -57,10 +58,13 @@ def parse_post_arguments(form_dict, files, checker_map): @param files a dict with a flask file-like object @param checker_map a dict of checker short names to initialized checkers @return a dict with 'file', 'checkers', 'response' or abort() - # TODO determine if this is an additional place to verify upload size constraint """ + from flask import current_app + app = current_app + ret = {} + # Get the selected checkers checkers = get_tests(form_dict, checker_map) if not checkers: @@ -68,12 +72,23 @@ def parse_post_arguments(form_dict, files, checker_map): 400, "You need to choose at least one metadata convention to test your file against." ) - if 'file-upload' not in files: + # Check for file upload + # Try multiple possible field names for file uploads + uploaded_file = None + for field in ['file-upload', 'file', 'upload', 'fileUpload']: + if field in files and files[field]: + uploaded_file = files[field] + app.logger.info(f"Found file in field: {field}") + break + + if not uploaded_file: return abort(400, "Your request was empty. Please make sure you've specified a file.") - elif not files['file-upload']: - return abort(400, "There was a problem uploading your file. 
Please try again.") - - ret['file'] = files['file-upload'] + + # Log file information + app.logger.info(f"File received: {uploaded_file.filename}") + + # Return the file object directly - we'll handle streaming in get_dataset_from_file + ret['file'] = uploaded_file ret['checkers'] = checkers ret['response'] = form_dict.get('response', 'html').lower() diff --git a/mcc/web/server.py b/mcc/web/server.py index ea08835..112e5f2 100644 --- a/mcc/web/server.py +++ b/mcc/web/server.py @@ -7,7 +7,9 @@ """ +import os import time +import logging from os import environ from os.path import join @@ -21,6 +23,11 @@ from .form_utils import parse_post_arguments from .json_utils import CustomJSONEncoder +# Configure logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + app = Flask(__name__) # Style options for whitespace in templated HTML code. @@ -42,6 +49,20 @@ # Venue that MCC is deployed to (SIT, UAT, or OPS) app.config['Venue'] = str(environ['Venue']) +# Temporary directory for file processing +app.config['TEMP_FILE_DIR'] = environ.get('TempFileLocation', '/tmp') + +# Ensure the temporary directory exists and is writable +if not os.path.exists(app.config['TEMP_FILE_DIR']): + try: + os.makedirs(app.config['TEMP_FILE_DIR'], exist_ok=True) + logger.info(f"Created temporary directory: {app.config['TEMP_FILE_DIR']}") + except Exception as e: + logger.warning(f"Error creating temporary directory {app.config['TEMP_FILE_DIR']}: {str(e)}") + +if not os.access(app.config['TEMP_FILE_DIR'], os.W_OK): + logger.warning(f"Temporary directory {app.config['TEMP_FILE_DIR']} is not writable") + # Mapping of checker short names to CheckSuite implementations. # This could be easily kept up-to-date by inspection of a module, but # prefer the explicit writing of current checkers. 
From e3c04040e14489e94daa4742758560e073e39af2 Mon Sep 17 00:00:00 2001 From: Ramesh Maddegoda <94033485+ramesh-maddegoda@users.noreply.github.com> Date: Thu, 26 Feb 2026 13:17:40 -0800 Subject: [PATCH 2/2] PODAAC-7294: Add progress reporting and timeout options to API documentation --- mcc/web/templates/about_api.html | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mcc/web/templates/about_api.html b/mcc/web/templates/about_api.html index c4639af..e697395 100644 --- a/mcc/web/templates/about_api.html +++ b/mcc/web/templates/about_api.html @@ -191,9 +191,31 @@

POST Examples

Note that you need to append an '@' to the beginning of the filename in the CURL request when using the file-upload option.

+

Basic Examples

curl -L -F ACDD=on -F ACDD-version=1.3 -F file-upload=@/home/user/granule.nc -F response=json {{ homepage_url }}/check
 curl -L -F CF=on -F CF-version=1.7 -F file-upload=@/home/user/granule.nc -F response=html {{ homepage_url }}/check
 curl -L -F GDS2=on -F GDS2-parameter=L4 -F file-upload=@/home/user/granule.nc -F response=pdf {{ homepage_url }}/check
+ +

With Progress Reporting and Extended Timeouts

+

+ For large files, you may want to display transfer progress and raise the connection and total-transfer timeouts:

+
curl -L --progress-bar --connect-timeout 300 --max-time 3600 --keepalive-time 60 \
+    -F ACDD=on -F ACDD-version=1.3 -F file-upload=@/home/user/granule.nc -F response=json {{ homepage_url }}/check
+ +

With Detailed Progress

+
curl -L -v --progress-meter --connect-timeout 300 --max-time 3600 --keepalive-time 60 \
+    -F CF=on -F CF-version=1.7 -F file-upload=@/home/user/granule.nc -F response=html {{ homepage_url }}/check
+ +

Options Explained

+