Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions mcc/checker.wsgi
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@ sys.path.append(os.path.dirname(__file__))


def application(environ, start_response):
os.environ['ApiMaxFileSize'] = environ.get('ApiMaxFileSize', '10737418240')
os.environ['UiMaxFileSize'] = environ.get('UiMaxFileSize', '4295000000')
# Increased API file size limit to 5GB (5 * 1024^3 = 5,368,709,120 bytes)
os.environ['ApiMaxFileSize'] = environ.get('ApiMaxFileSize', '5368709120')
# Increased UI file size limit to 5GB (5 * 1024^3 = 5,368,709,120 bytes)
os.environ['UiMaxFileSize'] = environ.get('UiMaxFileSize', '5368709120')
os.environ['HomepageURL'] = environ.get('HomepageURL', '"https://mcc.podaac.earthdatacloud.nasa.gov"')
os.environ['TempFileLocation'] = environ.get('TempFileLocation', tempfile.gettempdir())
# Use /tmp for temporary file storage to ensure enough space for 4GB+ files
os.environ['TempFileLocation'] = environ.get('TempFileLocation', '/tmp')
os.environ['Venue'] = environ.get('Venue', 'OPS')
os.environ['CF_STANDARD_NAME_TABLE'] = environ.get('CF_STANDARD_NAME_TABLE', '')

Expand Down
119 changes: 77 additions & 42 deletions mcc/web/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,33 @@
logger = logging.getLogger(__name__)


def hash_file(infile, hasher=None, blocksize=65536):
def hash_file(infile, hasher=None, blocksize=8388608): # 8MB buffer (8 * 1024 * 1024)
"""
Incrementally generate file hashes (suitable for large files).
Incrementally generate file hashes (suitable for very large files).
Uses a large buffer size (8MB) for better performance with multi-GB files.

@param infile a python file-like object
@param hasher a hasher function from hashlib library (e.g. md5, sha256, ...)
@param blocksize an integer of bytes to read into the hash at a time
@param blocksize an integer of bytes to read into the hash at a time (default: 8MB)
@return a hexdigest of the hash based on the hasher
"""
if hasher is None:
hasher = md5()

# Save current file position
current_pos = infile.tell()

# Go to beginning of file
infile.seek(0)

# Read and hash in chunks
buf = infile.read(blocksize)

while len(buf) > 0:
hasher.update(buf)
buf = infile.read(blocksize)

# Reset file buffer back to starting position
infile.seek(0)
# Reset file buffer back to original position
infile.seek(current_pos)

return hasher.hexdigest()

Expand All @@ -67,31 +74,19 @@ def format_byte_size(n_bytes):
def decompress_file(infile, upload_filename):
"""
Opens gzip and bz2 files and returns the uncompressed data.
Uncompresses in pieces andand cuts off when limit is reached in order to
avoid 'zip bombs'.

Solution suggested by Mark Adler:
http://stackoverflow.com/questions/13622706/how-to-protect-myself-from-a-gzip-or-bzip2-bomb

The gist is that we decompress the file in chunks, counting the uncompressed
size of each and adding it to the total. If the filesize is OK, then
decompress the file and return data. This seems a little redundant, but
uncompressing the actual file in chunks and storing the data creates
crashing issues in the Docker container.

TODO: Bz2 doesn't have the same ability as zlib to read in chunks, so a bomb is
just going to cause a memory error. So, given the extra time it takes to
decompress a bz2, I'm just assembling the chunks on the fly instead of
unzipping a second time. Investigate a workaround for this at some point.
Optimized for handling large files (4GB+) efficiently.
Uncompresses in pieces and cuts off when limit is reached to avoid 'zip bombs'.

@param infile file object containing the data to decompress
@param upload_filename name of the file to decompress
@return a temporary file object containing the decompressed data
"""
app.logger.info("Decompressing file %s", upload_filename)

decompressed_file = tempfile.NamedTemporaryFile()
extension = os.path.splitext(upload_filename)[-1]
# Use the configured temporary directory for large file processing
temp_dir = app.config.get('TEMP_FILE_DIR', tempfile.gettempdir())
decompressed_file = tempfile.NamedTemporaryFile(dir=temp_dir)
extension = os.path.splitext(upload_filename)[-1].lower()

# Determine the correct open function based on the type of decompression
if extension == ".gz":
Expand All @@ -101,67 +96,107 @@ def decompress_file(infile, upload_filename):
else:
raise ValueError(f"Unknown file extension ({extension}) for decompression")

# Use a larger buffer size for better performance with large files
buffer_size = 8 * 1024 * 1024 # 8MB buffer

try:
data_length = 0
reader = open_fn(infile)

# Stream decompression with a larger buffer
while data_length < app.config['MAX_CONTENT_LENGTH']:
buf = reader.read(1024)
buf = reader.read(buffer_size)

if len(buf) == 0:
break

data_length += len(buf)
decompressed_file.write(buf)
else:
# If we exit the loop without breaking, the file is too large
decompressed_file.close()
return abort(
400, f"The decompressed file size is too large. "
f"Max decompressed file size is: {format_byte_size(app.config['MAX_CONTENT_LENGTH'])}. "
f"Filename: {upload_filename}"
)

# Close the reader to free resources
reader.close()

except (BadGzipFile, OSError, ValueError, TypeError, IOError, EOFError) as err:
decompressed_file.close()
raise ValueError(
f'Failed to decompress {extension} file {upload_filename}, reason: {str(err)}.'
)
except MemoryError:
# Noticed that some bzip bombs cause memory errors. There doesn't
# seem to be a ton of great ways around this.
# Handle memory errors from bzip bombs
decompressed_file.close()
return abort(
400, f"The decompressed file size is too large. "
400, f"The decompressed file size is too large or caused a memory error. "
f"Max decompressed file size is: {format_byte_size(app.config['MAX_CONTENT_LENGTH'])}. "
f"Filename: {upload_filename}"
)

# Roll file pointer back to beginning of buffer now that decompression is complete
decompressed_file.seek(0)
app.logger.info(f"Successfully decompressed {upload_filename} to size {format_byte_size(data_length)}")

return decompressed_file


def get_dataset_from_file(uploaded_file):
"""
Derives a netcdf4.Dataset object from the provided file upload dictionary.
Optimized for handling large files (4GB+) efficiently.

@param uploaded_file open file handle to the data to convert.
@param uploaded_file open file handle or path to the data to convert.
"""
app.logger.info("Attempting to get dataset from uploaded file %s", uploaded_file)

datafile_name = uploaded_file.filename

# Handle both file paths and file objects
if isinstance(uploaded_file, str):
# It's a file path
datafile_name = os.path.basename(uploaded_file)
is_path = True
else:
# It's a file object
datafile_name = uploaded_file.filename
is_path = False

check_valid_filename(datafile_name)

# uploaded_file is an instance of werkzeug.FileStorage, which only allows us
# to read the file contents but not reference a location on disk.
# Copy the uploaded file to a named temporary file, so we can provide a disk
# location when initializing the netCDF.Dataset object below.

# Since the CF suite checks the Dataset's filename for compliance,
# we need to make sure the original is incorporated into the name of the
# temporary file used to initialize the Dataset object below.
datafile = tempfile.NamedTemporaryFile(suffix=f"_{datafile_name}")
datafile.write(uploaded_file.read())

# For large files, we want to avoid loading the entire file into memory
# Instead, we'll use a temporary file and stream the data in chunks

# Use the configured temporary directory for large file processing
temp_dir = app.config.get('TEMP_FILE_DIR', tempfile.gettempdir())

# Create a named temporary file in the specified directory
datafile = tempfile.NamedTemporaryFile(dir=temp_dir, suffix=f"_{datafile_name}")

# Stream the file in chunks to avoid memory issues
if is_path:
# If it's a path, open the file and copy it
with open(uploaded_file, 'rb') as src_file:
# Use a larger buffer size (8MB) for faster copying of large files
buffer_size = 8 * 1024 * 1024 # 8MB buffer
while True:
buffer = src_file.read(buffer_size)
if not buffer:
break
datafile.write(buffer)
else:
# If it's a file object, stream from it
# Use a larger buffer size (8MB) for faster copying of large files
buffer_size = 8 * 1024 * 1024 # 8MB buffer
while True:
buffer = uploaded_file.read(buffer_size)
if not buffer:
break
datafile.write(buffer)

# Reset file pointer to beginning
datafile.seek(0)

# Calculate hash and file size now, prior to any potential file decompression
Expand Down
27 changes: 21 additions & 6 deletions mcc/web/form_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def get_tests(form_dict, checker_map):
def parse_post_arguments(form_dict, files, checker_map):
"""
Parse a POST request from either an HTML page form or a cURL-like request.
Optimized for handling large files (4GB+) efficiently.
Aborts response if no tests or no files.

@param form_dict ImmutableMultiDict from flask or a regular
Expand All @@ -57,23 +58,37 @@ def parse_post_arguments(form_dict, files, checker_map):
@param files a dict with a flask file-like object
@param checker_map a dict of checker short names to initialized checkers
@return a dict with 'file', 'checkers', 'response' or abort()
# TODO determine if this is an additional place to verify upload size constraint
"""
from flask import current_app
app = current_app

ret = {}

# Get the selected checkers
checkers = get_tests(form_dict, checker_map)

if not checkers:
return abort(
400, "You need to choose at least one metadata convention to test your file against."
)

if 'file-upload' not in files:
# Check for file upload
# Try multiple possible field names for file uploads
uploaded_file = None
for field in ['file-upload', 'file', 'upload', 'fileUpload']:
if field in files and files[field]:
uploaded_file = files[field]
app.logger.info(f"Found file in field: {field}")
break

if not uploaded_file:
return abort(400, "Your request was empty. Please make sure you've specified a file.")
elif not files['file-upload']:
return abort(400, "There was a problem uploading your file. Please try again.")

ret['file'] = files['file-upload']

# Log file information
app.logger.info(f"File received: {uploaded_file.filename}")

# Return the file object directly - we'll handle streaming in get_dataset_from_file
ret['file'] = uploaded_file
ret['checkers'] = checkers
ret['response'] = form_dict.get('response', 'html').lower()

Expand Down
21 changes: 21 additions & 0 deletions mcc/web/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

"""

import os
import time
import logging
from os import environ
from os.path import join

Expand All @@ -21,6 +23,11 @@
from .form_utils import parse_post_arguments
from .json_utils import CustomJSONEncoder

# Configure logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Style options for whitespace in templated HTML code.
Expand All @@ -42,6 +49,20 @@
# Venue that MCC is deployed to (SIT, UAT, or OPS)
app.config['Venue'] = str(environ['Venue'])

# Temporary directory for file processing
app.config['TEMP_FILE_DIR'] = environ.get('TempFileLocation', '/tmp')

# Ensure the temporary directory exists and is writable
if not os.path.exists(app.config['TEMP_FILE_DIR']):
try:
os.makedirs(app.config['TEMP_FILE_DIR'], exist_ok=True)
logger.info(f"Created temporary directory: {app.config['TEMP_FILE_DIR']}")
except Exception as e:
logger.warning(f"Error creating temporary directory {app.config['TEMP_FILE_DIR']}: {str(e)}")

if not os.access(app.config['TEMP_FILE_DIR'], os.W_OK):
logger.warning(f"Temporary directory {app.config['TEMP_FILE_DIR']} is not writable")

# Mapping of checker short names to CheckSuite implementations.
# This could be easily kept up-to-date by inspection of a module, but
# prefer the explicit writing of current checkers.
Expand Down
22 changes: 22 additions & 0 deletions mcc/web/templates/about_api.html
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,31 @@ <h3>POST Examples</h3>
<p>
<em>Note that you need to append an '@' to the beginning of the filename in the CURL request when using the file-upload option.</em>
</p>
<h4>Basic Examples</h4>
<pre>curl -L -F ACDD=on -F ACDD-version=1.3 -F file-upload=@/home/user/granule.nc -F response=json {{ homepage_url }}/check
curl -L -F CF=on -F CF-version=1.7 -F file-upload=@/home/user/granule.nc -F response=html {{ homepage_url }}/check
curl -L -F GDS2=on -F GDS2-parameter=L4 -F file-upload=@/home/user/granule.nc -F response=pdf {{ homepage_url }}/check</pre>

<h4>With Progress Reporting and Extended Timeouts</h4>
<p>
<em>For large files, you may want to show progress and set longer timeouts:</em>
</p>
<pre>curl -L --progress-bar --connect-timeout 300 --max-time 3600 --keepalive-time 3600 \
-F ACDD=on -F ACDD-version=1.3 -F file-upload=@/home/user/granule.nc -F response=json {{ homepage_url }}/check</pre>

<h4>With Detailed Progress</h4>
<pre>curl -L -v --progress-meter --connect-timeout 300 --max-time 3600 --keepalive-time 3600 \
-F CF=on -F CF-version=1.7 -F file-upload=@/home/user/granule.nc -F response=html {{ homepage_url }}/check</pre>

<h4>Options Explained</h4>
<ul>
<li><code>--progress-bar</code>: Display a simple progress bar</li>
<li><code>--progress-meter</code>: Display detailed progress information</li>
<li><code>-v</code>: Verbose output showing request and response headers</li>
<li><code>--connect-timeout 300</code>: Wait up to 300 seconds when connecting</li>
<li><code>--max-time 3600</code>: Allow the whole operation to take up to 3600 seconds (1 hour)</li>
<li><code>--keepalive-time 3600</code>: Keep the connection alive for 3600 seconds</li>
</ul>
</div>
</div>
</div>
Expand Down
Loading