diff --git a/src/utils/file_utils.py b/src/utils/file_utils.py index 764f90694..987e908d9 100644 --- a/src/utils/file_utils.py +++ b/src/utils/file_utils.py @@ -1,19 +1,18 @@ """File handling utilities for OpenRAG""" - import os +import re import tempfile from contextlib import contextmanager from typing import Optional +from urllib.parse import unquote, urlparse @contextmanager def auto_cleanup_tempfile(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Optional[str] = None): """ Context manager for temporary files that automatically cleans up. - Unlike tempfile.NamedTemporaryFile with delete=True, this keeps the file on disk for the duration of the context, making it safe for async operations. - Usage: with auto_cleanup_tempfile(suffix=".pdf") as tmp_path: # Write to the file @@ -22,12 +21,10 @@ def auto_cleanup_tempfile(suffix: Optional[str] = None, prefix: Optional[str] = # Use tmp_path for processing result = await process_file(tmp_path) # File is automatically deleted here - Args: suffix: Optional file suffix/extension (e.g., ".pdf") prefix: Optional file prefix dir: Optional directory for temp file - Yields: str: Path to the temporary file """ @@ -48,7 +45,6 @@ def auto_cleanup_tempfile(suffix: Optional[str] = None, prefix: Optional[str] = def safe_unlink(path: str) -> None: """ Safely delete a file, ignoring errors if it doesn't exist. 
# Characters that are unsafe in filenames across Linux, macOS, and Windows
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')
_MULTI_UNDERSCORE = re.compile(r'_+')

# Device names Windows reserves regardless of case or extension
# (e.g. "nul.txt" is as unusable as "NUL").
_WINDOWS_RESERVED_NAMES = frozenset(
    {"CON", "PRN", "AUX", "NUL"}
    | {f"COM{i}" for i in range(1, 10)}
    | {f"LPT{i}" for i in range(1, 10)}
)

# A trailing ".xxx" longer than this is treated as part of the name rather
# than as an extension, so it is subject to max_length truncation.
_MAX_EXTENSION_LENGTH = 32


def sanitize_filename(url: str, fallback: str = "document", max_length: int = 200) -> str:
    """
    Derive a safe filename from a URL for use during document ingestion.

    Decodes percent-encoding, extracts the last path segment, strips query
    strings and fragments, and removes characters that are unsafe on any
    major OS filesystem. Segments that reduce to nothing but dots and
    underscores (including "." and "..") yield ``fallback``, so the result
    can never be a directory-traversal component.

    Examples::

        sanitize_filename("https://example.com/docs/my%20report.pdf")
        # "my_report.pdf"

        sanitize_filename("https://example.com/blog/post?id=42#section")
        # "post"

        sanitize_filename("https://example.com/")
        # "document"

        sanitize_filename("https://example.com/..")
        # "document"

    Args:
        url: The source URL to derive a filename from.
        fallback: Name to use when no usable path segment can be found.
        max_length: Maximum number of characters in the returned filename
            (excluding extension). Negative values are treated as 0.
            Defaults to 200.

    Returns:
        A cleaned filename string safe for use on Linux, macOS, and Windows.
        ``fallback`` is returned unchanged when the URL is not a string,
        cannot be parsed, or has no usable last path segment.
    """
    if not isinstance(url, str):
        return fallback

    try:
        raw_path = urlparse(url).path
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. an unterminated IPv6 host).
        return fallback

    # Split on "/" BEFORE decoding so an encoded slash (%2F) cannot introduce
    # extra path components; it is neutralised by the unsafe-char pass below.
    path_segment = unquote(raw_path.rstrip("/").rsplit("/", 1)[-1])
    if not path_segment:
        return fallback

    # Replace spaces and unsafe characters with underscores, then collapse runs.
    name = path_segment.replace(" ", "_")
    name = _UNSAFE_FILENAME_CHARS.sub("_", name)
    name = _MULTI_UNDERSCORE.sub("_", name).strip("_")
    # Trailing dots are invalid on Windows, and this also turns "." / ".."
    # into the empty string so they fall through to the fallback.
    name = name.rstrip("._")
    if not name:
        return fallback

    limit = max(max_length, 0)
    stem, dot, ext = name.rpartition(".")
    if dot and not stem:
        # Nothing but an extension (e.g. ".pdf"): no usable name.
        return fallback

    if dot and len(ext) <= _MAX_EXTENSION_LENGTH:
        # Respect max_length on the stem while preserving the extension.
        stem = stem[:limit].rstrip("._")
        if not stem:
            return fallback
        name = f"{stem}.{ext}"
    else:
        # No extension, or the text after the last dot is too long to be one:
        # bound the whole name so the result cannot grow without limit.
        name = name[:limit].rstrip("._")
        if not name:
            return fallback

    # Windows matches reserved device names on the part before the first dot.
    if name.split(".", 1)[0].upper() in _WINDOWS_RESERVED_NAMES:
        name = f"_{name}"

    return name