From ea2c0eeacd59bd825dbb662b55ac308cb465c11e Mon Sep 17 00:00:00 2001 From: Jan Stroetmann Date: Thu, 24 Apr 2025 13:47:11 +0900 Subject: [PATCH 1/3] fix url normalization --- modules/processors/url_processor.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/modules/processors/url_processor.py b/modules/processors/url_processor.py index ce686bf..64a93ed 100644 --- a/modules/processors/url_processor.py +++ b/modules/processors/url_processor.py @@ -53,7 +53,8 @@ def is_valid_url(url: str, base_url: str) -> bool: def normalize_url(url: str) -> str: """ - Normalize a URL by removing trailing slashes and standardizing the scheme. + Normalize a URL by removing trailing slashes and standardizing the scheme + while preserving case in the path component. Args: url (str): The URL to normalize. @@ -61,10 +62,25 @@ def normalize_url(url: str) -> str: Returns: str: The normalized URL. """ - parsed = urlparse(url.lower()) - scheme = parsed.scheme or 'https' # Default to https if no scheme is provided - path = parsed.path.rstrip('/') # Remove trailing slash from path - return f"{scheme}://{parsed.netloc}{path}" + parsed = urlparse(url) + # Normalize scheme (case-insensitive) + scheme = parsed.scheme.lower() or 'https' # Default to https if no scheme is provided + # Normalize netloc (domain is case-insensitive) + netloc = parsed.netloc.lower() + # Preserve case in path but remove trailing slash + path = parsed.path.rstrip('/') + # Preserve query and fragment + query = parsed.query + fragment = parsed.fragment + + # Reconstruct the URL + normalized_url = f"{scheme}://{netloc}{path}" + if query: + normalized_url += f"?{query}" + if fragment: + normalized_url += f"#{fragment}" + + return normalized_url def url_matches_base(url: str, base_url: str) -> bool: """ From 21ab35d605f745d98ce4838648d84d22ca33ca75 Mon Sep 17 00:00:00 2001 From: Jan Stroetmann Date: Thu, 24 Apr 2025 14:08:12 +0900 Subject: [PATCH 2/3] Make discovered urls optional in output --- main.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 774ecca..1e033ca 100644 --- a/main.py +++ b/main.py @@ -26,11 +26,13 @@ normalize_url, ) + async def run_scraping( base_url: str, discovery_mode: bool, force_scrape_method: str, output_format: str, + include_urls: bool = False, ) -> Tuple[Dict[str, Any], int]: """ Run the web scraping process. @@ -40,6 +42,7 @@ async def run_scraping( discovery_mode (bool): Whether to scrape the entire site or just the base URL. force_scrape_method (str): Method to force for scraping ('req' or 'sel'). output_format (str): The desired output format ('csv' or 'json'). + include_urls (bool): Whether to include discovered URLs in the output. 
Returns: Tuple[Dict[str, Any], int]: A tuple containing the formatted output @@ -59,7 +62,7 @@ async def run_scraping( results = await run_scrapers(base_url, discovery_mode, force_scrape_method) - formatted_output = format_output(results, output_format) + formatted_output = format_output(results, output_format, include_urls) total_urls_scraped = len(results) if output_format == 'json': @@ -106,6 +109,11 @@ def main() -> None: choices=['req', 'sel'], help="Force scraping with either requests or selenium" ) + parser.add_argument( + "--include-urls", + action="store_true", + help="Include discovered URLs in the output (useful for debugging, but not recommended for LLM context)" + ) args = parser.parse_args() base_url = normalize_url(args.url) @@ -118,6 +126,7 @@ def main() -> None: "log_level": args.log, "output_format": args.format, "save_directory": args.savename or get_domain(base_url), + "include_urls": args.include_urls, } # Set up logging @@ -137,7 +146,7 @@ def main() -> None: logging.info("Starting web scraping process...") formatted_output, total_urls_scraped = asyncio.run( - run_scraping(base_url, args.discovery, args.force, args.format) + run_scraping(base_url, args.discovery, args.force, args.format, args.include_urls) ) filename = set_filename(args.format, now) From d8c61bc8ad99ec9efd8e61d85604035aabc9973c Mon Sep 17 00:00:00 2001 From: Jan Stroetmann Date: Thu, 24 Apr 2025 14:24:01 +0900 Subject: [PATCH 3/3] fix formatting, exclude unnecessary metadata --- main.py | 26 ++++++++++++++++++++-- modules/utils/utils.py | 50 ++++++++++++++++++++++++++++++++---------- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/main.py b/main.py index 1e033ca..9387882 100644 --- a/main.py +++ b/main.py @@ -33,6 +33,7 @@ async def run_scraping( force_scrape_method: str, output_format: str, include_urls: bool = False, + essential_metadata_only: bool = True, ) -> Tuple[Dict[str, Any], int]: """ Run the web scraping process. @@ -43,6 +44,7 @@ async def run_scraping( force_scrape_method (str): Method to force for scraping ('req' or 'sel'). output_format (str): The desired output format ('csv' or 'json'). include_urls (bool): Whether to include discovered URLs in the output. + essential_metadata_only (bool): Whether to include only essential metadata fields. 
Returns: Tuple[Dict[str, Any], int]: A tuple containing the formatted output @@ -62,7 +64,14 @@ async def run_scraping( results = await run_scrapers(base_url, discovery_mode, force_scrape_method) - formatted_output = format_output(results, output_format, include_urls) + # Pass both include_urls and essential_metadata_only parameters + formatted_output = format_output( + results, + output_format, + include_urls=include_urls, + essential_metadata_only=essential_metadata_only + ) + total_urls_scraped = len(results) if output_format == 'json': @@ -114,6 +123,11 @@ def main() -> None: action="store_true", help="Include discovered URLs in the output (useful for debugging, but not recommended for LLM context)" ) + parser.add_argument( + "--full-metadata", + action="store_true", + help="Include all metadata fields (by default, only essential fields like url, title, and content_type are included)" + ) args = parser.parse_args() base_url = normalize_url(args.url) @@ -127,6 +141,7 @@ def main() -> None: "output_format": args.format, "save_directory": args.savename or get_domain(base_url), "include_urls": args.include_urls, + "essential_metadata_only": not args.full_metadata, } # Set up logging @@ -146,7 +161,14 @@ def main() -> None: logging.info("Starting web scraping process...") formatted_output, total_urls_scraped = asyncio.run( - run_scraping(base_url, args.discovery, args.force, args.format, args.include_urls) + run_scraping( + base_url, + args.discovery, + args.force, + args.format, + include_urls=args.include_urls, + essential_metadata_only=not args.full_metadata + ) ) filename = set_filename(args.format, now) diff --git a/modules/utils/utils.py b/modules/utils/utils.py index 56d42fb..df95b7f 100644 --- a/modules/utils/utils.py +++ b/modules/utils/utils.py @@ -83,7 +83,7 @@ def is_image_content_type(url): logging.error(f"Error checking content type for {url}") return False -def format_output(results, output_format): +def format_output(results, output_format, include_urls=False, essential_metadata_only=True): """ Format the scraped results according to the specified output format. @@ -91,7 +91,8 @@ def format_output(results, output_format): results (dict): Dictionary of scraped results with URLs as keys and dictionaries containing 'content', 'discovered_urls', and 'metadata' as values output_format (str): Desired output format ('csv' or 'json') - sitemap_urls (set): Set of URLs from the sitemap + include_urls (bool, optional): Whether to include discovered URLs in the output. Defaults to False. + essential_metadata_only (bool, optional): Whether to include only essential metadata. Defaults to True. Returns: list or dict: Formatted data ready for output. 
For CSV, a list of lists where the first row @@ -101,19 +102,46 @@ def format_output(results, output_format): ValueError: If an invalid output format is specified """ sorted_results = dict(sorted(results.items())) + + # Filter metadata if requested + if essential_metadata_only: + for url, data in sorted_results.items(): + if 'metadata' in data: + # Keep only essential metadata fields + essential_fields = ['url', 'title', 'content_type'] + data['metadata'] = {k: v for k, v in data['metadata'].items() if k in essential_fields} if output_format == 'csv': - csv_data = [['URL', 'Content', 'Discovered URLs', 'Metadata']] - for url, data in sorted_results.items(): - metadata_str = json.dumps(data.get('metadata', {})) - csv_data.append([ - url, - data['content'], - ', '.join(data['discovered_urls']), - metadata_str - ]) + if include_urls: + csv_data = [['URL', 'Content', 'Discovered URLs', 'Metadata']] + for url, data in sorted_results.items(): + metadata_str = json.dumps(data.get('metadata', {})) + csv_data.append([ + url, + data['content'], + ', '.join(data.get('discovered_urls', [])), + metadata_str + ]) + else: + csv_data = [['URL', 'Content', 'Metadata']] + for url, data in sorted_results.items(): + metadata_str = json.dumps(data.get('metadata', {})) + csv_data.append([ + url, + data['content'], + metadata_str + ]) return csv_data elif output_format == 'json': + if not include_urls: + # Create a copy without the discovered_urls for each entry + clean_results = {} + for url, data in sorted_results.items(): + clean_results[url] = { + 'metadata': data.get('metadata', {}), + 'content': data['content'] + } + return clean_results return sorted_results else: raise ValueError(f"Invalid output format: {output_format}")
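
Note (outside the patches, for review context): a minimal sketch of what the patched normalize_url() from PATCH 1/3 is expected to return — scheme and host lowercased, path case preserved, trailing slash stripped, query and fragment kept. The import path below assumes the module layout shown in the diff (modules/processors/url_processor.py); adjust it to your checkout.

    # Sketch only: exercises the patched normalize_url() from PATCH 1/3.
    # Assumes the repo root is on sys.path so `modules` imports as a package.
    from modules.processors.url_processor import normalize_url

    examples = [
        "HTTPS://Example.COM/Docs/Page/",        # mixed-case scheme, host, and path
        "https://example.com/search?Q=Foo#Top",  # query and fragment now survive
    ]
    for url in examples:
        print(normalize_url(url))

    # Expected output with PATCH 1/3 applied:
    #   https://example.com/Docs/Page
    #   https://example.com/search?Q=Foo#Top
    # The pre-patch version lowercased the entire URL and dropped ?query/#fragment.

With PATCH 2/3 and 3/3 applied, discovered URLs are omitted from the output unless --include-urls is passed, and metadata is trimmed to url, title, and content_type unless --full-metadata is passed.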