diff --git a/pyproject.toml b/pyproject.toml index bdfecd610..46616464e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -729,30 +729,8 @@ verify = [ # Web Module - Web utilities # Use: pip install scitex[web] -web = [ - "aiohttp", - "beautifulsoup4", - "readability-lxml", - "requests", - "Pillow", - "matplotlib", - "tqdm", - "joblib", - "scikit-learn", - "pytest-asyncio", - "ruamel.yaml", - "xarray", - "seaborn", - "scipy", - "markdown2", - "anthropic", - "openai", - "google-genai", - "groq", - # # Heavy dependencies handled by _AVAILABLE flags - # "torch", - # "umap-learn", -] +# Real implementation lives in the standalone scitex-web package. +web = ["scitex-web[readability]>=0.1.0"] # Clew Module - Hash-based verification for reproducible science (Ariadne's thread) # Use: pip install scitex[clew] diff --git a/src/scitex/web/__init__.py b/src/scitex/web/__init__.py index aa46bcc0e..01eb10cba 100755 --- a/src/scitex/web/__init__.py +++ b/src/scitex/web/__init__.py @@ -1,35 +1,20 @@ -#!/usr/bin/env python3 -"""Web-related utilities module for scitex.""" +"""SciTeX web — thin compatibility shim for scitex-web. -from ._scraping import get_image_urls, get_urls -from ._search_pubmed import ( - _fetch_details, - _get_citation, - _parse_abstract_xml, - _search_pubmed, -) -from ._search_pubmed import batch__fetch_details as _batch__fetch_details -from ._search_pubmed import fetch_async as _fetch_async -from ._search_pubmed import format_bibtex as _format_bibtex -from ._search_pubmed import get_crossref_metrics -from ._search_pubmed import parse_args as _parse_args -from ._search_pubmed import run_main as _run_main -from ._search_pubmed import save_bibtex as _save_bibtex -from ._search_pubmed import search_pubmed -from ._summarize_url import crawl_to_json, crawl_url -from ._summarize_url import extract_main_content as _extract_main_content -from ._summarize_url import summarize_all as _summarize_all -from ._summarize_url import summarize_url -from .download_images import download_images +Aliases ``scitex.web`` to the standalone ``scitex_web`` package via ``sys.modules``. +``scitex.web is scitex_web``. -__all__ = [ - # Public API - "search_pubmed", - "get_crossref_metrics", - "summarize_url", - "crawl_url", - "crawl_to_json", - "get_urls", - "download_images", - "get_image_urls", -] +Install: ``pip install scitex[web]`` (or ``pip install scitex-web``). +See: https://github.com/ywatanabe1989/scitex-web +""" + +import sys as _sys + +try: + import scitex_web as _real +except ImportError as _e: # pragma: no cover + raise ImportError( + "scitex.web requires the 'scitex-web' package. " + "Install with: pip install scitex[web] (or: pip install scitex-web)" + ) from _e + +_sys.modules[__name__] = _real diff --git a/src/scitex/web/_scraping.py b/src/scitex/web/_scraping.py deleted file mode 100755 index d97dc668a..000000000 --- a/src/scitex/web/_scraping.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python3 -# File: ./src/scitex/web/_scraping.py - -"""Web scraping utilities for extracting URLs. - -``bs4`` is an optional third-party dependency (only needed when actually -scraping). Do **not** import it at module load -- doing so leaks the -``ModuleNotFoundError`` through ``scitex.web.__init__`` and through -``scitex.cli.web``, which in turn breaks ``scitex --json`` and -``scitex --help-recursive`` on any install without ``beautifulsoup4``. -See ywatanabe1989/todo#279. The import now lives inside each scraping -function, so merely importing this module is side-effect-free. 
-""" - -import re -import urllib.parse -from typing import List, Optional, Set - -import requests - -from scitex.logging import getLogger - -logger = getLogger(__name__) - -DEFAULT_TIMEOUT = 10 -DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" - - -def get_urls( - url: str, - pattern: Optional[str] = None, - absolute: bool = True, - same_domain: bool = False, - include_external: bool = True, -) -> List[str]: - """ - Extract all URLs from a webpage. - - Args: - url: The URL of the webpage to scrape - pattern: Optional regex pattern to filter URLs (e.g., r'\\.pdf$' for PDF files) - absolute: If True, convert relative URLs to absolute URLs - same_domain: If True, only return URLs from the same domain - include_external: If True, include external links (only applies if same_domain=False) - - Returns: - List of URLs found on the page - - Example: - >>> urls = get_urls('https://example.com', pattern=r'\\.pdf$') - >>> urls = get_urls('https://example.com', same_domain=True) - """ - from bs4 import BeautifulSoup # lazy: see module docstring, todo#279 - - try: - logger.info(f"Fetching URLs from: {url}") - response = requests.get( - url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - except requests.RequestException as e: - logger.error(f"Failed to fetch URL {url}: {e}") - return [] - - soup = BeautifulSoup(response.text, "html.parser") - urls_found: Set[str] = set() - - parsed_base = urllib.parse.urlparse(url) - - for link in soup.find_all("a", href=True): - href = link["href"] - - if absolute: - href = urllib.parse.urljoin(url, href) - - if same_domain: - parsed_href = urllib.parse.urlparse(href) - if parsed_href.netloc != parsed_base.netloc: - continue - elif not include_external: - parsed_href = urllib.parse.urlparse(href) - if parsed_href.netloc and parsed_href.netloc != parsed_base.netloc: - continue - - if pattern and not re.search(pattern, href): - continue - - urls_found.add(href) - - result = sorted(list(urls_found)) - logger.info(f"Found {len(result)} URLs") - return result - - -def get_image_urls( - url: str, - pattern: Optional[str] = None, - same_domain: bool = False, -) -> List[str]: - """ - Extract all image URLs from a webpage without downloading them. 
- - Args: - url: The URL of the webpage to scrape - pattern: Optional regex pattern to filter image URLs - same_domain: If True, only return images from the same domain - - Returns: - List of image URLs found on the page - - Note: - - SVG files are automatically skipped (vector graphics) - - Checks both 'src' and 'data-src' attributes for lazy-loaded images - - Example: - >>> img_urls = get_image_urls('https://example.com') - >>> img_urls = get_image_urls('https://example.com', pattern=r'\\.png$') - """ - from bs4 import BeautifulSoup # lazy: see module docstring, todo#279 - - try: - logger.info(f"Fetching image URLs from: {url}") - response = requests.get( - url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - except requests.RequestException as e: - logger.error(f"Failed to fetch URL {url}: {e}") - return [] - - soup = BeautifulSoup(response.text, "html.parser") - image_urls: Set[str] = set() - - parsed_base = urllib.parse.urlparse(url) - - for img in soup.find_all("img"): - img_url = img.get("src") or img.get("data-src") - if not img_url: - continue - - img_url = urllib.parse.urljoin(url, img_url) - - if img_url.lower().endswith((".svg", ".svgz")): - continue - - if same_domain: - parsed_img = urllib.parse.urlparse(img_url) - if parsed_img.netloc != parsed_base.netloc: - continue - - if pattern and not re.search(pattern, img_url): - continue - - image_urls.add(img_url) - - result = sorted(list(image_urls)) - logger.info(f"Found {len(result)} image URLs") - return result diff --git a/src/scitex/web/_search_pubmed.py b/src/scitex/web/_search_pubmed.py deleted file mode 100755 index f41aa1fbd..000000000 --- a/src/scitex/web/_search_pubmed.py +++ /dev/null @@ -1,505 +0,0 @@ -#!/usr/bin/env python3 -# Time-stamp: "2024-11-13 14:30:43 (ywatanabe)" -# File: ./scitex_repo/src/scitex/web/_search_pubmed.py - -""" -1. Functionality: - - Searches PubMed database for scientific articles - - Retrieves detailed information about matched articles - - Displays article metadata including title, authors, journal, year, and abstract -2. Input: - - Search query string (e.g., "epilepsy prediction") - - Optional parameters for batch size and result limit -3. Output: - - Formatted article information displayed to stdout - - BibTeX file with official citations -4. Prerequisites: - - Internet connection - - requests package - - scitex package -""" - -"""Imports""" -import argparse -import asyncio -import xml.etree.ElementTree as ET -from typing import Any, Dict, List, Optional, Union - -import aiohttp -import requests - -import scitex - -"""Functions & Classes""" - - -def _search_pubmed(query: str, retmax: int = 300) -> Dict[str, Any]: - try: - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - search_url = f"{base_url}esearch.fcgi" - params = { - "db": "pubmed", - "term": query, - "retmax": retmax, - "retmode": "json", - "usehistory": "y", - } - - response = requests.get(search_url, params=params, timeout=10) - if not response.ok: - scitex.str.printc("PubMed API request failed", c="red") - return {} - return response.json() - except requests.exceptions.RequestException as e: - scitex.str.printc(f"Network error: {e}", c="red") - return {} - - -def _fetch_details( - webenv: str, query_key: str, retstart: int = 0, retmax: int = 100 -) -> Dict[str, Any]: - """Fetches detailed information including abstracts for articles. 
- - Parameters - ---------- - [Previous parameters remain the same] - - Returns - ------- - Dict[str, Any] - Dictionary containing article details and abstracts - """ - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - - # Fetch abstracts - efetch_url = f"{base_url}efetch.fcgi" - efetch_params = { - "db": "pubmed", - "query_key": query_key, - "WebEnv": webenv, - "retstart": retstart, - "retmax": retmax, - "retmode": "xml", - "rettype": "abstract", - "field": "abstract,mesh", - } - - abstract_response = requests.get(efetch_url, params=efetch_params) - - # Fetch metadata - fetch_url = f"{base_url}esummary.fcgi" - params = { - "db": "pubmed", - "query_key": query_key, - "WebEnv": webenv, - "retstart": retstart, - "retmax": retmax, - "retmode": "json", - } - - details_response = requests.get(fetch_url, params=params) - - if not all([abstract_response.ok, details_response.ok]): - # print(f"Error fetching data") - return {} - - return { - "abstracts": abstract_response.text, - "details": details_response.json(), - } - - -def _parse_abstract_xml(xml_text: str) -> Dict[str, tuple]: - """Parses XML response to extract abstracts. - - Parameters - ---------- - xml_text : str - XML response from PubMed - - Returns - ------- - Dict[str, str] - Dictionary mapping PMIDs to abstracts - """ - root = ET.fromstring(xml_text) - results = {} - - for article in root.findall(".//PubmedArticle"): - pmid = article.find(".//PMID").text - abstract_element = article.find(".//Abstract/AbstractText") - abstract = abstract_element.text if abstract_element is not None else "" - - # DOI - doi_element = article.find(".//ArticleId[@IdType='doi']") - doi = doi_element.text if doi_element is not None else "" - - # Get MeSH terms - keywords = [] - mesh_terms = article.findall(".//MeshHeading/DescriptorName") - keywords = [term.text for term in mesh_terms if term is not None] - - results[pmid] = (abstract, keywords, doi) - - return results - - -def _get_citation(pmid: str) -> str: - """Gets official citation in BibTeX format. 
- - Parameters - ---------- - pmid : str - PubMed ID - - Returns - ------- - str - Official BibTeX citation - """ - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - cite_url = f"{base_url}efetch.fcgi" - params = { - "db": "pubmed", - "id": pmid, - "rettype": "bibtex", - "retmode": "text", - } - response = requests.get(cite_url, params=params) - return response.text if response.ok else "" - - -def get_crossref_metrics( - doi: str, api_key: Optional[str] = None, email: Optional[str] = None -) -> Dict[str, Any]: - """Get article metrics from CrossRef using DOI.""" - import os - - base_url = "https://api.crossref.org/works/" - - # Use provided email or fallback to environment variables - if not email: - email = ( - os.getenv("SCITEX_SCHOLAR_CROSSREF_EMAIL") - or os.getenv("SCITEX_CROSSREF_EMAIL") - or os.getenv("SCITEX_SCHOLAR_PUBMED_EMAIL") - or os.getenv("SCITEX_PUBMED_EMAIL", "research@example.com") - ) - headers = {"User-Agent": f"SciTeX/1.0 (mailto:{email})"} - - # Add API key as query parameter if provided - params = {} - if api_key: - params["key"] = api_key - - try: - response = requests.get( - f"{base_url}{doi}", headers=headers, params=params, timeout=10 - ) - if response.ok: - data = response.json()["message"] - return { - "citations": data.get("is-referenced-by-count", 0), - "type": data.get("type", ""), - "publisher": data.get("publisher", ""), - "references": len(data.get("reference", [])), - "doi": data.get("DOI", ""), - } - except Exception as e: - print(f"CrossRef API error for DOI {doi}: {e}") - return {} - - -async def get_crossref_metrics_async( - doi: str, api_key: Optional[str] = None, email: Optional[str] = None -) -> Dict[str, Any]: - """Get article metrics from CrossRef using DOI (async version).""" - import os - - base_url = "https://api.crossref.org/works/" - - # Use provided email or fallback to environment variables - if not email: - email = ( - os.getenv("SCITEX_SCHOLAR_CROSSREF_EMAIL") - or os.getenv("SCITEX_CROSSREF_EMAIL") - or os.getenv("SCITEX_SCHOLAR_PUBMED_EMAIL") - or os.getenv("SCITEX_PUBMED_EMAIL", "research@example.com") - ) - headers = {"User-Agent": f"SciTeX/1.0 (mailto:{email})"} - - # Add API key as query parameter if provided - params = {} - if api_key: - params["key"] = api_key - - try: - async with aiohttp.ClientSession() as session: - async with session.get( - f"{base_url}{doi}", headers=headers, params=params, timeout=10 - ) as response: - if response.ok: - data = await response.json() - message = data["message"] - return { - "citations": message.get("is-referenced-by-count", 0), - "type": message.get("type", ""), - "publisher": message.get("publisher", ""), - "references": len(message.get("reference", [])), - "doi": message.get("DOI", ""), - } - except Exception as e: - print(f"CrossRef API error for DOI {doi}: {e}") - return {} - - -def save_bibtex( - papers: Dict[str, Any], abstracts: Dict[str, str], output_file: str -) -> None: - """Saves paper metadata as BibTeX file with abstracts. 
- - Parameters - ---------- - papers : Dict[str, Any] - Dictionary of paper metadata - abstracts : Dict[str, str] - Dictionary of PMIDs to abstracts - output_file : str - Output file path - """ - with open(output_file, "w", encoding="utf-8") as bibtex_file: - for pmid, paper in papers.items(): - if pmid == "uids": - continue - - citation = _get_citation(pmid) - if citation: - bibtex_file.write(citation) - else: - # Use default tuple if pmid not in abstracts - default_data = ("", [], "") # abstract, keywords, doi - bibtex_entry = format_bibtex( - paper, pmid, abstracts.get(pmid, default_data) - ) - bibtex_file.write(bibtex_entry + "\n") - scitex.str.printc(f"Saved to: {str(bibtex_file)}", c="yellow") - - -def format_bibtex(paper: Dict[str, Any], pmid: str, abstract_data: tuple) -> str: - abstract, keywords, doi = abstract_data - - # Get CrossRef and Scimago metrics - crossref_metrics = get_crossref_metrics(doi) if doi else {} - journal = paper.get("source", "Unknown Journal") - # journal_metrics = get_journal_metrics(journal) - - authors = paper.get("authors", [{"name": "Unknown"}]) - author_names = " and ".join(author["name"] for author in authors) - pubdate = paper.get("pubdate", "") - year = pubdate.split()[0] if pubdate.strip() else "" - title = paper.get("title", "No Title") - - # Name formatting - first_author = authors[0]["name"] - first_name = first_author.split()[0] - last_name = first_author.split()[-1] - clean_first_name = "".join(c for c in first_name if c.isalnum()) - clean_last_name = "".join(c for c in last_name if c.isalnum()) - - # Title words - title_words = title.split() - first_title_word = "".join(c.lower() for c in title_words[0] if c.isalnum()) - second_title_word = ( - "".join(c.lower() for c in title_words[1] if c.isalnum()) - if len(title_words) > 1 - else "" - ) - - citation_key = f"{clean_first_name}.{clean_last_name}_{year}_{first_title_word}_{second_title_word}" - - entry = f"""@article{{{citation_key}, - author = {{{author_names}}}, - title = {{{title}}}, - journal = {{{journal}}}, - year = {{{year}}}, - pmid = {{{pmid}}}, - doi = {{{doi}}}, - publisher = {{{crossref_metrics.get("publisher", "")}}}, - references = {{{crossref_metrics.get("references", 0)}}}, - keywords = {{{", ".join(keywords)}}}, - abstract = {{{abstract}}} -}} -""" - return entry - - -async def fetch_async( - session: aiohttp.ClientSession, url: str, params: Dict -) -> Union[Dict, str]: - """Asynchronous fetch helper.""" - async with session.get(url, params=params) as response: - if response.status == 200: - if params.get("retmode") == "xml": - return await response.text() - elif params.get("retmode") == "json": - return await response.json() - return await response.text() - return {} - - -async def batch__fetch_details(pmids: List[str], batch_size: int = 20) -> List[Dict]: - """Fetches details for multiple PMIDs concurrently. 
- - Parameters - ---------- - pmids : List[str] - List of PubMed IDs - batch_size : int, optional - Size of each batch for concurrent requests - - Returns - ------- - List[Dict] - List of response data - """ - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - - async with aiohttp.ClientSession() as session: - tasks = [] - for i in range(0, len(pmids), batch_size): - batch_pmids = pmids[i : i + batch_size] - - # Fetch both details and citations concurrently - efetch_params = { - "db": "pubmed", - "id": ",".join(batch_pmids), - "retmode": "xml", - "rettype": "abstract", - } - - esummary_params = { - "db": "pubmed", - "id": ",".join(batch_pmids), - "retmode": "json", - } - - tasks.append(fetch_async(session, f"{base_url}efetch.fcgi", efetch_params)) - tasks.append( - fetch_async(session, f"{base_url}esummary.fcgi", esummary_params) - ) - - results = await asyncio.gather(*tasks) - return results - - -def search_pubmed(query: str, n_entries: int = 10) -> int: - # query = args.query or "epilepsy prediction" - # print(f"Using query: {query}") - - search_results = _search_pubmed(query) - if not search_results: - # print("No results found or error occurred") - return 1 - - pmids = search_results["esearchresult"]["idlist"] - count = len(pmids) - # print(f"Found {count:,} results") - - output_file = f"pubmed_{query.replace(' ', '_')}.bib" - # print(f"Saving results to: {output_file}") - - # Process in larger batches asynchronously - results = asyncio.run(batch__fetch_details(pmids[:n_entries])) - # here, results seems long string - - # Process results and save - with open(output_file, "w", encoding="utf-8") as f: - for i in range(0, len(results), 2): - xml_response = results[i] - json_response = results[i + 1] - - if isinstance(xml_response, str): - abstracts = _parse_abstract_xml(xml_response) - if isinstance(json_response, dict) and "result" in json_response: - details = json_response["result"] - save_bibtex(details, abstracts, output_file) - - # Process results and save - temp_bibtex = [] - for i in range(0, len(results), 2): - xml_response = results[i] - json_response = results[i + 1] - - if isinstance(xml_response, str): - abstracts = _parse_abstract_xml(xml_response) - if isinstance(json_response, dict) and "result" in json_response: - details = json_response["result"] - for pmid in details: - if pmid != "uids": - citation = _get_citation(pmid) - if citation: - temp_bibtex.append(citation) - else: - entry = format_bibtex( - details[pmid], pmid, abstracts.get(pmid, "") - ) - temp_bibtex.append(entry) - - # Write all entries at once - with open(output_file, "w", encoding="utf-8") as f: - f.write("\n".join(temp_bibtex)) - - return 0 - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="PubMed article search and retrieval tool" - ) - parser.add_argument( - "--query", - "-q", - type=str, - help='Search query (default: "epilepsy prediction")', - ) - parser.add_argument( - "--n_entries", - "-n", - type=int, - default=10, - help='Search query (default: "epilepsy prediction")', - ) - args = parser.parse_args() - scitex.str.printc(args, c="yellow") - return args - - -def run_main() -> None: - global CONFIG - import sys - - import matplotlib.pyplot as plt - - import scitex - - CONFIG, sys.stdout, sys.stderr, plt, CC = scitex.session.start( - sys, - verbose=False, - ) - - args = parse_args() - exit_status = search_pubmed(args.query, args.n_entries) - - scitex.session.close( - CONFIG, - verbose=False, - notify=False, - message="", - 
exit_status=exit_status, - ) - - -if __name__ == "__main__": - run_main() - -# EOF diff --git a/src/scitex/web/_skills/SKILL.md b/src/scitex/web/_skills/SKILL.md deleted file mode 100644 index 7152595b7..000000000 --- a/src/scitex/web/_skills/SKILL.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -name: stx.web -description: Web utilities for PubMed search, URL scraping, content summarization, and image downloading. ---- - -# stx.web - -The `stx.web` module provides web utilities for scientific use cases: searching PubMed for papers, scraping URLs for content and images, summarizing web pages, and downloading images in bulk. - -## Python API - -```python -import scitex as stx - -# Search PubMed -papers = stx.web.search_pubmed("EEG deep learning classification", max_results=20) -metrics = stx.web.get_crossref_metrics(doi="10.1000/xyz123") - -# Summarize a URL -summary = stx.web.summarize_url("https://arxiv.org/abs/2401.00000") - -# Crawl URL for structured content -content = stx.web.crawl_url("https://example.com") -json_data = stx.web.crawl_to_json("https://example.com") - -# Scrape URLs and images from a page -urls = stx.web.get_urls("https://example.com") -image_urls = stx.web.get_image_urls("https://example.com") - -# Download images -stx.web.download_images( - urls=image_urls, - output_dir="./downloaded_images", - max_workers=5 -) -``` - -## Key Features - -- `search_pubmed(query, max_results)` — search PubMed and return structured paper data -- `get_crossref_metrics(doi)` — fetch citation counts and impact metrics from CrossRef -- `summarize_url(url)` — extract and summarize main content from a URL -- `crawl_url` / `crawl_to_json` — structured web crawling -- `get_urls` / `get_image_urls` — scrape links and images from pages -- `download_images(urls, output_dir)` — bulk image download with concurrency diff --git a/src/scitex/web/_skills/images.md b/src/scitex/web/_skills/images.md deleted file mode 100644 index ad994fb47..000000000 --- a/src/scitex/web/_skills/images.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -description: Bulk-download images from a web page with download_images() and collect all image URLs with get_image_urls(). ---- - -# Image Downloading - -## download_images - -Download all images found on a web page to a local directory. - -```python -download_images(url: str, output_dir: str = ".", extensions: list[str] | None = None) -> list[str] -``` - -Returns a list of local file paths for successfully downloaded images. - -```python -import scitex as stx - -saved = stx.web.download_images( - "https://example.com/gallery", - output_dir="./downloaded_images", - extensions=[".png", ".jpg"], -) -print(f"Downloaded {len(saved)} images") -``` - ---- - -## get_image_urls - -Collect all image URLs from a web page without downloading them. - -```python -get_image_urls(url: str) -> list[str] -``` - -```python -import scitex as stx - -img_urls = stx.web.get_image_urls("https://example.com/gallery") -print(img_urls[:3]) -``` diff --git a/src/scitex/web/_skills/pubmed.md b/src/scitex/web/_skills/pubmed.md deleted file mode 100644 index e61a1e8b5..000000000 --- a/src/scitex/web/_skills/pubmed.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -description: Search PubMed for papers matching a query with search_pubmed() and retrieve Crossref citation counts with get_crossref_metrics(). ---- - -# PubMed Search - -## search_pubmed - -Query PubMed and return structured results including abstracts, authors, and DOIs. 
- -```python -search_pubmed( - query: str, - max_results: int = 20, - email: str | None = None, -) -> list[dict] -``` - -```python -import scitex as stx - -papers = stx.web.search_pubmed("EEG epilepsy deep learning", max_results=10) -for p in papers: - print(p["title"], p.get("doi")) -``` - -Each result dict contains: `pmid`, `title`, `abstract`, `authors`, `journal`, `year`, `doi`. - ---- - -## get_crossref_metrics - -Retrieve citation count and journal impact factor for a DOI via the Crossref API. - -```python -get_crossref_metrics(doi: str) -> dict -``` - -```python -import scitex as stx - -metrics = stx.web.get_crossref_metrics("10.1038/s41586-021-03819-2") -print(metrics) -# {'cited_by': 523, 'journal': 'Nature', 'type': 'journal-article'} -``` diff --git a/src/scitex/web/_skills/url.md b/src/scitex/web/_skills/url.md deleted file mode 100644 index a8ddf881a..000000000 --- a/src/scitex/web/_skills/url.md +++ /dev/null @@ -1,71 +0,0 @@ ---- -description: Extract and summarize web page content with summarize_url(), crawl pages with crawl_url() and crawl_to_json(), and collect all hyperlinks with get_urls(). ---- - -# URL Utilities - -## summarize_url - -Fetch a URL and return a concise text summary of the main content. - -```python -summarize_url(url: str, max_length: int = 500) -> str -``` - -```python -import scitex as stx - -summary = stx.web.summarize_url("https://arxiv.org/abs/2301.12345") -print(summary) -``` - ---- - -## crawl_url - -Fetch the full main text content of a page. - -```python -crawl_url(url: str) -> str -``` - -```python -import scitex as stx - -content = stx.web.crawl_url("https://example.com/article") -print(content[:500]) -``` - ---- - -## crawl_to_json - -Fetch a page and return structured content as a dict. - -```python -crawl_to_json(url: str) -> dict -``` - -```python -import scitex as stx - -data = stx.web.crawl_to_json("https://example.com/article") -# Returns: {'title': ..., 'content': ..., 'links': [...], 'url': ...} -``` - ---- - -## get_urls - -Extract all hyperlinks from a web page. 
- -```python -get_urls(url: str) -> list[str] -``` - -```python -import scitex as stx - -links = stx.web.get_urls("https://example.com") -print(links[:5]) -``` diff --git a/src/scitex/web/_summarize_url.py b/src/scitex/web/_summarize_url.py deleted file mode 100755 index 5f191d95d..000000000 --- a/src/scitex/web/_summarize_url.py +++ /dev/null @@ -1,160 +0,0 @@ -#!./env/bin/python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-07-29 21:43:30 (ywatanabe)" -# ./src/scitex/web/_crawl.py - - -import json -import urllib.parse -from concurrent.futures import ThreadPoolExecutor, as_completed -from pprint import pprint - -import requests -from bs4 import BeautifulSoup -from tqdm import tqdm - -import scitex - -try: - from readability import Document -except ImportError: - try: - from readability.readability import Document - except ImportError: - Document = None - -import re - -# def crawl_url(url, max_depth=1): -# print("\nCrawling...") -# visited = set() -# to_visit = [(url, 0)] -# contents = {} - -# while to_visit: -# current_url, depth = to_visit.pop(0) -# if current_url in visited or depth > max_depth: -# continue - -# try: -# response = requests.get(current_url) -# if response.status_code == 200: -# visited.add(current_url) -# contents[current_url] = response.text -# soup = BeautifulSoup(response.text, "html.parser") - -# for link in soup.find_all("a", href=True): -# absolute_link = urllib.parse.urljoin( -# current_url, link["href"] -# ) -# if absolute_link not in visited: -# to_visit.append((absolute_link, depth + 1)) - -# except requests.RequestException: -# pass - -# return visited, contents - - -def extract_main_content(html): - if Document is None: - # Fallback: just strip HTML tags - content = re.sub("<[^<]+?>", "", html) - content = " ".join(content.split()) - return content[:5000] # Limit to first 5000 chars - - doc = Document(html) - content = doc.summary() - # Remove HTML tags - content = re.sub("<[^<]+?>", "", content) - # Remove extra whitespace - content = " ".join(content.split()) - return content - - -def crawl_url(url, max_depth=1): - print("\nCrawling...") - visited = set() - to_visit = [(url, 0)] - contents = {} - - while to_visit: - current_url, depth = to_visit.pop(0) - if current_url in visited or depth > max_depth: - continue - - try: - response = requests.get(current_url) - if response.status_code == 200: - visited.add(current_url) - main_content = extract_main_content(response.text) - contents[current_url] = main_content - soup = BeautifulSoup(response.text, "html.parser") - - for link in soup.find_all("a", href=True): - absolute_link = urllib.parse.urljoin(current_url, link["href"]) - if absolute_link not in visited: - to_visit.append((absolute_link, depth + 1)) - - except requests.RequestException: - pass - - return visited, contents - - -def crawl_to_json(start_url): - if not start_url.startswith("http"): - start_url = "https://" + start_url - crawled_urls, contents = crawl_url(start_url) - - print("\nSummalizing as json...") - - def process_url(url): - llm = scitex.ai.GenAI("gpt-4o-mini") - return { - "url": url, - "content": llm(f"Summarize this page in 1 line:\n\n{contents[url]}"), - } - - with ThreadPoolExecutor() as executor: - future_to_url = {executor.submit(process_url, url): url for url in crawled_urls} - crawled_pages = [] - for future in tqdm( - as_completed(future_to_url), - total=len(crawled_urls), - desc="Processing URLs", - ): - crawled_pages.append(future.result()) - - result = {"start_url": start_url, "crawled_pages": crawled_pages} - - return 
json.dumps(result, indent=2) - - -def summarize_all(json_contents): - llm = scitex.ai.GenAI("gpt-4o-mini") - out = llm(f"Summarize this json file with 5 bullet points:\n\n{json_contents}") - return out - - -def summarize_url(start_url): - json_result = crawl_to_json(start_url) - ground_summary = summarize_all(json_result) - - pprint(ground_summary) - return ground_summary, json_result - - -main = summarize_url - -if __name__ == "__main__": - import argparse - - import scitex - - parser = argparse.ArgumentParser(description="") - parser.add_argument("--url", "-u", type=str, help="(default: %(default)s)") - args = parser.parse_args() - scitex.gen.print_block(args, c="yellow") - - main(args.url) diff --git a/src/scitex/web/download_images.py b/src/scitex/web/download_images.py deleted file mode 100755 index b891eda90..000000000 --- a/src/scitex/web/download_images.py +++ /dev/null @@ -1,323 +0,0 @@ -#!/usr/bin/env python3 -# File: ./src/scitex/web/download_images.py - -""" -Image Downloader for SciTeX. - -Downloads images from URLs with minimum size filtering. - -Usage: - python -m scitex.web.download_images https://example.com - python -m scitex.web.download_images https://example.com -o ./downloads - python -m scitex.web.download_images https://example.com --min-size 800x600 -""" - -import os -import re -import urllib.parse -from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime -from pathlib import Path -from typing import List, Optional, Tuple - -import requests -from tqdm import tqdm - -# NOTE: ``bs4`` is imported lazily inside functions that actually use it. -# Importing at module load leaks ``ModuleNotFoundError`` through -# ``scitex.web.__init__`` and breaks ``scitex --json`` / -# ``scitex --help-recursive`` on installs without beautifulsoup4. -# See ywatanabe1989/todo#279. 
- -try: - from io import BytesIO - - from PIL import Image - - PILLOW_AVAILABLE = True -except ImportError: - PILLOW_AVAILABLE = False - -from scitex.logging import getLogger - -logger = getLogger(__name__) - -# Configuration -DEFAULT_MIN_WIDTH = 400 -DEFAULT_MIN_HEIGHT = 300 -DEFAULT_TIMEOUT = 10 -DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" - - -def _get_default_download_dir() -> str: - """Get default download directory using SCITEX_DIR if available.""" - scitex_root = os.environ.get("SCITEX_DIR", os.path.expanduser("~/.scitex")) - return os.path.join(scitex_root, "web", "downloads") - - -def _normalize_url_for_directory(url: str) -> str: - """Convert URL to a safe directory name.""" - parsed = urllib.parse.urlparse(url) - domain = parsed.netloc.replace("www.", "") - path = parsed.path.strip("/").replace("/", "-") - - normalized = f"{domain}-{path}" if path else domain - normalized = re.sub(r"[^\w\-.]", "-", normalized) - normalized = re.sub(r"-+", "-", normalized) - normalized = normalized[:100].strip("-") - - return normalized - - -def _is_direct_image_url(url: str) -> bool: - """Check if URL appears to be a direct image link.""" - extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"] - path = urllib.parse.urlparse(url.lower()).path - return any(path.endswith(ext) for ext in extensions) - - -def _extract_image_urls(url: str, same_domain: bool = False) -> List[str]: - """Extract image URLs from a webpage.""" - from bs4 import BeautifulSoup # lazy: see module note, todo#279 - - try: - logger.info(f"Fetching page: {url}") - response = requests.get( - url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - except requests.RequestException as e: - logger.error(f"Failed to fetch page: {e}") - return [] - - soup = BeautifulSoup(response.content, "html.parser") - parsed_base = urllib.parse.urlparse(url) - image_urls = set() - - for img in soup.find_all("img"): - img_url = img.get("src") or img.get("data-src") - if not img_url: - continue - - img_url = urllib.parse.urljoin(url, img_url) - - if img_url.lower().endswith((".svg", ".svgz")): - continue - - if same_domain: - parsed_img = urllib.parse.urlparse(img_url) - if parsed_img.netloc != parsed_base.netloc: - continue - - image_urls.add(img_url) - - logger.info(f"Found {len(image_urls)} images on page") - return list(image_urls) - - -def _download_single_image( - img_url: str, - output_dir: Path, - counter: int, - min_size: Optional[Tuple[int, int]], -) -> Optional[str]: - """Download a single image.""" - try: - response = requests.get( - img_url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - - # Validate content-type - content_type = response.headers.get("content-type", "") - if not content_type.startswith("image/"): - logger.debug(f"Skipping non-image: {content_type}") - return None - - # Check dimensions - if min_size and PILLOW_AVAILABLE: - try: - img = Image.open(BytesIO(response.content)) - width, height = img.size - if width < min_size[0] or height < min_size[1]: - logger.debug( - f"Skipping small image: {width}x{height} " - f"(min: {min_size[0]}x{min_size[1]})" - ) - return None - except Exception: - pass - - # Determine extension - ext = "jpg" - if PILLOW_AVAILABLE: - try: - img = Image.open(BytesIO(response.content)) - fmt = img.format.lower() if img.format else "jpeg" - ext = "jpg" if fmt == "jpeg" else fmt - except Exception: - pass - elif "png" in content_type: - 
ext = "png" - elif "gif" in content_type: - ext = "gif" - elif "webp" in content_type: - ext = "webp" - - filename = f"{counter:04d}.{ext}" - filepath = output_dir / filename - - with open(filepath, "wb") as f: - f.write(response.content) - - logger.info(f"Downloaded: {filename}") - return str(filepath) - - except Exception as e: - logger.warning(f"Error downloading {img_url}: {e}") - return None - - -def download_images( - url: str, - output_dir: Optional[str] = None, - min_size: Optional[Tuple[int, int]] = None, - max_workers: int = 5, - same_domain: bool = False, -) -> List[str]: - """ - Download images from a URL. - - Args: - url: Webpage URL or direct image URL - output_dir: Output directory (default: $SCITEX_DIR/web/downloads) - min_size: Minimum (width, height) to filter small images (default: 400x300) - max_workers: Concurrent download threads - same_domain: Only download images from the same domain - - Returns: - List of downloaded file paths - - Example: - >>> paths = download_images("https://example.com") - >>> paths = download_images("https://example.com/photo.jpg") - >>> paths = download_images("https://example.com", min_size=(800, 600)) - """ - if not PILLOW_AVAILABLE: - logger.warning("Pillow not available. Size filtering disabled.") - min_size = None - elif min_size is None: - min_size = (DEFAULT_MIN_WIDTH, DEFAULT_MIN_HEIGHT) - - # Setup output directory - if output_dir is None: - output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR") - if output_dir is None: - output_dir = _get_default_download_dir() - - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - normalized = _normalize_url_for_directory(url) - output_path = Path(output_dir).expanduser() / f"{timestamp}-{normalized}-images" - output_path.mkdir(parents=True, exist_ok=True) - - logger.info(f"Output directory: {output_path}") - - # Get image URLs - if _is_direct_image_url(url): - image_urls = [url] - logger.info("Direct image URL detected") - else: - image_urls = _extract_image_urls(url, same_domain=same_domain) - - if not image_urls: - logger.warning("No images found") - return [] - - # Download concurrently - downloaded = [] - counter = [1] - - def download_with_counter(img_url: str) -> Optional[str]: - idx = counter[0] - counter[0] += 1 - return _download_single_image(img_url, output_path, idx, min_size) - - with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = {executor.submit(download_with_counter, u): u for u in image_urls} - - for future in tqdm( - as_completed(futures), total=len(image_urls), desc="Downloading" - ): - result = future.result() - if result: - downloaded.append(result) - - logger.info(f"Downloaded {len(downloaded)} images to {output_path}") - return downloaded - - -def main(): - """CLI entry point.""" - import argparse - - parser = argparse.ArgumentParser( - description="Download images from URL", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - python -m scitex.web.download_images https://example.com - python -m scitex.web.download_images https://example.com -o ./downloads - python -m scitex.web.download_images https://example.com --min-size 800x600 - python -m scitex.web.download_images https://example.com --no-min-size - """, - ) - parser.add_argument("url", help="URL to download images from") - parser.add_argument("-o", "--output", help="Output directory") - parser.add_argument( - "--min-size", - default="400x300", - help="Minimum size WIDTHxHEIGHT (default: 400x300)", - ) - parser.add_argument( - "--no-min-size", - 
action="store_true", - help="Disable size filtering", - ) - parser.add_argument( - "--same-domain", - action="store_true", - help="Only download from same domain", - ) - parser.add_argument( - "--workers", - type=int, - default=5, - help="Concurrent downloads (default: 5)", - ) - - args = parser.parse_args() - - min_size = None - if not args.no_min_size and args.min_size: - w, h = map(int, args.min_size.split("x")) - min_size = (w, h) - - paths = download_images( - args.url, - output_dir=args.output, - min_size=min_size, - max_workers=args.workers, - same_domain=args.same_domain, - ) - - print(f"\nDownloaded {len(paths)} images:") - for p in paths: - print(f" {p}") - - -if __name__ == "__main__": - main() diff --git a/tests/scitex/web/test__scraping.py b/tests/scitex/web/test__scraping.py deleted file mode 100644 index 0534300db..000000000 --- a/tests/scitex/web/test__scraping.py +++ /dev/null @@ -1,712 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# File: ./tests/scitex/web/test__scraping.py - -""" -Tests for web scraping utilities. -""" - -import re -import shutil -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, Mock, mock_open, patch - -import pytest - - -class TestGetUrls: - """Test get_urls function.""" - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_basic(self, mock_get): - """Test basic URL extraction.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - -
- Link 1 - Link 2 - Link 3 - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com") - - assert len(urls) == 3 - assert "https://example.com/page1" in urls - assert "https://example.com/page2" in urls - assert "https://example.com/page3" in urls - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_with_pattern(self, mock_get): - """Test URL extraction with pattern filtering.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - PDF - HTML - Another PDF - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com", pattern=r"\.pdf$") - - assert len(urls) == 2 - assert all(url.endswith(".pdf") for url in urls) - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_same_domain(self, mock_get): - """Test URL extraction with same domain filter.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - Internal - External - Relative - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com", same_domain=True) - - assert len(urls) == 2 - assert all("example.com" in url for url in urls) - assert not any("other.com" in url for url in urls) - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_relative_urls(self, mock_get): - """Test conversion of relative URLs to absolute.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - Page 1 - Page 2 - Page 3 - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com/dir/", absolute=True) - - assert len(urls) == 3 - assert all(url.startswith("https://") for url in urls) - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_request_failure(self, mock_get): - """Test handling of request failures.""" - import requests - - from scitex.web import get_urls - - mock_get.side_effect = requests.RequestException("Network error") - - urls = get_urls("https://example.com") - - assert urls == [] - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_duplicate_removal(self, mock_get): - """Test that duplicate URLs are removed.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - Link 1 - Link 1 again - Relative to same page - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com") - - # Should only have one instance of page1 - assert len(urls) == 1 - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_empty_page(self, mock_get): - """Test handling of page with no links.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = "No links here" - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com") - - assert urls == [] - - -class TestGetImageUrls: - """Test get_image_urls function.""" - - @patch("scitex.web._scraping.requests.get") - def test_get_image_urls_basic(self, mock_get): - """Test basic image URL extraction.""" - from scitex.web import get_image_urls - - mock_response = Mock() - mock_response.text = """ - - -
-
-
-
-
- """
- mock_response.raise_for_status = Mock()
- mock_get.return_value = mock_response
-
- img_urls = get_image_urls("https://example.com")
-
- assert len(img_urls) == 3
- assert "https://example.com/image1.jpg" in img_urls
- assert "https://example.com/images/image2.png" in img_urls
-
- @patch("scitex.web._scraping.requests.get")
- def test_get_image_urls_with_pattern(self, mock_get):
- """Test image URL extraction with pattern filtering."""
- from scitex.web import get_image_urls
-
- mock_response = Mock()
- mock_response.text = """
-
-
-
-
-
-
-
- """
- mock_response.raise_for_status = Mock()
- mock_get.return_value = mock_response
-
- img_urls = get_image_urls("https://example.com", pattern=r"\.jpg$")
-
- assert len(img_urls) == 2
- assert all(url.endswith(".jpg") for url in img_urls)
-
- @patch("scitex.web._scraping.requests.get")
- def test_get_image_urls_same_domain(self, mock_get):
- """Test image URL extraction with same domain filter."""
- from scitex.web import get_image_urls
-
- mock_response = Mock()
- mock_response.text = """
-
-
-
-
-
-
-
- """
- mock_response.raise_for_status = Mock()
- mock_get.return_value = mock_response
-
- img_urls = get_image_urls("https://example.com", same_domain=True)
-
- assert len(img_urls) == 2
- assert all("example.com" in url for url in img_urls)
-
- @patch("scitex.web._scraping.requests.get")
- def test_get_image_urls_request_failure(self, mock_get):
- """Test handling of request failures."""
- import requests
-
- from scitex.web import get_image_urls
-
- mock_get.side_effect = requests.RequestException("Network error")
-
- img_urls = get_image_urls("https://example.com")
-
- assert img_urls == []
-
- @patch("scitex.web._scraping.requests.get")
- def test_get_image_urls_no_images(self, mock_get):
- """Test handling of page with no images."""
- from scitex.web import get_image_urls
-
- mock_response = Mock()
- mock_response.text = "No images here"
- mock_response.raise_for_status = Mock()
- mock_get.return_value = mock_response
-
- img_urls = get_image_urls("https://example.com")
-
- assert img_urls == []
-
-
-class TestDownloadImages:
- """Test download_images function."""
-
- def setup_method(self):
- """Set up temporary directory for tests."""
- self.temp_dir = tempfile.mkdtemp()
-
- def teardown_method(self):
- """Clean up temporary directory after tests."""
- if Path(self.temp_dir).exists():
- shutil.rmtree(self.temp_dir)
-
- @patch("scitex.web._scraping.requests.get")
- def test_download_images_basic(self, mock_get):
- """Test basic image downloading."""
- from scitex.web import download_images
-
- # Mock page response
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- # Mock image responses
- img_response1 = Mock()
- img_response1.content = b"fake image data 1"
- img_response1.headers = {"content-type": "image/jpeg"}
- img_response1.raise_for_status = Mock()
-
- img_response2 = Mock()
- img_response2.content = b"fake image data 2"
- img_response2.headers = {"content-type": "image/png"}
- img_response2.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response1, img_response2]
-
- paths = download_images("https://example.com", output_dir=self.temp_dir)
-
- assert len(paths) == 2
- assert all(Path(p).exists() for p in paths)
-
- @patch("scitex.web._scraping.requests.get")
- def test_download_images_with_pattern(self, mock_get):
- """Test image downloading with pattern filter."""
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response = Mock()
- img_response.content = b"fake image data"
- img_response.headers = {"content-type": "image/jpeg"}
- img_response.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response]
-
- paths = download_images(
- "https://example.com", output_dir=self.temp_dir, pattern=r"\.jpg$"
- )
-
- assert len(paths) == 1
-
- @patch("scitex.web._scraping.requests.get")
- def test_download_images_duplicate_filenames(self, mock_get):
- """Test handling of duplicate filenames."""
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response = Mock()
- img_response.content = b"fake image data"
- img_response.headers = {"content-type": "image/jpeg"}
- img_response.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response, img_response]
-
- paths = download_images("https://example.com", output_dir=self.temp_dir)
-
- # Should have both images with different filenames
- assert len(paths) == 2
- assert len(set(paths)) == 2 # All paths are unique
-
- @patch("scitex.web._scraping.requests.get")
- def test_download_images_request_failure(self, mock_get):
- """Test handling of request failures."""
- import requests
-
- from scitex.web import download_images
-
- mock_get.side_effect = requests.RequestException("Network error")
-
- paths = download_images("https://example.com", output_dir=self.temp_dir)
-
- assert paths == []
-
- @patch("scitex.web._scraping.requests.get")
- def test_download_images_same_domain(self, mock_get):
- """Test downloading only images from same domain."""
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response = Mock()
- img_response.content = b"fake image data"
- img_response.headers = {"content-type": "image/jpeg"}
- img_response.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response]
-
- paths = download_images(
- "https://example.com", output_dir=self.temp_dir, same_domain=True
- )
-
- # Should only download the first image
- assert len(paths) == 1
-
- @patch("scitex.web._scraping.requests.get")
- @patch.dict("os.environ", {}, clear=True)
- def test_download_images_no_output_dir(self, mock_get):
- """Test default output directory creation using SCITEX_DIR."""
- import os
-
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response = Mock()
- img_response.content = b"fake image data"
- img_response.headers = {"content-type": "image/jpeg"}
- img_response.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response]
-
- # Set SCITEX_DIR to a temp location for testing
- test_scitex_dir = Path(self.temp_dir) / "scitex"
- os.environ["SCITEX_DIR"] = str(test_scitex_dir)
-
- paths = download_images("https://example.com")
-
- assert len(paths) == 1
- expected_dir = test_scitex_dir / "web" / "downloads"
- assert expected_dir.exists()
-
- @patch("scitex.web._scraping.requests.get")
- @patch.dict(
- "os.environ", {"SCITEX_WEB_DOWNLOADS_DIR": "/tmp/test_downloads"}, clear=True
- )
- def test_download_images_env_var_priority(self, mock_get):
- """Test that SCITEX_WEB_DOWNLOADS_DIR takes priority."""
- import os
-
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response = Mock()
- img_response.content = b"fake image data"
- img_response.headers = {"content-type": "image/jpeg"}
- img_response.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response]
-
- # Set both env vars
- os.environ["SCITEX_DIR"] = "/tmp/scitex"
- os.environ["SCITEX_WEB_DOWNLOADS_DIR"] = self.temp_dir
-
- paths = download_images("https://example.com")
-
- # Should use SCITEX_WEB_DOWNLOADS_DIR, not SCITEX_DIR
- assert len(paths) == 1
- assert paths[0].startswith(self.temp_dir)
-
- @patch("scitex.web._scraping.requests.get")
- @patch("scitex.web._scraping.PILLOW_AVAILABLE", True)
- @patch("scitex.web._scraping.Image.open")
- def test_download_images_min_size_filter(self, mock_image_open, mock_get):
- """Test minimum size filtering."""
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response_small = Mock()
- img_response_small.content = b"small image"
- img_response_small.headers = {"content-type": "image/jpeg"}
- img_response_small.raise_for_status = Mock()
-
- img_response_large = Mock()
- img_response_large.content = b"large image"
- img_response_large.headers = {"content-type": "image/jpeg"}
- img_response_large.raise_for_status = Mock()
-
- # Mock image sizes
- small_img = Mock()
- small_img.size = (50, 50)
- large_img = Mock()
- large_img.size = (500, 500)
-
- mock_image_open.side_effect = [small_img, large_img]
- mock_get.side_effect = [page_response, img_response_small, img_response_large]
-
- paths = download_images(
- "https://example.com", output_dir=self.temp_dir, min_size=(100, 100)
- )
-
- # Only the large image should be downloaded
- assert len(paths) == 1
-
-
-class TestScrapingModuleImport:
- """Test that scraping functions are properly exported."""
-
- def test_scraping_functions_available(self):
- """Test that all scraping functions are available."""
- import scitex.web
-
- assert hasattr(scitex.web, "get_urls")
- assert hasattr(scitex.web, "download_images")
- assert hasattr(scitex.web, "get_image_urls")
-
- assert callable(scitex.web.get_urls)
- assert callable(scitex.web.download_images)
- assert callable(scitex.web.get_image_urls)
-
-
-if __name__ == "__main__":
- import os
-
- import pytest
-
- pytest.main([os.path.abspath(__file__)])
-
-# --------------------------------------------------------------------------------
-# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_scraping.py
-# --------------------------------------------------------------------------------
-# #!/usr/bin/env python3
-# # File: ./src/scitex/web/_scraping.py
-#
-# """Web scraping utilities for extracting URLs."""
-#
-# import re
-# import urllib.parse
-# from typing import List, Optional, Set
-#
-# import requests
-# from bs4 import BeautifulSoup
-#
-# from scitex.logging import getLogger
-#
-# logger = getLogger(__name__)
-#
-# DEFAULT_TIMEOUT = 10
-# DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-#
-#
-# def get_urls(
-# url: str,
-# pattern: Optional[str] = None,
-# absolute: bool = True,
-# same_domain: bool = False,
-# include_external: bool = True,
-# ) -> List[str]:
-# """
-# Extract all URLs from a webpage.
-#
-# Args:
-# url: The URL of the webpage to scrape
-# pattern: Optional regex pattern to filter URLs (e.g., r'\\.pdf$' for PDF files)
-# absolute: If True, convert relative URLs to absolute URLs
-# same_domain: If True, only return URLs from the same domain
-# include_external: If True, include external links (only applies if same_domain=False)
-#
-# Returns:
-# List of URLs found on the page
-#
-# Example:
-# >>> urls = get_urls('https://example.com', pattern=r'\\.pdf$')
-# >>> urls = get_urls('https://example.com', same_domain=True)
-# """
-# try:
-# logger.info(f"Fetching URLs from: {url}")
-# response = requests.get(
-# url,
-# timeout=DEFAULT_TIMEOUT,
-# headers={"User-Agent": DEFAULT_USER_AGENT},
-# )
-# response.raise_for_status()
-# except requests.RequestException as e:
-# logger.error(f"Failed to fetch URL {url}: {e}")
-# return []
-#
-# soup = BeautifulSoup(response.text, "html.parser")
-# urls_found: Set[str] = set()
-#
-# parsed_base = urllib.parse.urlparse(url)
-#
-# for link in soup.find_all("a", href=True):
-# href = link["href"]
-#
-# if absolute:
-# href = urllib.parse.urljoin(url, href)
-#
-# if same_domain:
-# parsed_href = urllib.parse.urlparse(href)
-# if parsed_href.netloc != parsed_base.netloc:
-# continue
-# elif not include_external:
-# parsed_href = urllib.parse.urlparse(href)
-# if parsed_href.netloc and parsed_href.netloc != parsed_base.netloc:
-# continue
-#
-# if pattern and not re.search(pattern, href):
-# continue
-#
-# urls_found.add(href)
-#
-# result = sorted(list(urls_found))
-# logger.info(f"Found {len(result)} URLs")
-# return result
-#
-#
-# def get_image_urls(
-# url: str,
-# pattern: Optional[str] = None,
-# same_domain: bool = False,
-# ) -> List[str]:
-# """
-# Extract all image URLs from a webpage without downloading them.
-#
-# Args:
-# url: The URL of the webpage to scrape
-# pattern: Optional regex pattern to filter image URLs
-# same_domain: If True, only return images from the same domain
-#
-# Returns:
-# List of image URLs found on the page
-#
-# Note:
-# - SVG files are automatically skipped (vector graphics)
-# - Checks both 'src' and 'data-src' attributes for lazy-loaded images
-#
-# Example:
-# >>> img_urls = get_image_urls('https://example.com')
-# >>> img_urls = get_image_urls('https://example.com', pattern=r'\\.png$')
-# """
-# try:
-# logger.info(f"Fetching image URLs from: {url}")
-# response = requests.get(
-# url,
-# timeout=DEFAULT_TIMEOUT,
-# headers={"User-Agent": DEFAULT_USER_AGENT},
-# )
-# response.raise_for_status()
-# except requests.RequestException as e:
-# logger.error(f"Failed to fetch URL {url}: {e}")
-# return []
-#
-# soup = BeautifulSoup(response.text, "html.parser")
-# image_urls: Set[str] = set()
-#
-# parsed_base = urllib.parse.urlparse(url)
-#
-# for img in soup.find_all("img"):
-# img_url = img.get("src") or img.get("data-src")
-# if not img_url:
-# continue
-#
-# img_url = urllib.parse.urljoin(url, img_url)
-#
-# if img_url.lower().endswith((".svg", ".svgz")):
-# continue
-#
-# if same_domain:
-# parsed_img = urllib.parse.urlparse(img_url)
-# if parsed_img.netloc != parsed_base.netloc:
-# continue
-#
-# if pattern and not re.search(pattern, img_url):
-# continue
-#
-# image_urls.add(img_url)
-#
-# result = sorted(list(image_urls))
-# logger.info(f"Found {len(result)} image URLs")
-# return result
-
-# --------------------------------------------------------------------------------
-# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_scraping.py
-# --------------------------------------------------------------------------------
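# --------------------------------------------------------------------------------
# Note: usage sketch for the removed scraping helpers (not part of the patch)
# --------------------------------------------------------------------------------
# After this change, get_urls()/get_image_urls() are provided by the standalone
# scitex-web package; the sys.modules shim in src/scitex/web/__init__.py keeps the
# old import path working. A minimal sketch, assuming scitex-web preserves the
# signatures shown in the deleted module above (unverified against the new package):
#
# from scitex.web import get_urls, get_image_urls  # resolves to scitex_web via the shim
#
# # Collect PDF links, restricted to the page's own domain.
# pdf_links = get_urls("https://example.com", pattern=r"\.pdf$", same_domain=True)
#
# # Collect PNG image URLs; both src and data-src (lazy-loaded) are checked.
# png_urls = get_image_urls("https://example.com", pattern=r"\.png$")
#
# print(len(pdf_links), len(png_urls))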
diff --git a/tests/scitex/web/test__search_pubmed.py b/tests/scitex/web/test__search_pubmed.py
deleted file mode 100755
index bf91741b5..000000000
--- a/tests/scitex/web/test__search_pubmed.py
+++ /dev/null
@@ -1,1170 +0,0 @@
-#!/usr/bin/env python3
-# Time-stamp: "2024-11-08 05:50:57 (ywatanabe)"
-# File: ./scitex_repo/tests/scitex/web/test__search_pubmed.py
-
-"""
-Tests for PubMed search functionality.
-"""
-
-import pytest
-
-aiohttp = pytest.importorskip("aiohttp")
-pytest.importorskip("scitex.web.search_pubmed")
-
-import asyncio # noqa: F401, E402
-import json # noqa: F401, E402
-import xml.etree.ElementTree as ET # noqa: F401, E402
-from io import StringIO # noqa: F401, E402
-from unittest.mock import MagicMock, Mock, mock_open, patch # noqa: E402
-
-try:
- from scitex.web import (
- _fetch_details,
- _get_citation,
- _parse_abstract_xml,
- _search_pubmed,
- batch__fetch_details,
- fetch_async,
- format_bibtex,
- get_crossref_metrics,
- parse_args,
- run_main,
- save_bibtex,
- search_pubmed,
- )
-except ImportError:
- pytest.skip("scitex.web.search_pubmed not available", allow_module_level=True)
-
-
-class TestSearchPubmed:
- """Test _search_pubmed function."""
-
- def test_search_pubmed_success(self):
- """Test successful PubMed search."""
- mock_response = Mock()
- mock_response.ok = True
- mock_response.json.return_value = {
- "esearchresult": {"idlist": ["12345", "67890"], "count": "2"}
- }
-
- with patch("requests.get", return_value=mock_response):
- result = _search_pubmed("test query", retmax=10)
- assert result == mock_response.json.return_value
- assert len(result["esearchresult"]["idlist"]) == 2
-
- def test_search_pubmed_failure(self):
- """Test failed PubMed search."""
- mock_response = Mock()
- mock_response.ok = False
-
- with patch("requests.get", return_value=mock_response):
- with patch("scitex.str.printc") as mock_print:
- result = _search_pubmed("test query")
- assert result == {}
- mock_print.assert_called_once()
-
- def test_search_pubmed_network_error(self):
- """Test network error during search."""
- import requests
-
- with patch(
- "requests.get",
- side_effect=requests.exceptions.RequestException("Network error"),
- ):
- with patch("scitex.str.printc") as mock_print:
- result = _search_pubmed("test query")
- assert result == {}
- mock_print.assert_called_once()
-
- def test_search_pubmed_parameters(self):
- """Test search parameters are correctly passed."""
- mock_response = Mock()
- mock_response.ok = True
- mock_response.json.return_value = {"esearchresult": {}}
-
- with patch("requests.get", return_value=mock_response) as mock_get:
- _search_pubmed("epilepsy", retmax=500)
-
- # Check that correct parameters were passed
- args, kwargs = mock_get.call_args
- assert kwargs["params"]["term"] == "epilepsy"
- assert kwargs["params"]["retmax"] == 500
- assert kwargs["params"]["db"] == "pubmed"
-
-
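# --------------------------------------------------------------------------------
# Note: the request shape pinned down by TestSearchPubmed above (not part of the patch)
# --------------------------------------------------------------------------------
# test_search_pubmed_parameters asserts the NCBI ESearch query that _search_pubmed
# builds: db="pubmed", term=<query>, retmax=<max results>. A minimal standalone
# sketch of that call; retmode=json is an assumption, since the deleted helper's
# parameters beyond the three asserted ones are not shown here:
#
# import requests
#
# resp = requests.get(
#     "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
#     params={"db": "pubmed", "term": "epilepsy", "retmax": 500, "retmode": "json"},
#     timeout=10,
# )
# ids = resp.json()["esearchresult"]["idlist"]  # PMIDs matching the query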
-class TestFetchDetails:
- """Test _fetch_details function."""
-
- def test_fetch_details_success(self):
- """Test successful fetch of article details."""
- mock_abstract_response = Mock()
- mock_abstract_response.ok = True
diff --git a/tests/scitex/web/test__summarize_url.py b/tests/scitex/web/test__summarize_url.py
deleted file mode 100755
--- a/tests/scitex/web/test__summarize_url.py
+++ /dev/null
-class TestExtractMainContent:
-    """Test extract_main_content function."""
-
-    def test_extract_main_content_with_readability(self):
-        """Test content extraction with readability available."""
-        html_content = (
-            "<html><body><h1>Main Title</h1>"
-            "<p>This is the main content.</p></body></html>"
-        )
-        mock_doc = Mock()
-        mock_doc.summary.return_value = (
-            "<h1>Main Title</h1><p>This is the main content.</p>"
-        )
-
-        with patch("scitex.web._summarize_url.Document", return_value=mock_doc):
-            result = extract_main_content(html_content)
-            assert "Main Title" in result
-            assert "This is the main content" in result
-            assert "<" not in result  # HTML tags removed
-
-    def test_extract_main_content_without_readability(self):
-        """Test content extraction when readability is not available."""
-        html_content = "<html><body><p>Test content</p></body></html>"
" - - with patch("scitex.web._summarize_url.Document", None): - result = extract_main_content(html_content) - assert result == "Test content"[:5000] # Limited to 5000 chars - - def test_extract_main_content_complex_html(self): - """Test extraction with complex HTML.""" - html_content = """ - -Real content with spaces
- - - - """ - - mock_doc = Mock() - mock_doc.summary.return_value = "Real content with spaces
" - - with patch("scitex.web._summarize_url.Document", return_value=mock_doc): - result = extract_main_content(html_content) - assert result == "Real content with spaces" # Extra spaces removed - - def test_extract_main_content_empty_html(self): - """Test extraction with empty HTML.""" - with patch("scitex.web._summarize_url.Document", None): - result = extract_main_content("") - assert result == "" - - def test_extract_main_content_no_tags(self): - """Test extraction with plain text.""" - plain_text = "Just plain text without HTML" - - with patch("scitex.web._summarize_url.Document", None): - result = extract_main_content(plain_text) - assert result == plain_text - - -class TestCrawlUrl: - """Test crawl_url function.""" - - def test_crawl_url_single_page(self): - """Test crawling a single page.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = "Test content
" - - with patch("requests.get", return_value=mock_response): - with patch( - "scitex.web._summarize_url.extract_main_content", - return_value="Test content", - ): - visited, contents = crawl_url("http://test.com", max_depth=0) - - assert "http://test.com" in visited - assert contents["http://test.com"] == "Test content" - assert len(visited) == 1 - - def test_crawl_url_with_links(self): - """Test crawling with links to follow.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = """ - -Main page
- Link to page 2 - Link to page 3 - - """ - - with patch("requests.get", return_value=mock_response): - with patch( - "scitex.web._summarize_url.extract_main_content", return_value="Content" - ): - visited, contents = crawl_url("http://test.com", max_depth=1) - - # Should visit main page and try to visit linked pages - assert "http://test.com" in visited - - def test_crawl_url_max_depth(self): - """Test that max_depth is respected.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = 'Link' - - with patch("requests.get", return_value=mock_response): - with patch( - "scitex.web._summarize_url.extract_main_content", return_value="Content" - ): - visited, contents = crawl_url("http://test.com", max_depth=0) - - # Should only visit the initial URL with max_depth=0 - assert len(visited) == 1 - assert "http://test.com" in visited - - def test_crawl_url_request_exception(self): - """Test handling of request exceptions.""" - import requests - - with patch( - "requests.get", side_effect=requests.RequestException("Network error") - ): - visited, contents = crawl_url("http://test.com") - - assert len(visited) == 0 - assert len(contents) == 0 - - def test_crawl_url_non_200_status(self): - """Test handling of non-200 status codes.""" - mock_response = Mock() - mock_response.status_code = 404 - - with patch("requests.get", return_value=mock_response): - visited, contents = crawl_url("http://test.com") - - assert len(visited) == 0 - assert len(contents) == 0 - - def test_crawl_url_avoid_duplicate_visits(self): - """Test that URLs are not visited twice.""" - mock_response = Mock() - mock_response.status_code = 200 - # Use exact same URL to test duplicate avoidance - mock_response.text = 'Home' - - call_count = 0 - - def mock_get(*args, **kwargs): - nonlocal call_count - call_count += 1 - return mock_response - - with patch("requests.get", side_effect=mock_get): - with patch( - "scitex.web._summarize_url.extract_main_content", return_value="Content" - ): - visited, contents = crawl_url("http://test.com", max_depth=1) - - # Should only call once despite self-referential link to exact same URL - assert call_count == 1 - - -class TestCrawlToJson: - """Test crawl_to_json function.""" - - def test_crawl_to_json_basic(self): - """Test basic JSON conversion.""" - mock_urls = {"http://test.com"} - mock_contents = {"http://test.com": "Test page content"} - - with patch( - "scitex.web._summarize_url.crawl_url", - return_value=(mock_urls, mock_contents), - ): - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.return_value = "Summary of test page" - mock_genai.return_value = mock_llm - - # Mock ThreadPoolExecutor - mock_future = Mock(spec=Future) - mock_future.result.return_value = { - "url": "http://test.com", - "content": "Summary of test page", - } - - with patch("concurrent.futures.ThreadPoolExecutor") as mock_executor: - mock_executor.return_value.__enter__.return_value.submit.return_value = ( - mock_future - ) - with patch( - "concurrent.futures.as_completed", return_value=[mock_future] - ): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("test.com") - - parsed = json.loads(result) - assert parsed["start_url"] == "https://test.com" - assert len(parsed["crawled_pages"]) == 1 - assert ( - parsed["crawled_pages"][0]["url"] == "http://test.com" - ) - - def test_crawl_to_json_url_normalization(self): - """Test URL normalization (adding https://).""" - with patch("scitex.web._summarize_url.crawl_url", return_value=(set(), 
{})): - with patch("concurrent.futures.ThreadPoolExecutor"): - with patch("concurrent.futures.as_completed", return_value=[]): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("example.com") - parsed = json.loads(result) - assert parsed["start_url"] == "https://example.com" - - def test_crawl_to_json_already_has_protocol(self): - """Test URL with existing protocol.""" - with patch("scitex.web._summarize_url.crawl_url", return_value=(set(), {})): - with patch("concurrent.futures.ThreadPoolExecutor"): - with patch("concurrent.futures.as_completed", return_value=[]): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("http://example.com") - parsed = json.loads(result) - assert parsed["start_url"] == "http://example.com" - - def test_crawl_to_json_multiple_pages(self): - """Test JSON conversion with multiple pages.""" - mock_urls = {"http://test.com", "http://test.com/page2"} - mock_contents = { - "http://test.com": "Main content", - "http://test.com/page2": "Page 2 content", - } - - with patch( - "scitex.web._summarize_url.crawl_url", - return_value=(mock_urls, mock_contents), - ): - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.side_effect = ["Summary 1", "Summary 2"] - mock_genai.return_value = mock_llm - - # Create futures for each URL - futures = [] - for i, url in enumerate(mock_urls): - mock_future = Mock(spec=Future) - mock_future.result.return_value = { - "url": url, - "content": f"Summary {i + 1}", - } - futures.append(mock_future) - - with patch("concurrent.futures.ThreadPoolExecutor") as mock_executor: - mock_executor.return_value.__enter__.return_value.submit.side_effect = ( - futures - ) - with patch("concurrent.futures.as_completed", return_value=futures): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("test.com") - - parsed = json.loads(result) - assert len(parsed["crawled_pages"]) == 2 - - -class TestSummarizeAll: - """Test summarize_all function.""" - - def test_summarize_all_basic(self): - """Test basic summarization.""" - json_content = json.dumps( - { - "start_url": "http://test.com", - "crawled_pages": [ - {"url": "http://test.com", "content": "Test summary"} - ], - } - ) - - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.return_value = ( - "• Point 1\n• Point 2\n• Point 3\n• Point 4\n• Point 5" - ) - mock_genai.return_value = mock_llm - - result = summarize_all(json_content) - - assert "Point 1" in result - assert "Point 5" in result - mock_llm.assert_called_once() - - # Check that the prompt includes the JSON content - call_args = mock_llm.call_args[0][0] - assert "5 bullet points" in call_args - assert json_content in call_args - - def test_summarize_all_empty_json(self): - """Test summarization with empty JSON.""" - empty_json = json.dumps({"start_url": "", "crawled_pages": []}) - - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.return_value = "No content to summarize" - mock_genai.return_value = mock_llm - - result = summarize_all(empty_json) - assert result == "No content to summarize" - - -class TestSummarizeUrl: - """Test summarize_url function.""" - - def test_summarize_url_complete_flow(self): - """Test complete URL summarization flow.""" - mock_json = json.dumps( - { - "start_url": "https://test.com", - "crawled_pages": [ - {"url": "https://test.com", "content": "Page summary"} - ], - } - ) - mock_summary = "• Summary point 1\n• Summary point 2" - - with 
patch("scitex.web._summarize_url.crawl_to_json", return_value=mock_json): - with patch( - "scitex.web._summarize_url.summarize_all", return_value=mock_summary - ): - with patch("builtins.print"): # Suppress pprint output - ground_summary, json_result = summarize_url("test.com") - - assert ground_summary == mock_summary - assert json_result == mock_json - - def test_summarize_url_error_handling(self): - """Test error handling in summarize_url.""" - with patch( - "scitex.web._summarize_url.crawl_to_json", - side_effect=Exception("Crawl error"), - ): - with pytest.raises(Exception) as exc_info: - summarize_url("test.com") - assert str(exc_info.value) == "Crawl error" - - def test_summarize_url_pprint_called(self): - """Test that pprint is called with the summary.""" - mock_json = '{"test": "data"}' - mock_summary = "Test summary" - - with patch("scitex.web._summarize_url.crawl_to_json", return_value=mock_json): - with patch( - "scitex.web._summarize_url.summarize_all", return_value=mock_summary - ): - # pprint is imported as 'from pprint import pprint' in the module - with patch("scitex.web._summarize_url.pprint") as mock_pprint: - summarize_url("test.com") - mock_pprint.assert_called_once_with(mock_summary) - - -class TestMain: - """Test main function and module alias.""" - - def test_main_is_summarize_url(self): - """Test that main is an alias for summarize_url.""" - assert main == summarize_url - - def test_main_execution(self): - """Test main function execution returns expected result structure.""" - mock_json = '{"test": "data"}' - mock_summary = "Test summary" - - # main is the same function as summarize_url, so we patch the inner calls - with patch("scitex.web._summarize_url.crawl_to_json", return_value=mock_json): - with patch( - "scitex.web._summarize_url.summarize_all", return_value=mock_summary - ): - with patch("scitex.web._summarize_url.pprint"): - result = main("http://example.com") - assert result[0] == mock_summary - assert result[1] == mock_json - - def test_script_execution(self): - """Test script execution with arguments.""" - import argparse - - with patch("sys.argv", ["script.py", "--url", "http://example.com"]): - # Import and execute the argument parsing similar to __main__ block - parser = argparse.ArgumentParser(description="") - parser.add_argument("--url", "-u", type=str, help="(default: %(default)s)") - args = parser.parse_args() - - assert args.url == "http://example.com" - - def test_readability_import_fallback(self): - """Test readability import fallback mechanism.""" - # This tests the import logic in the actual module - # The module tries to import from 'readability' first, then 'readability.readability' - import sys - - # Test when both imports fail - with patch.dict( - "sys.modules", {"readability": None, "readability.readability": None} - ): - # Re-import the module to trigger the import logic - if "scitex.web._summarize_url" in sys.modules: - del sys.modules["scitex.web._summarize_url"] - - # This should set Document to None - from scitex.web import _summarize_url # noqa: F401 - - # The Document variable should be None when imports fail - # (This is handled in the actual module's import section) - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_summarize_url.py -# 
-------------------------------------------------------------------------------- -# #!./env/bin/python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-07-29 21:43:30 (ywatanabe)" -# # ./src/scitex/web/_crawl.py -# -# -# import requests -# from bs4 import BeautifulSoup -# import urllib.parse -# from concurrent.futures import ThreadPoolExecutor, as_completed -# import json -# from tqdm import tqdm -# import scitex -# from pprint import pprint -# -# try: -# from readability import Document -# except ImportError: -# try: -# from readability.readability import Document -# except ImportError: -# Document = None -# -# import re -# -# -# # def crawl_url(url, max_depth=1): -# # print("\nCrawling...") -# # visited = set() -# # to_visit = [(url, 0)] -# # contents = {} -# -# # while to_visit: -# # current_url, depth = to_visit.pop(0) -# # if current_url in visited or depth > max_depth: -# # continue -# -# # try: -# # response = requests.get(current_url) -# # if response.status_code == 200: -# # visited.add(current_url) -# # contents[current_url] = response.text -# # soup = BeautifulSoup(response.text, "html.parser") -# -# # for link in soup.find_all("a", href=True): -# # absolute_link = urllib.parse.urljoin( -# # current_url, link["href"] -# # ) -# # if absolute_link not in visited: -# # to_visit.append((absolute_link, depth + 1)) -# -# # except requests.RequestException: -# # pass -# -# # return visited, contents -# -# -# def extract_main_content(html): -# if Document is None: -# # Fallback: just strip HTML tags -# content = re.sub("<[^<]+?>", "", html) -# content = " ".join(content.split()) -# return content[:5000] # Limit to first 5000 chars -# -# doc = Document(html) -# content = doc.summary() -# # Remove HTML tags -# content = re.sub("<[^<]+?>", "", content) -# # Remove extra whitespace -# content = " ".join(content.split()) -# return content -# -# -# def crawl_url(url, max_depth=1): -# print("\nCrawling...") -# visited = set() -# to_visit = [(url, 0)] -# contents = {} -# -# while to_visit: -# current_url, depth = to_visit.pop(0) -# if current_url in visited or depth > max_depth: -# continue -# -# try: -# response = requests.get(current_url) -# if response.status_code == 200: -# visited.add(current_url) -# main_content = extract_main_content(response.text) -# contents[current_url] = main_content -# soup = BeautifulSoup(response.text, "html.parser") -# -# for link in soup.find_all("a", href=True): -# absolute_link = urllib.parse.urljoin(current_url, link["href"]) -# if absolute_link not in visited: -# to_visit.append((absolute_link, depth + 1)) -# -# except requests.RequestException: -# pass -# -# return visited, contents -# -# -# def crawl_to_json(start_url): -# if not start_url.startswith("http"): -# start_url = "https://" + start_url -# crawled_urls, contents = crawl_url(start_url) -# -# print("\nSummalizing as json...") -# -# def process_url(url): -# llm = scitex.ai.GenAI("gpt-4o-mini") -# return { -# "url": url, -# "content": llm(f"Summarize this page in 1 line:\n\n{contents[url]}"), -# } -# -# with ThreadPoolExecutor() as executor: -# future_to_url = {executor.submit(process_url, url): url for url in crawled_urls} -# crawled_pages = [] -# for future in tqdm( -# as_completed(future_to_url), -# total=len(crawled_urls), -# desc="Processing URLs", -# ): -# crawled_pages.append(future.result()) -# -# result = {"start_url": start_url, "crawled_pages": crawled_pages} -# -# return json.dumps(result, indent=2) -# -# -# def summarize_all(json_contents): -# llm = scitex.ai.GenAI("gpt-4o-mini") -# 
-#     out = llm(f"Summarize this json file with 5 bullet points:\n\n{json_contents}")
-#     return out
-#
-#
-# def summarize_url(start_url):
-#     json_result = crawl_to_json(start_url)
-#     ground_summary = summarize_all(json_result)
-#
-#     pprint(ground_summary)
-#     return ground_summary, json_result
-#
-#
-# main = summarize_url
-#
-# if __name__ == "__main__":
-#     import argparse
-#     import scitex
-#
-#     parser = argparse.ArgumentParser(description="")
-#     parser.add_argument("--url", "-u", type=str, help="(default: %(default)s)")
-#     args = parser.parse_args()
-#     scitex.gen.print_block(args, c="yellow")
-#
-#     main(args.url)
-
-# --------------------------------------------------------------------------------
-# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_summarize_url.py
-# --------------------------------------------------------------------------------
diff --git a/tests/scitex/web/test_download_images.py b/tests/scitex/web/test_download_images.py
deleted file mode 100644
index 122a88dca..000000000
--- a/tests/scitex/web/test_download_images.py
+++ /dev/null
@@ -1,332 +0,0 @@
-# Add your tests here
-
-if __name__ == "__main__":
-    import os
-
-    import pytest
-
-    pytest.main([os.path.abspath(__file__)])
-
-# --------------------------------------------------------------------------------
-# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/download_images.py
-# --------------------------------------------------------------------------------
-# #!/usr/bin/env python3
-# # File: ./src/scitex/web/download_images.py
-#
-# """
-# Image Downloader for SciTeX.
-#
-# Downloads images from URLs with minimum size filtering.
-#
-# Usage:
-#     python -m scitex.web.download_images https://example.com
-#     python -m scitex.web.download_images https://example.com -o ./downloads
-#     python -m scitex.web.download_images https://example.com --min-size 800x600
-# """
-#
-# import os
-# import re
-# import urllib.parse
-# from concurrent.futures import ThreadPoolExecutor, as_completed
-# from datetime import datetime
-# from pathlib import Path
-# from typing import List, Optional, Tuple
-#
-# import requests
-# from bs4 import BeautifulSoup
-# from tqdm import tqdm
-#
-# try:
-#     from io import BytesIO
-#
-#     from PIL import Image
-#
-#     PILLOW_AVAILABLE = True
-# except ImportError:
-#     PILLOW_AVAILABLE = False
-#
-# from scitex.logging import getLogger
-#
-# logger = getLogger(__name__)
-#
-# # Configuration
-# DEFAULT_MIN_WIDTH = 400
-# DEFAULT_MIN_HEIGHT = 300
-# DEFAULT_TIMEOUT = 10
-# DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-#
-#
-# def _get_default_download_dir() -> str:
-#     """Get default download directory using SCITEX_DIR if available."""
-#     scitex_root = os.environ.get("SCITEX_DIR", os.path.expanduser("~/.scitex"))
-#     return os.path.join(scitex_root, "web", "downloads")
-#
-#
-# def _normalize_url_for_directory(url: str) -> str:
-#     """Convert URL to a safe directory name."""
-#     parsed = urllib.parse.urlparse(url)
-#     domain = parsed.netloc.replace("www.", "")
-#     path = parsed.path.strip("/").replace("/", "-")
-#
-#     normalized = f"{domain}-{path}" if path else domain
-#     normalized = re.sub(r"[^\w\-.]", "-", normalized)
-#     normalized = re.sub(r"-+", "-", normalized)
-#     normalized = normalized[:100].strip("-")
-#
-#     return normalized
-#
-#
-# def _is_direct_image_url(url: str) -> bool:
-#     """Check if URL appears to be a direct image link."""
-#     extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]
-#     path = urllib.parse.urlparse(url.lower()).path
-#     return any(path.endswith(ext) for ext in extensions)
-#
-#
-# def _extract_image_urls(url: str, same_domain: bool = False) -> List[str]:
-#     """Extract image URLs from a webpage."""
-#     try:
-#         logger.info(f"Fetching page: {url}")
-#         response = requests.get(
-#             url,
-#             timeout=DEFAULT_TIMEOUT,
-#             headers={"User-Agent": DEFAULT_USER_AGENT},
-#         )
-#         response.raise_for_status()
-#     except requests.RequestException as e:
-#         logger.error(f"Failed to fetch page: {e}")
-#         return []
-#
-#     soup = BeautifulSoup(response.content, "html.parser")
-#     parsed_base = urllib.parse.urlparse(url)
-#     image_urls = set()
-#
-#     for img in soup.find_all("img"):
-#         img_url = img.get("src") or img.get("data-src")
-#         if not img_url:
-#             continue
-#
-#         img_url = urllib.parse.urljoin(url, img_url)
-#
-#         if img_url.lower().endswith((".svg", ".svgz")):
-#             continue
-#
-#         if same_domain:
-#             parsed_img = urllib.parse.urlparse(img_url)
-#             if parsed_img.netloc != parsed_base.netloc:
-#                 continue
-#
-#         image_urls.add(img_url)
-#
-#     logger.info(f"Found {len(image_urls)} images on page")
-#     return list(image_urls)
-#
-#
-# def _download_single_image(
-#     img_url: str,
-#     output_dir: Path,
-#     counter: int,
-#     min_size: Optional[Tuple[int, int]],
-# ) -> Optional[str]:
-#     """Download a single image."""
-#     try:
-#         response = requests.get(
-#             img_url,
-#             timeout=DEFAULT_TIMEOUT,
-#             headers={"User-Agent": DEFAULT_USER_AGENT},
-#         )
-#         response.raise_for_status()
-#
-#         # Validate content-type
-#         content_type = response.headers.get("content-type", "")
-#         if not content_type.startswith("image/"):
-#             logger.debug(f"Skipping non-image: {content_type}")
-#             return None
-#
-#         # Check dimensions
-#         if min_size and PILLOW_AVAILABLE:
-#             try:
-#                 img = Image.open(BytesIO(response.content))
-#                 width, height = img.size
-#                 if width < min_size[0] or height < min_size[1]:
-#                     logger.debug(
-#                         f"Skipping small image: {width}x{height} "
-#                         f"(min: {min_size[0]}x{min_size[1]})"
-#                     )
-#                     return None
-#             except Exception:
-#                 pass
-#
-#         # Determine extension
-#         ext = "jpg"
-#         if PILLOW_AVAILABLE:
-#             try:
-#                 img = Image.open(BytesIO(response.content))
-#                 fmt = img.format.lower() if img.format else "jpeg"
-#                 ext = "jpg" if fmt == "jpeg" else fmt
-#             except Exception:
-#                 pass
-#         elif "png" in content_type:
-#             ext = "png"
-#         elif "gif" in content_type:
-#             ext = "gif"
-#         elif "webp" in content_type:
-#             ext = "webp"
-#
-#         filename = f"{counter:04d}.{ext}"
-#         filepath = output_dir / filename
-#
-#         with open(filepath, "wb") as f:
-#             f.write(response.content)
-#
-#         logger.info(f"Downloaded: {filename}")
-#         return str(filepath)
-#
-#     except Exception as e:
-#         logger.warning(f"Error downloading {img_url}: {e}")
-#         return None
-#
-#
-# def download_images(
-#     url: str,
-#     output_dir: Optional[str] = None,
-#     min_size: Optional[Tuple[int, int]] = None,
-#     max_workers: int = 5,
-#     same_domain: bool = False,
-# ) -> List[str]:
-#     """
-#     Download images from a URL.
-#
-#     Args:
-#         url: Webpage URL or direct image URL
-#         output_dir: Output directory (default: $SCITEX_DIR/web/downloads)
-#         min_size: Minimum (width, height) to filter small images (default: 400x300)
-#         max_workers: Concurrent download threads
-#         same_domain: Only download images from the same domain
-#
-#     Returns:
-#         List of downloaded file paths
-#
-#     Example:
-#         >>> paths = download_images("https://example.com")
-#         >>> paths = download_images("https://example.com/photo.jpg")
-#         >>> paths = download_images("https://example.com", min_size=(800, 600))
-#     """
-#     if not PILLOW_AVAILABLE:
-#         logger.warning("Pillow not available. Size filtering disabled.")
-#         min_size = None
-#     elif min_size is None:
-#         min_size = (DEFAULT_MIN_WIDTH, DEFAULT_MIN_HEIGHT)
-#
-#     # Setup output directory
-#     if output_dir is None:
-#         output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR")
-#     if output_dir is None:
-#         output_dir = _get_default_download_dir()
-#
-#     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-#     normalized = _normalize_url_for_directory(url)
-#     output_path = Path(output_dir).expanduser() / f"{timestamp}-{normalized}-images"
-#     output_path.mkdir(parents=True, exist_ok=True)
-#
-#     logger.info(f"Output directory: {output_path}")
-#
-#     # Get image URLs
-#     if _is_direct_image_url(url):
-#         image_urls = [url]
-#         logger.info("Direct image URL detected")
-#     else:
-#         image_urls = _extract_image_urls(url, same_domain=same_domain)
-#
-#     if not image_urls:
-#         logger.warning("No images found")
-#         return []
-#
-#     # Download concurrently
-#     downloaded = []
-#     counter = [1]
-#
-#     def download_with_counter(img_url: str) -> Optional[str]:
-#         idx = counter[0]
-#         counter[0] += 1
-#         return _download_single_image(img_url, output_path, idx, min_size)
-#
-#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-#         futures = {executor.submit(download_with_counter, u): u for u in image_urls}
-#
-#         for future in tqdm(
-#             as_completed(futures), total=len(image_urls), desc="Downloading"
-#         ):
-#             result = future.result()
-#             if result:
-#                 downloaded.append(result)
-#
-#     logger.info(f"Downloaded {len(downloaded)} images to {output_path}")
-#     return downloaded
-#
-#
-# def main():
-#     """CLI entry point."""
-#     import argparse
-#
-#     parser = argparse.ArgumentParser(
-#         description="Download images from URL",
-#         formatter_class=argparse.RawDescriptionHelpFormatter,
-#         epilog="""
-# Examples:
-#     python -m scitex.web.download_images https://example.com
-#     python -m scitex.web.download_images https://example.com -o ./downloads
-#     python -m scitex.web.download_images https://example.com --min-size 800x600
-#     python -m scitex.web.download_images https://example.com --no-min-size
-# """,
-#     )
-#     parser.add_argument("url", help="URL to download images from")
-#     parser.add_argument("-o", "--output", help="Output directory")
-#     parser.add_argument(
-#         "--min-size",
-#         default="400x300",
-#         help="Minimum size WIDTHxHEIGHT (default: 400x300)",
-#     )
-#     parser.add_argument(
-#         "--no-min-size",
-#         action="store_true",
-#         help="Disable size filtering",
-#     )
-#     parser.add_argument(
-#         "--same-domain",
-#         action="store_true",
-#         help="Only download from same domain",
-#     )
-#     parser.add_argument(
-#         "--workers",
-#         type=int,
-#         default=5,
-#         help="Concurrent downloads (default: 5)",
-#     )
-#
-#     args = parser.parse_args()
-#
-#     min_size = None
-#     if not args.no_min_size and args.min_size:
-#         w, h = map(int, args.min_size.split("x"))
-#         min_size = (w, h)
-#
-#     paths = download_images(
-#         args.url,
-#         output_dir=args.output,
-#         min_size=min_size,
-#         max_workers=args.workers,
-#         same_domain=args.same_domain,
-#     )
-#
-#     print(f"\nDownloaded {len(paths)} images:")
-#     for p in paths:
-#         print(f"  {p}")
-#
-#
-# if __name__ == "__main__":
-#     main()
-
-# --------------------------------------------------------------------------------
-# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/download_images.py
-# --------------------------------------------------------------------------------
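# --------------------------------------------------------------------------------
# Note: usage sketch for the removed downloader (not part of the patch)
# --------------------------------------------------------------------------------
# The download_images() pipeline above: extract <img> src/data-src URLs, skip SVGs,
# filter by min_size via Pillow when available, then fetch concurrently into a
# timestamped output directory. A minimal sketch, assuming scitex-web keeps the
# public signature shown in the deleted module (unverified against the new package):
#
# from scitex.web import download_images  # resolves to scitex_web via the shim
#
# # Keep only images of at least 800x600 from the page's own domain,
# # using 8 download threads.
# paths = download_images(
#     "https://example.com",
#     min_size=(800, 600),
#     same_domain=True,
#     max_workers=8,
# )
# for p in paths:
#     print(p)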