diff --git a/pyproject.toml b/pyproject.toml index bdfecd610..46616464e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -729,30 +729,8 @@ verify = [ # Web Module - Web utilities # Use: pip install scitex[web] -web = [ - "aiohttp", - "beautifulsoup4", - "readability-lxml", - "requests", - "Pillow", - "matplotlib", - "tqdm", - "joblib", - "scikit-learn", - "pytest-asyncio", - "ruamel.yaml", - "xarray", - "seaborn", - "scipy", - "markdown2", - "anthropic", - "openai", - "google-genai", - "groq", - # # Heavy dependencies handled by _AVAILABLE flags - # "torch", - # "umap-learn", -] +# Real implementation lives in the standalone scitex-web package. +web = ["scitex-web[readability]>=0.1.0"] # Clew Module - Hash-based verification for reproducible science (Ariadne's thread) # Use: pip install scitex[clew] diff --git a/src/scitex/web/__init__.py b/src/scitex/web/__init__.py index aa46bcc0e..01eb10cba 100755 --- a/src/scitex/web/__init__.py +++ b/src/scitex/web/__init__.py @@ -1,35 +1,20 @@ -#!/usr/bin/env python3 -"""Web-related utilities module for scitex.""" +"""SciTeX web — thin compatibility shim for scitex-web. -from ._scraping import get_image_urls, get_urls -from ._search_pubmed import ( - _fetch_details, - _get_citation, - _parse_abstract_xml, - _search_pubmed, -) -from ._search_pubmed import batch__fetch_details as _batch__fetch_details -from ._search_pubmed import fetch_async as _fetch_async -from ._search_pubmed import format_bibtex as _format_bibtex -from ._search_pubmed import get_crossref_metrics -from ._search_pubmed import parse_args as _parse_args -from ._search_pubmed import run_main as _run_main -from ._search_pubmed import save_bibtex as _save_bibtex -from ._search_pubmed import search_pubmed -from ._summarize_url import crawl_to_json, crawl_url -from ._summarize_url import extract_main_content as _extract_main_content -from ._summarize_url import summarize_all as _summarize_all -from ._summarize_url import summarize_url -from .download_images import download_images +Aliases ``scitex.web`` to the standalone ``scitex_web`` package via ``sys.modules``. +``scitex.web is scitex_web``. -__all__ = [ - # Public API - "search_pubmed", - "get_crossref_metrics", - "summarize_url", - "crawl_url", - "crawl_to_json", - "get_urls", - "download_images", - "get_image_urls", -] +Install: ``pip install scitex[web]`` (or ``pip install scitex-web``). +See: https://github.com/ywatanabe1989/scitex-web +""" + +import sys as _sys + +try: + import scitex_web as _real +except ImportError as _e: # pragma: no cover + raise ImportError( + "scitex.web requires the 'scitex-web' package. " + "Install with: pip install scitex[web] (or: pip install scitex-web)" + ) from _e + +_sys.modules[__name__] = _real diff --git a/src/scitex/web/_scraping.py b/src/scitex/web/_scraping.py deleted file mode 100755 index d97dc668a..000000000 --- a/src/scitex/web/_scraping.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python3 -# File: ./src/scitex/web/_scraping.py - -"""Web scraping utilities for extracting URLs. - -``bs4`` is an optional third-party dependency (only needed when actually -scraping). Do **not** import it at module load -- doing so leaks the -``ModuleNotFoundError`` through ``scitex.web.__init__`` and through -``scitex.cli.web``, which in turn breaks ``scitex --json`` and -``scitex --help-recursive`` on any install without ``beautifulsoup4``. -See ywatanabe1989/todo#279. The import now lives inside each scraping -function, so merely importing this module is side-effect-free. -""" - -import re -import urllib.parse -from typing import List, Optional, Set - -import requests - -from scitex.logging import getLogger - -logger = getLogger(__name__) - -DEFAULT_TIMEOUT = 10 -DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" - - -def get_urls( - url: str, - pattern: Optional[str] = None, - absolute: bool = True, - same_domain: bool = False, - include_external: bool = True, -) -> List[str]: - """ - Extract all URLs from a webpage. - - Args: - url: The URL of the webpage to scrape - pattern: Optional regex pattern to filter URLs (e.g., r'\\.pdf$' for PDF files) - absolute: If True, convert relative URLs to absolute URLs - same_domain: If True, only return URLs from the same domain - include_external: If True, include external links (only applies if same_domain=False) - - Returns: - List of URLs found on the page - - Example: - >>> urls = get_urls('https://example.com', pattern=r'\\.pdf$') - >>> urls = get_urls('https://example.com', same_domain=True) - """ - from bs4 import BeautifulSoup # lazy: see module docstring, todo#279 - - try: - logger.info(f"Fetching URLs from: {url}") - response = requests.get( - url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - except requests.RequestException as e: - logger.error(f"Failed to fetch URL {url}: {e}") - return [] - - soup = BeautifulSoup(response.text, "html.parser") - urls_found: Set[str] = set() - - parsed_base = urllib.parse.urlparse(url) - - for link in soup.find_all("a", href=True): - href = link["href"] - - if absolute: - href = urllib.parse.urljoin(url, href) - - if same_domain: - parsed_href = urllib.parse.urlparse(href) - if parsed_href.netloc != parsed_base.netloc: - continue - elif not include_external: - parsed_href = urllib.parse.urlparse(href) - if parsed_href.netloc and parsed_href.netloc != parsed_base.netloc: - continue - - if pattern and not re.search(pattern, href): - continue - - urls_found.add(href) - - result = sorted(list(urls_found)) - logger.info(f"Found {len(result)} URLs") - return result - - -def get_image_urls( - url: str, - pattern: Optional[str] = None, - same_domain: bool = False, -) -> List[str]: - """ - Extract all image URLs from a webpage without downloading them. - - Args: - url: The URL of the webpage to scrape - pattern: Optional regex pattern to filter image URLs - same_domain: If True, only return images from the same domain - - Returns: - List of image URLs found on the page - - Note: - - SVG files are automatically skipped (vector graphics) - - Checks both 'src' and 'data-src' attributes for lazy-loaded images - - Example: - >>> img_urls = get_image_urls('https://example.com') - >>> img_urls = get_image_urls('https://example.com', pattern=r'\\.png$') - """ - from bs4 import BeautifulSoup # lazy: see module docstring, todo#279 - - try: - logger.info(f"Fetching image URLs from: {url}") - response = requests.get( - url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - except requests.RequestException as e: - logger.error(f"Failed to fetch URL {url}: {e}") - return [] - - soup = BeautifulSoup(response.text, "html.parser") - image_urls: Set[str] = set() - - parsed_base = urllib.parse.urlparse(url) - - for img in soup.find_all("img"): - img_url = img.get("src") or img.get("data-src") - if not img_url: - continue - - img_url = urllib.parse.urljoin(url, img_url) - - if img_url.lower().endswith((".svg", ".svgz")): - continue - - if same_domain: - parsed_img = urllib.parse.urlparse(img_url) - if parsed_img.netloc != parsed_base.netloc: - continue - - if pattern and not re.search(pattern, img_url): - continue - - image_urls.add(img_url) - - result = sorted(list(image_urls)) - logger.info(f"Found {len(result)} image URLs") - return result diff --git a/src/scitex/web/_search_pubmed.py b/src/scitex/web/_search_pubmed.py deleted file mode 100755 index f41aa1fbd..000000000 --- a/src/scitex/web/_search_pubmed.py +++ /dev/null @@ -1,505 +0,0 @@ -#!/usr/bin/env python3 -# Time-stamp: "2024-11-13 14:30:43 (ywatanabe)" -# File: ./scitex_repo/src/scitex/web/_search_pubmed.py - -""" -1. Functionality: - - Searches PubMed database for scientific articles - - Retrieves detailed information about matched articles - - Displays article metadata including title, authors, journal, year, and abstract -2. Input: - - Search query string (e.g., "epilepsy prediction") - - Optional parameters for batch size and result limit -3. Output: - - Formatted article information displayed to stdout - - BibTeX file with official citations -4. Prerequisites: - - Internet connection - - requests package - - scitex package -""" - -"""Imports""" -import argparse -import asyncio -import xml.etree.ElementTree as ET -from typing import Any, Dict, List, Optional, Union - -import aiohttp -import requests - -import scitex - -"""Functions & Classes""" - - -def _search_pubmed(query: str, retmax: int = 300) -> Dict[str, Any]: - try: - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - search_url = f"{base_url}esearch.fcgi" - params = { - "db": "pubmed", - "term": query, - "retmax": retmax, - "retmode": "json", - "usehistory": "y", - } - - response = requests.get(search_url, params=params, timeout=10) - if not response.ok: - scitex.str.printc("PubMed API request failed", c="red") - return {} - return response.json() - except requests.exceptions.RequestException as e: - scitex.str.printc(f"Network error: {e}", c="red") - return {} - - -def _fetch_details( - webenv: str, query_key: str, retstart: int = 0, retmax: int = 100 -) -> Dict[str, Any]: - """Fetches detailed information including abstracts for articles. - - Parameters - ---------- - [Previous parameters remain the same] - - Returns - ------- - Dict[str, Any] - Dictionary containing article details and abstracts - """ - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - - # Fetch abstracts - efetch_url = f"{base_url}efetch.fcgi" - efetch_params = { - "db": "pubmed", - "query_key": query_key, - "WebEnv": webenv, - "retstart": retstart, - "retmax": retmax, - "retmode": "xml", - "rettype": "abstract", - "field": "abstract,mesh", - } - - abstract_response = requests.get(efetch_url, params=efetch_params) - - # Fetch metadata - fetch_url = f"{base_url}esummary.fcgi" - params = { - "db": "pubmed", - "query_key": query_key, - "WebEnv": webenv, - "retstart": retstart, - "retmax": retmax, - "retmode": "json", - } - - details_response = requests.get(fetch_url, params=params) - - if not all([abstract_response.ok, details_response.ok]): - # print(f"Error fetching data") - return {} - - return { - "abstracts": abstract_response.text, - "details": details_response.json(), - } - - -def _parse_abstract_xml(xml_text: str) -> Dict[str, tuple]: - """Parses XML response to extract abstracts. - - Parameters - ---------- - xml_text : str - XML response from PubMed - - Returns - ------- - Dict[str, str] - Dictionary mapping PMIDs to abstracts - """ - root = ET.fromstring(xml_text) - results = {} - - for article in root.findall(".//PubmedArticle"): - pmid = article.find(".//PMID").text - abstract_element = article.find(".//Abstract/AbstractText") - abstract = abstract_element.text if abstract_element is not None else "" - - # DOI - doi_element = article.find(".//ArticleId[@IdType='doi']") - doi = doi_element.text if doi_element is not None else "" - - # Get MeSH terms - keywords = [] - mesh_terms = article.findall(".//MeshHeading/DescriptorName") - keywords = [term.text for term in mesh_terms if term is not None] - - results[pmid] = (abstract, keywords, doi) - - return results - - -def _get_citation(pmid: str) -> str: - """Gets official citation in BibTeX format. - - Parameters - ---------- - pmid : str - PubMed ID - - Returns - ------- - str - Official BibTeX citation - """ - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - cite_url = f"{base_url}efetch.fcgi" - params = { - "db": "pubmed", - "id": pmid, - "rettype": "bibtex", - "retmode": "text", - } - response = requests.get(cite_url, params=params) - return response.text if response.ok else "" - - -def get_crossref_metrics( - doi: str, api_key: Optional[str] = None, email: Optional[str] = None -) -> Dict[str, Any]: - """Get article metrics from CrossRef using DOI.""" - import os - - base_url = "https://api.crossref.org/works/" - - # Use provided email or fallback to environment variables - if not email: - email = ( - os.getenv("SCITEX_SCHOLAR_CROSSREF_EMAIL") - or os.getenv("SCITEX_CROSSREF_EMAIL") - or os.getenv("SCITEX_SCHOLAR_PUBMED_EMAIL") - or os.getenv("SCITEX_PUBMED_EMAIL", "research@example.com") - ) - headers = {"User-Agent": f"SciTeX/1.0 (mailto:{email})"} - - # Add API key as query parameter if provided - params = {} - if api_key: - params["key"] = api_key - - try: - response = requests.get( - f"{base_url}{doi}", headers=headers, params=params, timeout=10 - ) - if response.ok: - data = response.json()["message"] - return { - "citations": data.get("is-referenced-by-count", 0), - "type": data.get("type", ""), - "publisher": data.get("publisher", ""), - "references": len(data.get("reference", [])), - "doi": data.get("DOI", ""), - } - except Exception as e: - print(f"CrossRef API error for DOI {doi}: {e}") - return {} - - -async def get_crossref_metrics_async( - doi: str, api_key: Optional[str] = None, email: Optional[str] = None -) -> Dict[str, Any]: - """Get article metrics from CrossRef using DOI (async version).""" - import os - - base_url = "https://api.crossref.org/works/" - - # Use provided email or fallback to environment variables - if not email: - email = ( - os.getenv("SCITEX_SCHOLAR_CROSSREF_EMAIL") - or os.getenv("SCITEX_CROSSREF_EMAIL") - or os.getenv("SCITEX_SCHOLAR_PUBMED_EMAIL") - or os.getenv("SCITEX_PUBMED_EMAIL", "research@example.com") - ) - headers = {"User-Agent": f"SciTeX/1.0 (mailto:{email})"} - - # Add API key as query parameter if provided - params = {} - if api_key: - params["key"] = api_key - - try: - async with aiohttp.ClientSession() as session: - async with session.get( - f"{base_url}{doi}", headers=headers, params=params, timeout=10 - ) as response: - if response.ok: - data = await response.json() - message = data["message"] - return { - "citations": message.get("is-referenced-by-count", 0), - "type": message.get("type", ""), - "publisher": message.get("publisher", ""), - "references": len(message.get("reference", [])), - "doi": message.get("DOI", ""), - } - except Exception as e: - print(f"CrossRef API error for DOI {doi}: {e}") - return {} - - -def save_bibtex( - papers: Dict[str, Any], abstracts: Dict[str, str], output_file: str -) -> None: - """Saves paper metadata as BibTeX file with abstracts. - - Parameters - ---------- - papers : Dict[str, Any] - Dictionary of paper metadata - abstracts : Dict[str, str] - Dictionary of PMIDs to abstracts - output_file : str - Output file path - """ - with open(output_file, "w", encoding="utf-8") as bibtex_file: - for pmid, paper in papers.items(): - if pmid == "uids": - continue - - citation = _get_citation(pmid) - if citation: - bibtex_file.write(citation) - else: - # Use default tuple if pmid not in abstracts - default_data = ("", [], "") # abstract, keywords, doi - bibtex_entry = format_bibtex( - paper, pmid, abstracts.get(pmid, default_data) - ) - bibtex_file.write(bibtex_entry + "\n") - scitex.str.printc(f"Saved to: {str(bibtex_file)}", c="yellow") - - -def format_bibtex(paper: Dict[str, Any], pmid: str, abstract_data: tuple) -> str: - abstract, keywords, doi = abstract_data - - # Get CrossRef and Scimago metrics - crossref_metrics = get_crossref_metrics(doi) if doi else {} - journal = paper.get("source", "Unknown Journal") - # journal_metrics = get_journal_metrics(journal) - - authors = paper.get("authors", [{"name": "Unknown"}]) - author_names = " and ".join(author["name"] for author in authors) - pubdate = paper.get("pubdate", "") - year = pubdate.split()[0] if pubdate.strip() else "" - title = paper.get("title", "No Title") - - # Name formatting - first_author = authors[0]["name"] - first_name = first_author.split()[0] - last_name = first_author.split()[-1] - clean_first_name = "".join(c for c in first_name if c.isalnum()) - clean_last_name = "".join(c for c in last_name if c.isalnum()) - - # Title words - title_words = title.split() - first_title_word = "".join(c.lower() for c in title_words[0] if c.isalnum()) - second_title_word = ( - "".join(c.lower() for c in title_words[1] if c.isalnum()) - if len(title_words) > 1 - else "" - ) - - citation_key = f"{clean_first_name}.{clean_last_name}_{year}_{first_title_word}_{second_title_word}" - - entry = f"""@article{{{citation_key}, - author = {{{author_names}}}, - title = {{{title}}}, - journal = {{{journal}}}, - year = {{{year}}}, - pmid = {{{pmid}}}, - doi = {{{doi}}}, - publisher = {{{crossref_metrics.get("publisher", "")}}}, - references = {{{crossref_metrics.get("references", 0)}}}, - keywords = {{{", ".join(keywords)}}}, - abstract = {{{abstract}}} -}} -""" - return entry - - -async def fetch_async( - session: aiohttp.ClientSession, url: str, params: Dict -) -> Union[Dict, str]: - """Asynchronous fetch helper.""" - async with session.get(url, params=params) as response: - if response.status == 200: - if params.get("retmode") == "xml": - return await response.text() - elif params.get("retmode") == "json": - return await response.json() - return await response.text() - return {} - - -async def batch__fetch_details(pmids: List[str], batch_size: int = 20) -> List[Dict]: - """Fetches details for multiple PMIDs concurrently. - - Parameters - ---------- - pmids : List[str] - List of PubMed IDs - batch_size : int, optional - Size of each batch for concurrent requests - - Returns - ------- - List[Dict] - List of response data - """ - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - - async with aiohttp.ClientSession() as session: - tasks = [] - for i in range(0, len(pmids), batch_size): - batch_pmids = pmids[i : i + batch_size] - - # Fetch both details and citations concurrently - efetch_params = { - "db": "pubmed", - "id": ",".join(batch_pmids), - "retmode": "xml", - "rettype": "abstract", - } - - esummary_params = { - "db": "pubmed", - "id": ",".join(batch_pmids), - "retmode": "json", - } - - tasks.append(fetch_async(session, f"{base_url}efetch.fcgi", efetch_params)) - tasks.append( - fetch_async(session, f"{base_url}esummary.fcgi", esummary_params) - ) - - results = await asyncio.gather(*tasks) - return results - - -def search_pubmed(query: str, n_entries: int = 10) -> int: - # query = args.query or "epilepsy prediction" - # print(f"Using query: {query}") - - search_results = _search_pubmed(query) - if not search_results: - # print("No results found or error occurred") - return 1 - - pmids = search_results["esearchresult"]["idlist"] - count = len(pmids) - # print(f"Found {count:,} results") - - output_file = f"pubmed_{query.replace(' ', '_')}.bib" - # print(f"Saving results to: {output_file}") - - # Process in larger batches asynchronously - results = asyncio.run(batch__fetch_details(pmids[:n_entries])) - # here, results seems long string - - # Process results and save - with open(output_file, "w", encoding="utf-8") as f: - for i in range(0, len(results), 2): - xml_response = results[i] - json_response = results[i + 1] - - if isinstance(xml_response, str): - abstracts = _parse_abstract_xml(xml_response) - if isinstance(json_response, dict) and "result" in json_response: - details = json_response["result"] - save_bibtex(details, abstracts, output_file) - - # Process results and save - temp_bibtex = [] - for i in range(0, len(results), 2): - xml_response = results[i] - json_response = results[i + 1] - - if isinstance(xml_response, str): - abstracts = _parse_abstract_xml(xml_response) - if isinstance(json_response, dict) and "result" in json_response: - details = json_response["result"] - for pmid in details: - if pmid != "uids": - citation = _get_citation(pmid) - if citation: - temp_bibtex.append(citation) - else: - entry = format_bibtex( - details[pmid], pmid, abstracts.get(pmid, "") - ) - temp_bibtex.append(entry) - - # Write all entries at once - with open(output_file, "w", encoding="utf-8") as f: - f.write("\n".join(temp_bibtex)) - - return 0 - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="PubMed article search and retrieval tool" - ) - parser.add_argument( - "--query", - "-q", - type=str, - help='Search query (default: "epilepsy prediction")', - ) - parser.add_argument( - "--n_entries", - "-n", - type=int, - default=10, - help='Search query (default: "epilepsy prediction")', - ) - args = parser.parse_args() - scitex.str.printc(args, c="yellow") - return args - - -def run_main() -> None: - global CONFIG - import sys - - import matplotlib.pyplot as plt - - import scitex - - CONFIG, sys.stdout, sys.stderr, plt, CC = scitex.session.start( - sys, - verbose=False, - ) - - args = parse_args() - exit_status = search_pubmed(args.query, args.n_entries) - - scitex.session.close( - CONFIG, - verbose=False, - notify=False, - message="", - exit_status=exit_status, - ) - - -if __name__ == "__main__": - run_main() - -# EOF diff --git a/src/scitex/web/_skills/SKILL.md b/src/scitex/web/_skills/SKILL.md deleted file mode 100644 index 7152595b7..000000000 --- a/src/scitex/web/_skills/SKILL.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -name: stx.web -description: Web utilities for PubMed search, URL scraping, content summarization, and image downloading. ---- - -# stx.web - -The `stx.web` module provides web utilities for scientific use cases: searching PubMed for papers, scraping URLs for content and images, summarizing web pages, and downloading images in bulk. - -## Python API - -```python -import scitex as stx - -# Search PubMed -papers = stx.web.search_pubmed("EEG deep learning classification", max_results=20) -metrics = stx.web.get_crossref_metrics(doi="10.1000/xyz123") - -# Summarize a URL -summary = stx.web.summarize_url("https://arxiv.org/abs/2401.00000") - -# Crawl URL for structured content -content = stx.web.crawl_url("https://example.com") -json_data = stx.web.crawl_to_json("https://example.com") - -# Scrape URLs and images from a page -urls = stx.web.get_urls("https://example.com") -image_urls = stx.web.get_image_urls("https://example.com") - -# Download images -stx.web.download_images( - urls=image_urls, - output_dir="./downloaded_images", - max_workers=5 -) -``` - -## Key Features - -- `search_pubmed(query, max_results)` — search PubMed and return structured paper data -- `get_crossref_metrics(doi)` — fetch citation counts and impact metrics from CrossRef -- `summarize_url(url)` — extract and summarize main content from a URL -- `crawl_url` / `crawl_to_json` — structured web crawling -- `get_urls` / `get_image_urls` — scrape links and images from pages -- `download_images(urls, output_dir)` — bulk image download with concurrency diff --git a/src/scitex/web/_skills/images.md b/src/scitex/web/_skills/images.md deleted file mode 100644 index ad994fb47..000000000 --- a/src/scitex/web/_skills/images.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -description: Bulk-download images from a web page with download_images() and collect all image URLs with get_image_urls(). ---- - -# Image Downloading - -## download_images - -Download all images found on a web page to a local directory. - -```python -download_images(url: str, output_dir: str = ".", extensions: list[str] | None = None) -> list[str] -``` - -Returns a list of local file paths for successfully downloaded images. - -```python -import scitex as stx - -saved = stx.web.download_images( - "https://example.com/gallery", - output_dir="./downloaded_images", - extensions=[".png", ".jpg"], -) -print(f"Downloaded {len(saved)} images") -``` - ---- - -## get_image_urls - -Collect all image URLs from a web page without downloading them. - -```python -get_image_urls(url: str) -> list[str] -``` - -```python -import scitex as stx - -img_urls = stx.web.get_image_urls("https://example.com/gallery") -print(img_urls[:3]) -``` diff --git a/src/scitex/web/_skills/pubmed.md b/src/scitex/web/_skills/pubmed.md deleted file mode 100644 index e61a1e8b5..000000000 --- a/src/scitex/web/_skills/pubmed.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -description: Search PubMed for papers matching a query with search_pubmed() and retrieve Crossref citation counts with get_crossref_metrics(). ---- - -# PubMed Search - -## search_pubmed - -Query PubMed and return structured results including abstracts, authors, and DOIs. - -```python -search_pubmed( - query: str, - max_results: int = 20, - email: str | None = None, -) -> list[dict] -``` - -```python -import scitex as stx - -papers = stx.web.search_pubmed("EEG epilepsy deep learning", max_results=10) -for p in papers: - print(p["title"], p.get("doi")) -``` - -Each result dict contains: `pmid`, `title`, `abstract`, `authors`, `journal`, `year`, `doi`. - ---- - -## get_crossref_metrics - -Retrieve citation count and journal impact factor for a DOI via the Crossref API. - -```python -get_crossref_metrics(doi: str) -> dict -``` - -```python -import scitex as stx - -metrics = stx.web.get_crossref_metrics("10.1038/s41586-021-03819-2") -print(metrics) -# {'cited_by': 523, 'journal': 'Nature', 'type': 'journal-article'} -``` diff --git a/src/scitex/web/_skills/url.md b/src/scitex/web/_skills/url.md deleted file mode 100644 index a8ddf881a..000000000 --- a/src/scitex/web/_skills/url.md +++ /dev/null @@ -1,71 +0,0 @@ ---- -description: Extract and summarize web page content with summarize_url(), crawl pages with crawl_url() and crawl_to_json(), and collect all hyperlinks with get_urls(). ---- - -# URL Utilities - -## summarize_url - -Fetch a URL and return a concise text summary of the main content. - -```python -summarize_url(url: str, max_length: int = 500) -> str -``` - -```python -import scitex as stx - -summary = stx.web.summarize_url("https://arxiv.org/abs/2301.12345") -print(summary) -``` - ---- - -## crawl_url - -Fetch the full main text content of a page. - -```python -crawl_url(url: str) -> str -``` - -```python -import scitex as stx - -content = stx.web.crawl_url("https://example.com/article") -print(content[:500]) -``` - ---- - -## crawl_to_json - -Fetch a page and return structured content as a dict. - -```python -crawl_to_json(url: str) -> dict -``` - -```python -import scitex as stx - -data = stx.web.crawl_to_json("https://example.com/article") -# Returns: {'title': ..., 'content': ..., 'links': [...], 'url': ...} -``` - ---- - -## get_urls - -Extract all hyperlinks from a web page. - -```python -get_urls(url: str) -> list[str] -``` - -```python -import scitex as stx - -links = stx.web.get_urls("https://example.com") -print(links[:5]) -``` diff --git a/src/scitex/web/_summarize_url.py b/src/scitex/web/_summarize_url.py deleted file mode 100755 index 5f191d95d..000000000 --- a/src/scitex/web/_summarize_url.py +++ /dev/null @@ -1,160 +0,0 @@ -#!./env/bin/python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-07-29 21:43:30 (ywatanabe)" -# ./src/scitex/web/_crawl.py - - -import json -import urllib.parse -from concurrent.futures import ThreadPoolExecutor, as_completed -from pprint import pprint - -import requests -from bs4 import BeautifulSoup -from tqdm import tqdm - -import scitex - -try: - from readability import Document -except ImportError: - try: - from readability.readability import Document - except ImportError: - Document = None - -import re - -# def crawl_url(url, max_depth=1): -# print("\nCrawling...") -# visited = set() -# to_visit = [(url, 0)] -# contents = {} - -# while to_visit: -# current_url, depth = to_visit.pop(0) -# if current_url in visited or depth > max_depth: -# continue - -# try: -# response = requests.get(current_url) -# if response.status_code == 200: -# visited.add(current_url) -# contents[current_url] = response.text -# soup = BeautifulSoup(response.text, "html.parser") - -# for link in soup.find_all("a", href=True): -# absolute_link = urllib.parse.urljoin( -# current_url, link["href"] -# ) -# if absolute_link not in visited: -# to_visit.append((absolute_link, depth + 1)) - -# except requests.RequestException: -# pass - -# return visited, contents - - -def extract_main_content(html): - if Document is None: - # Fallback: just strip HTML tags - content = re.sub("<[^<]+?>", "", html) - content = " ".join(content.split()) - return content[:5000] # Limit to first 5000 chars - - doc = Document(html) - content = doc.summary() - # Remove HTML tags - content = re.sub("<[^<]+?>", "", content) - # Remove extra whitespace - content = " ".join(content.split()) - return content - - -def crawl_url(url, max_depth=1): - print("\nCrawling...") - visited = set() - to_visit = [(url, 0)] - contents = {} - - while to_visit: - current_url, depth = to_visit.pop(0) - if current_url in visited or depth > max_depth: - continue - - try: - response = requests.get(current_url) - if response.status_code == 200: - visited.add(current_url) - main_content = extract_main_content(response.text) - contents[current_url] = main_content - soup = BeautifulSoup(response.text, "html.parser") - - for link in soup.find_all("a", href=True): - absolute_link = urllib.parse.urljoin(current_url, link["href"]) - if absolute_link not in visited: - to_visit.append((absolute_link, depth + 1)) - - except requests.RequestException: - pass - - return visited, contents - - -def crawl_to_json(start_url): - if not start_url.startswith("http"): - start_url = "https://" + start_url - crawled_urls, contents = crawl_url(start_url) - - print("\nSummalizing as json...") - - def process_url(url): - llm = scitex.ai.GenAI("gpt-4o-mini") - return { - "url": url, - "content": llm(f"Summarize this page in 1 line:\n\n{contents[url]}"), - } - - with ThreadPoolExecutor() as executor: - future_to_url = {executor.submit(process_url, url): url for url in crawled_urls} - crawled_pages = [] - for future in tqdm( - as_completed(future_to_url), - total=len(crawled_urls), - desc="Processing URLs", - ): - crawled_pages.append(future.result()) - - result = {"start_url": start_url, "crawled_pages": crawled_pages} - - return json.dumps(result, indent=2) - - -def summarize_all(json_contents): - llm = scitex.ai.GenAI("gpt-4o-mini") - out = llm(f"Summarize this json file with 5 bullet points:\n\n{json_contents}") - return out - - -def summarize_url(start_url): - json_result = crawl_to_json(start_url) - ground_summary = summarize_all(json_result) - - pprint(ground_summary) - return ground_summary, json_result - - -main = summarize_url - -if __name__ == "__main__": - import argparse - - import scitex - - parser = argparse.ArgumentParser(description="") - parser.add_argument("--url", "-u", type=str, help="(default: %(default)s)") - args = parser.parse_args() - scitex.gen.print_block(args, c="yellow") - - main(args.url) diff --git a/src/scitex/web/download_images.py b/src/scitex/web/download_images.py deleted file mode 100755 index b891eda90..000000000 --- a/src/scitex/web/download_images.py +++ /dev/null @@ -1,323 +0,0 @@ -#!/usr/bin/env python3 -# File: ./src/scitex/web/download_images.py - -""" -Image Downloader for SciTeX. - -Downloads images from URLs with minimum size filtering. - -Usage: - python -m scitex.web.download_images https://example.com - python -m scitex.web.download_images https://example.com -o ./downloads - python -m scitex.web.download_images https://example.com --min-size 800x600 -""" - -import os -import re -import urllib.parse -from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime -from pathlib import Path -from typing import List, Optional, Tuple - -import requests -from tqdm import tqdm - -# NOTE: ``bs4`` is imported lazily inside functions that actually use it. -# Importing at module load leaks ``ModuleNotFoundError`` through -# ``scitex.web.__init__`` and breaks ``scitex --json`` / -# ``scitex --help-recursive`` on installs without beautifulsoup4. -# See ywatanabe1989/todo#279. - -try: - from io import BytesIO - - from PIL import Image - - PILLOW_AVAILABLE = True -except ImportError: - PILLOW_AVAILABLE = False - -from scitex.logging import getLogger - -logger = getLogger(__name__) - -# Configuration -DEFAULT_MIN_WIDTH = 400 -DEFAULT_MIN_HEIGHT = 300 -DEFAULT_TIMEOUT = 10 -DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" - - -def _get_default_download_dir() -> str: - """Get default download directory using SCITEX_DIR if available.""" - scitex_root = os.environ.get("SCITEX_DIR", os.path.expanduser("~/.scitex")) - return os.path.join(scitex_root, "web", "downloads") - - -def _normalize_url_for_directory(url: str) -> str: - """Convert URL to a safe directory name.""" - parsed = urllib.parse.urlparse(url) - domain = parsed.netloc.replace("www.", "") - path = parsed.path.strip("/").replace("/", "-") - - normalized = f"{domain}-{path}" if path else domain - normalized = re.sub(r"[^\w\-.]", "-", normalized) - normalized = re.sub(r"-+", "-", normalized) - normalized = normalized[:100].strip("-") - - return normalized - - -def _is_direct_image_url(url: str) -> bool: - """Check if URL appears to be a direct image link.""" - extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"] - path = urllib.parse.urlparse(url.lower()).path - return any(path.endswith(ext) for ext in extensions) - - -def _extract_image_urls(url: str, same_domain: bool = False) -> List[str]: - """Extract image URLs from a webpage.""" - from bs4 import BeautifulSoup # lazy: see module note, todo#279 - - try: - logger.info(f"Fetching page: {url}") - response = requests.get( - url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - except requests.RequestException as e: - logger.error(f"Failed to fetch page: {e}") - return [] - - soup = BeautifulSoup(response.content, "html.parser") - parsed_base = urllib.parse.urlparse(url) - image_urls = set() - - for img in soup.find_all("img"): - img_url = img.get("src") or img.get("data-src") - if not img_url: - continue - - img_url = urllib.parse.urljoin(url, img_url) - - if img_url.lower().endswith((".svg", ".svgz")): - continue - - if same_domain: - parsed_img = urllib.parse.urlparse(img_url) - if parsed_img.netloc != parsed_base.netloc: - continue - - image_urls.add(img_url) - - logger.info(f"Found {len(image_urls)} images on page") - return list(image_urls) - - -def _download_single_image( - img_url: str, - output_dir: Path, - counter: int, - min_size: Optional[Tuple[int, int]], -) -> Optional[str]: - """Download a single image.""" - try: - response = requests.get( - img_url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - - # Validate content-type - content_type = response.headers.get("content-type", "") - if not content_type.startswith("image/"): - logger.debug(f"Skipping non-image: {content_type}") - return None - - # Check dimensions - if min_size and PILLOW_AVAILABLE: - try: - img = Image.open(BytesIO(response.content)) - width, height = img.size - if width < min_size[0] or height < min_size[1]: - logger.debug( - f"Skipping small image: {width}x{height} " - f"(min: {min_size[0]}x{min_size[1]})" - ) - return None - except Exception: - pass - - # Determine extension - ext = "jpg" - if PILLOW_AVAILABLE: - try: - img = Image.open(BytesIO(response.content)) - fmt = img.format.lower() if img.format else "jpeg" - ext = "jpg" if fmt == "jpeg" else fmt - except Exception: - pass - elif "png" in content_type: - ext = "png" - elif "gif" in content_type: - ext = "gif" - elif "webp" in content_type: - ext = "webp" - - filename = f"{counter:04d}.{ext}" - filepath = output_dir / filename - - with open(filepath, "wb") as f: - f.write(response.content) - - logger.info(f"Downloaded: {filename}") - return str(filepath) - - except Exception as e: - logger.warning(f"Error downloading {img_url}: {e}") - return None - - -def download_images( - url: str, - output_dir: Optional[str] = None, - min_size: Optional[Tuple[int, int]] = None, - max_workers: int = 5, - same_domain: bool = False, -) -> List[str]: - """ - Download images from a URL. - - Args: - url: Webpage URL or direct image URL - output_dir: Output directory (default: $SCITEX_DIR/web/downloads) - min_size: Minimum (width, height) to filter small images (default: 400x300) - max_workers: Concurrent download threads - same_domain: Only download images from the same domain - - Returns: - List of downloaded file paths - - Example: - >>> paths = download_images("https://example.com") - >>> paths = download_images("https://example.com/photo.jpg") - >>> paths = download_images("https://example.com", min_size=(800, 600)) - """ - if not PILLOW_AVAILABLE: - logger.warning("Pillow not available. Size filtering disabled.") - min_size = None - elif min_size is None: - min_size = (DEFAULT_MIN_WIDTH, DEFAULT_MIN_HEIGHT) - - # Setup output directory - if output_dir is None: - output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR") - if output_dir is None: - output_dir = _get_default_download_dir() - - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - normalized = _normalize_url_for_directory(url) - output_path = Path(output_dir).expanduser() / f"{timestamp}-{normalized}-images" - output_path.mkdir(parents=True, exist_ok=True) - - logger.info(f"Output directory: {output_path}") - - # Get image URLs - if _is_direct_image_url(url): - image_urls = [url] - logger.info("Direct image URL detected") - else: - image_urls = _extract_image_urls(url, same_domain=same_domain) - - if not image_urls: - logger.warning("No images found") - return [] - - # Download concurrently - downloaded = [] - counter = [1] - - def download_with_counter(img_url: str) -> Optional[str]: - idx = counter[0] - counter[0] += 1 - return _download_single_image(img_url, output_path, idx, min_size) - - with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = {executor.submit(download_with_counter, u): u for u in image_urls} - - for future in tqdm( - as_completed(futures), total=len(image_urls), desc="Downloading" - ): - result = future.result() - if result: - downloaded.append(result) - - logger.info(f"Downloaded {len(downloaded)} images to {output_path}") - return downloaded - - -def main(): - """CLI entry point.""" - import argparse - - parser = argparse.ArgumentParser( - description="Download images from URL", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - python -m scitex.web.download_images https://example.com - python -m scitex.web.download_images https://example.com -o ./downloads - python -m scitex.web.download_images https://example.com --min-size 800x600 - python -m scitex.web.download_images https://example.com --no-min-size - """, - ) - parser.add_argument("url", help="URL to download images from") - parser.add_argument("-o", "--output", help="Output directory") - parser.add_argument( - "--min-size", - default="400x300", - help="Minimum size WIDTHxHEIGHT (default: 400x300)", - ) - parser.add_argument( - "--no-min-size", - action="store_true", - help="Disable size filtering", - ) - parser.add_argument( - "--same-domain", - action="store_true", - help="Only download from same domain", - ) - parser.add_argument( - "--workers", - type=int, - default=5, - help="Concurrent downloads (default: 5)", - ) - - args = parser.parse_args() - - min_size = None - if not args.no_min_size and args.min_size: - w, h = map(int, args.min_size.split("x")) - min_size = (w, h) - - paths = download_images( - args.url, - output_dir=args.output, - min_size=min_size, - max_workers=args.workers, - same_domain=args.same_domain, - ) - - print(f"\nDownloaded {len(paths)} images:") - for p in paths: - print(f" {p}") - - -if __name__ == "__main__": - main() diff --git a/tests/scitex/web/test__scraping.py b/tests/scitex/web/test__scraping.py deleted file mode 100644 index 0534300db..000000000 --- a/tests/scitex/web/test__scraping.py +++ /dev/null @@ -1,712 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# File: ./tests/scitex/web/test__scraping.py - -""" -Tests for web scraping utilities. -""" - -import re -import shutil -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, Mock, mock_open, patch - -import pytest - - -class TestGetUrls: - """Test get_urls function.""" - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_basic(self, mock_get): - """Test basic URL extraction.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - Link 1 - Link 2 - Link 3 - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com") - - assert len(urls) == 3 - assert "https://example.com/page1" in urls - assert "https://example.com/page2" in urls - assert "https://example.com/page3" in urls - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_with_pattern(self, mock_get): - """Test URL extraction with pattern filtering.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - PDF - HTML - Another PDF - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com", pattern=r"\.pdf$") - - assert len(urls) == 2 - assert all(url.endswith(".pdf") for url in urls) - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_same_domain(self, mock_get): - """Test URL extraction with same domain filter.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - Internal - External - Relative - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com", same_domain=True) - - assert len(urls) == 2 - assert all("example.com" in url for url in urls) - assert not any("other.com" in url for url in urls) - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_relative_urls(self, mock_get): - """Test conversion of relative URLs to absolute.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - Page 1 - Page 2 - Page 3 - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com/dir/", absolute=True) - - assert len(urls) == 3 - assert all(url.startswith("https://") for url in urls) - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_request_failure(self, mock_get): - """Test handling of request failures.""" - import requests - - from scitex.web import get_urls - - mock_get.side_effect = requests.RequestException("Network error") - - urls = get_urls("https://example.com") - - assert urls == [] - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_duplicate_removal(self, mock_get): - """Test that duplicate URLs are removed.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - Link 1 - Link 1 again - Relative to same page - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com") - - # Should only have one instance of page1 - assert len(urls) == 1 - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_empty_page(self, mock_get): - """Test handling of page with no links.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = "No links here" - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com") - - assert urls == [] - - -class TestGetImageUrls: - """Test get_image_urls function.""" - - @patch("scitex.web._scraping.requests.get") - def test_get_image_urls_basic(self, mock_get): - """Test basic image URL extraction.""" - from scitex.web import get_image_urls - - mock_response = Mock() - mock_response.text = """ - - - - - - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - img_urls = get_image_urls("https://example.com") - - assert len(img_urls) == 3 - assert "https://example.com/image1.jpg" in img_urls - assert "https://example.com/images/image2.png" in img_urls - - @patch("scitex.web._scraping.requests.get") - def test_get_image_urls_with_pattern(self, mock_get): - """Test image URL extraction with pattern filtering.""" - from scitex.web import get_image_urls - - mock_response = Mock() - mock_response.text = """ - - - - - - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - img_urls = get_image_urls("https://example.com", pattern=r"\.jpg$") - - assert len(img_urls) == 2 - assert all(url.endswith(".jpg") for url in img_urls) - - @patch("scitex.web._scraping.requests.get") - def test_get_image_urls_same_domain(self, mock_get): - """Test image URL extraction with same domain filter.""" - from scitex.web import get_image_urls - - mock_response = Mock() - mock_response.text = """ - - - - - - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - img_urls = get_image_urls("https://example.com", same_domain=True) - - assert len(img_urls) == 2 - assert all("example.com" in url for url in img_urls) - - @patch("scitex.web._scraping.requests.get") - def test_get_image_urls_request_failure(self, mock_get): - """Test handling of request failures.""" - import requests - - from scitex.web import get_image_urls - - mock_get.side_effect = requests.RequestException("Network error") - - img_urls = get_image_urls("https://example.com") - - assert img_urls == [] - - @patch("scitex.web._scraping.requests.get") - def test_get_image_urls_no_images(self, mock_get): - """Test handling of page with no images.""" - from scitex.web import get_image_urls - - mock_response = Mock() - mock_response.text = "No images here" - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - img_urls = get_image_urls("https://example.com") - - assert img_urls == [] - - -class TestDownloadImages: - """Test download_images function.""" - - def setup_method(self): - """Set up temporary directory for tests.""" - self.temp_dir = tempfile.mkdtemp() - - def teardown_method(self): - """Clean up temporary directory after tests.""" - if Path(self.temp_dir).exists(): - shutil.rmtree(self.temp_dir) - - @patch("scitex.web._scraping.requests.get") - def test_download_images_basic(self, mock_get): - """Test basic image downloading.""" - from scitex.web import download_images - - # Mock page response - page_response = Mock() - page_response.text = """ - - - - - - - """ - page_response.raise_for_status = Mock() - - # Mock image responses - img_response1 = Mock() - img_response1.content = b"fake image data 1" - img_response1.headers = {"content-type": "image/jpeg"} - img_response1.raise_for_status = Mock() - - img_response2 = Mock() - img_response2.content = b"fake image data 2" - img_response2.headers = {"content-type": "image/png"} - img_response2.raise_for_status = Mock() - - mock_get.side_effect = [page_response, img_response1, img_response2] - - paths = download_images("https://example.com", output_dir=self.temp_dir) - - assert len(paths) == 2 - assert all(Path(p).exists() for p in paths) - - @patch("scitex.web._scraping.requests.get") - def test_download_images_with_pattern(self, mock_get): - """Test image downloading with pattern filter.""" - from scitex.web import download_images - - page_response = Mock() - page_response.text = """ - - - - - - - """ - page_response.raise_for_status = Mock() - - img_response = Mock() - img_response.content = b"fake image data" - img_response.headers = {"content-type": "image/jpeg"} - img_response.raise_for_status = Mock() - - mock_get.side_effect = [page_response, img_response] - - paths = download_images( - "https://example.com", output_dir=self.temp_dir, pattern=r"\.jpg$" - ) - - assert len(paths) == 1 - - @patch("scitex.web._scraping.requests.get") - def test_download_images_duplicate_filenames(self, mock_get): - """Test handling of duplicate filenames.""" - from scitex.web import download_images - - page_response = Mock() - page_response.text = """ - - - - - - - """ - page_response.raise_for_status = Mock() - - img_response = Mock() - img_response.content = b"fake image data" - img_response.headers = {"content-type": "image/jpeg"} - img_response.raise_for_status = Mock() - - mock_get.side_effect = [page_response, img_response, img_response] - - paths = download_images("https://example.com", output_dir=self.temp_dir) - - # Should have both images with different filenames - assert len(paths) == 2 - assert len(set(paths)) == 2 # All paths are unique - - @patch("scitex.web._scraping.requests.get") - def test_download_images_request_failure(self, mock_get): - """Test handling of request failures.""" - import requests - - from scitex.web import download_images - - mock_get.side_effect = requests.RequestException("Network error") - - paths = download_images("https://example.com", output_dir=self.temp_dir) - - assert paths == [] - - @patch("scitex.web._scraping.requests.get") - def test_download_images_same_domain(self, mock_get): - """Test downloading only images from same domain.""" - from scitex.web import download_images - - page_response = Mock() - page_response.text = """ - - - - - - - """ - page_response.raise_for_status = Mock() - - img_response = Mock() - img_response.content = b"fake image data" - img_response.headers = {"content-type": "image/jpeg"} - img_response.raise_for_status = Mock() - - mock_get.side_effect = [page_response, img_response] - - paths = download_images( - "https://example.com", output_dir=self.temp_dir, same_domain=True - ) - - # Should only download the first image - assert len(paths) == 1 - - @patch("scitex.web._scraping.requests.get") - @patch.dict("os.environ", {}, clear=True) - def test_download_images_no_output_dir(self, mock_get): - """Test default output directory creation using SCITEX_DIR.""" - import os - - from scitex.web import download_images - - page_response = Mock() - page_response.text = """ - - - - - - """ - page_response.raise_for_status = Mock() - - img_response = Mock() - img_response.content = b"fake image data" - img_response.headers = {"content-type": "image/jpeg"} - img_response.raise_for_status = Mock() - - mock_get.side_effect = [page_response, img_response] - - # Set SCITEX_DIR to a temp location for testing - test_scitex_dir = Path(self.temp_dir) / "scitex" - os.environ["SCITEX_DIR"] = str(test_scitex_dir) - - paths = download_images("https://example.com") - - assert len(paths) == 1 - expected_dir = test_scitex_dir / "web" / "downloads" - assert expected_dir.exists() - - @patch("scitex.web._scraping.requests.get") - @patch.dict( - "os.environ", {"SCITEX_WEB_DOWNLOADS_DIR": "/tmp/test_downloads"}, clear=True - ) - def test_download_images_env_var_priority(self, mock_get): - """Test that SCITEX_WEB_DOWNLOADS_DIR takes priority.""" - import os - - from scitex.web import download_images - - page_response = Mock() - page_response.text = """ - - - - - - """ - page_response.raise_for_status = Mock() - - img_response = Mock() - img_response.content = b"fake image data" - img_response.headers = {"content-type": "image/jpeg"} - img_response.raise_for_status = Mock() - - mock_get.side_effect = [page_response, img_response] - - # Set both env vars - os.environ["SCITEX_DIR"] = "/tmp/scitex" - os.environ["SCITEX_WEB_DOWNLOADS_DIR"] = self.temp_dir - - paths = download_images("https://example.com") - - # Should use SCITEX_WEB_DOWNLOADS_DIR, not SCITEX_DIR - assert len(paths) == 1 - assert paths[0].startswith(self.temp_dir) - - @patch("scitex.web._scraping.requests.get") - @patch("scitex.web._scraping.PILLOW_AVAILABLE", True) - @patch("scitex.web._scraping.Image.open") - def test_download_images_min_size_filter(self, mock_image_open, mock_get): - """Test minimum size filtering.""" - from scitex.web import download_images - - page_response = Mock() - page_response.text = """ - - - - - - - """ - page_response.raise_for_status = Mock() - - img_response_small = Mock() - img_response_small.content = b"small image" - img_response_small.headers = {"content-type": "image/jpeg"} - img_response_small.raise_for_status = Mock() - - img_response_large = Mock() - img_response_large.content = b"large image" - img_response_large.headers = {"content-type": "image/jpeg"} - img_response_large.raise_for_status = Mock() - - # Mock image sizes - small_img = Mock() - small_img.size = (50, 50) - large_img = Mock() - large_img.size = (500, 500) - - mock_image_open.side_effect = [small_img, large_img] - mock_get.side_effect = [page_response, img_response_small, img_response_large] - - paths = download_images( - "https://example.com", output_dir=self.temp_dir, min_size=(100, 100) - ) - - # Only the large image should be downloaded - assert len(paths) == 1 - - -class TestScrapingModuleImport: - """Test that scraping functions are properly exported.""" - - def test_scraping_functions_available(self): - """Test that all scraping functions are available.""" - import scitex.web - - assert hasattr(scitex.web, "get_urls") - assert hasattr(scitex.web, "download_images") - assert hasattr(scitex.web, "get_image_urls") - - assert callable(scitex.web.get_urls) - assert callable(scitex.web.download_images) - assert callable(scitex.web.get_image_urls) - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_scraping.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # File: ./src/scitex/web/_scraping.py -# -# """Web scraping utilities for extracting URLs.""" -# -# import re -# import urllib.parse -# from typing import List, Optional, Set -# -# import requests -# from bs4 import BeautifulSoup -# -# from scitex.logging import getLogger -# -# logger = getLogger(__name__) -# -# DEFAULT_TIMEOUT = 10 -# DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" -# -# -# def get_urls( -# url: str, -# pattern: Optional[str] = None, -# absolute: bool = True, -# same_domain: bool = False, -# include_external: bool = True, -# ) -> List[str]: -# """ -# Extract all URLs from a webpage. -# -# Args: -# url: The URL of the webpage to scrape -# pattern: Optional regex pattern to filter URLs (e.g., r'\\.pdf$' for PDF files) -# absolute: If True, convert relative URLs to absolute URLs -# same_domain: If True, only return URLs from the same domain -# include_external: If True, include external links (only applies if same_domain=False) -# -# Returns: -# List of URLs found on the page -# -# Example: -# >>> urls = get_urls('https://example.com', pattern=r'\\.pdf$') -# >>> urls = get_urls('https://example.com', same_domain=True) -# """ -# try: -# logger.info(f"Fetching URLs from: {url}") -# response = requests.get( -# url, -# timeout=DEFAULT_TIMEOUT, -# headers={"User-Agent": DEFAULT_USER_AGENT}, -# ) -# response.raise_for_status() -# except requests.RequestException as e: -# logger.error(f"Failed to fetch URL {url}: {e}") -# return [] -# -# soup = BeautifulSoup(response.text, "html.parser") -# urls_found: Set[str] = set() -# -# parsed_base = urllib.parse.urlparse(url) -# -# for link in soup.find_all("a", href=True): -# href = link["href"] -# -# if absolute: -# href = urllib.parse.urljoin(url, href) -# -# if same_domain: -# parsed_href = urllib.parse.urlparse(href) -# if parsed_href.netloc != parsed_base.netloc: -# continue -# elif not include_external: -# parsed_href = urllib.parse.urlparse(href) -# if parsed_href.netloc and parsed_href.netloc != parsed_base.netloc: -# continue -# -# if pattern and not re.search(pattern, href): -# continue -# -# urls_found.add(href) -# -# result = sorted(list(urls_found)) -# logger.info(f"Found {len(result)} URLs") -# return result -# -# -# def get_image_urls( -# url: str, -# pattern: Optional[str] = None, -# same_domain: bool = False, -# ) -> List[str]: -# """ -# Extract all image URLs from a webpage without downloading them. -# -# Args: -# url: The URL of the webpage to scrape -# pattern: Optional regex pattern to filter image URLs -# same_domain: If True, only return images from the same domain -# -# Returns: -# List of image URLs found on the page -# -# Note: -# - SVG files are automatically skipped (vector graphics) -# - Checks both 'src' and 'data-src' attributes for lazy-loaded images -# -# Example: -# >>> img_urls = get_image_urls('https://example.com') -# >>> img_urls = get_image_urls('https://example.com', pattern=r'\\.png$') -# """ -# try: -# logger.info(f"Fetching image URLs from: {url}") -# response = requests.get( -# url, -# timeout=DEFAULT_TIMEOUT, -# headers={"User-Agent": DEFAULT_USER_AGENT}, -# ) -# response.raise_for_status() -# except requests.RequestException as e: -# logger.error(f"Failed to fetch URL {url}: {e}") -# return [] -# -# soup = BeautifulSoup(response.text, "html.parser") -# image_urls: Set[str] = set() -# -# parsed_base = urllib.parse.urlparse(url) -# -# for img in soup.find_all("img"): -# img_url = img.get("src") or img.get("data-src") -# if not img_url: -# continue -# -# img_url = urllib.parse.urljoin(url, img_url) -# -# if img_url.lower().endswith((".svg", ".svgz")): -# continue -# -# if same_domain: -# parsed_img = urllib.parse.urlparse(img_url) -# if parsed_img.netloc != parsed_base.netloc: -# continue -# -# if pattern and not re.search(pattern, img_url): -# continue -# -# image_urls.add(img_url) -# -# result = sorted(list(image_urls)) -# logger.info(f"Found {len(result)} image URLs") -# return result - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_scraping.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/web/test__search_pubmed.py b/tests/scitex/web/test__search_pubmed.py deleted file mode 100755 index bf91741b5..000000000 --- a/tests/scitex/web/test__search_pubmed.py +++ /dev/null @@ -1,1170 +0,0 @@ -#!/usr/bin/env python3 -# Time-stamp: "2024-11-08 05:50:57 (ywatanabe)" -# File: ./scitex_repo/tests/scitex/web/test__search_pubmed.py - -""" -Tests for PubMed search functionality. -""" - -import pytest - -aiohttp = pytest.importorskip("aiohttp") -pytest.importorskip("scitex.web.search_pubmed") - -import asyncio # noqa: F401, E402 -import json # noqa: F401, E402 -import xml.etree.ElementTree as ET # noqa: F401, E402 -from io import StringIO # noqa: F401, E402 -from unittest.mock import MagicMock, Mock, mock_open, patch # noqa: E402 - -try: - from scitex.web import ( - _fetch_details, - _get_citation, - _parse_abstract_xml, - _search_pubmed, - batch__fetch_details, - fetch_async, - format_bibtex, - get_crossref_metrics, - parse_args, - run_main, - save_bibtex, - search_pubmed, - ) -except ImportError: - pytest.skip("scitex.web.search_pubmed not available", allow_module_level=True) - - -class TestSearchPubmed: - """Test _search_pubmed function.""" - - def test_search_pubmed_success(self): - """Test successful PubMed search.""" - mock_response = Mock() - mock_response.ok = True - mock_response.json.return_value = { - "esearchresult": {"idlist": ["12345", "67890"], "count": "2"} - } - - with patch("requests.get", return_value=mock_response): - result = _search_pubmed("test query", retmax=10) - assert result == mock_response.json.return_value - assert len(result["esearchresult"]["idlist"]) == 2 - - def test_search_pubmed_failure(self): - """Test failed PubMed search.""" - mock_response = Mock() - mock_response.ok = False - - with patch("requests.get", return_value=mock_response): - with patch("scitex.str.printc") as mock_print: - result = _search_pubmed("test query") - assert result == {} - mock_print.assert_called_once() - - def test_search_pubmed_network_error(self): - """Test network error during search.""" - import requests - - with patch( - "requests.get", - side_effect=requests.exceptions.RequestException("Network error"), - ): - with patch("scitex.str.printc") as mock_print: - result = _search_pubmed("test query") - assert result == {} - mock_print.assert_called_once() - - def test_search_pubmed_parameters(self): - """Test search parameters are correctly passed.""" - mock_response = Mock() - mock_response.ok = True - mock_response.json.return_value = {"esearchresult": {}} - - with patch("requests.get", return_value=mock_response) as mock_get: - _search_pubmed("epilepsy", retmax=500) - - # Check that correct parameters were passed - args, kwargs = mock_get.call_args - assert kwargs["params"]["term"] == "epilepsy" - assert kwargs["params"]["retmax"] == 500 - assert kwargs["params"]["db"] == "pubmed" - - -class TestFetchDetails: - """Test _fetch_details function.""" - - def test_fetch_details_success(self): - """Test successful fetch of article details.""" - mock_abstract_response = Mock() - mock_abstract_response.ok = True - mock_abstract_response.text = "abstract data" - - mock_details_response = Mock() - mock_details_response.ok = True - mock_details_response.json.return_value = { - "result": {"12345": {"title": "Test"}} - } - - with patch( - "requests.get", side_effect=[mock_abstract_response, mock_details_response] - ): - result = _fetch_details("webenv123", "query_key456", retstart=0, retmax=100) - assert result["abstracts"] == "abstract data" - assert result["details"] == mock_details_response.json.return_value - - def test_fetch_details_failure(self): - """Test failed fetch of article details.""" - mock_response = Mock() - mock_response.ok = False - - with patch("requests.get", return_value=mock_response): - result = _fetch_details("webenv123", "query_key456") - assert result == {} - - def test_fetch_details_parameters(self): - """Test fetch details parameters.""" - mock_response = Mock() - mock_response.ok = True - mock_response.text = "" - mock_response.json.return_value = {} - - with patch("requests.get", return_value=mock_response) as mock_get: - _fetch_details("env123", "key456", retstart=100, retmax=50) - - # Verify two calls were made - assert mock_get.call_count == 2 - - # Check parameters for abstract fetch - first_call_params = mock_get.call_args_list[0][1]["params"] - assert first_call_params["WebEnv"] == "env123" - assert first_call_params["query_key"] == "key456" - assert first_call_params["retstart"] == 100 - assert first_call_params["retmax"] == 50 - - -class TestParseAbstractXml: - """Test _parse_abstract_xml function.""" - - def test_parse_abstract_xml_complete(self): - """Test parsing complete XML with all fields.""" - xml_text = """ - - - - 12345 -
- - This is the abstract text. - -
-
- - - 10.1234/test.doi - - - - - Keyword1 - - - Keyword2 - - -
-
- """ - - result = _parse_abstract_xml(xml_text) - assert "12345" in result - assert result["12345"][0] == "This is the abstract text." - assert result["12345"][1] == ["Keyword1", "Keyword2"] - assert result["12345"][2] == "10.1234/test.doi" - - def test_parse_abstract_xml_missing_fields(self): - """Test parsing XML with missing fields.""" - xml_text = """ - - - - 67890 - - - - """ - - result = _parse_abstract_xml(xml_text) - assert "67890" in result - assert result["67890"][0] == "" # No abstract - assert result["67890"][1] == [] # No keywords - assert result["67890"][2] == "" # No DOI - - def test_parse_abstract_xml_multiple_articles(self): - """Test parsing XML with multiple articles.""" - xml_text = """ - - - - 11111 - - - - - 22222 - - - - """ - - result = _parse_abstract_xml(xml_text) - assert len(result) == 2 - assert "11111" in result - assert "22222" in result - - -class TestGetCitation: - """Test _get_citation function.""" - - def test_get_citation_success(self): - """Test successful citation retrieval.""" - mock_response = Mock() - mock_response.ok = True - mock_response.text = "@article{test_citation}" - - with patch("requests.get", return_value=mock_response): - result = _get_citation("12345") - assert result == "@article{test_citation}" - - def test_get_citation_failure(self): - """Test failed citation retrieval.""" - mock_response = Mock() - mock_response.ok = False - - with patch("requests.get", return_value=mock_response): - result = _get_citation("12345") - assert result == "" - - def test_get_citation_parameters(self): - """Test citation parameters.""" - mock_response = Mock() - mock_response.ok = True - mock_response.text = "" - - with patch("requests.get", return_value=mock_response) as mock_get: - _get_citation("99999") - - args, kwargs = mock_get.call_args - assert kwargs["params"]["db"] == "pubmed" - assert kwargs["params"]["id"] == "99999" - assert kwargs["params"]["rettype"] == "bibtex" - - -class TestGetCrossrefMetrics: - """Test get_crossref_metrics function.""" - - def test_get_crossref_metrics_success(self): - """Test successful CrossRef metrics retrieval.""" - mock_response = Mock() - mock_response.ok = True - mock_response.json.return_value = { - "message": { - "is-referenced-by-count": 42, - "type": "journal-article", - "publisher": "Test Publisher", - "reference": [1, 2, 3], - "DOI": "10.1234/test", - } - } - - with patch("requests.get", return_value=mock_response): - result = get_crossref_metrics("10.1234/test") - assert result["citations"] == 42 - assert result["type"] == "journal-article" - assert result["publisher"] == "Test Publisher" - assert result["references"] == 3 - assert result["doi"] == "10.1234/test" - - def test_get_crossref_metrics_failure(self): - """Test failed CrossRef metrics retrieval.""" - mock_response = Mock() - mock_response.ok = False - - with patch("requests.get", return_value=mock_response): - result = get_crossref_metrics("10.1234/test") - assert result == {} - - def test_get_crossref_metrics_missing_fields(self): - """Test CrossRef metrics with missing fields.""" - mock_response = Mock() - mock_response.ok = True - mock_response.json.return_value = {"message": {}} - - with patch("requests.get", return_value=mock_response): - result = get_crossref_metrics("10.1234/test") - assert result["citations"] == 0 - assert result["type"] == "" - assert result["publisher"] == "" - assert result["references"] == 0 - assert result["doi"] == "" - - -class TestSaveBibtex: - """Test save_bibtex function.""" - - def test_save_bibtex_with_citations(self): - """Test saving BibTeX with official citations.""" - papers = { - "12345": { - "title": "Test Paper", - "authors": [{"name": "John Doe"}], - "source": "Test Journal", - "pubdate": "2023", - } - } - abstracts = {"12345": ("Abstract text", ["Keyword1"], "10.1234/test")} - - mock_citation = "@article{official_citation}" - - with patch("builtins.open", mock_open()) as mock_file: - with patch( - "scitex.web._search_pubmed._get_citation", return_value=mock_citation - ): - with patch("scitex.str.printc"): - save_bibtex(papers, abstracts, "test.bib") - - # Verify file was written - mock_file.assert_called_once_with("test.bib", "w", encoding="utf-8") - handle = mock_file() - handle.write.assert_called_with(mock_citation) - - def test_save_bibtex_without_citations(self): - """Test saving BibTeX without official citations.""" - papers = { - "67890": { - "title": "Test Paper Without Citation", - "authors": [{"name": "Jane Smith"}], - "source": "Another Journal", - "pubdate": "2024", - } - } - abstracts = {} - - with patch("builtins.open", mock_open()) as mock_file: - with patch("scitex.web._search_pubmed._get_citation", return_value=""): - with patch( - "scitex.web._search_pubmed.format_bibtex", - return_value="@article{formatted}", - ) as mock_format: - with patch("scitex.str.printc"): - save_bibtex(papers, abstracts, "test.bib") - - # Verify format_bibtex was called - mock_format.assert_called_once() - handle = mock_file() - handle.write.assert_called_with("@article{formatted}\n") - - def test_save_bibtex_skip_uids(self): - """Test that 'uids' key is skipped.""" - papers = {"uids": ["12345"], "12345": {"title": "Real Paper"}} - abstracts = {} - - with patch("builtins.open", mock_open()) as mock_file: # noqa: F841 - with patch("scitex.web._search_pubmed._get_citation", return_value=""): - with patch("scitex.web._search_pubmed.format_bibtex") as mock_format: - with patch("scitex.str.printc"): - save_bibtex(papers, abstracts, "test.bib") - - # Verify format_bibtex was called only once (not for 'uids') - assert mock_format.call_count == 1 - - -class TestFormatBibtex: - """Test format_bibtex function.""" - - def test_format_bibtex_complete(self): - """Test formatting complete BibTeX entry.""" - paper = { - "title": "Machine Learning for Medical Diagnosis", - "authors": [{"name": "John A. Smith"}, {"name": "Jane B. Doe"}], - "source": "Nature Medicine", - "pubdate": "2023 Jul 15", - } - pmid = "12345678" - abstract_data = ( - "This is the abstract text.", - ["Machine Learning", "Diagnosis"], - "10.1038/s41591-023-12345", - ) - - with patch( - "scitex.web._search_pubmed.get_crossref_metrics", - return_value={"publisher": "Nature Publishing", "references": 50}, - ): - result = format_bibtex(paper, pmid, abstract_data) - - # Check key components - assert "@article{John.Smith_2023_machine_learning" in result - assert "author = {John A. Smith and Jane B. Doe}" in result - assert "title = {Machine Learning for Medical Diagnosis}" in result - assert "journal = {Nature Medicine}" in result - assert "year = {2023}" in result - assert "pmid = {12345678}" in result - assert "doi = {10.1038/s41591-023-12345}" in result - assert "keywords = {Machine Learning, Diagnosis}" in result - assert "abstract = {This is the abstract text.}" in result - - def test_format_bibtex_minimal(self): - """Test formatting BibTeX with minimal data.""" - paper = { - "title": "A", - "authors": [{"name": "X"}], - "source": "Unknown Journal", - "pubdate": "", - } - pmid = "99999" - abstract_data = ("", [], "") - - with patch("scitex.web._search_pubmed.get_crossref_metrics", return_value={}): - result = format_bibtex(paper, pmid, abstract_data) - - # Check it doesn't crash and produces valid entry - assert "@article{" in result - assert "pmid = {99999}" in result - - def test_format_bibtex_special_characters(self): - """Test formatting with special characters in names.""" - paper = { - "title": "Test-Paper: With Special Characters!", - "authors": [{"name": "O'Neill-Smith"}], - "source": "Test Journal", - "pubdate": "2023", - } - pmid = "11111" - abstract_data = ("", [], "") - - with patch("scitex.web._search_pubmed.get_crossref_metrics", return_value={}): - result = format_bibtex(paper, pmid, abstract_data) - - # Check citation key is properly cleaned (format: FirstName.LastName_year_...) - assert "@article{ONeillSmith.ONeillSmith_2023_testpaper_with" in result - - -class TestAsyncFunctions: - """Test async functions.""" - - @pytest.mark.asyncio - async def test_fetch_async_json(self): - """Test async fetch with JSON response.""" - from unittest.mock import AsyncMock - - mock_response = MagicMock() - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={"test": "data"}) - - mock_session = MagicMock() - mock_session.get.return_value.__aenter__.return_value = mock_response - - result = await fetch_async(mock_session, "http://test.com", {"retmode": "json"}) - assert result == {"test": "data"} - - @pytest.mark.asyncio - async def test_fetch_async_xml(self): - """Test async fetch with XML response.""" - from unittest.mock import AsyncMock - - mock_response = MagicMock() - mock_response.status = 200 - mock_response.text = AsyncMock(return_value="test") - - mock_session = MagicMock() - mock_session.get.return_value.__aenter__.return_value = mock_response - - result = await fetch_async(mock_session, "http://test.com", {"retmode": "xml"}) - assert result == "test" - - @pytest.mark.asyncio - async def test_fetch_async_failure(self): - """Test async fetch with failed response.""" - mock_response = MagicMock() - mock_response.status = 404 - - mock_session = MagicMock() - mock_session.get.return_value.__aenter__.return_value = mock_response - - result = await fetch_async(mock_session, "http://test.com", {}) - assert result == {} - - @pytest.mark.asyncio - async def test_batch_fetch_details(self): - """Test batch fetching details.""" - pmids = ["11111", "22222", "33333"] - - with patch("aiohttp.ClientSession") as mock_session_class: - mock_session = MagicMock() - mock_session_class.return_value.__aenter__.return_value = mock_session - - with patch( - "scitex.web._search_pubmed.fetch_async", - side_effect=[ - "1", - {"result": "1"}, - "2", - {"result": "2"}, - ], - ): - results = await batch__fetch_details(pmids, batch_size=2) - - assert len(results) == 4 # 2 batches × 2 requests each - assert results[0] == "1" - assert results[1] == {"result": "1"} - - -class TestSearchPubmedMain: - """Test main search_pubmed function.""" - - def test_search_pubmed_no_results(self): - """Test search with no results.""" - with patch("scitex.web._search_pubmed._search_pubmed", return_value={}): - result = search_pubmed("test query", n_entries=10) - assert result == 1 - - def test_search_pubmed_success(self): - """Test successful search and save.""" - search_results = {"esearchresult": {"idlist": ["12345", "67890"], "count": "2"}} - - batch_results = [ - "", # XML - { - "result": {"12345": {"title": "Test1"}, "67890": {"title": "Test2"}} - }, # JSON - ] - - with patch( - "scitex.web._search_pubmed._search_pubmed", return_value=search_results - ): - with patch("asyncio.run", return_value=batch_results): - with patch("builtins.open", mock_open()) as mock_file: - with patch( - "scitex.web._search_pubmed._parse_abstract_xml", return_value={} - ): - with patch( - "scitex.web._search_pubmed._get_citation", return_value="" - ): - with patch( - "scitex.web._search_pubmed.format_bibtex", - return_value="@article{}", - ): - result = search_pubmed("test query", n_entries=2) - assert result == 0 - - # Verify file was opened (may be called multiple times) - assert mock_file.call_count >= 1 - - def test_search_pubmed_query_sanitization(self): - """Test that query is properly sanitized for filename.""" - search_results = {"esearchresult": {"idlist": [], "count": "0"}} - - with patch( - "scitex.web._search_pubmed._search_pubmed", return_value=search_results - ): - with patch("asyncio.run", return_value=[]): - with patch("builtins.open", mock_open()) as mock_file: - search_pubmed("test query with spaces", n_entries=0) - - # Check filename has underscores - filename = mock_file.call_args_list[0][0][0] - assert filename == "pubmed_test_query_with_spaces.bib" - - -class TestParseArgs: - """Test parse_args function.""" - - def test_parse_args_with_query(self): - """Test parsing arguments with query.""" - with patch( - "sys.argv", - ["script.py", "--query", "epilepsy prediction", "--n_entries", "20"], - ): - with patch("scitex.str.printc"): - args = parse_args() - assert args.query == "epilepsy prediction" - assert args.n_entries == 20 - - def test_parse_args_defaults(self): - """Test parsing arguments with defaults.""" - with patch("sys.argv", ["script.py"]): - with patch("scitex.str.printc"): - args = parse_args() - assert args.query is None - assert args.n_entries == 10 - - def test_parse_args_short_options(self): - """Test parsing with short options.""" - with patch("sys.argv", ["script.py", "-q", "test", "-n", "5"]): - with patch("scitex.str.printc"): - args = parse_args() - assert args.query == "test" - assert args.n_entries == 5 - - -class TestRunMain: - """Test run_main function.""" - - def test_run_main_success(self): - """Test successful main execution.""" - mock_args = Mock() - mock_args.query = "test query" - mock_args.n_entries = 10 - - # Patch at the location where scitex is imported in the module - with patch( - "scitex.web._search_pubmed.scitex.session.start", - return_value=(None, None, None, None, None), - ): - with patch("scitex.web._search_pubmed.parse_args", return_value=mock_args): - with patch( - "scitex.web._search_pubmed.search_pubmed", return_value=0 - ) as mock_search: - with patch("scitex.web._search_pubmed.scitex.session.close"): - run_main() - - mock_search.assert_called_once_with("test query", 10) - - def test_run_main_with_error(self): - """Test main execution with error.""" - mock_args = Mock() - mock_args.query = "test" - mock_args.n_entries = 5 - - with patch( - "scitex.web._search_pubmed.scitex.session.start", - return_value=(None, None, None, None, None), - ): - with patch("scitex.web._search_pubmed.parse_args", return_value=mock_args): - with patch("scitex.web._search_pubmed.search_pubmed", return_value=1): - with patch( - "scitex.web._search_pubmed.scitex.session.close" - ) as mock_close: - run_main() - - # Verify close was called with exit_status=1 - assert mock_close.call_args[1]["exit_status"] == 1 - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_search_pubmed.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # Time-stamp: "2024-11-13 14:30:43 (ywatanabe)" -# # File: ./scitex_repo/src/scitex/web/_search_pubmed.py -# -# """ -# 1. Functionality: -# - Searches PubMed database for scientific articles -# - Retrieves detailed information about matched articles -# - Displays article metadata including title, authors, journal, year, and abstract -# 2. Input: -# - Search query string (e.g., "epilepsy prediction") -# - Optional parameters for batch size and result limit -# 3. Output: -# - Formatted article information displayed to stdout -# - BibTeX file with official citations -# 4. Prerequisites: -# - Internet connection -# - requests package -# - scitex package -# """ -# -# """Imports""" -# import argparse -# import asyncio -# import xml.etree.ElementTree as ET -# from typing import Any, Dict, List, Optional, Union -# -# import aiohttp -# import requests -# -# import scitex -# -# """Functions & Classes""" -# -# -# def _search_pubmed(query: str, retmax: int = 300) -> Dict[str, Any]: -# try: -# base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" -# search_url = f"{base_url}esearch.fcgi" -# params = { -# "db": "pubmed", -# "term": query, -# "retmax": retmax, -# "retmode": "json", -# "usehistory": "y", -# } -# -# response = requests.get(search_url, params=params, timeout=10) -# if not response.ok: -# scitex.str.printc("PubMed API request failed", c="red") -# return {} -# return response.json() -# except requests.exceptions.RequestException as e: -# scitex.str.printc(f"Network error: {e}", c="red") -# return {} -# -# -# def _fetch_details( -# webenv: str, query_key: str, retstart: int = 0, retmax: int = 100 -# ) -> Dict[str, Any]: -# """Fetches detailed information including abstracts for articles. -# -# Parameters -# ---------- -# [Previous parameters remain the same] -# -# Returns -# ------- -# Dict[str, Any] -# Dictionary containing article details and abstracts -# """ -# base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" -# -# # Fetch abstracts -# efetch_url = f"{base_url}efetch.fcgi" -# efetch_params = { -# "db": "pubmed", -# "query_key": query_key, -# "WebEnv": webenv, -# "retstart": retstart, -# "retmax": retmax, -# "retmode": "xml", -# "rettype": "abstract", -# "field": "abstract,mesh", -# } -# -# abstract_response = requests.get(efetch_url, params=efetch_params) -# -# # Fetch metadata -# fetch_url = f"{base_url}esummary.fcgi" -# params = { -# "db": "pubmed", -# "query_key": query_key, -# "WebEnv": webenv, -# "retstart": retstart, -# "retmax": retmax, -# "retmode": "json", -# } -# -# details_response = requests.get(fetch_url, params=params) -# -# if not all([abstract_response.ok, details_response.ok]): -# # print(f"Error fetching data") -# return {} -# -# return { -# "abstracts": abstract_response.text, -# "details": details_response.json(), -# } -# -# -# def _parse_abstract_xml(xml_text: str) -> Dict[str, tuple]: -# """Parses XML response to extract abstracts. -# -# Parameters -# ---------- -# xml_text : str -# XML response from PubMed -# -# Returns -# ------- -# Dict[str, str] -# Dictionary mapping PMIDs to abstracts -# """ -# root = ET.fromstring(xml_text) -# results = {} -# -# for article in root.findall(".//PubmedArticle"): -# pmid = article.find(".//PMID").text -# abstract_element = article.find(".//Abstract/AbstractText") -# abstract = abstract_element.text if abstract_element is not None else "" -# -# # DOI -# doi_element = article.find(".//ArticleId[@IdType='doi']") -# doi = doi_element.text if doi_element is not None else "" -# -# # Get MeSH terms -# keywords = [] -# mesh_terms = article.findall(".//MeshHeading/DescriptorName") -# keywords = [term.text for term in mesh_terms if term is not None] -# -# results[pmid] = (abstract, keywords, doi) -# -# return results -# -# -# def _get_citation(pmid: str) -> str: -# """Gets official citation in BibTeX format. -# -# Parameters -# ---------- -# pmid : str -# PubMed ID -# -# Returns -# ------- -# str -# Official BibTeX citation -# """ -# base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" -# cite_url = f"{base_url}efetch.fcgi" -# params = { -# "db": "pubmed", -# "id": pmid, -# "rettype": "bibtex", -# "retmode": "text", -# } -# response = requests.get(cite_url, params=params) -# return response.text if response.ok else "" -# -# -# def get_crossref_metrics( -# doi: str, api_key: Optional[str] = None, email: Optional[str] = None -# ) -> Dict[str, Any]: -# """Get article metrics from CrossRef using DOI.""" -# import os -# -# base_url = "https://api.crossref.org/works/" -# -# # Use provided email or fallback to environment variables -# if not email: -# email = os.getenv( -# "SCITEX_CROSSREF_EMAIL", -# os.getenv("SCITEX_PUBMED_EMAIL", "research@example.com"), -# ) -# headers = {"User-Agent": f"SciTeX/1.0 (mailto:{email})"} -# -# # Add API key as query parameter if provided -# params = {} -# if api_key: -# params["key"] = api_key -# -# try: -# response = requests.get( -# f"{base_url}{doi}", headers=headers, params=params, timeout=10 -# ) -# if response.ok: -# data = response.json()["message"] -# return { -# "citations": data.get("is-referenced-by-count", 0), -# "type": data.get("type", ""), -# "publisher": data.get("publisher", ""), -# "references": len(data.get("reference", [])), -# "doi": data.get("DOI", ""), -# } -# except Exception as e: -# print(f"CrossRef API error for DOI {doi}: {e}") -# return {} -# -# -# async def get_crossref_metrics_async( -# doi: str, api_key: Optional[str] = None, email: Optional[str] = None -# ) -> Dict[str, Any]: -# """Get article metrics from CrossRef using DOI (async version).""" -# import os -# -# base_url = "https://api.crossref.org/works/" -# -# # Use provided email or fallback to environment variables -# if not email: -# email = os.getenv( -# "SCITEX_CROSSREF_EMAIL", -# os.getenv("SCITEX_PUBMED_EMAIL", "research@example.com"), -# ) -# headers = {"User-Agent": f"SciTeX/1.0 (mailto:{email})"} -# -# # Add API key as query parameter if provided -# params = {} -# if api_key: -# params["key"] = api_key -# -# try: -# async with aiohttp.ClientSession() as session: -# async with session.get( -# f"{base_url}{doi}", headers=headers, params=params, timeout=10 -# ) as response: -# if response.ok: -# data = await response.json() -# message = data["message"] -# return { -# "citations": message.get("is-referenced-by-count", 0), -# "type": message.get("type", ""), -# "publisher": message.get("publisher", ""), -# "references": len(message.get("reference", [])), -# "doi": message.get("DOI", ""), -# } -# except Exception as e: -# print(f"CrossRef API error for DOI {doi}: {e}") -# return {} -# -# -# def save_bibtex( -# papers: Dict[str, Any], abstracts: Dict[str, str], output_file: str -# ) -> None: -# """Saves paper metadata as BibTeX file with abstracts. -# -# Parameters -# ---------- -# papers : Dict[str, Any] -# Dictionary of paper metadata -# abstracts : Dict[str, str] -# Dictionary of PMIDs to abstracts -# output_file : str -# Output file path -# """ -# with open(output_file, "w", encoding="utf-8") as bibtex_file: -# for pmid, paper in papers.items(): -# if pmid == "uids": -# continue -# -# citation = _get_citation(pmid) -# if citation: -# bibtex_file.write(citation) -# else: -# # Use default tuple if pmid not in abstracts -# default_data = ("", [], "") # abstract, keywords, doi -# bibtex_entry = format_bibtex( -# paper, pmid, abstracts.get(pmid, default_data) -# ) -# bibtex_file.write(bibtex_entry + "\n") -# scitex.str.printc(f"Saved to: {str(bibtex_file)}", c="yellow") -# -# -# def format_bibtex(paper: Dict[str, Any], pmid: str, abstract_data: tuple) -> str: -# abstract, keywords, doi = abstract_data -# -# # Get CrossRef and Scimago metrics -# crossref_metrics = get_crossref_metrics(doi) if doi else {} -# journal = paper.get("source", "Unknown Journal") -# # journal_metrics = get_journal_metrics(journal) -# -# authors = paper.get("authors", [{"name": "Unknown"}]) -# author_names = " and ".join(author["name"] for author in authors) -# pubdate = paper.get("pubdate", "") -# year = pubdate.split()[0] if pubdate.strip() else "" -# title = paper.get("title", "No Title") -# -# # Name formatting -# first_author = authors[0]["name"] -# first_name = first_author.split()[0] -# last_name = first_author.split()[-1] -# clean_first_name = "".join(c for c in first_name if c.isalnum()) -# clean_last_name = "".join(c for c in last_name if c.isalnum()) -# -# # Title words -# title_words = title.split() -# first_title_word = "".join(c.lower() for c in title_words[0] if c.isalnum()) -# second_title_word = ( -# "".join(c.lower() for c in title_words[1] if c.isalnum()) -# if len(title_words) > 1 -# else "" -# ) -# -# citation_key = f"{clean_first_name}.{clean_last_name}_{year}_{first_title_word}_{second_title_word}" -# -# entry = f"""@article{{{citation_key}, -# author = {{{author_names}}}, -# title = {{{title}}}, -# journal = {{{journal}}}, -# year = {{{year}}}, -# pmid = {{{pmid}}}, -# doi = {{{doi}}}, -# publisher = {{{crossref_metrics.get("publisher", "")}}}, -# references = {{{crossref_metrics.get("references", 0)}}}, -# keywords = {{{", ".join(keywords)}}}, -# abstract = {{{abstract}}} -# }} -# """ -# return entry -# -# -# async def fetch_async( -# session: aiohttp.ClientSession, url: str, params: Dict -# ) -> Union[Dict, str]: -# """Asynchronous fetch helper.""" -# async with session.get(url, params=params) as response: -# if response.status == 200: -# if params.get("retmode") == "xml": -# return await response.text() -# elif params.get("retmode") == "json": -# return await response.json() -# return await response.text() -# return {} -# -# -# async def batch__fetch_details(pmids: List[str], batch_size: int = 20) -> List[Dict]: -# """Fetches details for multiple PMIDs concurrently. -# -# Parameters -# ---------- -# pmids : List[str] -# List of PubMed IDs -# batch_size : int, optional -# Size of each batch for concurrent requests -# -# Returns -# ------- -# List[Dict] -# List of response data -# """ -# base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" -# -# async with aiohttp.ClientSession() as session: -# tasks = [] -# for i in range(0, len(pmids), batch_size): -# batch_pmids = pmids[i : i + batch_size] -# -# # Fetch both details and citations concurrently -# efetch_params = { -# "db": "pubmed", -# "id": ",".join(batch_pmids), -# "retmode": "xml", -# "rettype": "abstract", -# } -# -# esummary_params = { -# "db": "pubmed", -# "id": ",".join(batch_pmids), -# "retmode": "json", -# } -# -# tasks.append(fetch_async(session, f"{base_url}efetch.fcgi", efetch_params)) -# tasks.append( -# fetch_async(session, f"{base_url}esummary.fcgi", esummary_params) -# ) -# -# results = await asyncio.gather(*tasks) -# return results -# -# -# def search_pubmed(query: str, n_entries: int = 10) -> int: -# # query = args.query or "epilepsy prediction" -# # print(f"Using query: {query}") -# -# search_results = _search_pubmed(query) -# if not search_results: -# # print("No results found or error occurred") -# return 1 -# -# pmids = search_results["esearchresult"]["idlist"] -# count = len(pmids) -# # print(f"Found {count:,} results") -# -# output_file = f"pubmed_{query.replace(' ', '_')}.bib" -# # print(f"Saving results to: {output_file}") -# -# # Process in larger batches asynchronously -# results = asyncio.run(batch__fetch_details(pmids[:n_entries])) -# # here, results seems long string -# -# # Process results and save -# with open(output_file, "w", encoding="utf-8") as f: -# for i in range(0, len(results), 2): -# xml_response = results[i] -# json_response = results[i + 1] -# -# if isinstance(xml_response, str): -# abstracts = _parse_abstract_xml(xml_response) -# if isinstance(json_response, dict) and "result" in json_response: -# details = json_response["result"] -# save_bibtex(details, abstracts, output_file) -# -# # Process results and save -# temp_bibtex = [] -# for i in range(0, len(results), 2): -# xml_response = results[i] -# json_response = results[i + 1] -# -# if isinstance(xml_response, str): -# abstracts = _parse_abstract_xml(xml_response) -# if isinstance(json_response, dict) and "result" in json_response: -# details = json_response["result"] -# for pmid in details: -# if pmid != "uids": -# citation = _get_citation(pmid) -# if citation: -# temp_bibtex.append(citation) -# else: -# entry = format_bibtex( -# details[pmid], pmid, abstracts.get(pmid, "") -# ) -# temp_bibtex.append(entry) -# -# # Write all entries at once -# with open(output_file, "w", encoding="utf-8") as f: -# f.write("\n".join(temp_bibtex)) -# -# return 0 -# -# -# def parse_args() -> argparse.Namespace: -# parser = argparse.ArgumentParser( -# description="PubMed article search and retrieval tool" -# ) -# parser.add_argument( -# "--query", -# "-q", -# type=str, -# help='Search query (default: "epilepsy prediction")', -# ) -# parser.add_argument( -# "--n_entries", -# "-n", -# type=int, -# default=10, -# help='Search query (default: "epilepsy prediction")', -# ) -# args = parser.parse_args() -# scitex.str.printc(args, c="yellow") -# return args -# -# -# def run_main() -> None: -# global CONFIG -# import sys -# -# import matplotlib.pyplot as plt -# -# import scitex -# -# CONFIG, sys.stdout, sys.stderr, plt, CC = scitex.session.start( -# sys, -# verbose=False, -# ) -# -# args = parse_args() -# exit_status = search_pubmed(args.query, args.n_entries) -# -# scitex.session.close( -# CONFIG, -# verbose=False, -# notify=False, -# message="", -# exit_status=exit_status, -# ) -# -# -# if __name__ == "__main__": -# run_main() -# -# # EOF - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_search_pubmed.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/web/test__summarize_url.py b/tests/scitex/web/test__summarize_url.py deleted file mode 100755 index f1e978102..000000000 --- a/tests/scitex/web/test__summarize_url.py +++ /dev/null @@ -1,629 +0,0 @@ -#!/usr/bin/env python3 -# Time-stamp: "2024-11-08 05:51:10 (ywatanabe)" -# File: ./scitex_repo/tests/scitex/web/test__summarize_url.py - -""" -Tests for URL summarization functionality. -""" - -import pytest - -pytest.importorskip("aiohttp") -pytest.importorskip("scitex.web.summarize_url") - -import json # noqa: E402 -import re # noqa: F401, E402 -from concurrent.futures import Future # noqa: E402 -from unittest.mock import MagicMock, Mock, call, patch # noqa: F401, E402 - -from bs4 import BeautifulSoup # noqa: F401, E402 - -try: - from scitex.web import ( - crawl_to_json, - crawl_url, - extract_main_content, - summarize_all, - summarize_url, - ) -except ImportError: - pytest.skip("scitex.web.summarize_url not available", allow_module_level=True) -from scitex.web._summarize_url import main # noqa: F401, E402 - - -class TestExtractMainContent: - """Test extract_main_content function.""" - - def test_extract_main_content_with_readability(self): - """Test content extraction with readability library.""" - html_content = """ - - -

Main Title

-

This is the main content.

-
Some extra content
- - - """ - - # Test when Document is available - mock_doc = Mock() - mock_doc.summary.return_value = ( - "

Main Title

This is the main content.

" - ) - - with patch("scitex.web._summarize_url.Document", return_value=mock_doc): - result = extract_main_content(html_content) - assert "Main Title" in result - assert "This is the main content" in result - assert "<" not in result # HTML tags removed - - def test_extract_main_content_without_readability(self): - """Test content extraction when readability is not available.""" - html_content = "

Test content

" - - with patch("scitex.web._summarize_url.Document", None): - result = extract_main_content(html_content) - assert result == "Test content"[:5000] # Limited to 5000 chars - - def test_extract_main_content_complex_html(self): - """Test extraction with complex HTML.""" - html_content = """ - - Test - - -

Real content with spaces

- - - - """ - - mock_doc = Mock() - mock_doc.summary.return_value = "

Real content with spaces

" - - with patch("scitex.web._summarize_url.Document", return_value=mock_doc): - result = extract_main_content(html_content) - assert result == "Real content with spaces" # Extra spaces removed - - def test_extract_main_content_empty_html(self): - """Test extraction with empty HTML.""" - with patch("scitex.web._summarize_url.Document", None): - result = extract_main_content("") - assert result == "" - - def test_extract_main_content_no_tags(self): - """Test extraction with plain text.""" - plain_text = "Just plain text without HTML" - - with patch("scitex.web._summarize_url.Document", None): - result = extract_main_content(plain_text) - assert result == plain_text - - -class TestCrawlUrl: - """Test crawl_url function.""" - - def test_crawl_url_single_page(self): - """Test crawling a single page.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = "

Test content

" - - with patch("requests.get", return_value=mock_response): - with patch( - "scitex.web._summarize_url.extract_main_content", - return_value="Test content", - ): - visited, contents = crawl_url("http://test.com", max_depth=0) - - assert "http://test.com" in visited - assert contents["http://test.com"] == "Test content" - assert len(visited) == 1 - - def test_crawl_url_with_links(self): - """Test crawling with links to follow.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = """ - -

Main page

- Link to page 2 - Link to page 3 - - """ - - with patch("requests.get", return_value=mock_response): - with patch( - "scitex.web._summarize_url.extract_main_content", return_value="Content" - ): - visited, contents = crawl_url("http://test.com", max_depth=1) - - # Should visit main page and try to visit linked pages - assert "http://test.com" in visited - - def test_crawl_url_max_depth(self): - """Test that max_depth is respected.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = 'Link' - - with patch("requests.get", return_value=mock_response): - with patch( - "scitex.web._summarize_url.extract_main_content", return_value="Content" - ): - visited, contents = crawl_url("http://test.com", max_depth=0) - - # Should only visit the initial URL with max_depth=0 - assert len(visited) == 1 - assert "http://test.com" in visited - - def test_crawl_url_request_exception(self): - """Test handling of request exceptions.""" - import requests - - with patch( - "requests.get", side_effect=requests.RequestException("Network error") - ): - visited, contents = crawl_url("http://test.com") - - assert len(visited) == 0 - assert len(contents) == 0 - - def test_crawl_url_non_200_status(self): - """Test handling of non-200 status codes.""" - mock_response = Mock() - mock_response.status_code = 404 - - with patch("requests.get", return_value=mock_response): - visited, contents = crawl_url("http://test.com") - - assert len(visited) == 0 - assert len(contents) == 0 - - def test_crawl_url_avoid_duplicate_visits(self): - """Test that URLs are not visited twice.""" - mock_response = Mock() - mock_response.status_code = 200 - # Use exact same URL to test duplicate avoidance - mock_response.text = 'Home' - - call_count = 0 - - def mock_get(*args, **kwargs): - nonlocal call_count - call_count += 1 - return mock_response - - with patch("requests.get", side_effect=mock_get): - with patch( - "scitex.web._summarize_url.extract_main_content", return_value="Content" - ): - visited, contents = crawl_url("http://test.com", max_depth=1) - - # Should only call once despite self-referential link to exact same URL - assert call_count == 1 - - -class TestCrawlToJson: - """Test crawl_to_json function.""" - - def test_crawl_to_json_basic(self): - """Test basic JSON conversion.""" - mock_urls = {"http://test.com"} - mock_contents = {"http://test.com": "Test page content"} - - with patch( - "scitex.web._summarize_url.crawl_url", - return_value=(mock_urls, mock_contents), - ): - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.return_value = "Summary of test page" - mock_genai.return_value = mock_llm - - # Mock ThreadPoolExecutor - mock_future = Mock(spec=Future) - mock_future.result.return_value = { - "url": "http://test.com", - "content": "Summary of test page", - } - - with patch("concurrent.futures.ThreadPoolExecutor") as mock_executor: - mock_executor.return_value.__enter__.return_value.submit.return_value = ( - mock_future - ) - with patch( - "concurrent.futures.as_completed", return_value=[mock_future] - ): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("test.com") - - parsed = json.loads(result) - assert parsed["start_url"] == "https://test.com" - assert len(parsed["crawled_pages"]) == 1 - assert ( - parsed["crawled_pages"][0]["url"] == "http://test.com" - ) - - def test_crawl_to_json_url_normalization(self): - """Test URL normalization (adding https://).""" - with patch("scitex.web._summarize_url.crawl_url", return_value=(set(), {})): - with patch("concurrent.futures.ThreadPoolExecutor"): - with patch("concurrent.futures.as_completed", return_value=[]): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("example.com") - parsed = json.loads(result) - assert parsed["start_url"] == "https://example.com" - - def test_crawl_to_json_already_has_protocol(self): - """Test URL with existing protocol.""" - with patch("scitex.web._summarize_url.crawl_url", return_value=(set(), {})): - with patch("concurrent.futures.ThreadPoolExecutor"): - with patch("concurrent.futures.as_completed", return_value=[]): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("http://example.com") - parsed = json.loads(result) - assert parsed["start_url"] == "http://example.com" - - def test_crawl_to_json_multiple_pages(self): - """Test JSON conversion with multiple pages.""" - mock_urls = {"http://test.com", "http://test.com/page2"} - mock_contents = { - "http://test.com": "Main content", - "http://test.com/page2": "Page 2 content", - } - - with patch( - "scitex.web._summarize_url.crawl_url", - return_value=(mock_urls, mock_contents), - ): - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.side_effect = ["Summary 1", "Summary 2"] - mock_genai.return_value = mock_llm - - # Create futures for each URL - futures = [] - for i, url in enumerate(mock_urls): - mock_future = Mock(spec=Future) - mock_future.result.return_value = { - "url": url, - "content": f"Summary {i + 1}", - } - futures.append(mock_future) - - with patch("concurrent.futures.ThreadPoolExecutor") as mock_executor: - mock_executor.return_value.__enter__.return_value.submit.side_effect = ( - futures - ) - with patch("concurrent.futures.as_completed", return_value=futures): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("test.com") - - parsed = json.loads(result) - assert len(parsed["crawled_pages"]) == 2 - - -class TestSummarizeAll: - """Test summarize_all function.""" - - def test_summarize_all_basic(self): - """Test basic summarization.""" - json_content = json.dumps( - { - "start_url": "http://test.com", - "crawled_pages": [ - {"url": "http://test.com", "content": "Test summary"} - ], - } - ) - - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.return_value = ( - "• Point 1\n• Point 2\n• Point 3\n• Point 4\n• Point 5" - ) - mock_genai.return_value = mock_llm - - result = summarize_all(json_content) - - assert "Point 1" in result - assert "Point 5" in result - mock_llm.assert_called_once() - - # Check that the prompt includes the JSON content - call_args = mock_llm.call_args[0][0] - assert "5 bullet points" in call_args - assert json_content in call_args - - def test_summarize_all_empty_json(self): - """Test summarization with empty JSON.""" - empty_json = json.dumps({"start_url": "", "crawled_pages": []}) - - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.return_value = "No content to summarize" - mock_genai.return_value = mock_llm - - result = summarize_all(empty_json) - assert result == "No content to summarize" - - -class TestSummarizeUrl: - """Test summarize_url function.""" - - def test_summarize_url_complete_flow(self): - """Test complete URL summarization flow.""" - mock_json = json.dumps( - { - "start_url": "https://test.com", - "crawled_pages": [ - {"url": "https://test.com", "content": "Page summary"} - ], - } - ) - mock_summary = "• Summary point 1\n• Summary point 2" - - with patch("scitex.web._summarize_url.crawl_to_json", return_value=mock_json): - with patch( - "scitex.web._summarize_url.summarize_all", return_value=mock_summary - ): - with patch("builtins.print"): # Suppress pprint output - ground_summary, json_result = summarize_url("test.com") - - assert ground_summary == mock_summary - assert json_result == mock_json - - def test_summarize_url_error_handling(self): - """Test error handling in summarize_url.""" - with patch( - "scitex.web._summarize_url.crawl_to_json", - side_effect=Exception("Crawl error"), - ): - with pytest.raises(Exception) as exc_info: - summarize_url("test.com") - assert str(exc_info.value) == "Crawl error" - - def test_summarize_url_pprint_called(self): - """Test that pprint is called with the summary.""" - mock_json = '{"test": "data"}' - mock_summary = "Test summary" - - with patch("scitex.web._summarize_url.crawl_to_json", return_value=mock_json): - with patch( - "scitex.web._summarize_url.summarize_all", return_value=mock_summary - ): - # pprint is imported as 'from pprint import pprint' in the module - with patch("scitex.web._summarize_url.pprint") as mock_pprint: - summarize_url("test.com") - mock_pprint.assert_called_once_with(mock_summary) - - -class TestMain: - """Test main function and module alias.""" - - def test_main_is_summarize_url(self): - """Test that main is an alias for summarize_url.""" - assert main == summarize_url - - def test_main_execution(self): - """Test main function execution returns expected result structure.""" - mock_json = '{"test": "data"}' - mock_summary = "Test summary" - - # main is the same function as summarize_url, so we patch the inner calls - with patch("scitex.web._summarize_url.crawl_to_json", return_value=mock_json): - with patch( - "scitex.web._summarize_url.summarize_all", return_value=mock_summary - ): - with patch("scitex.web._summarize_url.pprint"): - result = main("http://example.com") - assert result[0] == mock_summary - assert result[1] == mock_json - - def test_script_execution(self): - """Test script execution with arguments.""" - import argparse - - with patch("sys.argv", ["script.py", "--url", "http://example.com"]): - # Import and execute the argument parsing similar to __main__ block - parser = argparse.ArgumentParser(description="") - parser.add_argument("--url", "-u", type=str, help="(default: %(default)s)") - args = parser.parse_args() - - assert args.url == "http://example.com" - - def test_readability_import_fallback(self): - """Test readability import fallback mechanism.""" - # This tests the import logic in the actual module - # The module tries to import from 'readability' first, then 'readability.readability' - import sys - - # Test when both imports fail - with patch.dict( - "sys.modules", {"readability": None, "readability.readability": None} - ): - # Re-import the module to trigger the import logic - if "scitex.web._summarize_url" in sys.modules: - del sys.modules["scitex.web._summarize_url"] - - # This should set Document to None - from scitex.web import _summarize_url # noqa: F401 - - # The Document variable should be None when imports fail - # (This is handled in the actual module's import section) - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_summarize_url.py -# -------------------------------------------------------------------------------- -# #!./env/bin/python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-07-29 21:43:30 (ywatanabe)" -# # ./src/scitex/web/_crawl.py -# -# -# import requests -# from bs4 import BeautifulSoup -# import urllib.parse -# from concurrent.futures import ThreadPoolExecutor, as_completed -# import json -# from tqdm import tqdm -# import scitex -# from pprint import pprint -# -# try: -# from readability import Document -# except ImportError: -# try: -# from readability.readability import Document -# except ImportError: -# Document = None -# -# import re -# -# -# # def crawl_url(url, max_depth=1): -# # print("\nCrawling...") -# # visited = set() -# # to_visit = [(url, 0)] -# # contents = {} -# -# # while to_visit: -# # current_url, depth = to_visit.pop(0) -# # if current_url in visited or depth > max_depth: -# # continue -# -# # try: -# # response = requests.get(current_url) -# # if response.status_code == 200: -# # visited.add(current_url) -# # contents[current_url] = response.text -# # soup = BeautifulSoup(response.text, "html.parser") -# -# # for link in soup.find_all("a", href=True): -# # absolute_link = urllib.parse.urljoin( -# # current_url, link["href"] -# # ) -# # if absolute_link not in visited: -# # to_visit.append((absolute_link, depth + 1)) -# -# # except requests.RequestException: -# # pass -# -# # return visited, contents -# -# -# def extract_main_content(html): -# if Document is None: -# # Fallback: just strip HTML tags -# content = re.sub("<[^<]+?>", "", html) -# content = " ".join(content.split()) -# return content[:5000] # Limit to first 5000 chars -# -# doc = Document(html) -# content = doc.summary() -# # Remove HTML tags -# content = re.sub("<[^<]+?>", "", content) -# # Remove extra whitespace -# content = " ".join(content.split()) -# return content -# -# -# def crawl_url(url, max_depth=1): -# print("\nCrawling...") -# visited = set() -# to_visit = [(url, 0)] -# contents = {} -# -# while to_visit: -# current_url, depth = to_visit.pop(0) -# if current_url in visited or depth > max_depth: -# continue -# -# try: -# response = requests.get(current_url) -# if response.status_code == 200: -# visited.add(current_url) -# main_content = extract_main_content(response.text) -# contents[current_url] = main_content -# soup = BeautifulSoup(response.text, "html.parser") -# -# for link in soup.find_all("a", href=True): -# absolute_link = urllib.parse.urljoin(current_url, link["href"]) -# if absolute_link not in visited: -# to_visit.append((absolute_link, depth + 1)) -# -# except requests.RequestException: -# pass -# -# return visited, contents -# -# -# def crawl_to_json(start_url): -# if not start_url.startswith("http"): -# start_url = "https://" + start_url -# crawled_urls, contents = crawl_url(start_url) -# -# print("\nSummalizing as json...") -# -# def process_url(url): -# llm = scitex.ai.GenAI("gpt-4o-mini") -# return { -# "url": url, -# "content": llm(f"Summarize this page in 1 line:\n\n{contents[url]}"), -# } -# -# with ThreadPoolExecutor() as executor: -# future_to_url = {executor.submit(process_url, url): url for url in crawled_urls} -# crawled_pages = [] -# for future in tqdm( -# as_completed(future_to_url), -# total=len(crawled_urls), -# desc="Processing URLs", -# ): -# crawled_pages.append(future.result()) -# -# result = {"start_url": start_url, "crawled_pages": crawled_pages} -# -# return json.dumps(result, indent=2) -# -# -# def summarize_all(json_contents): -# llm = scitex.ai.GenAI("gpt-4o-mini") -# out = llm(f"Summarize this json file with 5 bullet points:\n\n{json_contents}") -# return out -# -# -# def summarize_url(start_url): -# json_result = crawl_to_json(start_url) -# ground_summary = summarize_all(json_result) -# -# pprint(ground_summary) -# return ground_summary, json_result -# -# -# main = summarize_url -# -# if __name__ == "__main__": -# import argparse -# import scitex -# -# parser = argparse.ArgumentParser(description="") -# parser.add_argument("--url", "-u", type=str, help="(default: %(default)s)") -# args = parser.parse_args() -# scitex.gen.print_block(args, c="yellow") -# -# main(args.url) - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_summarize_url.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/web/test_download_images.py b/tests/scitex/web/test_download_images.py deleted file mode 100644 index 122a88dca..000000000 --- a/tests/scitex/web/test_download_images.py +++ /dev/null @@ -1,332 +0,0 @@ -# Add your tests here - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/download_images.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # File: ./src/scitex/web/download_images.py -# -# """ -# Image Downloader for SciTeX. -# -# Downloads images from URLs with minimum size filtering. -# -# Usage: -# python -m scitex.web.download_images https://example.com -# python -m scitex.web.download_images https://example.com -o ./downloads -# python -m scitex.web.download_images https://example.com --min-size 800x600 -# """ -# -# import os -# import re -# import urllib.parse -# from concurrent.futures import ThreadPoolExecutor, as_completed -# from datetime import datetime -# from pathlib import Path -# from typing import List, Optional, Tuple -# -# import requests -# from bs4 import BeautifulSoup -# from tqdm import tqdm -# -# try: -# from io import BytesIO -# -# from PIL import Image -# -# PILLOW_AVAILABLE = True -# except ImportError: -# PILLOW_AVAILABLE = False -# -# from scitex.logging import getLogger -# -# logger = getLogger(__name__) -# -# # Configuration -# DEFAULT_MIN_WIDTH = 400 -# DEFAULT_MIN_HEIGHT = 300 -# DEFAULT_TIMEOUT = 10 -# DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" -# -# -# def _get_default_download_dir() -> str: -# """Get default download directory using SCITEX_DIR if available.""" -# scitex_root = os.environ.get("SCITEX_DIR", os.path.expanduser("~/.scitex")) -# return os.path.join(scitex_root, "web", "downloads") -# -# -# def _normalize_url_for_directory(url: str) -> str: -# """Convert URL to a safe directory name.""" -# parsed = urllib.parse.urlparse(url) -# domain = parsed.netloc.replace("www.", "") -# path = parsed.path.strip("/").replace("/", "-") -# -# normalized = f"{domain}-{path}" if path else domain -# normalized = re.sub(r"[^\w\-.]", "-", normalized) -# normalized = re.sub(r"-+", "-", normalized) -# normalized = normalized[:100].strip("-") -# -# return normalized -# -# -# def _is_direct_image_url(url: str) -> bool: -# """Check if URL appears to be a direct image link.""" -# extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"] -# path = urllib.parse.urlparse(url.lower()).path -# return any(path.endswith(ext) for ext in extensions) -# -# -# def _extract_image_urls(url: str, same_domain: bool = False) -> List[str]: -# """Extract image URLs from a webpage.""" -# try: -# logger.info(f"Fetching page: {url}") -# response = requests.get( -# url, -# timeout=DEFAULT_TIMEOUT, -# headers={"User-Agent": DEFAULT_USER_AGENT}, -# ) -# response.raise_for_status() -# except requests.RequestException as e: -# logger.error(f"Failed to fetch page: {e}") -# return [] -# -# soup = BeautifulSoup(response.content, "html.parser") -# parsed_base = urllib.parse.urlparse(url) -# image_urls = set() -# -# for img in soup.find_all("img"): -# img_url = img.get("src") or img.get("data-src") -# if not img_url: -# continue -# -# img_url = urllib.parse.urljoin(url, img_url) -# -# if img_url.lower().endswith((".svg", ".svgz")): -# continue -# -# if same_domain: -# parsed_img = urllib.parse.urlparse(img_url) -# if parsed_img.netloc != parsed_base.netloc: -# continue -# -# image_urls.add(img_url) -# -# logger.info(f"Found {len(image_urls)} images on page") -# return list(image_urls) -# -# -# def _download_single_image( -# img_url: str, -# output_dir: Path, -# counter: int, -# min_size: Optional[Tuple[int, int]], -# ) -> Optional[str]: -# """Download a single image.""" -# try: -# response = requests.get( -# img_url, -# timeout=DEFAULT_TIMEOUT, -# headers={"User-Agent": DEFAULT_USER_AGENT}, -# ) -# response.raise_for_status() -# -# # Validate content-type -# content_type = response.headers.get("content-type", "") -# if not content_type.startswith("image/"): -# logger.debug(f"Skipping non-image: {content_type}") -# return None -# -# # Check dimensions -# if min_size and PILLOW_AVAILABLE: -# try: -# img = Image.open(BytesIO(response.content)) -# width, height = img.size -# if width < min_size[0] or height < min_size[1]: -# logger.debug( -# f"Skipping small image: {width}x{height} " -# f"(min: {min_size[0]}x{min_size[1]})" -# ) -# return None -# except Exception: -# pass -# -# # Determine extension -# ext = "jpg" -# if PILLOW_AVAILABLE: -# try: -# img = Image.open(BytesIO(response.content)) -# fmt = img.format.lower() if img.format else "jpeg" -# ext = "jpg" if fmt == "jpeg" else fmt -# except Exception: -# pass -# elif "png" in content_type: -# ext = "png" -# elif "gif" in content_type: -# ext = "gif" -# elif "webp" in content_type: -# ext = "webp" -# -# filename = f"{counter:04d}.{ext}" -# filepath = output_dir / filename -# -# with open(filepath, "wb") as f: -# f.write(response.content) -# -# logger.info(f"Downloaded: {filename}") -# return str(filepath) -# -# except Exception as e: -# logger.warning(f"Error downloading {img_url}: {e}") -# return None -# -# -# def download_images( -# url: str, -# output_dir: Optional[str] = None, -# min_size: Optional[Tuple[int, int]] = None, -# max_workers: int = 5, -# same_domain: bool = False, -# ) -> List[str]: -# """ -# Download images from a URL. -# -# Args: -# url: Webpage URL or direct image URL -# output_dir: Output directory (default: $SCITEX_DIR/web/downloads) -# min_size: Minimum (width, height) to filter small images (default: 400x300) -# max_workers: Concurrent download threads -# same_domain: Only download images from the same domain -# -# Returns: -# List of downloaded file paths -# -# Example: -# >>> paths = download_images("https://example.com") -# >>> paths = download_images("https://example.com/photo.jpg") -# >>> paths = download_images("https://example.com", min_size=(800, 600)) -# """ -# if not PILLOW_AVAILABLE: -# logger.warning("Pillow not available. Size filtering disabled.") -# min_size = None -# elif min_size is None: -# min_size = (DEFAULT_MIN_WIDTH, DEFAULT_MIN_HEIGHT) -# -# # Setup output directory -# if output_dir is None: -# output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR") -# if output_dir is None: -# output_dir = _get_default_download_dir() -# -# timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") -# normalized = _normalize_url_for_directory(url) -# output_path = Path(output_dir).expanduser() / f"{timestamp}-{normalized}-images" -# output_path.mkdir(parents=True, exist_ok=True) -# -# logger.info(f"Output directory: {output_path}") -# -# # Get image URLs -# if _is_direct_image_url(url): -# image_urls = [url] -# logger.info("Direct image URL detected") -# else: -# image_urls = _extract_image_urls(url, same_domain=same_domain) -# -# if not image_urls: -# logger.warning("No images found") -# return [] -# -# # Download concurrently -# downloaded = [] -# counter = [1] -# -# def download_with_counter(img_url: str) -> Optional[str]: -# idx = counter[0] -# counter[0] += 1 -# return _download_single_image(img_url, output_path, idx, min_size) -# -# with ThreadPoolExecutor(max_workers=max_workers) as executor: -# futures = {executor.submit(download_with_counter, u): u for u in image_urls} -# -# for future in tqdm( -# as_completed(futures), total=len(image_urls), desc="Downloading" -# ): -# result = future.result() -# if result: -# downloaded.append(result) -# -# logger.info(f"Downloaded {len(downloaded)} images to {output_path}") -# return downloaded -# -# -# def main(): -# """CLI entry point.""" -# import argparse -# -# parser = argparse.ArgumentParser( -# description="Download images from URL", -# formatter_class=argparse.RawDescriptionHelpFormatter, -# epilog=""" -# Examples: -# python -m scitex.web.download_images https://example.com -# python -m scitex.web.download_images https://example.com -o ./downloads -# python -m scitex.web.download_images https://example.com --min-size 800x600 -# python -m scitex.web.download_images https://example.com --no-min-size -# """, -# ) -# parser.add_argument("url", help="URL to download images from") -# parser.add_argument("-o", "--output", help="Output directory") -# parser.add_argument( -# "--min-size", -# default="400x300", -# help="Minimum size WIDTHxHEIGHT (default: 400x300)", -# ) -# parser.add_argument( -# "--no-min-size", -# action="store_true", -# help="Disable size filtering", -# ) -# parser.add_argument( -# "--same-domain", -# action="store_true", -# help="Only download from same domain", -# ) -# parser.add_argument( -# "--workers", -# type=int, -# default=5, -# help="Concurrent downloads (default: 5)", -# ) -# -# args = parser.parse_args() -# -# min_size = None -# if not args.no_min_size and args.min_size: -# w, h = map(int, args.min_size.split("x")) -# min_size = (w, h) -# -# paths = download_images( -# args.url, -# output_dir=args.output, -# min_size=min_size, -# max_workers=args.workers, -# same_domain=args.same_domain, -# ) -# -# print(f"\nDownloaded {len(paths)} images:") -# for p in paths: -# print(f" {p}") -# -# -# if __name__ == "__main__": -# main() - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/download_images.py -# --------------------------------------------------------------------------------