diff --git a/pyproject.toml b/pyproject.toml index bdfecd610..46616464e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -729,30 +729,8 @@ verify = [ # Web Module - Web utilities # Use: pip install scitex[web] -web = [ - "aiohttp", - "beautifulsoup4", - "readability-lxml", - "requests", - "Pillow", - "matplotlib", - "tqdm", - "joblib", - "scikit-learn", - "pytest-asyncio", - "ruamel.yaml", - "xarray", - "seaborn", - "scipy", - "markdown2", - "anthropic", - "openai", - "google-genai", - "groq", - # # Heavy dependencies handled by _AVAILABLE flags - # "torch", - # "umap-learn", -] +# Real implementation lives in the standalone scitex-web package. +web = ["scitex-web[readability]>=0.1.0"] # Clew Module - Hash-based verification for reproducible science (Ariadne's thread) # Use: pip install scitex[clew] diff --git a/src/scitex/web/__init__.py b/src/scitex/web/__init__.py index aa46bcc0e..01eb10cba 100755 --- a/src/scitex/web/__init__.py +++ b/src/scitex/web/__init__.py @@ -1,35 +1,20 @@ -#!/usr/bin/env python3 -"""Web-related utilities module for scitex.""" +"""SciTeX web — thin compatibility shim for scitex-web. -from ._scraping import get_image_urls, get_urls -from ._search_pubmed import ( - _fetch_details, - _get_citation, - _parse_abstract_xml, - _search_pubmed, -) -from ._search_pubmed import batch__fetch_details as _batch__fetch_details -from ._search_pubmed import fetch_async as _fetch_async -from ._search_pubmed import format_bibtex as _format_bibtex -from ._search_pubmed import get_crossref_metrics -from ._search_pubmed import parse_args as _parse_args -from ._search_pubmed import run_main as _run_main -from ._search_pubmed import save_bibtex as _save_bibtex -from ._search_pubmed import search_pubmed -from ._summarize_url import crawl_to_json, crawl_url -from ._summarize_url import extract_main_content as _extract_main_content -from ._summarize_url import summarize_all as _summarize_all -from ._summarize_url import summarize_url -from .download_images import download_images +Aliases ``scitex.web`` to the standalone ``scitex_web`` package via ``sys.modules``. +``scitex.web is scitex_web``. -__all__ = [ - # Public API - "search_pubmed", - "get_crossref_metrics", - "summarize_url", - "crawl_url", - "crawl_to_json", - "get_urls", - "download_images", - "get_image_urls", -] +Install: ``pip install scitex[web]`` (or ``pip install scitex-web``). +See: https://github.com/ywatanabe1989/scitex-web +""" + +import sys as _sys + +try: + import scitex_web as _real +except ImportError as _e: # pragma: no cover + raise ImportError( + "scitex.web requires the 'scitex-web' package. " + "Install with: pip install scitex[web] (or: pip install scitex-web)" + ) from _e + +_sys.modules[__name__] = _real diff --git a/src/scitex/web/_scraping.py b/src/scitex/web/_scraping.py deleted file mode 100755 index d97dc668a..000000000 --- a/src/scitex/web/_scraping.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python3 -# File: ./src/scitex/web/_scraping.py - -"""Web scraping utilities for extracting URLs. - -``bs4`` is an optional third-party dependency (only needed when actually -scraping). Do **not** import it at module load -- doing so leaks the -``ModuleNotFoundError`` through ``scitex.web.__init__`` and through -``scitex.cli.web``, which in turn breaks ``scitex --json`` and -``scitex --help-recursive`` on any install without ``beautifulsoup4``. -See ywatanabe1989/todo#279. The import now lives inside each scraping -function, so merely importing this module is side-effect-free. 
-""" - -import re -import urllib.parse -from typing import List, Optional, Set - -import requests - -from scitex.logging import getLogger - -logger = getLogger(__name__) - -DEFAULT_TIMEOUT = 10 -DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" - - -def get_urls( - url: str, - pattern: Optional[str] = None, - absolute: bool = True, - same_domain: bool = False, - include_external: bool = True, -) -> List[str]: - """ - Extract all URLs from a webpage. - - Args: - url: The URL of the webpage to scrape - pattern: Optional regex pattern to filter URLs (e.g., r'\\.pdf$' for PDF files) - absolute: If True, convert relative URLs to absolute URLs - same_domain: If True, only return URLs from the same domain - include_external: If True, include external links (only applies if same_domain=False) - - Returns: - List of URLs found on the page - - Example: - >>> urls = get_urls('https://example.com', pattern=r'\\.pdf$') - >>> urls = get_urls('https://example.com', same_domain=True) - """ - from bs4 import BeautifulSoup # lazy: see module docstring, todo#279 - - try: - logger.info(f"Fetching URLs from: {url}") - response = requests.get( - url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - except requests.RequestException as e: - logger.error(f"Failed to fetch URL {url}: {e}") - return [] - - soup = BeautifulSoup(response.text, "html.parser") - urls_found: Set[str] = set() - - parsed_base = urllib.parse.urlparse(url) - - for link in soup.find_all("a", href=True): - href = link["href"] - - if absolute: - href = urllib.parse.urljoin(url, href) - - if same_domain: - parsed_href = urllib.parse.urlparse(href) - if parsed_href.netloc != parsed_base.netloc: - continue - elif not include_external: - parsed_href = urllib.parse.urlparse(href) - if parsed_href.netloc and parsed_href.netloc != parsed_base.netloc: - continue - - if pattern and not re.search(pattern, href): - continue - - urls_found.add(href) - - result = sorted(list(urls_found)) - logger.info(f"Found {len(result)} URLs") - return result - - -def get_image_urls( - url: str, - pattern: Optional[str] = None, - same_domain: bool = False, -) -> List[str]: - """ - Extract all image URLs from a webpage without downloading them. 
- - Args: - url: The URL of the webpage to scrape - pattern: Optional regex pattern to filter image URLs - same_domain: If True, only return images from the same domain - - Returns: - List of image URLs found on the page - - Note: - - SVG files are automatically skipped (vector graphics) - - Checks both 'src' and 'data-src' attributes for lazy-loaded images - - Example: - >>> img_urls = get_image_urls('https://example.com') - >>> img_urls = get_image_urls('https://example.com', pattern=r'\\.png$') - """ - from bs4 import BeautifulSoup # lazy: see module docstring, todo#279 - - try: - logger.info(f"Fetching image URLs from: {url}") - response = requests.get( - url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - except requests.RequestException as e: - logger.error(f"Failed to fetch URL {url}: {e}") - return [] - - soup = BeautifulSoup(response.text, "html.parser") - image_urls: Set[str] = set() - - parsed_base = urllib.parse.urlparse(url) - - for img in soup.find_all("img"): - img_url = img.get("src") or img.get("data-src") - if not img_url: - continue - - img_url = urllib.parse.urljoin(url, img_url) - - if img_url.lower().endswith((".svg", ".svgz")): - continue - - if same_domain: - parsed_img = urllib.parse.urlparse(img_url) - if parsed_img.netloc != parsed_base.netloc: - continue - - if pattern and not re.search(pattern, img_url): - continue - - image_urls.add(img_url) - - result = sorted(list(image_urls)) - logger.info(f"Found {len(result)} image URLs") - return result diff --git a/src/scitex/web/_search_pubmed.py b/src/scitex/web/_search_pubmed.py deleted file mode 100755 index f41aa1fbd..000000000 --- a/src/scitex/web/_search_pubmed.py +++ /dev/null @@ -1,505 +0,0 @@ -#!/usr/bin/env python3 -# Time-stamp: "2024-11-13 14:30:43 (ywatanabe)" -# File: ./scitex_repo/src/scitex/web/_search_pubmed.py - -""" -1. Functionality: - - Searches PubMed database for scientific articles - - Retrieves detailed information about matched articles - - Displays article metadata including title, authors, journal, year, and abstract -2. Input: - - Search query string (e.g., "epilepsy prediction") - - Optional parameters for batch size and result limit -3. Output: - - Formatted article information displayed to stdout - - BibTeX file with official citations -4. Prerequisites: - - Internet connection - - requests package - - scitex package -""" - -"""Imports""" -import argparse -import asyncio -import xml.etree.ElementTree as ET -from typing import Any, Dict, List, Optional, Union - -import aiohttp -import requests - -import scitex - -"""Functions & Classes""" - - -def _search_pubmed(query: str, retmax: int = 300) -> Dict[str, Any]: - try: - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - search_url = f"{base_url}esearch.fcgi" - params = { - "db": "pubmed", - "term": query, - "retmax": retmax, - "retmode": "json", - "usehistory": "y", - } - - response = requests.get(search_url, params=params, timeout=10) - if not response.ok: - scitex.str.printc("PubMed API request failed", c="red") - return {} - return response.json() - except requests.exceptions.RequestException as e: - scitex.str.printc(f"Network error: {e}", c="red") - return {} - - -def _fetch_details( - webenv: str, query_key: str, retstart: int = 0, retmax: int = 100 -) -> Dict[str, Any]: - """Fetches detailed information including abstracts for articles. 
- - Parameters - ---------- - [Previous parameters remain the same] - - Returns - ------- - Dict[str, Any] - Dictionary containing article details and abstracts - """ - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - - # Fetch abstracts - efetch_url = f"{base_url}efetch.fcgi" - efetch_params = { - "db": "pubmed", - "query_key": query_key, - "WebEnv": webenv, - "retstart": retstart, - "retmax": retmax, - "retmode": "xml", - "rettype": "abstract", - "field": "abstract,mesh", - } - - abstract_response = requests.get(efetch_url, params=efetch_params) - - # Fetch metadata - fetch_url = f"{base_url}esummary.fcgi" - params = { - "db": "pubmed", - "query_key": query_key, - "WebEnv": webenv, - "retstart": retstart, - "retmax": retmax, - "retmode": "json", - } - - details_response = requests.get(fetch_url, params=params) - - if not all([abstract_response.ok, details_response.ok]): - # print(f"Error fetching data") - return {} - - return { - "abstracts": abstract_response.text, - "details": details_response.json(), - } - - -def _parse_abstract_xml(xml_text: str) -> Dict[str, tuple]: - """Parses XML response to extract abstracts. - - Parameters - ---------- - xml_text : str - XML response from PubMed - - Returns - ------- - Dict[str, str] - Dictionary mapping PMIDs to abstracts - """ - root = ET.fromstring(xml_text) - results = {} - - for article in root.findall(".//PubmedArticle"): - pmid = article.find(".//PMID").text - abstract_element = article.find(".//Abstract/AbstractText") - abstract = abstract_element.text if abstract_element is not None else "" - - # DOI - doi_element = article.find(".//ArticleId[@IdType='doi']") - doi = doi_element.text if doi_element is not None else "" - - # Get MeSH terms - keywords = [] - mesh_terms = article.findall(".//MeshHeading/DescriptorName") - keywords = [term.text for term in mesh_terms if term is not None] - - results[pmid] = (abstract, keywords, doi) - - return results - - -def _get_citation(pmid: str) -> str: - """Gets official citation in BibTeX format. 
- - Parameters - ---------- - pmid : str - PubMed ID - - Returns - ------- - str - Official BibTeX citation - """ - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - cite_url = f"{base_url}efetch.fcgi" - params = { - "db": "pubmed", - "id": pmid, - "rettype": "bibtex", - "retmode": "text", - } - response = requests.get(cite_url, params=params) - return response.text if response.ok else "" - - -def get_crossref_metrics( - doi: str, api_key: Optional[str] = None, email: Optional[str] = None -) -> Dict[str, Any]: - """Get article metrics from CrossRef using DOI.""" - import os - - base_url = "https://api.crossref.org/works/" - - # Use provided email or fallback to environment variables - if not email: - email = ( - os.getenv("SCITEX_SCHOLAR_CROSSREF_EMAIL") - or os.getenv("SCITEX_CROSSREF_EMAIL") - or os.getenv("SCITEX_SCHOLAR_PUBMED_EMAIL") - or os.getenv("SCITEX_PUBMED_EMAIL", "research@example.com") - ) - headers = {"User-Agent": f"SciTeX/1.0 (mailto:{email})"} - - # Add API key as query parameter if provided - params = {} - if api_key: - params["key"] = api_key - - try: - response = requests.get( - f"{base_url}{doi}", headers=headers, params=params, timeout=10 - ) - if response.ok: - data = response.json()["message"] - return { - "citations": data.get("is-referenced-by-count", 0), - "type": data.get("type", ""), - "publisher": data.get("publisher", ""), - "references": len(data.get("reference", [])), - "doi": data.get("DOI", ""), - } - except Exception as e: - print(f"CrossRef API error for DOI {doi}: {e}") - return {} - - -async def get_crossref_metrics_async( - doi: str, api_key: Optional[str] = None, email: Optional[str] = None -) -> Dict[str, Any]: - """Get article metrics from CrossRef using DOI (async version).""" - import os - - base_url = "https://api.crossref.org/works/" - - # Use provided email or fallback to environment variables - if not email: - email = ( - os.getenv("SCITEX_SCHOLAR_CROSSREF_EMAIL") - or os.getenv("SCITEX_CROSSREF_EMAIL") - or os.getenv("SCITEX_SCHOLAR_PUBMED_EMAIL") - or os.getenv("SCITEX_PUBMED_EMAIL", "research@example.com") - ) - headers = {"User-Agent": f"SciTeX/1.0 (mailto:{email})"} - - # Add API key as query parameter if provided - params = {} - if api_key: - params["key"] = api_key - - try: - async with aiohttp.ClientSession() as session: - async with session.get( - f"{base_url}{doi}", headers=headers, params=params, timeout=10 - ) as response: - if response.ok: - data = await response.json() - message = data["message"] - return { - "citations": message.get("is-referenced-by-count", 0), - "type": message.get("type", ""), - "publisher": message.get("publisher", ""), - "references": len(message.get("reference", [])), - "doi": message.get("DOI", ""), - } - except Exception as e: - print(f"CrossRef API error for DOI {doi}: {e}") - return {} - - -def save_bibtex( - papers: Dict[str, Any], abstracts: Dict[str, str], output_file: str -) -> None: - """Saves paper metadata as BibTeX file with abstracts. 
- - Parameters - ---------- - papers : Dict[str, Any] - Dictionary of paper metadata - abstracts : Dict[str, str] - Dictionary of PMIDs to abstracts - output_file : str - Output file path - """ - with open(output_file, "w", encoding="utf-8") as bibtex_file: - for pmid, paper in papers.items(): - if pmid == "uids": - continue - - citation = _get_citation(pmid) - if citation: - bibtex_file.write(citation) - else: - # Use default tuple if pmid not in abstracts - default_data = ("", [], "") # abstract, keywords, doi - bibtex_entry = format_bibtex( - paper, pmid, abstracts.get(pmid, default_data) - ) - bibtex_file.write(bibtex_entry + "\n") - scitex.str.printc(f"Saved to: {str(bibtex_file)}", c="yellow") - - -def format_bibtex(paper: Dict[str, Any], pmid: str, abstract_data: tuple) -> str: - abstract, keywords, doi = abstract_data - - # Get CrossRef and Scimago metrics - crossref_metrics = get_crossref_metrics(doi) if doi else {} - journal = paper.get("source", "Unknown Journal") - # journal_metrics = get_journal_metrics(journal) - - authors = paper.get("authors", [{"name": "Unknown"}]) - author_names = " and ".join(author["name"] for author in authors) - pubdate = paper.get("pubdate", "") - year = pubdate.split()[0] if pubdate.strip() else "" - title = paper.get("title", "No Title") - - # Name formatting - first_author = authors[0]["name"] - first_name = first_author.split()[0] - last_name = first_author.split()[-1] - clean_first_name = "".join(c for c in first_name if c.isalnum()) - clean_last_name = "".join(c for c in last_name if c.isalnum()) - - # Title words - title_words = title.split() - first_title_word = "".join(c.lower() for c in title_words[0] if c.isalnum()) - second_title_word = ( - "".join(c.lower() for c in title_words[1] if c.isalnum()) - if len(title_words) > 1 - else "" - ) - - citation_key = f"{clean_first_name}.{clean_last_name}_{year}_{first_title_word}_{second_title_word}" - - entry = f"""@article{{{citation_key}, - author = {{{author_names}}}, - title = {{{title}}}, - journal = {{{journal}}}, - year = {{{year}}}, - pmid = {{{pmid}}}, - doi = {{{doi}}}, - publisher = {{{crossref_metrics.get("publisher", "")}}}, - references = {{{crossref_metrics.get("references", 0)}}}, - keywords = {{{", ".join(keywords)}}}, - abstract = {{{abstract}}} -}} -""" - return entry - - -async def fetch_async( - session: aiohttp.ClientSession, url: str, params: Dict -) -> Union[Dict, str]: - """Asynchronous fetch helper.""" - async with session.get(url, params=params) as response: - if response.status == 200: - if params.get("retmode") == "xml": - return await response.text() - elif params.get("retmode") == "json": - return await response.json() - return await response.text() - return {} - - -async def batch__fetch_details(pmids: List[str], batch_size: int = 20) -> List[Dict]: - """Fetches details for multiple PMIDs concurrently. 
- - Parameters - ---------- - pmids : List[str] - List of PubMed IDs - batch_size : int, optional - Size of each batch for concurrent requests - - Returns - ------- - List[Dict] - List of response data - """ - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - - async with aiohttp.ClientSession() as session: - tasks = [] - for i in range(0, len(pmids), batch_size): - batch_pmids = pmids[i : i + batch_size] - - # Fetch both details and citations concurrently - efetch_params = { - "db": "pubmed", - "id": ",".join(batch_pmids), - "retmode": "xml", - "rettype": "abstract", - } - - esummary_params = { - "db": "pubmed", - "id": ",".join(batch_pmids), - "retmode": "json", - } - - tasks.append(fetch_async(session, f"{base_url}efetch.fcgi", efetch_params)) - tasks.append( - fetch_async(session, f"{base_url}esummary.fcgi", esummary_params) - ) - - results = await asyncio.gather(*tasks) - return results - - -def search_pubmed(query: str, n_entries: int = 10) -> int: - # query = args.query or "epilepsy prediction" - # print(f"Using query: {query}") - - search_results = _search_pubmed(query) - if not search_results: - # print("No results found or error occurred") - return 1 - - pmids = search_results["esearchresult"]["idlist"] - count = len(pmids) - # print(f"Found {count:,} results") - - output_file = f"pubmed_{query.replace(' ', '_')}.bib" - # print(f"Saving results to: {output_file}") - - # Process in larger batches asynchronously - results = asyncio.run(batch__fetch_details(pmids[:n_entries])) - # here, results seems long string - - # Process results and save - with open(output_file, "w", encoding="utf-8") as f: - for i in range(0, len(results), 2): - xml_response = results[i] - json_response = results[i + 1] - - if isinstance(xml_response, str): - abstracts = _parse_abstract_xml(xml_response) - if isinstance(json_response, dict) and "result" in json_response: - details = json_response["result"] - save_bibtex(details, abstracts, output_file) - - # Process results and save - temp_bibtex = [] - for i in range(0, len(results), 2): - xml_response = results[i] - json_response = results[i + 1] - - if isinstance(xml_response, str): - abstracts = _parse_abstract_xml(xml_response) - if isinstance(json_response, dict) and "result" in json_response: - details = json_response["result"] - for pmid in details: - if pmid != "uids": - citation = _get_citation(pmid) - if citation: - temp_bibtex.append(citation) - else: - entry = format_bibtex( - details[pmid], pmid, abstracts.get(pmid, "") - ) - temp_bibtex.append(entry) - - # Write all entries at once - with open(output_file, "w", encoding="utf-8") as f: - f.write("\n".join(temp_bibtex)) - - return 0 - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="PubMed article search and retrieval tool" - ) - parser.add_argument( - "--query", - "-q", - type=str, - help='Search query (default: "epilepsy prediction")', - ) - parser.add_argument( - "--n_entries", - "-n", - type=int, - default=10, - help='Search query (default: "epilepsy prediction")', - ) - args = parser.parse_args() - scitex.str.printc(args, c="yellow") - return args - - -def run_main() -> None: - global CONFIG - import sys - - import matplotlib.pyplot as plt - - import scitex - - CONFIG, sys.stdout, sys.stderr, plt, CC = scitex.session.start( - sys, - verbose=False, - ) - - args = parse_args() - exit_status = search_pubmed(args.query, args.n_entries) - - scitex.session.close( - CONFIG, - verbose=False, - notify=False, - message="", - 
exit_status=exit_status, - ) - - -if __name__ == "__main__": - run_main() - -# EOF diff --git a/src/scitex/web/_skills/SKILL.md b/src/scitex/web/_skills/SKILL.md deleted file mode 100644 index 7152595b7..000000000 --- a/src/scitex/web/_skills/SKILL.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -name: stx.web -description: Web utilities for PubMed search, URL scraping, content summarization, and image downloading. ---- - -# stx.web - -The `stx.web` module provides web utilities for scientific use cases: searching PubMed for papers, scraping URLs for content and images, summarizing web pages, and downloading images in bulk. - -## Python API - -```python -import scitex as stx - -# Search PubMed -papers = stx.web.search_pubmed("EEG deep learning classification", max_results=20) -metrics = stx.web.get_crossref_metrics(doi="10.1000/xyz123") - -# Summarize a URL -summary = stx.web.summarize_url("https://arxiv.org/abs/2401.00000") - -# Crawl URL for structured content -content = stx.web.crawl_url("https://example.com") -json_data = stx.web.crawl_to_json("https://example.com") - -# Scrape URLs and images from a page -urls = stx.web.get_urls("https://example.com") -image_urls = stx.web.get_image_urls("https://example.com") - -# Download images -stx.web.download_images( - urls=image_urls, - output_dir="./downloaded_images", - max_workers=5 -) -``` - -## Key Features - -- `search_pubmed(query, max_results)` — search PubMed and return structured paper data -- `get_crossref_metrics(doi)` — fetch citation counts and impact metrics from CrossRef -- `summarize_url(url)` — extract and summarize main content from a URL -- `crawl_url` / `crawl_to_json` — structured web crawling -- `get_urls` / `get_image_urls` — scrape links and images from pages -- `download_images(urls, output_dir)` — bulk image download with concurrency diff --git a/src/scitex/web/_skills/images.md b/src/scitex/web/_skills/images.md deleted file mode 100644 index ad994fb47..000000000 --- a/src/scitex/web/_skills/images.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -description: Bulk-download images from a web page with download_images() and collect all image URLs with get_image_urls(). ---- - -# Image Downloading - -## download_images - -Download all images found on a web page to a local directory. - -```python -download_images(url: str, output_dir: str = ".", extensions: list[str] | None = None) -> list[str] -``` - -Returns a list of local file paths for successfully downloaded images. - -```python -import scitex as stx - -saved = stx.web.download_images( - "https://example.com/gallery", - output_dir="./downloaded_images", - extensions=[".png", ".jpg"], -) -print(f"Downloaded {len(saved)} images") -``` - ---- - -## get_image_urls - -Collect all image URLs from a web page without downloading them. - -```python -get_image_urls(url: str) -> list[str] -``` - -```python -import scitex as stx - -img_urls = stx.web.get_image_urls("https://example.com/gallery") -print(img_urls[:3]) -``` diff --git a/src/scitex/web/_skills/pubmed.md b/src/scitex/web/_skills/pubmed.md deleted file mode 100644 index e61a1e8b5..000000000 --- a/src/scitex/web/_skills/pubmed.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -description: Search PubMed for papers matching a query with search_pubmed() and retrieve Crossref citation counts with get_crossref_metrics(). ---- - -# PubMed Search - -## search_pubmed - -Query PubMed and return structured results including abstracts, authors, and DOIs. 
- -```python -search_pubmed( - query: str, - max_results: int = 20, - email: str | None = None, -) -> list[dict] -``` - -```python -import scitex as stx - -papers = stx.web.search_pubmed("EEG epilepsy deep learning", max_results=10) -for p in papers: - print(p["title"], p.get("doi")) -``` - -Each result dict contains: `pmid`, `title`, `abstract`, `authors`, `journal`, `year`, `doi`. - ---- - -## get_crossref_metrics - -Retrieve citation count and journal impact factor for a DOI via the Crossref API. - -```python -get_crossref_metrics(doi: str) -> dict -``` - -```python -import scitex as stx - -metrics = stx.web.get_crossref_metrics("10.1038/s41586-021-03819-2") -print(metrics) -# {'cited_by': 523, 'journal': 'Nature', 'type': 'journal-article'} -``` diff --git a/src/scitex/web/_skills/url.md b/src/scitex/web/_skills/url.md deleted file mode 100644 index a8ddf881a..000000000 --- a/src/scitex/web/_skills/url.md +++ /dev/null @@ -1,71 +0,0 @@ ---- -description: Extract and summarize web page content with summarize_url(), crawl pages with crawl_url() and crawl_to_json(), and collect all hyperlinks with get_urls(). ---- - -# URL Utilities - -## summarize_url - -Fetch a URL and return a concise text summary of the main content. - -```python -summarize_url(url: str, max_length: int = 500) -> str -``` - -```python -import scitex as stx - -summary = stx.web.summarize_url("https://arxiv.org/abs/2301.12345") -print(summary) -``` - ---- - -## crawl_url - -Fetch the full main text content of a page. - -```python -crawl_url(url: str) -> str -``` - -```python -import scitex as stx - -content = stx.web.crawl_url("https://example.com/article") -print(content[:500]) -``` - ---- - -## crawl_to_json - -Fetch a page and return structured content as a dict. - -```python -crawl_to_json(url: str) -> dict -``` - -```python -import scitex as stx - -data = stx.web.crawl_to_json("https://example.com/article") -# Returns: {'title': ..., 'content': ..., 'links': [...], 'url': ...} -``` - ---- - -## get_urls - -Extract all hyperlinks from a web page. 
- -```python -get_urls(url: str) -> list[str] -``` - -```python -import scitex as stx - -links = stx.web.get_urls("https://example.com") -print(links[:5]) -``` diff --git a/src/scitex/web/_summarize_url.py b/src/scitex/web/_summarize_url.py deleted file mode 100755 index 5f191d95d..000000000 --- a/src/scitex/web/_summarize_url.py +++ /dev/null @@ -1,160 +0,0 @@ -#!./env/bin/python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-07-29 21:43:30 (ywatanabe)" -# ./src/scitex/web/_crawl.py - - -import json -import urllib.parse -from concurrent.futures import ThreadPoolExecutor, as_completed -from pprint import pprint - -import requests -from bs4 import BeautifulSoup -from tqdm import tqdm - -import scitex - -try: - from readability import Document -except ImportError: - try: - from readability.readability import Document - except ImportError: - Document = None - -import re - -# def crawl_url(url, max_depth=1): -# print("\nCrawling...") -# visited = set() -# to_visit = [(url, 0)] -# contents = {} - -# while to_visit: -# current_url, depth = to_visit.pop(0) -# if current_url in visited or depth > max_depth: -# continue - -# try: -# response = requests.get(current_url) -# if response.status_code == 200: -# visited.add(current_url) -# contents[current_url] = response.text -# soup = BeautifulSoup(response.text, "html.parser") - -# for link in soup.find_all("a", href=True): -# absolute_link = urllib.parse.urljoin( -# current_url, link["href"] -# ) -# if absolute_link not in visited: -# to_visit.append((absolute_link, depth + 1)) - -# except requests.RequestException: -# pass - -# return visited, contents - - -def extract_main_content(html): - if Document is None: - # Fallback: just strip HTML tags - content = re.sub("<[^<]+?>", "", html) - content = " ".join(content.split()) - return content[:5000] # Limit to first 5000 chars - - doc = Document(html) - content = doc.summary() - # Remove HTML tags - content = re.sub("<[^<]+?>", "", content) - # Remove extra whitespace - content = " ".join(content.split()) - return content - - -def crawl_url(url, max_depth=1): - print("\nCrawling...") - visited = set() - to_visit = [(url, 0)] - contents = {} - - while to_visit: - current_url, depth = to_visit.pop(0) - if current_url in visited or depth > max_depth: - continue - - try: - response = requests.get(current_url) - if response.status_code == 200: - visited.add(current_url) - main_content = extract_main_content(response.text) - contents[current_url] = main_content - soup = BeautifulSoup(response.text, "html.parser") - - for link in soup.find_all("a", href=True): - absolute_link = urllib.parse.urljoin(current_url, link["href"]) - if absolute_link not in visited: - to_visit.append((absolute_link, depth + 1)) - - except requests.RequestException: - pass - - return visited, contents - - -def crawl_to_json(start_url): - if not start_url.startswith("http"): - start_url = "https://" + start_url - crawled_urls, contents = crawl_url(start_url) - - print("\nSummalizing as json...") - - def process_url(url): - llm = scitex.ai.GenAI("gpt-4o-mini") - return { - "url": url, - "content": llm(f"Summarize this page in 1 line:\n\n{contents[url]}"), - } - - with ThreadPoolExecutor() as executor: - future_to_url = {executor.submit(process_url, url): url for url in crawled_urls} - crawled_pages = [] - for future in tqdm( - as_completed(future_to_url), - total=len(crawled_urls), - desc="Processing URLs", - ): - crawled_pages.append(future.result()) - - result = {"start_url": start_url, "crawled_pages": crawled_pages} - - return 
json.dumps(result, indent=2) - - -def summarize_all(json_contents): - llm = scitex.ai.GenAI("gpt-4o-mini") - out = llm(f"Summarize this json file with 5 bullet points:\n\n{json_contents}") - return out - - -def summarize_url(start_url): - json_result = crawl_to_json(start_url) - ground_summary = summarize_all(json_result) - - pprint(ground_summary) - return ground_summary, json_result - - -main = summarize_url - -if __name__ == "__main__": - import argparse - - import scitex - - parser = argparse.ArgumentParser(description="") - parser.add_argument("--url", "-u", type=str, help="(default: %(default)s)") - args = parser.parse_args() - scitex.gen.print_block(args, c="yellow") - - main(args.url) diff --git a/src/scitex/web/download_images.py b/src/scitex/web/download_images.py deleted file mode 100755 index b891eda90..000000000 --- a/src/scitex/web/download_images.py +++ /dev/null @@ -1,323 +0,0 @@ -#!/usr/bin/env python3 -# File: ./src/scitex/web/download_images.py - -""" -Image Downloader for SciTeX. - -Downloads images from URLs with minimum size filtering. - -Usage: - python -m scitex.web.download_images https://example.com - python -m scitex.web.download_images https://example.com -o ./downloads - python -m scitex.web.download_images https://example.com --min-size 800x600 -""" - -import os -import re -import urllib.parse -from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime -from pathlib import Path -from typing import List, Optional, Tuple - -import requests -from tqdm import tqdm - -# NOTE: ``bs4`` is imported lazily inside functions that actually use it. -# Importing at module load leaks ``ModuleNotFoundError`` through -# ``scitex.web.__init__`` and breaks ``scitex --json`` / -# ``scitex --help-recursive`` on installs without beautifulsoup4. -# See ywatanabe1989/todo#279. 
- -try: - from io import BytesIO - - from PIL import Image - - PILLOW_AVAILABLE = True -except ImportError: - PILLOW_AVAILABLE = False - -from scitex.logging import getLogger - -logger = getLogger(__name__) - -# Configuration -DEFAULT_MIN_WIDTH = 400 -DEFAULT_MIN_HEIGHT = 300 -DEFAULT_TIMEOUT = 10 -DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" - - -def _get_default_download_dir() -> str: - """Get default download directory using SCITEX_DIR if available.""" - scitex_root = os.environ.get("SCITEX_DIR", os.path.expanduser("~/.scitex")) - return os.path.join(scitex_root, "web", "downloads") - - -def _normalize_url_for_directory(url: str) -> str: - """Convert URL to a safe directory name.""" - parsed = urllib.parse.urlparse(url) - domain = parsed.netloc.replace("www.", "") - path = parsed.path.strip("/").replace("/", "-") - - normalized = f"{domain}-{path}" if path else domain - normalized = re.sub(r"[^\w\-.]", "-", normalized) - normalized = re.sub(r"-+", "-", normalized) - normalized = normalized[:100].strip("-") - - return normalized - - -def _is_direct_image_url(url: str) -> bool: - """Check if URL appears to be a direct image link.""" - extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"] - path = urllib.parse.urlparse(url.lower()).path - return any(path.endswith(ext) for ext in extensions) - - -def _extract_image_urls(url: str, same_domain: bool = False) -> List[str]: - """Extract image URLs from a webpage.""" - from bs4 import BeautifulSoup # lazy: see module note, todo#279 - - try: - logger.info(f"Fetching page: {url}") - response = requests.get( - url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - except requests.RequestException as e: - logger.error(f"Failed to fetch page: {e}") - return [] - - soup = BeautifulSoup(response.content, "html.parser") - parsed_base = urllib.parse.urlparse(url) - image_urls = set() - - for img in soup.find_all("img"): - img_url = img.get("src") or img.get("data-src") - if not img_url: - continue - - img_url = urllib.parse.urljoin(url, img_url) - - if img_url.lower().endswith((".svg", ".svgz")): - continue - - if same_domain: - parsed_img = urllib.parse.urlparse(img_url) - if parsed_img.netloc != parsed_base.netloc: - continue - - image_urls.add(img_url) - - logger.info(f"Found {len(image_urls)} images on page") - return list(image_urls) - - -def _download_single_image( - img_url: str, - output_dir: Path, - counter: int, - min_size: Optional[Tuple[int, int]], -) -> Optional[str]: - """Download a single image.""" - try: - response = requests.get( - img_url, - timeout=DEFAULT_TIMEOUT, - headers={"User-Agent": DEFAULT_USER_AGENT}, - ) - response.raise_for_status() - - # Validate content-type - content_type = response.headers.get("content-type", "") - if not content_type.startswith("image/"): - logger.debug(f"Skipping non-image: {content_type}") - return None - - # Check dimensions - if min_size and PILLOW_AVAILABLE: - try: - img = Image.open(BytesIO(response.content)) - width, height = img.size - if width < min_size[0] or height < min_size[1]: - logger.debug( - f"Skipping small image: {width}x{height} " - f"(min: {min_size[0]}x{min_size[1]})" - ) - return None - except Exception: - pass - - # Determine extension - ext = "jpg" - if PILLOW_AVAILABLE: - try: - img = Image.open(BytesIO(response.content)) - fmt = img.format.lower() if img.format else "jpeg" - ext = "jpg" if fmt == "jpeg" else fmt - except Exception: - pass - elif "png" in content_type: - 
ext = "png" - elif "gif" in content_type: - ext = "gif" - elif "webp" in content_type: - ext = "webp" - - filename = f"{counter:04d}.{ext}" - filepath = output_dir / filename - - with open(filepath, "wb") as f: - f.write(response.content) - - logger.info(f"Downloaded: {filename}") - return str(filepath) - - except Exception as e: - logger.warning(f"Error downloading {img_url}: {e}") - return None - - -def download_images( - url: str, - output_dir: Optional[str] = None, - min_size: Optional[Tuple[int, int]] = None, - max_workers: int = 5, - same_domain: bool = False, -) -> List[str]: - """ - Download images from a URL. - - Args: - url: Webpage URL or direct image URL - output_dir: Output directory (default: $SCITEX_DIR/web/downloads) - min_size: Minimum (width, height) to filter small images (default: 400x300) - max_workers: Concurrent download threads - same_domain: Only download images from the same domain - - Returns: - List of downloaded file paths - - Example: - >>> paths = download_images("https://example.com") - >>> paths = download_images("https://example.com/photo.jpg") - >>> paths = download_images("https://example.com", min_size=(800, 600)) - """ - if not PILLOW_AVAILABLE: - logger.warning("Pillow not available. Size filtering disabled.") - min_size = None - elif min_size is None: - min_size = (DEFAULT_MIN_WIDTH, DEFAULT_MIN_HEIGHT) - - # Setup output directory - if output_dir is None: - output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR") - if output_dir is None: - output_dir = _get_default_download_dir() - - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - normalized = _normalize_url_for_directory(url) - output_path = Path(output_dir).expanduser() / f"{timestamp}-{normalized}-images" - output_path.mkdir(parents=True, exist_ok=True) - - logger.info(f"Output directory: {output_path}") - - # Get image URLs - if _is_direct_image_url(url): - image_urls = [url] - logger.info("Direct image URL detected") - else: - image_urls = _extract_image_urls(url, same_domain=same_domain) - - if not image_urls: - logger.warning("No images found") - return [] - - # Download concurrently - downloaded = [] - counter = [1] - - def download_with_counter(img_url: str) -> Optional[str]: - idx = counter[0] - counter[0] += 1 - return _download_single_image(img_url, output_path, idx, min_size) - - with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = {executor.submit(download_with_counter, u): u for u in image_urls} - - for future in tqdm( - as_completed(futures), total=len(image_urls), desc="Downloading" - ): - result = future.result() - if result: - downloaded.append(result) - - logger.info(f"Downloaded {len(downloaded)} images to {output_path}") - return downloaded - - -def main(): - """CLI entry point.""" - import argparse - - parser = argparse.ArgumentParser( - description="Download images from URL", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - python -m scitex.web.download_images https://example.com - python -m scitex.web.download_images https://example.com -o ./downloads - python -m scitex.web.download_images https://example.com --min-size 800x600 - python -m scitex.web.download_images https://example.com --no-min-size - """, - ) - parser.add_argument("url", help="URL to download images from") - parser.add_argument("-o", "--output", help="Output directory") - parser.add_argument( - "--min-size", - default="400x300", - help="Minimum size WIDTHxHEIGHT (default: 400x300)", - ) - parser.add_argument( - "--no-min-size", - 
action="store_true", - help="Disable size filtering", - ) - parser.add_argument( - "--same-domain", - action="store_true", - help="Only download from same domain", - ) - parser.add_argument( - "--workers", - type=int, - default=5, - help="Concurrent downloads (default: 5)", - ) - - args = parser.parse_args() - - min_size = None - if not args.no_min_size and args.min_size: - w, h = map(int, args.min_size.split("x")) - min_size = (w, h) - - paths = download_images( - args.url, - output_dir=args.output, - min_size=min_size, - max_workers=args.workers, - same_domain=args.same_domain, - ) - - print(f"\nDownloaded {len(paths)} images:") - for p in paths: - print(f" {p}") - - -if __name__ == "__main__": - main() diff --git a/tests/scitex/web/test__scraping.py b/tests/scitex/web/test__scraping.py deleted file mode 100644 index 0534300db..000000000 --- a/tests/scitex/web/test__scraping.py +++ /dev/null @@ -1,712 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# File: ./tests/scitex/web/test__scraping.py - -""" -Tests for web scraping utilities. -""" - -import re -import shutil -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, Mock, mock_open, patch - -import pytest - - -class TestGetUrls: - """Test get_urls function.""" - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_basic(self, mock_get): - """Test basic URL extraction.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - -
- Link 1 - Link 2 - Link 3 - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com") - - assert len(urls) == 3 - assert "https://example.com/page1" in urls - assert "https://example.com/page2" in urls - assert "https://example.com/page3" in urls - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_with_pattern(self, mock_get): - """Test URL extraction with pattern filtering.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - PDF - HTML - Another PDF - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com", pattern=r"\.pdf$") - - assert len(urls) == 2 - assert all(url.endswith(".pdf") for url in urls) - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_same_domain(self, mock_get): - """Test URL extraction with same domain filter.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - Internal - External - Relative - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com", same_domain=True) - - assert len(urls) == 2 - assert all("example.com" in url for url in urls) - assert not any("other.com" in url for url in urls) - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_relative_urls(self, mock_get): - """Test conversion of relative URLs to absolute.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - Page 1 - Page 2 - Page 3 - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com/dir/", absolute=True) - - assert len(urls) == 3 - assert all(url.startswith("https://") for url in urls) - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_request_failure(self, mock_get): - """Test handling of request failures.""" - import requests - - from scitex.web import get_urls - - mock_get.side_effect = requests.RequestException("Network error") - - urls = get_urls("https://example.com") - - assert urls == [] - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_duplicate_removal(self, mock_get): - """Test that duplicate URLs are removed.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = """ - - - Link 1 - Link 1 again - Relative to same page - - - """ - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com") - - # Should only have one instance of page1 - assert len(urls) == 1 - - @patch("scitex.web._scraping.requests.get") - def test_get_urls_empty_page(self, mock_get): - """Test handling of page with no links.""" - from scitex.web import get_urls - - mock_response = Mock() - mock_response.text = "No links here" - mock_response.raise_for_status = Mock() - mock_get.return_value = mock_response - - urls = get_urls("https://example.com") - - assert urls == [] - - -class TestGetImageUrls: - """Test get_image_urls function.""" - - @patch("scitex.web._scraping.requests.get") - def test_get_image_urls_basic(self, mock_get): - """Test basic image URL extraction.""" - from scitex.web import get_image_urls - - mock_response = Mock() - mock_response.text = """ - - -
-
-
-
-
- """
- mock_response.raise_for_status = Mock()
- mock_get.return_value = mock_response
-
- img_urls = get_image_urls("https://example.com")
-
- assert len(img_urls) == 3
- assert "https://example.com/image1.jpg" in img_urls
- assert "https://example.com/images/image2.png" in img_urls
-
- @patch("scitex.web._scraping.requests.get")
- def test_get_image_urls_with_pattern(self, mock_get):
- """Test image URL extraction with pattern filtering."""
- from scitex.web import get_image_urls
-
- mock_response = Mock()
- mock_response.text = """
-
-
-
-
-
-
-
- """
- mock_response.raise_for_status = Mock()
- mock_get.return_value = mock_response
-
- img_urls = get_image_urls("https://example.com", pattern=r"\.jpg$")
-
- assert len(img_urls) == 2
- assert all(url.endswith(".jpg") for url in img_urls)
-
- @patch("scitex.web._scraping.requests.get")
- def test_get_image_urls_same_domain(self, mock_get):
- """Test image URL extraction with same domain filter."""
- from scitex.web import get_image_urls
-
- mock_response = Mock()
- mock_response.text = """
-
-
-
-
-
-
-
- """
- mock_response.raise_for_status = Mock()
- mock_get.return_value = mock_response
-
- img_urls = get_image_urls("https://example.com", same_domain=True)
-
- assert len(img_urls) == 2
- assert all("example.com" in url for url in img_urls)
-
- @patch("scitex.web._scraping.requests.get")
- def test_get_image_urls_request_failure(self, mock_get):
- """Test handling of request failures."""
- import requests
-
- from scitex.web import get_image_urls
-
- mock_get.side_effect = requests.RequestException("Network error")
-
- img_urls = get_image_urls("https://example.com")
-
- assert img_urls == []
-
- @patch("scitex.web._scraping.requests.get")
- def test_get_image_urls_no_images(self, mock_get):
- """Test handling of page with no images."""
- from scitex.web import get_image_urls
-
- mock_response = Mock()
- mock_response.text = "No images here"
- mock_response.raise_for_status = Mock()
- mock_get.return_value = mock_response
-
- img_urls = get_image_urls("https://example.com")
-
- assert img_urls == []
-
-
-class TestDownloadImages:
- """Test download_images function."""
-
- def setup_method(self):
- """Set up temporary directory for tests."""
- self.temp_dir = tempfile.mkdtemp()
-
- def teardown_method(self):
- """Clean up temporary directory after tests."""
- if Path(self.temp_dir).exists():
- shutil.rmtree(self.temp_dir)
-
- @patch("scitex.web._scraping.requests.get")
- def test_download_images_basic(self, mock_get):
- """Test basic image downloading."""
- from scitex.web import download_images
-
- # Mock page response
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- # Mock image responses
- img_response1 = Mock()
- img_response1.content = b"fake image data 1"
- img_response1.headers = {"content-type": "image/jpeg"}
- img_response1.raise_for_status = Mock()
-
- img_response2 = Mock()
- img_response2.content = b"fake image data 2"
- img_response2.headers = {"content-type": "image/png"}
- img_response2.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response1, img_response2]
-
- paths = download_images("https://example.com", output_dir=self.temp_dir)
-
- assert len(paths) == 2
- assert all(Path(p).exists() for p in paths)
-
- @patch("scitex.web._scraping.requests.get")
- def test_download_images_with_pattern(self, mock_get):
- """Test image downloading with pattern filter."""
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response = Mock()
- img_response.content = b"fake image data"
- img_response.headers = {"content-type": "image/jpeg"}
- img_response.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response]
-
- paths = download_images(
- "https://example.com", output_dir=self.temp_dir, pattern=r"\.jpg$"
- )
-
- assert len(paths) == 1
-
- @patch("scitex.web._scraping.requests.get")
- def test_download_images_duplicate_filenames(self, mock_get):
- """Test handling of duplicate filenames."""
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response = Mock()
- img_response.content = b"fake image data"
- img_response.headers = {"content-type": "image/jpeg"}
- img_response.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response, img_response]
-
- paths = download_images("https://example.com", output_dir=self.temp_dir)
-
- # Should have both images with different filenames
- assert len(paths) == 2
- assert len(set(paths)) == 2 # All paths are unique
-
- @patch("scitex.web._scraping.requests.get")
- def test_download_images_request_failure(self, mock_get):
- """Test handling of request failures."""
- import requests
-
- from scitex.web import download_images
-
- mock_get.side_effect = requests.RequestException("Network error")
-
- paths = download_images("https://example.com", output_dir=self.temp_dir)
-
- assert paths == []
-
- @patch("scitex.web._scraping.requests.get")
- def test_download_images_same_domain(self, mock_get):
- """Test downloading only images from same domain."""
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response = Mock()
- img_response.content = b"fake image data"
- img_response.headers = {"content-type": "image/jpeg"}
- img_response.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response]
-
- paths = download_images(
- "https://example.com", output_dir=self.temp_dir, same_domain=True
- )
-
- # Should only download the first image
- assert len(paths) == 1
-
- @patch("scitex.web._scraping.requests.get")
- @patch.dict("os.environ", {}, clear=True)
- def test_download_images_no_output_dir(self, mock_get):
- """Test default output directory creation using SCITEX_DIR."""
- import os
-
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response = Mock()
- img_response.content = b"fake image data"
- img_response.headers = {"content-type": "image/jpeg"}
- img_response.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response]
-
- # Set SCITEX_DIR to a temp location for testing
- test_scitex_dir = Path(self.temp_dir) / "scitex"
- os.environ["SCITEX_DIR"] = str(test_scitex_dir)
-
- paths = download_images("https://example.com")
-
- assert len(paths) == 1
- expected_dir = test_scitex_dir / "web" / "downloads"
- assert expected_dir.exists()
-
- @patch("scitex.web._scraping.requests.get")
- @patch.dict(
- "os.environ", {"SCITEX_WEB_DOWNLOADS_DIR": "/tmp/test_downloads"}, clear=True
- )
- def test_download_images_env_var_priority(self, mock_get):
- """Test that SCITEX_WEB_DOWNLOADS_DIR takes priority."""
- import os
-
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response = Mock()
- img_response.content = b"fake image data"
- img_response.headers = {"content-type": "image/jpeg"}
- img_response.raise_for_status = Mock()
-
- mock_get.side_effect = [page_response, img_response]
-
- # Set both env vars
- os.environ["SCITEX_DIR"] = "/tmp/scitex"
- os.environ["SCITEX_WEB_DOWNLOADS_DIR"] = self.temp_dir
-
- paths = download_images("https://example.com")
-
- # Should use SCITEX_WEB_DOWNLOADS_DIR, not SCITEX_DIR
- assert len(paths) == 1
- assert paths[0].startswith(self.temp_dir)
-
- @patch("scitex.web._scraping.requests.get")
- @patch("scitex.web._scraping.PILLOW_AVAILABLE", True)
- @patch("scitex.web._scraping.Image.open")
- def test_download_images_min_size_filter(self, mock_image_open, mock_get):
- """Test minimum size filtering."""
- from scitex.web import download_images
-
- page_response = Mock()
- page_response.text = """
-
-
-
-
-
-
- """
- page_response.raise_for_status = Mock()
-
- img_response_small = Mock()
- img_response_small.content = b"small image"
- img_response_small.headers = {"content-type": "image/jpeg"}
- img_response_small.raise_for_status = Mock()
-
- img_response_large = Mock()
- img_response_large.content = b"large image"
- img_response_large.headers = {"content-type": "image/jpeg"}
- img_response_large.raise_for_status = Mock()
-
- # Mock image sizes
- small_img = Mock()
- small_img.size = (50, 50)
- large_img = Mock()
- large_img.size = (500, 500)
-
- mock_image_open.side_effect = [small_img, large_img]
- mock_get.side_effect = [page_response, img_response_small, img_response_large]
-
- paths = download_images(
- "https://example.com", output_dir=self.temp_dir, min_size=(100, 100)
- )
-
- # Only the large image should be downloaded
- assert len(paths) == 1
-
-
-class TestScrapingModuleImport:
- """Test that scraping functions are properly exported."""
-
- def test_scraping_functions_available(self):
- """Test that all scraping functions are available."""
- import scitex.web
-
- assert hasattr(scitex.web, "get_urls")
- assert hasattr(scitex.web, "download_images")
- assert hasattr(scitex.web, "get_image_urls")
-
- assert callable(scitex.web.get_urls)
- assert callable(scitex.web.download_images)
- assert callable(scitex.web.get_image_urls)
-
-
-if __name__ == "__main__":
- import os
-
- import pytest
-
- pytest.main([os.path.abspath(__file__)])
-
-# --------------------------------------------------------------------------------
-# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_scraping.py
-# --------------------------------------------------------------------------------
-# #!/usr/bin/env python3
-# # File: ./src/scitex/web/_scraping.py
-#
-# """Web scraping utilities for extracting URLs."""
-#
-# import re
-# import urllib.parse
-# from typing import List, Optional, Set
-#
-# import requests
-# from bs4 import BeautifulSoup
-#
-# from scitex.logging import getLogger
-#
-# logger = getLogger(__name__)
-#
-# DEFAULT_TIMEOUT = 10
-# DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-#
-#
-# def get_urls(
-# url: str,
-# pattern: Optional[str] = None,
-# absolute: bool = True,
-# same_domain: bool = False,
-# include_external: bool = True,
-# ) -> List[str]:
-# """
-# Extract all URLs from a webpage.
-#
-# Args:
-# url: The URL of the webpage to scrape
-# pattern: Optional regex pattern to filter URLs (e.g., r'\\.pdf$' for PDF files)
-# absolute: If True, convert relative URLs to absolute URLs
-# same_domain: If True, only return URLs from the same domain
-# include_external: If True, include external links (only applies if same_domain=False)
-#
-# Returns:
-# List of URLs found on the page
-#
-# Example:
-# >>> urls = get_urls('https://example.com', pattern=r'\\.pdf$')
-# >>> urls = get_urls('https://example.com', same_domain=True)
-# """
-# try:
-# logger.info(f"Fetching URLs from: {url}")
-# response = requests.get(
-# url,
-# timeout=DEFAULT_TIMEOUT,
-# headers={"User-Agent": DEFAULT_USER_AGENT},
-# )
-# response.raise_for_status()
-# except requests.RequestException as e:
-# logger.error(f"Failed to fetch URL {url}: {e}")
-# return []
-#
-# soup = BeautifulSoup(response.text, "html.parser")
-# urls_found: Set[str] = set()
-#
-# parsed_base = urllib.parse.urlparse(url)
-#
-# for link in soup.find_all("a", href=True):
-# href = link["href"]
-#
-# if absolute:
-# href = urllib.parse.urljoin(url, href)
-#
-# if same_domain:
-# parsed_href = urllib.parse.urlparse(href)
-# if parsed_href.netloc != parsed_base.netloc:
-# continue
-# elif not include_external:
-# parsed_href = urllib.parse.urlparse(href)
-# if parsed_href.netloc and parsed_href.netloc != parsed_base.netloc:
-# continue
-#
-# if pattern and not re.search(pattern, href):
-# continue
-#
-# urls_found.add(href)
-#
-# result = sorted(list(urls_found))
-# logger.info(f"Found {len(result)} URLs")
-# return result
-#
-#
-# def get_image_urls(
-# url: str,
-# pattern: Optional[str] = None,
-# same_domain: bool = False,
-# ) -> List[str]:
-# """
-# Extract all image URLs from a webpage without downloading them.
-#
-# Args:
-# url: The URL of the webpage to scrape
-# pattern: Optional regex pattern to filter image URLs
-# same_domain: If True, only return images from the same domain
-#
-# Returns:
-# List of image URLs found on the page
-#
-# Note:
-# - SVG files are automatically skipped (vector graphics)
-# - Checks both 'src' and 'data-src' attributes for lazy-loaded images
-#
-# Example:
-# >>> img_urls = get_image_urls('https://example.com')
-# >>> img_urls = get_image_urls('https://example.com', pattern=r'\\.png$')
-# """
-# try:
-# logger.info(f"Fetching image URLs from: {url}")
-# response = requests.get(
-# url,
-# timeout=DEFAULT_TIMEOUT,
-# headers={"User-Agent": DEFAULT_USER_AGENT},
-# )
-# response.raise_for_status()
-# except requests.RequestException as e:
-# logger.error(f"Failed to fetch URL {url}: {e}")
-# return []
-#
-# soup = BeautifulSoup(response.text, "html.parser")
-# image_urls: Set[str] = set()
-#
-# parsed_base = urllib.parse.urlparse(url)
-#
-# for img in soup.find_all("img"):
-# img_url = img.get("src") or img.get("data-src")
-# if not img_url:
-# continue
-#
-# img_url = urllib.parse.urljoin(url, img_url)
-#
-# if img_url.lower().endswith((".svg", ".svgz")):
-# continue
-#
-# if same_domain:
-# parsed_img = urllib.parse.urlparse(img_url)
-# if parsed_img.netloc != parsed_base.netloc:
-# continue
-#
-# if pattern and not re.search(pattern, img_url):
-# continue
-#
-# image_urls.add(img_url)
-#
-# result = sorted(list(image_urls))
-# logger.info(f"Found {len(result)} image URLs")
-# return result
-
-# --------------------------------------------------------------------------------
-# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_scraping.py
-# --------------------------------------------------------------------------------
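# --------------------------------------------------------------------------------
# Note: usage sketch for the removed scraping helpers (not part of the patch)
# --------------------------------------------------------------------------------
# After this change, get_urls()/get_image_urls() are provided by the standalone
# scitex-web package; the sys.modules shim in src/scitex/web/__init__.py keeps the
# old import path working. A minimal sketch, assuming scitex-web preserves the
# signatures shown in the deleted module above (unverified against the new package):
#
# from scitex.web import get_urls, get_image_urls  # resolves to scitex_web via the shim
#
# # Collect PDF links, restricted to the page's own domain.
# pdf_links = get_urls("https://example.com", pattern=r"\.pdf$", same_domain=True)
#
# # Collect PNG image URLs; both src and data-src (lazy-loaded) are checked.
# png_urls = get_image_urls("https://example.com", pattern=r"\.png$")
#
# print(len(pdf_links), len(png_urls))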
diff --git a/tests/scitex/web/test__search_pubmed.py b/tests/scitex/web/test__search_pubmed.py
deleted file mode 100755
index bf91741b5..000000000
--- a/tests/scitex/web/test__search_pubmed.py
+++ /dev/null
@@ -1,1170 +0,0 @@
-#!/usr/bin/env python3
-# Time-stamp: "2024-11-08 05:50:57 (ywatanabe)"
-# File: ./scitex_repo/tests/scitex/web/test__search_pubmed.py
-
-"""
-Tests for PubMed search functionality.
-"""
-
-import pytest
-
-aiohttp = pytest.importorskip("aiohttp")
-pytest.importorskip("scitex.web.search_pubmed")
-
-import asyncio # noqa: F401, E402
-import json # noqa: F401, E402
-import xml.etree.ElementTree as ET # noqa: F401, E402
-from io import StringIO # noqa: F401, E402
-from unittest.mock import MagicMock, Mock, mock_open, patch # noqa: E402
-
-try:
- from scitex.web import (
- _fetch_details,
- _get_citation,
- _parse_abstract_xml,
- _search_pubmed,
- batch__fetch_details,
- fetch_async,
- format_bibtex,
- get_crossref_metrics,
- parse_args,
- run_main,
- save_bibtex,
- search_pubmed,
- )
-except ImportError:
- pytest.skip("scitex.web.search_pubmed not available", allow_module_level=True)
-
-
-class TestSearchPubmed:
- """Test _search_pubmed function."""
-
- def test_search_pubmed_success(self):
- """Test successful PubMed search."""
- mock_response = Mock()
- mock_response.ok = True
- mock_response.json.return_value = {
- "esearchresult": {"idlist": ["12345", "67890"], "count": "2"}
- }
-
- with patch("requests.get", return_value=mock_response):
- result = _search_pubmed("test query", retmax=10)
- assert result == mock_response.json.return_value
- assert len(result["esearchresult"]["idlist"]) == 2
-
- def test_search_pubmed_failure(self):
- """Test failed PubMed search."""
- mock_response = Mock()
- mock_response.ok = False
-
- with patch("requests.get", return_value=mock_response):
- with patch("scitex.str.printc") as mock_print:
- result = _search_pubmed("test query")
- assert result == {}
- mock_print.assert_called_once()
-
- def test_search_pubmed_network_error(self):
- """Test network error during search."""
- import requests
-
- with patch(
- "requests.get",
- side_effect=requests.exceptions.RequestException("Network error"),
- ):
- with patch("scitex.str.printc") as mock_print:
- result = _search_pubmed("test query")
- assert result == {}
- mock_print.assert_called_once()
-
- def test_search_pubmed_parameters(self):
- """Test search parameters are correctly passed."""
- mock_response = Mock()
- mock_response.ok = True
- mock_response.json.return_value = {"esearchresult": {}}
-
- with patch("requests.get", return_value=mock_response) as mock_get:
- _search_pubmed("epilepsy", retmax=500)
-
- # Check that correct parameters were passed
- args, kwargs = mock_get.call_args
- assert kwargs["params"]["term"] == "epilepsy"
- assert kwargs["params"]["retmax"] == 500
- assert kwargs["params"]["db"] == "pubmed"
-
-
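# --------------------------------------------------------------------------------
# Note: the request shape pinned down by TestSearchPubmed above (not part of the patch)
# --------------------------------------------------------------------------------
# test_search_pubmed_parameters asserts the NCBI ESearch query that _search_pubmed
# builds: db="pubmed", term=<query>, retmax=<max results>. A minimal standalone
# sketch of that call; retmode=json is an assumption, since the deleted helper's
# parameters beyond the three asserted ones are not shown here:
#
# import requests
#
# resp = requests.get(
#     "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
#     params={"db": "pubmed", "term": "epilepsy", "retmax": 500, "retmode": "json"},
#     timeout=10,
# )
# ids = resp.json()["esearchresult"]["idlist"]  # PMIDs matching the query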
-class TestFetchDetails:
- """Test _fetch_details function."""
-
- def test_fetch_details_success(self):
- """Test successful fetch of article details."""
- mock_abstract_response = Mock()
- mock_abstract_response.ok = True
diff --git a/tests/scitex/web/test__summarize_url.py b/tests/scitex/web/test__summarize_url.py
deleted file mode 100755
--- a/tests/scitex/web/test__summarize_url.py
+++ /dev/null
-class TestExtractMainContent:
-    """Test extract_main_content function."""
-
-    def test_extract_main_content_with_readability(self):
-        """Test content extraction with readability available."""
-        html_content = (
-            "<html><body><h1>Main Title</h1>"
-            "<p>This is the main content.</p></body></html>"
-        )
-        mock_doc = Mock()
-        mock_doc.summary.return_value = (
-            "<h1>Main Title</h1><p>This is the main content.</p>"
-        )
-
-        with patch("scitex.web._summarize_url.Document", return_value=mock_doc):
-            result = extract_main_content(html_content)
-            assert "Main Title" in result
-            assert "This is the main content" in result
-            assert "<" not in result  # HTML tags removed
-
-    def test_extract_main_content_without_readability(self):
-        """Test content extraction when readability is not available."""
-        html_content = "<html><body><p>Test content</p></body></html>"
" - - with patch("scitex.web._summarize_url.Document", None): - result = extract_main_content(html_content) - assert result == "Test content"[:5000] # Limited to 5000 chars - - def test_extract_main_content_complex_html(self): - """Test extraction with complex HTML.""" - html_content = """ - -Real content with spaces
- - - - """ - - mock_doc = Mock() - mock_doc.summary.return_value = "Real content with spaces
" - - with patch("scitex.web._summarize_url.Document", return_value=mock_doc): - result = extract_main_content(html_content) - assert result == "Real content with spaces" # Extra spaces removed - - def test_extract_main_content_empty_html(self): - """Test extraction with empty HTML.""" - with patch("scitex.web._summarize_url.Document", None): - result = extract_main_content("") - assert result == "" - - def test_extract_main_content_no_tags(self): - """Test extraction with plain text.""" - plain_text = "Just plain text without HTML" - - with patch("scitex.web._summarize_url.Document", None): - result = extract_main_content(plain_text) - assert result == plain_text - - -class TestCrawlUrl: - """Test crawl_url function.""" - - def test_crawl_url_single_page(self): - """Test crawling a single page.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = "Test content
" - - with patch("requests.get", return_value=mock_response): - with patch( - "scitex.web._summarize_url.extract_main_content", - return_value="Test content", - ): - visited, contents = crawl_url("http://test.com", max_depth=0) - - assert "http://test.com" in visited - assert contents["http://test.com"] == "Test content" - assert len(visited) == 1 - - def test_crawl_url_with_links(self): - """Test crawling with links to follow.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = """ - -Main page
- Link to page 2 - Link to page 3 - - """ - - with patch("requests.get", return_value=mock_response): - with patch( - "scitex.web._summarize_url.extract_main_content", return_value="Content" - ): - visited, contents = crawl_url("http://test.com", max_depth=1) - - # Should visit main page and try to visit linked pages - assert "http://test.com" in visited - - def test_crawl_url_max_depth(self): - """Test that max_depth is respected.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = 'Link' - - with patch("requests.get", return_value=mock_response): - with patch( - "scitex.web._summarize_url.extract_main_content", return_value="Content" - ): - visited, contents = crawl_url("http://test.com", max_depth=0) - - # Should only visit the initial URL with max_depth=0 - assert len(visited) == 1 - assert "http://test.com" in visited - - def test_crawl_url_request_exception(self): - """Test handling of request exceptions.""" - import requests - - with patch( - "requests.get", side_effect=requests.RequestException("Network error") - ): - visited, contents = crawl_url("http://test.com") - - assert len(visited) == 0 - assert len(contents) == 0 - - def test_crawl_url_non_200_status(self): - """Test handling of non-200 status codes.""" - mock_response = Mock() - mock_response.status_code = 404 - - with patch("requests.get", return_value=mock_response): - visited, contents = crawl_url("http://test.com") - - assert len(visited) == 0 - assert len(contents) == 0 - - def test_crawl_url_avoid_duplicate_visits(self): - """Test that URLs are not visited twice.""" - mock_response = Mock() - mock_response.status_code = 200 - # Use exact same URL to test duplicate avoidance - mock_response.text = 'Home' - - call_count = 0 - - def mock_get(*args, **kwargs): - nonlocal call_count - call_count += 1 - return mock_response - - with patch("requests.get", side_effect=mock_get): - with patch( - "scitex.web._summarize_url.extract_main_content", return_value="Content" - ): - visited, contents = crawl_url("http://test.com", max_depth=1) - - # Should only call once despite self-referential link to exact same URL - assert call_count == 1 - - -class TestCrawlToJson: - """Test crawl_to_json function.""" - - def test_crawl_to_json_basic(self): - """Test basic JSON conversion.""" - mock_urls = {"http://test.com"} - mock_contents = {"http://test.com": "Test page content"} - - with patch( - "scitex.web._summarize_url.crawl_url", - return_value=(mock_urls, mock_contents), - ): - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.return_value = "Summary of test page" - mock_genai.return_value = mock_llm - - # Mock ThreadPoolExecutor - mock_future = Mock(spec=Future) - mock_future.result.return_value = { - "url": "http://test.com", - "content": "Summary of test page", - } - - with patch("concurrent.futures.ThreadPoolExecutor") as mock_executor: - mock_executor.return_value.__enter__.return_value.submit.return_value = ( - mock_future - ) - with patch( - "concurrent.futures.as_completed", return_value=[mock_future] - ): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("test.com") - - parsed = json.loads(result) - assert parsed["start_url"] == "https://test.com" - assert len(parsed["crawled_pages"]) == 1 - assert ( - parsed["crawled_pages"][0]["url"] == "http://test.com" - ) - - def test_crawl_to_json_url_normalization(self): - """Test URL normalization (adding https://).""" - with patch("scitex.web._summarize_url.crawl_url", return_value=(set(), 
{})): - with patch("concurrent.futures.ThreadPoolExecutor"): - with patch("concurrent.futures.as_completed", return_value=[]): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("example.com") - parsed = json.loads(result) - assert parsed["start_url"] == "https://example.com" - - def test_crawl_to_json_already_has_protocol(self): - """Test URL with existing protocol.""" - with patch("scitex.web._summarize_url.crawl_url", return_value=(set(), {})): - with patch("concurrent.futures.ThreadPoolExecutor"): - with patch("concurrent.futures.as_completed", return_value=[]): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("http://example.com") - parsed = json.loads(result) - assert parsed["start_url"] == "http://example.com" - - def test_crawl_to_json_multiple_pages(self): - """Test JSON conversion with multiple pages.""" - mock_urls = {"http://test.com", "http://test.com/page2"} - mock_contents = { - "http://test.com": "Main content", - "http://test.com/page2": "Page 2 content", - } - - with patch( - "scitex.web._summarize_url.crawl_url", - return_value=(mock_urls, mock_contents), - ): - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.side_effect = ["Summary 1", "Summary 2"] - mock_genai.return_value = mock_llm - - # Create futures for each URL - futures = [] - for i, url in enumerate(mock_urls): - mock_future = Mock(spec=Future) - mock_future.result.return_value = { - "url": url, - "content": f"Summary {i + 1}", - } - futures.append(mock_future) - - with patch("concurrent.futures.ThreadPoolExecutor") as mock_executor: - mock_executor.return_value.__enter__.return_value.submit.side_effect = ( - futures - ) - with patch("concurrent.futures.as_completed", return_value=futures): - with patch("tqdm.tqdm", side_effect=lambda x, **kwargs: x): - result = crawl_to_json("test.com") - - parsed = json.loads(result) - assert len(parsed["crawled_pages"]) == 2 - - -class TestSummarizeAll: - """Test summarize_all function.""" - - def test_summarize_all_basic(self): - """Test basic summarization.""" - json_content = json.dumps( - { - "start_url": "http://test.com", - "crawled_pages": [ - {"url": "http://test.com", "content": "Test summary"} - ], - } - ) - - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.return_value = ( - "• Point 1\n• Point 2\n• Point 3\n• Point 4\n• Point 5" - ) - mock_genai.return_value = mock_llm - - result = summarize_all(json_content) - - assert "Point 1" in result - assert "Point 5" in result - mock_llm.assert_called_once() - - # Check that the prompt includes the JSON content - call_args = mock_llm.call_args[0][0] - assert "5 bullet points" in call_args - assert json_content in call_args - - def test_summarize_all_empty_json(self): - """Test summarization with empty JSON.""" - empty_json = json.dumps({"start_url": "", "crawled_pages": []}) - - with patch("scitex.ai.GenAI") as mock_genai: - mock_llm = Mock() - mock_llm.return_value = "No content to summarize" - mock_genai.return_value = mock_llm - - result = summarize_all(empty_json) - assert result == "No content to summarize" - - -class TestSummarizeUrl: - """Test summarize_url function.""" - - def test_summarize_url_complete_flow(self): - """Test complete URL summarization flow.""" - mock_json = json.dumps( - { - "start_url": "https://test.com", - "crawled_pages": [ - {"url": "https://test.com", "content": "Page summary"} - ], - } - ) - mock_summary = "• Summary point 1\n• Summary point 2" - - with 
patch("scitex.web._summarize_url.crawl_to_json", return_value=mock_json): - with patch( - "scitex.web._summarize_url.summarize_all", return_value=mock_summary - ): - with patch("builtins.print"): # Suppress pprint output - ground_summary, json_result = summarize_url("test.com") - - assert ground_summary == mock_summary - assert json_result == mock_json - - def test_summarize_url_error_handling(self): - """Test error handling in summarize_url.""" - with patch( - "scitex.web._summarize_url.crawl_to_json", - side_effect=Exception("Crawl error"), - ): - with pytest.raises(Exception) as exc_info: - summarize_url("test.com") - assert str(exc_info.value) == "Crawl error" - - def test_summarize_url_pprint_called(self): - """Test that pprint is called with the summary.""" - mock_json = '{"test": "data"}' - mock_summary = "Test summary" - - with patch("scitex.web._summarize_url.crawl_to_json", return_value=mock_json): - with patch( - "scitex.web._summarize_url.summarize_all", return_value=mock_summary - ): - # pprint is imported as 'from pprint import pprint' in the module - with patch("scitex.web._summarize_url.pprint") as mock_pprint: - summarize_url("test.com") - mock_pprint.assert_called_once_with(mock_summary) - - -class TestMain: - """Test main function and module alias.""" - - def test_main_is_summarize_url(self): - """Test that main is an alias for summarize_url.""" - assert main == summarize_url - - def test_main_execution(self): - """Test main function execution returns expected result structure.""" - mock_json = '{"test": "data"}' - mock_summary = "Test summary" - - # main is the same function as summarize_url, so we patch the inner calls - with patch("scitex.web._summarize_url.crawl_to_json", return_value=mock_json): - with patch( - "scitex.web._summarize_url.summarize_all", return_value=mock_summary - ): - with patch("scitex.web._summarize_url.pprint"): - result = main("http://example.com") - assert result[0] == mock_summary - assert result[1] == mock_json - - def test_script_execution(self): - """Test script execution with arguments.""" - import argparse - - with patch("sys.argv", ["script.py", "--url", "http://example.com"]): - # Import and execute the argument parsing similar to __main__ block - parser = argparse.ArgumentParser(description="") - parser.add_argument("--url", "-u", type=str, help="(default: %(default)s)") - args = parser.parse_args() - - assert args.url == "http://example.com" - - def test_readability_import_fallback(self): - """Test readability import fallback mechanism.""" - # This tests the import logic in the actual module - # The module tries to import from 'readability' first, then 'readability.readability' - import sys - - # Test when both imports fail - with patch.dict( - "sys.modules", {"readability": None, "readability.readability": None} - ): - # Re-import the module to trigger the import logic - if "scitex.web._summarize_url" in sys.modules: - del sys.modules["scitex.web._summarize_url"] - - # This should set Document to None - from scitex.web import _summarize_url # noqa: F401 - - # The Document variable should be None when imports fail - # (This is handled in the actual module's import section) - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_summarize_url.py -# 
-------------------------------------------------------------------------------- -# #!./env/bin/python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-07-29 21:43:30 (ywatanabe)" -# # ./src/scitex/web/_crawl.py -# -# -# import requests -# from bs4 import BeautifulSoup -# import urllib.parse -# from concurrent.futures import ThreadPoolExecutor, as_completed -# import json -# from tqdm import tqdm -# import scitex -# from pprint import pprint -# -# try: -# from readability import Document -# except ImportError: -# try: -# from readability.readability import Document -# except ImportError: -# Document = None -# -# import re -# -# -# # def crawl_url(url, max_depth=1): -# # print("\nCrawling...") -# # visited = set() -# # to_visit = [(url, 0)] -# # contents = {} -# -# # while to_visit: -# # current_url, depth = to_visit.pop(0) -# # if current_url in visited or depth > max_depth: -# # continue -# -# # try: -# # response = requests.get(current_url) -# # if response.status_code == 200: -# # visited.add(current_url) -# # contents[current_url] = response.text -# # soup = BeautifulSoup(response.text, "html.parser") -# -# # for link in soup.find_all("a", href=True): -# # absolute_link = urllib.parse.urljoin( -# # current_url, link["href"] -# # ) -# # if absolute_link not in visited: -# # to_visit.append((absolute_link, depth + 1)) -# -# # except requests.RequestException: -# # pass -# -# # return visited, contents -# -# -# def extract_main_content(html): -# if Document is None: -# # Fallback: just strip HTML tags -# content = re.sub("<[^<]+?>", "", html) -# content = " ".join(content.split()) -# return content[:5000] # Limit to first 5000 chars -# -# doc = Document(html) -# content = doc.summary() -# # Remove HTML tags -# content = re.sub("<[^<]+?>", "", content) -# # Remove extra whitespace -# content = " ".join(content.split()) -# return content -# -# -# def crawl_url(url, max_depth=1): -# print("\nCrawling...") -# visited = set() -# to_visit = [(url, 0)] -# contents = {} -# -# while to_visit: -# current_url, depth = to_visit.pop(0) -# if current_url in visited or depth > max_depth: -# continue -# -# try: -# response = requests.get(current_url) -# if response.status_code == 200: -# visited.add(current_url) -# main_content = extract_main_content(response.text) -# contents[current_url] = main_content -# soup = BeautifulSoup(response.text, "html.parser") -# -# for link in soup.find_all("a", href=True): -# absolute_link = urllib.parse.urljoin(current_url, link["href"]) -# if absolute_link not in visited: -# to_visit.append((absolute_link, depth + 1)) -# -# except requests.RequestException: -# pass -# -# return visited, contents -# -# -# def crawl_to_json(start_url): -# if not start_url.startswith("http"): -# start_url = "https://" + start_url -# crawled_urls, contents = crawl_url(start_url) -# -# print("\nSummalizing as json...") -# -# def process_url(url): -# llm = scitex.ai.GenAI("gpt-4o-mini") -# return { -# "url": url, -# "content": llm(f"Summarize this page in 1 line:\n\n{contents[url]}"), -# } -# -# with ThreadPoolExecutor() as executor: -# future_to_url = {executor.submit(process_url, url): url for url in crawled_urls} -# crawled_pages = [] -# for future in tqdm( -# as_completed(future_to_url), -# total=len(crawled_urls), -# desc="Processing URLs", -# ): -# crawled_pages.append(future.result()) -# -# result = {"start_url": start_url, "crawled_pages": crawled_pages} -# -# return json.dumps(result, indent=2) -# -# -# def summarize_all(json_contents): -# llm = scitex.ai.GenAI("gpt-4o-mini") -# 
-#     out = llm(f"Summarize this json file with 5 bullet points:\n\n{json_contents}")
-#     return out
-#
-#
-# def summarize_url(start_url):
-#     json_result = crawl_to_json(start_url)
-#     ground_summary = summarize_all(json_result)
-#
-#     pprint(ground_summary)
-#     return ground_summary, json_result
-#
-#
-# main = summarize_url
-#
-# if __name__ == "__main__":
-#     import argparse
-#     import scitex
-#
-#     parser = argparse.ArgumentParser(description="")
-#     parser.add_argument("--url", "-u", type=str, help="(default: %(default)s)")
-#     args = parser.parse_args()
-#     scitex.gen.print_block(args, c="yellow")
-#
-#     main(args.url)
-
-# --------------------------------------------------------------------------------
-# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/_summarize_url.py
-# --------------------------------------------------------------------------------
diff --git a/tests/scitex/web/test_download_images.py b/tests/scitex/web/test_download_images.py
deleted file mode 100644
index 122a88dca..000000000
--- a/tests/scitex/web/test_download_images.py
+++ /dev/null
@@ -1,332 +0,0 @@
-# Add your tests here
-
-if __name__ == "__main__":
-    import os
-
-    import pytest
-
-    pytest.main([os.path.abspath(__file__)])
-
-# --------------------------------------------------------------------------------
-# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/download_images.py
-# --------------------------------------------------------------------------------
-# #!/usr/bin/env python3
-# # File: ./src/scitex/web/download_images.py
-#
-# """
-# Image Downloader for SciTeX.
-#
-# Downloads images from URLs with minimum size filtering.
-#
-# Usage:
-#     python -m scitex.web.download_images https://example.com
-#     python -m scitex.web.download_images https://example.com -o ./downloads
-#     python -m scitex.web.download_images https://example.com --min-size 800x600
-# """
-#
-# import os
-# import re
-# import urllib.parse
-# from concurrent.futures import ThreadPoolExecutor, as_completed
-# from datetime import datetime
-# from pathlib import Path
-# from typing import List, Optional, Tuple
-#
-# import requests
-# from bs4 import BeautifulSoup
-# from tqdm import tqdm
-#
-# try:
-#     from io import BytesIO
-#
-#     from PIL import Image
-#
-#     PILLOW_AVAILABLE = True
-# except ImportError:
-#     PILLOW_AVAILABLE = False
-#
-# from scitex.logging import getLogger
-#
-# logger = getLogger(__name__)
-#
-# # Configuration
-# DEFAULT_MIN_WIDTH = 400
-# DEFAULT_MIN_HEIGHT = 300
-# DEFAULT_TIMEOUT = 10
-# DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-#
-#
-# def _get_default_download_dir() -> str:
-#     """Get default download directory using SCITEX_DIR if available."""
-#     scitex_root = os.environ.get("SCITEX_DIR", os.path.expanduser("~/.scitex"))
-#     return os.path.join(scitex_root, "web", "downloads")
-#
-#
-# def _normalize_url_for_directory(url: str) -> str:
-#     """Convert URL to a safe directory name."""
-#     parsed = urllib.parse.urlparse(url)
-#     domain = parsed.netloc.replace("www.", "")
-#     path = parsed.path.strip("/").replace("/", "-")
-#
-#     normalized = f"{domain}-{path}" if path else domain
-#     normalized = re.sub(r"[^\w\-.]", "-", normalized)
-#     normalized = re.sub(r"-+", "-", normalized)
-#     normalized = normalized[:100].strip("-")
-#
-#     return normalized
-#
-#
-# def _is_direct_image_url(url: str) -> bool:
-#     """Check if URL appears to be a direct image link."""
-#     extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]
-#     path = urllib.parse.urlparse(url.lower()).path
-#     return any(path.endswith(ext) for ext in extensions)
-#
-#
-# def _extract_image_urls(url: str, same_domain: bool = False) -> List[str]:
-#     """Extract image URLs from a webpage."""
-#     try:
-#         logger.info(f"Fetching page: {url}")
-#         response = requests.get(
-#             url,
-#             timeout=DEFAULT_TIMEOUT,
-#             headers={"User-Agent": DEFAULT_USER_AGENT},
-#         )
-#         response.raise_for_status()
-#     except requests.RequestException as e:
-#         logger.error(f"Failed to fetch page: {e}")
-#         return []
-#
-#     soup = BeautifulSoup(response.content, "html.parser")
-#     parsed_base = urllib.parse.urlparse(url)
-#     image_urls = set()
-#
-#     for img in soup.find_all("img"):
-#         img_url = img.get("src") or img.get("data-src")
-#         if not img_url:
-#             continue
-#
-#         img_url = urllib.parse.urljoin(url, img_url)
-#
-#         if img_url.lower().endswith((".svg", ".svgz")):
-#             continue
-#
-#         if same_domain:
-#             parsed_img = urllib.parse.urlparse(img_url)
-#             if parsed_img.netloc != parsed_base.netloc:
-#                 continue
-#
-#         image_urls.add(img_url)
-#
-#     logger.info(f"Found {len(image_urls)} images on page")
-#     return list(image_urls)
-#
-#
-# def _download_single_image(
-#     img_url: str,
-#     output_dir: Path,
-#     counter: int,
-#     min_size: Optional[Tuple[int, int]],
-# ) -> Optional[str]:
-#     """Download a single image."""
-#     try:
-#         response = requests.get(
-#             img_url,
-#             timeout=DEFAULT_TIMEOUT,
-#             headers={"User-Agent": DEFAULT_USER_AGENT},
-#         )
-#         response.raise_for_status()
-#
-#         # Validate content-type
-#         content_type = response.headers.get("content-type", "")
-#         if not content_type.startswith("image/"):
-#             logger.debug(f"Skipping non-image: {content_type}")
-#             return None
-#
-#         # Check dimensions
-#         if min_size and PILLOW_AVAILABLE:
-#             try:
-#                 img = Image.open(BytesIO(response.content))
-#                 width, height = img.size
-#                 if width < min_size[0] or height < min_size[1]:
-#                     logger.debug(
-#                         f"Skipping small image: {width}x{height} "
-#                         f"(min: {min_size[0]}x{min_size[1]})"
-#                     )
-#                     return None
-#             except Exception:
-#                 pass
-#
-#         # Determine extension
-#         ext = "jpg"
-#         if PILLOW_AVAILABLE:
-#             try:
-#                 img = Image.open(BytesIO(response.content))
-#                 fmt = img.format.lower() if img.format else "jpeg"
-#                 ext = "jpg" if fmt == "jpeg" else fmt
-#             except Exception:
-#                 pass
-#         elif "png" in content_type:
-#             ext = "png"
-#         elif "gif" in content_type:
-#             ext = "gif"
-#         elif "webp" in content_type:
-#             ext = "webp"
-#
-#         filename = f"{counter:04d}.{ext}"
-#         filepath = output_dir / filename
-#
-#         with open(filepath, "wb") as f:
-#             f.write(response.content)
-#
-#         logger.info(f"Downloaded: {filename}")
-#         return str(filepath)
-#
-#     except Exception as e:
-#         logger.warning(f"Error downloading {img_url}: {e}")
-#         return None
-#
-#
-# def download_images(
-#     url: str,
-#     output_dir: Optional[str] = None,
-#     min_size: Optional[Tuple[int, int]] = None,
-#     max_workers: int = 5,
-#     same_domain: bool = False,
-# ) -> List[str]:
-#     """
-#     Download images from a URL.
-#
-#     Args:
-#         url: Webpage URL or direct image URL
-#         output_dir: Output directory (default: $SCITEX_DIR/web/downloads)
-#         min_size: Minimum (width, height) to filter small images (default: 400x300)
-#         max_workers: Concurrent download threads
-#         same_domain: Only download images from the same domain
-#
-#     Returns:
-#         List of downloaded file paths
-#
-#     Example:
-#         >>> paths = download_images("https://example.com")
-#         >>> paths = download_images("https://example.com/photo.jpg")
-#         >>> paths = download_images("https://example.com", min_size=(800, 600))
-#     """
-#     if not PILLOW_AVAILABLE:
-#         logger.warning("Pillow not available. Size filtering disabled.")
-#         min_size = None
-#     elif min_size is None:
-#         min_size = (DEFAULT_MIN_WIDTH, DEFAULT_MIN_HEIGHT)
-#
-#     # Setup output directory
-#     if output_dir is None:
-#         output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR")
-#     if output_dir is None:
-#         output_dir = _get_default_download_dir()
-#
-#     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-#     normalized = _normalize_url_for_directory(url)
-#     output_path = Path(output_dir).expanduser() / f"{timestamp}-{normalized}-images"
-#     output_path.mkdir(parents=True, exist_ok=True)
-#
-#     logger.info(f"Output directory: {output_path}")
-#
-#     # Get image URLs
-#     if _is_direct_image_url(url):
-#         image_urls = [url]
-#         logger.info("Direct image URL detected")
-#     else:
-#         image_urls = _extract_image_urls(url, same_domain=same_domain)
-#
-#     if not image_urls:
-#         logger.warning("No images found")
-#         return []
-#
-#     # Download concurrently
-#     downloaded = []
-#     counter = [1]
-#
-#     def download_with_counter(img_url: str) -> Optional[str]:
-#         idx = counter[0]
-#         counter[0] += 1
-#         return _download_single_image(img_url, output_path, idx, min_size)
-#
-#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-#         futures = {executor.submit(download_with_counter, u): u for u in image_urls}
-#
-#         for future in tqdm(
-#             as_completed(futures), total=len(image_urls), desc="Downloading"
-#         ):
-#             result = future.result()
-#             if result:
-#                 downloaded.append(result)
-#
-#     logger.info(f"Downloaded {len(downloaded)} images to {output_path}")
-#     return downloaded
-#
-#
-# def main():
-#     """CLI entry point."""
-#     import argparse
-#
-#     parser = argparse.ArgumentParser(
-#         description="Download images from URL",
-#         formatter_class=argparse.RawDescriptionHelpFormatter,
-#         epilog="""
-# Examples:
-#     python -m scitex.web.download_images https://example.com
-#     python -m scitex.web.download_images https://example.com -o ./downloads
-#     python -m scitex.web.download_images https://example.com --min-size 800x600
-#     python -m scitex.web.download_images https://example.com --no-min-size
-# """,
-#     )
-#     parser.add_argument("url", help="URL to download images from")
-#     parser.add_argument("-o", "--output", help="Output directory")
-#     parser.add_argument(
-#         "--min-size",
-#         default="400x300",
-#         help="Minimum size WIDTHxHEIGHT (default: 400x300)",
-#     )
-#     parser.add_argument(
-#         "--no-min-size",
-#         action="store_true",
-#         help="Disable size filtering",
-#     )
-#     parser.add_argument(
-#         "--same-domain",
-#         action="store_true",
-#         help="Only download from same domain",
-#     )
-#     parser.add_argument(
-#         "--workers",
-#         type=int,
-#         default=5,
-#         help="Concurrent downloads (default: 5)",
-#     )
-#
-#     args = parser.parse_args()
-#
-#     min_size = None
-#     if not args.no_min_size and args.min_size:
-#         w, h = map(int, args.min_size.split("x"))
-#         min_size = (w, h)
-#
-#     paths = download_images(
-#         args.url,
-#         output_dir=args.output,
-#         min_size=min_size,
-#         max_workers=args.workers,
-#         same_domain=args.same_domain,
-#     )
-#
-#     print(f"\nDownloaded {len(paths)} images:")
-#     for p in paths:
-#         print(f"  {p}")
-#
-#
-# if __name__ == "__main__":
-#     main()
-
-# --------------------------------------------------------------------------------
-# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/web/download_images.py
-# --------------------------------------------------------------------------------
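# --------------------------------------------------------------------------------
# Note: usage sketch for the removed downloader (not part of the patch)
# --------------------------------------------------------------------------------
# The download_images() pipeline above: extract <img> src/data-src URLs, skip SVGs,
# filter by min_size via Pillow when available, then fetch concurrently into a
# timestamped output directory. A minimal sketch, assuming scitex-web keeps the
# public signature shown in the deleted module (unverified against the new package):
#
# from scitex.web import download_images  # resolves to scitex_web via the shim
#
# # Keep only images of at least 800x600 from the page's own domain,
# # using 8 download threads.
# paths = download_images(
#     "https://example.com",
#     min_size=(800, 600),
#     same_domain=True,
#     max_workers=8,
# )
# for p in paths:
#     print(p)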