diff --git a/.claude-plugin/manifest.json b/.claude-plugin/manifest.json index a47aada..6e82564 100644 --- a/.claude-plugin/manifest.json +++ b/.claude-plugin/manifest.json @@ -25,6 +25,11 @@ "description": "Automatically fetch and summarize the latest AI news, research, and industry developments. Use when users request: (1) Daily AI news updates, (2) Latest AI technology developments, (3) Recent AI research papers or breakthroughs, (4) AI industry trends and market news, (5) Specific AI company announcements, or (6) Automated daily AI briefings. Supports customizable search queries, multi-source aggregation, and formatted output in various styles.", "path": "skills/ai-news-daily" }, + { + "name": "coin-news-openclaw", + "description": "Collect and summarize cryptocurrency and coin market news with OpenClaw-friendly workflows. Use when users request coin news, crypto news, token-specific news, daily market briefings, or a replacement for Dify-based news aggregation. Supports configurable sources, keyword scoring, source weighting, deduplication, and structured JSON output for downstream tuning.", + "path": "skills/coin-news-openclaw" + }, { "name": "gemini-image-generator", "description": "Generate, edit, or transform images with Gemini using bundled Python scripts (Flash or Pro) including aspect ratio, resolution, image-to-image edits, logo overlays, and reference images. 
Use when users request image generation, image edits, image-to-image transformations, logo placement, or specific aspect ratios or resolutions.", diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 861f27e..58d99d1 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -37,6 +37,15 @@ "./skills/ai-news-daily" ] }, + { + "name": "coin-news-tools", + "description": "Crypto and coin news aggregation tools with configurable sources and scoring for OpenClaw workflows", + "source": "./", + "strict": false, + "skills": [ + "./skills/coin-news-openclaw" + ] + }, { "name": "image-generation-tools", "description": "Image generation and editing tools", diff --git a/skills/coin-news-openclaw/SKILL.md b/skills/coin-news-openclaw/SKILL.md new file mode 100644 index 0000000..630fc19 --- /dev/null +++ b/skills/coin-news-openclaw/SKILL.md @@ -0,0 +1,135 @@ +--- +name: coin-news-openclaw +description: Collect and summarize cryptocurrency and coin market news with OpenClaw-friendly workflows. Use when users request coin news, crypto news, token-specific news, daily market briefings, or a replacement for Dify-based news aggregation. Supports configurable sources, keyword scoring, source weighting, deduplication, and structured JSON output for downstream tuning. +--- + +# Coin News OpenClaw + +Collect cryptocurrency news from configurable sources, normalize the articles, score relevance, and return a structured digest that can be tuned over time. + +## Use This Skill When + +- The user wants a daily or on-demand crypto news digest +- The user wants news for a specific token or narrative +- The user wants to replace or compare against an existing Dify news workflow +- The user wants a configurable pipeline that PA can tune later + +## Workflow + +1. Read `references/sources.yaml` to determine enabled sources and source weights. +2. 
Read `references/scoring.yaml` to determine token aliases, topic keywords, negative keywords, and ranking logic. +3. If deterministic collection is needed, run `scripts/fetch_coin_news.py`. +4. Filter the normalized article list to the user’s requested scope. +5. Rank articles using source weight, keyword matches, recency, and duplicate suppression. +6. Return a short digest or a structured JSON array for downstream workflow use. + +## CLI Usage + +```bash +# Basic usage - JSON output (default) +python3 scripts/fetch_coin_news.py --days 1 + +# ⭐ Markdown output with clickable links (recommended for reading) +python3 scripts/fetch_coin_news.py --days 1 --format markdown + +# Limit number of articles +python3 scripts/fetch_coin_news.py --days 1 --limit 10 --format markdown + +# Filter by specific tokens +python3 scripts/fetch_coin_news.py --days 1 --token BTC --token ETH + +# Filter by specific topics +python3 scripts/fetch_coin_news.py --days 1 --topic etf --topic regulation + +# Adjust token fetch limit (default: 100, max: 250) +python3 scripts/fetch_coin_news.py --days 1 --token-limit 50 + +# Disable dynamic token fetching (use only YAML config) +python3 scripts/fetch_coin_news.py --days 1 --no-dynamic-tokens +``` + +## Output Formats + +### JSON (default) +```bash +python3 scripts/fetch_coin_news.py --days 1 +``` +Returns structured JSON for programmatic use. + +### Markdown (recommended for reading) +```bash +python3 scripts/fetch_coin_news.py --days 1 --format markdown +``` +Returns formatted markdown with **clickable links** for each article: + +```markdown +## 1. [Article Title](https://example.com/article) +**来源**: CoinDesk | **时间**: 2026-03-25 | **分数**: 78 +**Token**: BTC, ETH +**主题**: etf + +Summary text here... 
+--- +``` + +## Time Range + +- Default: last 24 hours +- Support explicit day windows such as: + - recent 2 days + - recent 3 days + - recent 7 days +- Support common Chinese requests such as: + - 最近2天 + - 最近3天 + - 最近一周 + - 过去7天 +- For deterministic runs, prefer `--days N` over manually converting to hours. +- If both `--days` and `--hours` are provided, `--days` takes precedence. +- Recommended mapping: + - 最近2天 -> `--days 2` + - 最近3天 -> `--days 3` + - 最近一周 -> `--days 7` + - 过去7天 -> `--days 7` + +## Dynamic Token Fetching + +The skill automatically fetches the top 100 tokens (by market cap) from CoinGecko API and merges them with the YAML config: + +- **Source**: CoinGecko API (free, no API key required) +- **Cache TTL**: 24 hours (stored in `scoring.yaml` under `dynamic_tokens`) +- **Merge logic**: YAML `token_aliases` overrides dynamic tokens (for manual tuning) +- **Disable**: Use `--no-dynamic-tokens` to use only YAML config + +## Output Contract + +Prefer this JSON structure for workflow handoff: + +```json +[ + { + "title": "Example headline", + "url": "https://example.com/article", + "source": "CoinDesk", + "published_at": "2026-03-20T09:00:00Z", + "summary": "One paragraph summary.", + "score": 78, + "matched_topics": ["bitcoin", "etf"], + "matched_tokens": ["BTC"], + "duplicate_group_key": "normalized-title-key" + } +] +``` + +## Tuning Rules + +- Do not hardcode source lists in prompts. Update `references/sources.yaml`. +- Do not hardcode scoring logic in prompts. Update `references/scoring.yaml`. +- Prefer established publications before secondary aggregators. +- If the user asks for “latest” or “today”, prioritize the last 24 hours and show exact dates.
+ +## References + +- `references/sources.yaml`: source registry and weights +- `references/scoring.yaml`: token aliases, topic keywords, penalties, thresholds +- `scripts/fetch_coin_news.py`: deterministic RSS collector and scorer diff --git a/skills/coin-news-openclaw/references/scoring.yaml b/skills/coin-news-openclaw/references/scoring.yaml new file mode 100644 index 0000000..ceb9f72 --- /dev/null +++ b/skills/coin-news-openclaw/references/scoring.yaml @@ -0,0 +1,617 @@ +defaults: + recency_half_life_hours: 18 + duplicate_title_similarity_threshold: 0.88 + min_output_score: 15 +weights: + source_weight_multiplier: 35 + token_match: 18 + topic_match: 10 + title_keyword_bonus: 8 + negative_keyword_penalty: -20 +token_aliases: + BTC: + - bitcoin + - btc + ETH: + - ethereum + - ether + - eth + BNB: + - binance coin + - bnb + - binance + SOL: + - solana + - sol + XRP: + - ripple + - xrp + USDT: + - tether + - usdt + USDC: + - usd coin + - usdc + ADA: + - cardano + - ada + DOGE: + - dogecoin + - doge + AVAX: + - avalanche + - avax + DOT: + - polkadot + - dot + LINK: + - chainlink + - link + TRX: + - tron + - trx + MATIC: + - polygon + - matic + SHIB: + - shiba inu + - shib + LTC: + - litecoin + - ltc + UNI: + - uniswap + - uni + ATOM: + - cosmos + - atom + XLM: + - stellar + - xlm + ETC: + - ethereum classic + - etc + BCH: + - bitcoin cash + - bch + FET: + - fetch.ai + - fet + NEAR: + - near protocol + - near + APT: + - aptos + - apt + ARB: + - arbitrum + - arb + OP: + - optimism + - op + TIA: + - celestia + - tia + SEI: + - sei + SUI: + - sui + WLD: + - worldcoin + - wld + PEPE: + - pepe + WIF: + - dogwifhat + - wif + ONDO: + - ondo + JUP: + - jupiter + - jup + ZEC: + - zcash + - zec + XMR: + - monero + - xmr + BASE: + - base + - base chain + ZK: + - zkSync + - zksync + - zk sync + MANTA: + - manta + - manta network + BLUR: + - blur + - nft marketplace + RNDR: + - render + - rndr + TAO: + - bittensor + - tao + GRT: + - the graph + - grt + AAVE: + - aave + MKR: + - 
maker + - mkr + CRV: + - curve + - crv + LDO: + - lido + - ldo + ENA: + - ethena + - ena + PENDLE: + - pendle + SAND: + - sandbox + - sand + MANA: + - decentraland + - mana + AXS: + - axie infinity + - axs + IMX: + - immutable x + - imx + OKB: + - okb + - okx + KCS: + - kucoin + - kcs + INJ: + - injective + - inj + FTM: + - fantom + - ftm + ALGO: + - algorand + - algo + VET: + - vechain + - vet + FIL: + - filecoin + - fil + TON: + - toncoin + - ton + ICP: + - internet computer + - icp +topic_keywords: + regulation: + - sec + - regulation + - lawsuit + - compliance + - enforcement + - sec chair + - gary gensler + etf: + - etf + - spot etf + - fund flow + - bitcoin etf + - ethereum etf + - institutional inflow + - grayscale + - blackrock + - fidelity + defi: + - defi + - dex + - staking + - lending + - yield + - liquidity + - tvl + - airdrop + - points + stablecoin: + - stablecoin + - usdt + - usdc + - busd + - dai + - circle + - tether + security: + - exploit + - hack + - phishing + - rug pull + - scam + - vulnerability + - breach + nft: + - nft + - nfts + - opensea + - magic eden + - nft marketplace + layer2: + - layer 2 + - layer2 + - l2 + - rollup + - zk rollup + - optimistic + - scaling + ai_crypto: + - ai crypto + - ai token + - ai agent + - decentralized ai + memecoin: + - meme coin + - memecoin + - meme token + exchange: + - exchange + - listing + - delisting + - trading volume + - binance + - coinbase + - kraken + - bybit + - okx + adoption: + - adoption + - mainstream + - institutional + - enterprise + - treasury + - el salvador + upgrade: + - upgrade + - hard fork + - network upgrade + - ethereum upgrade + - dencun + - cancun + macro: + - fed + - interest rate + - inflation + - recession + - btc dominance + - market cap + mining: + - mining + - miner + - hash rate + - halving + - block reward + web3: + - web3 + - web 3 + - decentralized web + - dapp + - decentralized app + - dao + - governance + - smart contract + crypto_general: + - cryptocurrency + - 
crypto market + - digital asset + - blockchain + - crypto industry + - virtual asset + innovation: + - innovation + - breakthrough + - new technology + - protocol launch + - mainnet launch + - new chain + - tech upgrade + - roadmap +negative_keywords: +- sponsored +- advertorial +- giveaway +- price prediction +- 100x +- 1000x +- moon shot +- get rich +- buy now +- must buy +- limited time +- exclusive offer +- click here +- sign up bonus +dynamic_tokens: + fetched_at: '2026-03-26T02:55:46.573789Z' + token_aliases: + BTC: + - bitcoin + - btc + ETH: + - ethereum + - eth + USDT: + - tether + - usdt + BNB: + - bnb + XRP: + - xrp + USDC: + - usdc + SOL: + - solana + - sol + TRX: + - tron + - trx + FIGR_HELOC: + - figure heloc + - figr_heloc + DOGE: + - dogecoin + - doge + USDS: + - usds + WBT: + - whitebit coin + - wbt + ADA: + - cardano + - ada + HYPE: + - hyperliquid + - hype + BCH: + - bitcoin cash + - bch + LEO: + - leo token + - leo + LINK: + - chainlink + - link + XMR: + - monero + - xmr + USDE: + - ethena usde + - usde + XLM: + - stellar + - xlm + CC: + - canton + - cc + USD1: + - usd1 + LTC: + - litecoin + - ltc + DAI: + - dai + M: + - memecore + - m + RAIN: + - rain + AVAX: + - avalanche + - avax + HBAR: + - hedera + - hbar + PYUSD: + - paypal usd + - pyusd + ZEC: + - zcash + - zec + SUI: + - sui + SHIB: + - shiba inu + - shib + TAO: + - bittensor + - tao + TON: + - toncoin + - ton + CRO: + - cronos + - cro + WLFI: + - world liberty financial + - wlfi + XAUT: + - tether gold + - xaut + USYC: + - circle usyc + - usyc + MNT: + - mantle + - mnt + UNI: + - uniswap + - uni + PAXG: + - pax gold + - paxg + DOT: + - polkadot + - dot + BUIDL: + - blackrock usd institutional digital liquidity fund + - buidl + PI: + - pi network + - pi + OKB: + - okb + USDG: + - global dollar + - usdg + USDF: + - falcon usd + - usdf + SKY: + - sky + AAVE: + - aave + ASTER: + - aster + NEAR: + - near protocol + - near + SIREN: + - siren + HTX: + - htx dao + - htx + PEPE: + - pepe + RLUSD: 
+ - ripple usd + - rlusd + BGB: + - bitget token + - bgb + ETC: + - ethereum classic + - etc + ICP: + - internet computer + - icp + BFUSD: + - bfusd + USDY: + - ondo us dollar yield + - usdy + ONDO: + - ondo + GT: + - gate + - gt + PUMP: + - pump.fun + - pump + QNT: + - quant + - qnt + KCS: + - kucoin + - kcs + KAS: + - kaspa + - kas + JTRSY: + - janus henderson anemoy treasury fund + - jtrsy + POL: + - pol (ex-matic) + - pol + WLD: + - worldcoin + - wld + EUTBL: + - spiko eu t-bills money market fund + - eutbl + RENDER: + - render + MORPHO: + - morpho + ENA: + - ethena + - ena + NEXO: + - nexo + USTB: + - superstate short duration u.s. government securities fund (ustb) + - ustb + USDTB: + - usdtb + ATOM: + - cosmos hub + - atom + APT: + - aptos + - apt + USDD: + - usdd + ALGO: + - algorand + - algo + HASH: + - provenance blockchain + - hash + NIGHT: + - midnight + - night + TRUMP: + - official trump + - trump + FIL: + - filecoin + - fil + FLR: + - flare + - flr + OUSG: + - ousg + XDC: + - xdc network + - xdc + BDX: + - beldex + - bdx + VET: + - vechain + - vet + ARB: + - arbitrum + - arb + YLDS: + - ylds + GHO: + - gho + FET: + - artificial superintelligence alliance + - fet + STABLE: + - ​​stable + - stable + USD0: + - usual usd + - usd0 + BONK: + - bonk + JUP: + - jupiter + - jup + ZRO: + - layerzero + - zro + JST: + - just + - jst + TUSD: + - trueusd + - tusd diff --git a/skills/coin-news-openclaw/references/sources.yaml b/skills/coin-news-openclaw/references/sources.yaml new file mode 100644 index 0000000..f1f4b74 --- /dev/null +++ b/skills/coin-news-openclaw/references/sources.yaml @@ -0,0 +1,125 @@ +defaults: + max_articles_per_source: 30 + request_timeout_seconds: 20 + user_agent: "FeedMob Coin News Collector/0.1" + +sources: + - id: coindesk + name: CoinDesk + enabled: true + type: rss + category: general + weight: 1.0 + url: https://www.coindesk.com/arc/outboundfeeds/rss/ + + - id: cointelegraph + name: Cointelegraph + enabled: true + type: rss + 
category: general + weight: 0.95 + url: https://cointelegraph.com/rss + + - id: decrypt + name: Decrypt + enabled: true + type: rss + category: general + weight: 0.95 + url: https://decrypt.co/feed + + - id: blockworks + name: Blockworks + enabled: true + type: rss + category: markets + weight: 0.9 + url: https://blockworks.co/feed + + - id: theblock + name: The Block + enabled: true + type: rss + category: markets + weight: 0.9 + url: https://www.theblock.co/rss.xml + + - id: cryptoslate + name: CryptoSlate + enabled: false # 403 Forbidden - temporarily disabled + type: rss + category: general + weight: 0.85 + url: https://cryptoslate.com/feed/ + + - id: crypto-news + name: crypto.news + enabled: true + type: rss + category: general + weight: 0.82 + url: https://crypto.news/feed/ + + - id: bitcoincom + name: Bitcoin.com News + enabled: true + type: rss + category: bitcoin + weight: 0.78 + url: https://news.bitcoin.com/feed/ + + - id: cryptopotato + name: CryptoPotato + enabled: true + type: rss + category: analysis + weight: 0.74 + url: https://cryptopotato.com/feed/ + + - id: thedefiant + name: The Defiant + enabled: true + type: rss + category: defi + weight: 0.8 + url: https://thedefiant.io/feed + + - id: unchained + name: Unchained + enabled: true + type: rss + category: analysis + weight: 0.76 + url: https://unchainedcrypto.com/feed/ + + - id: utoday + name: U.Today + enabled: true + type: rss + category: general + weight: 0.68 + url: https://u.today/rss + + - id: coingape + name: CoinGape + enabled: true + type: rss + category: general + weight: 0.65 + url: https://coingape.com/feed/ + + - id: bankless + name: Bankless + enabled: true + type: rss + category: defi + weight: 0.78 + url: https://www.bankless.com/feed + + - id: ambcrypto + name: AMBCrypto + enabled: true + type: rss + category: general + weight: 0.66 + url: https://ambcrypto.com/feed/ diff --git a/skills/coin-news-openclaw/scripts/fetch_coin_news.py 
#!/usr/bin/env python3
"""Fetch, score, and deduplicate crypto news from configured RSS/Atom sources.

Pipeline:
  1. Load the source registry (references/sources.yaml) and scoring rules
     (references/scoring.yaml).
  2. Optionally refresh a top-token alias list from CoinGecko (cached in
     scoring.yaml under ``dynamic_tokens`` with a 24h TTL).
  3. Download each enabled feed and normalize items into ``Article`` records.
  4. Score by source weight, token/topic keyword matches, recency, and
     negative-keyword penalties; suppress near-duplicate titles.
  5. Print the ranked digest as JSON (default) or markdown.
"""

from __future__ import annotations

import argparse
import datetime as dt
import email.utils
import json
import math
import re
import sys
import urllib.request
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from difflib import SequenceMatcher
from pathlib import Path

SCRIPT_DIR = Path(__file__).resolve().parent
SKILL_DIR = SCRIPT_DIR.parent
SOURCES_PATH = SKILL_DIR / "references" / "sources.yaml"
SCORING_PATH = SKILL_DIR / "references" / "scoring.yaml"

# Token fetch settings
TOKENS_TTL_HOURS = 24
COINGECKO_API_URL = "https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd&order=market_cap_desc&per_page={limit}"

# Atom feeds namespace their elements; RSS 2.0 does not.
ATOM_NS = "{http://www.w3.org/2005/Atom}"


@dataclass
class Article:
    """One normalized news item pulled from a configured feed."""

    source_id: str
    source_name: str
    source_weight: float
    category: str
    title: str
    url: str
    summary: str
    # UTC timestamp, or None when the feed omits or garbles the date.
    published_at: dt.datetime | None


def load_yaml(path: Path) -> dict:
    """Load a YAML document, returning {} for an empty file."""
    # Deferred import: lets the module import (and --help work) without PyYAML.
    import yaml

    with path.open("r", encoding="utf-8") as handle:
        return yaml.safe_load(handle) or {}


def save_yaml(path: Path, data: dict) -> None:
    """Write ``data`` to ``path`` as readable YAML, preserving key order."""
    import yaml  # deferred for the same reason as in load_yaml

    with path.open("w", encoding="utf-8") as handle:
        yaml.dump(data, handle, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, indent=2)


def fetch_tokens_from_coingecko(limit: int = 100, timeout: int = 15) -> dict[str, list[str]]:
    """Fetch the top ``limit`` tokens by market cap from CoinGecko.

    Returns a ``{SYMBOL: [alias, ...]}`` mapping. The free endpoint caps
    ``per_page`` at 250, so ``limit`` is clamped. No API key required.
    """
    url = COINGECKO_API_URL.format(limit=min(limit, 250))
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "FeedMob Coin News Collector/0.1"}
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        data = json.loads(response.read().decode("utf-8"))

    token_aliases: dict[str, list[str]] = {}
    for coin in data:
        symbol = coin.get("symbol", "").upper()
        name = coin.get("name", "").lower()
        if symbol and name:
            aliases = [name, symbol.lower()]
            # De-duplicate while preserving order (the name may equal the symbol).
            seen: set[str] = set()
            token_aliases[symbol] = [a for a in aliases if not (a in seen or seen.add(a))]

    return token_aliases


def is_tokens_fresh(scoring: dict, ttl_hours: int) -> bool:
    """Return True when the cached ``dynamic_tokens`` are within the TTL."""
    fetched_at_str = scoring.get("dynamic_tokens", {}).get("fetched_at")
    if not fetched_at_str:
        return False

    try:
        # fromisoformat() only accepts a trailing "Z" from Python 3.11 on;
        # normalize it so the cache also works on older interpreters (the
        # stored value always uses the "Z" suffix — see update_scoring_with_tokens).
        fetched_at = dt.datetime.fromisoformat(str(fetched_at_str).replace("Z", "+00:00"))
        if fetched_at.tzinfo is None:
            fetched_at = fetched_at.replace(tzinfo=dt.timezone.utc)
        age_hours = (dt.datetime.now(dt.timezone.utc) - fetched_at).total_seconds() / 3600
        return age_hours <= ttl_hours
    except (ValueError, TypeError):
        return False


def update_scoring_with_tokens(scoring: dict, tokens: dict[str, list[str]]) -> None:
    """Persist freshly fetched tokens (plus fetch time) back into scoring.yaml."""
    scoring["dynamic_tokens"] = {
        "fetched_at": dt.datetime.now(dt.timezone.utc).isoformat().replace("+00:00", "Z"),
        "token_aliases": tokens
    }
    save_yaml(SCORING_PATH, scoring)


def get_token_aliases(args: argparse.Namespace, scoring: dict) -> dict[str, list[str]]:
    """Merge dynamic CoinGecko tokens with manual YAML aliases.

    Manual ``token_aliases`` entries always win over the dynamic list so
    hand-tuned aliases survive refreshes. When the network fetch fails,
    falls back to whatever dynamic list is already cached in the YAML.
    """
    # Manual tokens (higher priority, user-defined)
    manual_tokens = scoring.get("token_aliases", {})

    if args.no_dynamic_tokens:
        return manual_tokens

    # Check if dynamic tokens in YAML are fresh
    if is_tokens_fresh(scoring, TOKENS_TTL_HOURS):
        dynamic_tokens = scoring.get("dynamic_tokens", {}).get("token_aliases", {})
        return {**dynamic_tokens, **manual_tokens}

    # Fetch new tokens from CoinGecko and update scoring.yaml
    try:
        dynamic_tokens = fetch_tokens_from_coingecko(args.token_limit)
        update_scoring_with_tokens(scoring, dynamic_tokens)
        print(f"info: updated scoring.yaml with {len(dynamic_tokens)} tokens from CoinGecko", file=sys.stderr)
        return {**dynamic_tokens, **manual_tokens}
    except Exception as exc:
        print(f"warning: failed to fetch dynamic tokens: {exc}, using YAML only", file=sys.stderr)
        # Fall back to existing dynamic tokens if available
        existing_dynamic = scoring.get("dynamic_tokens", {}).get("token_aliases", {})
        return {**existing_dynamic, **manual_tokens}


def parse_args() -> argparse.Namespace:
    """Define and parse the CLI; see SKILL.md for usage examples."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--token", action="append", default=[])
    parser.add_argument("--topic", action="append", default=[])
    parser.add_argument("--days", type=int, help="Lookback window in days")
    parser.add_argument("--hours", type=int, default=24)
    parser.add_argument("--limit", type=int, default=20)
    parser.add_argument("--pretty", action="store_true")
    parser.add_argument("--format", choices=["json", "markdown", "md"], default="json",
                        help="Output format: json (default), markdown/md (with clickable links)")
    parser.add_argument("--token-limit", type=int, default=100,
                        help="Number of top tokens to fetch from CoinGecko (default: 100, max: 250)")
    parser.add_argument("--no-dynamic-tokens", action="store_true",
                        help="Disable dynamic token fetching, use only YAML config")
    return parser.parse_args()


def fetch_url(url: str, user_agent: str, timeout: int) -> bytes:
    """Download ``url`` with the configured User-Agent and timeout."""
    request = urllib.request.Request(url, headers={"User-Agent": user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read()


def strip_html(value: str | None) -> str:
    """Drop HTML tags and collapse whitespace; None/empty becomes ''."""
    if not value:
        return ""
    value = re.sub(r"<[^>]+>", " ", value)
    return re.sub(r"\s+", " ", value).strip()


def parse_datetime(value: str | None) -> dt.datetime | None:
    """Parse an RFC 2822 (RSS) or ISO 8601 (Atom) timestamp into UTC.

    Returns None for missing or unparseable values so callers can treat
    undated articles uniformly.
    """
    if not value:
        return None
    try:
        parsed = email.utils.parsedate_to_datetime(value)
    except (TypeError, ValueError, IndexError):
        # Atom feeds use ISO 8601 ("2024-01-01T00:00:00Z") rather than RFC 2822.
        try:
            parsed = dt.datetime.fromisoformat(value.strip().replace("Z", "+00:00"))
        except ValueError:
            return None
    if parsed is None:  # some older Pythons return None instead of raising
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=dt.timezone.utc)
    return parsed.astimezone(dt.timezone.utc)


def _first_text(item: ET.Element, *tags: str) -> str | None:
    """Return the first non-empty child text among the given tag names."""
    for tag in tags:
        text = item.findtext(tag)
        if text:
            return text
    return None


def _extract_link(item: ET.Element) -> str:
    """Extract the article URL from RSS ``<link>text</link>`` or Atom ``<link href=...>``."""
    text = item.findtext("link")
    if text and text.strip():
        return strip_html(text)
    link_el = item.find(f"{ATOM_NS}link")
    if link_el is not None:
        return (link_el.get("href") or "").strip()
    return ""


def parse_feed(feed_bytes: bytes, source: dict) -> list[Article]:
    """Parse an RSS 2.0 or Atom feed into normalized Article records.

    Items missing a title or URL are skipped.
    """
    root = ET.fromstring(feed_bytes)
    items = root.findall(".//item")  # RSS 2.0
    if not items:
        items = root.findall(f".//{ATOM_NS}entry")  # Atom fallback
    articles: list[Article] = []
    for item in items:
        title = strip_html(_first_text(item, "title", f"{ATOM_NS}title"))
        url = _extract_link(item)
        summary = strip_html(_first_text(
            item, "description", f"{ATOM_NS}summary", f"{ATOM_NS}content"))
        published_at = parse_datetime(_first_text(
            item, "pubDate", f"{ATOM_NS}published", f"{ATOM_NS}updated"))
        if not title or not url:
            continue
        articles.append(
            Article(
                source_id=source["id"],
                source_name=source["name"],
                source_weight=float(source.get("weight", 1.0)),
                category=source.get("category", "general"),
                title=title,
                url=url,
                summary=summary,
                published_at=published_at,
            )
        )
    return articles


def normalize(value: str) -> str:
    """Lowercase and reduce to single-space-separated alphanumeric runs."""
    return re.sub(r"[^a-z0-9]+", " ", value.lower()).strip()


def recency_score(published_at: dt.datetime | None, half_life_hours: float) -> float:
    """Exponential freshness decay in (0, 1]; undated articles get a flat 0.2."""
    if not published_at:
        return 0.2
    age_hours = max((dt.datetime.now(dt.timezone.utc) - published_at).total_seconds() / 3600, 0)
    return math.exp(-math.log(2) * age_hours / half_life_hours)


def collect_matches(text: str, mapping: dict[str, list[str]]) -> list[str]:
    """Return the mapping keys whose aliases occur (as substrings) in ``text``."""
    return [key for key, aliases in mapping.items() if any(alias in text for alias in aliases)]


def score_article(article: Article, scoring: dict,
                  token_aliases: dict[str, list[str]]) -> tuple[float, list[str], list[str]]:
    """Score one article; returns ``(score, matched_tokens, matched_topics)``.

    Components: source weight, per-token and per-topic match bonuses, a
    one-time title bonus when any keyword alias appears in the headline,
    negative-keyword penalties, and a recency term worth up to 20 points.
    """
    full_text = normalize(f"{article.title} {article.summary}")
    title_text = normalize(article.title)
    token_matches = collect_matches(full_text, token_aliases)
    topic_matches = collect_matches(full_text, scoring["topic_keywords"])
    negative_hits = sum(1 for keyword in scoring["negative_keywords"] if keyword in full_text)

    weights = scoring["weights"]
    score = article.source_weight * weights["source_weight_multiplier"]
    score += len(token_matches) * weights["token_match"]
    score += len(topic_matches) * weights["topic_match"]
    if token_matches or topic_matches:
        # title_text is a subset of full_text, so any alias found here
        # necessarily belongs to a matched token or topic.
        all_alias_lists = list(token_aliases.values()) + list(scoring["topic_keywords"].values())
        if any(alias in title_text for aliases in all_alias_lists for alias in aliases):
            score += weights["title_keyword_bonus"]
    score += negative_hits * weights["negative_keyword_penalty"]
    score += recency_score(article.published_at, scoring["defaults"]["recency_half_life_hours"]) * 20
    return score, token_matches, topic_matches


def is_duplicate(left: str, right: str, threshold: float) -> bool:
    """True when two normalized titles are at least ``threshold`` similar."""
    return SequenceMatcher(a=left, b=right).ratio() >= threshold


def format_output(articles: list[dict], fmt: str, pretty: bool) -> str:
    """Render ranked articles as JSON or markdown (with clickable links)."""
    if fmt == "json":
        return json.dumps(articles, ensure_ascii=False, indent=2 if pretty else None)

    # Markdown format with clickable links
    lines = ["# 📰 Coin News Digest\n"]
    for i, article in enumerate(articles, 1):
        title = article["title"]
        url = article["url"]
        source = article["source"]
        published = article.get("published_at", "")[:10] if article.get("published_at") else ""
        raw_summary = article.get("summary", "")
        summary = raw_summary[:200] + "..." if len(raw_summary) > 200 else raw_summary
        tokens = ", ".join(article.get("matched_tokens", []))
        topics = ", ".join(article.get("matched_topics", []))

        lines.append(f"## {i}. [{title}]({url})")
        lines.append(f"**来源**: {source} | **时间**: {published} | **分数**: {article['score']}")
        if tokens:
            lines.append(f"**Token**: {tokens}")
        if topics:
            lines.append(f"**主题**: {topics}")
        lines.append(f"\n{summary}\n")
        lines.append("---\n")

    return "\n".join(lines)


def main() -> int:
    """Entry point: fetch, filter, score, dedupe, rank, and print articles."""
    args = parse_args()
    sources = load_yaml(SOURCES_PATH)
    scoring = load_yaml(SCORING_PATH)

    # Get token aliases (dynamic + static merge)
    token_aliases = get_token_aliases(args, scoring)

    all_articles: list[Article] = []
    for source in sources["sources"]:
        if not source.get("enabled", True):
            continue
        try:
            feed_bytes = fetch_url(
                source["url"],
                sources["defaults"]["user_agent"],
                int(sources["defaults"]["request_timeout_seconds"]),
            )
            all_articles.extend(
                parse_feed(feed_bytes, source)[: int(sources["defaults"]["max_articles_per_source"])]
            )
        except Exception as exc:
            # Best-effort: a single broken feed must not sink the whole digest.
            print(f"warning: failed to fetch {source['name']}: {exc}", file=sys.stderr)

    # --days takes precedence over --hours (documented in SKILL.md).
    lookback_hours = args.hours
    if args.days is not None:
        lookback_hours = max(args.days, 0) * 24
    cutoff = dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=lookback_hours)
    wanted_tokens = {token.upper() for token in args.token}
    wanted_topics = {topic.lower() for topic in args.topic}
    min_score = float(scoring["defaults"]["min_output_score"])
    duplicate_threshold = float(scoring["defaults"]["duplicate_title_similarity_threshold"])

    ranked = []
    seen_titles: list[str] = []
    for article in all_articles:
        # Undated articles pass the cutoff: they cannot be proven stale.
        if article.published_at and article.published_at < cutoff:
            continue
        score, token_matches, topic_matches = score_article(article, scoring, token_aliases)
        if wanted_tokens and not wanted_tokens.intersection(token_matches):
            continue
        if wanted_topics and not wanted_topics.intersection(topic_matches):
            continue
        if score < min_score:
            continue

        title_key = normalize(article.title)
        if any(is_duplicate(title_key, seen, duplicate_threshold) for seen in seen_titles):
            continue
        seen_titles.append(title_key)

        ranked.append(
            {
                "title": article.title,
                "url": article.url,
                "source": article.source_name,
                "source_id": article.source_id,
                "category": article.category,
                "published_at": article.published_at.isoformat() if article.published_at else None,
                "summary": article.summary,
                "score": round(score, 2),
                "matched_tokens": token_matches,
                "matched_topics": topic_matches,
                "duplicate_group_key": title_key,
            }
        )

    ranked.sort(key=lambda item: (item["score"], item["published_at"] or ""), reverse=True)
    result = ranked[: args.limit]

    if args.format in ("markdown", "md"):
        print(format_output(result, "markdown", args.pretty))
    else:
        json.dump(result, sys.stdout, ensure_ascii=False, indent=2 if args.pretty else None)
        if args.pretty:
            sys.stdout.write("\n")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())