From 3b05b21ad581b61acd595dd7c84d23c359f19d9a Mon Sep 17 00:00:00 2001 From: CJ Robinson Date: Fri, 20 Mar 2026 15:02:22 -0400 Subject: [PATCH] new hmong scraper --- scrapers/3hmongtv/page_analysis.json | 52 ++- scrapers/3hmongtv/scraper.py | 539 ++++++--------------------- scrapers/3hmongtv/seed.json | 2 +- 3 files changed, 156 insertions(+), 437 deletions(-) diff --git a/scrapers/3hmongtv/page_analysis.json b/scrapers/3hmongtv/page_analysis.json index b2bda42..4117118 100644 --- a/scrapers/3hmongtv/page_analysis.json +++ b/scrapers/3hmongtv/page_analysis.json @@ -1,12 +1,48 @@ { - "item_selectors": [], + "item_selectors": [ + "a[href^=\"/stories/\"]", + ".rounded-lg.border.bg-card" + ], "next_page_selectors": [], - "item_examples": {}, + "item_examples": { + "a[href^=\"/stories/\"]": [ + "
\"Federal
\"FederalFederal Judge Finds ICE Agents Likely Engaged in Racial Profiling During Minnesota Operations" + ], + "h3.font-semibold": [ + "

Federal Judge Finds ICE Agents Likely Engaged in Racial Profiling During Minnesota Operations

" + ] + }, + "date_selectors": [ + "a[href^=\"/stories/\"] .flex.items-center.justify-between span:last-child", + ".flex.items-center.justify-between span:last-child" + ], + "date_examples": { + "a[href^=\"/stories/\"] .flex.items-center.justify-between span:last-child": [ + "March 16, 2026" + ], + ".flex.items-center.justify-between span:last-child": [ + "March 16, 2026" + ] + }, + "url_selectors": [ + "a[href^=\"/stories/\"]" + ], + "url_examples": { + "a[href^=\"/stories/\"]": [ + "
\"Federal List[dict]: +async def scrape_page(page): """ Extract article data from the current page. @@ -127,320 +39,117 @@ async def scrape_page(page) -> List[dict]: List of dictionaries containing article data with keys: - title: Headline or title of the article - date: Publication date in YYYY-MM-DD format or None - - url: Link to the full article + - url: Absolute URL to the full article - scraper: module path for traceability """ items = [] - seen_urls = set() - # ensure page has settled a bit + # Use anchor elements that link to story pages as article containers. + # This selector is robust based on the provided examples. try: - await page.wait_for_load_state('networkidle', timeout=10000) + anchors = await page.query_selector_all('a[href^="/stories/"]') except Exception: - pass + anchors = [] - # Find candidate containers using a broad list of selectors to be resilient. - try: - containers = await page.query_selector_all(COMBINED_ITEM_SELECTOR) - except Exception: - containers = [] - - # If no containers found, fall back to selecting items that contain links in article listing regions: - if not containers: + for a in anchors: try: - containers = await page.query_selector_all("main, .content, .site-content, #content, .archive, .blog") - # if that yields containers, we'll search anchors inside them later - except Exception: - containers = [] + # Extract URL and resolve to absolute + href = await a.get_attribute('href') + if not href: + continue + url = urllib.parse.urljoin(base_url, href.strip()) + + # Extract title using h3 inside the anchor; use text_content() per instructions + title_el = await a.query_selector('h3') + title = None + if title_el: + raw_title = await title_el.text_content() + if raw_title: + title = raw_title.strip() + + # Title and URL are required; if title missing, skip this item + if not title or not url: + continue - # Primary pass: if we have container elements, extract items from them - if containers: - for el in containers: + # Extract date if present. Based on examples, date sits in a .flex... span:last-child inside the anchor. + date = None try: - # Attempt to find a title anchor inside the container using common heading selectors - title_el = await el.query_selector("h1 a, h2 a, h3 a, .entry-title a, .post-title a, a[rel='bookmark'], a.title, .title a") - - # If no heading anchor, try the first anchor with visible text - if not title_el: - anchors = await el.query_selector_all("a[href]") - title_el = None - for a in anchors: - txt = await _safe_text(a) - href = await _safe_attr(a, "href") - if txt and href and len(txt) > 3: - title_el = a - break - - title = await _safe_text(title_el) if title_el else None - - # If title still missing, try aria-label or title attribute - if not title and title_el: - title = (await _safe_attr(title_el, "aria-label")) or (await _safe_attr(title_el, "title")) - if title: - title = title.strip() - - # Extract URL - prioritize href from title anchor - url = None - if title_el: - href = await _safe_attr(title_el, "href") - if href: - url = urllib.parse.urljoin(base_url, href.strip()) - - # If still no URL, try first anchor in container - if not url: - first_anchor = await el.query_selector("a[href]") - if first_anchor: - href = await _safe_attr(first_anchor, "href") - if href: - url = urllib.parse.urljoin(base_url, href.strip()) - - # Normalize url (remove fragments) - if url: - try: - parsed = urllib.parse.urlparse(url) - url = urllib.parse.urlunparse(parsed._replace(fragment="")) - except Exception: - pass - - # Extract date: prefer