diff --git a/.github/workflows/install-mcp-uptime.yml b/.github/workflows/install-mcp-uptime.yml new file mode 100644 index 000000000..be9bedaed --- /dev/null +++ b/.github/workflows/install-mcp-uptime.yml @@ -0,0 +1,126 @@ +name: Install MCP Uptime Monitoring + +on: + workflow_dispatch: + inputs: + mcp_url: + description: 'MCP tools/list URL' + required: false + default: 'https://mcp.buywhere.ai/health' + dry_run: + description: 'If true, validate steps without making changes' + required: false + default: 'false' + +permissions: + contents: read + +jobs: + install-mcp-uptime: + name: Install MCP uptime monitoring on production VM + runs-on: ubuntu-latest + environment: production + steps: + - uses: actions/checkout@v4 + + - name: Set up SSH + uses: webfactory/ssh-agent@v0.9.0 + with: + ssh-private-key: ${{ secrets.PRODUCTION_DEPLOY_SSH_KEY }} + + - name: Trust production host + run: | + mkdir -p ~/.ssh + ssh-keyscan -p "${{ secrets.PRODUCTION_DEPLOY_PORT || 22 }}" -H "${{ secrets.PRODUCTION_DEPLOY_HOST }}" >> ~/.ssh/known_hosts + + - name: Upload scripts to VM + env: + SSH_HOST: ${{ secrets.PRODUCTION_DEPLOY_HOST }} + SSH_PORT: ${{ secrets.PRODUCTION_DEPLOY_PORT || 22 }} + SSH_USER: ${{ secrets.PRODUCTION_DEPLOY_USER }} + run: | + REMOTE_TMP="/tmp/mcp-uptime-install" + ssh -p "$SSH_PORT" "$SSH_USER@$SSH_HOST" "mkdir -p $REMOTE_TMP" + for f in scripts/setup-mcp-uptime-monitoring.sh scripts/check-mcp-uptime.sh scripts/report-mcp-uptime.sh scripts/mcp-uptime-dashboard.html; do + scp -P "$SSH_PORT" "$f" "$SSH_USER@$SSH_HOST:$REMOTE_TMP/" + done + + - name: Run setup script + env: + SSH_HOST: ${{ secrets.PRODUCTION_DEPLOY_HOST }} + SSH_PORT: ${{ secrets.PRODUCTION_DEPLOY_PORT || 22 }} + SSH_USER: ${{ secrets.PRODUCTION_DEPLOY_USER }} + MCP_URL: ${{ github.event.inputs.mcp_url }} + DRY_RUN: ${{ github.event.inputs.dry_run }} + run: | + if [ "$DRY_RUN" = "true" ]; then + echo "DRY RUN would execute:" + echo " ssh -p $SSH_PORT $SSH_USER@$SSH_HOST 'cd 
/tmp/mcp-uptime-install && ./setup-mcp-uptime-monitoring.sh $MCP_URL'" + exit 0 + fi + ssh -p "$SSH_PORT" "$SSH_USER@$SSH_HOST" \ + "cd /tmp/mcp-uptime-install && chmod +x *.sh && ./setup-mcp-uptime-monitoring.sh '${MCP_URL}'" + + - name: Install nginx config and reload + continue-on-error: true + env: + SSH_HOST: ${{ secrets.PRODUCTION_DEPLOY_HOST }} + SSH_PORT: ${{ secrets.PRODUCTION_DEPLOY_PORT || 22 }} + SSH_USER: ${{ secrets.PRODUCTION_DEPLOY_USER }} + run: | + ssh -p "$SSH_PORT" "$SSH_USER@$SSH_HOST" bash -s <<'REMOTE' + set -euo pipefail + WEB_ROOT="$(echo $HOME)/mcp-uptime/www" + NGINX_CONF="/etc/nginx/sites-enabled/mcp-uptime.conf" + + if [ -f "$NGINX_CONF" ]; then + echo "nginx config already exists — skipping write" + else + CONTENT="location /mcp-uptime { + alias ${WEB_ROOT}; + index index.html; + add_header Cache-Control \"no-cache, max-age=0\"; + add_header X-Frame-Options \"SAMEORIGIN\"; + }" + TMPFILE=$(mktemp) + echo "$CONTENT" > "$TMPFILE" + cp "$TMPFILE" "$NGINX_CONF" 2>/dev/null \ + || sudo -n cp "$TMPFILE" "$NGINX_CONF" 2>/dev/null \ + || sudo cp "$TMPFILE" "$NGINX_CONF" 2>/dev/null \ + || echo "$CONTENT" | sudo -n tee "$NGINX_CONF" > /dev/null 2>/dev/null \ + || echo "$CONTENT" | sudo tee "$NGINX_CONF" > /dev/null 2>/dev/null \ + || echo "WARNING: could not write nginx config — add manually" + rm -f "$TMPFILE" + [ -f "$NGINX_CONF" ] && echo "nginx config written to $NGINX_CONF" + fi + + nginx -t 2>&1 || sudo -n nginx -t 2>&1 || sudo nginx -t 2>&1 || { echo "ERROR: nginx config test failed"; exit 1; } + echo "nginx config syntax OK" + + nginx -s reload 2>/dev/null \ + || sudo -n nginx -s reload 2>/dev/null \ + || sudo nginx -s reload 2>/dev/null \ + || systemctl reload nginx 2>/dev/null \ + || sudo -n systemctl reload nginx 2>/dev/null \ + || echo "WARNING: nginx reload failed — reload manually" + echo "nginx reloaded" + REMOTE + + - name: Verify installation + run: | + echo "=== Verifying MCP uptime monitoring ===" + sleep 10 + HTTP=$(curl -s 
-o /dev/null -w "%{http_code}" \ + https://api.buywhere.ai/mcp-uptime/uptime.json 2>/dev/null || echo "000") + echo "GET /mcp-uptime/uptime.json -> HTTP ${HTTP}" + if [ "$HTTP" = "200" ]; then + echo "SUCCESS: Dashboard is live" + else + echo "WARNING: Dashboard returned ${HTTP} - may need a moment to generate first report" + fi + + - name: Summary + run: | + echo "=== MCP Uptime Monitoring Install Summary ===" + echo "Dashboard: https://api.buywhere.ai/mcp-uptime" + echo "Status: $(curl -s https://api.buywhere.ai/mcp-uptime/uptime.json | python3 -c 'import sys,json; d=json.load(sys.stdin); print(d.get("status","unknown"))' 2>/dev/null || echo 'pending')" diff --git a/api/dist/migrate.js b/api/dist/migrate.js index 735793e50..cb9067676 100644 --- a/api/dist/migrate.js +++ b/api/dist/migrate.js @@ -20,6 +20,31 @@ ALTER TABLE products ADD COLUMN IF NOT EXISTS country_code VARCHAR(2); ALTER TABLE products ADD COLUMN IF NOT EXISTS gtin VARCHAR(14); ALTER TABLE products ADD COLUMN IF NOT EXISTS mpn VARCHAR(100); +-- Unique constraint for ingest upsert (ON CONFLICT (sku, source)) -- BUY-10814 / BUY-10929 blocker +DO $$ +DECLARE dup_count BIGINT; +BEGIN + IF EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'products_sku_source_unique') THEN + RETURN; + END IF; + SELECT COUNT(*) INTO dup_count FROM ( + SELECT sku, source, COUNT(*) AS cnt FROM products + WHERE sku IS NOT NULL AND source IS NOT NULL + GROUP BY sku, source HAVING COUNT(*) > 1 + ) dups; + IF dup_count > 0 THEN + DELETE FROM products WHERE id IN ( + SELECT id FROM ( + SELECT id, ROW_NUMBER() OVER (PARTITION BY sku, source ORDER BY id DESC) AS rn + FROM products WHERE sku IS NOT NULL AND source IS NOT NULL + ) ranked WHERE rn > 1 + ); + END IF; + ALTER TABLE products ADD CONSTRAINT products_sku_source_unique UNIQUE (sku, source); +EXCEPTION WHEN OTHERS THEN + RAISE WARNING 'Could not create constraint: %', SQLERRM; +END $$; + -- Full-text search support on products table CREATE INDEX IF NOT EXISTS 
idx_products_search_vector ON products USING GIN(search_vector); diff --git a/api/dist/sentry.js b/api/dist/sentry.js index 440947a47..6627f6d9b 100644 --- a/api/dist/sentry.js +++ b/api/dist/sentry.js @@ -48,20 +48,16 @@ function initSentry() { dsn, environment: process.env.NODE_ENV || 'production', tracesSampleRate: 0.1, - enableTracing: true, }); console.log('[sentry] Error tracking initialized (env=%s)', process.env.NODE_ENV || 'production'); } function sentryRequestHandler(req, _res, next) { - if (Sentry.getCurrentHub?.()?.getScope?.()) { - const scope = Sentry.getCurrentHub().getScope(); - scope.setUser({ - ip_address: req.ip, - id: req.sessionId || undefined, - }); - scope.setExtra('country', req.query.country || req.body?.country || ''); - scope.setTag('method', req.method); - scope.setTag('path', req.path); - } + Sentry.setUser({ + ip_address: req.ip, + id: req.sessionId || undefined, + }); + Sentry.setExtra('country', req.query.country || req.body?.country || ''); + Sentry.setTag('method', req.method); + Sentry.setTag('path', req.path); next(); } diff --git a/api/src/migrate.ts b/api/src/migrate.ts index da1677cc8..7e9e5997e 100644 --- a/api/src/migrate.ts +++ b/api/src/migrate.ts @@ -18,6 +18,31 @@ ALTER TABLE products ADD COLUMN IF NOT EXISTS country_code VARCHAR(2); ALTER TABLE products ADD COLUMN IF NOT EXISTS gtin VARCHAR(14); ALTER TABLE products ADD COLUMN IF NOT EXISTS mpn VARCHAR(100); +-- Unique constraint for ingest upsert (ON CONFLICT (sku, source)) -- BUY-10814 / BUY-10929 blocker +DO $$ +DECLARE dup_count BIGINT; +BEGIN + IF EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'products_sku_source_unique') THEN + RETURN; + END IF; + SELECT COUNT(*) INTO dup_count FROM ( + SELECT sku, source, COUNT(*) AS cnt FROM products + WHERE sku IS NOT NULL AND source IS NOT NULL + GROUP BY sku, source HAVING COUNT(*) > 1 + ) dups; + IF dup_count > 0 THEN + DELETE FROM products WHERE id IN ( + SELECT id FROM ( + SELECT id, ROW_NUMBER() OVER (PARTITION BY 
sku, source ORDER BY id DESC) AS rn + FROM products WHERE sku IS NOT NULL AND source IS NOT NULL + ) ranked WHERE rn > 1 + ); + END IF; + ALTER TABLE products ADD CONSTRAINT products_sku_source_unique UNIQUE (sku, source); +EXCEPTION WHEN OTHERS THEN + RAISE WARNING 'Could not create constraint: %', SQLERRM; +END $$; + -- Full-text search support on products table CREATE INDEX IF NOT EXISTS idx_products_search_vector ON products USING GIN(search_vector); diff --git a/api/src/routes/products.ts b/api/src/routes/products.ts index e0982875e..0aab06d2c 100644 --- a/api/src/routes/products.ts +++ b/api/src/routes/products.ts @@ -11,7 +11,9 @@ const SEARCH_CACHE_TTL_SECONDS = 60; const router = Router(); // GET /v1/products/search -// Query params: q, domain, region, country, category, min_price, max_price, currency, limit, offset, source_page +// Query params: q, domain, region, country, category, category_id, category_path, +// brand, merchant_id, availability, min_price, max_price, +// currency, limit, offset, page, fields, sort, sort_by, source_page, compact router.get( '/search', agentDetectMiddleware, @@ -24,6 +26,14 @@ router.get( const domain = req.query.domain as string | undefined; const region = req.query.region as string | undefined; const category = req.query.category as string | undefined; + const categoryId = req.query.category_id as string | undefined; + const categoryPath = (req.query.category_path as string) ? (req.query.category_path as string).split(',').map(p => p.trim()).filter(Boolean) : undefined; + const brand = req.query.brand as string | undefined; + const merchantId = req.query.merchant_id as string | undefined; + const availability = req.query.availability as string | undefined; + const rawFields = (req.query.fields as string) || undefined; + const fields = rawFields ? 
rawFields.split(',').map(f => f.trim()).filter(Boolean) : undefined; + const sort = ((req.query.sort || req.query.sort_by) as string) || undefined; // country_code is the canonical param; `country` is kept as a backward-compat alias. // Default to SG when neither country nor region is specified (BUY-6598: prevent cross-region accessory pollution). const explicitCountry = ((req.query.country_code as string | undefined) || (req.query.country as string | undefined))?.toUpperCase() || undefined; @@ -34,12 +44,14 @@ router.get( // Price filters (min_price/max_price) apply in this inferred currency. const currency = (req.query.currency as string) || (countryCode ? (COUNTRY_CURRENCY[countryCode] || 'SGD') : 'SGD'); const limit = Math.min(parseInt((req.query.limit as string) || '20'), 100); - const offset = parseInt((req.query.offset as string) || '0'); + const rawPage = parseInt((req.query.page as string) || '0'); + const rawOffset = parseInt((req.query.offset as string) || '0'); + const offset = rawPage > 0 ? (rawPage - 1) * limit : rawOffset; const sourcePage = req.query.source_page as string | undefined; const compact = req.query.compact === 'true'; // Check Redis cache for this exact query (60s TTL) - const cacheKey = `fts:${q}:${domain || ''}:${region || ''}:${countryCode || ''}:${category || ''}:${currency}:${minPrice ?? ''}:${maxPrice ?? ''}:${limit}:${offset}:${compact ? 'c' : 'f'}`; + const cacheKey = `fts:${q}:${domain || ''}:${region || ''}:${countryCode || ''}:${category || ''}:${categoryId || ''}:${categoryPath?.join(',') || ''}:${brand || ''}:${merchantId || ''}:${availability || ''}:${currency}:${minPrice ?? ''}:${maxPrice ?? ''}:${limit}:${offset}:${sort || ''}:${fields?.join(',') || ''}:${compact ? 
'c' : 'f'}`; try { const cached = await redis.get(cacheKey); if (cached) { @@ -86,6 +98,43 @@ router.get( params.push(`%${category}%`); idx++; } + if (brand) { + conditions.push(`brand ILIKE $${idx}`); + params.push(`%${brand}%`); + idx++; + } + if (availability) { + const avail = availability.toLowerCase(); + if (avail === 'in_stock') { + conditions.push(`(metadata->>'availability' = $${idx} OR (metadata->>'availability' IS NULL AND is_active = true))`); + params.push(avail); + idx++; + } else if (avail === 'out_of_stock') { + conditions.push(`(metadata->>'availability' = $${idx} OR (metadata->>'availability' IS NULL AND is_active = false))`); + params.push(avail); + idx++; + } else if (avail === 'preorder' || avail === 'discontinued') { + conditions.push(`metadata->>'availability' = $${idx}`); + params.push(avail); + idx++; + } + } + if (categoryId) { + conditions.push(`category_id = $${idx}`); + params.push(categoryId); + idx++; + } + if (categoryPath && categoryPath.length > 0) { + const pathPlaceholders = categoryPath.map((_, i) => `$${idx + i}`).join(','); + conditions.push(`category_path @> ARRAY[${pathPlaceholders}]::text[]`); + params.push(...categoryPath); + idx += categoryPath.length; + } + if (merchantId) { + conditions.push(`merchant_id = $${idx}`); + params.push(merchantId); + idx++; + } if (minPrice !== undefined) { conditions.push(`price >= $${idx}`); params.push(minPrice); @@ -108,36 +157,56 @@ router.get( const countResult = await db.query(countQuery, params.slice(0, idx - 1)); const approxCount = parseInt(countResult.rows[0].count, 10); + const VALID_SORT = new Set(['relevance', 'price_asc', 'price_desc', 'newest', 'highest_rated', 'most_reviewed']); + const effectiveSort = sort && VALID_SORT.has(sort) ? 
sort : undefined; + const useFtsRanking = (!effectiveSort || effectiveSort === 'relevance') && ftsParamIdx; + + // Build ORDER BY for non-fts-ranking path + function buildSortOrder(): string { + if (!effectiveSort || effectiveSort === 'relevance') return 'updated_at DESC'; + switch (effectiveSort) { + case 'price_asc': return 'price ASC, updated_at DESC'; + case 'price_desc': return 'price DESC, updated_at DESC'; + case 'newest': return 'updated_at DESC'; + case 'highest_rated': return 'avg_rating DESC NULLS LAST, updated_at DESC'; + case 'most_reviewed': return 'review_count DESC NULLS LAST, updated_at DESC'; + default: return 'updated_at DESC'; + } + } + // For large result sets (>1000 rows), computing ts_rank over all matches is expensive. // Instead, let the GIN index fetch up to CANDIDATE_LIMIT rows, rank those by ts_rank, // then return the top N. This gives relevance ordering at a fraction of the cost. // For small result sets (<= 1000 rows), ts_rank over all matches is fast. const CANDIDATE_LIMIT = Math.max(500, (limit + offset) * 10); + const specColumns = `created_at, description, brand, mpn, gtin, category_path, category, category_id, merchant_id, avg_rating, review_count`; let dataQuery: string; - if (ftsParamIdx && approxCount <= 1000) { - // Small result set: ts_rank over all matches is fast, gives best relevance + if (useFtsRanking && approxCount <= 1000) { dataQuery = ` SELECT id, sku AS source_id, source AS domain, url, + al.destination_url AS affiliate_url, title, price, currency, image_url, metadata, updated_at, - region, country_code + region, country_code, ${specColumns} FROM products + LEFT JOIN affiliate_links al ON al.product_id = products.id::text AND al.merchant_id = products.merchant_id ${whereClause} ORDER BY ts_rank(search_vector, plainto_tsquery('english', $${ftsParamIdx})) DESC, updated_at DESC LIMIT $${idx} OFFSET $${idx + 1} `; - } else if (ftsParamIdx) { - // Large result set: GIN index fetches CANDIDATE_LIMIT rows using bitmap 
scan, then ranks. - // No ORDER BY in the inner query — this lets PostgreSQL stop the heap scan after - // CANDIDATE_LIMIT rows (vs scanning all 25k+ matching rows to sort by rank first). - // 12x faster for broad queries (14ms vs 170ms for "headphones" on 2M product corpus). + } else if (useFtsRanking) { dataQuery = ` - SELECT id, source_id, domain, url, title, price, currency, image_url, metadata, updated_at, region, country_code + SELECT id, source_id, domain, url, + affiliate_url, + title, price, currency, image_url, metadata, updated_at, + region, country_code, ${specColumns} FROM ( SELECT id, sku AS source_id, source AS domain, url, + al.destination_url AS affiliate_url, title, price, currency, image_url, metadata, updated_at, - region, country_code, + region, country_code, ${specColumns}, ts_rank(search_vector, plainto_tsquery('english', $${ftsParamIdx})) AS rank FROM products + LEFT JOIN affiliate_links al ON al.product_id = products.id::text AND al.merchant_id = products.merchant_id ${whereClause} LIMIT ${CANDIDATE_LIMIT} ) _candidates @@ -145,14 +214,15 @@ router.get( LIMIT $${idx} OFFSET $${idx + 1} `; } else { - // No FTS query (e.g. 
filter-only) — sort by recency dataQuery = ` SELECT id, sku AS source_id, source AS domain, url, + al.destination_url AS affiliate_url, title, price, currency, image_url, metadata, updated_at, - region, country_code + region, country_code, ${specColumns} FROM products + LEFT JOIN affiliate_links al ON al.product_id = products.id::text AND al.merchant_id = products.merchant_id ${whereClause} - ORDER BY updated_at DESC + ORDER BY ${buildSortOrder()} LIMIT $${idx} OFFSET $${idx + 1} `; } @@ -164,11 +234,36 @@ router.get( const responseTimeMs = Date.now() - requestStart; const products = dataResult.rows.map((row) => - buildProduct(row as Record, currency, compact) + buildProduct(row as Record, currency, compact, true) ); + // Apply field selection if `fields` param is specified + let filteredProducts = products; + if (fields && fields.length > 0) { + const VALID_FIELDS = new Set([ + 'id', 'name', 'price', 'url', 'merchant', 'category', 'country', + 'ingested_at', 'updated_at', 'description', 'image_url', 'images', + 'brand', 'sku', 'mpn', 'gtin', 'availability', 'compare_at_price', + 'rating', 'title', 'country_code', 'region', + 'canonical_id', 'normalized_price_usd', 'structured_specs', + 'comparison_attributes', 'metadata', 'original_price', 'discount_pct', + ]); + const requested = fields.filter(f => VALID_FIELDS.has(f)); + if (requested.length > 0) { + filteredProducts = products.map(p => { + const picked: Record = {}; + for (const f of requested) { + if (f in (p as unknown as Record)) { + picked[f] = (p as unknown as Record)[f]; + } + } + return picked as unknown as typeof p; + }); + } + } + const responseBody = buildSearchResponse( - products, total, limit, offset, responseTimeMs, false + filteredProducts, total, limit, offset, responseTimeMs, false ); // Cache result in Redis (fire-and-forget) @@ -259,7 +354,8 @@ router.get( `SELECT id, sku AS source_id, source AS domain, url, title, price, (metadata->>'original_price')::numeric AS original_price, currency, 
image_url, metadata, updated_at, - region, country_code, + region, country_code, created_at, description, brand, mpn, gtin, + category_path, category, merchant_id, avg_rating, review_count, ROUND(((1 - price / NULLIF((metadata->>'original_price')::numeric, 0)) * 100)::numeric, 1) AS discount_pct FROM products WHERE ${dealWhere} @@ -562,7 +658,8 @@ router.get( result = await db.query( `SELECT id, sku AS source_id, source AS domain, url, title, price, currency, image_url, metadata, updated_at, - region, country_code, brand, category_path, avg_rating AS rating, review_count + region, country_code, created_at, description, brand, mpn, gtin, + category_path, category, merchant_id, avg_rating, review_count FROM products WHERE id = $1`, [id] ); @@ -777,13 +874,13 @@ router.post( } ); -function extractCategories(products: Array<{ domain?: string; merchant?: string; metadata?: Record | null }>): string[] { +function extractCategories(products: Array<{ domain?: string; merchant?: { id: string; name: string | null; domain: string }; metadata?: Record | null }>): string[] { const cats = new Set(); for (const p of products) { - const source = p.domain || p.merchant; + const source = p.domain || (p.merchant?.domain) || ''; if (source) { - const domain = source.replace('.sg', '').replace('.com', ''); - cats.add(domain); + const domainName = source.replace('.sg', '').replace('.com', ''); + cats.add(domainName); } if (p.metadata && typeof p.metadata === 'object') { const meta = p.metadata as Record; diff --git a/api/src/sentry.ts b/api/src/sentry.ts index 5dec4cbf2..6bbc72357 100644 --- a/api/src/sentry.ts +++ b/api/src/sentry.ts @@ -1,5 +1,5 @@ import * as Sentry from '@sentry/node'; -import type express from 'express'; +import type { Request, Response, NextFunction } from 'express'; export function initSentry() { const dsn = process.env.SENTRY_DSN; @@ -11,22 +11,18 @@ export function initSentry() { dsn, environment: process.env.NODE_ENV || 'production', tracesSampleRate: 0.1, - 
enableTracing: true, }); console.log('[sentry] Error tracking initialized (env=%s)', process.env.NODE_ENV || 'production'); } -export function sentryRequestHandler(req: express.Request, _res: express.Response, next: express.NextFunction) { - if (Sentry.getCurrentHub?.()?.getScope?.()) { - const scope = Sentry.getCurrentHub().getScope(); - scope.setUser({ - ip_address: req.ip, - id: (req as any).sessionId || undefined, - }); - scope.setExtra('country', (req.query.country as string) || (req.body?.country as string) || ''); - scope.setTag('method', req.method); - scope.setTag('path', req.path); - } +export function sentryRequestHandler(req: Request, _res: Response, next: NextFunction) { + Sentry.setUser({ + ip_address: req.ip, + id: (req as any).sessionId || undefined, + }); + Sentry.setExtra('country', (req.query.country as string) || (req.body?.country as string) || ''); + Sentry.setTag('method', req.method); + Sentry.setTag('path', req.path); next(); } diff --git a/deploy/nginx/buywhere.ai.conf b/deploy/nginx/buywhere.ai.conf index 5b10f7c44..4f23fc7de 100644 --- a/deploy/nginx/buywhere.ai.conf +++ b/deploy/nginx/buywhere.ai.conf @@ -97,6 +97,37 @@ server { return 308 https://api.buywhere.ai/openapi.json; } + # ── LLM discovery files: serve directly from Cloud Run ─────────────────────── + location = /llms.txt { + proxy_pass https://buywhere-site-production-3cjo6zft4q-as.a.run.app; + proxy_ssl_server_name on; + proxy_ssl_name buywhere-site-production-3cjo6zft4q-as.a.run.app; + proxy_set_header Host buywhere-site-production-3cjo6zft4q-as.a.run.app; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_http_version 1.1; + proxy_set_header Connection ""; + proxy_connect_timeout 10s; + proxy_read_timeout 30s; + add_header Cache-Control "public, max-age=3600" always; + } + + location = /llms-full.txt { + proxy_pass 
https://buywhere-site-production-3cjo6zft4q-as.a.run.app; + proxy_ssl_server_name on; + proxy_ssl_name buywhere-site-production-3cjo6zft4q-as.a.run.app; + proxy_set_header Host buywhere-site-production-3cjo6zft4q-as.a.run.app; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_http_version 1.1; + proxy_set_header Connection ""; + proxy_connect_timeout 10s; + proxy_read_timeout 30s; + add_header Cache-Control "public, max-age=3600" always; + } + # ── Default: proxy everything else to Cloud Run ─────────────────────────────── location / { proxy_pass https://buywhere-site-production-3cjo6zft4q-as.a.run.app; diff --git a/scrapers/amazon_us.py b/scrapers/amazon_us.py new file mode 100644 index 000000000..3d6cc5ca9 --- /dev/null +++ b/scrapers/amazon_us.py @@ -0,0 +1,1092 @@ +""" +Amazon US product scraper. + +Scrapes product search results from Amazon.com and outputs structured JSON +matching the BuyWhere catalog schema for ingestion via /v1/ingest/products. + +Usage: + python -m scrapers.amazon_us --api-key [--batch-size 100] [--delay 1.5] + python -m scrapers.amazon_us --scrape-only [--session-file session.json] + +Categories covered: Electronics, Computers, Cell Phones, Home, Kitchen, Tools, +Sports, Apparel (Men/Women), Beauty, Health, Baby, Toys, Video Games, Books, +Automotive, Pet Supplies, Office, Grocery, Arts & Crafts, Musical Instruments, +Appliances, Outdoor Living, Luggage, Jewelry, Movies & Music, Industrial. 
+Target: 500,000+ products +""" +import argparse +import asyncio +import json +import os +import re +import time +import urllib.parse +from typing import Any +from urllib.parse import urljoin + +import httpx +from bs4 import BeautifulSoup + +from scrapers.scraper_registry import register + +MERCHANT_ID = "amazon_us" +SOURCE = "amazon_us" +BASE_URL = "https://www.amazon.com" +OUTPUT_DIR = "/home/paperclip/buywhere-api/data/amazon_us" + +HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Referer": "https://www.amazon.com/", +} + +RATE_LIMIT_WAIT = 30 +MAX_RETRIES = 5 + +CATEGORIES: list[dict[str, Any]] = [ + { + "id": "electronics", + "name": "Electronics", + "keywords": [ + "laptop", "gaming laptop", "ultrabook", "Chromebook", "MacBook Air", "MacBook Pro", + "smartphone", "iPhone", "Samsung Galaxy", "Google Pixel", "tablet", "iPad", + "headphones", "wireless earbuds", "Bluetooth speaker", "soundbar", + "smart watch", "Apple Watch", "Samsung Watch", "Fitbit", + "monitor", "4K monitor", "gaming monitor", "ultrawide monitor", + "keyboard", "mechanical keyboard", "wireless keyboard", + "mouse", "gaming mouse", "wireless mouse", + "webcam", "USB hub", "power bank", "wireless charger", + "SSD", "external hard drive", "USB flash drive", "memory card", + "WiFi router", "mesh WiFi", "network switch", + "projector", "home theater", "streaming device", "Roku", "Fire TV", + "digital camera", "DSLR", "mirrorless camera", "action camera", "GoPro", + "drone", "camera lens", "tripod", + "smart home", "smart speaker", "smart display", "smart plug", + "smart light bulb", "smart thermostat", "smart lock", "video doorbell", + "security camera", "baby monitor", "Echo Dot", "Nest Hub", + "TV", "OLED TV", "QLED TV", "4K TV", "smart TV", + "e-reader", "Kindle", "tablet 
case", "screen protector", + "car electronics", "dash cam", "GPS navigator", "car charger", + "VR headset", "Meta Quest", "Apple Vision Pro", + ], + }, + { + "id": "computers", + "name": "Computers & Accessories", + "keywords": [ + "desktop computer", "all-in-one PC", "gaming PC", "mini PC", "workstation", + "business laptop", "2-in-1 laptop", "iPad Pro", "Surface Pro", "Chromebook", + "RAM", "DDR5", "graphics card", "RTX", "Radeon", "CPU", "Intel Core", "AMD Ryzen", + "motherboard", "power supply", "PC case", "CPU cooler", + "monitor arm", "laptop stand", "laptop bag", "laptop sleeve", + "docking station", "USB-C hub", "HDMI cable", "DisplayPort cable", + "printer", "laser printer", "inkjet printer", "all-in-one printer", + "scanner", "label maker", "3D printer", "3D printer filament", + "NAS", "server", "rack mount", "KVM switch", + "tablet mount", "car mount", "desk mount", + "ergonomic keyboard", "vertical mouse", "wrist rest", + "laptop charger", "power adapter", "battery backup", "UPS", + "drawing tablet", "pen display", "stylus", + "blue light glasses", "computer glasses", + ], + }, + { + "id": "cell_phones", + "name": "Cell Phones & Accessories", + "keywords": [ + "iPhone 16", "iPhone 15", "iPhone 14", "iPhone SE", + "Samsung Galaxy S", "Samsung Galaxy Z Flip", "Samsung Galaxy Z Fold", + "Google Pixel 9", "Google Pixel 8", "OnePlus", "Motorola", + "phone case", "iPhone case", "Samsung case", "screen protector", + "phone charger", "fast charger", "car charger", "wireless charger pad", + "power bank", "phone stand", "pop socket", "phone grip", + "Bluetooth headset", "earphone", "wired earbuds", + "selfie stick", "phone tripod", "phone lens", + "SIM card", "eSIM", "prepaid phone", + "phone cable", "USB cable", "Lightning cable", "USB-C cable", + "phone mount", "car phone mount", "magnetic phone mount", + "phone wallet", "phone ring holder", "phone lanyard", + "tablet case", "iPad case", "iPad keyboard case", + ], + }, + { + "id": "home_kitchen", + "name": 
"Home & Kitchen", + "keywords": [ + "air fryer", "Instant Pot", "slow cooker", "rice cooker", "pressure cooker", + "coffee maker", "espresso machine", "Keurig", "Nespresso", "French press", + "toaster", "toaster oven", "air fryer oven", "microwave", + "blender", "food processor", "stand mixer", "hand mixer", "immersion blender", + "knife set", "cooking knife", "cutting board", "cookware set", + "nonstick pan", "cast iron skillet", "stainless steel pan", + "baking sheet", "cake pan", "muffin pan", "cooling rack", + "kitchen towels", "oven mitt", "apron", "dish rack", + "food storage", "meal prep container", "water bottle", "lunch box", + "dinnerware set", "plate set", "glassware", "wine glass", "mug", + "flatware set", "silverware", "utensil set", + "bed sheets", "comforter", "duvet cover", "pillow", "pillowcase", + "blanket", "throw blanket", "quilt", "bedspread", + "mattress topper", "mattress pad", "mattress protector", + "towel set", "bath towel", "hand towel", "washcloth", "bath mat", + "shower curtain", "shower caddy", "toilet brush", "plunger", + "curtains", "blackout curtains", "sheer curtains", "drapery", + "rug", "area rug", "runner rug", "doormat", + "lamp", "table lamp", "floor lamp", "desk lamp", "bedside lamp", + "wall art", "canvas print", "poster", "wall decor", "wall clock", + "vase", "candle holder", "decorative pillow", "throw pillow", + "storage bin", "storage basket", "shelf", "closet organizer", + "clothes hanger", "shoe rack", "jewelry box", "jewelry organizer", + "desk organizer", "file cabinet", "letter tray", + "trash can", "kitchen trash can", "bathroom trash can", + "step stool", "folding stool", "utility cart", + "ironing board", "clothes steamer", "iron", "garment rack", + ], + }, + { + "id": "patio_garden", + "name": "Patio, Lawn & Garden", + "keywords": [ + "patio furniture set", "outdoor dining set", "patio chair", "Adirondack chair", + "outdoor table", "patio umbrella", "market umbrella", "cantilever umbrella", + "grill", "gas 
grill", "charcoal grill", "pellet grill", "smoker", + "lawn mower", "push mower", "riding mower", "robot mower", + "leaf blower", "string trimmer", "weed eater", "chainsaw", + "pressure washer", "garden hose", "hose reel", "sprinkler", + "flower pot", "planter", "garden bed", "raised bed", + "seeds", "plant seeds", "fertilizer", "potting soil", "compost", + "garden tool set", "shovel", "rake", "hoe", "pruning shears", + "bird feeder", "bird bath", "bird house", + "outdoor lighting", "solar lights", "string lights", "path lights", + "fire pit", "outdoor heater", "chimenea", "campfire pit", + "outdoor storage", "shed", "deck box", "storage bench", + "pool float", "pool toy", "pool cleaner", "pool cover", + "trampoline", "playset", "swing set", "sandbox", + "pest control", "bug zapper", "mosquito repellent", "traps", + ], + }, + { + "id": "tools", + "name": "Tools & Home Improvement", + "keywords": [ + "power drill", "cordless drill", "impact driver", "hammer drill", + "circular saw", "miter saw", "table saw", "jigsaw", "reciprocating saw", + "sander", "orbital sander", "belt sander", "angle grinder", + "screwdriver set", "wrench set", "socket set", "ratchet set", + "hammer", "tape measure", "level", "stud finder", + "tool box", "tool bag", "tool chest", "tool cabinet", + "workbench", "work table", "tool organizer", + "ladder", "step ladder", "extension ladder", "multi-purpose ladder", + "paint sprayer", "paint roller", "paint brush", "painter's tape", + "paint", "wall paint", "wood stain", "primer", + "plumbing", "pipe wrench", "plunger", "snake", "faucet", + "electrical", "wire stripper", "voltage tester", "outlet", "switch", + "door lock", "deadbolt", "smart lock", "door knob", + "cabinet hardware", "drawer pull", "cabinet knob", "hinge", + "wall shelf", "floating shelf", "wall bracket", "pegboard", + "caulk gun", "utility knife", "glue gun", "clamp", + "air compressor", "nail gun", "stapler", "generator", + "safety glasses", "work gloves", "dust mask", "ear 
protection", + "shop vacuum", "wet dry vacuum", "garage storage", "shelving unit", + ], + }, + { + "id": "sports", + "name": "Sports & Outdoors", + "keywords": [ + "running shoes", "men running shoes", "women running shoes", "trail running", + "yoga mat", "yoga block", "yoga strap", "exercise ball", + "dumbbell set", "kettlebell", "barbell", "weight plate", "weight bench", + "resistance band", "pull up bar", "push up stand", "ab roller", + "treadmill", "exercise bike", "elliptical", "row machine", + "stationary bike", "Peloton", "spin bike", "indoor cycle", + "jump rope", "foam roller", "massage gun", "fitness tracker", + "protein powder", "pre workout", "BCAA", "creatine", "protein bar", + "bike", "mountain bike", "road bike", "hybrid bike", "electric bike", + "bike helmet", "bike lock", "bike light", "bike pump", + "scooter", "electric scooter", "hoverboard", "skateboard", + "tent", "camping tent", "family tent", "backpacking tent", + "sleeping bag", "camping pad", "camping pillow", "air mattress", + "camping chair", "camping table", "camping stove", "camping lantern", + "cooler", "ice chest", "portable cooler", "camping cooler", + "backpack", "hiking backpack", "daypack", "hydration pack", + "hiking boots", "hiking shoes", "hiking socks", "trekking poles", + "fishing rod", "fishing reel", "fishing tackle", "fishing lure", + "golf club set", "golf driver", "golf iron", "golf putter", + "golf ball", "golf bag", "golf glove", "golf rangefinder", + "tennis racket", "tennis ball", "pickleball paddle", "pickleball set", + "soccer ball", "basketball", "football", "volleyball", + "baseball bat", "baseball glove", "softball", "hockey stick", + "swim goggles", "swim cap", "swim fins", "kickboard", + "wetsuit", "rash guard", "board shorts", + "ski goggles", "snowboard", "snowboard boots", "ski helmet", + "boxing gloves", "boxing bag", "mma gloves", "shin guards", + ], + }, + { + "id": "apparel_men", + "name": "Men's Clothing", + "keywords": [ + "men t shirt", "men polo 
shirt", "men button down", "men dress shirt", + "men jeans", "men chinos", "men shorts", "men cargo shorts", + "men suit", "men blazer", "men sport coat", "men vest", + "men jacket", "men winter coat", "men rain jacket", "men puffer jacket", + "men hoodie", "men sweatshirt", "men sweater", "men cardigan", + "men activewear", "men gym shorts", "men joggers", "men compression shirt", + "men underwear", "men boxers", "men briefs", "men undershirt", + "men socks", "men dress socks", "men athletic socks", + "men shoes", "men dress shoes", "men casual shoes", "men boots", + "men sneakers", "men sandals", "men loafers", "men oxfords", + "men hat", "men baseball cap", "men beanie", "men fedora", + "men belt", "men wallet", "men watch", "men tie", + "men swim trunks", "men swimwear", "men board shorts", + "men pajama", "men robe", "men slippers", + "men big and tall", "men plus size", + ], + }, + { + "id": "apparel_women", + "name": "Women's Clothing", + "keywords": [ + "women dress", "women maxi dress", "women cocktail dress", "women sundress", + "women top", "women blouse", "women tank top", "women crop top", + "women jeans", "women skinny jeans", "women bootcut jeans", "women wide leg", + "women leggings", "women yoga pants", "women shorts", + "women skirt", "women maxi skirt", "women mini skirt", "women pencil skirt", + "women jacket", "women blazer", "women winter coat", "women raincoat", + "women hoodie", "women sweatshirt", "women sweater", "women cardigan", + "women activewear", "women sports bra", "women gym leggings", + "women swimsuit", "women bikini", "women one piece", "women cover up", + "women lingerie", "women bra", "women panties", "women shapewear", + "women socks", "women tights", "women hosiery", + "women shoes", "women heels", "women flats", "women sandals", + "women boots", "women sneakers", "women wedges", "women loafers", + "women handbag", "women crossbody bag", "women tote bag", "women clutch", + "women backpack", "women wallet", "women wristlet", 
+ "women jewelry", "women necklace", "women earring", "women bracelet", + "women ring", "women watch", "women anklet", + "women hat", "women scarf", "women belt", "women sunglasses", + "women pajama", "women robe", "women slippers", + "women plus size", "women petite", "women maternity", + ], + }, + { + "id": "beauty", + "name": "Beauty & Personal Care", + "keywords": [ + "moisturizer", "face cream", "serum", "vitamin C serum", "retinol", + "sunscreen", "face sunscreen", "body sunscreen", "SPF 50", + "cleanser", "face wash", "makeup remover", "micellar water", + "toner", "face mist", "essence", "face oil", + "eye cream", "eye serum", "under eye patches", + "foundation", "concealer", "powder", "blush", "bronzer", + "eyeshadow", "eyeliner", "mascara", "eyebrow pencil", + "lipstick", "lip gloss", "lip liner", "lip balm", + "nail polish", "gel nail polish", "nail art", "nail file", + "shampoo", "conditioner", "hair mask", "hair oil", + "hair styling", "hairspray", "hair gel", "hair mousse", + "hair dryer", "flat iron", "curling iron", "hair brush", + "beard trimmer", "hair clipper", "electric shaver", "razor", + "deodorant", "body wash", "body lotion", "body oil", + "hand cream", "foot cream", "cologne", "perfume", + "essential oil", "diffuser", "aromatherapy", + "makeup brush set", "beauty sponge", "makeup bag", + "manicure set", "pedicure set", "nail clipper", + "toothbrush", "electric toothbrush", "toothpaste", "whitening", + "mouthwash", "dental floss", "water flosser", + ], + }, + { + "id": "health", + "name": "Health & Household", + "keywords": [ + "vitamins", "multivitamin", "vitamin D", "vitamin C", "omega 3", + "supplements", "probiotic", "collagen", "magnesium", "zinc", + "cold medicine", "allergy medicine", "pain relief", "ibuprofen", + "first aid kit", "bandage", "thermometer", "blood pressure monitor", + "pulse oximeter", "glucose monitor", "nebulizer", + "heating pad", "ice pack", "massager", "neck massager", + "face mask", "KN95", "N95", "surgical mask", 
+ "hand sanitizer", "disinfectant wipes", "disinfectant spray", + "air purifier", "humidifier", "dehumidifier", "fan", + "water filter", "water pitcher", "water bottle filter", + "paper towel", "toilet paper", "tissue box", "napkin", + "laundry detergent", "fabric softener", "stain remover", + "dish soap", "dishwasher detergent", "sponge", "scrub brush", + "all purpose cleaner", "glass cleaner", "bathroom cleaner", + "trash bag", "ziploc bag", "aluminum foil", "plastic wrap", + "batteries", "AA battery", "AAA battery", "rechargeable battery", + "light bulb", "LED bulb", "smart bulb", + "food storage container", "glass container", "lunch bag", + "insect repellent", "mouse trap", "cockroach bait", + ], + }, + { + "id": "baby", + "name": "Baby", + "keywords": [ + "baby diaper", "baby wipes", "diaper pail", "diaper cream", + "baby formula", "baby food", "baby bottle", "baby sippy cup", + "baby pacifier", "teether", "baby bib", "burp cloth", + "baby clothes", "baby onesie", "baby sleeper", "baby swaddle", + "baby shoes", "baby socks", "baby hat", "baby mittens", + "baby car seat", "infant car seat", "convertible car seat", "booster seat", + "baby stroller", "umbrella stroller", "double stroller", "jogging stroller", + "baby carrier", "baby wrap", "baby sling", "backpack carrier", + "baby monitor", "video baby monitor", "audio baby monitor", + "crib", "baby bassinet", "playpen", "baby crib mattress", + "baby bouncer", "baby swing", "baby rocker", "baby play mat", + "baby gate", "baby proofing", "cabinet lock", "corner guard", + "high chair", "baby booster seat", "baby feeding set", + "baby bath tub", "baby towel", "baby wash", "baby lotion", + "toddler bed", "toddler pillow", "toddler blanket", + "potty training", "potty chair", "potty seat", + "nursery decor", "baby mobile", "night light", "baby lamp", + ], + }, + { + "id": "toys", + "name": "Toys & Games", + "keywords": [ + "LEGO set", "LEGO City", "LEGO Technic", "LEGO Star Wars", + "building blocks", "magnetic 
tiles", "construction set", + "action figure", "superhero figure", "doll", "Barbie", "dollhouse", + "stuffed animal", "plush toy", "squishmallow", "teddy bear", + "board game", "Monopoly", "Sorry", "Clue", "Scrabble", + "card game", "Uno", "Phase 10", "Skip-Bo", + "puzzle", "jigsaw puzzle", "floor puzzle", "wooden puzzle", + "educational toy", "STEM toy", "science kit", "robot kit", + "arts and crafts", "coloring book", "crayon", "paint set", + "play dough", "slime kit", "modeling clay", + "remote control car", "RC car", "RC truck", "RC helicopter", + "train set", "wooden train", "toy train", "model train", + "baby toy", "rattle", "activity toy", "stacking toy", + "pretend play", "play kitchen", "tool set", "doctor kit", + "outdoor toy", "bubble machine", "water gun", "kite", + "kids bike", "balance bike", "tricycle", "scooter", + "kids game", "kids craft", "party favor", + "fidget toy", "pop it", "fidget spinner", "stress ball", + "card game TCG", "Pokemon card", "Magic The Gathering", "Yu-Gi-Oh", + ], + }, + { + "id": "video_games", + "name": "Video Games", + "keywords": [ + "PS5", "PlayStation 5", "PS5 controller", "PS5 headset", + "Xbox Series X", "Xbox Series S", "Xbox controller", "Xbox headset", + "Nintendo Switch", "Switch OLED", "Switch Lite", "Joy Con", + "PS5 game", "PlayStation game", "Xbox game", "Nintendo Switch game", + "PC game", "Steam card", "Xbox Game Pass", "PlayStation Plus", + "gaming chair", "gaming desk", "gaming headset", "gaming keyboard", + "gaming mouse", "mouse pad", "gaming monitor", + "PC controller", "racing wheel", "flight stick", "arcade stick", + "capture card", "streaming mic", "streaming camera", + "VR", "Meta Quest 3", "PlayStation VR", "VR accessories", + "Nintendo", "Mario", "Zelda", "Pokemon", + "Call of Duty", "Madden", "FIFA", "NBA 2K", + "Minecraft", "Fortnite", "Roblox", "GTA", + "retro gaming", "Nintendo Classic", "Sega Genesis", + "gaming laptop", "gaming PC", "gaming controller charger", + ], + }, + { + "id": "books", 
+ "name": "Books", + "keywords": [ + "fiction books", "fantasy books", "science fiction", "mystery novel", "romance novel", + "thriller book", "horror book", "historical fiction", "literary fiction", + "nonfiction", "biography", "memoir", "history book", + "self help book", "business book", "personal finance", "investing", + "cookbook", "recipe book", "baking cookbook", + "children book", "picture book", "chapter book", "young adult", + "textbook", "study guide", "test prep", "SAT prep", + "science book", "technology book", "programming book", + "art book", "photography book", "coffee table book", + "religion book", "spirituality", "Bible", "Christian book", + "travel guide", "travel book", "atlas", "map", + "comic book", "graphic novel", "manga", "anime book", + "audio book", "Kindle book", "ebook", "audible", + "book set", "box set", "series collection", + "language learning", "Spanish book", "French book", + "craft book", "home improvement book", "gardening book", + "health book", "diet book", "fitness book", + "political book", "philosophy", "true crime", + ], + }, + { + "id": "automotive", + "name": "Automotive", + "keywords": [ + "car seat cover", "car floor mat", "all weather floor mat", "cargo liner", + "car phone mount", "car charger", "car USB adapter", + "dash cam", "front dash cam", "rear dash cam", + "car battery", "jump starter", "battery charger", "portable jump starter", + "car cover", "car sun shade", "windshield cover", + "car wax", "car polish", "car cleaner", "car detailing kit", + "motor oil", "synthetic oil", "oil filter", "transmission fluid", + "car tool set", "emergency kit", "roadside kit", "tow strap", + "car jack", "jack stand", "ramp", "tire inflator", + "roof rack", "roof box", "cargo carrier", "bike rack", + "car interior accessory", "steering wheel cover", "gear shifter", + "car lighting", "LED headlight", "fog light", "interior light", + "car audio", "car stereo", "car speaker", "car subwoofer", + "tire", "all season tire", "winter 
tire", "summer tire", + "wheel cover", "hubcap", "lug nut", "wheel lock", + "motorcycle helmet", "motorcycle cover", "motorcycle oil", + "ATV accessory", "offroad accessory", + ], + }, + { + "id": "pet_supplies", + "name": "Pet Supplies", + "keywords": [ + "dog food", "dry dog food", "wet dog food", "puppy food", + "cat food", "dry cat food", "wet cat food", "kitten food", + "dog treat", "dog chew", "dog bone", "dental chew", + "cat treat", "catnip", "cat snack", + "dog bed", "dog crate", "dog kennel", "dog house", + "cat bed", "cat tree", "cat tower", "cat condo", + "dog leash", "dog collar", "dog harness", "dog muzzle", + "cat collar", "cat harness", "cat leash", + "dog toy", "chew toy", "fetch toy", "puzzle toy", + "cat toy", "cat wand", "laser toy", "cat tunnel", + "dog bowl", "cat bowl", "automatic feeder", "water fountain", + "litter box", "cat litter", "litter mat", "litter scoop", + "dog shampoo", "dog brush", "nail clipper", "pet wipes", + "dog waste bag", "poop bag", "poop bag dispenser", + "pet carrier", "pet travel bag", "car seat cover for dogs", + "aquarium", "fish tank", "aquarium filter", "fish food", + "bird cage", "bird food", "bird toy", "bird perch", + "hamster cage", "hamster wheel", "guinea pig cage", + "reptile tank", "reptile light", "reptile heat lamp", + ], + }, + { + "id": "office", + "name": "Office Products", + "keywords": [ + "office chair", "ergonomic chair", "executive chair", "task chair", + "standing desk", "adjustable desk", "desk riser", "standing desk converter", + "desk", "computer desk", "corner desk", "writing desk", + "bookcase", "bookshelf", "storage cabinet", "filing cabinet", + "desk lamp", "LED desk lamp", "architect lamp", "clamp lamp", + "whiteboard", "dry erase board", "bulletin board", "cork board", + "printer paper", "copy paper", "notebook", "legal pad", + "pen", "pencil", "marker", "highlighter", + "binder", "folder", "divider", "sheet protector", + "stapler", "tape dispenser", "paper clip", "rubber band", + 
"envelope", "shipping label", "mailing box", "packing tape", + "shredder", "paper shredder", "laminator", "binding machine", + "calculator", "scientific calculator", "printing calculator", + "business card holder", "name badge", "badge holder", + "fireproof safe", "security safe", "lock box", "cash box", + "classroom supplies", "teacher supplies", "bulletin board decor", + "planner", "calendar", "weekly planner", "daily planner", + "post it note", "index card", "sticky flag", "tab divider", + ], + }, + { + "id": "grocery", + "name": "Grocery & Gourmet Food", + "keywords": [ + "coffee beans", "ground coffee", "coffee pods", "Keurig pods", + "tea", "green tea", "black tea", "herbal tea", + "protein bar", "granola bar", "snack bar", "energy bar", + "chocolate", "dark chocolate", "milk chocolate", "chocolate bar", + "candy", "gummy candy", "hard candy", "lollipop", + "chips", "potato chips", "tortilla chips", "pita chips", + "crackers", "pretzels", "popcorn", "nuts", + "granola", "oatmeal", "cereal", "breakfast bar", + "pasta", "spaghetti", "penne", "linguine", + "rice", "white rice", "brown rice", "jasmine rice", + "olive oil", "vegetable oil", "coconut oil", "avocado oil", + "vinegar", "soy sauce", "hot sauce", "ketchup", + "salt", "pepper", "spice", "seasoning", + "honey", "maple syrup", "peanut butter", "jam", + "canned soup", "canned vegetable", "canned fruit", "canned fish", + "broth", "stock", "bone broth", "tomato sauce", + "baking mix", "flour", "sugar", "vanilla extract", + "jerky", "beef jerky", "turkey jerky", "dried fruit", + "trail mix", "seeds", "dried seaweed", "rice cake", + "water", "sparkling water", "flavored water", "coconut water", + "energy drink", "sports drink", "protein shake", "soda", + ], + }, + { + "id": "arts_crafts", + "name": "Arts, Crafts & Sewing", + "keywords": [ + "yarn", "knitting yarn", "crochet yarn", "embroidery floss", + "sewing machine", "serger", "sewing kit", "sewing scissors", + "fabric", "cotton fabric", "quilt fabric", 
"felt fabric", + "beads", "jewelry making", "beading kit", "bead organizer", + "paint", "acrylic paint", "oil paint", "watercolor paint", + "paint brush set", "canvas", "easel", "palette", + "drawing pencil", "colored pencil", "charcoal", "pastel", + "sketchbook", "drawing paper", "watercolor paper", + "clay", "pottery clay", "polymer clay", "air dry clay", + "embroidery kit", "cross stitch", "needlepoint", + "quilting kit", "quilting fabric", "quilting ruler", + "scrapbooking", "paper craft", "card making", "sticker", + "cricut", "cutting machine", "vinyl", "heat press", + "glue", "hot glue", "craft glue", "epoxy", + "ribbon", "lace", "button", "zipper", + "origami paper", "tissue paper", "wrapping paper", + "party decoration", "balloon", "banner", "confetti", + "wood burning", "leather craft", "resin mold", "candle making", + "soap making", "soap mold", "mica powder", "fragrance oil", + ], + }, + { + "id": "musical_instruments", + "name": "Musical Instruments", + "keywords": [ + "guitar", "acoustic guitar", "electric guitar", "classical guitar", + "guitar amplifier", "guitar pedal", "guitar strings", "guitar pick", + "bass guitar", "electric bass", "bass amplifier", + "keyboard piano", "digital piano", "MIDI keyboard", "synthesizer", + "drum set", "electronic drum", "snare drum", "cymbals", + "violin", "fiddle", "viola", "cello", "double bass", + "flute", "clarinet", "saxophone", "trumpet", "trombone", + "microphone", "studio mic", "dynamic mic", "condenser mic", + "studio monitor", "audio interface", "mixer", "headphone", + "DJ equipment", "DJ controller", "turntable", "DJ mixer", + "ukulele", "mandolin", "banjo", "harmonica", + "accordion", "bagpipes", "pan flute", + "sheet music", "music stand", "metronome", "tuner", + "instrument case", "guitar case", "violin case", + "guitar stand", "keyboard stand", "drum throne", + "amplifier", "PA system", "power amplifier", "speaker", + "recording equipment", "studio headphones", + "effects pedal", "wah pedal", 
"distortion", "reverb", + ], + }, + { + "id": "appliances", + "name": "Major Appliances", + "keywords": [ + "refrigerator", "French door refrigerator", "side by side refrigerator", + "washer", "washing machine", "front load washer", "top load washer", + "dryer", "electric dryer", "gas dryer", "washer dryer combo", + "dishwasher", "built in dishwasher", "portable dishwasher", + "range", "gas range", "electric range", "induction range", + "oven", "wall oven", "double oven", "convection oven", + "cooktop", "gas cooktop", "electric cooktop", "induction cooktop", + "microwave", "over the range microwave", "countertop microwave", + "freezer", "chest freezer", "upright freezer", + "wine cooler", "wine refrigerator", "beverage cooler", + "ice maker", "portable ice maker", "countertop ice maker", + "water dispenser", "water cooler", "bottleless water cooler", + "range hood", "vent hood", "downdraft vent", + "trash compactor", "disposal", "garbage disposal", + "humidifier", "dehumidifier", "air conditioner", + "portable AC", "window AC", "heater", "space heater", + ], + }, + { + "id": "luggage", + "name": "Luggage & Travel Gear", + "keywords": [ + "suitcase", "carry on luggage", "checked luggage", "hardside suitcase", + "travel backpack", "hiking backpack", "laptop backpack", + "duffel bag", "weekender bag", "gym bag", + "travel tote", "travel crossbody", "travel wallet", + "luggage set", "luggage 3 piece", "luggage 4 piece", + "luggage cover", "luggage tag", "luggage strap", + "travel pillow", "neck pillow", "eye mask", "travel blanket", + "travel organizer", "packing cube", "toiletry bag", "shoe bag", + "passport holder", "travel document organizer", "money belt", + "garment bag", "garment suitcase", "suit bag", + "kids luggage", "kids backpack", "kids travel bag", + "travel adapter", "power adapter", "voltage converter", + "travel scale", "luggage scale", "TSA lock", + "umbrella", "travel umbrella", "compact umbrella", + "travel towel", "quick dry towel", "microfiber 
towel", + ], + }, + { + "id": "movies_music", + "name": "Movies, Music & TV", + "keywords": [ + "Blu-ray", "4K Blu-ray", "Blu-ray movie", "DVD movie", + "Vinyl record", "LP record", "vinyl album", + "CD album", "music CD", "box set music", + "movie collection", "TV series DVD", "box set DVD", + "documentary DVD", "concert DVD", + "record player", "turntable", "vinyl player", + "CD player", "Blu-ray player", "DVD player", + "movie poster", "band poster", "music poster", + ], + }, +] + + +@register("amazon_us") +class AmazonUSScraper: + def __init__( + self, + api_key: str | None = None, + api_base: str = "http://localhost:8000", + batch_size: int = 100, + delay: float = 2.0, + scrape_only: bool = False, + output_dir: str | None = None, + max_pages_per_keyword: int = 25, + proxies: list[str] | None = None, + session_file: str | None = None, + ): + self.api_key = api_key + self.api_base = api_base.rstrip("/") + self.batch_size = batch_size + self.delay = delay + self.scrape_only = scrape_only + self.output_dir = output_dir or OUTPUT_DIR + self.max_pages_per_keyword = max_pages_per_keyword + self.proxies = proxies or [] + self._proxy_index = 0 + self.session_file = session_file + self.client = httpx.AsyncClient(timeout=30.0, headers=HEADERS, follow_redirects=True) + self.total_scraped = 0 + self.total_ingested = 0 + self.total_updated = 0 + self.total_failed = 0 + self.seen_asins: set[str] = set() + self._load_session() + self._ensure_output_dir() + + def _load_session(self) -> None: + if self.session_file and os.path.exists(self.session_file): + try: + with open(self.session_file, "r") as f: + data = json.load(f) + self.seen_asins = set(data.get("seen_asins", [])) + print(f"Loaded session with {len(self.seen_asins)} previously scraped ASINs") + except Exception: + pass + + def _save_session(self) -> None: + if self.session_file: + try: + with open(self.session_file, "w") as f: + json.dump({"seen_asins": list(self.seen_asins)}, f) + except Exception: + pass + + def 
_get_proxy(self) -> str | None: + all_proxies = self.proxies.copy() + scraperapi_key = os.environ.get("SCRAPERAPI_KEY") + if scraperapi_key: + all_proxies.append(f"http://scraperapi:{scraperapi_key}@proxy-server.scraperapi.com:8001") + if not all_proxies: + return None + proxy = all_proxies[self._proxy_index % len(all_proxies)] + self._proxy_index += 1 + return proxy + + async def _fetch_with_playwright(self, url: str) -> str | None: + try: + from playwright.async_api import async_playwright + except ImportError: + return None + try: + async with async_playwright() as p: + browser = await p.chromium.launch( + headless=True, + args=[ + "--no-sandbox", + "--disable-blink-features=AutomationControlled", + "--disable-dev-shm-usage", + ], + ) + context = await browser.new_context( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + viewport={"width": 1920, "height": 1080}, + locale="en-US", + ) + await context.add_init_script(""" + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); + Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); + """) + page = await context.new_page() + try: + await page.goto(url, wait_until="domcontentloaded", timeout=30000) + await page.wait_for_timeout(3000) + html = await page.content() + return html + except Exception as e: + print(f" Playwright error: {e}") + return None + finally: + await browser.close() + except Exception as e: + print(f" Playwright launch error: {e}") + return None + + def _ensure_output_dir(self) -> None: + os.makedirs(self.output_dir, exist_ok=True) + ts = time.strftime("%Y%m%d_%H%M%S") + self.products_outfile = os.path.join(self.output_dir, f"products_{ts}.jsonl") + + async def close(self) -> None: + await self.client.aclose() + + async def _get_with_retry( + self, url: str, params: dict[str, Any] | None = 
None, retries: int = MAX_RETRIES + ) -> str | None: + full_url = url + if params: + query = urllib.parse.urlencode(params) + full_url = f"{url}?{query}" + + scraperapi_key = os.environ.get("SCRAPERAPI_KEY") + if scraperapi_key: + for attempt in range(retries): + try: + resp = await self.client.get( + "http://api.scraperapi.com", + params={ + "api_key": scraperapi_key, + "url": full_url, + "render": "true", + }, + ) + if resp.status_code == 429: + await asyncio.sleep(RATE_LIMIT_WAIT * (attempt + 1)) + continue + resp.raise_for_status() + return resp.text + except httpx.HTTPStatusError as e: + if e.response.status_code == 429: + await asyncio.sleep(RATE_LIMIT_WAIT * (attempt + 1)) + continue + if "credits" in (e.response.text or "").lower(): + print(" ScraperAPI credits exhausted, falling back to Playwright...") + break + if attempt < retries - 1: + await asyncio.sleep((2 ** attempt) * self.delay) + else: + return None + except Exception: + if attempt < retries - 1: + await asyncio.sleep((2 ** attempt) * self.delay) + else: + return None + + print(" Using Playwright...") + return await self._fetch_with_playwright(full_url) + + def _write_products_to_file(self, products: list[dict[str, Any]]) -> None: + if not products: + return + with open(self.products_outfile, "a", encoding="utf-8") as f: + for product in products: + f.write(json.dumps(product, ensure_ascii=False) + "\n") + + def _parse_price(self, value: str | None) -> float: + if not value: + return 0.0 + cleaned = value.replace("$", "").replace(",", "").strip() + match = re.search(r"\d+(?:\.\d+)?", cleaned) + if not match: + return 0.0 + try: + return float(match.group(0)) + except ValueError: + return 0.0 + + def _parse_int(self, value: str | None) -> int: + if not value: + return 0 + digits = re.sub(r"[^\d]", "", value) + return int(digits) if digits else 0 + + def _extract_brand(self, title: str) -> str: + if not title: + return "" + first_token = title.split()[0].strip("()[],:") + if not first_token: + 
return "" + if any(char.isdigit() for char in first_token): + return "" + return first_token[:80] + + def transform_product( + self, raw: dict[str, Any], category_name: str, keyword: str + ) -> dict[str, Any] | None: + try: + asin = str(raw.get("asin", "") or raw.get("sku", "")).strip() + if not asin: + return None + + title = (raw.get("title") or "").strip() + if not title: + return None + + url = raw.get("url") or f"{BASE_URL}/dp/{asin}" + if not url.startswith("http"): + url = urljoin(BASE_URL, url) + + price = self._parse_price(raw.get("price")) + original_price = self._parse_price(raw.get("original_price")) or price + review_count = self._parse_int(raw.get("review_count")) + + rating = 0.0 + rating_text = raw.get("rating") or "" + rating_match = re.search(r"(\d+(?:\.\d+)?)", rating_text) + if rating_match: + rating = float(rating_match.group(1)) + + category_path = [category_name] + if keyword and keyword.lower() != category_name.lower(): + category_path.append(keyword) + + is_prime = bool(raw.get("is_prime", False)) + + brand = raw.get("brand") + if not brand or not brand.strip(): + brand = self._extract_brand(title) + + return { + "sku": asin, + "merchant_id": MERCHANT_ID, + "title": title, + "description": raw.get("description") or "", + "price": price, + "currency": "USD", + "url": url, + "image_url": raw.get("image_url") or "", + "category": category_name, + "category_path": category_path, + "brand": brand, + "is_active": True, + "metadata": { + "keyword": keyword, + "original_price": original_price, + "rating": rating, + "review_count": review_count, + "is_sponsored": bool(raw.get("is_sponsored", False)), + "is_prime": is_prime, + "country_code": "US", + "region": "us", + }, + } + except Exception: + return None + + def parse_search_results( + self, html: str, category_name: str, keyword: str + ) -> tuple[list[dict[str, Any]], bool]: + soup = BeautifulSoup(html, "html.parser") + products: list[dict[str, Any]] = [] + + for card in 
soup.select('[data-component-type="s-search-result"][data-asin]'): + asin = (card.get("data-asin") or "").strip() + if not asin: + continue + + title_el = card.select_one("h2 span") + if not title_el: + continue + + link_el = card.select_one("h2 a") + price_el = card.select_one(".a-price .a-offscreen") + original_price_el = card.select_one(".a-text-price .a-offscreen") + image_el = card.select_one("img.s-image") + rating_el = card.select_one(".a-icon-alt") + review_el = card.select_one('a[href*="#customerReviews"] span, a[href*="#customerReviews"]:not(span)') + sponsored_el = card.select_one('[aria-label="Sponsored"], .puis-sponsored-label-text') + prime_el = card.select_one('.a-icon-prime, [aria-label*="Prime"], .prime-badge') + + raw_product = { + "asin": asin, + "title": title_el.get_text(" ", strip=True), + "url": link_el.get("href", "") if link_el else "", + "price": price_el.get_text(strip=True) if price_el else "", + "original_price": ( + original_price_el.get_text(strip=True) if original_price_el else "" + ), + "image_url": image_el.get("src", "") if image_el else "", + "rating": rating_el.get_text(" ", strip=True) if rating_el else "", + "review_count": review_el.get_text(" ", strip=True) if review_el else "", + "is_sponsored": sponsored_el is not None, + "is_prime": prime_el is not None, + } + + transformed = self.transform_product(raw_product, category_name, keyword) + if transformed: + products.append(transformed) + + has_next_page = soup.select_one(".s-pagination-next:not(.s-pagination-disabled)") is not None + return products, has_next_page + + async def ingest_batch(self, products: list[dict[str, Any]]) -> tuple[int, int, int]: + if not products: + return 0, 0, 0 + + if self.scrape_only: + self._write_products_to_file(products) + return len(products), 0, 0 + + url = f"{self.api_base}/v1/ingest/products" + headers = {"Authorization": f"Bearer {self.api_key}"} + payload = {"source": SOURCE, "products": products} + + try: + resp = await 
self.client.post(url, json=payload, headers=headers) + resp.raise_for_status() + result = resp.json() + return ( + result.get("rows_inserted", 0), + result.get("rows_updated", 0), + result.get("rows_failed", 0), + ) + except Exception as e: + print(f" Ingestion error: {e}") + return 0, 0, len(products) + + async def scrape_keyword( + self, category: dict[str, Any], keyword: str + ) -> dict[str, int]: + category_name = category["name"] + print(f"\n[{category_name}] keyword='{keyword}'") + counts = {"scraped": 0, "ingested": 0, "updated": 0, "failed": 0} + batch: list[dict[str, Any]] = [] + + for page in range(1, self.max_pages_per_keyword + 1): + params = {"k": keyword, "page": page} + html = await self._get_with_retry(f"{BASE_URL}/s", params=params) + if not html: + print(f" Page {page}: request failed") + break + + parsed_products, has_next_page = self.parse_search_results(html, category_name, keyword) + + fresh_products = [] + for product in parsed_products: + if product["sku"] in self.seen_asins: + continue + self.seen_asins.add(product["sku"]) + fresh_products.append(product) + + if not fresh_products: + print(f" Page {page}: no new products") + if not has_next_page: + break + await asyncio.sleep(self.delay) + continue + + for product in fresh_products: + batch.append(product) + counts["scraped"] += 1 + + if len(batch) >= self.batch_size: + i, u, f = await self.ingest_batch(batch) + counts["ingested"] += i + counts["updated"] += u + counts["failed"] += f + self.total_ingested += i + self.total_updated += u + self.total_failed += f + batch = [] + await asyncio.sleep(self.delay) + + print(f" Page {page}: parsed={len(parsed_products)} new={len(fresh_products)} total={counts['scraped']}") + + if page % 5 == 0: + self._save_session() + + if not has_next_page: + break + + await asyncio.sleep(self.delay) + + if batch: + i, u, f = await self.ingest_batch(batch) + counts["ingested"] += i + counts["updated"] += u + counts["failed"] += f + self.total_ingested += i + 
self.total_updated += u + self.total_failed += f + + self.total_scraped += counts["scraped"] + self._save_session() + return counts + + async def run(self) -> dict[str, Any]: + mode = "scrape only" if self.scrape_only else f"API: {self.api_base}" + print("Amazon US Scraper starting...") + print(f"Mode: {mode}") + print(f"Batch size: {self.batch_size}, Delay: {self.delay}s") + print(f"Max pages per keyword: {self.max_pages_per_keyword}") + print(f"Output: {self.products_outfile}") + + total_keywords = sum(len(c["keywords"]) for c in CATEGORIES) + print(f"Categories: {len(CATEGORIES)}, Keywords: {total_keywords}") + print(f"Target: 500,000+ products") + + start = time.time() + + for category in CATEGORIES: + for keyword in category["keywords"]: + counts = await self.scrape_keyword(category, keyword) + print(f" [{category['name']} / {keyword}] Done: {counts}") + await asyncio.sleep(self.delay) + + elapsed = time.time() - start + self._save_session() + summary = { + "elapsed_seconds": round(elapsed, 1), + "total_scraped": self.total_scraped, + "total_ingested": self.total_ingested, + "total_updated": self.total_updated, + "total_failed": self.total_failed, + "output_file": self.products_outfile, + "unique_asins": len(self.seen_asins), + } + print(f"\nScraper complete: {summary}") + return summary + + +async def main() -> None: + parser = argparse.ArgumentParser(description="Amazon US Scraper") + parser.add_argument("--api-key", help="BuyWhere API key") + parser.add_argument("--api-base", default="http://localhost:8000", help="BuyWhere API base URL") + parser.add_argument("--batch-size", type=int, default=100) + parser.add_argument("--delay", type=float, default=2.0, help="Delay between requests/batches (seconds)") + parser.add_argument("--scrape-only", action="store_true", help="Save to JSONL without ingesting") + parser.add_argument("--output-dir", help="Override output directory") + parser.add_argument("--max-pages-per-keyword", type=int, default=25) + 
parser.add_argument("--pages", type=int, help="Shorthand for --max-pages-per-keyword") + parser.add_argument("--session-file", help="Path to session file for resume support") + parser.add_argument("--proxies", nargs="*", help="List of proxy URLs to rotate through") + args = parser.parse_args() + + if args.pages: + args.max_pages_per_keyword = args.pages + + if not args.scrape_only and not args.api_key: + parser.error("--api-key is required unless --scrape-only is used") + + scraper = AmazonUSScraper( + api_key=args.api_key, + api_base=args.api_base, + batch_size=args.batch_size, + delay=args.delay, + scrape_only=args.scrape_only, + output_dir=args.output_dir, + max_pages_per_keyword=args.max_pages_per_keyword, + proxies=args.proxies, + session_file=args.session_file, + ) + + try: + await scraper.run() + finally: + await scraper.close() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/scripts/check-mcp-uptime.sh b/scripts/check-mcp-uptime.sh new file mode 100644 index 000000000..f8e6b4eb8 --- /dev/null +++ b/scripts/check-mcp-uptime.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +# scripts/check-mcp-uptime.sh — Poll MCP tools/list and validate tool availability (BUY-10855) +# Intended to run as a cron job every 60s. 
+# Usage: ./scripts/check-mcp-uptime.sh +# MCP_URL — MCP server URL (default: https://api.buywhere.ai/mcp) +# EXPECTED_TOOLS — comma-separated list of expected tool names (default: search_products,get_product,compare_products,get_deals,list_categories,find_best_price) +# EXPECTED_COUNT — expected number of tools (default: 6) +# LOG_DIR — directory for uptime log (default: /var/log/buywhere) +# LOG_FILE — full path to log file (overrides LOG_DIR) +set -euo pipefail + +MCP_URL="${MCP_URL:-https://api.buywhere.ai/mcp}" +EXPECTED_TOOLS_STR="${EXPECTED_TOOLS:-search_products,get_product,compare_products,get_deals,list_categories,find_best_price}" +EXPECTED_COUNT="${EXPECTED_COUNT:-6}" +LOG_FILE="${LOG_FILE:-${LOG_DIR:-/var/log/buywhere}/mcp-uptime.ndjson}" +TS=$(date -u +%Y-%m-%dT%H:%M:%SZ) + +mkdir -p "$(dirname "$LOG_FILE")" + +IFS=',' read -ra EXPECTED_TOOLS <<< "$EXPECTED_TOOLS_STR" + +START_NS=$(date +%s%N) +RESPONSE_FILE=$(mktemp) +HTTP_CODE=$(curl -s -o "$RESPONSE_FILE" -w "%{http_code}" --max-time 10 \ + -X POST "$MCP_URL" \ + -H "Content-Type: application/json" \ + -d '{"jsonrpc":"2.0","method":"tools/list","id":1}' 2>/dev/null || echo "000") +END_NS=$(date +%s%N) +LATENCY_MS=$(( (END_NS - START_NS) / 1000000 )) + +ALERT="" +FOUND_TOOLS="" +MISSING_TOOLS="" +TOOL_COUNT=0 + +if [ "$HTTP_CODE" = "200" ]; then + TOOL_NAMES=$(python3 -c " +import json, sys +try: + data = json.load(open('$RESPONSE_FILE')) + tools = data.get('result', {}).get('tools', []) + names = [t['name'] for t in tools] + print('TOOL_COUNT:' + str(len(names))) + print('TOOLS:' + ','.join(names)) +except Exception as e: + print('ERROR:' + str(e)) +" 2>/dev/null || echo "PARSE_ERROR") + + if echo "$TOOL_NAMES" | grep -q "^TOOL_COUNT:"; then + TOOL_COUNT=$(echo "$TOOL_NAMES" | grep "^TOOL_COUNT:" | sed 's/^TOOL_COUNT://') + FOUND_TOOLS=$(echo "$TOOL_NAMES" | grep "^TOOLS:" | sed 's/^TOOLS://') + + if [ "$TOOL_COUNT" -lt "$EXPECTED_COUNT" ]; then + ALERT="tool_count_mismatch" + fi + + MISSING="" + 
for tool in "${EXPECTED_TOOLS[@]}"; do + if ! echo ",$FOUND_TOOLS," | grep -q ",$tool,"; then + MISSING="${MISSING},${tool}" + fi + done + MISSING_TOOLS="${MISSING#,}" + + if [ -n "$MISSING_TOOLS" ]; then + if [ -n "$ALERT" ]; then + ALERT="${ALERT}+missing_tools" + else + ALERT="missing_tools" + fi + fi + + if [ -z "$ALERT" ]; then + RESULT="up" + else + RESULT="degraded" + fi + else + RESULT="degraded" + ALERT="parse_error" + FOUND_TOOLS="" + TOOL_COUNT=0 + fi +else + RESULT="down" + ALERT="http_${HTTP_CODE}" +fi + +rm -f "$RESPONSE_FILE" + +echo "{\"ts\":\"$TS\",\"result\":\"$RESULT\",\"http_code\":$((10#$HTTP_CODE)),\"latency_ms\":$LATENCY_MS,\"tool_count\":$TOOL_COUNT,\"expected_count\":$EXPECTED_COUNT,\"found_tools\":\"$FOUND_TOOLS\",\"missing_tools\":\"$MISSING_TOOLS\",\"alert\":\"$ALERT\"}" >> "$LOG_FILE" + +tail -n 129600 "$LOG_FILE" > "${LOG_FILE}.tmp" && mv "${LOG_FILE}.tmp" "$LOG_FILE" + +ALERT_MSG="" +if [ -n "$ALERT" ]; then + ALERT_MSG=" alert=$ALERT" +fi +echo "[$TS] MCP=$RESULT http=$HTTP_CODE latency=${LATENCY_MS}ms tools=${TOOL_COUNT}/${EXPECTED_COUNT}${ALERT_MSG}" diff --git a/scripts/mcp-uptime-dashboard.html b/scripts/mcp-uptime-dashboard.html new file mode 100644 index 000000000..74cf3a734 --- /dev/null +++ b/scripts/mcp-uptime-dashboard.html @@ -0,0 +1,187 @@ + + + + + +BuyWhere MCP Uptime Dashboard + + + +

MCP Uptime Monitor

+

Loading...

+ +
+ +
+
30-Day Uptime
+
5-Min Uptime
+
p95 Latency
+
Error Rate (30d)
+
+ +
Current Status
+
+ +
+
+
Hourly Uptime (24h)
+
+
+
+
p95 Latency Trend (24h)
+
+
+
+ +
Hourly Breakdown
+
HourUptimeChecksAvg Latency
+ + + + + + diff --git a/scripts/report-mcp-uptime.sh b/scripts/report-mcp-uptime.sh new file mode 100644 index 000000000..e99d3664d --- /dev/null +++ b/scripts/report-mcp-uptime.sh @@ -0,0 +1,188 @@ +#!/usr/bin/env bash +# scripts/report-mcp-uptime.sh — Generate uptime + tool availability metrics (BUY-10855) +# Usage: ./scripts/report-mcp-uptime.sh [output_dir] +# LOG_FILE — path to the NDJSON log (default: /var/log/buywhere/mcp-uptime.ndjson) +# OUTPUT_DIR — directory for generated report files (default: /var/www/mcp-uptime or $1) +# MCP_URL — MCP server URL (default: https://api.buywhere.ai/mcp) +set -euo pipefail + +LOG_FILE="${LOG_FILE:-${LOG_DIR:-/var/log/buywhere}/mcp-uptime.ndjson}" +OUTPUT_DIR="${1:-${OUTPUT_DIR:-/var/www/mcp-uptime}}" +MCP_URL="${MCP_URL:-https://api.buywhere.ai/mcp}" +TS=$(date -u +%Y-%m-%dT%H:%M:%SZ) + +if [ ! -f "$LOG_FILE" ]; then + echo "ERROR: log file not found: $LOG_FILE" >&2 + exit 1 +fi + +mkdir -p "$OUTPUT_DIR" + +python3 - "$LOG_FILE" "$OUTPUT_DIR" "$MCP_URL" <<'PYEOF' +import json, os, sys, statistics +from datetime import datetime, timezone, timedelta +from collections import Counter + +log_file = sys.argv[1] +output_dir = sys.argv[2] +mcp_url = sys.argv[3] + +with open(log_file) as f: + lines = [json.loads(l) for l in f if l.strip()] + +if not lines: + print("ERROR: no data in log file") + sys.exit(1) + +now = datetime.now(timezone.utc) + +five_min_ago = now - timedelta(minutes=5) +thirty_days_ago = now - timedelta(days=30) + +window_5m = [e for e in lines if datetime.fromisoformat(e["ts"]) >= five_min_ago] +window_30d = [e for e in lines if datetime.fromisoformat(e["ts"]) >= thirty_days_ago] + +def calc_metrics(entries, label): + total = len(entries) + up = sum(1 for e in entries if e["result"] == "up") + down = sum(1 for e in entries if e["result"] == "down") + degraded = sum(1 for e in entries if e["result"] == "degraded") + uptime_pct = round((up / total * 100), 4) if total > 0 else 0 + error_rate = round((down / total 
* 100), 4) if total > 0 else 0 + + latencies = sorted([e["latency_ms"] for e in entries]) + p95 = latencies[int(len(latencies) * 0.95)] if latencies else 0 + p99 = latencies[int(len(latencies) * 0.99)] if latencies else 0 + avg_latency = round(statistics.mean(latencies), 1) if latencies else 0 + max_latency = max(latencies) if latencies else 0 + + tool_counts = [e.get("tool_count", 0) for e in entries if "tool_count" in e] + min_tool_count = min(tool_counts) if tool_counts else 0 + expected_count = entries[-1].get("expected_count", 6) if entries else 6 + + missing_tools_all = [] + for e in entries: + mt = e.get("missing_tools", "") + if mt: + missing_tools_all.extend(mt.split(",")) + missing_tool_counts = dict(Counter(missing_tools_all)) + + return { + "label": label, + "total_checks": total, + "up": up, + "down": down, + "degraded": degraded, + "uptime_pct": uptime_pct, + "error_rate": error_rate, + "p95_latency_ms": p95, + "p99_latency_ms": p99, + "avg_latency_ms": avg_latency, + "max_latency_ms": max_latency, + "min_tool_count": min_tool_count, + "expected_tool_count": expected_count, + "missing_tool_frequency": missing_tool_counts, + "window_entries": total + } + +metrics_5m = calc_metrics(window_5m, "5m") +metrics_30d = calc_metrics(window_30d, "30d") + +latest = lines[-1] + +alerts = [] + +if metrics_5m["uptime_pct"] < 99.9 and metrics_5m["total_checks"] >= 3: + alerts.append({ + "level": "CRITICAL", + "type": "uptime", + "message": f"Uptime {metrics_5m['uptime_pct']}% below 99.9% threshold in last 5 minutes", + "window": "5m", + "threshold": 99.9, + "actual": metrics_5m["uptime_pct"] + }) + +if latest.get("alert") and "missing_tools" in str(latest.get("alert", "")): + alerts.append({ + "level": "CRITICAL", + "type": "missing_tools", + "message": f"Tools missing from MCP: {latest.get('missing_tools', 'unknown')}", + "tools_missing": latest.get("missing_tools", ""), + "tool_count": latest.get("tool_count", 0), + "expected_count": latest.get("expected_count", 
6) + }) + +if latest.get("alert") and "tool_count_mismatch" in str(latest.get("alert", "")): + alerts.append({ + "level": "WARNING", + "type": "tool_count", + "message": f"Tool count {latest.get('tool_count', 0)} below expected {latest.get('expected_count', 6)}", + "tool_count": latest.get("tool_count", 0), + "expected_count": latest.get("expected_count", 6) + }) + +hourly_trend = [] +for h in range(23, -1, -1): + hour_start = now - timedelta(hours=h+1) + hour_end = now - timedelta(hours=h) + hour_entries = [e for e in lines if hour_start <= datetime.fromisoformat(e["ts"]) < hour_end] + if hour_entries: + up = sum(1 for e in hour_entries if e["result"] == "up") + total = len(hour_entries) + tc = [e.get("tool_count", 0) for e in hour_entries] + min_tc = min(tc) if tc else 0 + hourly_trend.append({ + "hour": hour_start.strftime("%Y-%m-%dT%H:00:00Z"), + "uptime_pct": round(up / total * 100, 2) if total > 0 else 0, + "checks": total, + "min_tool_count": min_tc, + "avg_latency_ms": round(statistics.mean([e["latency_ms"] for e in hour_entries]), 1) + }) + +latency_trend = [] +for m in range(287, -1, -1): + bucket_start = now - timedelta(minutes=m+5) + bucket_end = now - timedelta(minutes=m) + bucket = [e for e in lines if bucket_start <= datetime.fromisoformat(e["ts"]) < bucket_end] + if bucket: + latencies = [e["latency_ms"] for e in bucket] + latencies.sort() + p95 = latencies[int(len(latencies) * 0.95)] if latencies else 0 + tc = [e.get("tool_count", 0) for e in bucket] + latency_trend.append({ + "ts": bucket_start.strftime("%Y-%m-%dT%H:%M:00Z"), + "p95_ms": p95, + "avg_ms": round(statistics.mean(latencies), 1), + "samples": len(bucket), + "min_tool_count": min(tc) if tc else 0 + }) + +status = "alert" if alerts else "healthy" + +report = { + "generated_at": now.strftime("%Y-%m-%dT%H:%M:%SZ"), + "mcp_url": mcp_url, + "latest": latest, + "metrics_5m": metrics_5m, + "metrics_30d": metrics_30d, + "hourly_trend": hourly_trend, + "latency_trend": latency_trend, + "alerts": 
alerts, + "status": status +} + +report_path = os.path.join(output_dir, "uptime.json") +with open(report_path, "w") as f: + json.dump(report, f, indent=2) + +print(f"Report written to {report_path}") +print(f" 30d uptime: {metrics_30d['uptime_pct']}% | 5m uptime: {metrics_5m['uptime_pct']}%") +print(f" p95: {metrics_5m['p95_latency_ms']}ms | p99: {metrics_5m['p99_latency_ms']}ms | avg: {metrics_5m['avg_latency_ms']}ms") +print(f" Min tool count (5m): {metrics_5m['min_tool_count']}/{metrics_5m['expected_tool_count']}") +if latest.get("missing_tools"): + print(f" MISSING TOOLS: {latest['missing_tools']}") +if alerts: + for a in alerts: + print(f" ALERT [{a['level']}]: {a['message']}") +print(f" Status: {report['status']}") +PYEOF diff --git a/scripts/setup-mcp-uptime-monitoring.sh b/scripts/setup-mcp-uptime-monitoring.sh new file mode 100644 index 000000000..c94a39d0c --- /dev/null +++ b/scripts/setup-mcp-uptime-monitoring.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +set -euo pipefail + +MCP_URL="${1:-https://api.buywhere.ai/mcp}" +SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd)" + +# Determine if we can write to system directories (root or sudo NOPASSWD) +USE_SYSTEM=false +if [ "$(id -u)" = 0 ] 2>/dev/null; then + USE_SYSTEM=true +elif command -v sudo &>/dev/null; then + if sudo -n true 2>/dev/null || sudo true 2>/dev/null; then + USE_SYSTEM=true + fi +fi + +if [ "$USE_SYSTEM" = true ]; then + BIN_DIR="/usr/local/bin" + LOG_DIR="/var/log/buywhere" + WEB_ROOT="/var/www/mcp-uptime" +else + BIN_DIR="${HOME:-/tmp}/.local/bin" + LOG_DIR="${HOME:-/tmp}/mcp-uptime/logs" + WEB_ROOT="${HOME:-/tmp}/mcp-uptime/www" +fi + +echo "=== Installing MCP uptime monitoring ===" +echo "MCP URL: $MCP_URL" +echo "Web root: $WEB_ROOT" +echo "Log dir: $LOG_DIR" +echo "Scripts: $SCRIPTS_DIR" +echo "System mod: $USE_SYSTEM" + +if [ "$USE_SYSTEM" = true ]; then + [ "$(id -u)" = 0 ] && SUDO="" || SUDO="sudo" + $SUDO mkdir -p "$LOG_DIR" "$WEB_ROOT" + $SUDO cp "$SCRIPTS_DIR/check-mcp-uptime.sh" 
"$BIN_DIR/check-mcp-uptime.sh" + $SUDO cp "$SCRIPTS_DIR/report-mcp-uptime.sh" "$BIN_DIR/report-mcp-uptime.sh" + $SUDO cp "$SCRIPTS_DIR/mcp-uptime-dashboard.html" "$WEB_ROOT/index.html" + $SUDO chmod +x "$BIN_DIR/check-mcp-uptime.sh" "$BIN_DIR/report-mcp-uptime.sh" + + CRON_FILE="/etc/cron.d/buywhere-mcp-uptime" + printf '%s\n' \ + "# MCP uptime check (BUY-10855)" \ + "* * * * * root MCP_URL=${MCP_URL} ${BIN_DIR}/check-mcp-uptime.sh >> ${LOG_DIR}/check.log 2>&1" \ + "" \ + "# Generate dashboard report" \ + "*/5 * * * * root MCP_URL=${MCP_URL} ${BIN_DIR}/report-mcp-uptime.sh ${WEB_ROOT} >> ${LOG_DIR}/report.log 2>&1" \ + | $SUDO tee "$CRON_FILE" > /dev/null + $SUDO chmod 644 "$CRON_FILE" + + if command -v systemctl &>/dev/null; then + $SUDO systemctl restart cron 2>/dev/null || true + fi + + NGINX_CONF="/etc/nginx/sites-enabled/mcp-uptime.conf" + if [ ! -f "$NGINX_CONF" ]; then + printf '%s\n' \ + "# MCP uptime dashboard (BUY-10855)" \ + "location /mcp-uptime {" \ + " alias ${WEB_ROOT};" \ + " index index.html;" \ + " add_header Cache-Control \"no-cache, max-age=0\";" \ + " add_header X-Frame-Options \"SAMEORIGIN\";" \ + "}" \ + | $SUDO tee "$NGINX_CONF" > /dev/null + echo "nginx config written to $NGINX_CONF" + else + echo "nginx config already exists at $NGINX_CONF — skipping" + fi +else + mkdir -p "$LOG_DIR" "$WEB_ROOT" "$BIN_DIR" + cp "$SCRIPTS_DIR/check-mcp-uptime.sh" "$BIN_DIR/check-mcp-uptime.sh" + cp "$SCRIPTS_DIR/report-mcp-uptime.sh" "$BIN_DIR/report-mcp-uptime.sh" + cp "$SCRIPTS_DIR/mcp-uptime-dashboard.html" "$WEB_ROOT/index.html" + chmod +x "$BIN_DIR/check-mcp-uptime.sh" "$BIN_DIR/report-mcp-uptime.sh" + + (crontab -l 2>/dev/null || true; echo "* * * * * MCP_URL=${MCP_URL} LOG_DIR=${LOG_DIR} ${BIN_DIR}/check-mcp-uptime.sh >> ${LOG_DIR}/check.log 2>&1") | crontab - + (crontab -l 2>/dev/null || true; echo "*/5 * * * * MCP_URL=${MCP_URL} LOG_DIR=${LOG_DIR} ${BIN_DIR}/report-mcp-uptime.sh ${WEB_ROOT} >> ${LOG_DIR}/report.log 2>&1") | crontab - + + echo "NOTE: nginx config not installed — run manually as root:" + 
echo " cat > /etc/nginx/sites-enabled/mcp-uptime.conf <<'EOF'" + echo " location /mcp-uptime {" + echo " alias ${WEB_ROOT};" + echo " index index.html;" + echo " add_header Cache-Control \"no-cache, max-age=0\";" + echo " add_header X-Frame-Options \"SAMEORIGIN\";" + echo " }" + echo " EOF" + echo " nginx -t && nginx -s reload" +fi + +if [ -f "$BIN_DIR/report-mcp-uptime.sh" ]; then + if [ "$USE_SYSTEM" = true ]; then + $SUDO "$BIN_DIR/report-mcp-uptime.sh" "$WEB_ROOT" || echo "WARNING: initial report failed" + else + "$BIN_DIR/report-mcp-uptime.sh" "$WEB_ROOT" || echo "WARNING: initial report failed" + fi +fi + +echo "" +echo "=== Installation complete ===" +echo "Dashboard: ${WEB_ROOT}/index.html" +echo "Log file: ${LOG_DIR}/mcp-uptime.ndjson" +echo "Report: ${WEB_ROOT}/uptime.json" +echo "Bin dir: $BIN_DIR" +echo "System mod: $USE_SYSTEM" diff --git a/src/app/sitemap.ts b/src/app/sitemap.ts deleted file mode 100644 index 893b7583a..000000000 --- a/src/app/sitemap.ts +++ /dev/null @@ -1,28 +0,0 @@ -import { MetadataRoute } from "next"; - -export default function sitemap(): MetadataRoute.Sitemap { - const base = "https://buywhere.ai"; - const now = new Date(); - - const routes = [ - { url: "/", priority: 1.0, changeFrequency: "weekly" as const }, - { url: "/quickstart/", priority: 0.9, changeFrequency: "weekly" as const }, - { url: "/integrate/", priority: 0.9, changeFrequency: "weekly" as const }, - { url: "/api-keys", priority: 0.9, changeFrequency: "monthly" as const }, - { url: "/merchants/", priority: 0.9, changeFrequency: "weekly" as const }, - { url: "/partners/", priority: 0.8, changeFrequency: "monthly" as const }, - { url: "/use-cases/", priority: 0.8, changeFrequency: "monthly" as const }, - { url: "/pricing/", priority: 0.8, changeFrequency: "monthly" as const }, - { url: "/about/", priority: 0.6, changeFrequency: "monthly" as const }, - { url: "/contact/", priority: 0.5, changeFrequency: "monthly" as const }, - { url: "/privacy/", priority: 0.3, 
changeFrequency: "yearly" as const }, - { url: "/terms/", priority: 0.3, changeFrequency: "yearly" as const }, - ]; - - return routes.map(({ url, priority, changeFrequency }) => ({ - url: `${base}${url}`, - lastModified: now, - changeFrequency, - priority, - })); -} diff --git a/src/lib/sitemaps.ts b/src/lib/sitemaps.ts index 05733714c..694e160d1 100644 --- a/src/lib/sitemaps.ts +++ b/src/lib/sitemaps.ts @@ -2,6 +2,18 @@ import { getAllBlogPosts } from "@/lib/blog"; import { PRODUCT_TAXONOMY, US_CATEGORY_META } from "@/lib/taxonomy"; import { getUSProducts, type USProductForSitemap } from "@/lib/us-products"; import { getSGProducts, type SGProductForSitemap } from "@/lib/sg-products"; +import fs from "node:fs"; + +function safeGetBlogPosts() { + try { + if (fs.existsSync(process.cwd() + "/content/blog")) { + return getAllBlogPosts(); + } + } catch { + // blog directory not available at runtime + } + return []; +} export const SITEMAP_BASE_URL = "https://buywhere.ai"; export const MAX_URLS_PER_SITEMAP = 50_000; @@ -55,6 +67,8 @@ const STATIC_SITEMAP_ROUTES = [ { path: "/contact/", priority: 0.5, changeFrequency: "monthly" as const }, { path: "/best-gaming-laptops-us/", priority: 0.9, changeFrequency: "weekly" as const }, { path: "/iphone-16-price-singapore/", priority: 0.9, changeFrequency: "weekly" as const }, + { path: "/laptop-singapore/", priority: 0.9, changeFrequency: "weekly" as const }, + { path: "/air-purifier-singapore/", priority: 0.9, changeFrequency: "weekly" as const }, { path: "/best-robot-vacuums-2026/", priority: 0.9, changeFrequency: "weekly" as const }, { path: "/privacy/", priority: 0.3, changeFrequency: "yearly" as const }, { path: "/terms/", priority: 0.3, changeFrequency: "yearly" as const }, @@ -122,7 +136,7 @@ export function renderSitemapIndex(urls: Array<{ url: string; lastModified: Date export function getStaticSitemapEntries(): SitemapUrlEntry[] { const now = new Date(); - const blogPosts = getAllBlogPosts(); + const blogPosts = 
safeGetBlogPosts(); return [ ...STATIC_SITEMAP_ROUTES.map(({ path, priority, changeFrequency }) => ({