diff --git a/.github/workflows/inject-posthog-vm.yml b/.github/workflows/inject-posthog-vm.yml new file mode 100644 index 000000000..9b8038aee --- /dev/null +++ b/.github/workflows/inject-posthog-vm.yml @@ -0,0 +1,101 @@ +name: Inject PostHog Key to Production VM + +on: + workflow_dispatch: + +permissions: + contents: read + +jobs: + inject-posthog-vm: + name: Inject POSTHOG_API_KEY to Production VM + runs-on: ubuntu-latest + environment: production + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up SSH agent + uses: webfactory/ssh-agent@v0.9.0 + with: + ssh-private-key: ${{ secrets.PRODUCTION_DEPLOY_SSH_KEY }} + + - name: Trust production host + run: | + mkdir -p ~/.ssh + ssh-keyscan -p "${{ secrets.PRODUCTION_DEPLOY_PORT || 22 }}" -H "${{ secrets.PRODUCTION_DEPLOY_HOST }}" >> ~/.ssh/known_hosts + + - name: Detect service and inject POSTHOG_API_KEY + env: + DEPLOY_HOST: ${{ secrets.PRODUCTION_DEPLOY_HOST }} + DEPLOY_PORT: ${{ secrets.PRODUCTION_DEPLOY_PORT || 22 }} + DEPLOY_USER: ${{ secrets.PRODUCTION_DEPLOY_USER }} + POSTHOG_KEY: ${{ secrets.POSTHOG_API_KEY_PRODUCTION }} + run: | + ssh -p "$DEPLOY_PORT" "$DEPLOY_USER@$DEPLOY_HOST" "POSTHOG_KEY='$POSTHOG_KEY' bash -s" << 'EOF' + set -euo pipefail + + echo "=== Detecting service management ===" + + POSTHOG_KEY="${POSTHOG_KEY}" + + # Check for systemd service + if systemctl list-unit-files 2>/dev/null | grep -qiE 'buywhere|fastapi|uvicorn'; then + echo "Detected: systemd-managed" + SVC=$(systemctl list-units --type=service --all 2>/dev/null | grep -iE 'buywhere|fastapi|uvicorn' | awk '{print $1}' | head -1) + SVC="${SVC:-buywhere-api.service}" + + # Update env file — systemd drop-in needs a [Service] section and Environment= directive + ENV_FILE="/etc/systemd/system/${SVC}.d/override.conf" + mkdir -p "$(dirname "$ENV_FILE")" + if ! grep -q "^Environment=POSTHOG_API_KEY=" "$ENV_FILE" 2>/dev/null; then + printf '[Service]\nEnvironment=POSTHOG_API_KEY=%s\n' "$POSTHOG_KEY" >> "$ENV_FILE" + fi + systemctl daemon-reload + systemctl restart "$SVC" || true + echo "Done: systemd updated and service restarted" + + # Check for PM2 + elif command -v pm2 &>/dev/null && pm2 list 2>/dev/null | grep -qiE 'buywhere|api|fastapi'; then + echo "Detected: PM2-managed" + PM2_NAME=$(pm2 list 2>/dev/null | grep -iE 'buywhere|api|fastapi' | awk '{print $2}' | head -1) + if [ -n "$PM2_NAME" ]; then + export POSTHOG_API_KEY="$POSTHOG_KEY" + pm2 restart "$PM2_NAME" --update-env || true + echo "Done: PM2 process restarted" + fi + + # Check for Docker + elif command -v docker &>/dev/null && docker ps 2>/dev/null | grep -qiE 'buywhere|api'; then + echo "Detected: Docker-managed" + CONTAINER=$(docker ps 2>/dev/null | grep -iE 'buywhere|api' | awk '{print $1}' | head -1) + if [ -n "$CONTAINER" ]; then + echo "WARNING: docker restart cannot inject env — add POSTHOG_API_KEY to the container's compose/run configuration" + docker restart "$CONTAINER" 2>/dev/null || true + echo "Done: Docker container restarted" + fi + + # Check for raw process + elif pgrep -f "uvicorn\|gunicorn" &>/dev/null; then + echo "Detected: Raw process (uvicorn/gunicorn)" + # Add to /etc/environment as fallback + if ! grep -q "POSTHOG_API_KEY=" /etc/environment 2>/dev/null; then + echo "POSTHOG_API_KEY=$POSTHOG_KEY" >> /etc/environment + fi + export POSTHOG_API_KEY="$POSTHOG_KEY" + PID=$(pgrep -f "uvicorn\|gunicorn" | head -1) + echo "Process PID: $PID — killed for restart" + kill -TERM "$PID" 2>/dev/null || true + sleep 2 + # Restart hint + echo "WARNING: Manual restart required for raw process" + else + echo "No known service found — adding to /etc/environment" + if ! 
grep -q "POSTHOG_API_KEY=" /etc/environment 2>/dev/null; then + echo "POSTHOG_API_KEY=$POSTHOG_KEY" >> /etc/environment + fi + echo "Added to /etc/environment — verify service picks it up" + fi + + echo "=== PostHog key injection complete ===" + EOF diff --git a/.github/workflows/nginx-deploy.yml b/.github/workflows/nginx-deploy.yml index 3a0d1e667..8ed67838c 100644 --- a/.github/workflows/nginx-deploy.yml +++ b/.github/workflows/nginx-deploy.yml @@ -89,15 +89,15 @@ jobs: # Validate before touching live config nginx -t -c /etc/nginx/nginx.conf 2>&1 || true - cp "${SRC}" "${DEST}" - nginx -t + sudo cp "${SRC}" "${DEST}" + sudo nginx -t if [[ "${DRY_RUN}" == "true" ]]; then echo "DRY RUN: config validated OK, skipping reload" exit 0 fi - nginx -s reload + sudo nginx -s reload echo "nginx reloaded — ${CONFIG_NAME} is live (sha ${DEPLOY_SHA})" # Cleanup tmp diff --git a/api/dist/analytics/posthog.js b/api/dist/analytics/posthog.js index 6c11d5eeb..6f1d27348 100644 --- a/api/dist/analytics/posthog.js +++ b/api/dist/analytics/posthog.js @@ -3,6 +3,8 @@ Object.defineProperty(exports, "__esModule", { value: true }); exports.trackApiQuery = trackApiQuery; exports.trackAffiliateClick = trackAffiliateClick; exports.trackRegistration = trackRegistration; +exports.trackProductSearch = trackProductSearch; +exports.trackProductView = trackProductView; exports.trackComparePageView = trackComparePageView; exports.trackCompareRetailerClick = trackCompareRetailerClick; exports.shutdownPostHog = shutdownPostHog; @@ -77,6 +79,36 @@ function trackRegistration(apiKey, agentName, signupChannel, utmSource) { }, }); } +function trackProductSearch(event) { + const ph = getClient(); + if (!ph) + return; + ph.capture({ + distinctId: event.apiKey, + event: 'product_search', + properties: { + query_text: event.queryText, + result_count: event.resultCount, + response_time_ms: event.responseTimeMs, + source_page: event.sourcePage, + }, + }); +} +function trackProductView(event) { + const ph = 
getClient(); + if (!ph) + return; + ph.capture({ + distinctId: event.apiKey || 'anonymous', + event: 'product_view', + properties: { + product_id: event.productId, + retailer: event.retailer, + category: event.category, + source: event.source, + }, + }); +} function trackComparePageView(event) { const ph = getClient(); if (!ph) diff --git a/api/dist/mcp-server.js b/api/dist/mcp-server.js index 7ce58493a..92927d6cd 100644 --- a/api/dist/mcp-server.js +++ b/api/dist/mcp-server.js @@ -29,6 +29,9 @@ app.get('/health', async (_req, res) => { res.status(500).json({ status: 'error', error: String(err) }); } }); +app.get('/healthz', (_req, res) => { + res.json({ status: 'ok', server: 'mcp' }); +}); app.use('/mcp', mcp_1.default); // JSON-RPC root alias — allow POST / as shorthand for POST /mcp app.use('/', mcp_1.default); diff --git a/api/dist/routes/products.js b/api/dist/routes/products.js index db8a92b4f..f4500f791 100644 --- a/api/dist/routes/products.js +++ b/api/dist/routes/products.js @@ -16,7 +16,7 @@ const router = (0, express_1.Router)(); // GET /v1/products/search // Query params: q, domain, region, country, min_price, max_price, currency, limit, offset, source_page router.get('/search', agentDetect_1.agentDetectMiddleware, apiKey_1.requireApiKey, apiKey_1.checkRateLimit, (0, queryLog_1.queryLogMiddleware)('products.search'), async (req, res) => { - const start = Date.now(); + const requestStart = Date.now(); const q = req.query.q || ''; const domain = req.query.domain; const region = req.query.region; @@ -37,7 +37,7 @@ router.get('/search', agentDetect_1.agentDetectMiddleware, apiKey_1.requireApiKe const cached = await config_1.redis.get(cacheKey); if (cached) { const parsed = JSON.parse(cached); - const elapsed = Date.now() - start; + const elapsed = Date.now() - requestStart; // compact envelope uses flat keys; legacy uses nested meta if (parsed.meta) { parsed.meta.cached = true; @@ -152,7 +152,7 @@ router.get('/search', agentDetect_1.agentDetectMiddleware, 
apiKey_1.requireApiKe params.push(limit, offset); const dataResult = await config_1.db.query(dataQuery, params); const total = parseInt(countResult.rows[0].count, 10); - const responseTimeMs = Date.now() - start; + const responseTimeMs = Date.now() - requestStart; const products = dataResult.rows.map((row) => { if (compact) { // Compact format for AI agents (BUY-2073): Phase 2 shape. @@ -250,6 +250,12 @@ router.get('/search', agentDetect_1.agentDetectMiddleware, apiKey_1.requireApiKe sourcePage: sourcePage || null, endpoint: 'products.search', }); + (0, posthog_1.trackProductSearch)({ + apiKey: (0, apiKey_1.hashKey)(req.apiKeyRecord.key), + queryText: q, + resultCount: products.length, + responseTimeMs, + }); } res.json(responseBody); }); @@ -553,6 +559,12 @@ router.get('/:id', agentDetect_1.agentDetectMiddleware, apiKey_1.requireApiKey, sourcePage: null, endpoint: 'products.get', }); + (0, posthog_1.trackProductView)({ + apiKey: (0, apiKey_1.hashKey)(req.apiKeyRecord.key), + productId: row.id, + retailer: row.domain, + category: (row.category_path ? 
row.category_path.split(' > ')[0] : null), + }); } res.json({ data: product }); }); diff --git a/api/src/analytics/posthog.ts b/api/src/analytics/posthog.ts index 3132eddc0..6d2ebb3ca 100644 --- a/api/src/analytics/posthog.ts +++ b/api/src/analytics/posthog.ts @@ -94,6 +94,52 @@ export function trackRegistration(apiKey: string, agentName: string, signupChann }); } +export interface ProductSearchEvent { + apiKey: string; + queryText: string; + resultCount: number; + responseTimeMs: number; + sourcePage?: string | null; +} + +export function trackProductSearch(event: ProductSearchEvent): void { + const ph = getClient(); + if (!ph) return; + ph.capture({ + distinctId: event.apiKey, + event: 'product_search', + properties: { + query_text: event.queryText, + result_count: event.resultCount, + response_time_ms: event.responseTimeMs, + source_page: event.sourcePage, + }, + }); +} + +export interface ProductViewEvent { + apiKey: string | null; + productId: string; + retailer: string; + category: string | null; + source?: string | null; +} + +export function trackProductView(event: ProductViewEvent): void { + const ph = getClient(); + if (!ph) return; + ph.capture({ + distinctId: event.apiKey || 'anonymous', + event: 'product_view', + properties: { + product_id: event.productId, + retailer: event.retailer, + category: event.category, + source: event.source, + }, + }); +} + export interface ComparePageViewEvent { slug: string; productId: string; diff --git a/api/src/routes/docs.ts b/api/src/routes/docs.ts index 2c0e53cee..8b023c936 100644 --- a/api/src/routes/docs.ts +++ b/api/src/routes/docs.ts @@ -17,17 +17,32 @@ function buildMcpGuideMarkdown(baseUrl: string, mcpUrl: string): string { BuyWhere exposes its product catalog as an MCP (Model Context Protocol) server. AI agents can search, compare, and retrieve product data without writing HTTP glue code. -**Transport:** HTTP (\`POST ${mcpUrl}\`) for remote agents. STDIO (local process) coming soon via npm. 
+**Transport:** HTTP (\`POST ${mcpUrl}\`) for remote agents. STDIO/local process available via the published \`@buywhere/mcp-server\` npm package. ## Install -**The hosted MCP server is live.** Point your MCP client directly at \`${mcpUrl}\` — no local install required. +Use one of two supported setup paths: -> **Note:** The \`buywhere-mcp\` npm package (for STDIO / local process mode) is not yet published. Use the HTTP transport below until it is available. +- **Hosted MCP:** point your MCP client directly at \`${mcpUrl}\` +- **Local MCP package:** run \`npx -y @buywhere/mcp-server\` ## Configure Claude Desktop -Add to \`~/Library/Application Support/Claude/claude_desktop_config.json\` (macOS) or \`%APPDATA%\\Claude\\claude_desktop_config.json\` (Windows): +Add to \`~/Library/Application Support/Claude/claude_desktop_config.json\` (macOS) or \`%APPDATA%\\Claude\\claude_desktop_config.json\` (Windows) for local STDIO mode: + +\`\`\`json +{ + "mcpServers": { + "buywhere": { + "command": "npx", + "args": ["-y", "@buywhere/mcp-server"], + "env": { "BUYWHERE_API_KEY": "bw_live_xxx" } + } + } +} +\`\`\` + +Or for hosted HTTP transport: \`\`\`json { @@ -44,19 +59,24 @@ Restart Claude Desktop. The BuyWhere tools appear automatically. ## Configure Cursor -In \`.cursor/mcp.json\` in your project root (or \`~/.cursor/mcp.json\` globally): +In \`.cursor/mcp.json\` in your project root (or \`~/.cursor/mcp.json\` globally) for local STDIO mode: \`\`\`json { "mcpServers": { "buywhere": { - "url": "${mcpUrl}", - "headers": { "Authorization": "Bearer bw_live_xxx" } + "command": "npx", + "args": ["-y", "@buywhere/mcp-server"], + "env": { "BUYWHERE_API_KEY": "bw_live_xxx" } } } } \`\`\` +Hosted HTTP transport remains valid for cloud or remote setups. + +Restart Cursor. The BuyWhere tools appear automatically. + ## Remote HTTP Transport For agents running in cloud environments: @@ -267,16 +287,27 @@ router.get('/guides/mcp', (req: Request, res: Response) => {
BuyWhere exposes its product catalog as an MCP (Model Context Protocol) server. AI agents can search, compare, and retrieve product data without writing HTTP glue code.
-Transport: HTTP (POST ${mcpUrl}) for remote agents. STDIO (local process) coming soon via npm.
Transport: HTTP (POST ${mcpUrl}) for remote agents. STDIO/local process available via the published @buywhere/mcp-server npm package.
The hosted MCP server is live. Point your MCP client directly at ${mcpUrl} — no local install required.
buywhere-mcp npm package (for STDIO / local process mode) is not yet published. Use the HTTP transport below until it is available.
-Use one of two supported setup paths:
+${mcpUrl}npx -y @buywhere/mcp-serverAdd to ~/Library/Application Support/Claude/claude_desktop_config.json (macOS) or %APPDATA%\\Claude\\claude_desktop_config.json (Windows):
Add to ~/Library/Application Support/Claude/claude_desktop_config.json (macOS) or %APPDATA%\\Claude\\claude_desktop_config.json (Windows) for local STDIO mode:
{
+ "mcpServers": {
+ "buywhere": {
+ "command": "npx",
+ "args": ["-y", "@buywhere/mcp-server"],
+ "env": { "BUYWHERE_API_KEY": "bw_live_xxx" }
+ }
+ }
+}
+Or for hosted HTTP transport:
{
"mcpServers": {
"buywhere": {
@@ -287,6 +318,21 @@ router.get('/guides/mcp', (req: Request, res: Response) => {
}
Restart Claude Desktop. The BuyWhere tools appear automatically.
+In .cursor/mcp.json in your project root (or ~/.cursor/mcp.json globally) for local STDIO mode:
{
+ "mcpServers": {
+ "buywhere": {
+ "command": "npx",
+ "args": ["-y", "@buywhere/mcp-server"],
+ "env": { "BUYWHERE_API_KEY": "bw_live_xxx" }
+ }
+ }
+}
+Hosted HTTP transport remains valid for cloud or remote setups.
+Restart Cursor. The BuyWhere tools appear automatically.
+Restart Claude Desktop. The BuyWhere tools appear automatically.
+In .cursor/mcp.json in your project root (or ~/.cursor/mcp.json globally):
{
diff --git a/deploy/gcp/api-service.yaml b/deploy/gcp/api-service.yaml
index 678a7a1c1..d6d82b6ca 100644
--- a/deploy/gcp/api-service.yaml
+++ b/deploy/gcp/api-service.yaml
@@ -28,14 +28,16 @@ spec:
env:
- name: API_BASE_URL
value: "https://api.buywhere.ai"
+ - name: APP_BASE_URL
+ value: "https://api.buywhere.ai"
- name: PG_POOL_MAX
value: "10"
- # Cloud SQL via Unix socket (PgBouncer not needed with Cloud SQL Proxy)
+ # Cloud SQL via Unix socket
- name: DATABASE_URL
valueFrom:
secretKeyRef:
- name: buywhere-db-url
- key: latest
+ name: buywhere-db-url
+ key: latest
- name: REDIS_HOST
valueFrom:
secretKeyRef:
diff --git a/deploy/gcp/mcp-service.yaml b/deploy/gcp/mcp-service.yaml
index 89067e175..bdf6dbdda 100644
--- a/deploy/gcp/mcp-service.yaml
+++ b/deploy/gcp/mcp-service.yaml
@@ -50,14 +50,14 @@ spec:
memory: "512Mi"
livenessProbe:
httpGet:
- path: /health
+ path: /healthz
port: 8081
initialDelaySeconds: 5
periodSeconds: 10
startupProbe:
httpGet:
- path: /health
+ path: /healthz
port: 8081
- initialDelaySeconds: 10
+ initialDelaySeconds: 30
periodSeconds: 10
failureThreshold: 30
diff --git a/deploy/nginx/api.buywhere.ai.conf b/deploy/nginx/api.buywhere.ai.conf
index 9753ab3b7..90599a218 100644
--- a/deploy/nginx/api.buywhere.ai.conf
+++ b/deploy/nginx/api.buywhere.ai.conf
@@ -1,68 +1,53 @@
-# Global nginx configuration for BuyWhere API
-# api.buywhere.ai - Main API server
+# Site fragment for api.buywhere.ai
+# Deployed by nginx-deploy.yml into /etc/nginx/sites-enabled/
+# Global http-level settings (gzip, log_format, etc.) live in /etc/nginx/nginx.conf.
-events {
- worker_connections 1024;
+upstream api_backend {
+ server 127.0.0.1:8000;
+ keepalive 32;
}
-http {
- include /etc/nginx/mime.types;
- default_type application/octet-stream;
-
- log_format main '$remote_addr - $remote_user [$time_local] "$request" '
- '$status $body_bytes_sent "$http_referer" '
- '"$http_user_agent" "$http_x_forwarded_for"';
-
- access_log /var/log/nginx/access.log main;
- error_log /var/log/nginx/error.log warn;
-
- sendfile on;
- tcp_nopush on;
- tcp_nodelay on;
- keepalive_timeout 65;
- types_hash_max_size 2048;
-
- gzip on;
- gzip_vary on;
- gzip_proxied any;
- gzip_comp_level 6;
- gzip_types text/plain text/css text/xml application/json application/javascript application/xml+rss;
-
- upstream api_backend {
- server 127.0.0.1:8000;
- keepalive 32;
+server {
+ listen 443 ssl http2;
+ server_name api.buywhere.ai;
+
+ ssl_certificate /etc/letsencrypt/live/api.buywhere.ai/fullchain.pem;
+ ssl_certificate_key /etc/letsencrypt/live/api.buywhere.ai/privkey.pem;
+ include /etc/letsencrypt/options-ssl-nginx.conf;
+ ssl_dhparam /etc/letsencrypt/ssl-dhparams.pem;
+
+ # Security headers
+ add_header X-Frame-Options "SAMEORIGIN" always;
+ add_header X-Content-Type-Options "nosniff" always;
+ add_header X-XSS-Protection "1; mode=block" always;
+ add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
+
+ location /.well-known/glama.json {
+ alias /home/paperclip/buywhere-api/glama.json;
+ add_header Content-Type application/json;
+ add_header Cache-Control "public, max-age=86400";
}
- server {
- listen 443 ssl http2;
- server_name api.buywhere.ai;
-
- ssl_certificate /etc/letsencrypt/live/api.buywhere.ai/fullchain.pem;
- ssl_certificate_key /etc/letsencrypt/live/api.buywhere.ai/privkey.pem;
- include /etc/letsencrypt/options-ssl-nginx.conf;
- ssl_dhparam /etc/letsencrypt/ssl-dhparams.pem;
-
- # Security headers
- add_header X-Frame-Options "SAMEORIGIN" always;
- add_header X-Content-Type-Options "nosniff" always;
- add_header X-XSS-Protection "1; mode=block" always;
- add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
-
- location /.well-known/glama.json {
- alias /home/paperclip/buywhere-api/glama.json;
- add_header Content-Type application/json;
- add_header Cache-Control "public, max-age=86400";
- }
+ location /mcp {
+ proxy_pass http://127.0.0.1:8000;
+ proxy_http_version 1.1;
+ proxy_set_header Host $host;
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header X-Forwarded-Proto $scheme;
+ proxy_set_header Connection "";
+ proxy_buffering off;
+ proxy_read_timeout 300s;
+ }
- location / {
- proxy_pass http://api_backend;
- proxy_http_version 1.1;
- proxy_set_header Host $host;
- proxy_set_header X-Real-IP $remote_addr;
- proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
- proxy_set_header X-Forwarded-Proto $scheme;
- proxy_set_header Connection "";
- proxy_buffering off;
- }
+ location / {
+ proxy_pass http://api_backend;
+ proxy_http_version 1.1;
+ proxy_set_header Host $host;
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header X-Forwarded-Proto $scheme;
+ proxy_set_header Connection "";
+ proxy_buffering off;
}
}
\ No newline at end of file
diff --git a/deploy/nginx/buywhere.ai.conf b/deploy/nginx/buywhere.ai.conf
new file mode 100644
index 000000000..27c61cc78
--- /dev/null
+++ b/deploy/nginx/buywhere.ai.conf
@@ -0,0 +1,37 @@
+server {
+ listen 80;
+ server_name buywhere.ai www.buywhere.ai;
+ return 301 https://$host$request_uri;
+}
+
+server {
+ listen 443 ssl;
+ server_name buywhere.ai www.buywhere.ai;
+
+ ssl_certificate /etc/letsencrypt/live/api.buywhere.ai/fullchain.pem;
+ ssl_certificate_key /etc/letsencrypt/live/api.buywhere.ai/privkey.pem;
+ include /etc/letsencrypt/options-ssl-nginx.conf;
+ ssl_dhparam /etc/letsencrypt/ssl-dhparams.pem;
+
+ location /openapi.json {
+ return 308 https://api.buywhere.ai/openapi.json;
+ }
+
+ location /healthz {
+ proxy_pass https://buywhere-site-production-3cjo6zft4q-as.a.run.app;
+ proxy_set_header Host buywhere.ai;
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header X-Forwarded-Proto $scheme;
+ proxy_read_timeout 30s;
+ }
+
+ location / {
+ proxy_pass https://buywhere-site-production-3cjo6zft4q-as.a.run.app;
+ proxy_set_header Host buywhere.ai;
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header X-Forwarded-Proto $scheme;
+ proxy_read_timeout 30s;
+ }
+}
\ No newline at end of file
diff --git a/ecs/amazon_au-task-definition.json b/ecs/amazon_au-task-definition.json
new file mode 100644
index 000000000..73930d068
--- /dev/null
+++ b/ecs/amazon_au-task-definition.json
@@ -0,0 +1,95 @@
+{
+ "family": "buywhere-scraper-amazon_au",
+ "networkMode": "awsvpc",
+ "requiresCompatibilities": [
+ "FARGATE"
+ ],
+ "cpu": "1024",
+ "memory": "2048",
+ "containerDefinitions": [
+ {
+ "name": "amazon-au-scraper",
+ "image": "buywhere/scraper:latest",
+ "essential": true,
+ "command": [
+ "python",
+ "-m",
+ "scrapers.amazon_au",
+ "--scrape-only",
+ "--target",
+ "500000"
+ ],
+ "environment": [
+ {
+ "name": "SCRAPER_TYPE",
+ "value": "amazon_au"
+ },
+ {
+ "name": "REDIS_URL",
+ "value": "redis://redis.buywhere:6379/0"
+ },
+ {
+ "name": "DATABASE_URL",
+ "value": "postgresql+asyncpg://buywhere:buywhere@db.buywhere:5432/catalog"
+ },
+ {
+ "name": "LOG_LEVEL",
+ "value": "INFO"
+ }
+ ],
+ "secrets": [
+ {
+ "name": "PRODUCT_API_KEY",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/product-api-key:PRODUCT_API_KEY::"
+ },
+ {
+ "name": "SCRAPER_USER_AGENT",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/scraper-agents:SCRAPER_USER_AGENT::"
+ },
+ {
+ "name": "SCRAPERAPI_KEY",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/scraper-api-key:SCRAPERAPI_KEY::"
+ }
+ ],
+ "logConfiguration": {
+ "logDriver": "awslogs",
+ "options": {
+ "awslogs-group": "/ecs/buywhere/scrapers",
+ "awslogs-region": "ap-southeast-1",
+ "awslogs-stream-prefix": "amazon_au"
+ }
+ },
+ "healthCheck": {
+ "command": [
+ "CMD-SHELL",
+ "exit 0"
+ ],
+ "interval": 30,
+ "timeout": 5,
+ "retries": 3,
+ "startPeriod": 120
+ },
+ "portMappings": [],
+ "mountPoints": [],
+ "volumesFrom": []
+ }
+ ],
+ "executionRoleArn": "arn:aws:iam::123456789:role/ecsTaskExecutionRole",
+ "taskRoleArn": "arn:aws:iam::123456789:role/ecsScraperTaskRole",
+ "volumes": [],
+ "placementConstraints": [],
+ "tags": [
+ {
+ "key": "Environment",
+ "value": "production"
+ },
+ {
+ "key": "Project",
+ "value": "BuyWhere"
+ },
+ {
+ "key": "Scraper",
+ "value": "amazon_au"
+ }
+ ]
+}
diff --git a/ecs/shopee_ph-task-definition.json b/ecs/shopee_ph-task-definition.json
new file mode 100644
index 000000000..8dd7499b5
--- /dev/null
+++ b/ecs/shopee_ph-task-definition.json
@@ -0,0 +1,113 @@
+{
+ "family": "buywhere-scraper-shopee_ph",
+ "networkMode": "awsvpc",
+ "requiresCompatibilities": [
+ "FARGATE"
+ ],
+ "cpu": "512",
+ "memory": "1024",
+ "containerDefinitions": [
+ {
+ "name": "shopee-ph-scraper",
+ "image": "buywhere/scraper:latest",
+ "essential": true,
+ "command": [
+ "python",
+ "-m",
+ "scrapers.shopee_ph",
+ "--use-scraperapi"
+ ],
+ "environment": [
+ {
+ "name": "SCRAPER_TYPE",
+ "value": "shopee_ph"
+ },
+ {
+ "name": "REDIS_URL",
+ "value": "redis://redis.buywhere:6379/0"
+ },
+ {
+ "name": "DATABASE_URL",
+ "value": "postgresql+asyncpg://buywhere:buywhere@db.buywhere:5432/catalog"
+ },
+ {
+ "name": "LOG_LEVEL",
+ "value": "INFO"
+ }
+ ],
+ "secrets": [
+ {
+ "name": "PRODUCT_API_KEY",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/product-api-key:PRODUCT_API_KEY::"
+ },
+ {
+ "name": "SCRAPER_USER_AGENT",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/scraper-agents:SCRAPER_USER_AGENT::"
+ },
+ {
+ "name": "SCRAPERAPI_KEY",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/scraper-api-key:SCRAPERAPI_KEY::"
+ },
+ {
+ "name": "BRIGHTDATA_USERNAME",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/brightdata:BRIGHTDATA_USERNAME::"
+ },
+ {
+ "name": "BRIGHTDATA_PASSWORD",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/brightdata:BRIGHTDATA_PASSWORD::"
+ },
+ {
+ "name": "BRIGHTDATA_PROXY_HOST",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/brightdata:BRIGHTDATA_PROXY_HOST::"
+ },
+ {
+ "name": "BRIGHTDATA_PROXY_PORT",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/brightdata:BRIGHTDATA_PROXY_PORT::"
+ },
+ {
+ "name": "BRIGHTDATA_PH_PORT",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/brightdata:BRIGHTDATA_PH_PORT::"
+ }
+ ],
+ "logConfiguration": {
+ "logDriver": "awslogs",
+ "options": {
+ "awslogs-group": "/ecs/buywhere/scrapers",
+ "awslogs-region": "ap-southeast-1",
+ "awslogs-stream-prefix": "shopee_ph"
+ }
+ },
+ "healthCheck": {
+ "command": [
+ "CMD-SHELL",
+ "exit 0"
+ ],
+ "interval": 30,
+ "timeout": 5,
+ "retries": 3,
+ "startPeriod": 60
+ },
+ "portMappings": [],
+ "mountPoints": [],
+ "volumesFrom": []
+ }
+ ],
+ "executionRoleArn": "arn:aws:iam::123456789:role/ecsTaskExecutionRole",
+ "taskRoleArn": "arn:aws:iam::123456789:role/ecsScraperTaskRole",
+ "volumes": [],
+ "placementConstraints": [],
+ "tags": [
+ {
+ "key": "Environment",
+ "value": "production"
+ },
+ {
+ "key": "Project",
+ "value": "BuyWhere"
+ },
+ {
+ "key": "Scraper",
+ "value": "shopee_ph"
+ }
+ ]
+}
diff --git a/ecs/shopee_th-task-definition.json b/ecs/shopee_th-task-definition.json
new file mode 100644
index 000000000..dde1f1b91
--- /dev/null
+++ b/ecs/shopee_th-task-definition.json
@@ -0,0 +1,93 @@
+{
+ "family": "buywhere-scraper-shopee_th",
+ "networkMode": "awsvpc",
+ "requiresCompatibilities": [
+ "FARGATE"
+ ],
+ "cpu": "512",
+ "memory": "1024",
+ "containerDefinitions": [
+ {
+ "name": "shopee-th-scraper",
+ "image": "buywhere/scraper:latest",
+ "essential": true,
+ "command": [
+ "python",
+ "-m",
+ "scrapers.shopee_th",
+ "--use-scraperapi"
+ ],
+ "environment": [
+ {
+ "name": "SCRAPER_TYPE",
+ "value": "shopee_th"
+ },
+ {
+ "name": "REDIS_URL",
+ "value": "redis://redis.buywhere:6379/0"
+ },
+ {
+ "name": "DATABASE_URL",
+ "value": "postgresql+asyncpg://buywhere:buywhere@db.buywhere:5432/catalog"
+ },
+ {
+ "name": "LOG_LEVEL",
+ "value": "INFO"
+ }
+ ],
+ "secrets": [
+ {
+ "name": "PRODUCT_API_KEY",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/product-api-key:PRODUCT_API_KEY::"
+ },
+ {
+ "name": "SCRAPER_USER_AGENT",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/scraper-agents:SCRAPER_USER_AGENT::"
+ },
+ {
+ "name": "SCRAPERAPI_KEY",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/scraper-api-key:SCRAPERAPI_KEY::"
+ }
+ ],
+ "logConfiguration": {
+ "logDriver": "awslogs",
+ "options": {
+ "awslogs-group": "/ecs/buywhere/scrapers",
+ "awslogs-region": "ap-southeast-1",
+ "awslogs-stream-prefix": "shopee_th"
+ }
+ },
+ "healthCheck": {
+ "command": [
+ "CMD-SHELL",
+ "exit 0"
+ ],
+ "interval": 30,
+ "timeout": 5,
+ "retries": 3,
+ "startPeriod": 60
+ },
+ "portMappings": [],
+ "mountPoints": [],
+ "volumesFrom": []
+ }
+ ],
+ "executionRoleArn": "arn:aws:iam::123456789:role/ecsTaskExecutionRole",
+ "taskRoleArn": "arn:aws:iam::123456789:role/ecsScraperTaskRole",
+ "volumes": [],
+ "placementConstraints": [],
+ "tags": [
+ {
+ "key": "Environment",
+ "value": "production"
+ },
+ {
+ "key": "Project",
+ "value": "BuyWhere"
+ },
+ {
+ "key": "Scraper",
+ "value": "shopee_th"
+ }
+ ]
+}
diff --git a/ecs/zalora_my-task-definition.json b/ecs/zalora_my-task-definition.json
new file mode 100644
index 000000000..bf51d78eb
--- /dev/null
+++ b/ecs/zalora_my-task-definition.json
@@ -0,0 +1,93 @@
+{
+ "family": "buywhere-scraper-zalora_my",
+ "networkMode": "awsvpc",
+ "requiresCompatibilities": [
+ "FARGATE"
+ ],
+ "cpu": "512",
+ "memory": "1024",
+ "containerDefinitions": [
+ {
+ "name": "zalora-my-scraper",
+ "image": "buywhere/scraper:latest",
+ "essential": true,
+ "command": [
+ "python",
+ "-m",
+ "scrapers.zalora_my",
+ "--use-scraperapi"
+ ],
+ "environment": [
+ {
+ "name": "SCRAPER_TYPE",
+ "value": "zalora_my"
+ },
+ {
+ "name": "REDIS_URL",
+ "value": "redis://redis.buywhere:6379/0"
+ },
+ {
+ "name": "DATABASE_URL",
+ "value": "postgresql+asyncpg://buywhere:buywhere@db.buywhere:5432/catalog"
+ },
+ {
+ "name": "LOG_LEVEL",
+ "value": "INFO"
+ }
+ ],
+ "secrets": [
+ {
+ "name": "PRODUCT_API_KEY",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/product-api-key:PRODUCT_API_KEY::"
+ },
+ {
+ "name": "SCRAPER_USER_AGENT",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/scraper-agents:SCRAPER_USER_AGENT::"
+ },
+ {
+ "name": "SCRAPERAPI_KEY",
+ "valueFrom": "arn:aws:secretsmanager:ap-southeast-1:123456789:secret:buywhere/scraper-api-key:SCRAPERAPI_KEY::"
+ }
+ ],
+ "logConfiguration": {
+ "logDriver": "awslogs",
+ "options": {
+ "awslogs-group": "/ecs/buywhere/scrapers",
+ "awslogs-region": "ap-southeast-1",
+ "awslogs-stream-prefix": "zalora_my"
+ }
+ },
+ "healthCheck": {
+ "command": [
+ "CMD-SHELL",
+ "exit 0"
+ ],
+ "interval": 30,
+ "timeout": 5,
+ "retries": 3,
+ "startPeriod": 60
+ },
+ "portMappings": [],
+ "mountPoints": [],
+ "volumesFrom": []
+ }
+ ],
+ "executionRoleArn": "arn:aws:iam::123456789:role/ecsTaskExecutionRole",
+ "taskRoleArn": "arn:aws:iam::123456789:role/ecsScraperTaskRole",
+ "volumes": [],
+ "placementConstraints": [],
+ "tags": [
+ {
+ "key": "Environment",
+ "value": "production"
+ },
+ {
+ "key": "Project",
+ "value": "BuyWhere"
+ },
+ {
+ "key": "Scraper",
+ "value": "zalora_my"
+ }
+ ]
+}
diff --git a/public/apis.json b/public/apis.json
new file mode 100644
index 000000000..92561a78d
--- /dev/null
+++ b/public/apis.json
@@ -0,0 +1,78 @@
+{
+ "aid": "buywhere.ai:buywhere-public-api-index",
+ "name": "BuyWhere Public API Index",
+ "description": "Machine-readable index for BuyWhere's public developer API and agent integration surfaces.",
+ "url": "https://buywhere.ai/apis.json",
+ "type": "Index",
+ "specificationVersion": "0.18",
+ "created": "2026-05-01",
+ "modified": "2026-05-01",
+ "tags": [
+ "api",
+ "developer-tools",
+ "ecommerce",
+ "shopping",
+ "product-search",
+ "price-comparison",
+ "ai-agents",
+ "mcp"
+ ],
+ "maintainers": [
+ {
+ "FN": "BuyWhere",
+ "email": "api@buywhere.ai",
+ "url": "https://buywhere.ai"
+ }
+ ],
+ "apis": [
+ {
+ "aid": "buywhere.ai:catalog-api",
+ "name": "BuyWhere Catalog API",
+ "description": "Product search, offer comparison, and merchant handoff API for AI shopping agents.",
+ "baseURL": "https://api.buywhere.ai/v1",
+ "humanURL": "https://api.buywhere.ai/docs/guides/mcp",
+ "tags": [
+ "rest",
+ "openapi",
+ "mcp",
+ "ai-agents",
+ "product-search",
+ "commerce"
+ ],
+ "properties": [
+ {
+ "type": "Documentation",
+ "url": "https://api.buywhere.ai/docs/guides/mcp"
+ },
+ {
+ "type": "OpenAPI",
+ "url": "https://api.buywhere.ai/api/openapi.json"
+ },
+ {
+ "type": "OpenAPI Alias",
+ "url": "https://buywhere.ai/openapi.json"
+ },
+ {
+ "type": "Plugin Manifest",
+ "url": "https://api.buywhere.ai/.well-known/ai-plugin.json"
+ },
+ {
+ "type": "Signup",
+ "url": "https://api.buywhere.ai/v1/developers/signup"
+ },
+ {
+ "type": "Terms of Service",
+ "url": "https://buywhere.ai/terms"
+ },
+ {
+ "type": "Authentication",
+ "url": "https://api.buywhere.ai/docs/guides/mcp#authentication"
+ },
+ {
+ "type": "MCP Endpoint",
+ "url": "https://api.buywhere.ai/mcp"
+ }
+ ]
+ }
+ ]
+}
diff --git a/scrapers/amazon_au.py b/scrapers/amazon_au.py
new file mode 100644
index 000000000..6666c3a03
--- /dev/null
+++ b/scrapers/amazon_au.py
@@ -0,0 +1,700 @@
+"""
+Amazon Australia product scraper using ScraperAPI premium residential proxies.
+
+Target: 500,000+ products across Electronics, Home, Fashion, Sports, Beauty.
+Tag: region=au, country_code=AU, currency=AUD
+
+Key features:
+- ScraperAPI premium residential tier for anti-bot bypass
+- Search-based scraping across keyword-driven categories
+- Category path tagging for filtered catalog queries
+- Deduplication by ASIN
+- Concurrent scraping with per-category semaphore
+
+Usage:
+ SCRAPERAPI_KEY=... python -m scrapers.amazon_au --scrape-only --target 500000
+ SCRAPERAPI_KEY=... python -m scrapers.amazon_au --api-key --batch-size 200
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import random
+import re
+import time
+import urllib.parse
+from typing import Any
+
+import httpx
+from bs4 import BeautifulSoup
+
# Identifiers stamped onto every emitted catalog row.
MERCHANT_ID = "amazon_au"
SOURCE = "amazon_au"
BASE_URL = "https://www.amazon.com.au"
# NDJSON output location for scrape-only runs (host-specific path).
OUTPUT_DIR = "/home/paperclip/buywhere-api/data/amazon_au"

# Desktop-browser headers forwarded through ScraperAPI (keep_headers mode).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-AU,en;q=0.9",
    "Referer": "https://www.amazon.com.au/",
}

# Search keywords per category. Each keyword is paginated independently;
# the category "name" becomes the row's category, and the keyword is
# appended to category_path when it differs from the category name.
CATEGORIES = [
    {
        "id": "electronics",
        "name": "Electronics",
        "keywords": [
            "laptop",
            "smartphone",
            "headphones wireless",
            "4k monitor",
            "keyboard mechanical",
            "gaming mouse",
            "power bank",
            "smart watch",
            "tablet",
            "bluetooth speaker",
            "smart home hub",
            "webcam hd",
            "camera digital",
            "tv 55 inch",
            "soundbar",
        ],
    },
    {
        "id": "home",
        "name": "Home & Kitchen",
        "keywords": [
            "air fryer",
            "vacuum cleaner robot",
            "bedding set queen",
            "storage organiser",
            "desk lamp led",
            "cookware set nonstick",
            "office chair ergonomic",
            "water bottle stainless",
            "knife block set",
            "coffee maker machine",
            "air purifier HEPA",
            "mattress queen memory foam",
            "curtains blackout",
            "toaster oven",
            "blender high speed",
        ],
    },
    {
        "id": "fashion",
        "name": "Fashion",
        "keywords": [
            "women dress summer",
            "men shirt casual cotton",
            "running shoes women",
            "sneakers men",
            "handbag leather women",
            "wallet men leather",
            "hoodie pullover",
            "jacket winter women",
            "jeans women skinny",
            "athletic wear women",
            "socks pack cotton",
            "hat baseball cap",
            "scarf wool women",
            "sunglasses men UV",
            "watch men dress",
        ],
    },
    {
        "id": "beauty",
        "name": "Beauty & Personal Care",
        "keywords": [
            "skincare routine set",
            "makeup palette eyeshadow",
            "moisturiser face SPF",
            "shampoo hair loss",
            "electric toothbrush",
            "perfume women designer",
            "cologne men signature",
            "nail polish set",
            "hair dryer professional",
            "razor men electric",
            "lipstick long lasting",
            "serum vitamin C",
            "face mask sheet",
            "makeup brush set",
            "sunscreen SPF 50",
        ],
    },
    {
        "id": "sports",
        "name": "Sports & Outdoors",
        "keywords": [
            "fitness equipment home gym",
            "camping gear essentials",
            "hiking backpack 40L",
            "cycling helmet adult",
            "yoga mat premium thick",
            "dumbbells set 20kg",
            "fishing rod combo",
            "running shoes men trail",
            "sports jersey football",
            "outdoor recreation equipment",
            "winter sports gear",
            "team sports ball soccer",
            "sports accessories fitness",
            "kettlebell cast iron",
            "exercise bike magnetic",
        ],
    },
    {
        "id": "toys",
        "name": "Toys & Games",
        "keywords": [
            "lego city set",
            "barbie doll fashion",
            "action figure marvel",
            "board game family",
            "puzzle 1000 pieces",
            "outdoor toy kids",
            "educational toy age 5",
            "remote control car 4wd",
            "building blocks creative",
            "dollhouse wooden",
            "arts crafts kit kids",
            "nerf gun elite",
            "video game console ps5",
            "toy train set",
            "dinosaur action figure",
        ],
    },
    {
        "id": "books",
        "name": "Books",
        "keywords": [
            "bestseller fiction 2024",
            "self help book",
            "cookbook australian",
            "biography famous",
            "science fiction book",
            "mystery novel thriller",
            "business book leadership",
            "children books age 6",
            "audiobook bestseller",
            "journal refill lined",
            "colouring book adult",
            "puzzle book crosswords",
            "fantasy novel 2024",
            "romance book bestseller",
            "travel guide australia",
        ],
    },
    {
        "id": "grocery",
        "name": "Grocery & Gourmet",
        "keywords": [
            "organic coffee beans",
            "protein powder",
            "healthy snack boxes",
            "olive oil extra virgin",
            "tea loose leaf",
            "chocolate block dark",
            "nuts mixed raw",
            "pasta dried italian",
            "sauce pasta tomato",
            "spice rack set",
            "honey manuka",
            "milk oat organic",
            "bread sourdough",
            "cereal granola",
            "rice basmati long grain",
        ],
    },
]
+
+
+class AmazonAUScraper:
+ def __init__(
+ self,
+ scrape_only: bool = True,
+ target: int = 500000,
+ delay: float = 1.5,
+ concurrency: int = 8,
+ max_pages_per_keyword: int = 50,
+ scraperapi_key: str | None = None,
+ batch_size: int = 100,
+ api_base: str = "http://localhost:8000",
+ api_key: str | None = None,
+ ):
+ self.scrape_only = scrape_only
+ self.target = target
+ self.delay = delay
+ self.concurrency = concurrency
+ self.max_pages_per_keyword = max_pages_per_keyword
+ self.scraperapi_key = scraperapi_key or os.environ.get("SCRAPERAPI_KEY", "")
+ self.batch_size = batch_size
+ self.api_base = api_base.rstrip("/")
+ self.api_key = api_key or os.environ.get("BUYWHERE_API_KEY")
+ self._semaphore = asyncio.Semaphore(concurrency)
+ self._client = httpx.AsyncClient(timeout=120.0, headers=HEADERS)
+ self._seen_asins: set[str] = set()
+ self._all_seen_skus: set[str] = set()
+ self.total_scraped = 0
+ self.total_ingested = 0
+ self.total_updated = 0
+ self.total_failed = 0
+ self._ensure_output_dir()
+ ts = time.strftime("%Y%m%d_%H%M%S")
+ self._output_file = os.path.join(OUTPUT_DIR, f"amazon_au_{ts}.ndjson")
+ self._skipped_file = os.path.join(OUTPUT_DIR, f"amazon_au_skipped_{ts}.txt")
+
+ def _ensure_output_dir(self) -> None:
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ async def close(self) -> None:
+ await self._client.aclose()
+
+ async def _fetch_with_scraperapi(
+ self, url: str, retries: int = 3, autoparse: bool = False
+ ) -> str | None:
+ encoded_url = urllib.parse.quote(url, safe="")
+ flags = "&autoparse=true" if autoparse else "&premium=true&keep_headers=true"
+ proxy_url = (
+ f"http://api.scraperapi.com?api_key={self.scraperapi_key}&url={encoded_url}"
+ f"{flags}"
+ )
+ await asyncio.sleep(random.uniform(0.5, 2.0))
+ for attempt in range(retries):
+ try:
+ resp = await self._client.get(proxy_url, timeout=120.0)
+ text = resp.text
+ if resp.status_code == 200 and len(text) > 1000:
+ if "captcha" in text.lower() or "robot check" in text.lower():
+ print(f" CAPTCHA/robot check for {url}, retry {attempt+1}/{retries}")
+ await asyncio.sleep(15 * (attempt + 1))
+ continue
+ return text
+ elif resp.status_code in (500, 429, 503):
+ wait_time = (2 ** attempt) * 10
+ print(f" HTTP {resp.status_code}, waiting {wait_time}s")
+ await asyncio.sleep(wait_time)
+ continue
+ except Exception as e:
+ print(f" Exception fetching {url}: {e}")
+ await asyncio.sleep(2 ** attempt)
+ continue
+ return None
+
+ def _parse_autoparse_results(
+ self, raw_text: str
+ ) -> tuple[list[dict[str, Any]], int]:
+ try:
+ data = json.loads(raw_text)
+ items = data.get("results", []) if isinstance(data, dict) else []
+ except Exception:
+ return [], 0
+
+ products: list[dict[str, Any]] = []
+ for item in items:
+ if item.get("type") != "search_product":
+ continue
+ try:
+ asin = item.get("asin", "")
+ if not asin or asin in self._seen_asins:
+ continue
+
+ price_info = item.get("price", {}) or {}
+ price_val = price_info.get("value", 0) if isinstance(price_info, dict) else 0
+ orig_info = item.get("original_price") or {}
+ orig_val = (
+ orig_info.get("value", price_val)
+ if isinstance(orig_info, dict)
+ else price_val
+ )
+
+ rating_info = item.get("rating", {}) or {}
+ rating = rating_info.get("value", 0) if isinstance(rating_info, dict) else 0
+ review_count = rating_info.get("total_reviews", 0) if isinstance(rating_info, dict) else 0
+
+ url = item.get("url", "") or f"{BASE_URL}/dp/{asin}"
+ if not url.startswith("http"):
+ url = urllib.parse.urljoin(BASE_URL, url)
+
+ raw_product = {
+ "asin": asin,
+ "title": item.get("name", ""),
+ "url": url,
+ "price": price_val,
+ "original_price": orig_val,
+ "image_url": item.get("thumbnail", ""),
+ "rating": rating,
+ "review_count": review_count,
+ "is_sponsored": bool(item.get("is_sponsored")),
+ }
+ transformed = self.transform_product(raw_product, "", "")
+ if transformed:
+ products.append(transformed)
+ except Exception:
+ continue
+
+ next_page = data.get("next_page", 0) if isinstance(data, dict) else 0
+ return products, next_page
+
+ def _write_products(self, products: list[dict]) -> None:
+ if not products:
+ return
+ with open(self._output_file, "a", encoding="utf-8") as f:
+ for p in products:
+ sku = p.get("sku", "")
+ if sku:
+ self._all_seen_skus.add(sku)
+ f.write(json.dumps(p, ensure_ascii=False) + "\n")
+
+ def _log_skipped(self, asin: str, reason: str) -> None:
+ with open(self._skipped_file, "a", encoding="utf-8") as f:
+ f.write(f"{asin}: {reason}\n")
+
+ def _parse_price(self, value: str | None) -> float:
+ if not value:
+ return 0.0
+ cleaned = value.replace("A$", "").replace("$", "").replace("£", "").replace(",", "").strip()
+ match = re.search(r"\d+(?:\.\d+)?", cleaned)
+ return float(match.group(0)) if match else 0.0
+
+ def _parse_int(self, value: str | None) -> int:
+ if not value:
+ return 0
+ digits = re.sub(r"[^\d]", "", value)
+ return int(digits) if digits else 0
+
+ def _extract_brand(self, title: str) -> str:
+ if not title:
+ return ""
+ first_token = title.split()[0].strip("()-[]:,.")
+ if not first_token or any(c.isdigit() for c in first_token):
+ return ""
+ return first_token[:80]
+
+ def parse_search_results(
+ self, html: str, category_name: str, keyword: str
+ ) -> tuple[list[dict[str, Any]], bool]:
+ soup = BeautifulSoup(html, "html.parser")
+ products: list[dict[str, Any]] = []
+
+ for card in soup.select('[data-component-type="s-search-result"][data-asin]'):
+ asin = (card.get("data-asin") or "").strip()
+ if not asin:
+ continue
+
+ title_el = card.select_one("h2 span")
+ if not title_el:
+ continue
+
+ link_el = card.select_one("h2 a")
+ price_el = card.select_one(".a-price .a-offscreen")
+ original_price_el = card.select_one(".a-text-price .a-offscreen")
+ image_el = card.select_one("img.s-image")
+ rating_el = card.select_one(".a-icon-alt")
+ review_el = card.select_one('a[href*="#customerReviews"] span')
+ sponsored_el = card.select_one(
+ '[aria-label="Sponsored"], .puis-sponsored-label-text'
+ )
+
+ raw_product = {
+ "asin": asin,
+ "title": title_el.get_text(" ", strip=True),
+ "url": link_el.get("href", "") if link_el else "",
+ "price": price_el.get_text(strip=True) if price_el else "",
+ "original_price": (
+ original_price_el.get_text(strip=True)
+ if original_price_el
+ else ""
+ ),
+ "image_url": image_el.get("src", "") if image_el else "",
+ "rating": rating_el.get_text(" ", strip=True) if rating_el else "",
+ "review_count": (
+ review_el.get_text(" ", strip=True) if review_el else ""
+ ),
+ "is_sponsored": sponsored_el is not None,
+ }
+
+ transformed = self.transform_product(
+ raw_product, category_name, keyword
+ )
+ if transformed:
+ products.append(transformed)
+
+ next_btn = soup.select_one(".s-pagination-next:not(.s-pagination-disabled)")
+ has_next_page = next_btn is not None
+ return products, has_next_page
+
+ def transform_product(
+ self, raw: dict[str, Any], category_name: str, keyword: str
+ ) -> dict[str, Any] | None:
+ try:
+ asin = str(raw.get("asin", "") or raw.get("sku", "")).strip()
+ if not asin or asin in self._seen_asins:
+ return None
+ self._seen_asins.add(asin)
+
+ title = (raw.get("title") or "").strip()
+ if not title:
+ return None
+
+ url = raw.get("url") or f"{BASE_URL}/dp/{asin}"
+ if not url.startswith("http"):
+ url = urllib.parse.urljoin(BASE_URL, url)
+
+ price = self._parse_price(raw.get("price"))
+ original_price = (
+ self._parse_price(raw.get("original_price")) or price
+ )
+ review_count = self._parse_int(raw.get("review_count"))
+
+ rating = 0.0
+ rating_text = raw.get("rating") or ""
+ rating_match = re.search(r"(\d+(?:\.\d+)?)", rating_text)
+ if rating_match:
+ rating = float(rating_match.group(1))
+
+ category_path = [category_name]
+ if keyword and keyword.lower() != category_name.lower():
+ category_path.append(keyword)
+
+ return {
+ "sku": asin,
+ "merchant_id": MERCHANT_ID,
+ "title": title,
+ "description": raw.get("description") or "",
+ "price": price,
+ "currency": "AUD",
+ "url": url,
+ "image_url": raw.get("image_url") or "",
+ "category": category_name,
+ "category_path": category_path,
+ "brand": raw.get("brand") or self._extract_brand(title),
+ "is_active": True,
+ "metadata": {
+ "keyword": keyword,
+ "original_price": original_price,
+ "rating": rating,
+ "review_count": review_count,
+ "is_sponsored": bool(raw.get("is_sponsored", False)),
+ "region": "au",
+ "country_code": "AU",
+ "source_type": "scraperapi_premium_residential",
+ "scraped_at": int(time.time()),
+ "amazon_asin": asin,
+ },
+ }
+ except Exception:
+ return None
+
+ async def fetch_search_page(
+ self, keyword: str, page: int = 1
+ ) -> tuple[list[dict[str, Any]], bool]:
+ query_params = f"k={urllib.parse.quote(keyword, safe='')}&page={page}"
+ url = f"{BASE_URL}/s?{query_params}"
+ raw = await self._fetch_with_scraperapi(url, autoparse=True)
+ if not raw:
+ return [], False
+ products, next_page = self._parse_autoparse_results(raw)
+ has_next_page = bool(next_page)
+ return products, has_next_page
+
+ async def _ingest_batch(
+ self, products: list[dict[str, Any]]
+ ) -> tuple[int, int, int]:
+ if not products:
+ return 0, 0, 0
+ if self.scrape_only:
+ self._write_products(products)
+ return len(products), 0, 0
+
+ url = f"{self.api_base}/v1/ingest/products"
+ headers = {"Authorization": f"Bearer {self.api_key}"}
+ payload = {"source": SOURCE, "products": products}
+
+ try:
+ resp = await self._client.post(url, json=payload, headers=headers)
+ resp.raise_for_status()
+ result = resp.json()
+ return (
+ result.get("rows_inserted", 0),
+ result.get("rows_updated", 0),
+ result.get("rows_failed", 0),
+ )
+ except Exception as e:
+ print(f" Ingestion error: {e}")
+ return 0, 0, len(products)
+
+ async def scrape_keyword(
+ self, category: dict[str, Any], keyword: str
+ ) -> dict[str, int]:
+ category_name = category["name"]
+ print(f"\n[{category_name}] keyword='{keyword}'")
+ counts: dict[str, int] = {
+ "scraped": 0,
+ "ingested": 0,
+ "updated": 0,
+ "failed": 0,
+ }
+ batch: list[dict[str, Any]] = []
+
+ for page in range(1, self.max_pages_per_keyword + 1):
+ if self.target > 0 and self.total_scraped >= self.target:
+ break
+
+ async with self._semaphore:
+ parsed_products, has_next_page = await self.fetch_search_page(
+ keyword, page
+ )
+
+ if not parsed_products:
+ print(f" Page {page}: no products")
+ if not has_next_page:
+ break
+ await asyncio.sleep(self.delay)
+ continue
+
+ for product in parsed_products:
+ asin = product.get("sku", "")
+ if asin in self._seen_asins:
+ continue
+ self._seen_asins.add(asin)
+ batch.append(product)
+ counts["scraped"] += 1
+
+ if len(batch) >= self.batch_size:
+ i, u, f = await self._ingest_batch(batch)
+ counts["ingested"] += i
+ counts["updated"] += u
+ counts["failed"] += f
+ self.total_ingested += i
+ self.total_updated += u
+ self.total_failed += f
+ self.total_scraped += i
+ batch = []
+ await asyncio.sleep(self.delay)
+
+ print(
+ f" Page {page}: parsed={len(parsed_products)} new={counts['scraped']} "
+ f"total={self.total_scraped}"
+ )
+
+ if not has_next_page:
+ break
+
+ await asyncio.sleep(self.delay)
+
+ if batch:
+ i, u, f = await self._ingest_batch(batch)
+ counts["ingested"] += i
+ counts["updated"] += u
+ counts["failed"] += f
+ self.total_ingested += i
+ self.total_updated += u
+ self.total_failed += f
+ self.total_scraped += i
+
+ return counts
+
+ async def run(self) -> dict[str, Any]:
+ print("Amazon AU Scraper starting...")
+ print(f"Target: {self.target} products")
+ print(f"Mode: {'scrape only' if self.scrape_only else f'API: {self.api_base}'}")
+ print(f"Output file: {self._output_file}")
+ print(f"Categories: {len(CATEGORIES)}")
+ print(f"Concurrency: {self.concurrency}")
+
+ start = time.time()
+
+ tasks = [
+ self.scrape_keyword(cat, kw)
+ for cat in CATEGORIES
+ for kw in cat["keywords"]
+ ]
+ results = await asyncio.gather(*tasks)
+
+ elapsed = time.time() - start
+
+ total_scraped = sum(r.get("scraped", 0) for r in results)
+ total_ingested = sum(r.get("ingested", 0) for r in results)
+ total_updated = sum(r.get("updated", 0) for r in results)
+ total_failed = sum(r.get("failed", 0) for r in results)
+
+ summary = {
+ "elapsed_seconds": round(elapsed, 1),
+ "total_scraped": total_scraped,
+ "total_ingested": total_ingested,
+ "total_updated": total_updated,
+ "total_failed": total_failed,
+ "unique_asins": len(self._seen_asins),
+ "output_file": self._output_file,
+ "categories_covered": len(CATEGORIES),
+ }
+ print(f"Scraper complete: {summary}")
+ return summary
+
+
async def main() -> None:
    """CLI entry point: parse arguments, run the scraper, close cleanly.

    BUG FIX: ``--scrape-only`` previously had ``default=True`` with
    ``action="store_true"``, so ``scrape_only`` was always True and the
    API ingestion path documented in the module usage was unreachable.
    Now ingestion runs whenever an API key is supplied, and the flag
    still forces scrape-only mode — backward compatible for existing
    invocations, which all produced NDJSON output.
    """
    parser = argparse.ArgumentParser(description="Amazon.com.au Australia product scraper")
    parser.add_argument(
        "--scrape-only",
        action="store_true",
        default=False,
        help="Save to NDJSON without ingesting",
    )
    parser.add_argument(
        "--target",
        type=int,
        default=500000,
        help="Target number of products (0 = unlimited)",
    )
    parser.add_argument(
        "--delay", type=float, default=1.5, help="Delay between requests (seconds)"
    )
    parser.add_argument(
        "--concurrency", type=int, default=8, help="Max concurrent keyword scrapes"
    )
    parser.add_argument(
        "--max-pages-per-keyword",
        type=int,
        default=50,
        help="Max pages per keyword",
    )
    parser.add_argument(
        "--scraperapi-key", default=None, help="ScraperAPI key (or env SCRAPERAPI_KEY)"
    )
    parser.add_argument(
        "--api-key", default=None, help="BuyWhere API key (for ingestion)"
    )
    parser.add_argument(
        "--api-base",
        default="http://localhost:8000",
        help="BuyWhere API base URL",
    )
    parser.add_argument(
        "--batch-size", type=int, default=100, help="Batch size for ingestion"
    )
    args = parser.parse_args()

    # Ingest when a key is available (flag or env); otherwise scrape-only.
    api_key = args.api_key or os.environ.get("BUYWHERE_API_KEY")
    scrape_only = args.scrape_only or not api_key

    scraper = AmazonAUScraper(
        scrape_only=scrape_only,
        target=args.target,
        delay=args.delay,
        concurrency=args.concurrency,
        max_pages_per_keyword=args.max_pages_per_keyword,
        scraperapi_key=args.scraperapi_key,
        batch_size=args.batch_size,
        api_base=args.api_base,
        api_key=api_key,
    )
    try:
        await scraper.run()
    finally:
        await scraper.close()


if __name__ == "__main__":
    asyncio.run(main())
diff --git a/scrapers/shopee_sg.py b/scrapers/shopee_sg.py
index 5f5f05241..83d99c118 100644
--- a/scrapers/shopee_sg.py
+++ b/scrapers/shopee_sg.py
@@ -29,6 +29,7 @@
MERCHANT_ID = "shopee_sg"
SOURCE = "shopee_sg"
+PLATFORM = "shopee"
BASE_URL = "https://www.shopee.sg"
OUTPUT_DIR = "/home/paperclip/buywhere-api/data/shopee-main"
@@ -81,7 +82,7 @@ def __init__(
self.total_ingested = 0
self.total_updated = 0
self.total_failed = 0
- self.products_outfile = None
+ self.products_outfile: str | None = None
self._ensure_output_dir()
def _ensure_output_dir(self):
@@ -184,12 +185,13 @@ def transform_product(self, raw: dict, category: dict) -> dict[str, Any] | None:
return {
"sku": sku,
- "merchant_id": MERCHANT_ID,
+ "platform": PLATFORM,
"title": name,
"description": "",
"price": price,
"currency": "SGD",
- "url": product_url,
+ "country_code": "SG",
+ "product_url": product_url,
"image_url": image_url,
"category": category["name"],
"category_path": [category["name"], category["sub"]],
@@ -224,18 +226,17 @@ async def ingest_batch(self, products: list[dict]) -> tuple[int, int, int]:
self._write_products_to_file(products)
return len(products), 0, 0
- url = f"{self.api_base}/v1/ingest/products"
+ url = f"{self.api_base}/v1/products/ingest"
headers = {"Authorization": f"Bearer {self.api_key}"}
- payload = {"source": SOURCE, "products": products}
try:
- resp = await self.client.post(url, json=payload, headers=headers)
+ resp = await self.client.post(url, json=products, headers=headers)
resp.raise_for_status()
result = resp.json()
return (
- result.get("rows_inserted", 0),
- result.get("rows_updated", 0),
- result.get("rows_failed", 0),
+ result.get("inserted", 0),
+ result.get("updated", 0),
+ result.get("skipped", 0),
)
except Exception as e:
print(f" Ingestion error: {e}")
diff --git a/scrapers/zalora_my.py b/scrapers/zalora_my.py
new file mode 100644
index 000000000..10f157527
--- /dev/null
+++ b/scrapers/zalora_my.py
@@ -0,0 +1,674 @@
+"""
+Zalora Malaysia sitemap crawler.
+
+Adapted from the Zalora SG sitemap crawler (BUY-1521) for Malaysia market.
+- fetch product sitemap shards directly
+- resume from the current daily NDJSON baseline
+- only attempt proxy fetches after direct product-page failures
+- emit shard-level coverage stats for the next scaling pass
+
+Usage:
+ SCRAPERAPI_KEY=... python -m scrapers.zalora_my --use-scraperapi
+ SCRAPERAPI_KEY=... python -m scrapers.zalora_my --scrape-only --max-products 500000
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import re
+import shutil
+import time
+import xml.etree.ElementTree as ET
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from itertools import zip_longest
+from pathlib import Path
+from typing import Any, Optional
+from urllib.parse import quote
+
+import aiohttp
+
# Module logger: INFO-level crawl progress with timestamps.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger("zalora_my")

# Host-specific data locations (mirrors the SG crawler layout).
DATA_DIR = Path("/home/paperclip/buywhere-api/data")
REPORT_DIR = DATA_DIR / "reports"
BASE_URL = "https://www.zalora.com.my"
PRODUCT_BASE = f"{BASE_URL}/product/"
CANONICAL_BASE = f"{BASE_URL}/p/"

# Product sitemap shards to crawl; coverage is reported per shard file.
SITEMAP_URLS = [
    "https://www.zalora.com.my/product-sitemap-1.xml",
    "https://www.zalora.com.my/product-sitemap-2.xml",
    "https://www.zalora.com.my/product-sitemap-3.xml",
    "https://www.zalora.com.my/product-sitemap-4.xml",
    "https://www.zalora.com.my/product-sitemap-5.xml",
]

# XML namespaces used by sitemap <url>/<loc> and image extension elements.
NS = {
    "sm": "http://www.sitemaps.org/schemas/sitemap/0.9",
    "image": "http://www.google.com/schemas/sitemap-image/1.1",
}

# Desktop-browser headers for direct (non-proxy) fetches.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-MY,en;q=0.9",
    "Referer": f"{BASE_URL}/",
}

# (attribute-key signals, category label) pairs checked in order by
# infer_category_from_attributes(); first matching signal wins, and the
# fallback category is "Fashion".
_CATEGORY_SIGNALS: list[tuple[list[str], str]] = [
    (["heel_height", "toe_type", "shoe_width", "sole_material", "upper_material"], "Shoes"),
    (["bag_type", "bag_closure", "strap_length", "lining_material"], "Bags & Wallets"),
    (["neckline", "sleeve_length", "waist_size", "clothing_length", "fit_type"], "Clothing"),
    (["lens_material", "frame_shape", "frame_material"], "Sunglasses & Eyewear"),
    (["band_material", "case_diameter", "movement_type"], "Watches"),
    (["chain_length", "gemstone", "metal_type"], "Jewellery"),
    (["fragrance_concentration", "skin_type", "spf"], "Beauty"),
    (["sport_type", "sport"], "Sports & Outdoors"),
    (["cup_size", "bra_type"], "Lingerie & Swimwear"),
]
+
+
def utc_date_string() -> str:
    """Current UTC date formatted as YYYYMMDD (names the daily output files)."""
    today = datetime.now(timezone.utc)
    return f"{today.year:04d}{today.month:02d}{today.day:02d}"
+
+
def extract_product_id_from_slug(slug: str) -> str:
    """Return the trailing numeric id of a product slug, or the slug itself
    when there is no ``-<digits>`` suffix."""
    head, sep, tail = slug.rpartition("-")
    if sep and tail.isdigit():
        return tail
    return slug
+
+
def extract_product_id_from_url(url: str) -> str:
    """Pull the numeric product id out of a canonical ``/p/`` product URL;
    the URL itself is returned when no id can be found."""
    found = re.search(r"/p/[\w-]+-(\d+)", url)
    if found is None:
        return url
    return found.group(1)
+
+
def slug_to_name_hint(slug: str) -> str:
    """Best-effort human-readable product name derived from a slug:
    drop a trailing numeric id, then title-case the hyphenated words."""
    stem, sep, tail = slug.rpartition("-")
    base = stem if sep and tail.isdigit() else slug
    return base.replace("-", " ").title()
+
+
def extract_brand_from_slug(slug: str) -> str:
    """Title-cased first slug segment, used as a brand guess."""
    first_segment = slug.partition("-")[0]
    return first_segment.title()
+
+
def infer_category_from_attributes(type_a1_set: set[str]) -> str:
    """Map a set of PDP attribute keys onto a coarse category label via
    _CATEGORY_SIGNALS (first matching signal wins); defaults to "Fashion"."""
    for signal_keys, category_label in _CATEGORY_SIGNALS:
        if not type_a1_set.isdisjoint(signal_keys):
            return category_label
    return "Fashion"
+
+
@dataclass(frozen=True)
class SitemapEntry:
    """One product discovered in a sitemap shard.

    Attributes:
        shard: sitemap file name the entry came from (for coverage stats).
        slug: product slug from the /p/ URL path.
        image_url: first sitemap image URL, or "" when absent.
    """

    shard: str
    slug: str
    image_url: str

    @property
    def product_id(self) -> str:
        # Numeric id parsed from the slug's trailing "-<digits>" suffix.
        return extract_product_id_from_slug(self.slug)

    @property
    def url(self) -> str:
        # Canonical /p/ product-page URL reconstructed from the slug.
        return f"{CANONICAL_BASE}{self.slug}"
+
+
def interleave_entries_by_shard(entries: list[SitemapEntry]) -> list[SitemapEntry]:
    """Round-robin entries across shards so no single shard dominates the
    front of the crawl queue. Shard order is first-seen order; relative
    order within a shard is preserved."""
    buckets: dict[str, list[SitemapEntry]] = {}
    for item in entries:
        # dict insertion order doubles as first-seen shard order.
        buckets.setdefault(item.shard, []).append(item)

    result: list[SitemapEntry] = []
    for wave in zip_longest(*buckets.values()):
        result.extend(member for member in wave if member is not None)
    return result
+
+
def summarize_ndjson_file(path: Path) -> dict[str, Any]:
    """Report existence, non-blank line count, and byte size of an NDJSON
    file. Only ``path`` and ``exists`` are present when the file is missing."""
    report: dict[str, Any] = {"path": str(path), "exists": path.exists()}
    if not report["exists"]:
        return report

    with path.open("r", encoding="utf-8", errors="ignore") as fh:
        non_blank = sum(1 for raw_line in fh if raw_line.strip())

    report["line_count"] = non_blank
    report["size_bytes"] = path.stat().st_size
    return report
+
+
+class ZaloraMyScraper:
    def __init__(
        self,
        rate_limit: float = 1.0,
        max_retries: int = 3,
        timeout: int = 30,
        output_dir: Optional[str] = None,
        output_file: Optional[str] = None,
        coverage_report: Optional[str] = None,
        max_products: int = 0,
        max_concurrency: int = 5,
        use_scraperapi: bool = False,
        scraperapi_key: str = "",
        resume_from: Optional[list[str]] = None,
    ):
        """Configure paths, HTTP settings, resume sources, and counters.

        Args:
            rate_limit: politeness delay between fetches, in seconds.
            max_retries: attempts per URL before giving up.
            timeout: per-request timeout in seconds for direct fetches.
            output_dir / output_file: override the NDJSON destination;
                defaults to the daily file under DATA_DIR.
            coverage_report: override the shard-coverage JSON path.
            max_products: stop after this many products (0 = unlimited).
            max_concurrency: concurrent product-page fetches.
            use_scraperapi / scraperapi_key: enable the proxy fallback
                used after direct-fetch failures.
            resume_from: extra NDJSON baselines whose product ids are
                treated as already scraped.
        """
        self.rate_limit = rate_limit
        self.max_retries = max_retries
        self.timeout = timeout
        self.max_products = max_products
        self.max_concurrency = max_concurrency
        self.use_scraperapi = use_scraperapi
        self.scraperapi_key = scraperapi_key or os.environ.get("SCRAPERAPI_KEY", "")

        self.output_dir = Path(output_dir) if output_dir else DATA_DIR
        self.output_dir.mkdir(parents=True, exist_ok=True)
        REPORT_DIR.mkdir(parents=True, exist_ok=True)

        self.output_file = Path(output_file) if output_file else self.output_dir / f"zalora_my_{utc_date_string()}.ndjson"
        self.coverage_report = (
            Path(coverage_report)
            if coverage_report
            else REPORT_DIR / f"zalora_my_coverage_{utc_date_string()}.json"
        )

        # De-duplicate resume sources by resolved path, preserving caller
        # order; today's output file is appended last if it already exists.
        seen_resume_paths: set[Path] = set()
        resume_paths: list[Path] = []
        for raw_path in (resume_from or []):
            path = Path(raw_path)
            resolved = path.resolve(strict=False)
            if resolved in seen_resume_paths:
                continue
            seen_resume_paths.add(resolved)
            resume_paths.append(path)
        if self.output_file.exists():
            resolved_output = self.output_file.resolve(strict=False)
            if resolved_output not in seen_resume_paths:
                resume_paths.append(self.output_file)
        self.resume_paths = resume_paths

        # Lazily-created aiohttp session and crawl coordination primitives.
        self._session: Optional[aiohttp.ClientSession] = None
        self._queue: asyncio.Queue[SitemapEntry | None] = asyncio.Queue()
        self._semaphore = asyncio.Semaphore(max_concurrency)
        self._write_lock = asyncio.Lock()

        # Dedupe state: ids from baselines vs ids written this run.
        self.existing_product_ids: set[str] = set()
        self.written_product_ids: set[str] = set()

        # Aggregate and per-shard crawl statistics for the coverage report.
        self.total_scraped = 0
        self.total_failed = 0
        self.status_counts: Counter[str] = Counter()
        self.shard_stats: dict[str, Counter[str]] = defaultdict(Counter)
        self.shard_categories: dict[str, Counter[str]] = defaultdict(Counter)
        self.shard_failure_samples: dict[str, dict[str, list[str]]] = defaultdict(lambda: defaultdict(list))

    async def _ensure_session(self) -> None:
        # (Re)create the shared aiohttp session when absent or closed.
        if self._session is None or self._session.closed:
            timeout = aiohttp.ClientTimeout(total=self.timeout)
            self._session = aiohttp.ClientSession(timeout=timeout, headers=DEFAULT_HEADERS)

    async def close(self) -> None:
        # Close the aiohttp session (if open) and drop the reference.
        if self._session and not self._session.closed:
            await self._session.close()
        self._session = None
+
+ def load_existing_product_ids(self) -> int:
+ before = len(self.existing_product_ids)
+ for path in self.resume_paths:
+ if not path.exists():
+ continue
+ logger.info("Loading existing Zalora MY baseline from %s", path)
+ with path.open("r", encoding="utf-8", errors="ignore") as handle:
+ for line in handle:
+ line = line.strip()
+ if not line:
+ continue
+ try:
+ record = json.loads(line)
+ except json.JSONDecodeError:
+ continue
+ product_id = (
+ record.get("product_id")
+ or record.get("sku")
+ or record.get("metadata", {}).get("product_id")
+ )
+ if not product_id:
+ continue
+ if product_id.startswith("zalora_my_"):
+ product_id = product_id.removeprefix("zalora_my_")
+ self.existing_product_ids.add(str(product_id))
+ return len(self.existing_product_ids) - before
+
    def seed_output_from_resume_source(self) -> Optional[Path]:
        """Copy the first available resume baseline to today's output file.

        No-op when the output file already exists. Returns the source path
        that was copied, or None when nothing was seeded.
        """
        if self.output_file.exists():
            return None

        for path in self.resume_paths:
            if not path.exists():
                continue
            # Never copy the output file onto itself.
            if path.resolve(strict=False) == self.output_file.resolve(strict=False):
                continue
            self.output_file.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(path, self.output_file)
            logger.info("Seeded canonical Zalora MY output %s from resume source %s", self.output_file, path)
            return path
        return None

    async def _fetch_text(
        self,
        url: str,
        *,
        allow_proxy_fallback: bool,
    ) -> tuple[Optional[str], int, str]:
        """Fetch ``url``, optionally falling back to ScraperAPI.

        Returns (body_or_None, http_status, mode) where mode is "direct"
        or "scraperapi". The proxy fallback is only attempted after a
        direct failure with HTTP 403 (the anti-bot block status) and only
        when ScraperAPI is enabled and a key is configured.
        """
        await self._ensure_session()
        if self._session is None:
            return None, 0, "session_unavailable"

        direct_result = await self._fetch_direct(url)
        if direct_result[0] is not None or not allow_proxy_fallback:
            return direct_result

        _content, status, _mode = direct_result
        if status == 403 and self.use_scraperapi and self.scraperapi_key:
            proxy_result = await self._fetch_via_scraperapi(url)
            if proxy_result[0] is not None:
                return proxy_result
        # Report the original direct failure when the proxy also failed.
        return direct_result

    async def _fetch_direct(self, url: str) -> tuple[Optional[str], int, str]:
        """Fetch ``url`` directly with retries; 429 gets exponential backoff.

        Returns (body_or_None, http_status, "direct"); status 0 means a
        transport-level exception on every attempt.
        """
        assert self._session is not None
        for attempt in range(self.max_retries):
            try:
                async with self._session.get(url) as response:
                    text = await response.text()
                    if response.status == 200:
                        return text, response.status, "direct"
                    if response.status == 429 and attempt < self.max_retries - 1:
                        # Rate limited: back off 5s, 10s, 20s, ...
                        await asyncio.sleep((2 ** attempt) * 5)
                        continue
                    # Any other non-200 is returned immediately (no retry).
                    return None, response.status, "direct"
            except Exception as exc:
                logger.warning("Direct fetch error for %s: %s", url, exc)
                if attempt < self.max_retries - 1:
                    await asyncio.sleep(2 ** attempt)
        return None, 0, "direct"

    async def _fetch_via_scraperapi(self, url: str) -> tuple[Optional[str], int, str]:
        """Fetch ``url`` through ScraperAPI with JS rendering enabled.

        Same retry/backoff shape as ``_fetch_direct`` but with a longer
        timeout (rendering is slow). Returns (body_or_None, status,
        "scraperapi").
        """
        assert self._session is not None
        encoded_url = quote(url, safe="")
        proxy_url = (
            f"http://api.scraperapi.com?api_key={self.scraperapi_key}"
            f"&url={encoded_url}&render=true"
        )
        for attempt in range(self.max_retries):
            try:
                async with self._session.get(
                    proxy_url,
                    timeout=aiohttp.ClientTimeout(total=max(self.timeout, 60)),
                ) as response:
                    text = await response.text()
                    if response.status == 200:
                        return text, response.status, "scraperapi"
                    if response.status == 429 and attempt < self.max_retries - 1:
                        await asyncio.sleep((2 ** attempt) * 5)
                        continue
                    return None, response.status, "scraperapi"
            except Exception as exc:
                logger.warning("ScraperAPI fetch error for %s: %s", url, exc)
                if attempt < self.max_retries - 1:
                    await asyncio.sleep(2 ** attempt)
        return None, 0, "scraperapi"
+
    async def fetch_sitemap_entries(self) -> list[SitemapEntry]:
        """Download and parse every sitemap shard into SitemapEntry objects.

        Entries are deduplicated globally by numeric product id (first
        shard wins); duplicates and fetch/parse failures are tallied in
        ``self.shard_stats``. Sitemap fetches never use the proxy fallback.
        """
        entries: list[SitemapEntry] = []
        seen_ids: set[str] = set()

        for sitemap_url in SITEMAP_URLS:
            # Shard key is the sitemap file name, e.g. "product-sitemap-1.xml".
            shard = Path(sitemap_url).name
            logger.info("Fetching sitemap shard %s", shard)
            content, status, _mode = await self._fetch_text(
                sitemap_url,
                allow_proxy_fallback=False,
            )
            if content is None:
                logger.warning("Failed to fetch sitemap shard %s (status=%s)", shard, status)
                self.shard_stats[shard]["sitemap_fetch_failed"] += 1
                continue

            try:
                root = ET.fromstring(content)
            except ET.ParseError as exc:
                logger.warning("Failed to parse sitemap shard %s: %s", shard, exc)
                self.shard_stats[shard]["sitemap_parse_failed"] += 1
                continue

            for url_el in root.findall("sm:url", NS):
                loc_el = url_el.find("sm:loc", NS)
                image_el = url_el.find("image:image/image:loc", NS)
                if loc_el is None or not loc_el.text:
                    continue
                # Only canonical /p/<slug> product URLs are crawled.
                match = re.search(r"/p/(.+)$", loc_el.text.strip())
                if not match:
                    continue
                slug = match.group(1)
                product_id = extract_product_id_from_slug(slug)
                self.shard_stats[shard]["discovered"] += 1
                if product_id in seen_ids:
                    self.shard_stats[shard]["duplicate_product_id"] += 1
                    continue
                seen_ids.add(product_id)
                entries.append(
                    SitemapEntry(
                        shard=shard,
                        slug=slug,
                        image_url=image_el.text.strip() if image_el is not None and image_el.text else "",
                    )
                )

        return entries
+
+ def _parse_product_page(self, html: str, entry: SitemapEntry) -> Optional[dict[str, Any]]:
+ match = re.search(
+ r'',
+ html,
+ re.DOTALL,
+ )
+ if not match:
+ return None
+
+ try:
+ data = json.loads(match.group(1))
+ except json.JSONDecodeError:
+ return None
+
+ product_raw = (
+ data.get("props", {})
+ .get("pageProps", {})
+ .get("preloadedState", {})
+ .get("pdv", {})
+ .get("product", {})
+ )
+ if not product_raw:
+ return None
+
+ product_id = extract_product_id_from_slug(entry.slug)
+ name = str(product_raw.get("name", "")).strip() or slug_to_name_hint(entry.slug)
+
+ price_str = str(product_raw.get("price", "0") or "0")
+ for simple in product_raw.get("simples", []):
+ simple_price = simple.get("price")
+ if simple_price is None:
+ continue
+ try:
+ if float(simple_price) > 0:
+ price_str = str(simple_price)
+ break
+ except (TypeError, ValueError):
+ continue
+
+ try:
+ price_amount = float(price_str)
+ except ValueError:
+ price_amount = 0.0
+
+ if price_amount <= 0:
+ return None
+
+ attr_keys = {
+ str(key).lower().replace(" ", "_")
+ for key in (product_raw.get("attributes") or {})
+ }
+ category = infer_category_from_attributes(attr_keys)
+
+ seller_names = {
+ simple.get("fulfillment_information", {}).get("seller_name")
+ for simple in product_raw.get("simples", [])
+ if simple.get("fulfillment_information", {}).get("seller_name")
+ }
+ merchant_name = seller_names.pop() if len(seller_names) == 1 else "ZALORA"
+
+ in_stock = any(
+ simple.get("stock_status") == 1
+ for simple in product_raw.get("simples", [])
+ )
+
+ canonical_url = product_raw.get("url") or entry.url
+ brand = str(product_raw.get("brand", "")).strip() or extract_brand_from_slug(entry.slug)
+
+ return {
+ "sku": f"zalora_my_{product_id}",
+ "source": "zalora_my",
+ "title": name,
+ "price": {
+ "amount": price_amount,
+ "currency": "MYR",
+ },
+ "currency": "MYR",
+ "url": canonical_url,
+ "image_url": entry.image_url,
+ "category": category,
+ "category_path": [category],
+ "brand": brand,
+ "is_active": True,
+ "is_available": in_stock,
+ "in_stock": in_stock,
+ "merchant_id": "zalora",
+ "name": merchant_name,
+ "metadata": {
+ "category_slug": category.lower().replace(" & ", "_").replace(" ", "_"),
+ "platform": "zalora_my",
+ "scraped_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+ },
+ }
+
+ async def _write_product(self, product: dict[str, Any]) -> None:
+ async with self._write_lock:
+ with self.output_file.open("a", encoding="utf-8") as handle:
+ handle.write(json.dumps(product, ensure_ascii=False) + "\n")
+
+ def _record_failure_sample(self, entry: SitemapEntry, key: str, url: str, limit: int = 5) -> None:
+ samples = self.shard_failure_samples[entry.shard][key]
+ if url not in samples and len(samples) < limit:
+ samples.append(url)
+
+ def _record_failure(self, entry: SitemapEntry, status: int, mode: str) -> None:
+ shard_counter = self.shard_stats[entry.shard]
+ key = f"{mode}_status_{status}" if status else f"{mode}_request_failed"
+ shard_counter[key] += 1
+ self.status_counts[key] += 1
+ self._record_failure_sample(entry, key, f"{PRODUCT_BASE}{entry.slug}")
+ self.total_failed += 1
+
    async def _scrape_worker(self, worker_id: int) -> None:
        """Queue consumer: fetch, parse, and persist products until sentinel.

        A ``None`` queue item is the shutdown sentinel. Each real entry flows
        through fetch -> parse -> dedupe -> write, updating per-shard counters
        at every stage. The ``finally`` block enforces the inter-request rate
        limit and marks the queue item done even when a stage short-circuits
        with ``continue``.
        """
        while True:
            entry = await self._queue.get()
            if entry is None:
                self._queue.task_done()
                break

            try:
                # Only the fetch is bounded by the semaphore; parsing and
                # writing happen outside it.
                async with self._semaphore:
                    html, status, mode = await self._fetch_text(
                        f"{CANONICAL_BASE}{entry.slug}",
                        allow_proxy_fallback=True,
                    )

                if html is None:
                    self._record_failure(entry, status, mode)
                    continue

                product = self._parse_product_page(html, entry)
                if product is None:
                    self.shard_stats[entry.shard]["parse_failed"] += 1
                    self.status_counts["parse_failed"] += 1
                    self._record_failure_sample(entry, "parse_failed", f"{PRODUCT_BASE}{entry.slug}")
                    self.total_failed += 1
                    continue

                # Re-check duplicates: another worker may have written this
                # product id since the queue was populated.
                product_id = product["sku"].removeprefix("zalora_my_")
                if product_id in self.written_product_ids:
                    self.shard_stats[entry.shard]["duplicate_after_resume"] += 1
                    continue

                await self._write_product(product)
                self.written_product_ids.add(product_id)
                self.total_scraped += 1
                self.shard_stats[entry.shard]["scraped"] += 1
                self.shard_categories[entry.shard][product["category"]] += 1

                # Progress heartbeat every 500 successful products.
                if self.total_scraped and self.total_scraped % 500 == 0:
                    logger.info(
                        "Worker %s: scraped %s priced products so far",
                        worker_id,
                        self.total_scraped,
                    )
            finally:
                # Rate-limit between entries and always release the queue slot.
                await asyncio.sleep(self.rate_limit)
                self._queue.task_done()
+
    async def run(self) -> dict[str, Any]:
        """Full crawl: seed resume state, queue fresh URLs, scrape, report.

        Returns the summary dict that is also written to ``coverage_report``
        (or ``{"error": ...}`` when no sitemap entries could be fetched).
        """
        start_time = time.time()
        await self._ensure_session()

        # Optionally copy a previous run's output so this run appends to it.
        seeded_from = self.seed_output_from_resume_source()
        if seeded_from is not None and self.output_file not in self.resume_paths:
            self.resume_paths.append(self.output_file)

        loaded = self.load_existing_product_ids()
        logger.info("Loaded %s existing Zalora MY product ids from resume baseline", loaded)

        # Interleave shards so workers spread load across sitemap files.
        entries = interleave_entries_by_shard(await self.fetch_sitemap_entries())
        if not entries:
            await self.close()
            return {"error": "No sitemap entries found"}

        queued = 0
        for entry in entries:
            if entry.product_id in self.existing_product_ids:
                self.shard_stats[entry.shard]["skipped_existing"] += 1
                continue
            await self._queue.put(entry)
            self.shard_stats[entry.shard]["queued"] += 1
            queued += 1
            # max_products == 0 means unlimited.
            if self.max_products and queued >= self.max_products:
                break

        logger.info("Queued %s fresh Zalora MY URLs across %s sitemap shards", queued, len(self.shard_stats))

        # Worker count is capped at 10 regardless of configuration.
        worker_count = max(1, min(self.max_concurrency, 10))
        workers = [asyncio.create_task(self._scrape_worker(i + 1)) for i in range(worker_count)]

        await self._queue.join()

        # One sentinel per worker triggers a clean shutdown.
        for _ in workers:
            await self._queue.put(None)
        await asyncio.gather(*workers, return_exceptions=True)

        elapsed = time.time() - start_time
        summary = {
            "elapsed_seconds": round(elapsed, 1),
            "resume_baseline_count": len(self.existing_product_ids),
            "seeded_output_from": str(seeded_from) if seeded_from is not None else None,
            "resume_sources": [summarize_ndjson_file(path) for path in self.resume_paths],
            "total_scraped": self.total_scraped,
            "total_failed": self.total_failed,
            "total_output_count": len(self.existing_product_ids) + len(self.written_product_ids),
            "output_file": str(self.output_file),
            "coverage_report": str(self.coverage_report),
            "status_counts": dict(self.status_counts),
            "shards": {
                shard: {
                    **dict(counter),
                    "categories": dict(self.shard_categories.get(shard, Counter()).most_common(10)),
                    "failure_samples": self.shard_failure_samples.get(shard, {}),
                }
                for shard, counter in sorted(self.shard_stats.items())
            },
        }

        self.coverage_report.write_text(
            json.dumps(summary, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

        logger.info("Zalora MY sitemap scrape complete: %s", json.dumps(summary, ensure_ascii=False))
        await self.close()
        return summary
+
+
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line interface for the Zalora MY sitemap crawler."""
    parser = argparse.ArgumentParser(description="Zalora MY sitemap crawler")
    # Typed scalar options with defaults, registered from a table.
    scalar_flags: tuple[tuple[str, type, object], ...] = (
        ("--rate-limit", float, 1.0),
        ("--max-retries", int, 3),
        ("--timeout", int, 30),
    )
    for flag, flag_type, default in scalar_flags:
        parser.add_argument(flag, type=flag_type, default=default)
    # Output locations (None means "use the built-in defaults").
    parser.add_argument("--output-dir", default=None)
    parser.add_argument("--output-file", default=None)
    parser.add_argument("--coverage-report", default=None)
    # Crawl scope and proxy settings.
    parser.add_argument("--max-products", type=int, default=0)
    parser.add_argument("--max-concurrency", type=int, default=5)
    parser.add_argument("--use-scraperapi", action="store_true")
    parser.add_argument("--scraperapi-key", default="")
    parser.add_argument(
        "--resume-from",
        action="append",
        default=[],
        help="Additional NDJSON files to load for existing-product skipping",
    )
    return parser
+
+
async def main_async(args: argparse.Namespace) -> dict[str, Any]:
    """Construct a scraper from parsed CLI args and run it to completion."""
    # A SCRAPERAPI_KEY in the environment enables the proxy even without the flag.
    proxy_enabled = args.use_scraperapi or bool(os.environ.get("SCRAPERAPI_KEY"))
    scraper = ZaloraMyScraper(
        rate_limit=args.rate_limit,
        max_retries=args.max_retries,
        timeout=args.timeout,
        output_dir=args.output_dir,
        output_file=args.output_file,
        coverage_report=args.coverage_report,
        max_products=args.max_products,
        max_concurrency=args.max_concurrency,
        use_scraperapi=proxy_enabled,
        scraperapi_key=args.scraperapi_key,
        resume_from=args.resume_from,
    )
    return await scraper.run()
+
+
def main() -> int:
    """CLI entry point; returns a process exit code."""
    args = build_parser().parse_args()
    try:
        asyncio.run(main_async(args))
    except KeyboardInterrupt:
        logger.info("Interrupted")
        return 130  # conventional 128 + SIGINT exit code
    return 0
+
+
# Script entry point: exit with main()'s return code when run directly.
if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/src/app/auth.py b/src/app/auth.py
new file mode 100644
index 000000000..f5edfba3e
--- /dev/null
+++ b/src/app/auth.py
@@ -0,0 +1,371 @@
+import hashlib
+import os
+import secrets
+import uuid
+from datetime import datetime, timezone, timedelta
+from typing import Optional
+
+import bcrypt
+import httpx
+from fastapi import Depends, HTTPException, Request, Security, status
+from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
+from jose import JWTError, jwt
+from sqlalchemy import select, update
+from sqlalchemy.ext.asyncio import AsyncSession
+from starlette.middleware.base import BaseHTTPMiddleware
+
+from app.config import get_settings
+from app.database import AsyncSessionLocal, get_db
+from app.models.product import ApiKey
+from app.models.user import User
+
# Base URL of the Paperclip agent-identity API; overridable for staging/tests.
PAPERCLIP_API_URL = os.environ.get("PAPERCLIP_API_URL", "https://api.paperclip.ai")
+
+
async def _verify_paperclip_token_with_api(token: str) -> Optional[dict]:
    """Validate a Paperclip agent token against the Paperclip API.

    Returns the agent profile dict on a 200 response; None on any non-200
    status, timeout, or network/parse error. Best effort by design: a
    Paperclip outage must not raise out of the auth path.
    """
    url = f"{PAPERCLIP_API_URL}/api/agents/me"
    headers = {"Authorization": f"Bearer {token}"}
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.get(url, headers=headers)
            if resp.status_code == 200:
                return resp.json()
    except Exception:
        pass
    return None
+
# Cached application settings and the shared HTTP Bearer extraction scheme.
settings = get_settings()
bearer_scheme = HTTPBearer()
+
+
def hash_key(raw_key: str) -> str:
    """Hex SHA-256 digest: the storable, non-reversible form of an API key."""
    digest = hashlib.sha256(raw_key.encode())
    return digest.hexdigest()
+
+
def hash_password(password: str) -> str:
    """Bcrypt-hash ``password`` with a fresh salt; returns the hash as str."""
    salt = bcrypt.gensalt()
    return bcrypt.hashpw(password.encode(), salt).decode()
+
+
def verify_password(password: str, hashed: str) -> bool:
    """Check ``password`` against a stored bcrypt hash."""
    candidate = password.encode()
    return bcrypt.checkpw(candidate, hashed.encode())
+
+
+def _is_bcrypt_hash(h: str) -> bool:
+ return h.startswith("$2b$") or h.startswith("$2a$")
+
+
def _verify_key_bcrypt(raw_key: str | bytes, hashed: str) -> bool:
    """Check a raw key (str or bytes) against a bcrypt hash string."""
    key_bytes = raw_key if isinstance(raw_key, bytes) else raw_key.encode()
    return bcrypt.checkpw(key_bytes, hashed.encode())
+
+
def generate_api_key() -> tuple[str, str]:
    """Returns (raw_key, key_hash). Store hash, give raw to developer."""
    raw = f"bw_{secrets.token_urlsafe(32)}"
    key_hash = hashlib.sha256(raw.encode()).hexdigest()
    return raw, key_hash
+
+
def create_access_token(data: dict) -> str:
    """Sign ``data`` as a JWT using the configured secret and algorithm."""
    return jwt.encode(
        data,
        settings.jwt_secret_key,
        algorithm=settings.jwt_algorithm,
    )
+
+
def create_user_token(user_id: str, email: str) -> tuple[str, int]:
    """Issue a user-session JWT.

    Returns (token, lifetime_seconds); the token carries sub/email/type/exp
    claims with the configured expiry.
    """
    minutes = settings.jwt_expire_minutes
    claims = {
        "sub": user_id,
        "email": email,
        "type": "user",
        "exp": datetime.now(timezone.utc) + timedelta(minutes=minutes),
    }
    token = jwt.encode(claims, settings.jwt_secret_key, algorithm=settings.jwt_algorithm)
    return token, minutes * 60
+
+
def decode_access_token(token: str) -> Optional[dict]:
    """Decode and validate a JWT; returns its claims, or None when invalid/expired."""
    try:
        claims = jwt.decode(
            token,
            settings.jwt_secret_key,
            algorithms=[settings.jwt_algorithm],
        )
    except JWTError:
        return None
    return claims
+
+
async def get_current_user(
    credentials: HTTPAuthorizationCredentials = Security(bearer_scheme),
    db: AsyncSession = Depends(get_db),
) -> User:
    """FastAPI dependency: resolve the Bearer JWT to an active User row.

    Raises 401 when the token is invalid/expired, is not a user token, lacks
    a subject claim, or does not map to an active user.
    """

    def unauthorized(detail: str) -> HTTPException:
        # All auth failures share the same 401 shape.
        return HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=detail,
            headers={"WWW-Authenticate": "Bearer"},
        )

    payload = decode_access_token(credentials.credentials)
    if payload is None:
        raise unauthorized("Invalid or expired token")
    if payload.get("type") != "user":
        raise unauthorized("Invalid token type")

    user_id = payload.get("sub")
    if not user_id:
        raise unauthorized("Invalid token payload")

    row = await db.execute(
        select(User).where(User.id == user_id, User.is_active == True)
    )
    user = row.scalar_one_or_none()
    if not user:
        raise unauthorized("User not found or inactive")

    return user
+
+
async def get_current_api_key(
    request: Request,
    credentials: HTTPAuthorizationCredentials = Security(bearer_scheme),
    db: AsyncSession = Depends(get_db),
) -> ApiKey:
    """FastAPI dependency: resolve the Bearer token to an active ApiKey.

    Resolution order:
      1. A key already resolved by ApiKeyContextMiddleware (request.state).
      2. A verified Paperclip agent token (auto-provisioned via upsert).
      3. A JWT carrying a ``key_id`` claim.
      4. A raw key matched by its SHA-256 hash.
      5. Legacy bcrypt-hashed keys, checked candidate by candidate.
    Raises 401 when nothing matches; touches ``last_used_at`` on success.

    NOTE(review): the ``update`` statements are executed but never committed
    here — presumably the session dependency commits at request end; confirm,
    otherwise last_used_at is silently rolled back.
    """
    # Fast path: middleware already resolved the key for this request.
    if hasattr(request.state, "api_key") and request.state.api_key is not None:
        api_key = request.state.api_key
        await db.execute(
            update(ApiKey)
            .where(ApiKey.id == api_key.id)
            .values(last_used_at=datetime.now(timezone.utc))
        )
        return api_key

    token = credentials.credentials

    # Paperclip agent tokens are verified remotely and auto-provisioned.
    paperclip_key = await upsert_paperclip_agent_key(token, db)
    if paperclip_key is not None:
        await db.execute(
            update(ApiKey)
            .where(ApiKey.id == paperclip_key.id)
            .values(last_used_at=datetime.now(timezone.utc))
        )
        return paperclip_key

    payload = decode_access_token(token)
    if payload and "key_id" in payload:
        # JWT-wrapped key: look up directly by the embedded key id.
        key_id = payload["key_id"]
        result = await db.execute(
            select(ApiKey).where(ApiKey.id == key_id, ApiKey.is_active == True)
        )
        api_key = result.scalar_one_or_none()
    else:
        # Raw key: match by SHA-256 digest of the presented token.
        key_hash = hash_key(token)
        result = await db.execute(
            select(ApiKey).where(ApiKey.key_hash == key_hash, ApiKey.is_active == True)
        )
        api_key = result.scalar_one_or_none()

    if api_key is None:
        # Legacy fallback: older keys were stored as bcrypt hashes ("$2...").
        result = await db.execute(
            select(ApiKey).where(
                ApiKey.is_active == True,
                ApiKey.key_hash.like("$2%"),
            )
        )
        candidates = result.scalars().all()
        for candidate in candidates:
            token_bytes = token.encode("utf-8")
            # bcrypt only reads the first 72 bytes; longer tokens are
            # pre-hashed with SHA-256 — presumably mirroring how such keys
            # were hashed at creation time; confirm.
            if len(token_bytes) > 72:
                token_to_verify = hashlib.sha256(token_bytes).digest()
            else:
                token_to_verify = token_bytes
            if _verify_key_bcrypt(token_to_verify, candidate.key_hash):
                api_key = candidate
                break

    if not api_key:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid or revoked API key",
            headers={"WWW-Authenticate": "Bearer"},
        )

    await db.execute(
        update(ApiKey)
        .where(ApiKey.id == api_key.id)
        .values(last_used_at=datetime.now(timezone.utc))
    )

    return api_key
+
+
async def provision_api_key(
    developer_id: str,
    name: str,
    tier: str = "basic",
    db: Optional[AsyncSession] = None,
    rate_limit: Optional[int] = None,
    allowed_origins: Optional[list] = None,
    utm_source: Optional[str] = None,
    utm_medium: Optional[str] = None,
    utm_campaign: Optional[str] = None,
    utm_content: Optional[str] = None,
    utm_term: Optional[str] = None,
    signup_channel: Optional[str] = None,
    is_active: bool = True,
) -> tuple[str, ApiKey]:
    """Create a new API key. Returns (raw_key, ApiKey record).

    Only the SHA-256 hash of the key is persisted; the raw key is returned
    exactly once and must be handed to the developer. The row is flushed (so
    defaults/ids populate) but not committed — the caller owns the transaction.

    NOTE(review): ``db`` defaults to None but is effectively required —
    ``db.add`` raises AttributeError when omitted; consider making it a
    required argument.
    NOTE(review): the utm_* parameters are accepted but never stored;
    presumably they should be persisted on the ApiKey row — confirm schema.
    """
    raw_key, key_hash = generate_api_key()
    key_id = str(uuid.uuid4())

    api_key = ApiKey(
        id=key_id,
        key_hash=key_hash,
        developer_id=developer_id,
        name=name,
        tier=tier,
        is_active=is_active,
        rate_limit=rate_limit,
        allowed_origins=allowed_origins,
        signup_channel=signup_channel,
    )
    db.add(api_key)
    await db.flush()

    return raw_key, api_key
+
+
async def resolve_paperclip_agent_key(token: str, db: AsyncSession) -> Optional[ApiKey]:
    """Look up the ApiKey provisioned for a verified Paperclip agent token.

    Returns None when the token fails remote verification or the payload
    lacks an agent id; otherwise returns the matching active key, if any.

    Fix: the original searched for ``key_hash == f"paperclip:{agent_id}"``,
    but ``upsert_paperclip_agent_key`` stores ``sha256(agent_id)`` as the
    hash, so this lookup could never match a key that upsert had created.
    Both functions now derive the hash identically.
    """
    payload = await _verify_paperclip_token_with_api(token)
    if not payload:
        return None

    agent_id = payload.get("id") or payload.get("sub")
    if not agent_id:
        return None

    # Must match the derivation used by upsert_paperclip_agent_key.
    key_hash = hashlib.sha256(agent_id.encode()).hexdigest()

    result = await db.execute(
        select(ApiKey).where(
            ApiKey.key_hash == key_hash,
            ApiKey.is_active == True,
        )
    )
    return result.scalar_one_or_none()
+
+
async def upsert_paperclip_agent_key(token: str, db: AsyncSession) -> Optional[ApiKey]:
    """Verify a Paperclip agent token and fetch-or-create its ApiKey row.

    Returns None when the token does not verify against the Paperclip API or
    the verified payload lacks an agent id. Existing keys are matched by
    (signup_channel == "paperclip_agent", name == agent_id). New rows store
    sha256(agent_id) as ``key_hash`` — an identity marker, not a secret hash —
    and are flushed but not committed; the caller owns the transaction.

    NOTE(review): two concurrent first requests from the same agent can both
    miss the SELECT and insert duplicate rows; a unique constraint or an
    ON CONFLICT upsert would close the race — confirm against the schema.
    """
    payload = await _verify_paperclip_token_with_api(token)
    if not payload:
        return None

    agent_id = payload.get("id") or payload.get("sub")
    if not agent_id:
        return None

    key_hash = hashlib.sha256(agent_id.encode()).hexdigest()

    existing = await db.execute(
        select(ApiKey).where(
            ApiKey.signup_channel == "paperclip_agent",
            ApiKey.name == agent_id,
            ApiKey.is_active == True,
        )
    )
    api_key = existing.scalar_one_or_none()

    if api_key is None:
        # First time this agent is seen: auto-provision an enterprise key.
        api_key = ApiKey(
            id=str(uuid.uuid4()),
            key_hash=key_hash,
            developer_id=payload.get("company_id", ""),
            name=agent_id,
            tier="enterprise",
            is_active=True,
            rate_limit=10000,
            allowed_origins=None,
            signup_channel="paperclip_agent",
        )
        db.add(api_key)
        await db.flush()

    return api_key
+
+
async def resolve_api_key_from_token(token: str, db: AsyncSession) -> Optional[ApiKey]:
    """Resolve a presented token to an active ApiKey, or None.

    Tried in order: JWT with a ``key_id`` claim, verified Paperclip agent
    token (auto-provisioned via upsert), SHA-256 raw-key match, then legacy
    bcrypt hashes. Mirrors the fallback chain in ``get_current_api_key``.

    NOTE(review): when the token is a JWT carrying ``key_id`` that matches no
    active key, this returns early and never tries the later strategies —
    confirm that short-circuit is intended.
    """
    if not token:
        return None

    payload = decode_access_token(token)
    if payload and "key_id" in payload:
        # JWT-wrapped key: direct lookup by the embedded id.
        key_id = payload["key_id"]
        result = await db.execute(
            select(ApiKey).where(ApiKey.id == key_id, ApiKey.is_active == True)
        )
        return result.scalar_one_or_none()

    paperclip_key = await upsert_paperclip_agent_key(token, db)
    if paperclip_key is not None:
        return paperclip_key

    key_hash = hash_key(token)
    result = await db.execute(
        select(ApiKey).where(ApiKey.key_hash == key_hash, ApiKey.is_active == True)
    )
    api_key = result.scalar_one_or_none()

    if api_key is not None:
        return api_key

    # Legacy fallback: older keys were stored as bcrypt hashes ("$2...").
    result = await db.execute(
        select(ApiKey).where(
            ApiKey.is_active == True,
            ApiKey.key_hash.like("$2%"),
        )
    )
    candidates = result.scalars().all()
    for candidate in candidates:
        # bcrypt only reads the first 72 bytes; longer tokens are pre-hashed
        # with SHA-256 — presumably mirroring creation-time hashing; confirm.
        # (Note: this checks len(token) in characters, while
        # get_current_api_key checks encoded byte length — confirm which is
        # intended for non-ASCII tokens.)
        if len(token) > 72:
            token_to_verify = hashlib.sha256(token.encode()).digest()
        else:
            token_to_verify = token.encode()
        if _verify_key_bcrypt(token_to_verify, candidate.key_hash):
            return candidate

    return None
+
+
class ApiKeyContextMiddleware(BaseHTTPMiddleware):
    """Best-effort middleware that resolves an API key before routing.

    Reads the key from ``Authorization: Bearer`` (preferred) or ``X-API-Key``
    and stashes the resolved ApiKey on ``request.state.api_key`` so that
    downstream dependencies can skip a second lookup. Any resolution failure
    is swallowed: this middleware must never block a request — route-level
    auth still runs and returns the real 401.
    """

    async def dispatch(self, request: Request, call_next):
        token: str | None = None

        auth_header = request.headers.get("Authorization", "")
        if auth_header.startswith("Bearer "):
            token = auth_header.removeprefix("Bearer ").strip()
        else:
            x_api_key = request.headers.get("X-API-Key")
            if x_api_key:
                token = x_api_key.strip()

        if token:
            try:
                # expire_on_commit=False keeps the resolved ApiKey's
                # attributes readable after the session commits and closes.
                async with AsyncSessionLocal(expire_on_commit=False) as db:
                    api_key = await resolve_api_key_from_token(token, db)
                    if api_key is None:
                        # resolve_* returns early for JWTs carrying a key_id,
                        # so a Paperclip token may still need an upsert here.
                        api_key = await upsert_paperclip_agent_key(token, db)
                    if api_key is not None:
                        # Fix: commit whenever a key was resolved, not only on
                        # the fallback branch. resolve_api_key_from_token may
                        # itself upsert (flush) a new Paperclip key; without a
                        # commit that row was rolled back when the session
                        # closed and re-created on every request.
                        await db.commit()
                        request.state.api_key = api_key
            except Exception:
                # Best effort: leave request.state untouched on any error.
                pass

        return await call_next(request)
diff --git a/src/app/best-headphones-singapore/page.tsx b/src/app/best-headphones-singapore/page.tsx
new file mode 100644
index 000000000..3561d61c6
--- /dev/null
+++ b/src/app/best-headphones-singapore/page.tsx
@@ -0,0 +1,13 @@
+import type { Metadata } from "next";
+import { SeoLandingPage } from "@/components/seo/SeoLandingPage";
+import { buildSeoLandingMetadata, seoLandingPages } from "@/lib/seo-landing-pages";
+
+const config = seoLandingPages["best-headphones-singapore"];
+
+export async function generateMetadata(): Promise {
+ return buildSeoLandingMetadata(config);
+}
+
+export default function BestHeadphonesSingaporePage() {
+ return ;
+}
diff --git a/src/app/categories/beauty-health/page.tsx b/src/app/categories/beauty-health/page.tsx
index f489722b8..3b76a1e78 100644
--- a/src/app/categories/beauty-health/page.tsx
+++ b/src/app/categories/beauty-health/page.tsx
@@ -3,8 +3,8 @@ import { HeroSearch } from '@/components/HeroSearch';
import { buildSgCategoryMetadata } from '@/lib/seo-category-metadata';
export const metadata = buildSgCategoryMetadata(
- 'Beauty & Health Singapore | Compare Best Skincare, Makeup & Wellness Prices',
- 'Find the best beauty products in Singapore. Compare cheapest prices on skincare, makeup, fragrances, and health supplements from Watsons, Guardian, Sephora, and more.',
+ 'Beauty & Health Price Comparison Singapore | Skincare, Makeup & Wellness Deals 2026',
+ 'Compare cheapest beauty products in Singapore: skincare, makeup, fragrances, health supplements from Watsons, Guardian, Sephora. Find the best beauty deals online.',
'beauty-health'
);
@@ -36,8 +36,8 @@ export default function BeautyHealthCategoryPage() {
},
{
"@type": "CollectionPage",
- name: "Beauty & Health Singapore | Compare Best Skincare, Makeup & Wellness Prices",
- description: "Find the best beauty products in Singapore. Compare cheapest prices on skincare, makeup, fragrances, and health supplements from Watsons, Guardian, Sephora, and more.",
+ name: "Beauty & Health Price Comparison Singapore | Skincare, Makeup & Wellness Deals 2026",
+ description: "Compare cheapest beauty products in Singapore: skincare, makeup, fragrances, health supplements from Watsons, Guardian, Sephora. Find the best beauty deals online.",
url: "https://buywhere.ai/categories/beauty-health",
publisher: {
"@type": "Organization",
@@ -77,7 +77,7 @@ export default function BeautyHealthCategoryPage() {
{/* Header */}
- Beauty & Health Singapore | Compare Best Skincare, Makeup & Wellness Prices
+ Beauty & Health Price Comparison Singapore — Skincare, Makeup & Wellness Deals
Singaporeans take their beauty and health seriously, and the market reflects that with an impressive range of products from global brands and local favourites. BuyWhere aggregates beauty and health product listings from retailers across Singapore, making it easy to compare prices on skincare, cosmetics, fragrances, hair care, and health supplements all in one place.
diff --git a/src/app/categories/electronics/page.tsx b/src/app/categories/electronics/page.tsx
index 32b6857dd..6a09bfe1b 100644
--- a/src/app/categories/electronics/page.tsx
+++ b/src/app/categories/electronics/page.tsx
@@ -3,8 +3,8 @@ import { HeroSearch } from '@/components/HeroSearch';
import { buildSgCategoryMetadata } from '@/lib/seo-category-metadata';
export const metadata = buildSgCategoryMetadata(
- 'Electronics Singapore | Compare Best Prices on Gadgets & Tech',
- 'Find the best electronics in Singapore. Compare cheapest prices on smartphones, laptops, TVs, and more from top retailers. Updated daily with the latest deals.',
+ 'Electronics Price Comparison Singapore | Compare Gadgets & Tech Deals 2026',
+ 'Compare cheapest electronics prices in Singapore: smartphones, laptops, TVs, gaming from Shopee, Lazada, Courts, Harvey Norman. Updated daily. Find the best tech deals.',
'electronics'
);
@@ -36,8 +36,8 @@ export default function ElectronicsCategoryPage() {
},
{
"@type": "CollectionPage",
- name: "Electronics Singapore | Compare Best Prices on Gadgets & Tech",
- description: "Find the best electronics in Singapore. Compare cheapest prices on smartphones, laptops, TVs, and more from top retailers.",
+ name: "Electronics Price Comparison Singapore | Compare Gadgets & Tech Deals 2026",
+ description: "Compare cheapest electronics prices in Singapore: smartphones, laptops, TVs, gaming from Shopee, Lazada, Courts, Harvey Norman. Updated daily.",
url: "https://buywhere.ai/categories/electronics",
publisher: {
"@type": "Organization",
@@ -77,7 +77,7 @@ export default function ElectronicsCategoryPage() {
{/* Header */}
- Electronics Singapore | Compare Best Prices on Gadgets & Tech
+ Best Electronics Price Comparison Singapore — Gadgets & Tech Deals
Looking for the best electronics in Singapore? BuyWhere aggregates product listings from hundreds of retailers so you can compare prices, specs, and availability all in one place. Whether you are hunting for the latest smartphone, upgrading your home entertainment system, or building a gaming PC, we help you find exactly what you need at the lowest prices.
diff --git a/src/app/categories/fashion/page.tsx b/src/app/categories/fashion/page.tsx
index 4d2bf7000..e83a07c50 100644
--- a/src/app/categories/fashion/page.tsx
+++ b/src/app/categories/fashion/page.tsx
@@ -3,8 +3,8 @@ import { HeroSearch } from '@/components/HeroSearch';
import { buildSgCategoryMetadata } from '@/lib/seo-category-metadata';
export const metadata = buildSgCategoryMetadata(
- 'Fashion Singapore | Compare Prices on Clothing, Shoes & Accessories',
- 'Shop fashion online in Singapore. Compare cheapest prices on clothing, shoes, bags, and accessories from Zalora, ASOS, Love Bonito, and more. Free shipping deals available.',
+ 'Fashion Price Comparison Singapore | Clothing, Shoes & Accessories Deals 2026',
+ 'Compare cheapest fashion prices in Singapore: clothing, shoes, bags, accessories from Zalora, Shopee, Lazada. Find the best deals on fashion online.',
'fashion'
);
@@ -36,8 +36,8 @@ export default function FashionCategoryPage() {
},
{
"@type": "CollectionPage",
- name: "Fashion Singapore | Compare Prices on Clothing, Shoes & Accessories",
- description: "Shop fashion online in Singapore. Compare cheapest prices on clothing, shoes, bags, and accessories from Zalora, ASOS, Love Bonito, and more.",
+ name: "Fashion Price Comparison Singapore | Clothing, Shoes & Accessories Deals 2026",
+ description: "Compare cheapest fashion prices in Singapore: clothing, shoes, bags, accessories from Zalora, Shopee, Lazada. Find the best deals on fashion online.",
url: "https://buywhere.ai/categories/fashion",
publisher: {
"@type": "Organization",
@@ -77,7 +77,7 @@ export default function FashionCategoryPage() {
{/* Header */}
- Fashion Singapore | Compare Prices on Clothing, Shoes & Accessories
+ Fashion Price Comparison Singapore — Clothing, Shoes & Accessories Deals
Singapore's fashion scene is vibrant and diverse, from local boutique labels to international fast-fashion giants. BuyWhere brings together product listings from hundreds of fashion retailers so you can discover the latest trends without hopping between dozens of websites. Compare prices, check size availability, and find the best deals on everything from everyday essentials to occasion wear.
diff --git a/src/app/categories/grocery/page.tsx b/src/app/categories/grocery/page.tsx
index 318b98dab..a2f8a134e 100644
--- a/src/app/categories/grocery/page.tsx
+++ b/src/app/categories/grocery/page.tsx
@@ -3,8 +3,8 @@ import { HeroSearch } from '@/components/HeroSearch';
import { buildSgCategoryMetadata } from '@/lib/seo-category-metadata';
export const metadata = buildSgCategoryMetadata(
- 'Grocery Singapore | Compare Prices on Food, Beverages & Daily Essentials',
- 'Compare grocery prices in Singapore. Find cheapest deals on rice, cooking ingredients, snacks, beverages, and daily essentials from NTUC, Sheng Siong, Cold Storage, and more.',
+ 'Grocery Price Comparison Singapore | Food, Beverages & Daily Essentials Deals 2026',
+ 'Compare cheapest grocery prices in Singapore: rice, cooking ingredients, snacks, beverages, daily essentials from NTUC, Sheng Siong, Cold Storage. Find the best grocery deals online.',
'grocery'
);
@@ -36,8 +36,8 @@ export default function GroceryCategoryPage() {
},
{
"@type": "CollectionPage",
- name: "Grocery Singapore | Compare Prices on Food, Beverages & Daily Essentials",
- description: "Compare grocery prices in Singapore. Find cheapest deals on rice, cooking ingredients, snacks, beverages, and daily essentials from NTUC, Sheng Siong, Cold Storage, and more.",
+ name: "Grocery Price Comparison Singapore | Food, Beverages & Daily Essentials Deals 2026",
+ description: "Compare cheapest grocery prices in Singapore: rice, cooking ingredients, snacks, beverages, daily essentials from NTUC, Sheng Siong, Cold Storage. Find the best grocery deals online.",
url: "https://buywhere.ai/categories/grocery",
publisher: {
"@type": "Organization",
diff --git a/src/app/categories/home-living/page.tsx b/src/app/categories/home-living/page.tsx
index 92a1c72b2..63946e029 100644
--- a/src/app/categories/home-living/page.tsx
+++ b/src/app/categories/home-living/page.tsx
@@ -3,8 +3,8 @@ import { HeroSearch } from '@/components/HeroSearch';
import { buildSgCategoryMetadata } from '@/lib/seo-category-metadata';
export const metadata = buildSgCategoryMetadata(
- 'Home & Living Singapore | Compare Best Prices on Furniture & Household Items',
- 'Shop home and living products in Singapore. Compare cheapest prices on furniture, kitchen appliances, bedding, and home decor from IKEA, Courts, and top retailers.',
+ 'Home & Living Price Comparison Singapore | Furniture & Household Deals 2026',
+ 'Compare cheapest home and living prices in Singapore: furniture, kitchen appliances, bedding, and home decor from IKEA, Courts, Harvey Norman. Find the best household deals.',
'home-living'
);
@@ -36,8 +36,8 @@ export default function HomeLivingCategoryPage() {
},
{
"@type": "CollectionPage",
- name: "Home & Living Singapore | Compare Best Prices on Furniture & Household Items",
- description: "Shop home and living products in Singapore. Compare cheapest prices on furniture, kitchen appliances, bedding, and home decor from IKEA, Courts, and top retailers.",
+ name: "Home & Living Price Comparison Singapore | Furniture & Household Deals 2026",
+ description: "Compare cheapest home and living prices in Singapore: furniture, kitchen appliances, bedding, and home decor from IKEA, Courts, Harvey Norman. Find the best household deals.",
url: "https://buywhere.ai/categories/home-living",
publisher: {
"@type": "Organization",
@@ -77,7 +77,7 @@ export default function HomeLivingCategoryPage() {
{/* Header */}
- Home & Living Singapore | Compare Best Prices on Furniture & Household Items
+ Home & Living Price Comparison Singapore — Furniture & Household Deals 2026
Transform your living space without overspending. BuyWhere aggregates home and living product listings from furniture stores, kitchen appliance shops, bedding retailers, and home decor boutiques across Singapore. Whether you are moving into a new flat, renovating your HDB apartment, or simply refreshing your space, we help you find the best products at the best prices.
diff --git a/src/app/developers/page.tsx b/src/app/developers/page.tsx
index f15fb48d9..2c1149938 100644
--- a/src/app/developers/page.tsx
+++ b/src/app/developers/page.tsx
@@ -4,9 +4,9 @@ import Nav from "@/components/Nav";
import Footer from "@/components/Footer";
export const metadata: Metadata = {
- title: "Developer Portal — BuyWhere MCP & API for AI Agents",
+ title: "Developer Portal — BuyWhere MCP Server & Product Catalog API for AI Agents",
description:
- "BuyWhere gives AI agents a product catalog layer for live product discovery, comparison, and merchant handoff starting in Singapore, expanding across Southeast Asia.",
+ "Give your AI agent a live product catalog: search 5M+ products across Singapore retailers, compare prices, and generate purchase links — no scrapers, no maintenance. MCP server + REST API.",
alternates: {
canonical: "https://buywhere.ai/developers",
},
@@ -38,13 +38,13 @@ export default function DevelopersPage() {
Developer Portal
- Launch your shopping agent with one clear setup path.
+ Ship product search in your AI agent — live catalog, no scraping.
- BuyWhere gives AI agents a product catalog layer for live product discovery, comparison, and merchant handoff starting in Singapore, expanding across Southeast Asia.
+ BuyWhere gives AI agents a live product catalog layer: search 5M+ products across Singapore retailers, compare prices, and generate purchase links — all via one API call. MCP server included.
- Start with one API request, then add the published MCP package when you want BuyWhere tools inside Claude Desktop, Cursor, or another MCP client.
+ No scrapers. No maintenance. No HTML parsing. One authenticated request delivers real-time product data from every major SG retailer.
- Give your AI agent a product catalog layer for Singapore commerce.
+ Your AI agent needs a live product catalog. Here it is.
- BuyWhere exposes product search, price comparison, and deal discovery
- as tools your agent can call directly — no scraping, no HTML parsing,
- no rate-limit battles with e-commerce sites.
+ BuyWhere gives your AI agent product search, price comparison, and deal discovery across 5M+ Singapore products — via MCP tools or REST API. No scrapers, no maintenance, no HTML parsing.
+
+
+ Compatible with Claude Desktop, Cursor, Windsurf, and any MCP-compatible agent. One authenticated request replaces 20+ retailer integrations.
str:
+ for header in COUNTRY_HEADER_PATTERNS:
+ country = request.headers.get(header)
+ if country:
+ return country.upper()
+ return "unknown"
+
+
+@app.exception_handler(Exception)
+async def global_exception_handler(request: Request, exc: Exception):
+ logger.exception(f"Unhandled exception: {exc}")
+
+ if is_sentry_enabled():
+ request_id = request.headers.get("X-Request-Id", "unknown")
+ path = request.url.path
+ method = request.method
+ country = _get_country_from_request(request)
+
+ is_p0 = isinstance(exc, (ConnectionError, TimeoutError, OSError)) or "timeout" in str(exc).lower()
+
+ capture_exception_with_context(
+ exc=exc,
+ request_id=request_id,
+ path=path,
+ method=method,
+ country=country,
+ is_p0=is_p0,
+ )
+
+ return error_response("INTERNAL_ERROR", "An internal server error occurred", status_code=500)
+
+
+@app.exception_handler(RateLimitExceeded)
+async def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
+ from app.config import get_settings
+ from app.services.analytics.post_hog import track_upgrade_intent
+
+ settings = get_settings()
+ retry_after = 60
+ try:
+ if hasattr(exc, 'limit') and exc.limit is not None:
+ limit_obj = exc.limit
+ if hasattr(limit_obj, 'GRANULARITY'):
+ granularity = getattr(limit_obj, 'GRANULARITY', None)
+ if granularity and hasattr(granularity, 'seconds'):
+ retry_after = getattr(granularity, 'seconds', 60)
+ retry_after = min(retry_after, 3600)
+ except Exception:
+ pass
+
+ from app.models.product import ApiKey
+ api_key: ApiKey | None = getattr(request.state, "api_key", None)
+ developer_id = "anonymous"
+ api_key_id = "unknown"
+ current_tier = "free"
+
+ if api_key:
+ developer_id = str(getattr(api_key, 'developer_id', 'anonymous'))
+ api_key_id = str(getattr(api_key, 'id', 'unknown'))
+ current_tier = str(getattr(api_key, 'tier', 'free'))
+
+ track_upgrade_intent(
+ developer_id=developer_id,
+ current_tier=current_tier,
+ requested_plan="developer",
+ api_key_id=api_key_id,
+ hit_rate_limit=True,
+ )
+
+ upgrade_cta = {
+ "message": "Rate limit exceeded. Upgrade your plan for higher limits.",
+ "current_tier": current_tier,
+ "available_plans": [
+ {"name": "Pro", "price": 49, "rate_limit": 600, "currency": "SGD"},
+ ],
+ }
+
+ details = {
+ "retry_after": retry_after,
+ "upgrade": upgrade_cta,
+ "upgrade_url": f"{settings.public_url}/v1/billing/upgrade",
+ "upgrade_tiers_url": f"{settings.public_url}/v1/billing/tiers",
+ "cta": "Upgrade to Pro for 600 req/min — S$49/month",
+ }
+ return error_response(
+ "RATE_LIMIT_EXCEEDED",
+ "Rate limit exceeded",
+ details=details,
+ status_code=429
+ )
+
+
+@app.get("/chatgpt-openapi.json", tags=["integrations"], summary="ChatGPT-compatible OpenAPI spec for GPT Builder")
+async def chatgpt_openapi():
+ import json
+ from pathlib import Path
+ spec_path = Path(__file__).resolve().parent.parent / "chatgpt-openapi.json"
+ return JSONResponse(
+ content=json.loads(spec_path.read_text()),
+ headers={"Cache-Control": "public, max-age=3600"},
+ )
+
+
+@app.get("/llms.txt", include_in_schema=False, summary="AI agent discovery file")
+async def llms_txt():
+ from starlette.responses import Response
+ return Response(
+ content=LLMS_TXT_CONTENT,
+ media_type="text/plain",
+ headers={"Cache-Control": "public, max-age=86400"},
+ )
+
+
+@app.get("/ai.txt", include_in_schema=False, summary="AI agent usage guide")
+async def ai_txt():
+ from starlette.responses import Response
+ return Response(
+ content=AI_TXT_CONTENT,
+ media_type="text/plain",
+ headers={"Cache-Control": "public, max-age=86400"},
+ )
+
+
+@app.get("/tools/openai.json", include_in_schema=False, summary="OpenAI function-calling tool schema")
+async def openai_tools_schema():
+ from app.schemas.tools import OPENAI_TOOLS
+ return JSONResponse(
+ content=OPENAI_TOOLS,
+ headers={"Cache-Control": "public, max-age=86400"},
+ )
+
+
+@app.get("/tools/mcp.json", include_in_schema=False, summary="MCP tool schema")
+async def mcp_tools_schema():
+ from app.schemas.tools import MCP_TOOLS
+ return JSONResponse(
+ content={"tools": MCP_TOOLS},
+ headers={"Cache-Control": "public, max-age=86400"},
+ )
+
+
+MCP_REGISTRY_AUTH_CONTENT = "v=MCPv1; k=ed25519; p=h7SEyb+uUyDnAuhTuNfFKVLgvbKI+4eIJQQCfXiccxs="
+
+
+@app.get("/.well-known/mcp-registry-auth", include_in_schema=False, summary="MCP registry auth proof")
+async def mcp_registry_auth():
+ from starlette.responses import Response
+ return Response(
+ content=MCP_REGISTRY_AUTH_CONTENT,
+ media_type="text/plain",
+ headers={"Cache-Control": "public, max-age=86400"},
+ )
+
+
+import json as _json
+from pathlib import Path as _Path
+
+GLAMA_JSON_PATH = _Path(__file__).parent.parent / "glama.json"
+
+
+@app.get("/.well-known/glama.json", include_in_schema=False, summary="Glama MCP registry manifest")
+async def glama_json():
+ return JSONResponse(
+ content=_json.loads(GLAMA_JSON_PATH.read_text()),
+ headers={"Cache-Control": "public, max-age=86400"},
+ )
+
+
+@app.get("/.well-known/mcp/server-card.json", include_in_schema=False, summary="MCP static server card")
+async def mcp_server_card():
+ card_path = _Path(__file__).parent.parent / "smithery_server_card.json"
+ return JSONResponse(
+ content=_json.loads(card_path.read_text()),
+ headers={"Cache-Control": "public, max-age=86400"},
+ )
+
+
+@app.get("/v1/health", response_model=ComprehensiveHealthReport, tags=["system"], summary="Comprehensive health check with dependency status")
+async def health_check(request: Request):
+ from app.database import AsyncSessionLocal
+
+ async with AsyncSessionLocal() as db:
+ db_health_data = await get_db_health(db)
+ db_pool_data = await get_db_pool_health()
+ disk_data = await check_disk_space()
+ api_self_test = await check_api_self_test(db)
+ scraper_data = await get_scraper_health(db)
+ redis_data = await check_redis_ping()
+ typesense_data = await check_typesense_health()
+ ingestion_freshness_data = await check_ingestion_freshness()
+ celery_data = await check_celery_queue_depth()
+
+ db_report = DBHealthReport(**db_health_data)
+ disk_report = DiskSpaceHealth(**disk_data)
+ api_report = APIResponseTimeHealth(**api_self_test)
+ scraper_report = ScraperHealthReport(**scraper_data)
+ redis_report = RedisHealth(**redis_data)
+ typesense_report = TypesenseHealth(**typesense_data)
+ ingestion_freshness_report = IngestionFreshnessHealth(**ingestion_freshness_data)
+ celery_report = CeleryQueueHealth(**celery_data)
+
+ if db_pool_data.get("ok"):
+ db_report.pool.size = db_pool_data.get("size", 0)
+ db_report.pool.checkedin = db_pool_data.get("checked_in", 0)
+ db_report.pool.checkedout = db_pool_data.get("checked_out", 0)
+ db_report.pool.overflow = db_pool_data.get("overflow", 0)
+ db_report.pool.invalid = db_pool_data.get("invalid", 0)
+
+ overall_status = "healthy"
+ unhealthy = [
+ not db_report.ok,
+ not disk_report.ok,
+ not api_report.ok,
+ not redis_report.ok,
+ not typesense_report.ok,
+ not ingestion_freshness_report.ok,
+ ]
+ if any(unhealthy):
+ overall_status = "unhealthy"
+ elif scraper_report.healthy_count != scraper_report.total_scrapers:
+ overall_status = "degraded"
+
+ return ComprehensiveHealthReport(
+ generated_at=datetime.now(timezone.utc),
+ overall_status=overall_status,
+ db=db_report,
+ disk=disk_report,
+ api_self_test=api_report,
+ scrapers=scraper_report,
+ redis=redis_report,
+ typesense=typesense_report,
+ ingestion_freshness=ingestion_freshness_report,
+ celery_queue=celery_report,
+ )
+
+
+@app.get("/v1", tags=["system"])
+async def api_root():
+ return {
+ "api": "BuyWhere Catalog API",
+ "version": "v1",
+ "endpoints": {
+ "search": "GET /v1/search",
+ "search_semantic": "GET /v1/search/semantic",
+ "search_filters": "GET /v1/search/filters",
+ "products": "GET /v1/products",
+ "best_price": "GET /v1/products/best-price",
+ "compare_search": "GET /v1/products/compare?q=",
+ "compare_matrix": "POST /v1/products/compare",
+ "compare_diff": "POST /v1/products/compare/diff",
+ "trending": "GET /v1/products/trending",
+ "export": "GET /v1/products/export?format=csv|json",
+ "feed": "GET /v1/products/feed?updatedSince=ISO8601",
+ "feed_new": "GET /v1/feed/new",
+ "feed_deals": "GET /v1/feed/deals",
+ "feed_changes_sse": "GET /v1/feed/changes",
+ "product": "GET /v1/products/{id}",
+ "price_history": "GET /v1/products/{id}/price-history",
+ "price_stats": "GET /v1/products/{id}/price-stats",
+ "price_comparison": "GET /v1/products/{id}/price-comparison",
+ "track_click": "POST /v1/products/{id}/click",
+ "similar": "GET /v1/products/{id}/similar",
+ "categories": "GET /categories",
+ "categories_taxonomy": "GET /categories/taxonomy",
+ "categories_products": "GET /categories/{id}/products",
+ "brands": "GET /v1/brands",
+ "brands_products": "GET /v1/brands/{brand_name}/products",
+ "countries": "GET /v1/countries",
+ "sources": "GET /v1/sources",
+ "deals": "GET /v1/deals",
+ "deals_price_drops": "GET /v1/deals/price-drops",
+ "graphql": "POST /api/graphql",
+ "graphql_playground": "GET /api/graphql",
+ "ingestion": "POST /v1/ingestion",
+ "ingest": "POST /v1/ingest/products",
+ "import_csv": "POST /v1/import/csv",
+ "status": "GET /v1/status",
+ "metrics": "GET /v1/metrics",
+ "metrics_quality": "GET /v1/metrics/quality",
+ "catalog_health": "GET /v1/catalog/health",
+ "click_analytics": "GET /v1/analytics/clicks",
+ "usage_analytics": "GET /v1/analytics/usage",
+ "admin_stats": "GET /v1/admin/stats",
+ "auth_register": "POST /v1/auth/register",
+ "developer_me": "GET /v1/developers/me",
+ "keys_create": "POST /v1/keys",
+ "keys_list": "GET /v1/keys",
+ "keys_revoke": "DELETE /v1/keys/{id}",
+ "keys_rotate": "POST /v1/keys/{id}/rotate",
+ "webhooks_create": "POST /v1/webhooks",
+ "webhooks_list": "GET /v1/webhooks",
+ "webhooks_delete": "DELETE /v1/webhooks/{id}",
+ "webhooks_test": "POST /v1/webhooks/test",
+ "alerts_create": "POST /v1/alerts",
+ "alerts_list": "GET /v1/alerts",
+ "alerts_delete": "DELETE /v1/alerts/{id}",
+ "image_register": "POST /v1/images?url=...",
+ "image_proxy": "GET /v1/images/{hash}?w=&h=&format=",
+ "image_info": "GET /v1/images/{hash}/info",
+ "changelog": "GET /v1/changelog",
+ "billing_subscribe": "POST /v1/billing/subscribe",
+ "billing_status": "GET /v1/billing/status",
+ "billing_tiers": "GET /v1/billing/tiers",
+ "usage": "GET /v1/usage",
+ "sitemap": "GET /sitemap.xml",
+ "robots": "GET /robots.txt",
+ },
+ "auth": "Bearer token required (API key)",
+ "docs": "/api/docs",
+ "versioning": "URI-based (/v1/*). Accept-Version header optional.",
+ }
+
+
+@app.get("/dashboard", tags=["system"])
+async def dashboard():
+ from starlette.responses import FileResponse
+ return FileResponse("templates/dashboard.html")
+
+@app.get("/playground", tags=["system"])
+async def playground():
+ from starlette.responses import FileResponse
+ return FileResponse("templates/playground.html")
+
+
+@app.get("/docs", include_in_schema=False)
+async def custom_swagger_ui():
+ from starlette.responses import FileResponse
+ return FileResponse("templates/swagger.html")
+
+
+@app.get("/api/docs", include_in_schema=False)
+async def api_swagger_ui():
+ from starlette.responses import RedirectResponse
+ return RedirectResponse(url="/docs")
+
+
+@app.get("/quickstart", include_in_schema=False)
+async def quickstart_redirect():
+ """Redirect /quickstart to /docs/guides/mcp for clean public URL."""
+ from starlette.responses import RedirectResponse
+ return RedirectResponse(url="/docs/guides/mcp", status_code=301)
+
+
+@app.get("/docs/guides/mcp", include_in_schema=False)
+async def mcp_integration_guide():
+ """MCP integration guide — canonical URL referenced in public materials (BUY-579)."""
+ from starlette.responses import HTMLResponse
+ api_base = getattr(settings, "app_base_url", "https://api.buywhere.ai")
+ json_ld = f"""
+
+"""
+ html = f"""
+
+
+
+
+BuyWhere MCP Integration Guide
+{json_ld}
+
+
+
+BuyWhere MCP Integration
+BuyWhere exposes its product catalog as an MCP (Model Context Protocol) server. AI agents can search, compare, and retrieve product data without writing HTTP glue code.
+Transport: HTTP (POST {api_base}/mcp) for remote agents. STDIO/local process available via the published @buywhere/mcp-server npm package.
+
+Install
+Use one of two supported setup paths:
+
+- Hosted MCP: point your MCP client directly at
{api_base}/mcp
+- Local MCP package: run
npx -y @buywhere/mcp-server
+
+
+Configure Claude Desktop
+Add to ~/Library/Application Support/Claude/claude_desktop_config.json (macOS) or %APPDATA%\Claude\claude_desktop_config.json (Windows) for local STDIO mode:
+{{
+ "mcpServers": {{
+ "buywhere": {{
+ "command": "npx",
+ "args": ["-y", "@buywhere/mcp-server"],
+ "env": {{ "BUYWHERE_API_KEY": "bw_live_xxx" }}
+ }}
+ }}
+}}
+Or for hosted HTTP transport:
+{{
+ "mcpServers": {{
+ "buywhere": {{
+ "url": "{api_base}/mcp",
+ "headers": {{ "Authorization": "Bearer bw_live_xxx" }}
+ }}
+ }}
+}}
+Restart Claude Desktop. The BuyWhere tools appear automatically.
+
+Configure Cursor
+In .cursor/mcp.json in your project root (or ~/.cursor/mcp.json globally) for local STDIO mode:
+{{
+ "mcpServers": {{
+ "buywhere": {{
+ "command": "npx",
+ "args": ["-y", "@buywhere/mcp-server"],
+ "env": {{ "BUYWHERE_API_KEY": "bw_live_xxx" }}
+ }}
+ }}
+}}
+Hosted HTTP transport remains valid for cloud or remote setups:
+{{
+ "mcpServers": {{
+ "buywhere": {{
+ "url": "{api_base}/mcp",
+ "headers": {{ "Authorization": "Bearer bw_live_xxx" }}
+ }}
+ }}
+}}
+
+Remote HTTP Transport
+For agents running in cloud environments:
+POST {api_base}/mcp
+Authorization: Bearer bw_live_xxx
+Content-Type: application/json
+
+{{
+ "jsonrpc": "2.0",
+ "method": "tools/call",
+ "params": {{
+ "name": "search_products",
+ "arguments": {{ "query": "wireless headphones", "max_price": 150 }}
+ }},
+ "id": 1
+}}
+
+Available Tools
+
+Tool Description
+search_productsSearch catalog by keyword, category, price range, platform, country
+get_productFull product details by ID
+compare_productsSide-by-side comparison of 2–5 products
+get_dealsCurrent deals and price drops
+list_categoriesBrowse available product categories
+
+
+Authentication
+Pass your API key as a Bearer token. Get a free key at {api_base}/v1/auth/register.
+
+Key tier Rate limit Use case
+bw_free_*60 req/min Demo, testing
+bw_live_*600 req/min Production
+bw_partner_*Unlimited Platform data partners
+
+
+Error Handling
+
+MCP error code Meaning
+invalid_paramsMissing or invalid tool arguments
+not_foundProduct / category not found
+rate_limitedRate limit exceeded — exponential backoff (2s → 4s → 8s)
+unauthorizedInvalid or missing API key
+internal_errorBuyWhere API error
+
+
+
+ OpenAPI spec ·
+ Plugin manifest ·
+ api@buywhere.ai
+
+
+"""
+ return HTMLResponse(content=html)
+
+
+@app.get("/status", tags=["system"])
+async def status_page():
+ from starlette.responses import FileResponse
+ return FileResponse("static/status.html")
+
+
+@app.get("/v1/test/error", tags=["system"], summary="Test endpoint to verify Sentry error tracking")
+async def test_error_endpoint():
+ raise ValueError("This is a test error for Sentry verification - BUY-3002")
diff --git a/src/app/pricing/page.tsx b/src/app/pricing/page.tsx
index 88b7fe768..13ac58b4a 100644
--- a/src/app/pricing/page.tsx
+++ b/src/app/pricing/page.tsx
@@ -4,9 +4,9 @@ import Link from "next/link";
import type { Metadata } from "next";
export const metadata: Metadata = {
- title: "Pricing — BuyWhere Product Catalog API",
+ title: "Pricing — BuyWhere Product Catalog API & MCP Server | Singapore",
description:
- "BuyWhere is in developer beta. Get free API access to Singapore's structured product catalog for AI agents.",
+ "BuyWhere pricing: pay per product search query or subscribe for higher limits. AI agent-ready product catalog API with live prices from Shopee, Lazada, and 20+ Singapore retailers. Free tier available.",
};
const betaFeatures = [
@@ -18,6 +18,83 @@ const betaFeatures = [
"Rate-limited API access",
];
+const perQueryPricing = [
+ {
+ endpoint: "/v1/products/search",
+ description: "Natural language & semantic product search",
+ price: "S$0.002",
+ note: "per search call",
+ },
+ {
+ endpoint: "/v1/products/{id}",
+ description: "Structured product lookup by ID",
+ price: "S$0.005",
+ note: "per lookup call",
+ },
+ {
+ endpoint: "/v1/products/{id}/links",
+ description: "Affiliate link generation for product",
+ price: "S$0.05",
+ note: "per affiliate-link call",
+ },
+];
+
+const subscriptionTiers = [
+ {
+ name: "Free",
+ price: "S$0",
+ description: "For experimentation and testing",
+ limit: "100 calls/day",
+ },
+ {
+ name: "Pro",
+ price: "USD $29/mo",
+ description: "For production pilots",
+ limit: "50,000 calls/day",
+ },
+ {
+ name: "Scale",
+ price: "USD $99/mo",
+ description: "For live agent workloads",
+ limit: "200,000 calls/day",
+ },
+];
+
+const volumeTier = {
+ name: "Volume",
+ price: "S$299/mo",
+ description: "High volume — up to 1M calls/month at a flat rate",
+ features: [
+ "100,000 search calls included",
+ "50,000 lookup calls included",
+ "5,000 affiliate-link calls included",
+ "Overage at reduced per-call rates",
+ ],
+};
+
+const modelComparison = [
+ {
+ aspect: "Best for",
+ subscription: "Predictable, committed usage",
+ perQuery: "Variable, experimental volume",
+ },
+ {
+ aspect: "Cost model",
+ subscription: "Fixed monthly fee",
+ perQuery: "Pay per call",
+ },
+ {
+ aspect: "Free tier",
+ subscription: "100 calls/day hard cap",
+ perQuery: "1,000 search + 500 lookup + 100 affiliate/mo",
+ },
+ {
+ aspect: "Affiliate-link model",
+ subscription: "Uses daily quota",
+ perQuery: "S$0.05/call — offset by earned commission",
+ },
+];
+
const faqs = [
{
q: "Is BuyWhere free during the beta?",
@@ -27,13 +104,25 @@ const faqs = [
q: "What exactly counts as an API query?",
a: "A query is any single authenticated request to a BuyWhere API endpoint — whether it's a product search, a price lookup, or a catalog fetch. Batch requests that return multiple results still count as one query. Requests that return errors (4xx/5xx) are not counted.",
},
+ {
+ q: "How does the free tier work?",
+ a: "The free tier has a hard daily cap of 100 calls per day for search and lookup endpoints, plus 100 affiliate-link calls per month. There is no rollover — unused quota does not carry over to the next day. Once you hit the cap, you must wait for the daily reset or upgrade to a paid plan.",
+ },
+ {
+ q: "What's the difference between subscription and per-query pricing?",
+ a: "Subscription (daily rate limit) is best for predictable, committed usage — you pay a fixed monthly fee and get a daily quota of calls. Per-query is best for variable, experimental volume — you pay only for the calls you actually make. Per-query pricing is pay-as-you-go with no monthly commitment. Both models can coexist on your account.",
+ },
{
q: "How does BuyWhere make money?",
a: "BuyWhere's business model is built around referral fees, merchant partnerships, and demand routing economics. When AI agents use our catalog to match buyers with products, we participate in the commerce economics of that transaction. We are not a subscription API business — we succeed when merchants get qualified demand.",
},
+ {
+ q: "How does the affiliate-link pricing work?",
+ a: "Affiliate-link calls cost S$0.05 per call. However, when your AI agent converts a product lookup into a purchase, you earn a referral commission that offsets this cost. For active affiliate agents, the effective net cost of the affiliate-link endpoint approaches zero.",
+ },
{
q: "Will there be paid tiers later?",
- a: "We expect to offer tiered access for high-volume use cases as the platform matures. Pricing details will be shared when they are finalized and aligned with our partners. For now, focus on building — the beta is free.",
+ a: "We offer both subscription (daily rate limit) and per-query (per-call) pricing now. Subscription plans start at USD $29/month for 50,000 calls/day. Per-query pricing starts at S$0.002 per search call. High-volume developers can also opt for the Volume tier at S$299/month for 1M calls/month.",
},
{
q: "I'm a merchant. Is there a cost to list my catalog?",
@@ -49,29 +138,143 @@ function CheckIcon() {
);
}
+function PlusIcon() {
+ return (
+
+ );
+}
+
export default function PricingPage() {
return (
- {/* Header */}
- Developer Beta — Free Access
+ Developer Beta — Open Access
- Build on BuyWhere for free during beta
+ Two ways to pay for BuyWhere API access
- We’re focused on getting the product catalog and API right. During developer beta,
- access is free with rate limits so you can build and test your AI commerce integrations.
+ Subscribe for predictable daily quota — or pay per query for flexible, variable usage.
+ Both models grow with you.
- {/* Beta plan */}
+
+
+
+
+
+
+ Per-Query
+
+ Pay as you grow
+
+ Endpoint pricing
+
+
+
+ Endpoint
+ Price
+
+
+
+ {perQueryPricing.map((p) => (
+
+
+ {p.endpoint}
+ {p.description}
+ {p.note}
+
+
+ {p.price}
+
+
+ ))}
+
+
+
+
+
+
+
+ Volume tier available
+
+ {volumeTier.price} — {volumeTier.description}
+
+
+
+
+
+
+
+
+
+ Subscription
+
+ Predictable daily quota
+
+ Daily rate limit tiers
+
+ {subscriptionTiers.map((tier) => (
+
+
+ {tier.name}
+ {tier.description}
+
+
+ {tier.price}
+ {tier.limit}
+
+
+ ))}
+
+
+ Get your API key →
+
+
+
+
+
+
+
+
+
+ Which model is right for you?
+
+
+
+
+
+ Aspect
+ Per-Query
+ Subscription
+
+
+
+ {modelComparison.map((row, i) => (
+
+ {row.aspect}
+ {row.perQuery}
+ {row.subscription}
+
+ ))}
+
+
+
+
+
+
@@ -95,6 +298,13 @@ export default function PricingPage() {
))}
+
+
+ Free tier hard cap: 100 calls/day — unused quota does not roll over.
+ No credit card required. Upgrade anytime.
+
+
+
- {/* Future model */}
For developers
- We expect to offer tiered API access as the platform matures — with free tiers for
- experimentation and paid tiers for production-scale usage. Details will be shared when
- pricing is finalized.
+ BuyWhere offers two pricing models to fit different developer cost psychology.
+ Per-query pricing is ideal for experimental and variable workloads — pay only for
+ what you use. Subscription plans offer predictable monthly costs for committed usage.
- Beta users will get advance notice and fair transition terms.
+ Both models include access to the full Singapore product catalog and all BuyWhere API endpoints.
@@ -137,7 +346,6 @@ export default function PricingPage() {
- {/* Enterprise */}
@@ -157,7 +365,6 @@ export default function PricingPage() {
- {/* FAQ */}
Frequently asked questions
@@ -172,12 +379,11 @@ export default function PricingPage() {
- {/* Bottom CTA */}
Ready to start building?
- Get your API key and make your first query in under 5 minutes. Free during beta.
+ Get your API key and make your first query in under 5 minutes.
Server:
+ global _api_server
+ if _api_server is None:
+ server = Server("buywhere")
+
+ @server.list_tools()
+ async def list_tools() -> ListToolsResult:
+ return ListToolsResult(
+ tools=[
+ Tool(
+ name="search_products",
+ description=(
+ "Search the BuyWhere product catalog by keyword. "
+ "Returns ranked results from Singapore e-commerce platforms "
+ "(Lazada, Shopee, Qoo10, Carousell)."
+ ),
+ inputSchema={
+ "type": "object",
+ "properties": {
+ "query": {"type": "string", "description": "Product search query."},
+ "category": {"type": "string", "description": "Optional category filter."},
+ "min_price": {"type": "number", "description": "Minimum price in SGD."},
+ "max_price": {"type": "number", "description": "Maximum price in SGD."},
+ "source": {
+ "type": "string",
+ "description": "Platform filter (lazada_sg, shopee_sg, etc.).",
+ },
+ "limit": {
+ "type": "integer",
+ "description": "Max results (default 10, max 50).",
+ "default": 10,
+ "minimum": 1,
+ "maximum": 50,
+ },
+ },
+ "required": ["query"],
+ },
+ ),
+ Tool(
+ name="get_product",
+ description="Retrieve full details for a specific product by its BuyWhere ID.",
+ inputSchema={
+ "type": "object",
+ "properties": {
+ "product_id": {
+ "type": "integer",
+ "description": "The BuyWhere product ID.",
+ },
+ },
+ "required": ["product_id"],
+ },
+ ),
+ Tool(
+ name="find_best_price",
+ description=(
+ "Find the single cheapest listing for a product across all Singapore "
+ "e-commerce platforms. Returns the platform, price, and affiliate URL "
+ "for the lowest available price."
+ ),
+ inputSchema={
+ "type": "object",
+ "properties": {
+ "product_name": {
+ "type": "string",
+ "description": "Product name or search query.",
+ },
+ "category": {
+ "type": "string",
+ "description": "Optional category to narrow the search.",
+ },
+ },
+ "required": ["product_name"],
+ },
+ ),
+ Tool(
+ name="get_deals",
+ description=(
+ "Find products with significant price drops compared to their original "
+ "price. Returns deals sorted by discount percentage with current price, "
+ "original price, and savings."
+ ),
+ inputSchema={
+ "type": "object",
+ "properties": {
+ "category": {
+ "type": "string",
+ "description": "Optional category filter (e.g. 'electronics').",
+ },
+ "min_discount_pct": {
+ "type": "number",
+ "description": "Minimum discount percentage (default 10).",
+ "default": 10,
+ "minimum": 0,
+ "maximum": 100,
+ },
+ "limit": {
+ "type": "integer",
+ "description": "Max results (default 10, max 50).",
+ "default": 10,
+ "minimum": 1,
+ "maximum": 50,
+ },
+ },
+ "required": [],
+ },
+ ),
+ ]
+ )
+
+ @server.call_tool()
+ async def call_tool(name: str, arguments: dict[str, Any]) -> CallToolResult:
+ if name == "search_products":
+ return await _handle_search_products(arguments)
+ if name == "get_product":
+ return await _handle_get_product(arguments)
+ if name == "find_best_price":
+ return await _handle_find_best_price(arguments)
+ if name == "get_deals":
+ return await _handle_get_deals(arguments)
+ return CallToolResult(
+ content=[TextContent(type="text", text=f"Unknown tool: {name}")],
+ isError=True,
+ )
+
+ _api_server = server
+
+ return _api_server
+
+
+async def _handle_search_products(args: dict[str, Any]) -> CallToolResult:
+
+ query = str(args.get("query", "")).strip()
+ if not query:
+ return CallToolResult(
+ content=[TextContent(type="text", text="Error: query is required")],
+ isError=True,
+ )
+
+ params = {"q": query, "limit": min(int(args.get("limit", 10)), 50)}
+ for key in ("category", "min_price", "max_price", "source"):
+ if args.get(key) is not None:
+ params[key] = args[key]
+
+ try:
+ data = await _api_get("/v1/products", params)
+ except Exception as exc:
+ logger.exception("search_products API error for %r", query)
+ return CallToolResult(
+ content=[TextContent(type="text", text=f"Search failed: {exc}")],
+ isError=True,
+ )
+
+ items = data.get("items", []) if isinstance(data, dict) else []
+ if not items:
+ return CallToolResult(
+ content=[TextContent(type="text", text=f"No products found for: {query}")]
+ )
+
+ lines = [f"Found {len(items)} product(s) for **{query}**:\n"]
+ for i, p in enumerate(items, 1):
+ lines.append(_fmt_product_summary(i, p))
+ return CallToolResult(content=[TextContent(type="text", text="\n".join(lines))])
+
+
+async def _handle_get_product(args: dict[str, Any]) -> CallToolResult:
+ product_id = args.get("product_id")
+ if not product_id:
+ return CallToolResult(
+ content=[TextContent(type="text", text="Error: product_id is required")],
+ isError=True,
+ )
+ try:
+ data = await _api_get(f"/v1/products/{product_id}")
+ except Exception as exc:
+ logger.exception("get_product API error for id %r", product_id)
+ return CallToolResult(
+ content=[TextContent(type="text", text=f"Fetch failed: {exc}")],
+ isError=True,
+ )
+ return CallToolResult(content=[TextContent(type="text", text=_fmt_product_detail(data))])
+
+
+async def _handle_find_best_price(args: dict[str, Any]) -> CallToolResult:
+ product_name = str(args.get("product_name", "")).strip()
+ if not product_name:
+ return CallToolResult(
+ content=[TextContent(type="text", text="Error: product_name is required")],
+ isError=True,
+ )
+ params = {"q": product_name}
+ if args.get("category"):
+ params["category"] = args["category"]
+
+ try:
+ p = await _api_get("/v1/products/best-price", params)
+ except Exception as exc:
+ logger.exception("find_best_price API error for %r", product_name)
+ return CallToolResult(
+ content=[TextContent(type="text", text=f"Search failed: {exc}")],
+ isError=True,
+ )
+
+ if not p or not isinstance(p, dict):
+ return CallToolResult(
+ content=[TextContent(type="text", text=f"No products found for: {product_name}")]
+ )
+
+ price_str = _fmt_price(p.get("price"), p.get("currency", "SGD"))
+ affiliate = p.get("affiliate_url") or p.get("buy_url") or ""
+ lines = [
+ f"## Best Price: {p.get('name', 'Unknown')}",
+ f"**Platform:** {p.get('source', 'unknown')}",
+ f"**Price:** {price_str}",
+ f"**Category:** {p.get('category') or 'N/A'}",
+ ]
+ if affiliate:
+ lines.append(f"**Affiliate URL:** {affiliate}")
+ lines.append(f"**Product ID:** {p.get('id', '')}")
+ return CallToolResult(content=[TextContent(type="text", text="\n".join(lines))])
+
+
+async def _handle_get_deals(args: dict[str, Any]) -> CallToolResult:
+ min_discount_pct = float(args.get("min_discount_pct", 10))
+ limit = min(int(args.get("limit", 10)), 50)
+ params = {"min_discount_pct": min_discount_pct, "limit": limit}
+ if args.get("category"):
+ params["category"] = args["category"]
+
+ try:
+ data = await _api_get("/v1/deals", params)
+ except Exception as exc:
+ logger.exception("get_deals API error")
+ return CallToolResult(
+ content=[TextContent(type="text", text=f"Deals fetch failed: {exc}")],
+ isError=True,
+ )
+
+ items = data.get("items", []) if isinstance(data, dict) else []
+ if not items:
+ return CallToolResult(
+ content=[TextContent(type="text", text=f"No deals found with >={min_discount_pct}% discount.")]
+ )
+
+ lines = [f"Found {len(items)} deal(s) with >={min_discount_pct}% discount:\n"]
+ for i, d in enumerate(items, 1):
+ current = _fmt_price(d.get("price"), d.get("currency", "SGD"))
+ original = _fmt_price(d.get("original_price"), d.get("currency", "SGD")) if d.get("original_price") else "N/A"
+ discount = d.get("discount_pct", 0) or 0
+ lines.append(
+ f"{i}. **{d.get('name', 'Unknown')}**\n"
+ f" Current: {current} | Was: {original} | Discount: {discount}%\n"
+ f" Platform: {d.get('source', 'unknown')} | ID: {d.get('id', '')}\n"
+ )
+ return CallToolResult(content=[TextContent(type="text", text="\n".join(lines))])
+
+
+async def _api_get(path: str, params: dict[str, Any] | None = None) -> Any:
+ import httpx
+ from app.config import get_settings
+ settings = get_settings()
+ API_BASE_URL = settings.app_base_url or "http://localhost:8000"
+
+ headers = {"Accept": "application/json"}
+ raw_key = _mcp_api_key_var.get()
+ if raw_key:
+ headers["Authorization"] = f"Bearer {raw_key}"
+ async with httpx.AsyncClient(base_url=API_BASE_URL, headers=headers, timeout=10.0) as client:
+ resp = await client.get(path, params=params or {})
+ resp.raise_for_status()
+ return resp.json()
+
+
+def _fmt_price(price: Any, currency: str = "SGD") -> str:
+ if price is None:
+ return "N/A"
+ try:
+ return f"{currency} {float(price):.2f}"
+ except (TypeError, ValueError):
+ return str(price)
+
+
+def _fmt_product_summary(index: int, p: dict[str, Any]) -> str:
+ name = p.get("name") or p.get("title") or "Unknown"
+ price = _fmt_price(p.get("price"), p.get("currency", "SGD"))
+ source = p.get("source", "unknown")
+ pid = p.get("id", "")
+ url = p.get("affiliate_url") or p.get("buy_url") or ""
+ url_line = f"\n URL: {url}" if url else ""
+ return f"{index}. **{name}**\n Price: {price} | Platform: {source}{url_line}\n ID: {pid}\n"
+
+
+def _fmt_product_detail(p: dict[str, Any]) -> str:
+ if not isinstance(p, dict):
+ return str(p)
+ lines = [f"## {p.get('name') or 'Product'}"]
+ for key, label in [
+ ("id", "ID"),
+ ("source", "Platform"),
+ ("price", "Price"),
+ ("currency", "Currency"),
+ ("category", "Category"),
+ ("affiliate_url", "Affiliate URL"),
+ ("buy_url", "Buy URL"),
+ ("image_url", "Image"),
+ ]:
+ val = p.get(key)
+ if val is not None:
+ lines.append(f"**{label}:** {val}")
+ return "\n".join(lines)
+
+
+class JSONRPCRequest(BaseModel):
+ jsonrpc: str = "2.0"
+ method: str
+ params: dict[str, Any] | None = None
+ id: Any = None
+
+
+class JSONRPCResponse(BaseModel):
+ jsonrpc: str = "2.0"
+ id: Any
+ result: Any | None = None
+ error: dict[str, Any] | None = None
+
+
+@router.get("/health", include_in_schema=False)
+async def mcp_health():
+ return {"status": "ok", "service": "mcp"}
+
+
+@router.head("/health", include_in_schema=False)
+async def mcp_health_head():
+ return Response(status_code=200)
+
+
+@router.post("/v1/tools/list")
+async def list_tools(request: Request, api_key: ApiKey = Depends(get_current_api_key)):
+ server = get_mcp_server()
+ result = await server.list_tools()
+ return JSONRPCResponse(id=None, result=result)
+
+
+@router.post("/v1/tools/call")
+async def call_tool(
+ request: Request,
+ body: JSONRPCRequest,
+ api_key: ApiKey = Depends(get_current_api_key),
+):
+ auth_header = request.headers.get("authorization", "")
+ raw_key = auth_header.removeprefix("Bearer ").strip()
+ _mcp_api_key_var.set(raw_key)
+ try:
+ server = get_mcp_server()
+ result = await server.call_tool(body.method, body.params or {})
+ return JSONRPCResponse(id=body.id, result=result)
+ except Exception as exc:
+ logger.exception("MCP tool call error: %s %s", body.method, exc)
+ return JSONRPCResponse(
+ id=body.id,
+ error={"code": -32603, "message": str(exc)}
+ )
+ finally:
+ _mcp_api_key_var.set(None)
\ No newline at end of file
diff --git a/src/lib/seo-landing-pages.ts b/src/lib/seo-landing-pages.ts
index a3172d762..c623a543e 100644
--- a/src/lib/seo-landing-pages.ts
+++ b/src/lib/seo-landing-pages.ts
@@ -413,13 +413,13 @@ export const seoLandingPages: Record = {
shopperCta: {
title: "Compare laptop prices in Singapore",
body: "See live laptop offers across Singapore retailers in one search flow.",
- href: "/search?q=laptop&country=sg",
+ href: "/search?q=laptop&country=sg&utm_source=blog&utm_medium=seo&utm_campaign=where-to-buy",
label: "Shop laptops",
},
developerCta: {
title: "Build laptop comparison tools for Singapore",
body: "Use BuyWhere to power local price-comparison and product-discovery experiences across SG electronics retailers.",
- href: "/developers",
+ href: "/quickstart?utm_source=blog&utm_medium=seo&utm_campaign=where-to-buy",
label: "View developer docs",
},
fallbackProducts: [
@@ -691,4 +691,91 @@ export const seoLandingPages: Record = {
{ id: "r6", name: "Roborock Q5 Pro+", price: 499, currency: "USD", merchant: "Target", imageUrl: null, href: "/search?q=Roborock+Q5+Pro%2B&country=us", brand: "Roborock", category: "Robot Vacuums" },
],
},
+ "best-headphones-singapore": {
+ slug: "best-headphones-singapore",
+ title: "Best Headphones in Singapore 2026 | Compare Prices Across Top Audio Retailers",
+ description:
+ "Compare the best headphones and earbuds in Singapore with live BuyWhere product results, retailer benchmarks, and quick buying advice across Sony, Bose, Apple, Sennheiser, and Samsung.",
+ heroEyebrow: "Singapore Audio Guide",
+ heroTitle: "Best Headphones in Singapore",
+ heroBody:
+ "Whether you want ANC for the MRT, a warm signature for long listening sessions, or workout-friendly earbuds, this page helps you compare the right headphones across SG retailers without checking each store separately.",
+ canonicalPath: "/best-headphones-singapore",
+ country: "SG",
+ currency: "SGD",
+ locale: "en_SG",
+ searchQuery: "headphones",
+ refreshedLabel: "Updated May 2, 2026",
+ productSectionTitle: "Live headphone offers across Singapore",
+ comparisonSectionTitle: "Popular headphone picks at a glance",
+ comparisonColumns: ["Model", "Price", "Type", "ANC", "Battery", "Best For"],
+ comparisonRows: [
+ { Model: "Sony WH-1000XM5", Price: "S$449", Type: "Over-ear", ANC: "Yes", Battery: "30h", "Best For": "Best overall noise cancellation" },
+ { Model: "Apple AirPods Max", Price: "S$599", Type: "Over-ear", ANC: "Yes", Battery: "20h", "Best For": "Best Apple ecosystem fit" },
+ { Model: "Bose QuietComfort Ultra", Price: "S$499", Type: "Over-ear", ANC: "Yes", Battery: "24h", "Best For": "Best comfort" },
+ { Model: "Sennheiser Momentum 4", Price: "S$379", Type: "Over-ear", ANC: "Yes", Battery: "60h", "Best For": "Best battery life" },
+ { Model: "Samsung Galaxy Buds3 Pro", Price: "S$229", Type: "Earbuds", ANC: "Yes", Battery: "6h", "Best For": "Best value ANC earbuds" },
+ { Model: "Sony WF-1000XM5", Price: "S$349", Type: "Earbuds", ANC: "Yes", Battery: "8h", "Best For": "Best premium earbuds" },
+ ],
+ highlightSectionTitle: "What matters most for SG buyers",
+ highlights: [
+ {
+ title: "ANC on the MRT matters",
+ body: "Active noise cancellation is the top feature for Singapore commuters. The Sony XM5 and Bose QC Ultra lead here, but both earbuds and over-ears now perform well on busy train rides.",
+ },
+ {
+ title: "Battery life affects daily use",
+ body: "Over-ear headphones typically outlast earbuds significantly. The Sennheiser Momentum 4 hits 60 hours, which is useful for long-haul commuters and travellers.",
+ },
+ {
+ title: "Brand ecosystem lock-in",
+ body: "Apple AirPods Max only makes sense within the Apple ecosystem. Android users get more flexibility with Sony, Bose, and Samsung options.",
+ },
+ ],
+ adviceSectionTitle: "How to pick the right headphones",
+ advicePoints: [
+ "For daily commuters on the MRT, prioritize ANC quality over battery capacity — the noise reduction matters more on a packed train than on a flight.",
+ "Workout and gym use cases favour earbuds with IPX water resistance over over-ear designs.",
+ "For music production, podcast editing, or audiophile use, look for an open-back or wired option instead of the ANC-focused picks above.",
+ "Compare Shopee, Lazada, Challenger, and Harvey Norman during 5.5, 9.9, and 11.11 campaigns for the lowest real checkout price.",
+ ],
+ faqSectionTitle: "Headphones Singapore FAQ",
+ faqs: [
+ {
+ question: "What are the best headphones in Singapore right now?",
+ answer:
+ "For most buyers, the Sony WH-1000XM5 remains the best overall pick because it combines top-tier ANC, strong battery life, and broad SG retail availability.",
+ },
+ {
+ question: "Are earbuds better than over-ear headphones?",
+ answer:
+ "Earbuds win on portability and workout suitability. Over-ear headphones win on comfort for long sessions, ANC performance, and battery life.",
+ },
+ {
+ question: "Where should I compare headphone prices in Singapore?",
+ answer:
+ "Use BuyWhere to search across Shopee, Lazada, Challenger, Harvey Norman, and electronics retailers simultaneously for the best current offer.",
+ },
+ ],
+ shopperCta: {
+ title: "Compare headphone prices in Singapore",
+ body: "Check live offers across Sony, Bose, Apple, Sennheiser, and Samsung retailers in one search.",
+      href: "/search?q=headphones&country=sg&utm_source=blog&utm_medium=seo&utm_campaign=where-to-buy",
+ label: "Shop headphones",
+ },
+ developerCta: {
+ title: "Build audio product discovery tools for Singapore",
+ body: "Use BuyWhere API search to track headphone availability, merchant pricing, and product specifications across SG retailers.",
+      href: "/quickstart?utm_source=blog&utm_medium=seo&utm_campaign=where-to-buy",
+ label: "View developer docs",
+ },
+ fallbackProducts: [
+ { id: "h1", name: "Sony WH-1000XM5", price: 449, currency: "SGD", merchant: "Challenger", imageUrl: null, href: "/search?q=Sony+WH-1000XM5&country=sg", brand: "Sony", category: "Headphones" },
+ { id: "h2", name: "Apple AirPods Max", price: 599, currency: "SGD", merchant: "Apple Store", imageUrl: null, href: "/search?q=AirPods+Max&country=sg", brand: "Apple", category: "Headphones" },
+ { id: "h3", name: "Bose QuietComfort Ultra", price: 499, currency: "SGD", merchant: "Harvey Norman", imageUrl: null, href: "/search?q=Bose+QuietComfort+Ultra&country=sg", brand: "Bose", category: "Headphones" },
+ { id: "h4", name: "Sennheiser Momentum 4", price: 379, currency: "SGD", merchant: "Shopee", imageUrl: null, href: "/search?q=Sennheiser+Momentum+4&country=sg", brand: "Sennheiser", category: "Headphones" },
+ { id: "h5", name: "Samsung Galaxy Buds3 Pro", price: 229, currency: "SGD", merchant: "Samsung", imageUrl: null, href: "/search?q=Galaxy+Buds3+Pro&country=sg", brand: "Samsung", category: "Headphones" },
+ { id: "h6", name: "Sony WF-1000XM5", price: 349, currency: "SGD", merchant: "Lazada", imageUrl: null, href: "/search?q=Sony+WF-1000XM5&country=sg", brand: "Sony", category: "Headphones" },
+ ],
+ },
};
diff --git a/src/lib/sitemaps.ts b/src/lib/sitemaps.ts
index 1935730b3..b3f8448d9 100644
--- a/src/lib/sitemaps.ts
+++ b/src/lib/sitemaps.ts
@@ -55,6 +55,9 @@ const STATIC_SITEMAP_ROUTES = [
{ path: "/best-gaming-laptops-us/", priority: 0.9, changeFrequency: "weekly" as const },
{ path: "/iphone-16-price-singapore/", priority: 0.9, changeFrequency: "weekly" as const },
{ path: "/best-robot-vacuums-2026/", priority: 0.9, changeFrequency: "weekly" as const },
+ { path: "/laptop-singapore/", priority: 0.9, changeFrequency: "weekly" as const },
+ { path: "/air-purifier-singapore/", priority: 0.9, changeFrequency: "weekly" as const },
+ { path: "/best-headphones-singapore/", priority: 0.9, changeFrequency: "weekly" as const },
{ path: "/privacy/", priority: 0.3, changeFrequency: "yearly" as const },
{ path: "/terms/", priority: 0.3, changeFrequency: "yearly" as const },
] as const;