From 0fb47da2a5f66dd9036fa6a8c7ca3f437924023b Mon Sep 17 00:00:00 2001 From: "Dwight J. Browne" Date: Sun, 23 Nov 2025 15:27:07 -0800 Subject: [PATCH] added visualization work --- .gitignore | 1 + crates/av-client/examples/news_analysis.rs | 3 +- .../av-client/examples/portfolio_tracker.rs | 4 +- crates/av-database/postgres/src/repository.rs | 7 +- tests/integration/crypto_update_integrations | 2 +- viz/README.md | 136 ++++ viz/crypto_heatmap.html | 525 ++++++++++++++++ viz/derive_article_symbols.sql | 33 + viz/extract_articles_by_source.sql | 75 +++ viz/extract_crypto.sh | 44 ++ viz/extract_crypto_heatmap.sql | 43 ++ viz/extract_data.sh | 58 ++ viz/extract_data_sources.sh | 58 ++ viz/extract_derived_symbols.sql | 122 ++++ viz/extract_news_data.sql | 87 +++ viz/extract_simple.sql | 56 ++ viz/index.html | 589 ++++++++++++++++++ 17 files changed, 1836 insertions(+), 7 deletions(-) create mode 100644 viz/README.md create mode 100644 viz/crypto_heatmap.html create mode 100644 viz/derive_article_symbols.sql create mode 100644 viz/extract_articles_by_source.sql create mode 100755 viz/extract_crypto.sh create mode 100644 viz/extract_crypto_heatmap.sql create mode 100755 viz/extract_data.sh create mode 100755 viz/extract_data_sources.sh create mode 100644 viz/extract_derived_symbols.sql create mode 100644 viz/extract_news_data.sql create mode 100644 viz/extract_simple.sql create mode 100644 viz/index.html diff --git a/.gitignore b/.gitignore index 4662b54..f45b197 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 +viz/*.json db_relations/* # User-specific stuff diff --git a/crates/av-client/examples/news_analysis.rs b/crates/av-client/examples/news_analysis.rs index c8af190..edea09e 100644 --- a/crates/av-client/examples/news_analysis.rs +++ b/crates/av-client/examples/news_analysis.rs @@ -42,6 +42,7 @@ use av_core::{Config, Error}; use av_models::news::NewsSentiment; use std::collections::HashMap; use tokio::time::{Duration, Instant, sleep}; +use tracing::error; /// Sentiment analysis results for a group of securities #[derive(Debug)] @@ -101,7 +102,7 @@ async fn main() -> Result<(), Box> { // Load configuration let config = Config::from_env().map_err(|e| { - eprintln!("Failed to load configuration. Make sure ALPHA_VANTAGE_API_KEY is set."); + error!("Failed to load configuration. Make sure ALPHA_VANTAGE_API_KEY is set."); e })?; diff --git a/crates/av-client/examples/portfolio_tracker.rs b/crates/av-client/examples/portfolio_tracker.rs index 892a27c..75a139e 100644 --- a/crates/av-client/examples/portfolio_tracker.rs +++ b/crates/av-client/examples/portfolio_tracker.rs @@ -44,6 +44,7 @@ use std::io::{self, Write}; use av_models::{fundamentals::TopGainersLosers, time_series::GlobalQuote}; use std::collections::HashMap; use tokio::time::{Duration, sleep}; +use tracing::error; /// Portfolio holding information #[derive(Debug, Clone)] @@ -91,8 +92,7 @@ async fn main() -> Result<(), Box> { // Load configuration from environment let config = Config::from_env().map_err(|e| { - eprintln!("Failed to load configuration. Make sure ALPHA_VANTAGE_API_KEY is set."); - eprintln!("Error: {}", e); + error!("Failed to load configuration. Make sure ALPHA_VANTAGE_API_KEY is set. Error: {}", e); e })?; diff --git a/crates/av-database/postgres/src/repository.rs b/crates/av-database/postgres/src/repository.rs index 12ab907..8bbecd4 100644 --- a/crates/av-database/postgres/src/repository.rs +++ b/crates/av-database/postgres/src/repository.rs @@ -37,6 +37,7 @@ use chrono::{DateTime, Utc}; use diesel::prelude::*; use diesel::r2d2::{ConnectionManager, Pool, PooledConnection}; use diesel::result::Error as DieselError; +use log::error; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::sync::Arc; @@ -1427,7 +1428,7 @@ impl CryptoRepository for CryptoRepositoryImpl { match result { Ok(_) => total_affected += 1, Err(e) => { - eprintln!("Error upserting market {:?}/{:?}: {}", market.exchange, market.base, e) + error!("Error upserting market {:?}/{:?}: {}", market.exchange, market.base, e) } } } @@ -1487,7 +1488,7 @@ impl CryptoRepository for CryptoRepositoryImpl { match result { Ok(_) => saved_count += 1, - Err(e) => eprintln!("Error upserting social data for sid {}: {}", social.sid, e), + Err(e) => error!("Error upserting social data for sid {}: {}", social.sid, e), } } @@ -1549,7 +1550,7 @@ impl CryptoRepository for CryptoRepositoryImpl { match result { Ok(_) => saved_count += 1, - Err(e) => eprintln!("Error upserting technical data for sid {}: {}", technical.sid, e), + Err(e) => error!("Error upserting technical data for sid {}: {}", technical.sid, e), } } diff --git a/tests/integration/crypto_update_integrations b/tests/integration/crypto_update_integrations index c9d0da0..f49da8a 100644 --- a/tests/integration/crypto_update_integrations +++ b/tests/integration/crypto_update_integrations @@ -485,7 +485,7 @@ async fn main() -> Result<()> { info!("Manual updates completed: {} basic, {} social, {} technical", basic_count, social_count, technical_count); } else { - eprintln!("Manual updates contain validation errors, skipping..."); + error!("Manual updates contain validation errors, skipping..."); } // Final statistics diff --git a/viz/README.md b/viz/README.md new file mode 100644 index 0000000..9f55cc8 --- /dev/null +++ b/viz/README.md @@ -0,0 +1,136 @@ +# Data Visualization - POC + +A static, self-contained D3.js visualization showing relationships between news articles and stock symbols. + +**crypto_heatmap.html** is a simple heatmap visualization of the crypto names by market cap. Highlights a problem with symbol mapping. see + +Will provide more complete instructions in a future commit. + +This is my first foray into frontend work and will move this to a separate repo but for now it is actually useful for debugging and ideas + +## Quick Start + +### 1. Extract Data from Database + +```bash +cd viz +./extract_data.sh +``` + +This will: +- Extract ticker symbols from article titles using regex +- Query the PostgreSQL database for the top 50 most-mentioned symbols +- Fetch 200 recent articles containing ticker symbols +- Generate relationships by matching symbols in article titles +- Save everything to `news_data.json` + +### 2. View Visualization + +Open `index.html` in a web browser: + +```bash +# Option 1: Direct open +open index.html # macOS +xdg-open index.html # Linux +start index.html # Windows + +# Option 2: Simple HTTP server (recommended for production) +python3 -m http.server 8000 +# Then visit: http://localhost:8000 +``` + +## Features + +### Visualization Elements + +- **Nodes**: + - 🔵 **Blue circles** = Stock symbols (NVDA, TSLA, AAPL, etc.) + - Size proportional to number of article mentions + - Larger nodes = more frequently mentioned symbols + - 🟣 **Purple circles** = News articles + - Smaller, uniform size + - Click to open article URL + +- **Links** (Edges): + - ⚪ Gray lines connecting symbols to articles that mention them + - Derived from regex pattern matching in article titles + +### Interactive Features + +1. **Hover**: View detailed information: + - Symbols: Show ticker, company name, and article count + - Articles: Show full title and publish date +2. **Click**: Click article nodes to open the source URL in a new tab +3. **Drag**: Drag nodes to manually arrange the graph +4. **Zoom/Pan**: Scroll to zoom, drag background to pan + +### Stats Panel + +Real-time statistics showing: +- Number of symbols displayed (50) +- Number of articles displayed (200) +- Number of connections (68+) + +## Customization + +### Adjust Time Range + +Edit `extract_news_data.sql` line 30: +```sql +WHERE a.ct >= CURRENT_DATE - INTERVAL '30 days' -- Change to '60 days', '7 days', etc. +``` + +### Change Symbol Limit + +Edit `extract_news_data.sql` line 17: +```sql +LIMIT 25 -- Change to 50, 100, etc. +``` + +### Modify Visual Appearance + +Edit `index.html`: + +**Colors** (around line 210): +```javascript +const typeColors = { + 'Equity': '#3b82f6', // Change colors here + 'Cryptocurrency': '#f59e0b', + 'ETF': '#10b981', + // Add more types as needed +}; +``` + +**Node Sizes** (around line 288): +```javascript +size: Math.sqrt(symbol.article_count) * 8 + 10 // Adjust multiplier +``` + +**Force Simulation** (around line 386): +```javascript +.force('charge', d3.forceManyBody().strength(-300)) // Adjust repulsion +.force('link', d3.forceLink(links).distance(100)) // Adjust link distance +``` + + + + + +## Performance Notes + +- **Recommended limits**: + - Symbols: 25-50 (current: 25) + - Articles: 100-500 per symbol + - Total nodes: < 1000 for smooth interaction + +- **For larger datasets**: + - Increase time to stabilize: `simulation.alphaTarget(0.01)` + + + +## Dependencies + +- **D3.js v7**: Loaded from CDN (https://d3js.org/d3.v7.min.js) +- **jq** (optional): For JSON formatting in extraction script + +No build process required - it's pure HTML/JavaScript \ No newline at end of file diff --git a/viz/crypto_heatmap.html b/viz/crypto_heatmap.html new file mode 100644 index 0000000..65d398b --- /dev/null +++ b/viz/crypto_heatmap.html @@ -0,0 +1,525 @@ + + + + + + Cryptocurrency Heatmap + + + + + + +
+ +
+ +
+
+
+ Loading crypto data... +
+ + + + \ No newline at end of file diff --git a/viz/derive_article_symbols.sql b/viz/derive_article_symbols.sql new file mode 100644 index 0000000..16f041f --- /dev/null +++ b/viz/derive_article_symbols.sql @@ -0,0 +1,33 @@ +-- Derive article-symbol relationships from article titles +-- Extracts ticker symbols from patterns like "( NASDAQ:NVDA )" or "( INTU )" + +WITH extracted_tickers AS ( + SELECT + a.hashid as articleid, + a.title, + a.ct, + -- Extract all ticker patterns from the title + regexp_matches(a.title, '\(\s*(?:NASDAQ:|NYSE:|NYSEARCA:|OTC:)?([A-Z]{1,5})\s*\)', 'g') as ticker_match + FROM articles a +), +ticker_list AS ( + SELECT + articleid, + title, + ct, + ticker_match[1] as ticker_symbol + FROM extracted_tickers +) +SELECT + t.ticker_symbol, + COUNT(*) as mention_count, + json_agg(json_build_object( + 'article_id', t.articleid, + 'title', left(t.title, 100), + 'date', t.ct + ) ORDER BY t.ct DESC) as articles +FROM ticker_list t +GROUP BY t.ticker_symbol +HAVING COUNT(*) >= 3 -- Only tickers mentioned in 3+ articles +ORDER BY mention_count DESC +LIMIT 20; \ No newline at end of file diff --git a/viz/extract_articles_by_source.sql b/viz/extract_articles_by_source.sql new file mode 100644 index 0000000..c6162e6 --- /dev/null +++ b/viz/extract_articles_by_source.sql @@ -0,0 +1,75 @@ +-- Extract article-source relationship data for visualization +-- This shows which sources publish which articles (alternative when article_symbols is empty) + +WITH recent_articles AS ( + -- Get articles from last 30 days + SELECT + a.hashid, + a.title, + a.ct, + a.url, + a.sourceid, + src.source_name, + src.domain + FROM articles a + JOIN sources src ON a.sourceid = src.id + WHERE a.ct >= CURRENT_DATE - INTERVAL '30 days' + LIMIT 100 -- Limit for POC +), +source_stats AS ( + -- Get source statistics + SELECT + src.id, + src.source_name, + src.domain, + COUNT(DISTINCT a.hashid) as article_count + FROM sources src + JOIN articles a ON src.id = a.sourceid + WHERE a.ct >= CURRENT_DATE - INTERVAL '30 days' + GROUP BY src.id, src.source_name, src.domain + HAVING COUNT(DISTINCT a.hashid) >= 3 +) +-- Build JSON output +SELECT jsonb_build_object( + 'metadata', jsonb_build_object( + 'generated_at', NOW(), + 'date_range_start', CURRENT_DATE - INTERVAL '30 days', + 'date_range_end', CURRENT_DATE, + 'description', 'Article-source relationships (last 30 days)', + 'note', 'This visualization shows articles grouped by source. To see symbol relationships, load data into article_symbols table.' + ), + 'sources', ( + SELECT jsonb_agg( + jsonb_build_object( + 'id', ss.id, + 'name', ss.source_name, + 'domain', ss.domain, + 'article_count', ss.article_count + ) + ) + FROM source_stats ss + ), + 'articles', ( + SELECT jsonb_agg( + jsonb_build_object( + 'id', ra.hashid, + 'title', ra.title, + 'date', ra.ct, + 'url', ra.url, + 'source_id', ra.sourceid, + 'source_name', ra.source_name + ) + ) + FROM recent_articles ra + ), + 'relationships', ( + SELECT jsonb_agg( + jsonb_build_object( + 'article_id', ra.hashid, + 'source_id', ra.sourceid, + 'date', ra.ct + ) + ) + FROM recent_articles ra + ) +) as data; diff --git a/viz/extract_crypto.sh b/viz/extract_crypto.sh new file mode 100755 index 0000000..9819941 --- /dev/null +++ b/viz/extract_crypto.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Database connection settings +DB_HOST="localhost" +DB_PORT="6433" +DB_NAME="sec_master" +DB_USER="ts_user" +DB_PASS="dev_pw" + +echo "Extracting crypto heatmap data..." +echo "Database: ${DB_NAME}@${DB_HOST}:${DB_PORT}" + +# Run the query and save to JSON file +PGPASSWORD="$DB_PASS" psql \ + -h "$DB_HOST" \ + -p "$DB_PORT" \ + -U "$DB_USER" \ + -d "$DB_NAME" \ + -t \ + -A \ + -f extract_crypto_heatmap.sql \ + -o crypto_data.json + +# Format JSON with jq if available +if command -v jq &> /dev/null; then + echo "Formatting JSON with jq..." + jq '.' crypto_data.json > crypto_data.tmp && mv crypto_data.tmp crypto_data.json +fi + +# Count records +if command -v jq &> /dev/null; then + CRYPTO_COUNT=$(jq '.cryptos | length' crypto_data.json) +else + CRYPTO_COUNT="?" +fi + +FILE_SIZE=$(ls -lh crypto_data.json | awk '{print $5}') + +echo "" +echo "✓ Data extraction complete!" +echo " File: crypto_data.json ($FILE_SIZE)" +echo " Cryptocurrencies: $CRYPTO_COUNT" +echo "" +echo "Open crypto_heatmap.html in a web browser to view the visualization" diff --git a/viz/extract_crypto_heatmap.sql b/viz/extract_crypto_heatmap.sql new file mode 100644 index 0000000..8f42220 --- /dev/null +++ b/viz/extract_crypto_heatmap.sql @@ -0,0 +1,43 @@ +-- Extract crypto data for heatmap visualization +-- Uses symbols table with sec_type = 'Cryptocurrency' + +SELECT jsonb_build_object( + 'metadata', jsonb_build_object( + 'generated_at', NOW(), + 'total_cryptos', (SELECT COUNT(*) FROM symbols WHERE sec_type = 'Cryptocurrency'), + 'description', 'Cryptocurrency heatmap by market cap and 24h price change' + ), + 'cryptos', ( + SELECT jsonb_agg(crypto_data) + FROM ( + SELECT jsonb_build_object( + 'symbol', s.symbol, + 'name', s.name, + 'market_cap', b.market_cap, + 'market_cap_rank', b.market_cap_rank, + 'current_price', b.current_price, + 'volume_24h', b.volume_24h, + 'price_change_24h', m.price_change_24h, + 'price_change_pct_24h', m.price_change_pct_24h, + 'price_change_pct_7d', m.price_change_pct_7d, + 'price_change_pct_30d', m.price_change_pct_30d, + 'circulating_supply', b.circulating_supply, + 'last_updated', b.last_updated, + 'coingecko_id', sm_cg.source_identifier, + 'coinmarketcap_id', sm_cmc.source_identifier + ) as crypto_data + FROM symbols s + LEFT JOIN crypto_overview_basic b ON s.sid = b.sid + LEFT JOIN crypto_overview_metrics m ON s.sid = m.sid + LEFT JOIN symbol_mappings sm_cg ON s.sid = sm_cg.sid AND sm_cg.source_name = 'coingecko' + LEFT JOIN symbol_mappings sm_cmc ON s.sid = sm_cmc.sid AND sm_cmc.source_name = 'coinmarketcap' + WHERE s.sec_type = 'Cryptocurrency' + AND s.priority < 9999999 + AND b.market_cap IS NOT NULL + AND b.current_price IS NOT NULL + AND b.market_cap > 0 + ORDER BY b.market_cap_rank NULLS LAST + LIMIT 100 + ) ranked_cryptos + ) +) as data; diff --git a/viz/extract_data.sh b/viz/extract_data.sh new file mode 100755 index 0000000..618e804 --- /dev/null +++ b/viz/extract_data.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Extract news relationship data from PostgreSQL and save to JSON +# Usage: ./extract_data.sh + +set -e + +# Database connection parameters +DB_HOST="${DB_HOST:-localhost}" +DB_PORT="${DB_PORT:-6433}" +DB_NAME="${DB_NAME:-sec_master}" +DB_USER="${DB_USER:-ts_user}" +DB_PASS="${DB_PASS:-dev_pw}" + +echo "Extracting news relationship data..." +echo "Database: $DB_NAME@$DB_HOST:$DB_PORT" + +# Run the query and save to JSON file (using derived relationships from article titles) +PGPASSWORD="$DB_PASS" psql \ + -h "$DB_HOST" \ + -p "$DB_PORT" \ + -U "$DB_USER" \ + -d "$DB_NAME" \ + -t \ + -A \ + -f extract_derived_symbols.sql \ + -o news_data.json + +# Pretty print the JSON +if command -v jq &> /dev/null; then + echo "Formatting JSON with jq..." + jq '.' news_data.json > news_data_formatted.json + mv news_data_formatted.json news_data.json +fi + +# Get file size +FILE_SIZE=$(du -h news_data.json | cut -f1) + +# Count records +if command -v jq &> /dev/null; then + SYMBOL_COUNT=$(jq '.symbols | length' news_data.json) + ARTICLE_COUNT=$(jq '.articles | length' news_data.json) + RELATIONSHIP_COUNT=$(jq '.relationships | length' news_data.json) + + echo "" + echo "✓ Data extraction complete!" + echo " File: news_data.json (${FILE_SIZE})" + echo " Symbols: $SYMBOL_COUNT" + echo " Articles: $ARTICLE_COUNT" + echo " Relationships: $RELATIONSHIP_COUNT" +else + echo "" + echo "✓ Data extraction complete!" + echo " File: news_data.json (${FILE_SIZE})" +fi + +echo "" +echo "Open index.html in a web browser to view the visualization" diff --git a/viz/extract_data_sources.sh b/viz/extract_data_sources.sh new file mode 100755 index 0000000..d96a7f9 --- /dev/null +++ b/viz/extract_data_sources.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Extract article-source relationship data (alternative when article_symbols is empty) +# Usage: ./extract_data_sources.sh + +set -e + +# Database connection parameters +DB_HOST="${DB_HOST:-localhost}" +DB_PORT="${DB_PORT:-6433}" +DB_NAME="${DB_NAME:-sec_master}" +DB_USER="${DB_USER:-ts_user}" +DB_PASS="${DB_PASS:-dev_pw}" + +echo "Extracting article-source relationship data..." +echo "Database: $DB_NAME@$DB_HOST:$DB_PORT" + +# Run the query and save to JSON file +PGPASSWORD="$DB_PASS" psql \ + -h "$DB_HOST" \ + -p "$DB_PORT" \ + -U "$DB_USER" \ + -d "$DB_NAME" \ + -t \ + -A \ + -f extract_articles_by_source.sql \ + -o news_data.json + +# Pretty print the JSON +if command -v jq &> /dev/null; then + echo "Formatting JSON with jq..." + jq '.' news_data.json > news_data_formatted.json + mv news_data_formatted.json news_data.json +fi + +# Get file size +FILE_SIZE=$(du -h news_data.json | cut -f1) + +# Count records +if command -v jq &> /dev/null; then + SOURCE_COUNT=$(jq '.data.sources | length' news_data.json) + ARTICLE_COUNT=$(jq '.data.articles | length' news_data.json) + RELATIONSHIP_COUNT=$(jq '.data.relationships | length' news_data.json) + + echo "" + echo "✓ Data extraction complete!" + echo " File: news_data.json (${FILE_SIZE})" + echo " Sources: $SOURCE_COUNT" + echo " Articles: $ARTICLE_COUNT" + echo " Relationships: $RELATIONSHIP_COUNT" +else + echo "" + echo "✓ Data extraction complete!" + echo " File: news_data.json (${FILE_SIZE})" +fi + +echo "" +echo "Open index_sources.html in a web browser to view the visualization" \ No newline at end of file diff --git a/viz/extract_derived_symbols.sql b/viz/extract_derived_symbols.sql new file mode 100644 index 0000000..0483453 --- /dev/null +++ b/viz/extract_derived_symbols.sql @@ -0,0 +1,122 @@ +-- Complete news visualization with article-symbol relationships derived from titles +-- This eliminates the need for the article_symbols table + +SELECT jsonb_build_object( + 'metadata', jsonb_build_object( + 'generated_at', NOW(), + 'total_articles', (SELECT COUNT(*) FROM articles), + 'description', 'News article-symbol visualization (derived from article titles)', + 'derivation_method', 'regex extraction from titles' + ), + 'symbols', ( + WITH extracted_tickers AS ( + SELECT DISTINCT + (regexp_matches(title, '\(\s*(?:NASDAQ:|NYSE:|NYSEARCA:|OTC:)?([A-Z]{1,5})\s*\)', 'g'))[1] as ticker + FROM articles + ), + symbol_stats AS ( + SELECT + s.sid, + s.symbol, + s.name, + COUNT(a.hashid) as article_count + FROM symbols s + INNER JOIN extracted_tickers et ON s.symbol = et.ticker + INNER JOIN articles a ON a.title ~ ('\(\s*(?:NASDAQ:|NYSE:|NYSEARCA:|OTC:)?' || s.symbol || '\s*\)') + WHERE s.sec_type = 'Equity' + GROUP BY s.sid, s.symbol, s.name + HAVING COUNT(a.hashid) >= 3 + ORDER BY COUNT(a.hashid) DESC + LIMIT 50 + ) + SELECT jsonb_agg( + jsonb_build_object( + 'id', sid, + 'symbol', symbol, + 'name', name, + 'article_count', article_count + ) + ) + FROM symbol_stats + ), + 'articles', ( + -- Select recent articles that have ticker symbols + WITH recent_articles AS ( + SELECT hashid, title, ct, url, sourceid + FROM articles + WHERE title ~ '\(\s*(?:NASDAQ:|NYSE:|NYSEARCA:|OTC:)?[A-Z]{1,5}\s*\)' + ORDER BY ct DESC + LIMIT 1000 + ) + SELECT jsonb_agg( + jsonb_build_object( + 'id', hashid, + 'title', title, + 'date', ct, + 'url', url, + 'source_id', sourceid + ) + ) + FROM recent_articles + ), + 'relationships', ( + WITH top_symbols AS ( + -- Get the same symbols as in the symbols section above + WITH extracted_tickers AS ( + SELECT DISTINCT + (regexp_matches(title, '\(\s*(?:NASDAQ:|NYSE:|NYSEARCA:|OTC:)?([A-Z]{1,5})\s*\)', 'g'))[1] as ticker + FROM articles + ) + SELECT s.sid + FROM symbols s + INNER JOIN extracted_tickers et ON s.symbol = et.ticker + INNER JOIN articles a ON a.title ~ ('\(\s*(?:NASDAQ:|NYSE:|NYSEARCA:|OTC:)?' || s.symbol || '\s*\)') + WHERE s.sec_type = 'Equity' + GROUP BY s.sid + HAVING COUNT(a.hashid) >= 3 + ORDER BY COUNT(a.hashid) DESC + LIMIT 50 + ), + recent_articles AS ( + -- Same article set as above + SELECT hashid + FROM articles + WHERE title ~ '\(\s*(?:NASDAQ:|NYSE:|NYSEARCA:|OTC:)?[A-Z]{1,5}\s*\)' + ORDER BY ct DESC + LIMIT 1000 + ), + extracted_relations AS ( + SELECT DISTINCT + a.hashid as article_id, + s.sid as symbol_id + FROM articles a + INNER JOIN recent_articles ra ON a.hashid = ra.hashid + CROSS JOIN LATERAL regexp_matches(a.title, '\(\s*(?:NASDAQ:|NYSE:|NYSEARCA:|OTC:)?([A-Z]{1,5})\s*\)', 'g') AS ticker_match + INNER JOIN symbols s ON s.symbol = ticker_match[1] + INNER JOIN top_symbols ts ON s.sid = ts.sid + WHERE s.sec_type = 'Equity' + ) + SELECT jsonb_agg( + jsonb_build_object( + 'article_id', article_id, + 'symbol_id', symbol_id + ) + ) + FROM extracted_relations + ), + 'sources', ( + SELECT jsonb_agg( + jsonb_build_object( + 'id', src.id, + 'name', src.source_name, + 'domain', src.domain + ) + ) + FROM sources src + WHERE EXISTS ( + SELECT 1 FROM articles a + WHERE a.sourceid = src.id + AND a.title ~ '\(\s*(?:NASDAQ:|NYSE:|NYSEARCA:|OTC:)?[A-Z]{1,5}\s*\)' + ) + ) +) as data; \ No newline at end of file diff --git a/viz/extract_news_data.sql b/viz/extract_news_data.sql new file mode 100644 index 0000000..e7a23d9 --- /dev/null +++ b/viz/extract_news_data.sql @@ -0,0 +1,87 @@ +-- Extract news relationship data for visualization +-- This query gets symbol-article relationships with sentiment data +-- Limited to last 30 days and top symbols for POC + +WITH top_symbols AS ( + -- Get top 25 most mentioned symbols + SELECT + s.sid, + s.symbol, + s.name, + s.sec_type, + COUNT(DISTINCT asym.articleid) as article_count + FROM symbols s + JOIN article_symbols asym ON s.sid = asym.sid + JOIN articles a ON asym.articleid = a.hashid + WHERE a.ct >= CURRENT_DATE - INTERVAL '30 days' + GROUP BY s.sid, s.symbol, s.name, s.sec_type + ORDER BY article_count DESC + LIMIT 25 +), +article_data AS ( + -- Get articles mentioning these top symbols + SELECT DISTINCT + a.hashid, + a.title, + a.ct, + a.url, + src.source_name, + src.domain + FROM articles a + JOIN sources src ON a.sourceid = src.id + JOIN article_symbols asym ON a.hashid = asym.articleid + WHERE asym.sid IN (SELECT sid FROM top_symbols) + AND a.ct >= CURRENT_DATE - INTERVAL '30 days' +) +-- Main query: combine symbols, articles, and relationships +SELECT jsonb_build_object( + 'metadata', jsonb_build_object( + 'generated_at', NOW(), + 'date_range_start', CURRENT_DATE - INTERVAL '30 days', + 'date_range_end', CURRENT_DATE, + 'description', 'News relationship data for top 25 symbols (last 30 days)' + ), + 'symbols', ( + SELECT jsonb_agg( + jsonb_build_object( + 'sid', ts.sid, + 'symbol', ts.symbol, + 'name', ts.name, + 'sec_type', ts.sec_type, + 'article_count', ts.article_count + ) + ) + FROM top_symbols ts + ), + 'articles', ( + SELECT jsonb_agg( + jsonb_build_object( + 'id', ad.hashid, + 'title', ad.title, + 'date', ad.ct, + 'url', ad.url, + 'source', ad.source_name, + 'domain', ad.domain + ) + ) + FROM article_data ad + ), + 'relationships', ( + SELECT jsonb_agg( + jsonb_build_object( + 'article_id', asym.articleid, + 'sid', asym.sid, + 'symbol', s.symbol, + 'relevance', COALESCE(ts.relevance, 0.5), + 'sentiment', COALESCE(ts.tsentiment, 0.0), + 'sentiment_label', COALESCE(ts.sentiment_label, 'Neutral') + ) + ) + FROM article_symbols asym + JOIN top_symbols ts_check ON asym.sid = ts_check.sid + JOIN symbols s ON asym.sid = s.sid + LEFT JOIN feeds f ON f.articleid = asym.articleid AND f.sid = asym.sid + LEFT JOIN tickersentiments ts ON ts.feedid = f.id + WHERE asym.articleid IN (SELECT hashid FROM article_data) + ) +) as data; diff --git a/viz/extract_simple.sql b/viz/extract_simple.sql new file mode 100644 index 0000000..182a56d --- /dev/null +++ b/viz/extract_simple.sql @@ -0,0 +1,56 @@ +-- Simplified extraction for testing +SELECT jsonb_build_object( + 'metadata', jsonb_build_object( + 'generated_at', NOW(), + 'total_articles', (SELECT COUNT(*) FROM articles), + 'description', 'News article-source visualization' + ), + 'sources', ( + SELECT jsonb_agg( + jsonb_build_object( + 'id', src.id, + 'name', src.source_name, + 'domain', src.domain, + 'article_count', article_counts.count + ) + ) + FROM sources src + JOIN ( + SELECT sourceid, COUNT(*) as count + FROM articles + GROUP BY sourceid + HAVING COUNT(*) >= 10 + ) article_counts ON src.id = article_counts.sourceid + ), + 'articles', ( + SELECT jsonb_agg( + jsonb_build_object( + 'id', a.hashid, + 'title', a.title, + 'date', a.ct, + 'url', a.url, + 'source_id', a.sourceid + ) + ) + FROM ( + SELECT hashid, title, ct, url, sourceid + FROM articles + ORDER BY ct DESC + LIMIT 100 + ) a + ), + 'relationships', ( + SELECT jsonb_agg( + jsonb_build_object( + 'article_id', a.hashid, + 'source_id', a.sourceid + ) + ) + FROM ( + SELECT hashid, sourceid + FROM articles + ORDER BY ct DESC + LIMIT 100 + ) a + ) +) as data; \ No newline at end of file diff --git a/viz/index.html b/viz/index.html new file mode 100644 index 0000000..a910d1d --- /dev/null +++ b/viz/index.html @@ -0,0 +1,589 @@ + + + + + + + News Relationship Network - POC + + + + + + +
+ +
+

Network Stats

+
+ Symbols: + - +
+
+ Articles: + - +
+
+ Connections: + - +
+
+
+ +
+
+
+ Loading data... +
+ + + + \ No newline at end of file