diff --git a/client-admin/src/components/conversation-admin/index.js b/client-admin/src/components/conversation-admin/index.js index 2754ece72d..7164acda60 100644 --- a/client-admin/src/components/conversation-admin/index.js +++ b/client-admin/src/components/conversation-admin/index.js @@ -11,6 +11,7 @@ import ConversationConfig from './conversation-config' import ConversationStats from './stats' import ModerateComments from './comment-moderation/' +import TopicModeration from './topic-moderation/' // import DataExport from "./data-export"; import ShareAndEmbed from './share-and-embed' @@ -82,6 +83,16 @@ class ConversationAdminContainer extends React.Component { Moderate + + + Topic Mod + + + } + /> . +/** @jsx jsx */ + +import ComponentHelpers from '../../../util/component-helpers' +import NoPermission from '../no-permission' +import React from 'react' +import { connect } from 'react-redux' +import { Heading, Flex, Box, jsx } from 'theme-ui' +import { Switch, Route, Link } from 'react-router-dom' + +import TopicTree from './topic-tree' +import TopicDetail from './topic-detail' +import TopicStats from './topic-stats' +import ProximityVisualization from './proximity-visualization' + +const mapStateToProps = (state, ownProps) => { + return { + topics: state.topic_mod_topics, + stats: state.topic_mod_stats, + zid_metadata: state.zid_metadata + } +} + +const pollFrequency = 60000 + +@connect((state) => state.zid_metadata) +@connect(mapStateToProps) +class TopicModeration extends React.Component { + loadTopics() { + // Dispatch actions to load topics data + // TODO: Implement actions for loading topic moderation data + console.log('Loading topics for conversation:', this.props.conversation_id) + } + + componentDidMount() { + this.loadTopics() + // Temporarily disable polling to debug crash + // this.getTopicsRepeatedly = setInterval(() => { + // this.loadTopics() + // }, pollFrequency) + } + + componentWillUnmount() { + clearInterval(this.getTopicsRepeatedly) + } + + 
render() { + // Check if zid_metadata is still loading + if (!this.props.zid_metadata || this.props.zid_metadata.loading) { + return ( + +
Loading...
+
+ ) + } + + if (ComponentHelpers.shouldShowPermissionsError(this.props)) { + return + } + + const { match, location } = this.props + const url = location.pathname.split('/')[4] + + return ( + + + Topic Moderation + + + + Topics Tree + + + Proximity Map + + + Statistics + + + + + } + /> +
Proximity Visualization Coming Soon
} + /> + } + /> +
Topic Detail Coming Soon
} + /> +
+
+
+ ) + } +} + +export default TopicModeration \ No newline at end of file diff --git a/client-admin/src/components/conversation-admin/topic-moderation/proximity-visualization.js b/client-admin/src/components/conversation-admin/topic-moderation/proximity-visualization.js new file mode 100644 index 0000000000..c058cf07b0 --- /dev/null +++ b/client-admin/src/components/conversation-admin/topic-moderation/proximity-visualization.js @@ -0,0 +1,304 @@ +// Copyright (C) 2012-present, The Authors. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License, version 3, as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
+/** @jsx jsx */ + +import React from 'react' +import { connect } from 'react-redux' +import { jsx, Box, Flex, Heading, Text, Button, Select } from 'theme-ui' + +const mapStateToProps = (state) => { + return { + zid_metadata: state.zid_metadata + } +} + +@connect(mapStateToProps) +class ProximityVisualization extends React.Component { + constructor(props) { + super(props) + this.state = { + proximityData: [], + selectedLayer: '0', + loading: true, + error: null, + svgRef: React.createRef() + } + } + + componentDidMount() { + this.loadProximityData() + } + + async loadProximityData() { + try { + this.setState({ loading: true, error: null }) + const { match } = this.props + const { selectedLayer } = this.state + const conversation_id = match.params.conversation_id + + // Fetch proximity data (UMAP coordinates) + const response = await fetch(`/api/v3/topicMod/proximity?report_id=${conversation_id}&layer_id=${selectedLayer}`) + const data = await response.json() + + if (data.status === 'success') { + this.setState({ + proximityData: data.proximity_data || [], + loading: false + }, () => { + this.renderVisualization() + }) + } else { + this.setState({ + error: data.message || 'Failed to load proximity data', + loading: false + }) + } + } catch (err) { + this.setState({ + error: 'Network error loading proximity data', + loading: false + }) + } + } + + componentDidUpdate(prevState) { + if (prevState.selectedLayer !== this.state.selectedLayer) { + this.loadProximityData() + } + } + + renderVisualization() { + const { proximityData } = this.state + const svgElement = this.state.svgRef.current + + if (!svgElement || proximityData.length === 0) return + + // Clear previous content + svgElement.innerHTML = '' + + // Set up dimensions + const width = 800 + const height = 600 + const margin = 50 + + // Calculate bounds + const xValues = proximityData.map(d => d.umap_x).filter(x => x !== undefined) + const yValues = proximityData.map(d => d.umap_y).filter(y => y !== undefined) + 
+ if (xValues.length === 0 || yValues.length === 0) { + svgElement.innerHTML = 'No coordinate data available' + return + } + + const xMin = Math.min(...xValues) + const xMax = Math.max(...xValues) + const yMin = Math.min(...yValues) + const yMax = Math.max(...yValues) + + // Create scales + const xScale = (x) => margin + ((x - xMin) / (xMax - xMin)) * (width - 2 * margin) + const yScale = (y) => height - margin - ((y - yMin) / (yMax - yMin)) * (height - 2 * margin) + + // Color mapping for moderation status + const getColor = (status) => { + switch (status) { + case 'accepted': case 1: return '#22c55e' + case 'rejected': case -1: return '#ef4444' + case 'meta': case 0: return '#f59e0b' + default: return '#6b7280' + } + } + + // Group by cluster for better visualization + const clusters = {} + proximityData.forEach(point => { + const clusterId = point.cluster_id || 0 + if (!clusters[clusterId]) clusters[clusterId] = [] + clusters[clusterId].push(point) + }) + + // Render cluster backgrounds (convex hulls would be better, but this is simpler) + Object.entries(clusters).forEach(([clusterId, points]) => { + if (points.length < 3) return + + const clusterXs = points.map(p => xScale(p.umap_x)) + const clusterYs = points.map(p => yScale(p.umap_y)) + const centerX = clusterXs.reduce((a, b) => a + b) / clusterXs.length + const centerY = clusterYs.reduce((a, b) => a + b) / clusterYs.length + const maxRadius = Math.max(...points.map(p => + Math.sqrt(Math.pow(xScale(p.umap_x) - centerX, 2) + Math.pow(yScale(p.umap_y) - centerY, 2)) + )) + 20 + + const circle = document.createElementNS('http://www.w3.org/2000/svg', 'circle') + circle.setAttribute('cx', centerX) + circle.setAttribute('cy', centerY) + circle.setAttribute('r', maxRadius) + circle.setAttribute('fill', '#f3f4f6') + circle.setAttribute('stroke', '#d1d5db') + circle.setAttribute('stroke-width', '1') + circle.setAttribute('opacity', '0.3') + svgElement.appendChild(circle) + + // Add cluster label + const text = 
document.createElementNS('http://www.w3.org/2000/svg', 'text') + text.setAttribute('x', centerX) + text.setAttribute('y', centerY - maxRadius + 15) + text.setAttribute('text-anchor', 'middle') + text.setAttribute('fill', '#6b7280') + text.setAttribute('font-size', '12') + text.textContent = `Cluster ${clusterId}` + svgElement.appendChild(text) + }) + + // Render points + proximityData.forEach((point, index) => { + if (point.umap_x === undefined || point.umap_y === undefined) return + + const circle = document.createElementNS('http://www.w3.org/2000/svg', 'circle') + const x = xScale(point.umap_x) + const y = yScale(point.umap_y) + + circle.setAttribute('cx', x) + circle.setAttribute('cy', y) + circle.setAttribute('r', '4') + circle.setAttribute('fill', getColor(point.moderation_status)) + circle.setAttribute('stroke', '#fff') + circle.setAttribute('stroke-width', '1') + circle.setAttribute('cursor', 'pointer') + + // Add tooltip on hover + const title = document.createElementNS('http://www.w3.org/2000/svg', 'title') + title.textContent = `Comment ${point.comment_id}\nCluster: ${point.cluster_id}\nStatus: ${point.moderation_status || 'pending'}\n\n${point.comment_text?.substring(0, 100)}...` + circle.appendChild(title) + + svgElement.appendChild(circle) + }) + + // Add axes + const xAxis = document.createElementNS('http://www.w3.org/2000/svg', 'line') + xAxis.setAttribute('x1', margin) + xAxis.setAttribute('y1', height - margin) + xAxis.setAttribute('x2', width - margin) + xAxis.setAttribute('y2', height - margin) + xAxis.setAttribute('stroke', '#d1d5db') + xAxis.setAttribute('stroke-width', '1') + svgElement.appendChild(xAxis) + + const yAxis = document.createElementNS('http://www.w3.org/2000/svg', 'line') + yAxis.setAttribute('x1', margin) + yAxis.setAttribute('y1', margin) + yAxis.setAttribute('x2', margin) + yAxis.setAttribute('y2', height - margin) + yAxis.setAttribute('stroke', '#d1d5db') + yAxis.setAttribute('stroke-width', '1') + 
svgElement.appendChild(yAxis) + + // Add axis labels + const xLabel = document.createElementNS('http://www.w3.org/2000/svg', 'text') + xLabel.setAttribute('x', width / 2) + xLabel.setAttribute('y', height - 10) + xLabel.setAttribute('text-anchor', 'middle') + xLabel.setAttribute('fill', '#6b7280') + xLabel.textContent = 'UMAP Dimension 1' + svgElement.appendChild(xLabel) + + const yLabel = document.createElementNS('http://www.w3.org/2000/svg', 'text') + yLabel.setAttribute('x', 15) + yLabel.setAttribute('y', height / 2) + yLabel.setAttribute('text-anchor', 'middle') + yLabel.setAttribute('fill', '#6b7280') + yLabel.setAttribute('transform', `rotate(-90, 15, ${height / 2})`) + yLabel.textContent = 'UMAP Dimension 2' + svgElement.appendChild(yLabel) + } + + render() { + const { loading, error, proximityData, selectedLayer } = this.state + + if (loading) { + return ( + + Loading proximity visualization... + + ) + } + + if (error) { + return ( + + Error: {error} + + + ) + } + + return ( + + + Proximity Visualization + + Layer: + + + + + + This visualization shows comments positioned by semantic similarity using UMAP coordinates. + Comments that are closer together are more semantically similar. + + + {proximityData.length > 0 ? ( + + + + + + Pending + + + + Accepted + + + + Rejected + + + + Meta + + + + + + + + + + + Hover over points to see comment details. Points are grouped by semantic clusters. + + + ) : ( + + No proximity data available for this layer. + + )} + + ) + } +} + +export default ProximityVisualization \ No newline at end of file diff --git a/client-admin/src/components/conversation-admin/topic-moderation/topic-detail.js b/client-admin/src/components/conversation-admin/topic-moderation/topic-detail.js new file mode 100644 index 0000000000..9c76e06a25 --- /dev/null +++ b/client-admin/src/components/conversation-admin/topic-moderation/topic-detail.js @@ -0,0 +1,311 @@ +// Copyright (C) 2012-present, The Authors. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License, version 3, as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . +/** @jsx jsx */ + +import React from 'react' +import { connect } from 'react-redux' +import { jsx, Box, Flex, Heading, Text, Button, Checkbox, Label } from 'theme-ui' +import { Link } from 'react-router-dom' + +const mapStateToProps = (state) => { + return { + zid_metadata: state.zid_metadata + } +} + +@connect(mapStateToProps) +class TopicDetail extends React.Component { + constructor(props) { + super(props) + this.state = { + comments: [], + selectedComments: new Set(), + loading: true, + error: null, + topicInfo: null, + selectAll: false + } + } + + componentDidMount() { + this.loadTopicComments() + } + + componentDidUpdate(prevProps) { + if (prevProps.match.params.topicKey !== this.props.match.params.topicKey) { + this.loadTopicComments() + } + } + + async loadTopicComments() { + try { + this.setState({ loading: true, error: null }) + const { match } = this.props + const conversation_id = match.params.conversation_id + const topicKey = decodeURIComponent(match.params.topicKey) + + // Fetch comments for this specific topic + const response = await fetch(`/api/v3/topicMod/topics/${encodeURIComponent(topicKey)}/comments?report_id=${conversation_id}`) + const data = await response.json() + + if (data.status === 'success') { + this.setState({ + comments: data.comments || [], + loading: false, + selectedComments: new Set() + }) + } else { + this.setState({ + error: data.message || 'Failed to load comments', + loading: false + }) + } + } 
catch (err) { + this.setState({ + error: 'Network error loading comments', + loading: false + }) + } + } + + toggleComment(commentId) { + const { selectedComments } = this.state + const newSelected = new Set(selectedComments) + + if (newSelected.has(commentId)) { + newSelected.delete(commentId) + } else { + newSelected.add(commentId) + } + + this.setState({ + selectedComments: newSelected, + selectAll: newSelected.size === this.state.comments.length + }) + } + + toggleSelectAll() { + const { selectAll, comments } = this.state + + if (selectAll) { + this.setState({ + selectedComments: new Set(), + selectAll: false + }) + } else { + this.setState({ + selectedComments: new Set(comments.map(c => c.comment_id)), + selectAll: true + }) + } + } + + async moderateSelected(action) { + const { selectedComments } = this.state + + if (selectedComments.size === 0) { + return + } + + try { + const { match } = this.props + const conversation_id = match.params.conversation_id + + const response = await fetch('/api/v3/topicMod/moderate', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + report_id: conversation_id, + comment_ids: Array.from(selectedComments), + action: action, + moderator: 'admin' // TODO: Get from auth state + }) + }) + + const data = await response.json() + + if (data.status === 'success') { + // Reload comments to reflect changes + this.loadTopicComments() + } else { + console.error('Moderation failed:', data.message) + } + } catch (err) { + console.error('Network error during moderation:', err) + } + } + + getStatusColor(status) { + switch (status) { + case 'accepted': case 1: return 'green' + case 'rejected': case -1: return 'red' + case 'meta': case 0: return 'orange' + default: return 'gray' + } + } + + getStatusText(status) { + switch (status) { + case 'accepted': case 1: return 'Accepted' + case 'rejected': case -1: return 'Rejected' + case 'meta': case 0: return 'Meta' + default: return 'Pending' + } + } 
+ + renderComment(comment) { + const { selectedComments } = this.state + const isSelected = selectedComments.has(comment.comment_id) + const status = comment.moderation_status || 'pending' + + return ( + this.toggleComment(comment.comment_id)}> + + + + this.toggleComment(comment.comment_id)} + sx={{ mr: 3, mt: 1 }} + onClick={(e) => e.stopPropagation()} + /> + + + {comment.comment_text} + + + ID: {comment.comment_id} + Cluster: {comment.cluster_id} + Layer: {comment.layer_id} + {comment.umap_x !== undefined && comment.umap_y !== undefined && ( + Position: ({comment.umap_x?.toFixed(2)}, {comment.umap_y?.toFixed(2)}) + )} + + + + + + + {this.getStatusText(status)} + + + + + ) + } + + render() { + const { match } = this.props + const { loading, error, comments, selectedComments, selectAll } = this.state + const topicKey = decodeURIComponent(match.params.topicKey) + + if (loading) { + return ( + + Loading comments... + + ) + } + + if (error) { + return ( + + Error: {error} + + + ) + } + + return ( + + + + + + + + Topic: {topicKey} + + + + {comments.length} comments + + + + {comments.length > 0 && ( + <> + + + + + + + + + + + + + + {comments.map(comment => this.renderComment(comment))} + + + )} + + {comments.length === 0 && ( + + No comments found for this topic. 
+ + )} + + ) + } +} + +export default TopicDetail \ No newline at end of file diff --git a/client-admin/src/components/conversation-admin/topic-moderation/topic-moderation.css b/client-admin/src/components/conversation-admin/topic-moderation/topic-moderation.css new file mode 100644 index 0000000000..de003d593f --- /dev/null +++ b/client-admin/src/components/conversation-admin/topic-moderation/topic-moderation.css @@ -0,0 +1,229 @@ +/* Topic Moderation CSS Enhancements */ + +.topic-card { + transition: all 0.2s ease; + position: relative; +} + +.topic-card:hover { + transform: translateY(-2px); + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1); +} + +.topic-card.expanded { + border-left: 4px solid #3b82f6; +} + +.moderation-status-pending { + background-color: #f3f4f6; + border-left: 4px solid #9ca3af; +} + +.moderation-status-accepted { + background-color: #f0fdf4; + border-left: 4px solid #22c55e; +} + +.moderation-status-rejected { + background-color: #fef2f2; + border-left: 4px solid #ef4444; +} + +.moderation-status-meta { + background-color: #fffbeb; + border-left: 4px solid #f59e0b; +} + +.moderation-buttons { + display: flex; + gap: 8px; + opacity: 0.7; + transition: opacity 0.2s ease; +} + +.topic-card:hover .moderation-buttons { + opacity: 1; +} + +.proximity-point { + transition: all 0.2s ease; + cursor: pointer; +} + +.proximity-point:hover { + r: 6; + stroke-width: 2; +} + +.cluster-background { + opacity: 0.2; + transition: opacity 0.3s ease; +} + +.cluster-background:hover { + opacity: 0.4; +} + +.stats-card { + transition: transform 0.2s ease; +} + +.stats-card:hover { + transform: scale(1.05); +} + +.progress-bar { + border-radius: 10px; + overflow: hidden; + background: linear-gradient(45deg, #f3f4f6, #e5e7eb); +} + +.progress-segment { + transition: width 0.5s ease; + height: 100%; +} + +.loading-shimmer { + animation: shimmer 1.5s infinite; + background: linear-gradient(90deg, #f0f0f0 25%, #e0e0e0 50%, #f0f0f0 75%); + background-size: 200% 100%; +} + 
+@keyframes shimmer { + 0% { + background-position: -200% 0; + } + 100% { + background-position: 200% 0; + } +} + +.comment-card { + transition: all 0.2s ease; +} + +.comment-card:hover { + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); +} + +.comment-card.selected { + border-color: #3b82f6; + background-color: #eff6ff; +} + +.bulk-actions-bar { + position: sticky; + top: 0; + z-index: 10; + backdrop-filter: blur(8px); + background-color: rgba(249, 250, 251, 0.9); +} + +.layer-selector { + display: flex; + gap: 4px; + padding: 4px; + background-color: #f3f4f6; + border-radius: 8px; +} + +.layer-button { + padding: 8px 16px; + border: none; + border-radius: 4px; + background: transparent; + cursor: pointer; + transition: all 0.2s ease; +} + +.layer-button:hover { + background-color: #e5e7eb; +} + +.layer-button.active { + background-color: #3b82f6; + color: white; +} + +.visualization-container { + position: relative; + border: 1px solid #e5e7eb; + border-radius: 8px; + overflow: hidden; + background: linear-gradient(135deg, #ffffff 0%, #f9fafb 100%); +} + +.legend { + display: flex; + gap: 16px; + padding: 12px; + background-color: #f9fafb; + border-bottom: 1px solid #e5e7eb; + font-size: 12px; +} + +.legend-item { + display: flex; + align-items: center; + gap: 6px; +} + +.legend-dot { + width: 10px; + height: 10px; + border-radius: 50%; + border: 1px solid #fff; +} + +.error-state { + text-align: center; + padding: 48px 24px; + color: #6b7280; +} + +.error-state svg { + width: 48px; + height: 48px; + margin-bottom: 16px; + opacity: 0.5; +} + +.empty-state { + text-align: center; + padding: 48px 24px; + color: #9ca3af; +} + +.empty-state svg { + width: 64px; + height: 64px; + margin-bottom: 16px; + opacity: 0.3; +} + +/* Responsive design */ +@media (max-width: 768px) { + .moderation-buttons { + flex-direction: column; + gap: 4px; + } + + .legend { + flex-wrap: wrap; + gap: 8px; + } + + .stats-grid { + grid-template-columns: repeat(2, 1fr); + } +} + +@media (max-width: 
480px) { + .stats-grid { + grid-template-columns: 1fr; + } + + .layer-selector { + flex-wrap: wrap; + } +} \ No newline at end of file diff --git a/client-admin/src/components/conversation-admin/topic-moderation/topic-stats.js b/client-admin/src/components/conversation-admin/topic-moderation/topic-stats.js new file mode 100644 index 0000000000..5a3f4bfcb9 --- /dev/null +++ b/client-admin/src/components/conversation-admin/topic-moderation/topic-stats.js @@ -0,0 +1,179 @@ +// Copyright (C) 2012-present, The Authors. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License, version 3, as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
+/** @jsx jsx */ + +import React from 'react' +import { connect } from 'react-redux' +import { jsx, Box, Flex, Heading, Text, Card } from 'theme-ui' + +const mapStateToProps = (state) => { + return { + zid_metadata: state.zid_metadata + } +} + +@connect(mapStateToProps) +class TopicStats extends React.Component { + constructor(props) { + super(props) + this.state = { + stats: null, + loading: true, + error: null + } + } + + componentDidMount() { + this.loadStats() + } + + async loadStats() { + try { + this.setState({ loading: true, error: null }) + const conversation_id = this.props.conversation_id + + console.log('TopicStats loadStats - conversation_id:', conversation_id) + + // Fetch moderation statistics + const response = await fetch(`/api/v3/topicMod/stats?conversation_id=${conversation_id}`) + const data = await response.json() + + if (data.status === 'success') { + this.setState({ + stats: data.stats, + loading: false + }) + } else { + this.setState({ + error: data.message || 'Failed to load statistics', + loading: false + }) + } + } catch (err) { + this.setState({ + error: 'Network error loading statistics', + loading: false + }) + } + } + + renderStatCard(title, value, color = 'primary') { + return ( + + + {value} + + + {title} + + + ) + } + + render() { + const { loading, error, stats } = this.state + + if (loading) { + return ( + + Loading statistics... + + ) + } + + if (error) { + return ( + + Error: {error} + + ) + } + + if (!stats) { + return ( + + No statistics available. + + ) + } + + const completionRate = stats.total_topics > 0 + ? 
((stats.total_topics - stats.pending) / stats.total_topics * 100).toFixed(1) + : 0 + + return ( + + + Topic Moderation Statistics + + + + {this.renderStatCard('Total Topics', stats.total_topics)} + {this.renderStatCard('Pending', stats.pending, 'gray')} + {this.renderStatCard('Accepted', stats.accepted, 'green')} + {this.renderStatCard('Rejected', stats.rejected, 'red')} + {this.renderStatCard('Meta', stats.meta, 'orange')} + {this.renderStatCard('Completion Rate', `${completionRate}%`, 'blue')} + + + + + Moderation Progress + + + + + Overall Progress + {completionRate}% Complete + + + + + 0 ? (stats.accepted / stats.total_topics * 100) : 0}%`, + transition: 'width 0.3s ease' + }} + /> + 0 ? (stats.rejected / stats.total_topics * 100) : 0}%`, + transition: 'width 0.3s ease' + }} + /> + 0 ? (stats.meta / stats.total_topics * 100) : 0}%`, + transition: 'width 0.3s ease' + }} + /> + + + + + Accepted: {stats.accepted} + Rejected: {stats.rejected} + Meta: {stats.meta} + Pending: {stats.pending} + + + + + {stats.total_topics === 0 && ( + + + No topics have been generated for this conversation yet. + + + Run the Delphi pipeline to generate topics for moderation. + + + )} + + ) + } +} + +export default TopicStats \ No newline at end of file diff --git a/client-admin/src/components/conversation-admin/topic-moderation/topic-tree.js b/client-admin/src/components/conversation-admin/topic-moderation/topic-tree.js new file mode 100644 index 0000000000..4f7e44065a --- /dev/null +++ b/client-admin/src/components/conversation-admin/topic-moderation/topic-tree.js @@ -0,0 +1,290 @@ +// Copyright (C) 2012-present, The Authors. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License, version 3, as published by the Free Software Foundation. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . +/** @jsx jsx */ + +import React from 'react' +import { connect } from 'react-redux' +import { jsx, Box, Flex, Heading, Text, Button } from 'theme-ui' +import { Link } from 'react-router-dom' + +const mapStateToProps = (state) => { + return { + topics: state.topic_mod_topics || {}, + zid_metadata: state.zid_metadata + } +} + +@connect(mapStateToProps) +class TopicTree extends React.Component { + constructor(props) { + super(props) + this.state = { + selectedLayer: '0', + expandedTopics: new Set(), + topicsData: null, + loading: true, + error: null + } + } + + componentDidMount() { + this.loadTopics() + } + + async loadTopics() { + try { + this.setState({ loading: true, error: null }) + const conversation_id = this.props.conversation_id + + console.log('TopicTree loadTopics - conversation_id:', conversation_id) + + // Fetch topics from API + const response = await fetch(`/api/v3/topicMod/topics?conversation_id=${conversation_id}`) + const data = await response.json() + + if (data.status === 'success') { + this.setState({ + topicsData: data.topics_by_layer || {}, + loading: false + }) + } else { + this.setState({ + error: data.message || 'Failed to load topics', + loading: false + }) + } + } catch (err) { + this.setState({ + error: 'Network error loading topics', + loading: false + }) + } + } + + toggleTopic(topicKey) { + const { expandedTopics } = this.state + const newExpanded = new Set(expandedTopics) + + if (newExpanded.has(topicKey)) { + newExpanded.delete(topicKey) + } else { + newExpanded.add(topicKey) + } + + this.setState({ expandedTopics: newExpanded }) + } + + getStatusColor(status) { + switch (status) { + case 
'accepted': return 'green' + case 'rejected': return 'red' + case 'meta': return 'orange' + default: return 'gray' + } + } + + async moderateTopic(topicKey, action) { + try { + const conversation_id = this.props.conversation_id + + const response = await fetch('/api/v3/topicMod/moderate', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + conversation_id: conversation_id, + topic_key: topicKey, + action: action, + moderator: 'admin' // TODO: Get from auth state + }) + }) + + const data = await response.json() + + if (data.status === 'success') { + // Reload topics to reflect changes + this.loadTopics() + } else { + console.error('Moderation failed:', data.message) + } + } catch (err) { + console.error('Network error during moderation:', err) + } + } + + renderTopic(topic, layerId, clusterId) { + const { match } = this.props + const { expandedTopics } = this.state + const topicKey = topic.topic_key || `${layerId}_${clusterId}` + const isExpanded = expandedTopics.has(topicKey) + const status = topic.moderation?.status || 'pending' + + return ( + + + + + + + Layer {layerId}, Cluster {clusterId} + + + Status: {status} + + + + {topic.topic_name || 'Unnamed Topic'} + + {topic.moderation?.comment_count && ( + + {topic.moderation.comment_count} comments + + )} + + + + + + + + + + + + + {isExpanded && ( + + + Model: {topic.model_name || 'Unknown'} + + + Created: {topic.created_at ? 
new Date(topic.created_at).toLocaleString() : 'Unknown'} + + {topic.moderation?.moderator && ( + + Moderated by: {topic.moderation.moderator} + + )} + + )} + + ) + } + + renderLayer(layerId, topics) { + const layerTopics = Object.entries(topics).sort(([a], [b]) => parseInt(a) - parseInt(b)) + + return ( + + + Layer {layerId} ({layerTopics.length} topics) + + {layerTopics.map(([clusterId, topic]) => + this.renderTopic(topic, layerId, clusterId) + )} + + ) + } + + render() { + const { loading, error, topicsData, selectedLayer } = this.state + + if (loading) { + return ( + + Loading topics... + + ) + } + + if (error) { + return ( + + Error: {error} + + + ) + } + + if (!topicsData || Object.keys(topicsData).length === 0) { + return ( + + No topics available for this conversation. + + Topics are generated by the Delphi pipeline. Make sure the pipeline has been run for this conversation. + + + ) + } + + const layers = Object.entries(topicsData).sort(([a], [b]) => parseInt(a) - parseInt(b)) + + return ( + + + View Layer: + {layers.map(([layerId]) => ( + + ))} + + + + {selectedLayer === 'all' + ? 
layers.map(([layerId, topics]) => this.renderLayer(layerId, topics)) + : topicsData[selectedLayer] && this.renderLayer(selectedLayer, topicsData[selectedLayer]) + } + + ) + } +} + +export default TopicTree \ No newline at end of file diff --git a/client-admin/src/util/component-helpers.js b/client-admin/src/util/component-helpers.js index e2101b33ae..fcc2d53d1d 100644 --- a/client-admin/src/util/component-helpers.js +++ b/client-admin/src/util/component-helpers.js @@ -5,8 +5,9 @@ const helpers = {} helpers.shouldShowPermissionsError = (props) => { return ( props.zid_metadata && - !props.zid_metadata.is_owner && - !props.zid_metadata.is_mod + props.zid_metadata.zid_metadata && + !props.zid_metadata.zid_metadata.is_owner && + !props.zid_metadata.zid_metadata.is_mod ) } diff --git a/client-report/src/components/app.jsx b/client-report/src/components/app.jsx index e6763967f9..5f7c2f767c 100644 --- a/client-report/src/components/app.jsx +++ b/client-report/src/components/app.jsx @@ -26,10 +26,14 @@ import CommentsReport from "./commentsReport/CommentsReport.jsx"; import TopicReport from "./topicReport/TopicReport.jsx"; import ExportReport from "./exportReport/ExportReport.jsx"; import TopicsVizReport from "./topicsVizReport/TopicsVizReport.jsx"; -import TopicMapNarrativeReport from "./topicMapNarrativeReport.jsx"; +import TopicPrioritize from "./topicPrioritize/TopicPrioritize.jsx"; +import TopicPrioritizeSimple from "./topicPrioritizeSimple/TopicPrioritizeSimple.jsx"; +import TopicAgenda from "./topicAgenda/TopicAgenda.jsx"; +import TopicHierarchy from "./topicHierarchy/TopicHierarchy.jsx"; + +const pathname = window.location.pathname; // "/report/2arcefpshi" or "/commentsReport/2arcefpshi" or "/topicReport/2arcefpshi" or "/topicsVizReport/2arcefpshi" or "/exportReport/2arcefpshi" or "/topicPrioritize/2arcefpshi" or "/topicPrioritizeSimple/2arcefpshi" or "/topicAgenda/2arcefpshi" or "/topicHierarchy/2arcefpshi" +const route_type = pathname.split("/")[1]; // "report", 
"narrativeReport", "commentsReport", "topicReport", "topicsVizReport", "exportReport", "topicPrioritize", "topicPrioritizeSimple", "topicAgenda", or "topicHierarchy" -const pathname = window.location.pathname; // "/report/2arcefpshi" or "/commentsReport/2arcefpshi" or "/topicReport/2arcefpshi" or "/topicsVizReport/2arcefpshi" or "/exportReport/2arcefpshi" -const route_type = pathname.split("/")[1]; // "report", "narrativeReport", "commentsReport", "topicReport", "topicsVizReport", or "exportReport", or "topicMapNarrativeReport" const report_id = pathname.split("/")[2]; // Debug the route @@ -716,6 +720,10 @@ const App = (props) => { shouldShowNarrativeReport: route_type === "narrativeReport", shouldShowTopicReport: route_type === "topicReport", shouldShowExportReport: route_type === "exportReport", + shouldShowTopicPrioritize: route_type === "topicPrioritize", + shouldShowTopicPrioritizeSimple: route_type === "topicPrioritizeSimple", + shouldShowTopicAgenda: route_type === "topicAgenda", + shouldShowTopicHierarchy: route_type === "topicHierarchy", }); // Directly render ExportReport if the URL starts with /exportReport @@ -729,6 +737,50 @@ const App = (props) => { ); } + // Directly render TopicPrioritize if the URL starts with /topicPrioritize + if (route_type === "topicPrioritize") { + console.log("RENDERING: TopicPrioritize"); + return ( + + ); + } + + // Directly render TopicPrioritizeSimple if the URL starts with /topicPrioritizeSimple + if (route_type === "topicPrioritizeSimple") { + console.log("RENDERING: TopicPrioritizeSimple"); + return ( + + ); + } + + // Directly render TopicAgenda if the URL starts with /topicAgenda + if (route_type === "topicAgenda") { + console.log("RENDERING: TopicAgenda"); + return ; + } + + // Directly render TopicHierarchy if the URL starts with /topicHierarchy + if (route_type === "topicHierarchy") { + console.log("RENDERING: TopicHierarchy"); + return ( + + ); + } + // Directly render TopicReport if the URL starts with 
/topicReport if (route_type === "topicReport") { console.log("RENDERING: TopicReport"); diff --git a/client-report/src/components/framework/useReportId.js b/client-report/src/components/framework/useReportId.js index 53f8e58774..853f7dc896 100644 --- a/client-report/src/components/framework/useReportId.js +++ b/client-report/src/components/framework/useReportId.js @@ -6,10 +6,12 @@ export function useReportId() { useEffect(() => { // Parse the URL to extract the report ID const pathname = window.location.pathname; - - // Match patterns like /report/rid or /narrativeReport/rid or /commentsReport/rid - const match = pathname.match(/^\/(report|narrativeReport|commentsReport|topicMapNarrativeReport)\/([a-zA-Z0-9]+)/); - + + // Match patterns like /report/rid or /narrativeReport/rid or /commentsReport/rid or /topicPrioritize/rid or /topicPrioritizeSimple/rid or /topicAgenda/rid or /topicHierarchy/rid + const match = pathname.match( + /^\/(report|narrativeReport|commentsReport|topicPrioritize|topicPrioritizeSimple|topicAgenda|topicMapNarrativeReport|topicHierarchy|topicReport|topicsVizReport|exportReport)\/([a-zA-Z0-9]+)/ + ); + if (match && match[2]) { setReportId(match[2]); } else { diff --git a/client-report/src/components/participantsGraph/hull.jsx b/client-report/src/components/participantsGraph/hull.jsx index ee0fde0541..769b7e02fa 100644 --- a/client-report/src/components/participantsGraph/hull.jsx +++ b/client-report/src/components/participantsGraph/hull.jsx @@ -1,11 +1,13 @@ // Copyright (C) 2012-present, The Authors. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License, version 3, as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. 
You should have received a copy of the GNU Affero General Public License along with this program. If not, see . import React from "react"; +import { line } from "d3-shape"; +import { curveLinear } from "d3-shape"; // import * as globals from "../globals"; const Hull = ({hull}) => { - const line = d3.line().curve(d3.curveLinear); - const pathString = line(hull.hull); + const lineGenerator = line().curve(curveLinear); + const pathString = lineGenerator(hull.hull); return ( { + const { report_id } = useReportId(); + const { + loading, + error, + topicData, + hierarchyAnalysis, + clusterGroups, + fetchUMAPData + } = useTopicData(report_id); + + const [selections, setSelections] = useState(new Set()); + const [commentMap, setCommentMap] = useState(new Map()); + + // Build comment map for easy lookup + useEffect(() => { + if (comments && comments.length > 0) { + const map = new Map(); + comments.forEach(comment => { + // Store by both tid (as number) and as string for flexibility + map.set(comment.tid, comment.txt); + map.set(String(comment.tid), comment.txt); + }); + setCommentMap(map); + console.log(`Built comment map with ${map.size / 2} comments`); + } + }, [comments]); + + // Fetch UMAP data when topic data is loaded + useEffect(() => { + if (topicData && conversation) { + fetchUMAPData(conversation); + } + }, [topicData, conversation, fetchUMAPData]); + + // Load previous selections when component mounts + useEffect(() => { + if (conversation && conversation.conversation_id) { + loadPreviousSelections(); + } + }, [conversation]); + + const loadPreviousSelections = async () => { + try { + const response = await fetch(`/api/v3/topicAgenda/selections?conversation_id=${conversation.conversation_id}`, { + method: 'GET', + credentials: 'include' + }); + + const result = await response.json(); + + if (result.status === 'success' && result.data) { + // Convert stored selections back to topic keys + const storedSelections = new Set(); + 
result.data.archetypal_selections.forEach(selection => { + storedSelections.add(selection.topic_key); + }); + setSelections(storedSelections); + console.log('Loaded previous selections:', Array.from(storedSelections)); + } + } catch (error) { + console.error('Error loading previous selections:', error); + } + }; + + const toggleTopicSelection = (topicKey) => { + const newSelections = new Set(selections); + if (newSelections.has(topicKey)) { + newSelections.delete(topicKey); + } else { + newSelections.add(topicKey); + } + setSelections(newSelections); + }; + + const handleDone = async () => { + try { + // Convert topic selections to archetypal comments + console.log("Selected topics:", Array.from(selections)); + + // Extract archetypal comments from selections + const archetypes = extractArchetypalComments(selections, topicData, clusterGroups, commentMap); + console.log("Extracted archetypes:", archetypes); + + // Log in a more readable format + console.log("\n=== SELECTED ARCHETYPAL COMMENTS ==="); + archetypes.forEach(group => { + console.log(`\nTopic: Layer ${group.layerId}, Cluster ${group.clusterId}`); + group.archetypes.forEach((archetype, i) => { + console.log(` ${i + 1}. 
"${archetype.text}" (ID: ${archetype.commentId})`); + }); + }); + console.log("=====================================\n"); + + // Transform to API format + const apiSelections = archetypes.map(group => ({ + layer_id: group.layerId, + cluster_id: group.clusterId, + topic_key: group.topicKey, + archetypal_comments: group.archetypes.map(a => ({ + comment_id: a.commentId, + comment_text: a.text, + coordinates: a.coordinates, + distance_to_centroid: a.distance + })) + })); + + // Send to API + const response = await fetch('/api/v3/topicAgenda/selections', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + conversation_id: conversation.conversation_id, + selections: apiSelections + }), + credentials: 'include' + }); + + const result = await response.json(); + + if (result.status === 'success') { + console.log('Topic agenda selections saved successfully:', result.data); + // TODO: Show success UI feedback + } else { + console.error('Failed to save selections:', result.message); + // TODO: Show error UI feedback + } + + } catch (error) { + console.error('Error saving topic agenda selections:', error); + // TODO: Show error UI feedback + } + }; + + if (loading) { + return ( +
+
+
Loading topic data...
+
+ +
+ ); + } + + if (error) { + return ( +
+
+
+

Error

+

{error}

+
+
+ +
+ ); + } + + return ( +
+
+
+ + + + +
+ +
+
+
+ +
+ ); +}; + +export default TopicAgenda; \ No newline at end of file diff --git a/client-report/src/components/topicAgenda/components/LayerHeader.jsx b/client-report/src/components/topicAgenda/components/LayerHeader.jsx new file mode 100644 index 0000000000..8834b9a836 --- /dev/null +++ b/client-report/src/components/topicAgenda/components/LayerHeader.jsx @@ -0,0 +1,17 @@ +import React from "react"; + +const LayerHeader = () => { + return ( +
+

Select Your Priority Topics

+ +
+ Choose topics that matter most to you, and set the agenda for you and others. + The more you select, the more specific options you'll see. + Topics are based on statements written by people in this conversation. +
+
+ ); +}; + +export default LayerHeader; \ No newline at end of file diff --git a/client-report/src/components/topicAgenda/components/ScrollableTopicsGrid.jsx b/client-report/src/components/topicAgenda/components/ScrollableTopicsGrid.jsx new file mode 100644 index 0000000000..b03d48c73f --- /dev/null +++ b/client-report/src/components/topicAgenda/components/ScrollableTopicsGrid.jsx @@ -0,0 +1,211 @@ +import React, { useEffect, useState } from "react"; +import TopicItem from "./TopicItem"; +import { getFilteredTopics } from "../utils/topicFiltering"; + +const ScrollableTopicsGrid = ({ + topicData, + selections, + onToggleSelection, + clusterGroups, + hierarchyAnalysis +}) => { + const [visibleLayers, setVisibleLayers] = useState(new Set()); + + if (!topicData || !hierarchyAnalysis) return null; + + const runKeys = Object.keys(topicData.runs); + const firstRun = topicData.runs[runKeys[0]]; + + if (!firstRun.topics_by_layer) return null; + + // Get the two coarsest layers (highest numbers) + // sortedLayers is ordered from highest to lowest (e.g., [7, 6, 5, 4, 3, 2, 1, 0]) + const sortedLayers = [...hierarchyAnalysis.layers].sort((a, b) => b - a); + const coarsestLayer = sortedLayers[0]; // e.g., 7 + const secondCoarsestLayer = sortedLayers[1]; // e.g., 6 + + // CRITICAL FEATURE: Cascading Auto-population + // ================================================================ + // EVERY LAYER DRIVES THE NEXT LEVEL OF DETAIL! + // + // DESIGN PHILOSOPHY: + // This creates an infinitely explorable space where each selection + // opens up new, more specific possibilities. It's like zooming into + // a fractal - the deeper you go, the more detail you discover. + // + // HOW IT WORKS: + // 1. Start with the two coarsest layers visible + // 2. Select from layer 2 → reveals nearby topics in layer 1 + // 3. Select from layer 1 → reveals nearby topics in layer 0 + // 4. And so on... 
each selection cascades down to finer layers + // + // THE CASCADE EFFECT: + // - Layer 3 (coarsest): Always visible as a safety net + // - Layer 2: Always visible, first driver of specificity + // - Layer 1: Appears when Layer 2 has selections + // - Layer 0: Appears when Layer 1 has selections + // - Future layers: Continue the pattern... + // + // SPATIAL PROXIMITY RULES: + // - We use UMAP coordinates to find "nearby" topics + // - Distance threshold gets tighter as you go deeper (more selective) + // - This ensures relevance increases with depth + // + // USER EXPERIENCE: + // - Feels like having a conversation that gets more specific + // - Never overwhelming - only shows what's relevant + // - Creates a sense of discovery and exploration + // - Users can stop at any level when they've found what they want + // + // IMPLEMENTATION NOTE: + // We track which layers should be visible based on selections + // in their parent layers. This creates a dependency chain where + // each layer's visibility depends on selections in the layer above. 
+ // ================================================================ + + useEffect(() => { + if (!firstRun || !firstRun.topics_by_layer) return; + + const newVisibleLayers = new Set(); + + // Build a map of selections by layer + const selectionsByLayer = new Map(); + + // Categorize all selections by their layer + Array.from(selections).forEach(topicKey => { + // Find which layer this topic belongs to + for (const layerId of sortedLayers) { + const topic = Object.values(firstRun.topics_by_layer[layerId] || {}) + .find(t => t.topic_key === topicKey); + if (topic) { + if (!selectionsByLayer.has(layerId)) { + selectionsByLayer.set(layerId, new Set()); + } + selectionsByLayer.get(layerId).add(topicKey); + break; + } + } + }); + + // For each layer, check if its parent layer has selections + // Note: sortedLayers is ordered from coarsest to finest (e.g., [2, 1, 0]) + sortedLayers.forEach((layerId, index) => { + if (index < 2) return; // Skip the first two layers (always visible) + + const parentLayer = sortedLayers[index - 1]; // Parent is the previous in the array + + if (selectionsByLayer.has(parentLayer)) { + // Parent layer has selections, this layer should be visible + newVisibleLayers.add(layerId); + } + }); + + setVisibleLayers(newVisibleLayers); + }, [selections, sortedLayers.join(','), !!firstRun]); // Stable dependencies + + const renderLayerTopics = (layerId, layerLabel, parentLayerId = null) => { + const allTopics = firstRun.topics_by_layer[layerId]; + if (!allTopics) return null; + + let topicEntries; + + if (parentLayerId !== null) { + // This is a dynamically shown layer - filter by proximity to parent selections + const selectionsByLayer = new Map(); + + // Get selections from the parent layer + Array.from(selections).forEach(topicKey => { + const topic = Object.values(firstRun.topics_by_layer[parentLayerId] || {}) + .find(t => t.topic_key === topicKey); + if (topic) { + if (!selectionsByLayer.has(parentLayerId)) { + selectionsByLayer.set(parentLayerId, 
new Set()); + } + selectionsByLayer.get(parentLayerId).add(topicKey); + } + }); + + if (selectionsByLayer.size === 0) return null; + + // Get filtered topics based on proximity + const filteredTopics = getFilteredTopics( + allTopics, + layerId, + hierarchyAnalysis, + selectionsByLayer, + clusterGroups + ); + + // Apply distance threshold that gets tighter as we go deeper + const layerDepth = sortedLayers.indexOf(layerId); + const distanceThreshold = 3.0 - (layerDepth * 0.5); // 3.0, 2.5, 2.0, 1.5... + + topicEntries = filteredTopics.filter(entry => + entry.proximityScore !== null && entry.proximityScore < Math.max(distanceThreshold, 1.0) + ); + + if (topicEntries.length === 0) return null; + } else { + // This is a static layer - show all topics + topicEntries = Object.entries(allTopics).map(([clusterId, topic]) => ({ + clusterId, + topic, + proximityScore: null, + source: 'all' + })); + } + + return ( + + {layerLabel && ( +
+ {layerLabel} +
+ )} + {topicEntries.map(entry => ( + + ))} +
+ ); + }; + + // Determine layer labels based on depth + const getLayerLabel = (layerId, index) => { + if (index === 0) return null; // Coarsest layer has no label + if (index === 1) return "More Specific Topics"; + if (index === 2) return "SUPER SPECIFIC TOPICS"; + return null; // No labels for deeper layers + }; + + return ( +
+
+ {/* Always show the two coarsest layers */} + {renderLayerTopics(coarsestLayer, null)} + {secondCoarsestLayer !== undefined && + renderLayerTopics(secondCoarsestLayer, "More Specific Topics")} + + {/* Show additional layers based on selections in parent layers */} + {sortedLayers.slice(2).map((layerId, index) => { + if (!visibleLayers.has(layerId)) return null; + + const parentLayer = sortedLayers[index + 1]; // Parent is the previous in sorted order + return renderLayerTopics( + layerId, + getLayerLabel(layerId, index + 2), + parentLayer + ); + })} +
+
+ ); +}; + +export default ScrollableTopicsGrid; \ No newline at end of file diff --git a/client-report/src/components/topicAgenda/components/TopicAgendaStyles.jsx b/client-report/src/components/topicAgenda/components/TopicAgendaStyles.jsx new file mode 100644 index 0000000000..5032d76184 --- /dev/null +++ b/client-report/src/components/topicAgenda/components/TopicAgendaStyles.jsx @@ -0,0 +1,346 @@ +import React from "react"; + +const TopicAgendaStyles = () => ( + +); + +export default TopicAgendaStyles; diff --git a/client-report/src/components/topicAgenda/components/TopicItem.jsx b/client-report/src/components/topicAgenda/components/TopicItem.jsx new file mode 100644 index 0000000000..ba296ef319 --- /dev/null +++ b/client-report/src/components/topicAgenda/components/TopicItem.jsx @@ -0,0 +1,38 @@ +import React from "react"; +import { getCommentCount, cleanTopicDisplayName } from "../utils/topicUtils"; + +const TopicItem = ({ + entry, + layerId, + isSelected, + onToggleSelection, + clusterGroups, + isBanked = false +}) => { + const { clusterId, topic, proximityScore, closestBankedTopic } = entry; + const topicKey = topic.topic_key; + const commentCount = getCommentCount(layerId, clusterId, clusterGroups); + const displayName = cleanTopicDisplayName(topic.topic_name, layerId, clusterId); + + return ( +
onToggleSelection(topicKey)} + > +
+ {displayName} + {isSelected && ( + + + + + )} +
+
+ ); +}; + +export default TopicItem; diff --git a/client-report/src/components/topicAgenda/hooks/useAgendaBuilder.js b/client-report/src/components/topicAgenda/hooks/useAgendaBuilder.js new file mode 100644 index 0000000000..f3480f1053 --- /dev/null +++ b/client-report/src/components/topicAgenda/hooks/useAgendaBuilder.js @@ -0,0 +1,89 @@ +import { useState, useEffect } from "react"; + +export const useAgendaBuilder = (hierarchyAnalysis) => { + const [currentLayer, setCurrentLayer] = useState(null); + const [bankedTopics, setBankedTopics] = useState(new Map()); + const [currentSelections, setCurrentSelections] = useState(new Set()); + const [completedLayers, setCompletedLayers] = useState(new Set()); + + // Set current layer to the highest available layer when hierarchy is loaded + useEffect(() => { + if (currentLayer === null && hierarchyAnalysis && hierarchyAnalysis.layers.length > 0) { + const maxLayer = Math.max(...hierarchyAnalysis.layers); + setCurrentLayer(maxLayer); + console.log(`Setting current layer to highest available: ${maxLayer}`); + } + }, [hierarchyAnalysis, currentLayer]); + + const toggleTopicSelection = (topicKey) => { + const newSelections = new Set(currentSelections); + if (newSelections.has(topicKey)) { + newSelections.delete(topicKey); + } else { + newSelections.add(topicKey); + } + setCurrentSelections(newSelections); + }; + + const bankAndClear = () => { + if (currentSelections.size === 0) { + alert("Please select at least one topic to bank before proceeding."); + return; + } + + // Bank the current selections + const newBankedTopics = new Map(bankedTopics); + newBankedTopics.set(currentLayer, new Set(currentSelections)); + setBankedTopics(newBankedTopics); + + // Mark current layer as completed + const newCompletedLayers = new Set(completedLayers); + newCompletedLayers.add(currentLayer); + setCompletedLayers(newCompletedLayers); + + // Clear current selections + setCurrentSelections(new Set()); + + // Move to next layer (lower number = 
finer granularity) + const nextLayer = currentLayer - 1; + const minLayer = hierarchyAnalysis ? Math.min(...hierarchyAnalysis.layers) : 0; + + if ( + nextLayer >= minLayer && + hierarchyAnalysis && + hierarchyAnalysis.layers.includes(nextLayer) + ) { + setCurrentLayer(nextLayer); + console.log( + `Banked ${currentSelections.size} topics from Layer ${currentLayer}, moving to Layer ${nextLayer}` + ); + } else { + // Set currentLayer to null to indicate completion + setCurrentLayer(null); + console.log( + `Agenda building complete! Banked topics from ${newCompletedLayers.size} layers.` + ); + } + }; + + const resetAgenda = () => { + setBankedTopics(new Map()); + setCurrentSelections(new Set()); + setCompletedLayers(new Set()); + if (hierarchyAnalysis && hierarchyAnalysis.layers.length > 0) { + const maxLayer = Math.max(...hierarchyAnalysis.layers); + setCurrentLayer(maxLayer); + } + }; + + return { + currentLayer, + bankedTopics, + currentSelections, + completedLayers, + setCurrentSelections, + toggleTopicSelection, + bankAndClear, + resetAgenda, + }; +}; diff --git a/client-report/src/components/topicAgenda/hooks/useTopicData.js b/client-report/src/components/topicAgenda/hooks/useTopicData.js new file mode 100644 index 0000000000..40f70b67a7 --- /dev/null +++ b/client-report/src/components/topicAgenda/hooks/useTopicData.js @@ -0,0 +1,180 @@ +import { useState, useEffect, useCallback } from "react"; +import net from "../../../util/net"; + +export const useTopicData = (reportId) => { + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + const [topicData, setTopicData] = useState(null); + const [hierarchyAnalysis, setHierarchyAnalysis] = useState(null); + const [umapData, setUmapData] = useState(null); + const [clusterGroups, setClusterGroups] = useState({}); + + const analyzeHierarchy = (data) => { + const runKeys = Object.keys(data.runs); + if (runKeys.length === 0) { + setHierarchyAnalysis({ hasHierarchy: false, reason: "No 
runs data" }); + return; + } + + const firstRun = data.runs[runKeys[0]]; + if (!firstRun.topics_by_layer) { + setHierarchyAnalysis({ hasHierarchy: false, reason: "No topics_by_layer data in run" }); + return; + } + + const layers = Object.keys(firstRun.topics_by_layer) + .map((k) => parseInt(k)) + .sort((a, b) => a - b); + console.log("Analyzing layers:", layers); + + const analysis = { + hasHierarchy: false, + layers: layers, + layerCounts: {}, + sampleTopics: {}, + totalComments: 0, + structure: "unknown", + runInfo: { + model_name: firstRun.model_name, + created_at: firstRun.created_at, + job_uuid: firstRun.job_uuid, + }, + }; + + layers.forEach((layerId) => { + const topics = firstRun.topics_by_layer[layerId]; + analysis.layerCounts[layerId] = Object.keys(topics).length; + + analysis.sampleTopics[layerId] = Object.values(topics) + .slice(0, 3) + .map((topic) => ({ + name: topic.topic_name, + key: topic.topic_key, + cluster_id: topic.cluster_id, + model_name: topic.model_name, + })); + }); + + const counts = Object.values(analysis.layerCounts); + const hasVariedCounts = Math.max(...counts) !== Math.min(...counts); + + if (hasVariedCounts && layers.length > 1) { + analysis.hasHierarchy = true; + analysis.structure = "hierarchical"; + analysis.reason = `Found ${layers.length} layers with varying topic counts: ${counts.join( + ", " + )}`; + } else if (layers.length === 1) { + analysis.structure = "flat"; + analysis.reason = "Only one layer found - flat structure"; + } else { + analysis.structure = "unclear"; + analysis.reason = "Multiple layers but similar counts - unclear hierarchy"; + } + + console.log("Hierarchy analysis:", analysis); + setHierarchyAnalysis(analysis); + }; + + const groupPointsByLayer = (data) => { + const groups = {}; + const allClusterIds = new Set(); + + for (let layer = 0; layer <= 3; layer++) { + groups[layer] = new Map(); + } + + data.forEach((point) => { + Object.entries(point.clusters || {}).forEach(([layerId, clusterId]) => { + const 
layer = parseInt(layerId); + const key = `${layer}_${clusterId}`; + + if (layer === 0) { + allClusterIds.add(clusterId); + } + + if (!groups[layer].has(key)) { + groups[layer].set(key, []); + } + + groups[layer].get(key).push({ + comment_id: point.comment_id, + cluster_id: clusterId, + layer: layer, + umap_x: point.umap_x, + umap_y: point.umap_y, + weight: point.weight || 1, + }); + }); + }); + + return groups; + }; + + const fetchUMAPData = useCallback(async (conversation) => { + try { + const conversationId = conversation?.conversation_id || reportId; + console.log("Fetching UMAP data for spatial filtering..."); + + const response = await fetch( + `/api/v3/topicMod/proximity?conversation_id=${conversationId}&layer_id=all` + ); + const data = await response.json(); + + if (data.status === "success" && data.proximity_data) { + console.log(`Loaded ${data.proximity_data.length} UMAP points for spatial filtering`); + setUmapData(data.proximity_data); + + const groups = groupPointsByLayer(data.proximity_data); + setClusterGroups(groups); + + console.log("UMAP cluster groups:", groups); + } else { + console.log("No UMAP data available for spatial filtering"); + } + } catch (err) { + console.error("Error fetching UMAP data:", err); + } + }, [reportId]); + + useEffect(() => { + if (!reportId) return; + + setLoading(true); + net + .polisGet("/api/v3/delphi", { + report_id: reportId, + }) + .then((response) => { + console.log("TopicAgenda topics response:", response); + + if (response && response.status === "success") { + if (response.runs && Object.keys(response.runs).length > 0) { + setTopicData(response); + analyzeHierarchy(response); + } else { + setError("No LLM topic data available yet. 
/**
 * Extract archetypal comments from topic selections.
 * These serve as stable anchor points across Delphi runs: topic names and
 * cluster ids change between runs, but the underlying comment ids do not,
 * so persisting comment ids keeps user intent stable.
 *
 * Strategy per selected topic:
 *   1. Locate its cluster's points in UMAP space.
 *   2. Rank points by distance to the cluster centroid.
 *   3. Keep the (up to) 3 most central comments as archetypes.
 *
 * @param selections  iterable of topic keys ("layer3_9" or "uuid#2#3" format)
 * @param topicData   currently unused; kept for interface compatibility
 * @param clusterGroups  { [layerId]: Map<"layer_cluster", point[]> }
 * @param commentMap  Map of comment id (number and/or string) -> comment text
 * @returns array of { topicKey, layerId, clusterId, archetypes[] }
 */
export const extractArchetypalComments = (selections, topicData, clusterGroups, commentMap = new Map()) => {
  // Geometry helpers, inlined so this unit is self-contained.
  const centroidOf = (points) => {
    if (!points || points.length === 0) return null;
    let sumX = 0;
    let sumY = 0;
    for (const p of points) {
      sumX += p.umap_x;
      sumY += p.umap_y;
    }
    return { x: sumX / points.length, y: sumY / points.length };
  };
  const euclidean = (a, b) => {
    const dx = a.x - b.x;
    const dy = a.y - b.y;
    return Math.sqrt(dx * dx + dy * dy);
  };

  const results = [];

  for (const topicKey of selections) {
    // Two key formats are in circulation:
    //   new: "layer3_9"                                  -> layer 3, cluster "9"
    //   old: "4c5b018b-...#2#3" (uuid#layer#cluster)     -> layer 2, cluster "3"
    let layerId;
    let clusterId;
    if (topicKey.startsWith('layer')) {
      const m = topicKey.match(/layer(\d+)_(\d+)/);
      if (m) {
        layerId = parseInt(m[1]);
        clusterId = m[2];
      }
    } else {
      const pieces = topicKey.split('#');
      if (pieces.length >= 3) {
        layerId = parseInt(pieces[pieces.length - 2]);
        clusterId = pieces[pieces.length - 1];
      }
    }
    if (layerId === undefined || clusterId === undefined) continue;

    const clusterKey = `${layerId}_${clusterId}`;
    const clusterPoints = clusterGroups[layerId]?.get(clusterKey);
    if (!clusterPoints || clusterPoints.length === 0) {
      console.log(`No cluster points found for layer ${layerId}, cluster ${clusterId}`);
      continue;
    }

    const centroid = centroidOf(clusterPoints);
    if (!centroid) {
      console.log(`No cluster points found for ${clusterKey}`);
      continue;
    }

    // Rank every point by centrality and keep the top 3 as archetypes.
    const ranked = clusterPoints
      .map((point) => ({
        ...point,
        distanceToCentroid: euclidean({ x: point.umap_x, y: point.umap_y }, centroid),
      }))
      .sort((a, b) => a.distanceToCentroid - b.distanceToCentroid)
      .slice(0, Math.min(3, clusterPoints.length));

    results.push({
      topicKey,
      layerId,
      clusterId,
      archetypes: ranked.map((a) => {
        // comment_id may be stored as number or string; try both lookups.
        const commentText =
          commentMap.get(a.comment_id) ||
          commentMap.get(parseInt(a.comment_id)) ||
          commentMap.get(String(a.comment_id)) ||
          a.comment_text ||
          `[Comment ${a.comment_id}]`;
        console.log(`Archetype comment ${a.comment_id}: "${commentText}"`);
        return {
          commentId: a.comment_id,
          text: commentText,
          distance: a.distanceToCentroid,
          coordinates: { x: a.umap_x, y: a.umap_y },
        };
      }),
    });
  }

  return results;
};

/**
 * Flatten archetypal comment groups into a storage-friendly shape that
 * survives Delphi re-runs (comment ids + coordinates, no topic ids).
 *
 * @param archetypeComments  output of extractArchetypalComments
 * @returns { version, timestamp, anchors[], totalSelections }
 */
export const serializeArchetypes = (archetypeComments) => {
  const stableAnchors = archetypeComments.flatMap((group) =>
    group.archetypes.map((archetype) => ({
      commentId: archetype.commentId,
      text: archetype.text, // included for debugging only
      coordinates: archetype.coordinates,
      sourceLayer: group.layerId,
      sourceCluster: group.clusterId,
    }))
  );

  return {
    version: 1,
    timestamp: new Date().toISOString(),
    anchors: stableAnchors,
    totalSelections: archetypeComments.length,
  };
};
// ---- UMAP geometry / topic helpers (topicUtils) ----

/** Mean position of a set of UMAP points, or null when the set is empty/missing. */
export const calculateClusterCentroid = (clusterPoints) => {
  if (!clusterPoints || clusterPoints.length === 0) return null;
  let sumX = 0;
  let sumY = 0;
  for (const p of clusterPoints) {
    sumX += p.umap_x;
    sumY += p.umap_y;
  }
  return { x: sumX / clusterPoints.length, y: sumY / clusterPoints.length };
};

/** Euclidean distance between two {x, y} points. */
export const calculateDistance = (point1, point2) => {
  const dx = point1.x - point2.x;
  const dy = point1.y - point2.y;
  return Math.sqrt(dx * dx + dy * dy);
};

/** Number of comment points assigned to cluster `layerId_clusterId`; 0 when unknown. */
export const getCommentCount = (layerId, clusterId, clusterGroups) => {
  const points = clusterGroups[layerId]?.get(`${layerId}_${clusterId}`);
  return points ? points.length : 0;
};

/**
 * Strip a leading "<layer>_<cluster>" prefix (and an optional ": " after it)
 * from a topic name; fall back to "Topic <clusterId>" when the name is empty.
 */
export const cleanTopicDisplayName = (topicName, layerId, clusterId) => {
  if (!topicName) return `Topic ${clusterId}`;
  const prefix = `${layerId}_${clusterId}`;
  return topicName.startsWith(prefix)
    ? topicName.substring(prefix.length).replace(/^:\s*/, '')
    : topicName;
};

// ---- Spatial topic filtering (topicFiltering) ----

/**
 * Filter/rank a layer's topics by spatial proximity (in UMAP space) to the
 * topics banked in the layer directly above.
 *
 * Behavior by layer:
 *   - coarsest layer, or nothing banked yet: every topic passes through
 *     unscored (proximityScore null, source 'all');
 *   - second-coarsest layer: every topic is kept but scored and sorted;
 *   - deeper layers: only topics within `adaptiveDistance` of some banked
 *     centroid (source 'close') are kept.
 * Results are sorted closest-first with unscored entries last.
 *
 * @param allTopics          { [clusterId]: topic } for this layer
 * @param layerId            numeric layer being rendered
 * @param hierarchyAnalysis  { layers: number[] } (null tolerated)
 * @param bankedTopics       Map<layerId, Set<topicKey>>
 * @param clusterGroups      { [layerId]: Map<"layer_cluster", point[]> }
 */
export const getFilteredTopics = (allTopics, layerId, hierarchyAnalysis, bankedTopics, clusterGroups) => {
  const maxLayer = hierarchyAnalysis ? Math.max(...hierarchyAnalysis.layers) : layerId;

  const unfiltered = () =>
    Object.entries(allTopics).map(([clusterId, topic]) => ({
      clusterId,
      topic,
      proximityScore: null,
      source: 'all',
    }));

  // Coarsest layer, or nothing banked yet: no spatial filtering applies.
  if (layerId === maxLayer || bankedTopics.size === 0) return unfiltered();

  const higherLayerId = layerId + 1; // NOTE(review): assumes contiguous layer ids — confirm
  const bankedFromHigherLayer = bankedTopics.get(higherLayerId);
  if (!bankedFromHigherLayer || !clusterGroups[higherLayerId] || !clusterGroups[layerId]) {
    return unfiltered();
  }

  const adaptiveDistance = 4.0;

  // Extract the cluster id from a banked topic key.
  //   "2_uuid#2#6" -> "6"   (old '#'-separated format)
  //   "2_6"        -> "6"   ('_'-separated format)
  const bankedClusterIdOf = (key) => {
    if (key.includes('#')) {
      const parts = key.split('#');
      return parts[parts.length - 1];
    }
    if (key.includes('_')) {
      const parts = key.split('_');
      return parts[parts.length - 1];
    }
    return undefined;
  };

  const scored = Object.entries(allTopics).map(([clusterId, topic]) => {
    const targetPoints = clusterGroups[layerId].get(`${layerId}_${clusterId}`);
    const targetCentroid =
      targetPoints && targetPoints.length > 0 ? calculateClusterCentroid(targetPoints) : null;

    // Find the nearest banked cluster centroid, if any.
    let best = Infinity;
    let closest = null;
    if (targetCentroid) {
      bankedFromHigherLayer.forEach((bankedTopicKey) => {
        const bankedClusterKey = `${higherLayerId}_${bankedClusterIdOf(bankedTopicKey)}`;
        const bankedPoints = clusterGroups[higherLayerId].get(bankedClusterKey);
        if (!bankedPoints || bankedPoints.length === 0) return;
        const bankedCentroid = calculateClusterCentroid(bankedPoints);
        if (!bankedCentroid) return;
        const d = calculateDistance(targetCentroid, bankedCentroid);
        if (d < best) {
          best = d;
          closest = bankedClusterKey;
        }
      });
    }

    return {
      clusterId,
      topic,
      proximityScore: best === Infinity ? null : best,
      closestBankedTopic: closest,
      source: best !== Infinity && best <= adaptiveDistance ? 'close' : 'far',
    };
  });

  // Second-coarsest layer keeps everything; deeper layers keep only 'close'.
  const candidates = layerId === maxLayer - 1 ? scored : scored.filter((t) => t.source === 'close');

  // Closest first; unscored entries sink to the end.
  return candidates.sort((a, b) => {
    if (a.proximityScore === null && b.proximityScore === null) return 0;
    if (a.proximityScore === null) return 1;
    if (b.proximityScore === null) return -1;
    return a.proximityScore - b.proximityScore;
  });
};
useState({ + 0: true, + 1: true, + 2: true, + 3: true + }); + const [visualizationType, setVisualizationType] = useState('hulls'); // 'density' or 'hulls' + const [densityLayerVisibility, setDensityLayerVisibility] = useState({ + 0: false, + 1: false, + 2: false, + 3: true // Only layer 3 by default + }); + const [topicNames, setTopicNames] = useState(new Map()); + const circlePackRef = useRef(null); + const umapRef = useRef(null); + const densityRef = useRef(null); + + useEffect(() => { + if (!report_id) return; + fetchHierarchyData(); + }, [report_id]); + + // Fetch hierarchical cluster structure from DynamoDB (from TopicPrioritize.jsx - working version) + const fetchHierarchyData = async () => { + try { + // Use the zinvite from conversation data instead of report_id + const conversationId = conversation?.conversation_id || report_id; + const response = await fetch(`/api/v3/topicMod/hierarchy?conversation_id=${conversationId}`); + const data = await response.json(); + + if (data.status === "success" && data.hierarchy) { + setHierarchyData(data); + console.log("Hierarchy data loaded successfully:", data); + console.log("Setting hierarchyData state with:", data); + + // Also fetch topic names for better labeling + try { + const topicsResponse = await fetch(`/api/v3/topicMod/topics?conversation_id=${conversationId}`); + const topicsData = await topicsResponse.json(); + + if (topicsData.status === "success" && topicsData.topics_by_layer) { + // Create topic name lookup map from topics_by_layer + const topicNameMap = new Map(); + Object.entries(topicsData.topics_by_layer).forEach(([layer, topics]) => { + topics.forEach(topic => { + const key = `layer${layer}_${topic.cluster_id}`; + topicNameMap.set(key, topic.topic_name); + }); + }); + + // Store topic names in state for density visualization + setTopicNames(topicNameMap); + + // Add topic names to hierarchy + const addTopicNames = (node) => { + const key = `layer${node.layer}_${node.clusterId}`; + if 
(topicNameMap.has(key)) { + node.topic_name = topicNameMap.get(key); + } + if (node.children) { + node.children.forEach(addTopicNames); + } + }; + + addTopicNames(data.hierarchy); + } + } catch (topicErr) { + console.log("Could not fetch topic names, proceeding without them:", topicErr); + } + + // Fetch UMAP data for all clusters + await fetchUMAPData(conversationId); + } else { + console.log("No hierarchy data available:", data.message); + setError("No hierarchy data available"); + } + setLoading(false); + } catch (err) { + console.error("Error fetching hierarchy data:", err); + setError("Failed to load hierarchy data"); + setLoading(false); + } + }; + + // Fetch UMAP coordinates for ALL comments + const fetchUMAPData = async (conversationId) => { + try { + console.log("Fetching ALL UMAP coordinates..."); + const response = await fetch(`/api/v3/topicMod/proximity?conversation_id=${conversationId}&layer_id=all`); + const data = await response.json(); + + console.log("CLIENT DEBUG: UMAP response received with", data.proximity_data?.length, "items"); + console.log("CLIENT DEBUG: Response status:", data.status); + console.log("CLIENT DEBUG: Response message:", data.message); + + // Log first few items in detail + if (data.proximity_data && data.proximity_data.length > 0) { + console.log("CLIENT DEBUG: First 3 data points:", data.proximity_data.slice(0, 3)); + + // Check structure of first item + const firstItem = data.proximity_data[0]; + console.log("CLIENT DEBUG: First item structure:"); + console.log(" - comment_id:", firstItem.comment_id); + console.log(" - umap_x:", firstItem.umap_x); + console.log(" - umap_y:", firstItem.umap_y); + console.log(" - clusters:", firstItem.clusters); + console.log(" - clusters type:", typeof firstItem.clusters); + console.log(" - clusters keys:", Object.keys(firstItem.clusters || {})); + } + + if (data.status === "success" && data.proximity_data) { + // Debug: Check cluster assignments + const samplePoints = 
data.proximity_data.slice(0, 5); + console.log("Sample points with clusters:", samplePoints.map(p => ({ + comment_id: p.comment_id, + clusters: p.clusters, + cluster_keys: Object.keys(p.clusters || {}), + cluster_count: Object.keys(p.clusters || {}).length, + raw_point: p // Show the whole point structure + }))); + + // Count how many points have cluster assignments + const pointsWithClusters = data.proximity_data.filter(p => Object.keys(p.clusters || {}).length > 0); + console.log(`Points with cluster assignments: ${pointsWithClusters.length} / ${data.proximity_data.length}`); + + if (pointsWithClusters.length === 0) { + console.log("No cluster assignments found! Using raw coordinates and assigning all to layer 0"); + // Fallback: show all points as layer 0 if no cluster assignments + const fallbackData = data.proximity_data.map(point => ({ + comment_id: point.comment_id, + cluster_id: 0, + layer: 0, + umap_x: point.umap_x, + umap_y: point.umap_y, + weight: point.weight + })); + console.log("Fallback data:", fallbackData.length, "points"); + console.log("Sample fallback point:", fallbackData[0]); + setUmapData(fallbackData); + return; + } + + // Process the data to create points for each layer based on cluster assignments + const processedData = []; + + data.proximity_data.forEach(point => { + // Create a point for each layer where this comment has a cluster assignment + Object.entries(point.clusters || {}).forEach(([layerId, clusterId]) => { + processedData.push({ + comment_id: point.comment_id, + cluster_id: clusterId, + layer: parseInt(layerId), + umap_x: point.umap_x, + umap_y: point.umap_y, + weight: point.weight + }); + }); + }); + + console.log("UMAP data loaded:", processedData.length, "layer-comment assignments"); + console.log("Raw comments:", data.proximity_data.length); + console.log("Sample processed point:", processedData[0]); + setUmapData(processedData); + } else { + console.log("No UMAP data:", data.message); + setUmapData([]); + } + } catch (err) 
{ + console.error("Error fetching UMAP data:", err); + } + }; + + // Toggle layer visibility + const toggleLayerVisibility = (layerId) => { + setLayerVisibility(prev => ({ + ...prev, + [layerId]: !prev[layerId] + })); + }; + + // Toggle density layer visibility + const toggleDensityLayerVisibility = (layerId) => { + setDensityLayerVisibility(prev => ({ + ...prev, + [layerId]: !prev[layerId] + })); + }; + + // Create UMAP spatial visualization with Canvas for performance + const createUMAPVisualization = () => { + if (!umapData || !umapRef.current) return; + + if (umapData.length === 0) { + console.log("No UMAP data to visualize"); + return; + } + + console.log("Creating Canvas UMAP visualization with", umapData.length, "points"); + + // Generate colors similar to datamapplot's approach + const generateClusterColor = (clusterId, layer) => { + // Use a color palette similar to datamapplot + const baseColors = [ + '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', + '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#ff9999', '#66b3ff', + '#99ff99', '#ffcc99', '#ff99cc', '#c2c2f0', '#ffb3e6', '#c2f0c2', + '#ffd9b3', '#b3b3ff', '#ffb3b3', '#b3ffb3', '#ffccb3', '#ccb3ff' + ]; + + // Ensure we have valid inputs + if (typeof clusterId !== 'number' || typeof layer !== 'number') { + return '#999999'; // Default gray color + } + + // Create a deterministic color based on cluster ID + const colorIndex = (clusterId * 7 + layer * 3) % baseColors.length; + return baseColors[colorIndex]; + }; + + // Clear previous visualization + umapRef.current.innerHTML = ''; + + const size = 800; // Square canvas + const width = size; + const height = size; + const margin = { top: 20, right: 20, bottom: 20, left: 20 }; + + // Create canvas + const canvas = select(umapRef.current) + .append("canvas") + .attr("width", width) + .attr("height", height) + .style("width", "100%") + .style("height", "auto") + .style("border", "1px solid #ddd"); + + const context = 
canvas.node().getContext("2d"); + + // Enable high DPI + const devicePixelRatio = window.devicePixelRatio || 1; + canvas.attr("width", width * devicePixelRatio) + .attr("height", height * devicePixelRatio); + context.scale(devicePixelRatio, devicePixelRatio); + + // Create scales + const xExtent = extent(umapData, d => d.umap_x); + const yExtent = extent(umapData, d => d.umap_y); + + console.log("UMAP data extents:", { xExtent, yExtent }); + + const xScale = scaleLinear() + .domain(xExtent) + .range([margin.left, width - margin.right]); + + const yScale = scaleLinear() + .domain(yExtent) + .range([height - margin.bottom, margin.top]); + + // Clear canvas + context.clearRect(0, 0, width, height); + + // Get unique raw comment coordinates (without layer duplicates) + const uniqueComments = new Map(); + umapData.forEach(point => { + const key = `${point.comment_id}`; + if (!uniqueComments.has(key)) { + uniqueComments.set(key, { + comment_id: point.comment_id, + umap_x: point.umap_x, + umap_y: point.umap_y, + clusters_by_layer: {} + }); + } + uniqueComments.get(key).clusters_by_layer[point.layer] = point.cluster_id; + }); + + const uniquePoints = Array.from(uniqueComments.values()); + console.log(`Drawing ${uniquePoints.length} unique comments with cluster assignments for each layer`); + + // Group points by cluster for each layer to draw hulls + const clusterGroups = {}; + for (let layer = 0; layer <= 3; layer++) { + clusterGroups[layer] = new Map(); + + uniquePoints.forEach(point => { + const clusterId = point.clusters_by_layer[layer]; + if (clusterId !== undefined) { + const key = `L${layer}C${clusterId}`; + if (!clusterGroups[layer].has(key)) { + clusterGroups[layer].set(key, []); + } + clusterGroups[layer].get(key).push(point); + } + }); + } + + // Debug: Show cluster distribution + for (let layer = 0; layer <= 3; layer++) { + const clusters = clusterGroups[layer]; + console.log(`Layer ${layer}: ${clusters.size} clusters`); + + // Show first few clusters and their 
sizes + let count = 0; + clusters.forEach((points, clusterKey) => { + if (count < 3) { + console.log(` ${clusterKey}: ${points.length} points`); + count++; + } + }); + } + + // Draw convex hulls for each individual cluster in each layer + const layerColors = ["#ff6b6b", "#4ecdc4", "#45b7d1", "#96ceb4"]; + const layerAlphas = [0.1, 0.15, 0.2, 0.25]; // Different opacities to show containment + const layerLineWidths = [0.5, 1, 1.5, 2]; // Different line weights + + // Draw hulls from coarsest to finest (3 → 0) so finer hulls appear on top + for (let layer = 3; layer >= 0; layer--) { + // Skip this layer if it's not visible + if (!layerVisibility[layer]) continue; + + const clusters = clusterGroups[layer]; + + console.log(`Drawing ${clusters.size} individual cluster hulls for Layer ${layer}`); + + clusters.forEach((points, clusterKey) => { + if (points.length < 3) return; // Need at least 3 points for hull + + const hullPoints = points.map(p => [xScale(p.umap_x), yScale(p.umap_y)]); + const hull = polygonHull(hullPoints); + + if (hull && hull.length > 2) { + context.beginPath(); + context.moveTo(hull[0][0], hull[0][1]); + for (let i = 1; i < hull.length; i++) { + context.lineTo(hull[i][0], hull[i][1]); + } + context.closePath(); + + // Fill hull with layer color and alpha + context.fillStyle = layerColors[layer]; + context.globalAlpha = layerAlphas[layer]; + context.fill(); + + // Stroke hull with layer color and line width + context.strokeStyle = layerColors[layer]; + context.globalAlpha = 0.7; + context.lineWidth = layerLineWidths[layer]; + context.stroke(); + } + }); + } + + // Reset alpha for points + context.globalAlpha = 1.0; + + // Draw all points in neutral color since they belong to multiple clusters + uniquePoints.forEach(point => { + const x = xScale(point.umap_x); + const y = yScale(point.umap_y); + + context.beginPath(); + context.arc(x, y, 1.5, 0, 2 * Math.PI); + context.fillStyle = "#333"; + context.globalAlpha = 0.7; + context.fill(); + }); + + // Add 
legend with toggle controls outside the canvas + const containerDiv = select(umapRef.current); + const legendDiv = containerDiv + .append("div") + .style("margin-top", "20px") + .style("background", "rgba(255,255,255,0.95)") + .style("padding", "15px") + .style("border-radius", "8px") + .style("box-shadow", "0 4px 8px rgba(0,0,0,0.2)") + .style("font-size", "13px") + .style("border", "1px solid #ddd") + .style("max-width", "300px"); + + legendDiv.append("div") + .style("font-weight", "bold") + .style("margin-bottom", "10px") + .style("font-size", "14px") + .style("color", "#333") + .text("Hull Layer Controls"); + + [3, 2, 1, 0].forEach((layer, i) => { // Show from coarsest to finest + const item = legendDiv.append("div") + .style("display", "flex") + .style("align-items", "center") + .style("margin", "6px 0") + .style("padding", "3px") + .style("border-radius", "4px") + .style("background", layerVisibility[layer] ? "rgba(0,0,0,0.02)" : "rgba(0,0,0,0.05)") + .style("cursor", "pointer") + .on("click", () => { + toggleLayerVisibility(layer); + }); + + // Checkbox indicator + const layerColors = ["#ff6b6b", "#4ecdc4", "#45b7d1", "#96ceb4"]; + const checkbox = item.append("div") + .style("width", "16px") + .style("height", "16px") + .style("border", "2px solid #ccc") + .style("border-radius", "3px") + .style("margin-right", "8px") + .style("display", "flex") + .style("align-items", "center") + .style("justify-content", "center") + .style("background", layerVisibility[layer] ? layerColors[layer] : "white") + .style("border-color", layerColors[layer]); + + if (layerVisibility[layer]) { + checkbox.append("div") + .style("width", "8px") + .style("height", "8px") + .style("background", "white") + .style("border-radius", "1px"); + } + + // Color indicator showing colors for this layer + const colorBox = item.append("div") + .style("width", "20px") + .style("height", "12px") + .style("background", layerColors[layer]) + .style("opacity", layerVisibility[layer] ? 
"0.8" : "0.3") + .style("border", "1px solid #ccc") + .style("margin-right", "8px") + .style("border-radius", "2px"); + + // Label + item.append("span") + .style("color", layerVisibility[layer] ? "#333" : "#999") + .style("font-weight", layerVisibility[layer] ? "500" : "normal") + .text(`Layer ${layer} ${layer === 0 ? '(Finest)' : layer === 3 ? '(Coarsest)' : ''}`); + }); + + // Add basic interactivity with mouse tracking + canvas.on("mousemove", function() { + const mousePos = d3Mouse(this); + const x = mousePos[0]; + const y = mousePos[1]; + + // Convert back to data coordinates + const dataX = xScale.invert(x); + const dataY = yScale.invert(y); + + // Find closest point (simple implementation) + let closestPoint = null; + let minDistance = Infinity; + + umapData.forEach(point => { + const distance = Math.sqrt( + Math.pow(point.umap_x - dataX, 2) + + Math.pow(point.umap_y - dataY, 2) + ); + if (distance < minDistance && distance < 1.0) { // Within reasonable distance + minDistance = distance; + closestPoint = point; + } + }); + + // Update cursor + canvas.style("cursor", closestPoint ? 
"pointer" : "default"); + }); + + console.log("Canvas UMAP visualization rendered successfully"); + }; + + // Create separate density visualization + const createDensityVisualization = () => { + if (!umapData || !densityRef.current) return; + + if (umapData.length === 0) { + console.log("No UMAP data to visualize"); + return; + } + + console.log("Creating Canvas density visualization with", umapData.length, "points"); + + // Generate colors similar to datamapplot's approach + const generateClusterColor = (clusterId, layer) => { + const baseColors = [ + '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', + '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#ff9999', '#66b3ff', + '#99ff99', '#ffcc99', '#ff99cc', '#c2c2f0', '#ffb3e6', '#c2f0c2', + '#ffd9b3', '#b3b3ff', '#ffb3b3', '#b3ffb3', '#ffccb3', '#ccb3ff' + ]; + + if (typeof clusterId !== 'number' || typeof layer !== 'number') { + return '#999999'; + } + + const colorIndex = (clusterId * 7 + layer * 3) % baseColors.length; + return baseColors[colorIndex]; + }; + + // Clear previous visualization + densityRef.current.innerHTML = ''; + + const size = 800; + const width = size; + const height = size; + const margin = { top: 20, right: 20, bottom: 20, left: 20 }; + + // Create canvas + const canvas = select(densityRef.current) + .append("canvas") + .attr("width", width) + .attr("height", height) + .style("width", "100%") + .style("height", "auto") + .style("border", "1px solid #ddd"); + + const context = canvas.node().getContext("2d"); + + // Enable high DPI + const devicePixelRatio = window.devicePixelRatio || 1; + canvas.attr("width", width * devicePixelRatio) + .attr("height", height * devicePixelRatio); + context.scale(devicePixelRatio, devicePixelRatio); + + // Create scales + const xExtent = extent(umapData, d => d.umap_x); + const yExtent = extent(umapData, d => d.umap_y); + + const xScale = scaleLinear() + .domain(xExtent) + .range([margin.left, width - margin.right]); + + const yScale = 
scaleLinear() + .domain(yExtent) + .range([height - margin.bottom, margin.top]); + + // Clear canvas + context.clearRect(0, 0, width, height); + + // Get unique raw comment coordinates + const uniqueComments = new Map(); + umapData.forEach(point => { + const key = `${point.comment_id}`; + if (!uniqueComments.has(key)) { + uniqueComments.set(key, { + comment_id: point.comment_id, + umap_x: point.umap_x, + umap_y: point.umap_y, + clusters_by_layer: {} + }); + } + uniqueComments.get(key).clusters_by_layer[point.layer] = point.cluster_id; + }); + + const uniquePoints = Array.from(uniqueComments.values()); + + // Group points by cluster for each layer + const clusterGroups = {}; + for (let layer = 0; layer <= 3; layer++) { + clusterGroups[layer] = new Map(); + + uniquePoints.forEach(point => { + const clusterId = point.clusters_by_layer[layer]; + if (clusterId !== undefined) { + const key = `L${layer}C${clusterId}`; + if (!clusterGroups[layer].has(key)) { + clusterGroups[layer].set(key, []); + } + clusterGroups[layer].get(key).push(point); + } + }); + } + + // Create 2D density plots only for visible layers + const densityRadius = 25; + const gridSize = 4; + + // Draw density from coarsest to finest (3 → 0) so finer densities appear on top + for (let layer = 3; layer >= 0; layer--) { + // Skip this layer if it's not visible + if (!densityLayerVisibility[layer]) continue; + + const clusters = clusterGroups[layer]; + + console.log(`Drawing density plots for ${clusters.size} clusters in Layer ${layer}`); + + clusters.forEach((points, clusterKey) => { + if (points.length < 2) return; + + const clusterIdMatch = clusterKey.match(/C(\d+)/); + const clusterId = clusterIdMatch ? 
parseInt(clusterIdMatch[1]) : 0; + const clusterColor = generateClusterColor(clusterId, layer); + + if (!clusterColor || typeof clusterColor !== 'string') { + console.warn(`Invalid color generated for cluster ${clusterKey}`); + return; + } + + // Create density map for this cluster + const densityMap = new Map(); + + // Calculate density at grid points + for (let x = margin.left; x < width - margin.right; x += gridSize) { + for (let y = margin.top; y < height - margin.bottom; y += gridSize) { + let density = 0; + const gridKey = `${x},${y}`; + + points.forEach(point => { + const px = xScale(point.umap_x); + const py = yScale(point.umap_y); + const distance = Math.sqrt((x - px) ** 2 + (y - py) ** 2); + + if (distance <= densityRadius) { + density += Math.exp(-(distance ** 2) / (2 * (densityRadius / 3) ** 2)); + } + }); + + if (density > 0.1) { + densityMap.set(gridKey, density); + } + } + } + + // Draw contour lines instead of filled density + const maxDensity = Math.max(...densityMap.values()); + if (maxDensity > 0) { + // Create contour levels (like topographic lines) + const contourLevels = [0.2, 0.4, 0.6, 0.8].map(level => level * maxDensity); + + contourLevels.forEach((level, levelIndex) => { + // Find grid points at this density level + const contourPoints = []; + densityMap.forEach((density, gridKey) => { + if (Math.abs(density - level) < maxDensity * 0.1) { // Within 10% of level + const [x, y] = gridKey.split(',').map(Number); + contourPoints.push([x, y]); + } + }); + + // Draw contour lines + if (contourPoints.length > 2) { + try { + const hull = d3.polygonHull(contourPoints); + if (hull && hull.length > 2) { + context.beginPath(); + context.moveTo(hull[0][0], hull[0][1]); + for (let i = 1; i < hull.length; i++) { + context.lineTo(hull[i][0], hull[i][1]); + } + context.closePath(); + + // Draw contour line + context.strokeStyle = clusterColor; + context.lineWidth = 2 - (levelIndex * 0.3); // Thicker lines for higher density + context.globalAlpha = 0.6; + 
context.stroke(); + + // Light fill for the innermost contour + if (levelIndex === contourLevels.length - 1) { + const hex = clusterColor.replace('#', ''); + const r = parseInt(hex.substr(0, 2), 16); + const g = parseInt(hex.substr(2, 2), 16); + const b = parseInt(hex.substr(4, 2), 16); + context.fillStyle = `rgba(${r}, ${g}, ${b}, 0.1)`; + context.fill(); + } + } + } catch (error) { + console.error(`Error drawing contour for ${clusterKey}:`, error); + } + } + }); + } + }); + } + + // Reset alpha for points + context.globalAlpha = 1.0; + + // Draw all points with lighter color + uniquePoints.forEach(point => { + const x = xScale(point.umap_x); + const y = yScale(point.umap_y); + + context.beginPath(); + context.arc(x, y, 1, 0, 2 * Math.PI); + context.fillStyle = "#ccc"; // Much lighter gray + context.globalAlpha = 0.5; // More transparent + context.fill(); + }); + + // Get topic name from the stored topic names map + const getTopicName = (layer, clusterId) => { + const key = `layer${layer}_${clusterId}`; + return topicNames.get(key) || null; + }; + + // Draw topic names at cluster centroids + context.globalAlpha = 1.0; + context.font = "5px Arial"; // Even smaller font + context.textAlign = "center"; + context.textBaseline = "middle"; + + for (let layer = 3; layer >= 0; layer--) { + if (!densityLayerVisibility[layer]) continue; + + const clusters = clusterGroups[layer]; + + clusters.forEach((points, clusterKey) => { + if (points.length < 2) return; + + // Calculate centroid + const centroidX = points.reduce((sum, p) => sum + xScale(p.umap_x), 0) / points.length; + const centroidY = points.reduce((sum, p) => sum + yScale(p.umap_y), 0) / points.length; + + // Get cluster ID and topic name + const clusterIdMatch = clusterKey.match(/C(\d+)/); + const clusterId = clusterIdMatch ? 
parseInt(clusterIdMatch[1]) : 0; + const topicName = getTopicName(layer, clusterId); + + // Format: "3_7: Transportation" or just the topic name if it doesn't already include the layer_cluster + let label; + if (topicName) { + // Check if topic name already includes the layer_cluster format + const layerClusterPrefix = `${layer}_${clusterId}`; + if (topicName.startsWith(layerClusterPrefix)) { + label = topicName; // Already formatted + } else { + label = `${layerClusterPrefix}: ${topicName}`; + } + } else { + label = `${layer}_${clusterId}`; + } + + // Draw text with subtle background for readability + const textMetrics = context.measureText(label); + const padding = 1; // Much smaller padding + const bgWidth = textMetrics.width + (padding * 2); + const bgHeight = 6; // Much smaller height for tiny font + + // Draw very subtle background + context.fillStyle = "rgba(255, 255, 255, 0.1)"; // Much more transparent + context.fillRect( + centroidX - bgWidth/2, + centroidY - bgHeight/2, + bgWidth, + bgHeight + ); + + // Draw text with white stroke outline + context.lineWidth = 1; // Thinner stroke for tiny text + context.strokeStyle = "white"; + context.strokeText(label, centroidX, centroidY); + + // Draw text fill + context.fillStyle = "#333"; + context.fillText(label, centroidX, centroidY); + }); + } + + // Add legend for density visualization + const legendDiv = select(densityRef.current) + .append("div") + .style("margin-top", "20px") + .style("background", "rgba(255,255,255,0.95)") + .style("padding", "15px") + .style("border-radius", "8px") + .style("box-shadow", "0 4px 8px rgba(0,0,0,0.2)") + .style("font-size", "13px") + .style("border", "1px solid #ddd") + .style("max-width", "300px"); + + legendDiv.append("div") + .style("font-weight", "bold") + .style("margin-bottom", "10px") + .style("font-size", "14px") + .style("color", "#333") + .text("Density Layer Controls"); + + [3, 2, 1, 0].forEach((layer, i) => { + const item = legendDiv.append("div") + 
.style("display", "flex") + .style("align-items", "center") + .style("margin", "6px 0") + .style("padding", "3px") + .style("border-radius", "4px") + .style("background", densityLayerVisibility[layer] ? "rgba(0,0,0,0.02)" : "rgba(0,0,0,0.05)") + .style("cursor", "pointer") + .on("click", () => { + toggleDensityLayerVisibility(layer); + }); + + // Checkbox indicator + const checkbox = item.append("div") + .style("width", "16px") + .style("height", "16px") + .style("border", "2px solid #ccc") + .style("border-radius", "3px") + .style("margin-right", "8px") + .style("display", "flex") + .style("align-items", "center") + .style("justify-content", "center") + .style("background", densityLayerVisibility[layer] ? generateClusterColor(0, layer) : "white") + .style("border-color", generateClusterColor(0, layer)); + + if (densityLayerVisibility[layer]) { + checkbox.append("div") + .style("width", "8px") + .style("height", "8px") + .style("background", "white") + .style("border-radius", "1px"); + } + + // Color indicator + const colorBox = item.append("div") + .style("width", "20px") + .style("height", "12px") + .style("background", `linear-gradient(45deg, ${generateClusterColor(0, layer)}, ${generateClusterColor(1, layer)}, ${generateClusterColor(2, layer)})`) + .style("opacity", densityLayerVisibility[layer] ? "0.8" : "0.3") + .style("border", "1px solid #ccc") + .style("margin-right", "8px") + .style("border-radius", "2px"); + + // Label + item.append("span") + .style("color", densityLayerVisibility[layer] ? "#333" : "#999") + .style("font-weight", densityLayerVisibility[layer] ? "500" : "normal") + .text(`Layer ${layer} ${layer === 0 ? '(Finest)' : layer === 3 ? 
'(Coarsest)' : ''}`); + }); + + console.log("Canvas density visualization rendered successfully"); + }; + + // Create D3.js circle pack visualization (from TopicPrioritize.jsx - working version) + const createCirclePack = () => { + if (!hierarchyData || !circlePackRef.current) return; + + // Clear previous visualization + select(circlePackRef.current).selectAll("*").remove(); + + const width = 800; + const height = 600; + + // Create SVG + const svg = select(circlePackRef.current) + .append("svg") + .attr("width", width) + .attr("height", height) + .attr("style", "border: 1px solid #ccc; border-radius: 8px;"); + + // Create hierarchy from data + const hierarchyRoot = hierarchy(hierarchyData.hierarchy) + .sum(d => d.size || 1) // Use cluster size for circle size + .sort((a, b) => b.value - a.value); + + // Create pack layout + const packLayout = pack() + .size([width - 20, height - 20]) + .padding(3); + + const nodes = packLayout(hierarchyRoot); + + // Color scale by layer + const colorScale = scaleOrdinal() + .domain([0, 1, 2, 3]) + .range(["#ff6b6b", "#4ecdc4", "#45b7d1", "#96ceb4"]); + + // Create groups for each node + const nodeGroups = svg.selectAll("g") + .data(nodes.descendants()) + .enter() + .append("g") + .attr("transform", d => `translate(${d.x + 10},${d.y + 10})`); + + // Add circles + nodeGroups.append("circle") + .attr("r", d => d.r) + .attr("fill", d => { + if (d.depth === 0) return "#f8f9fa"; // Root + return colorScale(d.data.layer); + }) + .attr("stroke", d => d.depth === 0 ? "#dee2e6" : "#343a40") + .attr("stroke-width", d => d.depth === 0 ? 2 : 1) + .attr("fill-opacity", d => d.depth === 0 ? 
0.1 : 0.7) + .style("cursor", "pointer") + .on("click", function(event, d) { + if (d.data.layer !== undefined) { + console.log("Clicked cluster:", d.data); + // setSelectedLayer(d.data.layer); // Comment out if this state doesn't exist in TopicHierarchy + } + }); + + // Add text labels for larger circles + nodeGroups.append("text") + .attr("text-anchor", "middle") + .attr("dy", "0.3em") + .attr("font-size", d => Math.min(d.r / 4, 12)) + .attr("fill", "#343a40") + .attr("font-weight", "bold") + .style("pointer-events", "none") + .text(d => { + if (d.depth === 0) return "Topics"; + if (d.r < 20) return ""; // Hide text for very small circles + return `L${d.data.layer} C${d.data.clusterId}`; + }); + + // Add size labels for larger circles + nodeGroups.append("text") + .attr("text-anchor", "middle") + .attr("dy", "1.5em") + .attr("font-size", d => Math.min(d.r / 6, 10)) + .attr("fill", "#6c757d") + .style("pointer-events", "none") + .text(d => { + if (d.depth === 0 || d.r < 25) return ""; + return `${d.data.size} comments`; + }); + + // Add legend + const legend = svg.append("g") + .attr("transform", `translate(${width - 150}, 20)`); + + legend.append("text") + .attr("font-weight", "bold") + .attr("font-size", "14") + .text("Layers"); + + const legendItems = legend.selectAll(".legend-item") + .data([ + { layer: 0, label: "Layer 0 (Finest)", color: "#ff6b6b" }, + { layer: 1, label: "Layer 1", color: "#4ecdc4" }, + { layer: 2, label: "Layer 2", color: "#45b7d1" }, + { layer: 3, label: "Layer 3 (Coarsest)", color: "#96ceb4" } + ]) + .enter() + .append("g") + .attr("class", "legend-item") + .attr("transform", (d, i) => `translate(0, ${20 + i * 20})`); + + legendItems.append("circle") + .attr("r", 8) + .attr("fill", d => d.color) + .attr("fill-opacity", 0.7); + + legendItems.append("text") + .attr("x", 15) + .attr("dy", "0.3em") + .attr("font-size", "12") + .text(d => d.label); + }; + + // Effect to create circle pack when hierarchy data is available and DOM is ready + 
useEffect(() => { + console.log("Circle pack useEffect triggered:", { + hierarchyData: !!hierarchyData, + hierarchyDataStructure: hierarchyData ? Object.keys(hierarchyData) : null, + refCurrent: !!circlePackRef.current + }); + + const tryCreateCirclePack = () => { + if (hierarchyData && circlePackRef.current) { + console.log("Attempting to create circle pack..."); + createCirclePack(); + return true; + } + console.log("Circle pack creation failed:", { + hierarchyData: !!hierarchyData, + refCurrent: !!circlePackRef.current + }); + return false; + }; + + if (hierarchyData) { + // Try immediately + if (!tryCreateCirclePack()) { + // If that fails, try with a delay + const timer = setTimeout(() => { + if (!tryCreateCirclePack()) { + console.log("Circle pack: ref still not available after timeout"); + } + }, 300); + + return () => clearTimeout(timer); + } + } + }, [hierarchyData]); + + // Effect to create UMAP visualization when data is available + useEffect(() => { + if (umapData) { + createUMAPVisualization(); + } + }, [umapData]); + + // Effect to re-render UMAP visualization when layer visibility changes + useEffect(() => { + if (umapData) { + createUMAPVisualization(); + } + }, [layerVisibility]); + + // Effect to create density visualization when data is available + useEffect(() => { + if (umapData) { + createDensityVisualization(); + } + }, [umapData, topicNames]); + + // Effect to re-render density visualization when density layer visibility changes + useEffect(() => { + if (umapData) { + createDensityVisualization(); + } + }, [densityLayerVisibility]); + + if (loading) { + return ( +
+

Topic Hierarchy

+
Loading hierarchical topic data...
+
+ ); + } + + if (error) { + return ( +
+

Topic Hierarchy

+
+

Error

+

{error}

+
+
+ ); + } + + return ( +
+
+

Topic Hierarchy

+
+ Interactive circle pack visualization of hierarchical topic clusters +
+
Report ID: {report_id}
+
+ +
+ {/* Density Visualization - First */} +
+

Topic Spatial Distribution - Contours

+

UMAP projection with topographic contour lines showing cluster density (Layer 3 coarsest by default)

+
+
+ + {/* UMAP Spatial Visualization */} +
+

Topic Spatial Distribution - Hulls

+

UMAP projection showing semantic neighborhoods with convex hulls around clusters

+
+
+ + {/* Circle Pack Visualization */} +
+

Topic Hierarchy

+

Nested circle pack showing hierarchical topic containment relationships

+
+
+
+ + +
+ ); +}; + +export default TopicHierarchy; \ No newline at end of file diff --git a/client-report/src/components/topicPrioritize/TopicPrioritize.jsx b/client-report/src/components/topicPrioritize/TopicPrioritize.jsx new file mode 100644 index 0000000000..7463f84f21 --- /dev/null +++ b/client-report/src/components/topicPrioritize/TopicPrioritize.jsx @@ -0,0 +1,1156 @@ +import React, { useState, useEffect, useRef } from "react"; +import net from "../../util/net"; +import { useReportId } from "../framework/useReportId"; +import CommentList from "../lists/commentList.jsx"; + +const TopicPrioritize = ({ math, comments, conversation, ptptCount, formatTid, voteColors }) => { + const { report_id } = useReportId(); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + const [topicData, setTopicData] = useState(null); + const [hierarchyAnalysis, setHierarchyAnalysis] = useState(null); + const [currentLayer, setCurrentLayer] = useState(null); // Will be set to highest available layer + const [topicPriorities, setTopicPriorities] = useState(new Map()); // Store topic priorities + const [selectedTopics, setSelectedTopics] = useState(new Set()); // Track selected topics for filtering + const [umapData, setUmapData] = useState(null); // UMAP coordinates for spatial filtering + const [clusterGroups, setClusterGroups] = useState({}); // Points grouped by layer and cluster + const [spatialMode, setSpatialMode] = useState('subset'); // 'subset' or 'sort' + + useEffect(() => { + if (!report_id) return; + + setLoading(true); + // Fetch topic data from Delphi endpoint (same as CommentsReport) + net + .polisGet("/api/v3/delphi", { + report_id: report_id, + }) + .then((response) => { + console.log("TopicMod topics response:", response); + + if (response && response.status === "success") { + if (response.runs && Object.keys(response.runs).length > 0) { + setTopicData(response); + analyzeHierarchy(response); + // Fetch UMAP data for spatial filtering 
+ fetchUMAPData(); + } else { + setError("No LLM topic data available yet. Run Delphi analysis first."); + } + } else { + setError("Failed to retrieve topic data"); + } + + setLoading(false); + }) + .catch((err) => { + console.error("Error fetching topic data:", err); + setError("Failed to connect to the topicMod endpoint"); + setLoading(false); + }); + }, [report_id]); + + // Fetch UMAP coordinates for spatial filtering + const fetchUMAPData = async () => { + try { + const conversationId = conversation?.conversation_id || report_id; + console.log("Fetching UMAP data for spatial filtering..."); + + const response = await fetch(`/api/v3/topicMod/proximity?conversation_id=${conversationId}&layer_id=all`); + const data = await response.json(); + + if (data.status === "success" && data.proximity_data) { + console.log(`Loaded ${data.proximity_data.length} UMAP points for spatial filtering`); + setUmapData(data.proximity_data); + + // Group points by layer and cluster + const groups = groupPointsByLayer(data.proximity_data); + setClusterGroups(groups); + + console.log("UMAP cluster groups:", groups); + } else { + console.log("No UMAP data available for spatial filtering"); + } + } catch (err) { + console.error("Error fetching UMAP data:", err); + } + }; + + // Analyze if topics actually contain each other hierarchically + const analyzeHierarchy = (data) => { + // Get the first (most recent) run + const runKeys = Object.keys(data.runs); + if (runKeys.length === 0) { + setHierarchyAnalysis({ hasHierarchy: false, reason: "No runs data" }); + return; + } + + const firstRun = data.runs[runKeys[0]]; + if (!firstRun.topics_by_layer) { + setHierarchyAnalysis({ hasHierarchy: false, reason: "No topics_by_layer data in run" }); + return; + } + + const layers = Object.keys(firstRun.topics_by_layer).map(k => parseInt(k)).sort((a, b) => a - b); + console.log("Analyzing layers:", layers); + + // Set current layer to the highest available layer if not set + if (currentLayer === null && 
layers.length > 0) { + const maxLayer = Math.max(...layers); + setCurrentLayer(maxLayer); + console.log(`Setting current layer to highest available: ${maxLayer}`); + } + + // For now, let's investigate what the data structure looks like + const analysis = { + hasHierarchy: false, // We'll determine this + layers: layers, + layerCounts: {}, + sampleTopics: {}, + totalComments: 0, + structure: "unknown", // Will be "flat", "hierarchical", or "mixed" + runInfo: { + model_name: firstRun.model_name, + created_at: firstRun.created_at, + job_uuid: firstRun.job_uuid + } + }; + + layers.forEach(layerId => { + const topics = firstRun.topics_by_layer[layerId]; + analysis.layerCounts[layerId] = Object.keys(topics).length; + + // Take first few topics as samples + analysis.sampleTopics[layerId] = Object.values(topics).slice(0, 3).map(topic => ({ + name: topic.topic_name, + key: topic.topic_key, + cluster_id: topic.cluster_id, + model_name: topic.model_name + })); + }); + + // Simple heuristic: if we have multiple layers with different counts, + // it suggests some hierarchical structure + const counts = Object.values(analysis.layerCounts); + const hasVariedCounts = Math.max(...counts) !== Math.min(...counts); + + if (hasVariedCounts && layers.length > 1) { + analysis.hasHierarchy = true; + analysis.structure = "hierarchical"; + analysis.reason = `Found ${layers.length} layers with varying topic counts: ${counts.join(", ")}`; + } else if (layers.length === 1) { + analysis.structure = "flat"; + analysis.reason = "Only one layer found - flat structure"; + } else { + analysis.structure = "unclear"; + analysis.reason = "Multiple layers but similar counts - unclear hierarchy"; + } + + console.log("Hierarchy analysis:", analysis); + setHierarchyAnalysis(analysis); + }; + + // Set topic priority with cycling + const cyclePriority = (topicKey) => { + const currentPriority = topicPriorities.get(topicKey) || 'low'; + let nextPriority; + + switch (currentPriority) { + case 'low': 
nextPriority = 'medium'; break; + case 'medium': nextPriority = 'high'; break; + case 'high': nextPriority = 'critical'; break; // spam + case 'critical': nextPriority = 'low'; break; // back to start + default: nextPriority = 'medium'; + } + + const newPriorities = new Map(topicPriorities); + newPriorities.set(topicKey, nextPriority); + setTopicPriorities(newPriorities); + console.log(`Topic ${topicKey} cycled to ${nextPriority} - spatial filtering will update`); + + // Force re-render of current layer to apply spatial filtering + setTimeout(() => { + console.log("Priority change complete, spatial filtering active"); + }, 100); + }; + + // === SPATIAL MATH FUNCTIONS === + + // Calculate cluster centroid in UMAP space + const calculateClusterCentroid = (clusterPoints) => { + if (!clusterPoints || clusterPoints.length === 0) return null; + const centroidX = clusterPoints.reduce((sum, p) => sum + p.umap_x, 0) / clusterPoints.length; + const centroidY = clusterPoints.reduce((sum, p) => sum + p.umap_y, 0) / clusterPoints.length; + return { x: centroidX, y: centroidY }; + }; + + // Calculate Euclidean distance between two points + const calculateDistance = (point1, point2) => { + return Math.sqrt( + Math.pow(point1.x - point2.x, 2) + + Math.pow(point1.y - point2.y, 2) + ); + }; + + // === DENSITY COMPUTATION FUNCTIONS === + + /** + * Calculates Gaussian kernel density at a specific point + */ + const calculateGaussianKernelDensity = (x, y, points, radius = 25, sigma = null) => { + if (!sigma) { + sigma = radius / 3; // Default sigma is radius/3 + } + + let density = 0; + + points.forEach(point => { + const distance = Math.sqrt((x - point.umap_x) ** 2 + (y - point.umap_y) ** 2); + + if (distance <= radius) { + density += Math.exp(-(distance ** 2) / (2 * sigma ** 2)); + } + }); + + return density; + }; + + /** + * Computes density surface over a grid using Gaussian kernels + */ + const computeGridDensitySurface = (points, bounds, gridSize = 4, radius = 25, 
densityThreshold = 0.1) => { + const densityMap = new Map(); + const sigma = radius / 3; + + // Calculate density at grid points + for (let x = bounds.minX; x < bounds.maxX; x += gridSize) { + for (let y = bounds.minY; y < bounds.maxY; y += gridSize) { + let density = 0; + const gridKey = `${x},${y}`; + + points.forEach(point => { + const distance = Math.sqrt((x - point.umap_x) ** 2 + (y - point.umap_y) ** 2); + + if (distance <= radius) { + density += Math.exp(-(distance ** 2) / (2 * sigma ** 2)); + } + }); + + if (density > densityThreshold) { + densityMap.set(gridKey, density); + } + } + } + + return densityMap; + }; + + /** + * Calculates bounds for a set of points + */ + const calculatePointBounds = (points, margin = 20) => { + const xValues = points.map(p => p.umap_x); + const yValues = points.map(p => p.umap_y); + + return { + minX: Math.min(...xValues) - margin, + maxX: Math.max(...xValues) + margin, + minY: Math.min(...yValues) - margin, + maxY: Math.max(...yValues) + margin + }; + }; + + /** + * Finds maximum density value in a density map + */ + const findMaxDensity = (densityMap) => { + let maxDensity = 0; + densityMap.forEach(density => { + if (density > maxDensity) { + maxDensity = density; + } + }); + return maxDensity; + }; + + /** + * Complete density analysis for a cluster of points + */ + const analyzeDensity = (points, options = {}) => { + const { + gridSize = 4, + radius = 25, + densityThreshold = 0.1, + margin = 20 + } = options; + + if (points.length < 2) { + return null; + } + + // Calculate bounds + const bounds = calculatePointBounds(points, margin); + + // Compute density surface + const densityMap = computeGridDensitySurface(points, bounds, gridSize, radius, densityThreshold); + + if (densityMap.size === 0) { + return null; + } + + // Find maximum density + const maxDensity = findMaxDensity(densityMap); + + // Calculate centroid + const centroid = calculateClusterCentroid(points); + + return { + bounds, + densityMap, + maxDensity, + 
centroid, + pointCount: points.length + }; + }; + + // Group UMAP points by layer and cluster + const groupPointsByLayer = (data) => { + const groups = {}; + + for (let layer = 0; layer <= 3; layer++) { + groups[layer] = new Map(); + } + + data.forEach(point => { + Object.entries(point.clusters || {}).forEach(([layerId, clusterId]) => { + const layer = parseInt(layerId); + const key = `${layer}_${clusterId}`; + + if (!groups[layer].has(key)) { + groups[layer].set(key, []); + } + + groups[layer].get(key).push({ + comment_id: point.comment_id, + cluster_id: clusterId, + layer: layer, + umap_x: point.umap_x, + umap_y: point.umap_y, + weight: point.weight || 1 + }); + }); + }); + + return groups; + }; + + /** + * Calculate density overlap between two clusters + */ + const calculateDensityOverlap = (densityMap1, densityMap2) => { + let overlapScore = 0; + let commonGridPoints = 0; + let totalGridPoints = 0; + + // Find maximum densities for normalization + const maxDensity1 = findMaxDensity(densityMap1); + const maxDensity2 = findMaxDensity(densityMap2); + const maxDensity = Math.max(maxDensity1, maxDensity2); + + // Check overlap at each grid point where both clusters have density + densityMap1.forEach((density1, gridKey) => { + totalGridPoints++; + if (densityMap2.has(gridKey)) { + const density2 = densityMap2.get(gridKey); + // Overlap is the minimum of the two normalized densities + const normalizedDensity1 = density1 / maxDensity; + const normalizedDensity2 = density2 / maxDensity; + overlapScore += Math.min(normalizedDensity1, normalizedDensity2); + commonGridPoints++; + } + }); + + // Also check points that exist in map2 but not map1 + densityMap2.forEach((density2, gridKey) => { + if (!densityMap1.has(gridKey)) { + totalGridPoints++; + } + }); + + // Return overlap as proportion of total possible overlap + return commonGridPoints > 0 ? 
overlapScore / commonGridPoints : 0; + }; + + /** + * Find nearby topics using density-based proximity + */ + const findNearbyTopicsDensity = (sourceClusters, targetLayerGroups, options = {}) => { + const { + overlapThreshold = 0.1, + fallbackDistance = 0.5, + useDensity = true + } = options; + + console.log(`🧮 Computing density-based proximity with threshold ${overlapThreshold}`); + const startTime = performance.now(); + + const nearbyTopics = new Set(); + let densityComputations = 0; + let fallbackComputations = 0; + let logCounter = 0; + + sourceClusters.forEach((sourcePoints, sourceKey) => { + if (!sourcePoints || sourcePoints.length === 0) return; + + // Compute density surface for source cluster + const sourceDensityAnalysis = useDensity ? analyzeDensity(sourcePoints) : null; + + if (sourceDensityAnalysis) { + densityComputations++; + + // Check overlap with each target cluster + targetLayerGroups.forEach((targetPoints, targetKey) => { + if (!targetPoints || targetPoints.length === 0) return; + + const targetDensityAnalysis = analyzeDensity(targetPoints); + + if (targetDensityAnalysis) { + const overlapScore = calculateDensityOverlap( + sourceDensityAnalysis.densityMap, + targetDensityAnalysis.densityMap + ); + + // Debug: Show density map details for first few comparisons + if (logCounter < 3) { + console.log(`🔍 DENSITY DEBUG: ${sourceKey} has ${sourceDensityAnalysis.densityMap.size} grid points, ${targetKey} has ${targetDensityAnalysis.densityMap.size} grid points`); + console.log(`🔍 DENSITY DEBUG: ${sourceKey} bounds:`, sourceDensityAnalysis.bounds); + console.log(`🔍 DENSITY DEBUG: ${targetKey} bounds:`, targetDensityAnalysis.bounds); + console.log(`🔍 DENSITY DEBUG: ${sourceKey} max density: ${sourceDensityAnalysis.maxDensity.toFixed(3)}`); + console.log(`🔍 DENSITY DEBUG: ${targetKey} max density: ${targetDensityAnalysis.maxDensity.toFixed(3)}`); + } + + if (overlapScore > overlapThreshold) { + nearbyTopics.add(targetKey); + if (logCounter < 10) { + 
console.log(`✅ Density overlap: ${sourceKey} → ${targetKey} (score: ${overlapScore.toFixed(3)})`); + } + } else { + if (logCounter < 10) { + console.log(`❌ Low overlap: ${sourceKey} → ${targetKey} (score: ${overlapScore.toFixed(3)}, threshold: ${overlapThreshold})`); + } + } + logCounter++; + } + }); + } else { + // Fallback to centroid-based distance + fallbackComputations++; + const sourceCentroid = calculateClusterCentroid(sourcePoints); + + if (sourceCentroid) { + targetLayerGroups.forEach((targetPoints, targetKey) => { + const targetCentroid = calculateClusterCentroid(targetPoints); + if (targetCentroid) { + const distance = calculateDistance(sourceCentroid, targetCentroid); + if (distance <= fallbackDistance) { + nearbyTopics.add(targetKey); + console.log(`⚡ Fallback distance: ${sourceKey} → ${targetKey} (distance: ${distance.toFixed(3)})`); + } + } + }); + } + } + }); + + const elapsed = performance.now() - startTime; + console.log(`🧮 Density computation complete: ${elapsed.toFixed(1)}ms, ${densityComputations} density, ${fallbackComputations} fallback`); + + return nearbyTopics; + }; + + // Legacy function for backward compatibility + const findNearbyTopics = (sourceCentroids, targetLayerGroups, maxDistance = 0.5) => { + const nearbyTopics = new Set(); + + sourceCentroids.forEach(sourceCentroid => { + targetLayerGroups.forEach((points, clusterKey) => { + const targetCentroid = calculateClusterCentroid(points); + if (targetCentroid) { + const distance = calculateDistance(sourceCentroid, targetCentroid); + if (distance <= maxDistance) { + nearbyTopics.add(clusterKey); + } + } + }); + }); + + return nearbyTopics; + }; + + // Toggle topic selection for filtering + const toggleTopicSelection = (topicKey) => { + const newSelected = new Set(selectedTopics); + if (newSelected.has(topicKey)) { + newSelected.delete(topicKey); + } else { + newSelected.add(topicKey); + } + setSelectedTopics(newSelected); + }; + + // Get priority color + const getPriorityColor = 
(priority) => { + switch (priority) { + case 'low': return '#d6d8db'; + case 'medium': return '#e2e6ea'; + case 'high': return '#d1d5db'; + case 'critical': return '#f0a7ab'; + default: return '#e9ecef'; + } + }; + + // Get priority indicator + const getPriorityIndicator = (priority) => { + switch (priority) { + case 'low': return '· LOW'; + case 'medium': return '•• MEDIUM'; + case 'high': return '••• HIGH'; + case 'critical': return '🗑 SPAM/TRASH'; + default: return '· LOW'; + } + }; + + // Get comment count for a cluster + const getCommentCount = (layerId, clusterId) => { + const clusterKey = `${layerId}_${clusterId}`; + const points = clusterGroups[layerId]?.get(clusterKey); + return points ? points.length : 0; + }; + + + // Get filtered/sorted topics based on spatial proximity + const getFilteredTopics = (allTopics, layerId) => { + // For highest layer (coarsest), show all topics + const maxLayer = hierarchyAnalysis ? Math.max(...hierarchyAnalysis.layers) : layerId; + if (layerId === maxLayer || !clusterGroups[layerId] || !umapData) { + return Object.entries(allTopics).map(([clusterId, topic]) => ({ + clusterId, + topic, + proximityScore: null + })); + } + + // For other layers, filter based on spatial proximity to higher priority topics + const higherLayerId = layerId + 1; + const higherLayerTopics = topicData?.runs[Object.keys(topicData.runs)[0]]?.topics_by_layer[higherLayerId]; + + if (!higherLayerTopics || !clusterGroups[higherLayerId]) { + return Object.entries(allTopics).map(([clusterId, topic]) => ({ + clusterId, + topic, + proximityScore: null + })); + } + + // Find HIGH and MEDIUM priority clusters in the higher layer + const priorityClusters = new Map(); + Object.entries(higherLayerTopics).forEach(([clusterId, topic]) => { + const priority = topicPriorities.get(topic.topic_key); + if (priority === 'high' || priority === 'medium') { + const clusterKey = `${higherLayerId}_${clusterId}`; + const points = clusterGroups[higherLayerId].get(clusterKey); + 
if (points && points.length > 0) { + priorityClusters.set(clusterKey, points); + } + } + }); + + // If no high priority topics, show all + if (priorityClusters.size === 0) { + return Object.entries(allTopics).map(([clusterId, topic]) => ({ + clusterId, + topic, + proximityScore: null + })); + } + + console.log(`🎯 Layer ${layerId}: Found ${priorityClusters.size} priority clusters in Layer ${higherLayerId}`); + + // Find nearby topics using simple centroid distance with adaptive thresholds + const getAdaptiveDistance = (layer) => { + switch (layer) { + case 0: return 1.5; // Very lenient for finest layer + case 1: return 1.2; // Lenient for fine layer + case 2: return 0.8; // Moderate for mid layer + default: return 0.5; // Standard for coarse layer + } + }; + + const adaptiveDistance = getAdaptiveDistance(layerId); + console.log(`🔧 Layer ${layerId}: Using adaptive distance ${adaptiveDistance}`); + + // Calculate centroids of priority clusters + const priorityCentroids = []; + priorityClusters.forEach((points, clusterKey) => { + const centroid = calculateClusterCentroid(points); + if (centroid) { + priorityCentroids.push(centroid); + } + }); + + // Calculate proximity scores for all topics + const topicsWithProximity = Object.entries(allTopics).map(([clusterId, topic]) => { + const clusterKey = `${layerId}_${clusterId}`; + const targetPoints = clusterGroups[layerId].get(clusterKey); + + let minProximity = Infinity; + let closestCluster = null; + if (targetPoints && targetPoints.length > 0) { + const targetCentroid = calculateClusterCentroid(targetPoints); + if (targetCentroid) { + // Track which priority cluster is closest + Array.from(priorityClusters.keys()).forEach(sourceKey => { + const sourcePoints = priorityClusters.get(sourceKey); + const sourceCentroid = calculateClusterCentroid(sourcePoints); + if (sourceCentroid) { + const distance = calculateDistance(sourceCentroid, targetCentroid); + if (distance < minProximity) { + minProximity = distance; + 
closestCluster = sourceKey; + } + } + }); + } + } + + return { + clusterId, + topic, + proximityScore: minProximity === Infinity ? null : minProximity, + closestCluster: closestCluster + }; + }); + + if (spatialMode === 'subset') { + // Filter mode: only show topics within threshold + const filteredTopics = topicsWithProximity.filter(item => + item.proximityScore !== null && item.proximityScore <= adaptiveDistance + ); + console.log(`Layer ${layerId}: Filtered from ${Object.keys(allTopics).length} to ${filteredTopics.length} topics based on spatial proximity`); + return filteredTopics; + } else { + // Sort mode: show all topics sorted by proximity + const sortedTopics = topicsWithProximity.sort((a, b) => { + if (a.proximityScore === null && b.proximityScore === null) return 0; + if (a.proximityScore === null) return 1; + if (b.proximityScore === null) return -1; + return a.proximityScore - b.proximityScore; + }); + console.log(`Layer ${layerId}: Sorted ${Object.keys(allTopics).length} topics by spatial proximity`); + return sortedTopics; + } + }; + + // Render dense priority selection for current layer + const renderPriorityLayer = () => { + if (!topicData || !topicData.runs || !hierarchyAnalysis) { + return
No topic data available
; + } + + const runKeys = Object.keys(topicData.runs); + const firstRun = topicData.runs[runKeys[0]]; + + if (!firstRun.topics_by_layer || !firstRun.topics_by_layer[currentLayer]) { + return
No topics found for layer {currentLayer}
; + } + + const allTopics = firstRun.topics_by_layer[currentLayer]; + const topicEntries = getFilteredTopics(allTopics, currentLayer); + + return ( +
+
+

Layer {currentLayer} Topic Prioritization

+
+ {topicEntries.length} topics{currentLayer < Math.max(...hierarchyAnalysis.layers) ? ` (${spatialMode === 'subset' ? 'filtered' : 'sorted'})` : ''} • Click to prioritize: LOW → MEDIUM → HIGH → SPAM/TRASH +
+
+ +
+ {topicEntries.map((entry) => { + const { clusterId, topic, proximityScore, closestCluster } = entry; + const topicKey = topic.topic_key; + const currentPriority = topicPriorities.get(topicKey) || 'low'; // Default to 'low' + const isSelected = selectedTopics.has(topicKey); + + // Clean topic name + let displayName = topic.topic_name; + const layerClusterPrefix = `${currentLayer}_${clusterId}`; + if (displayName && displayName.startsWith(layerClusterPrefix)) { + displayName = displayName.substring(layerClusterPrefix.length).replace(/^:\s*/, ''); + } + + return ( +
cyclePriority(topicKey)} + > +
+
+ + {currentLayer}_{clusterId} + {proximityScore !== null && closestCluster && ( + (d: {proximityScore.toFixed(2)} from {closestCluster}) + )} + ({getCommentCount(currentLayer, clusterId)} comments) + +
+ {['low', 'medium', 'high', 'critical'].map(priority => ( + + {priority === 'low' ? 'LOW' : + priority === 'medium' ? 'MEDIUM' : + priority === 'high' ? 'HIGH' : + 'SPAM/TRASH'} + + ))} +
+
+
{displayName || `Topic ${clusterId}`}
+
+
+ ); + })} +
+
+ ); + }; + + // Render layer navigation + const renderLayerNavigation = () => { + if (!hierarchyAnalysis) return null; + + return ( +
+
+ + +
+ +
+ {hierarchyAnalysis.layers.slice().reverse().map(layerId => ( + + ))} +
+ + {selectedTopics.size > 0 && ( +
+
{selectedTopics.size} topics selected for filtering
+ +
+ )} +
+ ); + }; + + // Render compact hierarchy analysis (moved to bottom) + const renderCompactAnalysis = () => { + if (!hierarchyAnalysis) return null; + + return ( +
+

Topic Structure Overview

+
+ {hierarchyAnalysis.structure.toUpperCase()} + + {hierarchyAnalysis.layers.map(layerId => + `L${layerId}:${hierarchyAnalysis.layerCounts[layerId]}` + ).join(' • ')} + +
+
+ ); + }; + + if (loading) { + return ( +
+

Topic Prioritize

+
Loading topic data...
+
+ ); + } + + if (error) { + return ( +
+

Topic Prioritize

+
+

Error

+

{error}

+
+
+ ); + } + + return ( +
+ {renderLayerNavigation()} + +
+ {renderPriorityLayer()} +
+ + + +
+ ); +}; + +export default TopicPrioritize; \ No newline at end of file diff --git a/client-report/src/components/topicPrioritizeSimple/TopicPrioritizeSimple.jsx b/client-report/src/components/topicPrioritizeSimple/TopicPrioritizeSimple.jsx new file mode 100644 index 0000000000..90025c323c --- /dev/null +++ b/client-report/src/components/topicPrioritizeSimple/TopicPrioritizeSimple.jsx @@ -0,0 +1,338 @@ +import React, { useState, useEffect } from "react"; +import net from "../../util/net"; +import { useReportId } from "../framework/useReportId"; + +const TopicPrioritizeSimple = ({ conversation }) => { + const { report_id } = useReportId(); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + const [topicData, setTopicData] = useState(null); + const [selectedTopics, setSelectedTopics] = useState(new Set()); + + useEffect(() => { + if (!report_id) return; + + setLoading(true); + // Fetch topic data from Delphi endpoint + net + .polisGet("/api/v3/delphi", { + report_id: report_id, + }) + .then((response) => { + console.log("TopicMod topics response:", response); + + if (response && response.status === "success") { + if (response.runs && Object.keys(response.runs).length > 0) { + setTopicData(response); + } else { + setError("No LLM topic data available yet. Run Delphi analysis first."); + } + } else { + setError("Failed to retrieve topic data"); + } + + setLoading(false); + }) + .catch((err) => { + console.error("Error fetching topic data:", err); + setError("Failed to connect to the topicMod endpoint"); + setLoading(false); + }); + }, [report_id]); + + const toggleTopicSelection = (topicKey) => { + const newSelected = new Set(selectedTopics); + if (newSelected.has(topicKey)) { + newSelected.delete(topicKey); + } else { + newSelected.add(topicKey); + } + setSelectedTopics(newSelected); + }; + + const renderTopicSelection = () => { + if (!topicData || !topicData.runs) { + return
No topic data available
; + } + + const runKeys = Object.keys(topicData.runs); + const firstRun = topicData.runs[runKeys[0]]; + + if (!firstRun.topics_by_layer || !firstRun.topics_by_layer[3]) { + return
No topics found for the coarsest layer
; + } + + const coarsestTopics = firstRun.topics_by_layer[3]; + const topicEntries = Object.entries(coarsestTopics); + + return ( +
+
+

+ Which should rank higher in priority? Help set the agenda for you and for everyone: ({selectedTopics.size} selected) +

+
+ +
+ {topicEntries.map(([clusterId, topic]) => { + const topicKey = topic.topic_key; + const isSelected = selectedTopics.has(topicKey); + + // Clean topic name + let displayName = topic.topic_name; + const layerClusterPrefix = `3_${clusterId}`; + if (displayName && displayName.startsWith(layerClusterPrefix)) { + displayName = displayName.substring(layerClusterPrefix.length).replace(/^:\s*/, ''); + } + + return ( + + ); + })} +
+ + {selectedTopics.size > 0 && ( +
+

Selected Topics

+
+ {Array.from(selectedTopics).map(topicKey => { + const [clusterId, topic] = Object.entries(coarsestTopics).find( + ([_, t]) => t.topic_key === topicKey + ) || []; + + if (!topic) return null; + + let displayName = topic.topic_name; + const layerClusterPrefix = `3_${clusterId}`; + if (displayName && displayName.startsWith(layerClusterPrefix)) { + displayName = displayName.substring(layerClusterPrefix.length).replace(/^:\s*/, ''); + } + + return ( +
+ 3_{clusterId} + {displayName} +
+ ); + })} +
+
+ )} +
+ ); + }; + + if (loading) { + return ( +
+
Loading topic data...
+
+ ); + } + + if (error) { + return ( +
+
+

Error

+

{error}

+
+
+ ); + } + + return ( +
+ {renderTopicSelection()} + + +
+ ); +}; + +export default TopicPrioritizeSimple; \ No newline at end of file diff --git a/client-report/src/components/topicReport/TopicSectionsBuilder.jsx b/client-report/src/components/topicReport/TopicSectionsBuilder.jsx index 362834b973..c11203358e 100644 --- a/client-report/src/components/topicReport/TopicSectionsBuilder.jsx +++ b/client-report/src/components/topicReport/TopicSectionsBuilder.jsx @@ -67,6 +67,7 @@ const TopicSectionsBuilder = ({ topicData, narrativeData, children }) => { const allTopics = []; const jobUuid = latestRun.job_uuid; + const maxLayer = Math.max(...Object.keys(latestRun.topics_by_layer).map(Number)); Object.keys(latestRun.topics_by_layer).forEach(layer => { const clusters = latestRun.topics_by_layer[layer]; @@ -90,7 +91,7 @@ const TopicSectionsBuilder = ({ topicData, narrativeData, children }) => { key: sectionKey, displayKey: topicKey, name: topic.topic_name || topicKey, - sortKey: parseInt(layer) * 1000 + parseInt(clusterId) + sortKey: (maxLayer - parseInt(layer)) * 1000 + parseInt(clusterId) }); }); } diff --git a/client-report/webpack.common.js b/client-report/webpack.common.js index e8fedd6041..1bdf06b88f 100644 --- a/client-report/webpack.common.js +++ b/client-report/webpack.common.js @@ -1,41 +1,42 @@ // Copyright (C) 2012-present, The Authors. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License, version 3, as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
-const CopyWebpackPlugin = require('copy-webpack-plugin'); -const HtmlWebpackPlugin = require('html-webpack-plugin'); -const MiniCssExtractPlugin = require('mini-css-extract-plugin'); +const CopyWebpackPlugin = require("copy-webpack-plugin"); +const HtmlWebpackPlugin = require("html-webpack-plugin"); +const MiniCssExtractPlugin = require("mini-css-extract-plugin"); module.exports = { - entry: './src/index.js', + entry: "./src/index.js", module: { rules: [ { test: /\.css$/, - use: [MiniCssExtractPlugin.loader, 'css-loader'], + use: [MiniCssExtractPlugin.loader, "css-loader"], }, { test: /\.jsx?$/, exclude: /node_modules/, use: { - loader: 'babel-loader', + loader: "babel-loader", options: { - presets: ['@babel/preset-env', '@babel/preset-react'] - } - } + presets: ["@babel/preset-env", "@babel/preset-react"], + }, + }, }, ], }, + resolve: { + extensions: [".js", ".jsx"], + }, plugins: [ new CopyWebpackPlugin({ - patterns: [ - { from: 'public/favicon.ico', to: 'favicon.ico' }, - ], + patterns: [{ from: "public/favicon.ico", to: "favicon.ico" }], }), new HtmlWebpackPlugin({ - template: 'public/index.html', - filename: 'index_report.html', + template: "public/index.html", + filename: "index_report.html", }), ], externals: { - d3: 'd3' - } + d3: "d3", + }, }; diff --git a/client-report/webpack.dev.js b/client-report/webpack.dev.js index 970cc4df35..13de7a5a49 100644 --- a/client-report/webpack.dev.js +++ b/client-report/webpack.dev.js @@ -45,7 +45,7 @@ module.exports = { }, performance: { // TODO: Find and remove orphan modules; Reduce bundle size. 
- hints: 'warning', // 'error' for errors, 'warning' for warnings, false to disable + hints: false, // 'error' for errors, 'warning' for warnings, false to disable maxAssetSize: 7100000, // Size limit in bytes, default is 250000 (250 KB) maxEntrypointSize: 7100000, // Size limit in bytes, default is 250000 (250 KB) }, diff --git a/delphi/CLAUDE.md b/delphi/CLAUDE.md index 2d370bb0ed..d9a0536b1d 100644 --- a/delphi/CLAUDE.md +++ b/delphi/CLAUDE.md @@ -80,6 +80,11 @@ Always use the commands above to determine the most substantial conversation whe - LLM API keys (Anthropic, OpenAI, etc.) are available in the parent `.env` file - Default Ollama model: `llama3.1:8b` (configurable via `OLLAMA_MODEL`) +- **Sentence Transformer Configuration**: + - Default embedding model: `all-MiniLM-L6-v2` (configurable via `SENTENCE_TRANSFORMER_MODEL`) + - For multilingual support, set `SENTENCE_TRANSFORMER_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` + - Both models produce 384-dimensional embeddings + ## IMPORTANT: Finding All Logs **CRITICAL NOTE**: The FULL system logs are stored in the DynamoDB JobQueue table's job results! 
When debugging issues: diff --git a/delphi/create_dynamodb_tables.py b/delphi/create_dynamodb_tables.py index 84c6beaf74..377e5ed4e5 100644 --- a/delphi/create_dynamodb_tables.py +++ b/delphi/create_dynamodb_tables.py @@ -250,6 +250,19 @@ def create_evoc_tables(dynamodb, delete_existing=False): ], 'Projection': {'ProjectionType': 'ALL'}, 'ProvisionedThroughput': {'ReadCapacityUnits': 5, 'WriteCapacityUnits': 5} + }, + { + 'IndexName': 'zid-index', + 'KeySchema': [ + {'AttributeName': 'conversation_id', 'KeyType': 'HASH'} + ], + 'Projection': { + 'ProjectionType': 'ALL' + }, + 'ProvisionedThroughput': { + 'ReadCapacityUnits': 5, + 'WriteCapacityUnits': 5 + } } ], 'ProvisionedThroughput': { @@ -379,6 +392,21 @@ def create_evoc_tables(dynamodb, delete_existing=False): 'ReadCapacityUnits': 5, 'WriteCapacityUnits': 5 } + }, + # Topic Agenda table for storing user selections + 'Delphi_TopicAgendaSelections': { + 'KeySchema': [ + {'AttributeName': 'conversation_id', 'KeyType': 'HASH'}, + {'AttributeName': 'participant_id', 'KeyType': 'RANGE'} + ], + 'AttributeDefinitions': [ + {'AttributeName': 'conversation_id', 'AttributeType': 'S'}, + {'AttributeName': 'participant_id', 'AttributeType': 'S'} + ], + 'ProvisionedThroughput': { + 'ReadCapacityUnits': 5, + 'WriteCapacityUnits': 5 + } } } diff --git a/delphi/create_topic_agenda_table.py b/delphi/create_topic_agenda_table.py new file mode 100755 index 0000000000..e7a607e105 --- /dev/null +++ b/delphi/create_topic_agenda_table.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +""" +Create Topic Agenda DynamoDB table for Delphi system. + +This script creates the Delphi_TopicAgendaSelections table for storing user topic selections. 
+ +Usage: + python create_topic_agenda_table.py [options] + +Options: + --endpoint-url ENDPOINT_URL DynamoDB endpoint URL + --region REGION AWS region (default: us-east-1) + --force Force recreate table if it exists +""" + +import boto3 +import os +import logging +import argparse +import time + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +def create_topic_agenda_table(dynamodb, force_recreate=False): + """ + Create the Topic Agenda table for storing user selections. + + Args: + dynamodb: boto3 DynamoDB resource + force_recreate: If True, delete existing table before creating + """ + table_name = 'Delphi_TopicAgendaSelections' + + # Check if table exists + existing_tables = [t.name for t in dynamodb.tables.all()] + + if table_name in existing_tables: + if force_recreate: + logger.info(f"Deleting existing table {table_name}...") + table = dynamodb.Table(table_name) + table.delete() + table.meta.client.get_waiter('table_not_exists').wait(TableName=table_name) + logger.info(f"Table {table_name} deleted.") + else: + logger.info(f"Table {table_name} already exists. 
Use --force to recreate.") + return False + + # Create table + logger.info(f"Creating table {table_name}...") + + table = dynamodb.create_table( + TableName=table_name, + KeySchema=[ + {'AttributeName': 'conversation_id', 'KeyType': 'HASH'}, + {'AttributeName': 'participant_id', 'KeyType': 'RANGE'} + ], + AttributeDefinitions=[ + {'AttributeName': 'conversation_id', 'AttributeType': 'S'}, + {'AttributeName': 'participant_id', 'AttributeType': 'S'} + ], + ProvisionedThroughput={ + 'ReadCapacityUnits': 5, + 'WriteCapacityUnits': 5 + } + ) + + # Wait for table to be active + table.meta.client.get_waiter('table_exists').wait(TableName=table_name) + logger.info(f"Table {table_name} created and active.") + + return True + +def main(): + # Parse arguments + parser = argparse.ArgumentParser(description='Create Topic Agenda DynamoDB table') + parser.add_argument('--endpoint-url', type=str, default=None, + help='DynamoDB endpoint URL') + parser.add_argument('--region', type=str, default='us-east-1', + help='AWS region (default: us-east-1)') + parser.add_argument('--force', action='store_true', + help='Force recreate table if it exists') + args = parser.parse_args() + + # Set up environment variables for local DynamoDB + if args.endpoint_url: + if 'localhost' in args.endpoint_url or '127.0.0.1' in args.endpoint_url: + os.environ['AWS_ACCESS_KEY_ID'] = 'dummy' + os.environ['AWS_SECRET_ACCESS_KEY'] = 'dummy' + + # Create DynamoDB resource + dynamodb = boto3.resource( + 'dynamodb', + endpoint_url=args.endpoint_url, + region_name=args.region + ) + + # Create table + start_time = time.time() + success = create_topic_agenda_table(dynamodb, args.force) + elapsed_time = time.time() - start_time + + if success: + logger.info(f"Table creation completed in {elapsed_time:.2f} seconds") + else: + logger.info(f"Table creation skipped (already exists)") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/delphi/docs/EVOC_LAYER_HIERARCHY_DEBUG.md 
b/delphi/docs/EVOC_LAYER_HIERARCHY_DEBUG.md new file mode 100644 index 0000000000..ddb27185c8 --- /dev/null +++ b/delphi/docs/EVOC_LAYER_HIERARCHY_DEBUG.md @@ -0,0 +1,214 @@ +# EVōC Layer Hierarchy Debug Investigation + +**Date**: 2025-06-20 +**Issue**: Circle pack visualization showing incorrect hierarchy - all Layer 0 clusters as roots instead of nested structure +**Status**: INVESTIGATING - Need to verify EVōC layer ordering and Delphi storage logic + +## Problem Statement + +The circle pack visualization is showing all 360 clusters as flat siblings instead of a true hierarchical nesting. Investigation reveals: + +1. **DynamoDB Structure**: All Layer 0 clusters have `parent_cluster: null` and are treated as roots +2. **Expected Structure**: Layer 3 (coarsest) should contain Layer 2, which should contain Layer 1, which should contain Layer 0 (finest) +3. **Actual Structure**: Layer 1 clusters have `parent_cluster` pointing to Layer 0 clusters + +## Key Questions to Answer + +1. **How does EVōC order its cluster_layers_?** (Fine-to-coarse or coarse-to-fine?) +2. **What are the parent-child relationships supposed to represent?** (Containment or merge history?) +3. **Is there a bug in the Delphi storage logic?** (Are we storing relationships backwards?) + +## Evidence Collected + +### 1. EVōC Documentation (GitHub) +**Source**: https://github.com/TutteInstitute/evoc/blob/main/evoc/clustering.py +**Finding**: "each layer is a clustering of the data into a different number of clusters; the earlier the cluster vector is in this list the finer the granularity of clustering." + +**Interpretation**: +- Layer 0 = finest granularity (most clusters) +- Layer 1 = coarser (fewer clusters) +- Layer 2 = even coarser +- Layer 3 = coarsest (fewest clusters) + +### 2. 
DynamoDB Actual Data Structure + +**Query**: `Delphi_CommentClustersStructureKeywords` for conversation 40523 + +**Layer Distribution**: +- Layer 0: 237 clusters +- Layer 1: 82 clusters +- Layer 2: 31 clusters +- Layer 3: 10 clusters + +**Sample Layer 0 Item**: +```json +{ + "cluster_key": "layer0_0", + "layer_id": "0", + "parent_cluster": null, + "child_clusters": [{"cluster_id": "0", "layer_id": "1"}] +} +``` + +**Sample Layer 1 Item**: +```json +{ + "cluster_key": "layer1_0", + "layer_id": "1", + "parent_cluster": {"cluster_id": "1", "layer_id": "0"}, + "child_clusters": null +} +``` + +**Observation**: Layer 0 (finest) has no parents, Layer 1 (coarser) has Layer 0 parents. This suggests Layer 0 is treated as "root" level. + +### 3. Delphi Storage Code Analysis + +**File**: `/delphi/umap_narrative/polismath_commentgraph/utils/converter.py` +**Lines**: 936-939, 947-951 + +```python +parent_cluster = { + 'layer_id': layer_id - 1, # PARENT is layer_id - 1 + 'cluster_id': int(parent_id) +} + +child_clusters = [ + { + 'layer_id': layer_id + 1, # CHILDREN are layer_id + 1 + 'cluster_id': int(child_id) + } +] +``` + +**Analysis**: +- Layer N has parents in Layer N-1 +- Layer N has children in Layer N+1 +- This makes Layer 0 the root level (no parents) +- This makes Layer 3 the deepest level (no children) + +## Tests Needed + +### Test 1: EVōC Layer Ordering Verification +**Goal**: Confirm that EVōC layer_id=0 is truly the finest granularity + +```python +# Create controlled test data with known cluster structure +# Run EVōC clustering +# Verify cluster count decreases as layer_id increases +``` + +### Test 2: Semantic Verification +**Goal**: Verify that higher layer_id clusters actually contain/represent multiple lower layer_id clusters + +```python +# For each Layer 1 cluster, find all Layer 0 clusters that "belong" to it +# Verify semantic coherence - do Layer 0 clusters within a Layer 1 parent make sense? 
+``` + +### Test 3: Algorithm Logic Verification +**Goal**: Understand what EVōC's hierarchical structure actually represents + +**Questions**: +- Are higher layers created by merging lower layers? +- Are layers created independently with different granularity parameters? +- What does "parent-child" mean in the context of hierarchical clustering? + +### Test 4: Circle Pack Expectation Verification +**Goal**: Confirm what the visualization should look like + +**Expected for Circle Pack**: +- Largest circles = Layer 3 (10 clusters, coarsest topics) +- Medium circles = Layer 2 (31 clusters) nested inside Layer 3 +- Smaller circles = Layer 1 (82 clusters) nested inside Layer 2 +- Smallest circles = Layer 0 (237 clusters) nested inside Layer 1 + +**Current Reality**: +- All Layer 0 clusters (237) shown as top-level siblings +- No nesting structure visible + +## Hypotheses + +### Hypothesis A: Delphi Storage Bug +**Theory**: The parent-child relationships are stored backwards in DynamoDB +**Evidence**: Layer 1 has Layer 0 parents (backwards from intuitive containment) +**Fix**: Invert the relationship storage logic + +### Hypothesis B: Misunderstanding EVōC +**Theory**: EVōC layers represent merge history, not containment hierarchy +**Evidence**: Need to investigate what cluster_layers_ actually represents +**Fix**: Build containment hierarchy differently + +### Hypothesis C: Both are Correct +**Theory**: The storage is correct, but circle pack needs different data interpretation +**Evidence**: Need to verify the intended use case +**Fix**: Transform the data for visualization + +## Next Steps + +1. **Run Controlled EVōC Test** - Create synthetic data with known structure +2. **Verify Semantic Clustering** - Check if relationships make sense +3. **Check EVōC Source Code** - Understand cluster_layers_ generation +4. **Test Visualization Logic** - Confirm circle pack expectations +5. 
**Document Findings** - Update this document with results + +## Test Results + +### ✅ Test 1: EVōC Layer Ordering CONFIRMED +**Controlled test with synthetic data (200 samples, 4 known clusters)**: +- Layer 0: 21 clusters (finest granularity) +- Layer 1: 9 clusters (coarser) +- Layer 2: 4 clusters (coarsest) + +**CONCLUSION**: EVōC definitely orders layers from fine-to-coarse (Layer 0 = finest) + +### ✅ Test 2: Polis Data Relationship Analysis +**Layer distribution in conversation 40523**: +- Layer 0: 237 clusters (finest) +- Layer 1: 82 clusters +- Layer 2: 31 clusters +- Layer 3: 10 clusters (coarsest) + +**Relationship direction**: +- 173/237 Layer 0 clusters have Layer 1 children +- 82/82 Layer 1 clusters have Layer 0 parents +- Only 82/173 relationships are bidirectionally consistent + +### ❌ Test 3: The REAL Problem Identified + +**The storage relationships represent MERGE HISTORY, not CONTAINMENT:** + +1. **Multiple Layer 0 clusters merge into single Layer 1 clusters** + - Example: L0 clusters [0, 1, 10] all point to L1 cluster 0 as their child + - But L1 cluster 0 only has ONE parent (L0 cluster 1) + +2. **This is merge/aggregation, NOT containment hierarchy** + - Layer 0 clusters don't "contain" Layer 1 clusters + - Multiple Layer 0 clusters "merge into" single Layer 1 clusters + +## Root Cause Found + +**The parent-child relationships in DynamoDB represent the clustering algorithm's merge process, NOT spatial containment suitable for circle pack visualization.** + +For circle pack visualization, we need **containment hierarchy** where: +- 1 Layer 3 cluster contains multiple Layer 2 clusters +- 1 Layer 2 cluster contains multiple Layer 1 clusters +- 1 Layer 1 cluster contains multiple Layer 0 clusters + +But EVōC stores **merge relationships** where: +- Multiple Layer 0 clusters merge into 1 Layer 1 cluster +- Multiple Layer 1 clusters merge into 1 Layer 2 cluster + +## Solution + +**We need to INVERT the relationships for circle pack visualization:** +1. 
Start with Layer 3 (coarsest) as roots +2. Each Layer 3 cluster contains the Layer 2 clusters that merged into it +3. Each Layer 2 cluster contains the Layer 1 clusters that merged into it +4. Each Layer 1 cluster contains the Layer 0 clusters that merged into it + +This requires building the containment hierarchy by **following the merge relationships backwards**. + +--- + +**Status**: ✅ ROOT CAUSE IDENTIFIED - Need to invert merge relationships to create containment hierarchy for circle pack \ No newline at end of file diff --git a/delphi/docs/SPATIAL_TOPIC_PRIORITIZATION_SYSTEM.md b/delphi/docs/SPATIAL_TOPIC_PRIORITIZATION_SYSTEM.md new file mode 100644 index 0000000000..ea2a2c3a79 --- /dev/null +++ b/delphi/docs/SPATIAL_TOPIC_PRIORITIZATION_SYSTEM.md @@ -0,0 +1,457 @@ +# Spatial Topic Prioritization System (STPS) + +## Executive Summary + +The Spatial Topic Prioritization System (STPS) is a core component of pol.is designed for **national-scale agenda setting**. It enables intelligent filtering and prioritization of topics across hierarchical layers using spatial proximity in UMAP embedding space. By leveraging the semantic relationships encoded in UMAP coordinates, STPS allows users to set broad priorities at coarse layers and automatically discover related topics at finer layers, creating a **cascading spatial filter** through the topic hierarchy. + +## System Overview + +### Core Concept + +Topics that are spatially close in UMAP space are semantically related. When a user prioritizes topics at Layer 3 (coarsest), STPS uses **density estimation** around those priority topics to identify semantically related topics in Layer 2. This process cascades down through all layers (3→2→1→0), creating an intelligent agenda-setting workflow that preserves semantic coherence. 
+ +### Key Innovation + +Rather than using simple centroid-based distance calculations, STPS employs **Gaussian density estimation** to account for: + +- Variable cluster shapes and sizes +- Overlapping semantic neighborhoods +- Non-uniform topic distributions in UMAP space +- Gradual semantic transitions between related topics + +## Technical Architecture + +### Data Requirements + +#### Input Data + +1. **UMAP Coordinates** + + - All comments with (x, y) coordinates in UMAP embedding space + - Obtained from: `/api/v3/topicMod/proximity?conversation_id=${id}&layer_id=all` + - Structure: `{comment_id, umap_x, umap_y, clusters: {0: id, 1: id, 2: id, 3: id}}` + +2. **Hierarchical Cluster Assignments** + + - Each comment assigned to clusters at layers 0-3 + - Layer 0: Finest granularity (most clusters) + - Layer 3: Coarsest granularity (fewest clusters) + - Stored in: `Delphi_CommentHierarchicalClusterAssignments` + +3. **Topic Metadata** + - Topic names, descriptions, comment counts per cluster + - Obtained from: `/api/v3/topicMod/topics?conversation_id=${id}` + - Maps cluster IDs to human-readable topic labels + +#### Derived Data (Computed) + +1. **Cluster Point Collections** + + - All UMAP points grouped by `(layer, cluster_id)` + - Used for density calculations and spatial operations + +2. **Density Surfaces** + + - 2D Gaussian density maps for each cluster at each layer + - Grid-based sampling with configurable resolution + - Precomputed for performance in production + +3. **Spatial Proximity Matrices** + - Cross-layer proximity relationships + - Distance/overlap metrics between clusters across layers + - Cached for real-time filtering + +### Core Algorithms + +#### 1. 
Density Estimation + +**Gaussian Kernel Density Estimation:** + +``` +density(x, y) = Σ exp(-(distance²) / (2 * σ²)) +``` + +Where: + +- `distance = √((x - point.x)² + (y - point.y)²)` +- `σ = radius / 3` (standard deviation) +- `radius = 25` (configurable density neighborhood) + +**Grid Sampling:** + +- Sample density at regular grid points (default: 4px spacing) +- Create continuous density surface for each cluster +- Store significant density values (threshold: 0.1) + +#### 2. Spatial Proximity Detection + +**Density Overlap Method:** + +``` +proximity_score = ∫∫ min(density_A(x,y), density_B(x,y)) dx dy +``` + +This measures the **spatial overlap** between density surfaces of clusters in different layers. + +**Distance-Based Method (Fallback):** + +``` +distance = √((centroid_A.x - centroid_B.x)² + (centroid_A.y - centroid_B.y)²) +proximity = exp(-distance / threshold) +``` + +#### 3. Cascading Filter Algorithm + +**Layer-by-Layer Filtering:** + +1. **Layer 3 (Start):** User sets priorities (LOW/MEDIUM/HIGH/SPAM) +2. **Layer 2 Filter:** + - Find density surfaces of HIGH/MEDIUM priority Layer 3 clusters + - Calculate overlap with all Layer 2 cluster density surfaces + - Show only Layer 2 clusters with proximity_score > threshold +3. **Layer 1 Filter:** Same process using Layer 2 priorities +4. 
**Layer 0 Filter:** Same process using Layer 1 priorities + +**Thresholds (Configurable):** + +- HIGH priority clusters: Include clusters with proximity_score > 0.3 +- MEDIUM priority clusters: Include clusters with proximity_score > 0.2 +- Adaptive thresholds based on cluster size and density + +## Implementation Phases + +### Phase 1: Client-Side Prototype (Current) + +**Status:** In Development +**Location:** `/client-report/src/components/topicPrioritize/` + +**Capabilities:** + +- Fetch UMAP data from existing API +- Compute density surfaces in browser +- Real-time spatial filtering as user sets priorities +- Interactive layer navigation with spatial constraints + +**Limitations:** + +- Performance constraints for large datasets +- Recomputes spatial relationships on each interaction +- No persistence of spatial proximity data + +### Phase 2: Server-Side Computation (Next) + +**Status:** Planned +**Location:** `/delphi/umap_narrative/` pipeline + +**Server-Side Precomputation:** + +1. **During UMAP Pipeline:** + + - Compute density surfaces for all clusters at all layers + - Calculate spatial proximity matrices between layers + - Store in `Delphi_SpatialProximityCache` table + +2. **API Enhancements:** + + - `/api/v3/topicMod/spatial-proximity?conversation_id=${id}&layer=${n}` + - Returns precomputed proximity relationships + - Fast filtering based on cached spatial data + +3. 
**Storage Schema:** + +```sql +Delphi_SpatialProximityCache: +- conversation_id (string) +- source_layer (number) +- source_cluster_id (number) +- target_layer (number) +- target_cluster_id (number) +- proximity_score (number) +- density_overlap (number) +- centroid_distance (number) +- created_at (timestamp) +``` + +### Phase 3: Advanced Spatial Features (Future) + +**Status:** Research + +**Multi-Dimensional Proximity:** + +- Extend beyond 2D UMAP to n-dimensional semantic space +- More sophisticated density estimation methods +- Machine learning-based proximity prediction + +**Dynamic Threshold Optimization:** + +- Adaptive thresholds based on conversation characteristics +- User behavior learning for personalized filtering +- A/B testing of different proximity algorithms + +## User Experience Design + +### Interaction Flow + +1. **Layer 3 (Coarsest Topics):** + + - User sees 8-15 broad topics (e.g., "Healthcare," "Education," "Urban Planning") + - Click to cycle: LOW → MEDIUM → HIGH → SPAM/TRASH + - Visual feedback: Progressive border darkening, background changes + +2. **Layer 2 (Filtered Topics):** + + - Shows only topics spatially related to HIGH/MEDIUM Layer 3 selections + - Count reduces from ~31 to ~12 topics (example) + - Header indicates "(spatially filtered)" + - User continues prioritization + +3. 
**Layer 1 & 0 (Progressive Refinement):** + - Further spatial filtering based on Layer 2 priorities + - Increasingly specific topics emerge + - Maintains semantic coherence throughout + +### Visual Design Principles + +**Spatial Feedback:** + +- Border thickness/darkness indicates priority level +- Filtered topics show spatial relationship hints +- Count changes demonstrate filtering effectiveness + +**Cognitive Load Reduction:** + +- Start broad (Layer 3), refine incrementally +- Only show relevant topics at each layer +- Clear priority progression (LOW→MEDIUM→HIGH→SPAM) + +**Agenda Setting Focus:** + +- Optimized for national-scale conversation prioritization +- Balances comprehensiveness with focus +- Preserves semantic relationships during filtering + +## Performance Considerations + +### Computational Complexity + +**Density Estimation:** + +- O(n \* g) where n = points, g = grid cells +- For 10,000 comments, ~1-2 seconds computation time +- Scales linearly with conversation size + +**Spatial Proximity:** + +- O(c1 \* c2) where c1, c2 = cluster counts in adjacent layers +- Typically ~300 cluster pairs per layer transition +- Milliseconds for proximity lookup with precomputation + +**Real-Time Performance Targets:** + +- Client-side filtering: <100ms response time +- Server-side precomputation: <5 minutes for 50k comment conversation +- API response: <50ms for proximity data + +### Optimization Strategies + +**Precomputation (Phase 2):** + +- Calculate spatial relationships during UMAP pipeline +- Store proximity matrices in DynamoDB +- Serve cached results via API + +**Progressive Loading:** + +- Load Layer 3 data immediately +- Fetch deeper layer data on demand +- Cache spatial relationships in browser + +**Algorithmic Optimizations:** + +- Spatial indexing (QuadTree) for fast proximity queries +- Hierarchical density approximation +- GPU acceleration for density computation (future) + +## Data Storage Requirements + +### New DynamoDB Tables + +#### 
`Delphi_SpatialProximityCache` + +``` +Primary Key: conversation_id + source_layer + source_cluster_id +Sort Key: target_layer + target_cluster_id +Attributes: +- proximity_score (number, 0.0-1.0) +- density_overlap (number) +- centroid_distance (number) +- spatial_method (string: "density" | "centroid") +- created_at (timestamp) +- expires_at (timestamp, TTL) +``` + +#### `Delphi_ClusterDensitySurfaces` + +``` +Primary Key: conversation_id + layer + cluster_id +Attributes: +- density_grid (binary: compressed grid data) +- centroid_x, centroid_y (numbers) +- bounding_box (object: {min_x, min_y, max_x, max_y}) +- point_count (number) +- max_density (number) +- grid_resolution (number) +- created_at (timestamp) +``` + +#### `Delphi_UserTopicPriorities` (Future) + +``` +Primary Key: conversation_id + user_id +Attributes: +- priority_map (object: {topic_key: priority_level}) +- spatial_filter_active (boolean) +- last_updated (timestamp) +- session_id (string) +``` + +### Storage Estimates + +**Per Conversation (50k comments):** + +- SpatialProximityCache: ~50KB (sparse matrix) +- ClusterDensitySurfaces: ~200KB (compressed grids) +- Total additional storage: <1MB per conversation + +**DynamoDB Costs:** + +- Read/Write capacity scales with user activity +- Proximity cache accessed on layer navigation +- Density surfaces loaded once per session + +## Integration Points + +### Frontend Integration + +**TopicPrioritize Component:** + +- Spatial filtering logic integrated into topic rendering +- Real-time updates as user sets priorities +- Visual feedback for spatial relationships + +**API Consumption:** + +- `/api/v3/topicMod/spatial-proximity` for cached relationships +- `/api/v3/topicMod/proximity` for raw UMAP data (Phase 1) +- Graceful fallback to non-spatial mode if data unavailable + +### Backend Integration + +**UMAP Pipeline Enhancement:** + +- Add spatial computation step after UMAP generation +- Integrate with existing `run_pipeline.py` workflow +- Store spatial 
data alongside topic generation + +**API Layer:** + +- New endpoints for spatial proximity data +- Enhanced existing endpoints with spatial metadata +- Performance monitoring for spatial queries + +### Monitoring & Analytics + +**Performance Metrics:** + +- Spatial computation time per conversation +- API response times for proximity queries +- User interaction patterns with spatial filtering + +**Quality Metrics:** + +- Spatial filtering effectiveness (user satisfaction) +- Semantic coherence of filtered topics +- False positive/negative rates in proximity detection + +## Research & Development Opportunities + +### Short-Term Improvements + +**Algorithm Refinement:** + +- Optimize density estimation parameters +- Test different proximity scoring methods +- Validate spatial filtering effectiveness + +**User Experience:** + +- A/B testing of different interaction models +- Accessibility improvements for spatial interfaces +- Mobile optimization for touch interactions + +### Long-Term Research + +**Advanced Spatial Methods:** + +- Machine learning-based proximity prediction +- Multi-modal embedding spaces (text + metadata) +- Temporal evolution of spatial relationships + +**Scalability Research:** + +- Distributed spatial computation +- Incremental updates to spatial relationships +- Cross-conversation spatial pattern learning + +## Success Metrics + +### Technical Metrics + +- **Performance:** <100ms client-side filtering, <50ms API response +- **Accuracy:** >80% user satisfaction with spatial filtering relevance +- **Scalability:** Handle 100k+ comment conversations + +### User Experience Metrics + +- **Efficiency:** 50% reduction in time to identify relevant topics +- **Comprehensiveness:** 90% of important topics discovered through spatial filtering +- **Usability:** Users successfully navigate layer hierarchy without training + +### Business Impact Metrics + +- **Adoption:** Spatial filtering used in >70% of prioritization sessions +- **Quality:** Improved 
agenda-setting outcomes (measurable via follow-up surveys) +- **Scale:** System deployed for national-level pol.is conversations + +## Risk Mitigation + +### Technical Risks + +- **Performance degradation:** Implement fallback to non-spatial mode +- **Data quality issues:** Robust validation of UMAP coordinates +- **Algorithm limitations:** Multiple proximity detection methods + +### User Experience Risks + +- **Complexity overload:** Progressive disclosure of spatial features +- **Misleading filtering:** Clear indication of filtered vs. total topics +- **Accessibility concerns:** Alternative navigation methods + +### Operational Risks + +- **Storage costs:** Efficient compression and TTL policies +- **Computational costs:** Precomputation during off-peak hours +- **Data consistency:** Atomic updates to spatial relationships + +## Conclusion + +The Spatial Topic Prioritization System represents a significant advancement in pol.is's capacity for intelligent agenda setting. By leveraging the semantic structure encoded in UMAP embeddings, STPS enables users to efficiently navigate large topic hierarchies while preserving thematic coherence. The system's phased implementation approach allows for iterative refinement while delivering immediate value through client-side prototyping. + +The long-term vision extends beyond simple proximity filtering to encompass adaptive, learning-based spatial relationship detection that improves with usage. This positions pol.is as a leading platform for large-scale democratic discourse, capable of handling national-level conversations while maintaining semantic precision and user agency. 
+ +--- + +**Document Version:** 1.0 +**Created:** 2024-06-23 +**Author:** Claude Code Assistant +**Status:** System Design - Phase 1 Implementation Active diff --git a/delphi/docs/TOPIC_AGENDA_IMPLEMENTATION_SUMMARY.md b/delphi/docs/TOPIC_AGENDA_IMPLEMENTATION_SUMMARY.md new file mode 100644 index 0000000000..76a27d2fd2 --- /dev/null +++ b/delphi/docs/TOPIC_AGENDA_IMPLEMENTATION_SUMMARY.md @@ -0,0 +1,91 @@ +# Topic Agenda Implementation Summary + +## Quick Overview + +This document provides a concise summary of the proposed Topic Agenda storage implementation for quick review. + +## Key Design Decisions + +### 1. Storage Strategy +- **Store archetypal comment IDs** instead of topic names/clusters +- **Why**: Comment IDs are stable across Delphi runs, topics are not +- **Result**: User selections persist even when topic modeling changes + +### 2. DynamoDB Table Structure +- **Table Name**: `Delphi_TopicAgendaSelections` +- **Primary Key**: `conversation_id` (zid) +- **Sort Key**: `participant_id` (pid) +- **Why**: Follows existing Polis patterns, enables efficient per-user queries + +### 3. Data Stored Per Selection +```json +{ + "layer_id": 3, + "cluster_id": "9", + "archetypal_comments": [ + { + "comment_id": "123", + "coordinates": { "x": 1.23, "y": 4.56 } + } + ] +} +``` + +### 4. API Endpoints +- `POST /api/v3/topicAgenda/selections` - Save selections +- `GET /api/v3/topicAgenda/selections?conversation_id={zid}` - Retrieve +- `PUT /api/v3/topicAgenda/selections` - Update +- `DELETE /api/v3/topicAgenda/selections?conversation_id={zid}` - Delete + +## Implementation Steps + +### Immediate (Phase 1) +1. Create DynamoDB table +2. Implement backend routes +3. Update TopicAgenda.jsx to save on "Done" click +4. Add retrieval on component mount + +### Near-term (Phase 2) +1. Add loading/error states +2. Implement overwrite confirmation +3. Add success feedback + +### Future (Phase 3) +1. Handle Delphi re-runs (spatial matching) +2. Add confidence scoring +3. 
Implement migration UI + +## Key Questions for Review + +1. **Overwrite behavior**: Should we append to existing selections or replace? +2. **Multiple sessions**: Should we track selection history or just current state? +3. **Visibility**: Should selections be private to user or shareable? +4. **Expiration**: Should old selections expire after N days? + +## Next Immediate Actions + +1. **Backend**: Create `/server/src/routes/delphi/topicAgenda.ts` +2. **Frontend**: Update `handleDone()` in TopicAgenda.jsx +3. **Database**: Create DynamoDB table via AWS console or CloudFormation + +## Code to Add to TopicAgenda.jsx + +```javascript +// Add to handleDone function after archetypal extraction: +const response = await fetch('/api/v3/topicAgenda/selections', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + conversation_id: conversation.conversation_id, + selections: archetypes.map(/* transform to API format */) + }), + credentials: 'include' +}); +``` + +## Success Criteria + +- ✅ User clicks "Done" → selections saved to DynamoDB +- ✅ User returns → previous selections loaded +- ✅ Delphi re-runs → selections still valid via comment IDs +- ✅ API responds in < 200ms \ No newline at end of file diff --git a/delphi/docs/TOPIC_AGENDA_STORAGE_DESIGN.md b/delphi/docs/TOPIC_AGENDA_STORAGE_DESIGN.md new file mode 100644 index 0000000000..e6c2c48966 --- /dev/null +++ b/delphi/docs/TOPIC_AGENDA_STORAGE_DESIGN.md @@ -0,0 +1,319 @@ +# Topic Agenda Storage Design + +## Overview + +This document outlines the design for storing user topic agenda selections as archetypal comments. The system stores stable comment references that persist across Delphi topic modeling runs, allowing users to maintain their preferences even as topic names and clusters change. 
+ +## Problem Statement + +- Topic names and cluster assignments change between Delphi runs +- Users need their topic selections to persist across these changes +- Solution: Store archetypal comments (stable comment IDs) instead of topic references + +## Data Model + +### DynamoDB Table: `Delphi_TopicAgendaSelections` + +**Primary Key Design:** +- **Partition Key**: `conversation_id` (string) - The zid of the conversation +- **Sort Key**: `participant_id` (string) - The pid of the participant + +**Attributes:** +```json +{ + "conversation_id": "string", // zid as string + "participant_id": "string", // pid as string + + "archetypal_selections": [ + { + "layer_id": "number", // 0, 1, 2, 3, etc. + "cluster_id": "string", // The cluster within that layer + "topic_key": "string", // Original topic key for reference + "archetypal_comments": [ + { + "comment_id": "string", // Stable comment identifier + "comment_text": "string", // Cached for display + "coordinates": { + "x": "number", // UMAP x coordinate + "y": "number" // UMAP y coordinate + }, + "distance_to_centroid": "number" + } + ], + "selection_timestamp": "string" // ISO 8601 timestamp + } + ], + + "metadata": { + "job_id": "string", // Delphi job ID these selections are from + "created_at": "string", // ISO 8601 timestamp + "updated_at": "string", // ISO 8601 timestamp + "version": "number", // Schema version (start with 1) + "total_selections": "number" // Count of selected topics + } +} +``` + +## API Design + +### 1. 
Save Topic Agenda Selections + +**Endpoint:** `POST /api/v3/topicAgenda/selections` + +**Request Headers:** +``` +Content-Type: application/json +Cookie: [authentication cookie] +``` + +**Request Body:** +```json +{ + "conversation_id": "string", + "selections": [ + { + "layer_id": 3, + "cluster_id": "9", + "topic_key": "layer3_9", + "archetypal_comments": [ + { + "comment_id": "123", + "comment_text": "We need better public transportation", + "coordinates": { "x": 1.23, "y": 4.56 }, + "distance_to_centroid": 0.15 + } + ] + } + ] +} +``` + +**Response:** +```json +{ + "status": "success", + "message": "Topic agenda selections saved successfully", + "data": { + "conversation_id": "string", + "participant_id": "string", + "selections_count": 3, + "job_id": "string" + } +} +``` + +### 2. Retrieve Topic Agenda Selections + +**Endpoint:** `GET /api/v3/topicAgenda/selections?conversation_id={zid}` + +**Response:** +```json +{ + "status": "success", + "data": { + "conversation_id": "string", + "participant_id": "string", + "archetypal_selections": [...], + "metadata": {...} + } +} +``` + +### 3. Update Topic Agenda Selections + +**Endpoint:** `PUT /api/v3/topicAgenda/selections` + +Same structure as POST, but replaces existing selections entirely. + +### 4. Delete Topic Agenda Selections + +**Endpoint:** `DELETE /api/v3/topicAgenda/selections?conversation_id={zid}` + +## Implementation Plan + +### Phase 1: Backend Infrastructure +1. Create DynamoDB table with specified schema +2. Implement data access layer in `/server/src/db/topicAgenda.ts` +3. Create API routes in `/server/src/routes/delphi/topicAgenda.ts` +4. Add authentication and authorization checks +5. Implement input validation + +### Phase 2: Frontend Integration +1. Update `TopicAgenda.jsx` to call save API on "Done" click +2. Add loading states and error handling +3. Implement retrieval on component mount +4. Add confirmation UI for overwrites + +### Phase 3: Cross-Run Persistence +1. 
Implement comment matching algorithm for new Delphi runs +2. Create migration logic for when clusters change +3. Add fallback UI for missing comments +4. Implement confidence scoring for matches + +## Code Examples + +### Backend Route Implementation + +```typescript +// /server/src/routes/delphi/topicAgenda.ts +import { Router } from 'express'; +import { DynamoDBDocumentClient, PutCommand, GetCommand } from '@aws-sdk/lib-dynamodb'; +import { isAuthenticated } from '../../middleware/auth'; +import { getPidPromise } from '../../user'; +import Conversation from '../../conversation'; + +const router = Router(); +const TABLE_NAME = 'Delphi_TopicAgendaSelections'; + +router.post('/selections', isAuthenticated, async (req, res) => { + try { + const { conversation_id, selections } = req.body; + const uid = req.user.uid; + + // Convert conversation_id to zid + const zid = await Conversation.getZidFromConversationId(conversation_id); + const zidStr = zid.toString(); + + // Get participant ID + const pid = await getPidPromise(zidStr, uid); + const pidStr = pid.toString(); + + // Get current Delphi job ID + const jobId = await getCurrentDelphiJobId(zidStr); + + // Prepare DynamoDB item + const item = { + conversation_id: zidStr, + participant_id: pidStr, + archetypal_selections: selections, + metadata: { + job_id: jobId, + created_at: new Date().toISOString(), + updated_at: new Date().toISOString(), + version: 1, + total_selections: selections.length + } + }; + + // Save to DynamoDB + const putParams = { + TableName: TABLE_NAME, + Item: item + }; + + await docClient.send(new PutCommand(putParams)); + + res.json({ + status: 'success', + message: 'Topic agenda selections saved successfully', + data: { + conversation_id: zidStr, + participant_id: pidStr, + selections_count: selections.length, + job_id: jobId + } + }); + + } catch (error) { + console.error('Error saving topic agenda selections:', error); + res.status(500).json({ + status: 'error', + message: 'Failed to save topic 
agenda selections' + }); + } +}); +``` + +### Frontend Integration + +```javascript +// In TopicAgenda.jsx handleDone function +const handleDone = async () => { + try { + // Extract archetypal comments + const archetypes = extractArchetypalComments(selections, topicData, clusterGroups, commentMap); + + // Transform to API format + const apiSelections = archetypes.map(group => ({ + layer_id: group.layerId, + cluster_id: group.clusterId, + topic_key: group.topicKey, + archetypal_comments: group.archetypes.map(a => ({ + comment_id: a.commentId, + comment_text: a.text, + coordinates: a.coordinates, + distance_to_centroid: a.distance + })) + })); + + // Send to API + const response = await fetch('/api/v3/topicAgenda/selections', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + conversation_id: conversation.conversation_id, + selections: apiSelections + }), + credentials: 'include' + }); + + const result = await response.json(); + + if (result.status === 'success') { + console.log('Selections saved successfully'); + // Show success UI + } else { + console.error('Failed to save selections:', result.message); + // Show error UI + } + + } catch (error) { + console.error('Error saving selections:', error); + // Show error UI + } +}; +``` + +## Migration Strategy + +When a new Delphi run creates different clusters: + +1. **Spatial Matching**: Use UMAP coordinates to find closest new clusters +2. **Comment Preservation**: Keep original comment IDs as anchors +3. **Confidence Scoring**: Calculate confidence based on: + - Distance between old and new cluster centroids + - Percentage of comments that moved together + - Topic name similarity (if available) +4. **User Notification**: Inform users when their selections need review + +## Security Considerations + +1. **Authentication**: Require valid user session +2. **Authorization**: Users can only save/retrieve their own selections +3. 
**Rate Limiting**: Implement rate limits on save operations +4. **Input Validation**: Validate all input data formats +5. **Data Privacy**: Ensure participant selections remain private + +## Performance Considerations + +1. **Caching**: Cache retrieved selections in memory +2. **Batch Operations**: Support bulk updates for multiple selections +3. **Indexing**: Create GSI if needed for query patterns +4. **Compression**: Consider compressing large selection sets + +## Future Enhancements + +1. **Selection History**: Track changes over time +2. **Sharing**: Allow users to share their topic agendas +3. **Analytics**: Aggregate anonymous selection patterns +4. **Templates**: Pre-defined topic agenda templates +5. **Export**: Allow users to export their selections + +## Success Metrics + +1. **Persistence Rate**: % of selections that survive Delphi re-runs +2. **Accuracy**: % of correctly matched topics after re-runs +3. **Performance**: API response times < 200ms +4. **Adoption**: % of users who save their selections \ No newline at end of file diff --git a/delphi/docs/UMAP_VISUALIZATION_PLAN.md b/delphi/docs/UMAP_VISUALIZATION_PLAN.md new file mode 100644 index 0000000000..db20101857 --- /dev/null +++ b/delphi/docs/UMAP_VISUALIZATION_PLAN.md @@ -0,0 +1,194 @@ +# UMAP Spatial Visualization Plan + +## Overview +Create a 2D scatter plot visualization showing all topic clusters in their actual UMAP coordinate space, with convex hulls defining semantic zones. This will be displayed as its own visualization card above the circle pack on the TopicHierarchy route. + +## Implementation Approach + +### 1. Data Fetching +- Fetch all UMAP coordinates for all 360 clusters using the proximity endpoint +- Get coordinates for all layers (0, 1, 2, 3) simultaneously +- Include topic names and cluster metadata + +### 2. 
Visualization Components +- **Scatter Plot**: All clusters as points in 2D UMAP space +- **Convex Hulls**: Boundaries around each layer's cluster groups +- **Color Coding**: Different colors for each layer (0=finest, 3=coarsest) +- **Interactive Labels**: Hover to see topic names +- **Zoom/Pan**: D3 zoom behavior for exploration + +### 3. Layout Structure +```jsx +
+ {/* UMAP Spatial Visualization - NEW */} +
+

Topic Spatial Distribution

+

UMAP projection showing semantic neighborhoods

+
+
+ + {/* Circle Pack Visualization - EXISTING */} +
+

Topic Hierarchy

+
+
+
+``` + +### 4. Technical Implementation + +#### Data Structure +```javascript +// Expected data format from proximity endpoint +const umapData = [ + { + cluster_id: "layer0_43", + layer: 0, + topic_name: "Improving Bowling Green Healthcare", + umap_x: 12.5, + umap_y: 8.2, + size: 15 + }, + // ... all 360 clusters +]; +``` + +#### D3 Visualization Code +```javascript +const createUMAPVisualization = () => { + // 1. Set up SVG with zoom behavior + // 2. Create scales for UMAP coordinates + // 3. Plot all points as circles + // 4. Calculate and draw convex hulls for each layer + // 5. Add hover interactions and labels + // 6. Color code by layer +} +``` + +#### Convex Hull Generation +```javascript +// Group clusters by layer and generate hulls +const layerGroups = d3.group(data, d => d.layer); +layerGroups.forEach((clusters, layer) => { + const points = clusters.map(d => [xScale(d.umap_x), yScale(d.umap_y)]); + const hull = d3.polygonHull(points); + // Draw hull polygon +}); +``` + +### 5. Visual Design +- **Point sizes**: Proportional to cluster size +- **Colors**: Layer-based (blues for Layer 0 → reds for Layer 3) +- **Hulls**: Semi-transparent fills with contrasting borders +- **Typography**: Clear labels for topic names on hover +- **Grid**: Subtle background grid for spatial reference + +### 6. Interactions +- **Hover**: Show topic name and cluster details +- **Click**: Highlight related clusters or navigate to topic details +- **Zoom**: Mouse wheel zoom with pan +- **Layer toggle**: Show/hide specific layers + +### 7. Benefits +- **True Spatial Relationships**: Shows actual semantic neighborhoods +- **Zone Identification**: Clear boundaries between topic areas +- **Scalability**: Can handle all 360 clusters simultaneously +- **Complementary**: Works alongside circle pack for different perspectives +- **Interactive**: Allows exploration of topic landscape + +### 8. 
API Requirements +- Modify proximity endpoint to return all clusters at once +- Include UMAP coordinates, topic names, sizes, and layer info +- Ensure efficient data transfer for all 360 points + +This visualization will reveal the actual "topology" of topic space rather than artificial hierarchical containers, showing where topics naturally cluster in semantic space. + +## DynamoDB Query Commands (Development Only) + +**IMPORTANT**: These scan commands are for development/debugging only. In production, we use proper query operations through the API endpoints, not direct DynamoDB scans. + +### Useful Development Queries + +#### Get All Clusters for Conversation +```bash +# Count total clusters +aws dynamodb scan --table-name Delphi_CommentClustersStructureKeywords \ + --filter-expression "conversation_id = :cid" \ + --expression-attribute-values '{":cid":{"S":"40523"}}' \ + --endpoint-url http://localhost:8000 | jq '.Items | length' + +# Count by layer +aws dynamodb scan --table-name Delphi_CommentClustersStructureKeywords \ + --filter-expression "conversation_id = :cid" \ + --expression-attribute-values '{":cid":{"S":"40523"}}' \ + --endpoint-url http://localhost:8000 | \ + jq '.Items | group_by(.layer_id.N) | map({layer: .[0].layer_id.N, count: length})' +``` + +#### Trace Topic Hierarchy Chains +```bash +# Get specific cluster with parent info +aws dynamodb scan --table-name Delphi_CommentClustersStructureKeywords \ + --filter-expression "conversation_id = :cid AND cluster_key = :key" \ + --expression-attribute-values '{":cid":{"S":"40523"}, ":key":{"S":"layer3_0"}}' \ + --endpoint-url http://localhost:8000 | \ + jq '.Items[0] | {cluster: .cluster_key.S, layer: .layer_id.N, parent: .parent_cluster}' + +# Find all children of a parent cluster +aws dynamodb scan --table-name Delphi_CommentClustersStructureKeywords \ + --filter-expression "conversation_id = :cid" \ + --expression-attribute-values '{":cid":{"S":"40523"}}' \ + --endpoint-url http://localhost:8000 | \ 
+ jq '.Items | map(select(.parent_cluster.M.cluster_id.N == "5" and .parent_cluster.M.layer_id.N == "2")) | map(.cluster_key.S)' +``` + +#### Get Topic Names +```bash +# Get topic name for specific cluster +aws dynamodb scan --table-name Delphi_CommentClustersLLMTopicNames \ + --filter-expression "conversation_id = :cid AND topic_key = :key" \ + --expression-attribute-values '{":cid":{"S":"40523"}, ":key":{"S":"layer0_43"}}' \ + --endpoint-url http://localhost:8000 | jq '.Items[0].topic_name.S' +``` + +#### Find Branching Structure +```bash +# Find parents with multiple children +aws dynamodb scan --table-name Delphi_CommentClustersStructureKeywords \ + --filter-expression "conversation_id = :cid" \ + --expression-attribute-values '{":cid":{"S":"40523"}}' \ + --endpoint-url http://localhost:8000 | \ + jq '.Items | group_by(.parent_cluster.M.cluster_id.N) | map({parent_id: .[0].parent_cluster.M.cluster_id.N, parent_layer: .[0].parent_cluster.M.layer_id.N, children_count: length}) | map(select(.children_count > 1))' +``` + +#### Sample Tree Traversal (Healthcare Branch) +``` +🏥 "Improving Bowling Green Healthcare" (layer0_43 - ROOT) + └── merges into "Healthcare in Bowling Green" (layer1_11) + └── merges into "Improved Healthcare Options" (layer2_5) + └── merges into "Healthcare in the Future" (layer3_0) +``` + +### Production API Endpoints (Use These Instead) + +#### For UMAP Visualization Data +```javascript +// Get all UMAP coordinates for all clusters +fetch(`/api/v3/topicMod/proximity?conversation_id=${conversationId}&layer_id=all`) + +// Get topics with names +fetch(`/api/v3/topicMod/topics?conversation_id=${conversationId}`) + +// Get hierarchy structure +fetch(`/api/v3/topicMod/hierarchy?conversation_id=${conversationId}`) +``` + +### Key Insights from Data Exploration +1. **Linear Chains**: Most topic relationships are linear chains rather than branching trees +2. **Layer Structure**: 237 Layer 0 → 82 Layer 1 → 31 Layer 2 → 10 Layer 3 +3. 
**Parent Direction**: "Parent" means "merges into" (Layer 0 → Layer 1 → Layer 2 → Layer 3) +4. **Root Clusters**: Some clusters at each layer have no parents (multiple entry points) +5. **Semantic Progression**: Topics flow from specific issues → local scope → broader concepts → future visions + +### Why UMAP Visualization is Better +The hierarchical tree structure shows mostly linear chains, which doesn't provide meaningful spatial containment for circle packing. UMAP coordinates will show the actual semantic neighborhoods and clustering zones that exist in the topic space. \ No newline at end of file diff --git a/delphi/docs/topic-moderation-system.md b/delphi/docs/topic-moderation-system.md new file mode 100644 index 0000000000..5b43eb72f2 --- /dev/null +++ b/delphi/docs/topic-moderation-system.md @@ -0,0 +1,311 @@ +# Topic-Based Moderation System for pol.is + +## Overview + +The Topic-Based Moderation System (TopicMod) is a powerful new feature that leverages hierarchical clustered topics generated by the Delphi pipeline to enable efficient comment moderation. Instead of moderating comments individually, moderators can now work with semantically grouped topics and apply bulk actions based on the UMAP proximity visualization. + +## Architecture + +### Backend Components + +#### API Endpoints (`/api/v3/topicMod/`) + +1. **GET /topics** - Retrieves topics with moderation status + - Parameters: `report_id`, optional `job_id` + - Returns: Topics organized by layer with moderation metadata + +2. **GET /topics/:topicKey/comments** - Gets comments for a specific topic + - Parameters: `report_id`, `topicKey` + - Returns: List of comments with UMAP coordinates and moderation status + +3. **POST /moderate** - Applies moderation actions + - Body: `report_id`, `topic_key` OR `comment_ids`, `action`, `moderator` + - Actions: `accept`, `reject`, `meta` + - Supports both topic-level and individual comment moderation + +4. 
**GET /proximity** - Retrieves UMAP coordinates for visualization + - Parameters: `report_id`, `layer_id` + - Returns: Comment positions for proximity-based moderation + +5. **GET /stats** - Moderation statistics + - Parameters: `report_id` + - Returns: Counts of pending, accepted, rejected, and meta topics + +#### Database Schema + +##### DynamoDB Tables + +- **Delphi_CommentClustersLLMTopicNames** (existing) + - Contains topic names and metadata generated by LLM + - Key: `conversation_id`, `topic_key` + +- **Delphi_TopicModerationStatus** (new) + - Tracks moderation decisions for topics + - Key: `conversation_id`, `topic_key` + - Attributes: `moderation_status`, `moderator`, `moderated_at`, `comment_count` + +- **Delphi_CommentClusters** (existing/extended) + - Contains comment-to-topic mappings with UMAP coordinates + - Key: `conversation_id`, `topic_key` + - Attributes: `comment_id`, `umap_x`, `umap_y`, `cluster_id`, `layer_id` + +##### PostgreSQL Updates + +The system updates the existing `comments` table in PostgreSQL: +- `mod`: Moderation status (-1=rejected, 0=meta, 1=accepted) +- `is_meta`: Boolean flag for meta comments + +### Frontend Components + +#### React Component Hierarchy + +``` +TopicModeration (index.js) +├── TopicTree (topic-tree.js) +│ ├── Layer selection +│ ├── Topic display with moderation controls +│ └── Bulk topic actions +├── TopicDetail (topic-detail.js) +│ ├── Comment list for specific topic +│ ├── Individual comment selection +│ └── Bulk comment actions +├── ProximityVisualization (proximity-visualization.js) +│ ├── UMAP scatter plot +│ ├── Cluster visualization +│ └── Interactive moderation +└── TopicStats (topic-stats.js) + ├── Moderation progress tracking + └── Statistics dashboard +``` + +#### Navigation Integration + +The TopicMod system is integrated into the client-admin conversation management interface: +- New "Topic Mod" tab in the conversation admin sidebar +- Routes: `/m/:conversation_id/topics/*` +- Follows existing 
patterns from comment moderation + +## Features + +### Hierarchical Topic Organization + +Topics are organized in hierarchical layers (0-2) representing different levels of granularity: +- **Layer 0**: Fine-grained topics (specific subtopics) +- **Layer 1**: Medium granularity +- **Layer 2**: Coarse-grained topics (broad themes) + +Moderators can view and work with any layer depending on their needs. + +### Bulk Moderation Actions + +#### Topic-Level Actions +- Accept/reject/mark as meta entire topics +- Automatically applies to all comments in the topic +- Maintains audit trail with moderator and timestamp + +#### Comment-Level Actions +- Select multiple comments within a topic +- Apply bulk actions to selected comments +- Individual comment moderation when needed + +### Proximity-Based Moderation + +The UMAP visualization provides spatial understanding of comment relationships: +- Comments positioned by semantic similarity +- Visual clustering shows related content +- Color coding by moderation status +- Interactive selection and bulk actions + +### Real-Time Statistics + +Comprehensive statistics tracking: +- Total topics by status (pending, accepted, rejected, meta) +- Completion rate progress bars +- Visual progress indicators +- Historical moderation data + +## Usage Workflow + +### 1. Topic Generation (Prerequisites) + +Before using TopicMod, ensure the Delphi pipeline has been run: + +```bash +# Generate embeddings and clusters +python 500_generate_embedding_umap_cluster.py + +# Generate topic names using LLM +python 600_generate_llm_topic_names.py + +# Create visualizations +python 700_datamapplot_for_layer.py +``` + +### 2. Topic-Level Moderation + +1. Navigate to **Topic Mod** in conversation admin +2. Select desired layer (0, 1, or 2) +3. Review topic names and sample comments +4. Apply bulk actions: Accept, Reject, or Mark as Meta +5. Use "View Comments" for detailed review + +### 3. Comment-Level Moderation + +1. Click "View Comments" on any topic +2.
Review individual comments in the topic +3. Select specific comments using checkboxes +4. Apply bulk actions to selected comments +5. Use "Select All" for topic-wide actions + +### 4. Proximity-Based Analysis + +1. Switch to "Proximity Map" tab +2. Select layer for visualization +3. Observe comment clustering patterns +4. Identify outliers or problematic clusters +5. Use spatial relationships to inform moderation decisions + +### 5. Progress Monitoring + +1. View "Statistics" tab for overview +2. Track completion rates by status +3. Monitor moderation velocity +4. Generate reports for team coordination + +## Integration with Existing Systems + +### Comment Moderation +- TopicMod works alongside existing comment moderation +- Updates flow to traditional mod queue +- Maintains compatibility with Jigsaw Perspective API +- Preserves existing moderation workflows + +### Delphi Pipeline +- Leverages existing topic generation infrastructure +- Uses established DynamoDB schema patterns +- Integrates with narrative report generation +- Compatible with batch processing workflows + +### UMAP Visualization +- Built on existing EVōC clustering system +- Uses DataMapPlot visualization framework +- Maintains consistency with report visualizations +- Supports multi-layer analysis + +## Performance Considerations + +### Database Optimization +- DynamoDB queries optimized for conversation-level access +- PostgreSQL updates batched for efficiency +- Minimal impact on existing comment moderation performance +- Caching strategies for frequently accessed topics + +### Frontend Performance +- Component-level loading states +- Incremental data fetching +- Efficient re-rendering with React hooks +- SVG-based visualizations for performance + +### Scalability +- Designed to handle conversations with thousands of comments +- Layer-based organization reduces cognitive load +- Bulk operations minimize API calls +- Real-time polling with reasonable intervals + +## Error Handling and Edge Cases + 
+### Missing Topic Data +- Graceful fallback when Delphi data unavailable +- Clear messaging about pipeline requirements +- Fallback to traditional comment moderation + +### Network Failures +- Retry mechanisms for failed requests +- Optimistic UI updates with rollback +- Clear error messaging and recovery options + +### Data Consistency +- Atomic operations for topic-level moderation +- Transaction-like behavior for bulk actions +- Conflict resolution for concurrent moderation + +## Security and Permissions + +### Access Control +- Inherits existing conversation-level permissions +- Moderator role verification for all actions +- Audit trail for all moderation decisions + +### Data Protection +- No additional PII exposure +- Secure API endpoints with parameter validation +- CORS and authentication following existing patterns + +### Audit Trail +- All moderation actions logged with timestamp +- Moderator identity tracking +- Reversible actions where appropriate + +## Future Enhancements + +### Advanced Features +- Machine learning suggestions for topic categorization +- Automated pre-moderation based on topic patterns +- Integration with external content analysis APIs +- Custom topic naming and organization + +### UI/UX Improvements +- Drag-and-drop topic organization +- Advanced filtering and search +- Customizable dashboards +- Mobile-responsive design + +### Analytics Integration +- Topic-level engagement metrics +- Moderation efficiency tracking +- Bias detection and reporting +- A/B testing framework for moderation strategies + +## Technical Requirements + +### Server Dependencies +- Node.js with TypeScript support +- AWS SDK for DynamoDB access +- PostgreSQL client libraries +- Express.js framework + +### Client Dependencies +- React with hooks support +- theme-ui for consistent styling +- React Router for navigation +- SVG manipulation for visualizations + +### Infrastructure +- DynamoDB tables with appropriate indices +- PostgreSQL database with comment 
tables +- Redis cache for performance optimization (optional) +- CDN for static asset delivery + +## Deployment Notes + +### Development Setup +1. Ensure Delphi pipeline is configured +2. Create required DynamoDB tables +3. Update client-admin routing +4. Run both server and client builds + +### Production Deployment +1. Deploy server with new API endpoints +2. Update client-admin bundle +3. Run database migrations if needed +4. Monitor performance and error rates + +### Monitoring +- API endpoint response times +- DynamoDB read/write capacity +- PostgreSQL query performance +- Client-side error tracking + +--- + +This documentation provides a comprehensive overview of the TopicMod system. For specific implementation details, refer to the source code in `/server/src/routes/delphi/topicMod.ts` and `/client-admin/src/components/conversation-admin/topic-moderation/`. \ No newline at end of file diff --git a/delphi/example.env b/delphi/example.env index d3682d4c27..d7c7b87a1a 100644 --- a/delphi/example.env +++ b/delphi/example.env @@ -13,6 +13,12 @@ OLLAMA_MODEL=llama3.1:8b # For local development, use localhost OLLAMA_HOST=http://ollama:11434 +# Sentence Transformer configuration +# Default model for English text (384-dimensional embeddings) +SENTENCE_TRANSFORMER_MODEL=all-MiniLM-L6-v2 +# Alternative multilingual model (uncomment to use) +# SENTENCE_TRANSFORMER_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 + # Database configuration DATABASE_HOST=localhost DATABASE_NAME=polis_subset diff --git a/delphi/polismath/conversation/conversation.py b/delphi/polismath/conversation/conversation.py index a905130624..563451ffa0 100644 --- a/delphi/polismath/conversation/conversation.py +++ b/delphi/polismath/conversation/conversation.py @@ -763,135 +763,6 @@ def _compute_participant_info(self) -> None: logger.info(f"Participant info computation completed in {time.time() - start_time:.2f}s") - def _importance_metric(self, A: int, P: int, S: int, E: float) -> 
float: - """ - Calculate the importance metric for a comment. - Direct port of the Clojure importance-metric function. - - Args: - A: Number of agree votes - P: Number of pass votes - S: Total number of votes - E: Extremity value - - Returns: - Importance metric value - """ - p = (P + 1.0) / (S + 2.0) - a = (A + 1.0) / (S + 2.0) - return (1.0 - p) * (E + 1.0) * a - - def _priority_metric(self, is_meta: bool, A: int, P: int, S: int, E: float) -> float: - """ - Calculate the priority metric for a comment. - Direct port of the Clojure priority-metric function. - - Args: - is_meta: Whether the comment is a meta comment - A: Number of agree votes - P: Number of pass votes - S: Total number of votes - E: Extremity value - - Returns: - Priority metric value - """ - import math - - # Meta comments have a fixed high priority (equivalent to meta-priority in Clojure) - META_PRIORITY = 7.0 - - if is_meta: - return META_PRIORITY ** 2 - else: - # Regular priority calculation matching Clojure formula - importance = self._importance_metric(A, P, S, E) - # Scale by a factor which lets new comments bubble up - scaling_factor = 1.0 + (8.0 * (2.0 ** (-S / 5.0))) - return (importance * scaling_factor) ** 2 - - def _compute_comment_priorities(self) -> None: - """ - Compute comment priorities for Clojure format compatibility. - - In the Clojure version, comment priorities are used to determine which - comments to show users. This method computes similar values in a format - compatible with the Clojure output. 
- """ - # Import needed libraries - import numpy as np - import pandas as pd - import time - - start_time = time.time() - logger.info("Computing comment priorities...") - - # Initialize comment priorities - self.comment_priorities = {} - - # If we don't have a rating matrix, return empty priorities - if self.rating_mat.values.shape[0] == 0 or self.rating_mat.values.shape[1] == 0: - logger.info("No rating matrix data, skipping comment priorities") - return - - try: - # Get the list of comment IDs - comment_ids = self.rating_mat.colnames() - - # For each comment, calculate priority matching Clojure's calculation - for cid in comment_ids: - try: - # Determine if this is a meta comment - is_meta = cid in self.meta_tids - - # Get vote counts from group_votes if available - # Aggregate votes across all groups, just like in Clojure - A, D, S, P = 0, 0, 0, 0 - - # Sum votes across all groups (matches the Clojure reduce logic) - for gid, group_data in self.group_votes.items(): - if 'votes' in group_data and cid in group_data['votes']: - vote_data = group_data['votes'][cid] - A += vote_data.get('A', 0) - D += vote_data.get('D', 0) - S += vote_data.get('S', 0) - - # Calculate passes (P) as defined in Clojure: P = S - (A + D) - P = S - (A + D) - - # Get extremity value from PCA - E = 0 - if hasattr(self, 'pca') and self.pca and 'comment_extremity' in self.pca: - # Get comment index in the PCA data - try: - comment_idx = self.rating_mat.colnames().index(cid) - if comment_idx < len(self.pca['comment_extremity']): - E = self.pca['comment_extremity'][comment_idx] - except (ValueError, IndexError): - E = 0 - - # Calculate priority using the same formula as Clojure - priority = self._priority_metric(is_meta, A, P, S, E) - - # Match Clojure's fixed values for low-vote comments - if S < 7: - # In Clojure, these often get a fixed value of 49 - priority = 49 - - # Store priority as an integer to match Clojure format - self.comment_priorities[cid] = int(priority) - except Exception as e: 
- logger.warning(f"Error computing priority for comment {cid}: {e}") - # Default priority matching Clojure's common value for low-vote comments - self.comment_priorities[cid] = 49 - - logger.info(f"Comment priorities computation completed in {time.time() - start_time:.2f}s") - - except Exception as e: - logger.error(f"Error computing comment priorities: {e}") - # Make sure we have minimal comment priorities even if computation fails - for cid in self.rating_mat.colnames(): - self.comment_priorities[cid] = 49 # Clojure's common default value def recompute(self) -> 'Conversation': """ @@ -924,9 +795,6 @@ def recompute(self) -> 'Conversation': # Compute participant info result._compute_participant_info() - # Compute comment priorities (for Clojure format compatibility) - result._compute_comment_priorities() - return result def get_summary(self) -> Dict[str, Any]: diff --git a/delphi/polismath/database/dynamodb.py b/delphi/polismath/database/dynamodb.py index fdebd7cc7a..9952bb20ef 100644 --- a/delphi/polismath/database/dynamodb.py +++ b/delphi/polismath/database/dynamodb.py @@ -441,7 +441,7 @@ def write_conversation(self, conv) -> bool: # Create composite key for group representativeness zid_tick_gid = f"{zid}:{math_tick}:{group_id}" - logger.info(f"working on comment {comment_id}") + logger.debug(f"working on comment {comment_id}") batch.put_item(Item={ 'zid_tick_gid': zid_tick_gid, @@ -463,7 +463,7 @@ def write_conversation(self, conv) -> bool: # Create composite key for group representativeness zid_tick_gid = f"{zid}:{math_tick}:{group_id}" - logger.info(f"working on comment {comment_id}") + logger.debug(f"working on comment {comment_id}") batch.put_item(Item={ 'zid_tick_gid': zid_tick_gid, diff --git a/delphi/run_delphi.py b/delphi/run_delphi.py index 798a3dd388..48a00483a1 100644 --- a/delphi/run_delphi.py +++ b/delphi/run_delphi.py @@ -120,6 +120,22 @@ def main(): if extremity_exit_code != 0: print(f"{RED}Warning: Extremity calculation failed with exit code 
{extremity_exit_code}{NC}") + print("Continuing with priority calculation...") + + # Calculate comment priorities using group-based extremity + print(f"{GREEN}Calculating comment priorities with group-based extremity...{NC}") + priority_command = [ + "python", "/app/umap_narrative/502_calculate_priorities.py", + f"--conversation_id={zid}", + ] + if verbose_arg: + priority_command.append(verbose_arg) + + priority_process = subprocess.run(priority_command) + priority_exit_code = priority_process.returncode + + if priority_exit_code != 0: + print(f"{RED}Warning: Priority calculation failed with exit code {priority_exit_code}{NC}") print("Continuing with visualization...") if pipeline_exit_code == 0: diff --git a/delphi/run_delphi.sh b/delphi/run_delphi.sh index e2a374f9f1..53d7720466 100755 --- a/delphi/run_delphi.sh +++ b/delphi/run_delphi.sh @@ -91,7 +91,7 @@ else BATCH_SIZE_ARG="--batch-size=50000" # Default batch size fi -# Run the math pipeline +# Run the math pipeline echo -e "${GREEN}Running math pipeline...${NC}" python /app/polismath/run_math_pipeline.py --zid=${ZID} ${MAX_VOTES_ARG} ${BATCH_SIZE_ARG} MATH_EXIT_CODE=$? @@ -112,6 +112,15 @@ python /app/umap_narrative/501_calculate_comment_extremity.py --zid=${ZID} ${VER EXTREMITY_EXIT_CODE=$? if [ $EXTREMITY_EXIT_CODE -ne 0 ]; then echo -e "${RED}Warning: Extremity calculation failed with exit code ${EXTREMITY_EXIT_CODE}${NC}" + echo "Continuing with priority calculation..." +fi + +# Calculate comment priorities using group-based extremity +echo -e "${GREEN}Calculating comment priorities with group-based extremity...${NC}" +python /app/umap_narrative/502_calculate_priorities.py --conversation_id=${ZID} ${VERBOSE} +PRIORITY_EXIT_CODE=$? +if [ $PRIORITY_EXIT_CODE -ne 0 ]; then + echo -e "${RED}Warning: Priority calculation failed with exit code ${PRIORITY_EXIT_CODE}${NC}" echo "Continuing with visualization..." 
fi diff --git a/delphi/umap_narrative/500_generate_embedding_umap_cluster.py b/delphi/umap_narrative/500_generate_embedding_umap_cluster.py index 0127494099..05d05a7417 100755 --- a/delphi/umap_narrative/500_generate_embedding_umap_cluster.py +++ b/delphi/umap_narrative/500_generate_embedding_umap_cluster.py @@ -162,7 +162,9 @@ def process_comments(comments, conversation_id): # Generate embeddings with SentenceTransformer logger.info("Generating embeddings with SentenceTransformer...") - embedding_model = SentenceTransformer("all-MiniLM-L6-v2") + model_name = os.environ.get("SENTENCE_TRANSFORMER_MODEL", "all-MiniLM-L6-v2") + logger.info(f"Using model: {model_name}") + embedding_model = SentenceTransformer(model_name) document_vectors = embedding_model.encode(comment_texts, show_progress_bar=True) # Generate 2D projection with UMAP diff --git a/delphi/umap_narrative/502_calculate_priorities.py b/delphi/umap_narrative/502_calculate_priorities.py new file mode 100755 index 0000000000..f592e5dee5 --- /dev/null +++ b/delphi/umap_narrative/502_calculate_priorities.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python3 +""" +502_calculate_priorities.py + +Calculate comment priorities using group-based extremity values. + +This script runs after extremity calculation (501_calculate_comment_extremity.py) +and computes final priority values using the group-based extremity data. +""" + +import argparse +import boto3 +import json +import logging +import os +import sys +import time +from boto3.dynamodb.conditions import Key +from decimal import Decimal +from typing import Dict, List, Optional, Any + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +class PriorityCalculator: + """Calculate comment priorities using group-based extremity values.""" + + def __init__(self, conversation_id: int, endpoint_url: str = None): + """ + Initialize the priority calculator. 
+ + Args: + conversation_id: The conversation ID to process + endpoint_url: DynamoDB endpoint URL (optional) + """ + self.conversation_id = conversation_id + self.endpoint_url = endpoint_url + + # Initialize DynamoDB connection + self.dynamodb = boto3.resource( + 'dynamodb', + endpoint_url=endpoint_url, + region_name='us-east-1', + aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', 'dummy'), + aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', 'dummy') + ) + + # Get table references + self.comment_routing_table = self.dynamodb.Table('Delphi_CommentRouting') + self.comment_extremity_table = self.dynamodb.Table('Delphi_CommentExtremity') + + logger.info(f"Initialized priority calculator for conversation {conversation_id}") + + def _importance_metric(self, A: int, P: int, S: int, E: float) -> float: + """ + Calculate importance metric (matches Clojure implementation). + + Args: + A: Number of agree votes + P: Number of pass votes + S: Total number of votes + E: Extremity value + + Returns: + Importance metric value + """ + p = (P + 1) / (S + 2) + a = (A + 1) / (S + 2) + return (1 - p) * (E + 1) * a + + def _priority_metric(self, is_meta: bool, A: int, P: int, S: int, E: float) -> float: + """ + Calculate priority metric (matches Clojure implementation). + + Args: + is_meta: Whether the comment is a meta comment + A: Number of agree votes + P: Number of pass votes + S: Total number of votes + E: Extremity value + + Returns: + Priority metric value + """ + META_PRIORITY = 7.0 + if is_meta: + return META_PRIORITY ** 2 + else: + importance = self._importance_metric(A, P, S, E) + scaling_factor = 1.0 + (8.0 * (2.0 ** (-S / 5.0))) + return (importance * scaling_factor) ** 2 + + def get_comment_extremity(self, comment_id: str) -> float: + """ + Get extremity value for a comment from DynamoDB. 
+ + Args: + comment_id: The comment ID + + Returns: + Extremity value (0.0 to 1.0) or 0.0 if not found + """ + try: + response = self.comment_extremity_table.get_item( + Key={ + 'conversation_id': str(self.conversation_id), + 'comment_id': str(comment_id) + } + ) + if 'Item' in response: + return float(response['Item'].get('extremity_value', 0.0)) + else: + logger.debug(f"No extremity data found for comment {comment_id}") + return 0.0 + except Exception as e: + logger.warning(f"Error retrieving extremity for comment {comment_id}: {e}") + return 0.0 + + def get_comment_routing_data(self) -> List[Dict[str, Any]]: + """ + Get all comment routing data for the conversation. + + Returns: + List of comment routing items + """ + logger.info(f"Querying GSI 'zid-index' for conversation {self.conversation_id}...") + all_items = [] + try: + # Query the GSI where the partition key 'zid' matches the conversation_id + response = self.comment_routing_table.query( + IndexName='zid-index', + KeyConditionExpression=Key('zid').eq(str(self.conversation_id)) + ) + all_items.extend(response.get('Items', [])) + + # Handle pagination if the result set is large + while 'LastEvaluatedKey' in response: + logger.info("Paginating to fetch more comment routing data...") + response = self.comment_routing_table.query( + IndexName='zid-index', + KeyConditionExpression=Key('zid').eq(str(self.conversation_id)), + ExclusiveStartKey=response['LastEvaluatedKey'] + ) + all_items.extend(response.get('Items', [])) + + logger.info(f"Found {len(all_items)} comment routing entries via GSI query.") + return all_items + + except Exception as e: + logger.error(f"Error querying comment routing data from GSI: {e}") + return [] + + def calculate_comment_updates(self, comment_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Calculate priorities and return a list of items to be updated, + including their primary keys. 
+ """ + updates = [] + for item in comment_data: + try: + comment_id = item.get('comment_id') + zid_tick = item.get('zid_tick') # The primary key we need for the update + stats = item.get('stats', {}) + + if not all([comment_id, zid_tick, stats]): + logger.warning(f"Skipping item due to missing data: {item}") + continue + + A = int(stats.get('agree', 0)) + D = int(stats.get('disagree', 0)) + S = int(stats.get('total', 0)) + P = S - (A + D) + + E = self.get_comment_extremity(comment_id) + is_meta = False # Assuming no meta comments for now + + priority = self._priority_metric(is_meta, A, P, S, E) + + # Prepare the update payload with the full key and the new priority + updates.append({ + 'Key': { + 'zid_tick': zid_tick, + 'comment_id': comment_id + }, + 'UpdateExpression': 'SET priority = :p', + 'ExpressionAttributeValues': {':p': int(priority)} + }) + + logger.debug(f"Comment {comment_id}: A={A}, P={P}, S={S}, E={E:.4f}, priority={int(priority)}") + + except Exception as e: + logger.warning(f"Error preparing update for comment {item.get('comment_id', 'N/A')}: {e}") + + return updates + + def update_priorities_in_dynamodb(self, updates: List[Dict[str, Any]]) -> bool: + """ + Update priority values in the comment routing table. + + Args: + priorities: Dictionary mapping comment_id to priority value + + Returns: + True if successful, False otherwise + """ + logger.info(f"Updating {len(updates)} priority values in DynamoDB") + try: + # Use a BatchWriter to efficiently handle multiple updates. + with self.comment_routing_table.batch_writer(overwrite_by_pkeys=['zid_tick', 'comment_id']) as batch: + for item_update in updates: + # NOTE: BatchWriter does not support update_item. We must put the entire item. + # This requires fetching the full item first or knowing its structure. + # A loop of update_item is simpler and already a huge improvement. 
+ self.comment_routing_table.update_item(**item_update) + + logger.info("Successfully updated all priorities in DynamoDB") + return True + + except Exception as e: + logger.error(f"Error updating priorities in DynamoDB: {e}") + return False + + def run(self) -> bool: + """ + Run the complete priority calculation and update process. + """ + try: + start_time = time.time() + + # 1. Get all necessary data efficiently + comment_data = self.get_comment_routing_data() + + if not comment_data: + logger.warning("No comment routing data found - conversation likely has no votes yet. This is normal.") + return True + + # 2. Calculate priorities and prepare update payloads + updates_to_perform = self.calculate_comment_updates(comment_data) + + if not updates_to_perform: + logger.warning("No valid comments to update.") + return True + + # 3. Update DynamoDB + success = self.update_priorities_in_dynamodb(updates_to_perform) + + elapsed = time.time() - start_time + if success: + logger.info(f"Priority calculation and update completed successfully for {len(updates_to_perform)} comments in {elapsed:.2f}s") + + # Log some statistics (restored from original) + priority_values = [item['ExpressionAttributeValues'][':p'] for item in updates_to_perform] + if priority_values: + avg_priority = sum(priority_values) / len(priority_values) + max_priority = max(priority_values) + min_priority = min(priority_values) + logger.info(f"Priority statistics: min={min_priority}, max={max_priority}, avg={avg_priority:.2f}") + + else: + logger.error(f"Priority update failed after {elapsed:.2f}s") + + return success + + except Exception as e: + logger.critical(f"A critical error occurred in the run process: {e}", exc_info=True) + return False + +def main(): + """Main function.""" + parser = argparse.ArgumentParser(description='Calculate comment priorities using group-based extremity') + parser.add_argument('--conversation_id', '--zid', type=int, required=True, help='Conversation ID to process') + 
parser.add_argument('--endpoint-url', type=str, default=os.environ.get('DYNAMODB_ENDPOINT', 'http://host.docker.internal:8000'), help='DynamoDB endpoint URL') + parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose logging') + + args = parser.parse_args() + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + calculator = PriorityCalculator(args.conversation_id, args.endpoint_url) + success = calculator.run() + + if success: + logger.info("Priority calculation completed successfully.") + sys.exit(0) + else: + logger.error("Priority calculation failed.") + sys.exit(1) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/delphi/umap_narrative/600_generate_llm_topic_names.py b/delphi/umap_narrative/600_generate_llm_topic_names.py deleted file mode 100755 index 46a8457444..0000000000 --- a/delphi/umap_narrative/600_generate_llm_topic_names.py +++ /dev/null @@ -1,848 +0,0 @@ -#!/usr/bin/env python3 -""" -Generate LLM topic names for clusters using Ollama. - -This script connects to PostgreSQL and DynamoDB to get necessary data, -generates meaningful topic names for clusters using Ollama LLM, -and saves the results to DynamoDB. 
- -Usage: - python generate_llm_topic_names.py --conversation_id 36324 --layer 0 - # Model will default to OLLAMA_MODEL environment variable or "llama3.1:8b" if not specified - -Features: -- Runs as a separate step after the core clustering pipeline -- Directly connects to PostgreSQL for comment texts -- Reads cluster data from DynamoDB -- Generates descriptive topic names with LLM -- Only loads what's needed, making it memory efficient -- Can be run for specific layers or a full conversation -""" - -import os -import sys -import json -import time -import logging -import argparse -import numpy as np -import pandas as pd -from datetime import datetime -from tqdm import tqdm -from pathlib import Path -from boto3.dynamodb.conditions import Key, Attr - -# Import from local modules -from polismath_commentgraph.utils.storage import DynamoDBStorage -from polismath_commentgraph.utils.converter import DataConverter -from polismath_commentgraph.schemas.dynamo_models import LLMTopicName - -# Set up logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -def setup_environment(dynamo_endpoint=None): - """Set up environment variables for DynamoDB connection.""" - # DynamoDB settings (for local DynamoDB) - if dynamo_endpoint: - os.environ['DYNAMODB_ENDPOINT'] = dynamo_endpoint - elif not os.environ.get('DYNAMODB_ENDPOINT'): - # Log the endpoint being used - endpoint = os.environ.get('DYNAMODB_ENDPOINT') - logger.info(f"Using DynamoDB endpoint: {endpoint}") - - # Set up dummy credentials for local DynamoDB if not already set - if not os.environ.get('AWS_ACCESS_KEY_ID'): - os.environ['AWS_ACCESS_KEY_ID'] = 'fakeMyKeyId' - - if not os.environ.get('AWS_SECRET_ACCESS_KEY'): - os.environ['AWS_SECRET_ACCESS_KEY'] = 'fakeSecretAccessKey' - - if not os.environ.get('AWS_DEFAULT_REGION'): - os.environ['AWS_DEFAULT_REGION'] = 'us-east-1' - - logger.info(f"DynamoDB endpoint: 
{os.environ.get('DYNAMODB_ENDPOINT')}") - logger.info(f"AWS region: {os.environ.get('AWS_DEFAULT_REGION')}") - -def check_ollama_availability(): - """Check if Ollama is available and working.""" - try: - import ollama - import os - - # Check if OLLAMA_HOST is set in environment - ollama_host = os.environ.get('OLLAMA_HOST') - if ollama_host: - logger.info(f"Using OLLAMA_HOST from environment: {ollama_host}") - # Try to set the host for the Ollama client - try: - # For newer versions of ollama client - ollama.client._CLIENT_BASE_URL = ollama_host - logger.info(f"Set Ollama client base URL to {ollama_host}") - except: - logger.warning("Could not set ollama.client._CLIENT_BASE_URL, falling back to environment variable") - # The client will pick up OLLAMA_HOST automatically in newer versions - pass - - # Just check if we can connect to Ollama API - # Don't try to list models which might be causing issues - logger.info("Checking Ollama connection...") - # Get the model name from environment or default to llama3.1:8b - model_name = os.environ.get("OLLAMA_MODEL") - logger.info(f"Checking Ollama connection with model: {model_name}") - # Simple ping to verify Ollama is running - ollama.embeddings(model=model_name, prompt="test") - logger.info("Ollama is available") - return True - except ImportError: - logger.error("Ollama not installed. 
Please install with: pip install ollama") - return False - except Exception as e: - logger.error(f"Error connecting to Ollama: {e}") - return False - -def get_conversation_output_path(conversation_id, output_base_dir="polis_data"): - """Get the output path for visualization files for a conversation.""" - # Handle string or integer conversation_id - conversation_str = str(conversation_id) - - # Construct path to visualization directory - output_path = os.path.join(output_base_dir, conversation_str, "python_output") - - # Create directories for multilayer visualizations - multilayer_dir = os.path.join(output_path, "comments_multilayer") - enhanced_dir = os.path.join(output_path, "comments_enhanced_multilayer") - os.makedirs(multilayer_dir, exist_ok=True) - os.makedirs(enhanced_dir, exist_ok=True) - - return { - "base": output_path, - "multilayer": multilayer_dir, - "enhanced": enhanced_dir - } - -def load_comment_texts(conversation_id, dynamo_storage=None, output_base_dir="polis_data"): - """ - Load comment texts directly from PostgreSQL. 
- - Args: - conversation_id: Conversation ID - dynamo_storage: Optional DynamoDBStorage instance - output_base_dir: Base directory for output files (not used) - - Returns: - Dictionary mapping comment_id to text or None if not found - """ - # Connect to PostgreSQL directly - from polismath_commentgraph.utils.storage import PostgresClient - - logger.info(f"Loading comments directly from PostgreSQL for conversation {conversation_id}") - postgres_client = PostgresClient() - - try: - # Initialize connection - postgres_client.initialize() - - # Get comments - comments = postgres_client.get_comments_by_conversation(int(conversation_id)) - - if not comments: - logger.error(f"No comments found in PostgreSQL for conversation {conversation_id}") - return None - - # Create a dictionary of comment_id to text - comment_dict = {comment['tid']: comment['txt'] for comment in comments if comment.get('txt')} - - logger.info(f"Loaded {len(comment_dict)} comments from PostgreSQL") - return comment_dict - - except Exception as e: - logger.error(f"Error loading comments from PostgreSQL: {e}") - return None - - finally: - # Clean up connection - postgres_client.shutdown() - -def load_layer_data(conversation_id, layer_id, dynamo_storage=None, output_base_dir="polis_data"): - """ - Load cluster data for a specific layer. 
- - Args: - conversation_id: Conversation ID - layer_id: Layer ID to load - dynamo_storage: Optional DynamoDBStorage instance - output_base_dir: Base directory for output files - - Returns: - Dictionary with cluster data or None if not found - """ - if not dynamo_storage: - logger.error("DynamoDB storage is required") - return None - - # Initialize return data - layer_data = { - "clusters": {}, - "characteristics": {}, - "enhanced_topic_names": {}, - "comment_texts": None - } - - # Load comment texts directly from PostgreSQL - layer_data["comment_texts"] = load_comment_texts(conversation_id, dynamo_storage, output_base_dir) - if not layer_data["comment_texts"]: - logger.warning("Could not load comment texts from PostgreSQL") - - # Load cluster assignments from DynamoDB using CommentClusters table - try: - # Get conversation metadata to make sure the layer exists - meta = dynamo_storage.get_conversation_meta(conversation_id) - if not meta or 'cluster_layers' not in meta: - logger.error(f"No metadata or cluster_layers found for conversation {conversation_id}") - return None - - # Check if layer exists - cluster_layers = meta.get('cluster_layers', []) - layer_exists = False - for layer in cluster_layers: - if layer.get('layer_id') == layer_id: - layer_exists = True - break - - if not layer_exists: - logger.error(f"Layer {layer_id} does not exist in metadata") - return None - - # Query CommentClusters to get cluster assignments - logger.info(f"Loading clusters for layer {layer_id} from DynamoDB...") - - # Get all comment clusters for this conversation - table = dynamo_storage.dynamodb.Table(dynamo_storage.table_names['comment_clusters']) - response = table.query( - KeyConditionExpression=Key('conversation_id').eq(conversation_id) - ) - clusters = response.get('Items', []) - - # Handle pagination if needed - while 'LastEvaluatedKey' in response: - response = table.query( - KeyConditionExpression=Key('conversation_id').eq(conversation_id), - 
ExclusiveStartKey=response['LastEvaluatedKey'] - ) - clusters.extend(response.get('Items', [])) - - # Create a mapping of cluster IDs to comment IDs for this layer - for cluster in clusters: - layer_cluster_id = cluster.get(f'layer{layer_id}_cluster_id') - if layer_cluster_id is not None: - comment_id = cluster.get('comment_id') - if comment_id is not None: - if layer_cluster_id not in layer_data["clusters"]: - layer_data["clusters"][layer_cluster_id] = [] - layer_data["clusters"][layer_cluster_id].append(comment_id) - - logger.info(f"Loaded {len(layer_data['clusters'])} clusters for layer {layer_id}") - except Exception as e: - logger.error(f"Error loading clusters from DynamoDB: {e}") - return None - - # Try to load cluster characteristics from DynamoDB - try: - logger.info(f"Loading cluster characteristics for layer {layer_id} from DynamoDB...") - characteristics = dynamo_storage.get_cluster_characteristics_by_layer( - conversation_id, layer_id - ) - - # Convert to dictionary - for char in characteristics: - cluster_id = char.get('cluster_id') - if cluster_id is not None: - # Convert cluster_id to int for consistent keys - layer_data["characteristics"][int(cluster_id)] = char - - logger.info(f"Loaded {len(characteristics)} cluster characteristics from DynamoDB") - except Exception as e: - logger.error(f"Error loading cluster characteristics from DynamoDB: {e}") - - # Validate that we have at least the minimum required data - if not layer_data["clusters"]: - logger.error("No clusters found for this layer") - return None - - if not layer_data["characteristics"]: - logger.warning("No cluster characteristics found, will generate topic names without them") - - return layer_data - -def generate_topic_names(layer_data, conversation_name=None, model_name=None, provider_type=None): - """ - Generate topic names using Ollama LLM. 
- - Args: - layer_data: Dictionary with layer data - conversation_name: Optional name of the conversation for context - model_name: Ollama model name to use - - Returns: - Dictionary mapping cluster IDs to topic names - """ - try: - import ollama - import os - - # Check if OLLAMA_HOST is set in environment - ollama_host = os.environ.get('OLLAMA_HOST') - if ollama_host: - logger.info(f"Using OLLAMA_HOST from environment: {ollama_host}") - # Try to set the host for the Ollama client - try: - # For newer versions of ollama client - ollama.client._CLIENT_BASE_URL = ollama_host - logger.info(f"Set Ollama client base URL to {ollama_host}") - except: - logger.warning("Could not set ollama.client._CLIENT_BASE_URL, falling back to environment variable") - # The client will pick up OLLAMA_HOST automatically in newer versions - pass - except ImportError: - logger.error("Ollama not installed. Please install with: pip install ollama") - return {} - - logger.info(f"Generating topic names using Ollama model {model_name}...") - - # Get cluster assignments and comment texts - clusters = layer_data["clusters"] - comment_texts = layer_data["comment_texts"] - characteristics = layer_data["characteristics"] - - # If no comment texts available, can't generate names - if not comment_texts: - logger.error("No comment texts available, cannot generate topic names") - return {} - - # Function to get topic labels via Ollama - def get_topic_name(cluster_id): - # Get comment IDs for this cluster - comment_ids = clusters.get(cluster_id, []) - - # Get comments for this cluster - cluster_comments = [] - for comment_id in comment_ids: - if comment_id in comment_texts: - cluster_comments.append(comment_texts[comment_id]) - - # If no comments available, use a generic name - if not cluster_comments: - return f"Topic {cluster_id}" - - # Get characteristics if available - top_words = [] - if cluster_id in characteristics: - char = characteristics[cluster_id] - top_words = char.get('top_words', []) - - # 
Create prompt - prompt_prefix = f"For conversation {conversation_name or 'topic'}: " if conversation_name else "" - - # Add keywords to the prompt if available - if top_words: - prompt_prefix += f"Keywords for this cluster: {', '.join(top_words[:5])}. " - - prompt = f"{prompt_prefix}Please provide a concise topic label (3-5 words max) for the following group of comments. Return ONLY the label without any intro text like 'Here are the topic labels:' or quotation marks:\n\n" - - # Include up to 5 example comments (prefer shorter ones) - comment_lengths = [(i, len(comment)) for i, comment in enumerate(cluster_comments)] - comment_lengths.sort(key=lambda x: x[1]) # Sort by length - selected_indices = [idx for idx, _ in comment_lengths[:min(5, len(comment_lengths))]] - - for j, idx in enumerate(selected_indices): - prompt += f"{j+1}. {cluster_comments[idx]}\n" - - prompt += "\nTopic label:" - - try: - # Using the Ollama API with the chat endpoint - response = ollama.chat( - model=model_name, - messages=[{"role": "user", "content": prompt}] - ) - - # Get the response text from the appropriate field - if isinstance(response, dict) and 'message' in response and 'content' in response['message']: - topic_text = response['message']['content'].strip() - else: - # Handle API changes or different response formats - logger.warning(f"Unexpected response format: {response}") - if hasattr(response, 'strip'): - topic_text = response.strip() - else: - return f"Topic {cluster_id}" - - # Try to extract just the label - lines = topic_text.split('\n') - topic = lines[0] # First line is likely the label - - # Further clean up to ensure it's just a label - topic = topic.replace("Topic label:", "").strip() - - # Remove common LLM prefixes - prefixes_to_remove = [ - "Here are the topic labels:", - "Here are the topic labels", - "Here is the topic label:", - "Here is the topic label", - "The topic label is:", - "The topic label is", - "Topic name:", - "Topic name", - "Label:", - "Label" - ] - 
- for prefix in prefixes_to_remove: - if topic.startswith(prefix): - topic = topic[len(prefix):].strip() - - # Remove quotes if they're present (handle any quote combination) - topic = topic.strip('"\'') # Strip both double and single quotes - - # Remove common formats like "1. Topic Name" or "- Topic Name" - if topic.startswith("1. ") or topic.startswith("- "): - topic = topic[3:].strip() - - if len(topic) > 50: # If it's too long, truncate - topic = topic[:50] + "..." - - # Log the generated topic name with more visibility - logger.info(f"Generated topic for cluster {cluster_id}: '{topic}'") - - return topic - except Exception as e: - logger.error(f"Error generating topic with Ollama for cluster {cluster_id}: {e}") - return f"Topic {cluster_id}" - - # Create a mapping of cluster IDs to topic names - cluster_ids = list(clusters.keys()) - logger.info(f"Generating topic names for {len(cluster_ids)} clusters...") - cluster_topic_names = {} - - for cluster_id in tqdm(cluster_ids, desc="Generating topic names"): - # Skip negative cluster IDs (noise points) - if int(cluster_id) < 0: - cluster_topic_names[int(cluster_id)] = "Unclustered" - continue - - # Get topic name - topic_name = get_topic_name(cluster_id) - - # Store the topic name properly - avoid using the default value - if topic_name == f"Topic {cluster_id}": - # Try again with a different prompt if we got the default value - logger.warning(f"Got default topic name for cluster {cluster_id}, trying again with simpler prompt") - - comment_ids = clusters.get(cluster_id, []) - cluster_comments = [] - for comment_id in comment_ids: - if comment_id in comment_texts: - cluster_comments.append(comment_texts[comment_id]) - - if cluster_comments: - # Try a simpler prompt - try: - prompt = f"Based on these comments, give a very short topic label (3-5 words max). IMPORTANT: Return ONLY the label itself with no introduction or quotation marks:\n\n" - for i, comment in enumerate(cluster_comments[:3]): - prompt += f"{i+1}. 
{comment}\n" - prompt += "\nTopic label:" - - response = ollama.chat( - model=model_name, - messages=[{"role": "user", "content": prompt}] - ) - if 'message' in response and 'content' in response['message']: - topic_text = response['message']['content'].strip() - topic_name = topic_text.split('\n')[0].strip().replace("Topic label:", "").strip() - - # Remove common LLM prefixes (same as above) - prefixes_to_remove = [ - "Here are the topic labels:", - "Here are the topic labels", - "Here is the topic label:", - "Here is the topic label", - "The topic label is:", - "The topic label is", - "Topic name:", - "Topic name", - "Label:", - "Label" - ] - - for prefix in prefixes_to_remove: - if topic_name.startswith(prefix): - topic_name = topic_name[len(prefix):].strip() - - # Remove quotes if present - handle any quote combination - topic_name = topic_name.strip('"\'') - - # Remove common formats like "1. Topic Name" or "- Topic Name" - if topic_name.startswith("1. ") or topic_name.startswith("- "): - topic_name = topic_name[3:].strip() - - logger.info(f"Got better topic name for cluster {cluster_id}: '{topic_name}'") - except Exception as e: - logger.error(f"Error getting alternative topic name: {e}") - - cluster_topic_names[int(cluster_id)] = topic_name - logger.info(f"Stored final topic name for cluster {cluster_id}: '{topic_name}'") - - # No sleep between requests to speed up processing - # time.sleep(0.5) - - # Always add "Unclustered" for noise points - cluster_topic_names[-1] = "Unclustered" - - logger.info(f"Generated {len(cluster_topic_names)} topic names") - return cluster_topic_names - -def save_topic_names(conversation_id, layer_id, topic_names, model_name, dynamo_storage=None, output_base_dir="polis_data"): - """ - Save generated topic names to DynamoDB. 
- - Args: - conversation_id: Conversation ID - layer_id: Layer ID - topic_names: Dictionary mapping cluster IDs to topic names - model_name: Name of the LLM model used - dynamo_storage: Optional DynamoDBStorage instance - output_base_dir: Base directory for output files (not used) - - Returns: - True if successful, False otherwise - """ - if not dynamo_storage: - logger.error("DynamoDB storage is required") - return False - - # Save to DynamoDB - try: - logger.info(f"Saving LLM topic names to DynamoDB...") - - # Convert to model objects - from datetime import datetime - - # Debug: print what we're about to save - logger.info(f"Topic names to save: {topic_names}") - - # Create LLMTopicName objects directly - topic_models = [] - for cluster_id, topic_name in topic_names.items(): - # Double-check we're not using placeholder names - if topic_name == f"Topic {cluster_id}": - logger.warning(f"Using placeholder name 'Topic {cluster_id}' - this suggests Ollama didn't return a proper name") - - # Check if topic_name is empty or just whitespace - if not topic_name or not topic_name.strip(): - logger.warning(f"Empty topic name for cluster {cluster_id} - likely truncated by LLM prefix removal") - - topic_key = f"layer{layer_id}_{cluster_id}" - - # Prepend the layer_cluster format to ensure uniqueness - # Format: "0_5: Environmental Ethics" or "0_3:" for empty names - # Special handling for "Unclustered" (-1 cluster) - if int(cluster_id) == -1 and topic_name == "Unclustered": - prefixed_topic_name = f"{layer_id}_{cluster_id}: Unclustered" - else: - prefixed_topic_name = f"{layer_id}_{cluster_id}: {topic_name}" if topic_name.strip() else f"{layer_id}_{cluster_id}:" - - model = { - 'conversation_id': conversation_id, - 'topic_key': topic_key, - 'layer_id': layer_id, - 'cluster_id': int(cluster_id), - 'topic_name': prefixed_topic_name, - 'model_name': model_name, - 'created_at': datetime.now().isoformat() - } - topic_models.append(model) - logger.info(f"Added topic model for 
cluster {cluster_id}: {prefixed_topic_name}") - - # Store in batch - success_count = 0 - failure_count = 0 - - # Process in batches of 25 (DynamoDB batch limit) - table = dynamo_storage.dynamodb.Table(dynamo_storage.table_names['llm_topic_names']) - - for i in range(0, len(topic_models), 25): - batch = topic_models[i:i + 25] - - try: - with table.batch_writer() as writer: - for topic_model in batch: - writer.put_item(Item=topic_model) - logger.info(f"Saved topic for cluster {topic_model['cluster_id']}: {topic_model['topic_name']}") - success_count += len(batch) - except Exception as e: - logger.error(f"Error in batch write operation: {e}") - failure_count += len(batch) - - logger.info(f"Stored {success_count} LLM topic names with {failure_count} failures") - return success_count > 0 - - except Exception as e: - logger.error(f"Error saving topic names to DynamoDB: {e}") - return False - -def update_visualization_with_llm_names(conversation_id, layer_id, topic_names, layer_data, output_base_dir="polis_data"): - """ - Create a new visualization with LLM-generated topic names. 
- - Args: - conversation_id: Conversation ID - layer_id: Layer ID - topic_names: Dictionary mapping cluster IDs to topic names - layer_data: Dictionary with layer data - output_base_dir: Base directory for output files - - Returns: - Path to the saved visualization file - """ - # Get output paths - paths = get_conversation_output_path(conversation_id, output_base_dir) - - # Get required data - document_map = layer_data["document_map"] - cluster_layer = layer_data["cluster_layer"] - comment_texts = layer_data["comment_texts"] - - # If any required data is missing, we can't create visualization - if document_map is None or cluster_layer is None: - logger.error("Missing required data for visualization") - return None - - # Create labels for visualization - labels_for_viz = np.array([ - topic_names.get(label, "Unclustered") if label >= 0 else "Unclustered" - for label in cluster_layer - ]) - - # Create enhanced visualization - layer_file = os.path.join(paths["enhanced"], f"{conversation_id}_comment_layer_{layer_id}_llm_named.html") - - try: - layer_figure = datamapplot.create_interactive_plot( - document_map, # 2D coordinates for the data map - labels_for_viz, # Cluster labels for each data point - hover_text=comment_texts, # Text to show on hover - title=f"{conversation_id} Layer {layer_id} - {len(np.unique(cluster_layer[cluster_layer >= 0]))} topics with LLM names", - sub_title=f"Comment clustering - Layer {layer_id} with LLM-generated topic labels", - min_fontsize=12, - max_fontsize=18, - point_radius_min_pixels=2, - point_radius_max_pixels=10, - width="100%", - height=800 - ) - - # Save the figure - layer_figure.save(layer_file) - logger.info(f"Saved visualization with LLM topic names to {layer_file}") - - return layer_file - except Exception as e: - logger.error(f"Error creating visualization: {e}") - return None - -def update_conversation_with_ollama(conversation_id, layer_id=None, model_name=None, output_base_dir="polis_data", dynamo_endpoint=None, 
start_cluster=None, end_cluster=None): - """ - Update a conversation with Ollama-generated topic names. - - Args: - conversation_id: Conversation ID - layer_id: Optional specific layer ID to update (if None, update all layers) - model_name: Ollama model name to use - output_base_dir: Base directory for output files - dynamo_endpoint: Optional DynamoDB endpoint URL - start_cluster: Starting cluster ID for processing a range (inclusive) - end_cluster: Ending cluster ID for processing a range (inclusive) - - Returns: - True if successful, False otherwise - """ - # Set up environment - setup_environment(dynamo_endpoint) - - # Get model name from environment variable or use default - if model_name is None: - model_name = os.environ.get("OLLAMA_MODEL") - logger.info(f"Using model from environment: {model_name}") - - # Check Ollama availability - if not check_ollama_availability(): - logger.error("Ollama not available, cannot continue") - return False - - # Initialize DynamoDB storage - dynamo_storage = DynamoDBStorage( - endpoint_url=os.environ.get('DYNAMODB_ENDPOINT') - ) - - # Get conversation metadata (for name) - conversation_name = None - try: - meta = dynamo_storage.get_conversation_meta(conversation_id) - if meta: - conversation_name = meta.get('conversation_name') or meta.get('metadata', {}).get('conversation_name') - if not conversation_name: - conversation_name = f"Conversation {conversation_id}" - except Exception as e: - logger.error(f"Error getting conversation metadata: {e}") - conversation_name = f"Conversation {conversation_id}" - - logger.info(f"Processing conversation {conversation_id}: {conversation_name}") - - # If layer_id is specified, process just that layer - if layer_id is not None: - return update_layer_with_ollama( - conversation_id, layer_id, conversation_name, - model_name, output_base_dir, dynamo_storage, - start_cluster, end_cluster - ) - - # Otherwise, try to get all available layers - try: - meta = 
dynamo_storage.get_conversation_meta(conversation_id) - if meta and 'cluster_layers' in meta: - layers = meta['cluster_layers'] - layer_ids = [layer['layer_id'] for layer in layers] - - logger.info(f"Found {len(layer_ids)} layers: {layer_ids}") - - # Process each layer - results = [] - for layer_id in layer_ids: - result = update_layer_with_ollama( - conversation_id, layer_id, conversation_name, - model_name, output_base_dir, dynamo_storage, - start_cluster, end_cluster - ) - results.append(result) - - # Return True if any layer was successfully updated - return any(results) - else: - logger.warning("No layer information found in metadata, trying default layers 0-2") - # Try default layers 0, 1, 2 - results = [] - for layer_id in range(3): - result = update_layer_with_ollama( - conversation_id, layer_id, conversation_name, - model_name, output_base_dir, dynamo_storage, - start_cluster, end_cluster - ) - results.append(result) - - # Return True if any layer was successfully updated - return any(results) - except Exception as e: - logger.error(f"Error processing conversation layers: {e}") - return False - -def update_layer_with_ollama(conversation_id, layer_id, conversation_name, model_name, output_base_dir, dynamo_storage, start_cluster=None, end_cluster=None): - """ - Update a specific layer with Ollama-generated topic names. 
- - Args: - conversation_id: Conversation ID - layer_id: Layer ID to update - conversation_name: Name of the conversation - model_name: Ollama model name to use - output_base_dir: Base directory for output files (not used) - dynamo_storage: DynamoDBStorage instance - start_cluster: Starting cluster ID for processing a range (inclusive) - end_cluster: Ending cluster ID for processing a range (inclusive) - - Returns: - True if successful, False otherwise - """ - logger.info(f"Processing layer {layer_id} for conversation {conversation_id}") - - # Load layer data - layer_data = load_layer_data( - conversation_id, layer_id, dynamo_storage, output_base_dir - ) - - if not layer_data: - logger.error(f"Failed to load layer data for layer {layer_id}") - return False - - # If a cluster range is specified, filter the clusters - clusters = layer_data["clusters"] - if start_cluster is not None and end_cluster is not None: - logger.info(f"Processing cluster range from {start_cluster} to {end_cluster}") - filtered_clusters = {} - for cluster_id, comment_ids in clusters.items(): - if int(cluster_id) >= start_cluster and int(cluster_id) <= end_cluster: - filtered_clusters[cluster_id] = comment_ids - - # Replace the clusters with the filtered ones - layer_data["clusters"] = filtered_clusters - logger.info(f"Filtered from {len(clusters)} to {len(filtered_clusters)} clusters") - - # Generate topic names with Ollama - topic_names = generate_topic_names( - layer_data, conversation_name, model_name - ) - - if not topic_names: - logger.error("Failed to generate topic names") - return False - - # Save topic names to DynamoDB only - success = save_topic_names( - conversation_id, layer_id, topic_names, model_name, - dynamo_storage, output_base_dir - ) - - return success - -def main(): - """Main entry point.""" - parser = argparse.ArgumentParser(description='Update cluster topics with Ollama-generated names') - parser.add_argument('--conversation_id', type=str, required=True, - 
help='Conversation ID to process') - parser.add_argument('--layer', type=int, required=False, default=None, - help='Specific layer ID to update (default: all layers)') - parser.add_argument('--model', type=str, default=None, - help='Ollama model to use (default: uses OLLAMA_MODEL env var or llama3.1:8b)') - parser.add_argument('--output_dir', type=str, default="polis_data", - help='Base directory for output files (default: polis_data)') - parser.add_argument('--dynamo_endpoint', type=str, default=None, - help='DynamoDB endpoint URL') - parser.add_argument('--start_cluster', type=int, default=None, - help='Starting cluster ID for processing a range (inclusive)') - parser.add_argument('--end_cluster', type=int, default=None, - help='Ending cluster ID for processing a range (inclusive)') - - args = parser.parse_args() - - logger.info(f"Starting update_with_ollama_standalone.py for conversation {args.conversation_id}") - - success = update_conversation_with_ollama( - args.conversation_id, - layer_id=args.layer, - model_name=args.model, - output_base_dir=args.output_dir, - dynamo_endpoint=args.dynamo_endpoint, - start_cluster=args.start_cluster, - end_cluster=args.end_cluster - ) - - if success: - logger.info(f"Successfully updated topics for conversation {args.conversation_id}") - return 0 - else: - logger.error(f"Failed to update topics for conversation {args.conversation_id}") - return 1 - -if __name__ == "__main__": - sys.exit(main()) \ No newline at end of file diff --git a/delphi/umap_narrative/polismath_commentgraph/PROJECT_SUMMARY.md b/delphi/umap_narrative/polismath_commentgraph/PROJECT_SUMMARY.md index 3b5862b94b..4657f26f7d 100644 --- a/delphi/umap_narrative/polismath_commentgraph/PROJECT_SUMMARY.md +++ b/delphi/umap_narrative/polismath_commentgraph/PROJECT_SUMMARY.md @@ -22,7 +22,7 @@ This document summarizes the implementation of a serverless Lambda service for p ### 2. 
Comment Processing Pipeline -- **Text Embedding**: Using SentenceTransformer (`all-MiniLM-L6-v2`) to generate 384-dimensional embeddings +- **Text Embedding**: Using SentenceTransformer (configurable via `SENTENCE_TRANSFORMER_MODEL` env var, defaults to `all-MiniLM-L6-v2`) to generate 384-dimensional embeddings - **Dimensionality Reduction**: UMAP projection to 2D for visualization - **Hierarchical Clustering**: EVōC algorithm creates multiple layers of clusters - Fine-grained (most detailed) diff --git a/delphi/umap_narrative/polismath_commentgraph/cli.py b/delphi/umap_narrative/polismath_commentgraph/cli.py index 3b4679cbaa..2b18c6ad3e 100644 --- a/delphi/umap_narrative/polismath_commentgraph/cli.py +++ b/delphi/umap_narrative/polismath_commentgraph/cli.py @@ -56,7 +56,9 @@ def test_evoc(args): # Load sentence transformer model - same one used in successful examples logger.info("Loading SentenceTransformer model...") start_time = time.time() - embedding_model = SentenceTransformer("all-MiniLM-L6-v2") + model_name = os.environ.get("SENTENCE_TRANSFORMER_MODEL", "all-MiniLM-L6-v2") + logger.info(f"Using model: {model_name}") + embedding_model = SentenceTransformer(model_name) logger.info(f"Model loaded in {time.time() - start_time:.2f}s") # Process each dataset diff --git a/delphi/umap_narrative/polismath_commentgraph/core/embedding.py b/delphi/umap_narrative/polismath_commentgraph/core/embedding.py index 387dd73216..ac6a5de0e1 100644 --- a/delphi/umap_narrative/polismath_commentgraph/core/embedding.py +++ b/delphi/umap_narrative/polismath_commentgraph/core/embedding.py @@ -22,7 +22,7 @@ class EmbeddingEngine: def __init__( self, - model_name: str = "all-MiniLM-L6-v2", + model_name: Optional[str] = None, cache_dir: Optional[str] = None, device: Optional[str] = None ): @@ -34,10 +34,14 @@ def __init__( cache_dir: Optional directory to cache models device: Optional device to use (cpu, cuda, etc.) 
""" + # Get model name from environment variable or use provided name, with fallback to default + if model_name is None: + model_name = os.environ.get("SENTENCE_TRANSFORMER_MODEL", "all-MiniLM-L6-v2") + logger.info(f"Initializing embedding engine with model: {model_name}") self.model_name = model_name self._model = None # Lazy-loaded - self.vector_dim = 384 # Default for all-MiniLM-L6-v2 + self.vector_dim = 384 # Default for all-MiniLM-L6-v2 and paraphrase-multilingual-MiniLM-L12-v2 # Set up cache directory self.cache_dir = cache_dir or os.environ.get("MODEL_CACHE_DIR") diff --git a/delphi/umap_narrative/polismath_commentgraph/utils/converter.py b/delphi/umap_narrative/polismath_commentgraph/utils/converter.py index 3c77194252..e3c06564cf 100644 --- a/delphi/umap_narrative/polismath_commentgraph/utils/converter.py +++ b/delphi/umap_narrative/polismath_commentgraph/utils/converter.py @@ -4,6 +4,7 @@ import numpy as np import json +import os from typing import Dict, List, Any, Optional, Tuple, Union import logging from datetime import datetime @@ -143,7 +144,7 @@ def create_conversation_meta( processed_date=datetime.now().isoformat(), num_comments=len(document_vectors), num_participants=metadata.get('num_participants', 0) if metadata else 0, - embedding_model='all-MiniLM-L6-v2', + embedding_model=os.environ.get("SENTENCE_TRANSFORMER_MODEL", "all-MiniLM-L6-v2"), umap_parameters=umap_params, evoc_parameters=evoc_params, cluster_layers=cluster_layer_info, @@ -173,7 +174,7 @@ def create_comment_embedding( embedding = Embedding( vector=vector.tolist() if isinstance(vector, np.ndarray) else vector, dimensions=len(vector), - model='all-MiniLM-L6-v2' + model=os.environ.get("SENTENCE_TRANSFORMER_MODEL", "all-MiniLM-L6-v2") ) # Create the model with just the embedding vector diff --git a/delphi/umap_narrative/run_pipeline.py b/delphi/umap_narrative/run_pipeline.py index 9fc8a22ef4..597168af18 100755 --- a/delphi/umap_narrative/run_pipeline.py +++ 
b/delphi/umap_narrative/run_pipeline.py @@ -155,7 +155,9 @@ def process_comments(comments, conversation_id): # Generate embeddings with SentenceTransformer logger.info("Generating embeddings with SentenceTransformer...") - embedding_model = SentenceTransformer("all-MiniLM-L6-v2") + model_name = os.environ.get("SENTENCE_TRANSFORMER_MODEL", "all-MiniLM-L6-v2") + logger.info(f"Using model: {model_name}") + embedding_model = SentenceTransformer(model_name) document_vectors = embedding_model.encode(comment_texts, show_progress_bar=True) # Generate 2D projection with UMAP @@ -303,6 +305,8 @@ def get_topic_name(comments, prompt_prefix=""): # Clean up various prefixes - extended list from 600_generate_llm_topic_names.py prefixes_to_remove = [ + "Here is the list of topic labels:", + "Here is the list of topic labels", "Here are the topic labels:", "Here are the topic labels", "Here is the topic label:", @@ -320,9 +324,18 @@ def get_topic_name(comments, prompt_prefix=""): "Label" ] + # First, check if there's already a layer_cluster prefix (like "1_2:") and remove it + import re + layer_prefix_match = re.match(r'^\d+_\d+:\s*', raw_response) + if layer_prefix_match: + raw_response = raw_response[layer_prefix_match.end():] + for prefix in prefixes_to_remove: if raw_response.startswith(prefix): - raw_response = raw_response.replace(prefix, "", 1).strip() + raw_response = raw_response.replace(prefix, "", 1) + + # Strip all whitespace including newlines BEFORE splitting + raw_response = raw_response.strip() # Get just the first line, as we only want the label topic = raw_response.split('\n')[0].strip() @@ -346,7 +359,7 @@ def get_topic_name(comments, prompt_prefix=""): return topic except Exception as e: logger.error(f"Error generating topic with Ollama: {e}") - return f"Topic {cluster_id}" + return f"Topic {len(comments)}" # Generate labels using Ollama for cluster_id in cluster_characteristics.keys(): diff --git a/example.env b/example.env index e49813bff4..18d27820b8 100644 
--- a/example.env +++ b/example.env @@ -124,6 +124,12 @@ OPENAI_API_KEY= # A value in miliseconds for caching AI responses for narrativeReport MAX_REPORT_CACHE_DURATION= +###### SENTENCE TRANSFORMER ###### +# Default model for English text (384-dimensional embeddings) +SENTENCE_TRANSFORMER_MODEL=all-MiniLM-L6-v2 +# Alternative multilingual model (uncomment to use) +# SENTENCE_TRANSFORMER_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 + ###### DYNAMODB ###### # When using local DynamoDB, this should be http://dynamodb:8000. diff --git a/server/app.ts b/server/app.ts index 5401f30296..277ec0c694 100644 --- a/server/app.ts +++ b/server/app.ts @@ -25,6 +25,23 @@ import { handle_GET_delphi_visualizations } from "./src/routes/delphi/visualizat import { handle_POST_delphi_jobs } from "./src/routes/delphi/jobs"; import { handle_GET_delphi_reports } from "./src/routes/delphi/reports"; import { handle_POST_delphi_batch_reports } from "./src/routes/delphi/batchReports"; + +import { + handle_GET_topicMod_topics, + handle_GET_topicMod_comments, + handle_POST_topicMod_moderate, + handle_GET_topicMod_proximity, + handle_GET_topicMod_hierarchy, + handle_GET_topicMod_stats, +} from "./src/routes/delphi/topicMod"; + +import { + handle_POST_topicAgenda_selections, + handle_GET_topicAgenda_selections, + handle_PUT_topicAgenda_selections, + handle_DELETE_topicAgenda_selections, +} from "./src/routes/delphi/topicAgenda"; + import { handle_GET_feeds_directory, handle_GET_consensus_feed, @@ -861,6 +878,148 @@ helpersInitialized.then( } }); + // TopicMod endpoints for topic-based moderation + app.get("/api/v3/topicMod/topics", moveToBody, function (req, res) { + try { + handle_GET_topicMod_topics(req, res); + } catch (err) { + res.json({ + status: "error", + message: "Internal server error in topicMod topics endpoint", + error: err.message || "Unknown error", + }); + } + }); + + app.get( + "/api/v3/topicMod/topics/:topicKey/comments", + moveToBody, + function (req, res) 
{ + try { + handle_GET_topicMod_comments(req, res); + } catch (err) { + res.json({ + status: "error", + message: "Internal server error in topicMod comments endpoint", + error: err.message || "Unknown error", + }); + } + } + ); + + app.post("/api/v3/topicMod/moderate", moveToBody, function (req, res) { + try { + handle_POST_topicMod_moderate(req, res); + } catch (err) { + res.json({ + status: "error", + message: "Internal server error in topicMod moderate endpoint", + error: err.message || "Unknown error", + }); + } + }); + + app.get("/api/v3/topicMod/proximity", moveToBody, function (req, res) { + try { + handle_GET_topicMod_proximity(req, res); + } catch (err) { + res.json({ + status: "error", + message: "Internal server error in topicMod proximity endpoint", + error: err.message || "Unknown error", + }); + } + }); + + app.get("/api/v3/topicMod/stats", moveToBody, function (req, res) { + try { + handle_GET_topicMod_stats(req, res); + } catch (err) { + res.json({ + status: "error", + message: "Internal server error in topicMod stats endpoint", + error: err.message || "Unknown error", + }); + } + }); + + app.get("/api/v3/topicMod/hierarchy", moveToBody, function (req, res) { + try { + handle_GET_topicMod_hierarchy(req, res); + } catch (err) { + res.json({ + status: "error", + message: "Internal server error in topicMod hierarchy endpoint", + error: err.message || "Unknown error", + }); + } + }); + + // Topic Agenda routes + app.post("/api/v3/topicAgenda/selections", + auth(assignToP), + moveToBody, + function (req, res) { + try { + handle_POST_topicAgenda_selections(req, res); + } catch (err) { + res.json({ + status: "error", + message: "Internal server error in topicAgenda selections endpoint", + error: err.message || "Unknown error", + }); + } + } + ); + + app.get("/api/v3/topicAgenda/selections", + auth(assignToP), + moveToBody, + function (req, res) { + try { + handle_GET_topicAgenda_selections(req, res); + } catch (err) { + res.json({ + status: "error", + 
message: "Internal server error in topicAgenda selections endpoint", + error: err.message || "Unknown error", + }); + } + } + ); + + app.put("/api/v3/topicAgenda/selections", + auth(assignToP), + moveToBody, + function (req, res) { + try { + handle_PUT_topicAgenda_selections(req, res); + } catch (err) { + res.json({ + status: "error", + message: "Internal server error in topicAgenda selections endpoint", + error: err.message || "Unknown error", + }); + } + } + ); + + app.delete("/api/v3/topicAgenda/selections", + auth(assignToP), + moveToBody, + function (req, res) { + try { + handle_DELETE_topicAgenda_selections(req, res); + } catch (err) { + res.json({ + status: "error", + message: "Internal server error in topicAgenda selections endpoint", + error: err.message || "Unknown error", + }); + } + } + ); + // RSS Feeds routes app.get("/feeds/:reportId", function (req, res) { try { @@ -1668,6 +1827,20 @@ helpersInitialized.then( /^\/topicsVizReport\/r?[0-9][0-9A-Za-z]+(\/.*)?/, fetchIndexForReportPage ); + // Topic Prioritize route for dense comment view and hierarchy analysis + app.get( + /^\/topicPrioritize\/r?[0-9][0-9A-Za-z]+(\/.*)?/, + function (req, res, next) { + return fetchIndexForReportPage(req, res, next); + } + ); + // Topic Hierarchy route for circle pack visualization + app.get( + /^\/topicHierarchy\/r?[0-9][0-9A-Za-z]+(\/.*)?/, + function (req, res, next) { + return fetchIndexForReportPage(req, res, next); + } + ); // Export Report route for data export interface app.get( /^\/exportReport\/r?[0-9][0-9A-Za-z]+(\/.*)?/, @@ -1681,6 +1854,18 @@ helpersInitialized.then( return fetchIndexForReportPage(req, res, next); } ); + app.get( + /^\/topicPrioritizeSimple\/r?[0-9][0-9A-Za-z]+(\/.*)?/, + function (req, res, next) { + return fetchIndexForReportPage(req, res, next); + } + ); + app.get( + /^\/topicAgenda\/r?[0-9][0-9A-Za-z]+(\/.*)?/, + function (req, res, next) { + return fetchIndexForReportPage(req, res, next); + } + ); 
app.get(/^\/thirdPartyCookieTestPt1\.html$/, fetchThirdPartyCookieTestPt1); app.get(/^\/thirdPartyCookieTestPt2\.html$/, fetchThirdPartyCookieTestPt2); diff --git a/server/src/routes/delphi/index.ts b/server/src/routes/delphi/index.ts index 9ea5b8fa85..c79b0e7d54 100644 --- a/server/src/routes/delphi/index.ts +++ b/server/src/routes/delphi/index.ts @@ -3,11 +3,33 @@ import { handle_GET_delphi_visualizations } from './visualizations'; import { handle_POST_delphi_jobs } from './jobs'; import { handle_GET_delphi_reports } from './reports'; import { handle_POST_delphi_batch_reports } from './batchReports'; +import { + handle_GET_topicMod_topics, + handle_GET_topicMod_comments, + handle_POST_topicMod_moderate, + handle_GET_topicMod_proximity, + handle_GET_topicMod_stats +} from './topicMod'; +import { + handle_POST_topicAgenda_selections, + handle_GET_topicAgenda_selections, + handle_PUT_topicAgenda_selections, + handle_DELETE_topicAgenda_selections +} from './topicAgenda'; export { handle_GET_delphi, handle_GET_delphi_visualizations, handle_POST_delphi_jobs, handle_GET_delphi_reports, - handle_POST_delphi_batch_reports + handle_POST_delphi_batch_reports, + handle_GET_topicMod_topics, + handle_GET_topicMod_comments, + handle_POST_topicMod_moderate, + handle_GET_topicMod_proximity, + handle_GET_topicMod_stats, + handle_POST_topicAgenda_selections, + handle_GET_topicAgenda_selections, + handle_PUT_topicAgenda_selections, + handle_DELETE_topicAgenda_selections }; \ No newline at end of file diff --git a/server/src/routes/delphi/topicAgenda.ts b/server/src/routes/delphi/topicAgenda.ts new file mode 100644 index 0000000000..3a81a7b2e0 --- /dev/null +++ b/server/src/routes/delphi/topicAgenda.ts @@ -0,0 +1,332 @@ +import { Request, Response } from "express"; +import logger from "../../utils/logger"; +import { DynamoDBClient } from "@aws-sdk/client-dynamodb"; +import { DynamoDBDocumentClient, PutCommand, GetCommand, UpdateCommand, DeleteCommand } from "@aws-sdk/lib-dynamodb"; 
+import Config from "../../config"; +import { queryP as pgQueryP } from "../../db/pg-query"; +import Conversation from "../../conversation"; +import { getPidPromise } from "../../user"; + +// DynamoDB configuration (reuse pattern from other Delphi routes) +const dynamoDBConfig: any = { + region: Config.AWS_REGION || "us-east-1", +}; + +if (Config.dynamoDbEndpoint) { + dynamoDBConfig.endpoint = Config.dynamoDbEndpoint; + dynamoDBConfig.credentials = { + accessKeyId: "DUMMYIDEXAMPLE", + secretAccessKey: "DUMMYEXAMPLEKEY", + }; +} else { + if (Config.AWS_ACCESS_KEY_ID && Config.AWS_SECRET_ACCESS_KEY) { + dynamoDBConfig.credentials = { + accessKeyId: Config.AWS_ACCESS_KEY_ID, + secretAccessKey: Config.AWS_SECRET_ACCESS_KEY, + }; + } +} + +const client = new DynamoDBClient(dynamoDBConfig); +const docClient = DynamoDBDocumentClient.from(client, { + marshallOptions: { + convertEmptyValues: true, + removeUndefinedValues: true, + }, +}); + +const TABLE_NAME = "Delphi_TopicAgendaSelections"; + +/** + * Get the current Delphi job ID for a conversation + */ +async function getCurrentDelphiJobId(zid: string): Promise { + try { + const query = ` + SELECT job_id + FROM delphi_jobs + WHERE conversation_id = $1 + AND status = 'completed' + ORDER BY created_at DESC + LIMIT 1 + `; + const result = await pgQueryP(query, [zid]) as { rows: Array<{ job_id: string }> }; + return result.rows.length > 0 ? 
result.rows[0].job_id : null; + } catch (error) { + logger.error("Error getting current Delphi job ID", error); + return null; + } +} + +/** + * POST /api/v3/topicAgenda/selections + * Save topic agenda selections for a user + */ +export async function handle_POST_topicAgenda_selections(req: Request & { user?: any }, res: Response) { + try { + const { conversation_id, selections } = req.body; + + if (!conversation_id || !selections) { + return res.status(400).json({ + status: "error", + message: "conversation_id and selections are required", + }); + } + + if (!req.user || !req.user.uid) { + return res.status(401).json({ + status: "error", + message: "Authentication required", + }); + } + + // Convert conversation_id to zid + const zid = await Conversation.getZidFromConversationId(conversation_id); + const zidStr = zid.toString(); + + // Get participant ID + const pid = await getPidPromise(zidStr, req.user.uid); + const pidStr = pid.toString(); + + // Get current Delphi job ID + const jobId = await getCurrentDelphiJobId(zidStr); + + // Prepare DynamoDB item + const item = { + conversation_id: zidStr, + participant_id: pidStr, + archetypal_selections: selections, + metadata: { + job_id: jobId, + created_at: new Date().toISOString(), + updated_at: new Date().toISOString(), + version: 1, + total_selections: selections.length, + }, + }; + + // Save to DynamoDB + const putParams = { + TableName: TABLE_NAME, + Item: item, + }; + + await docClient.send(new PutCommand(putParams)); + + logger.info(`Saved topic agenda selections for user ${pidStr} in conversation ${zidStr}`); + + res.json({ + status: "success", + message: "Topic agenda selections saved successfully", + data: { + conversation_id: zidStr, + participant_id: pidStr, + selections_count: selections.length, + job_id: jobId, + }, + }); + } catch (error) { + logger.error("Error saving topic agenda selections", error); + res.status(500).json({ + status: "error", + message: "Failed to save topic agenda selections", + 
}); + } +} + +/** + * GET /api/v3/topicAgenda/selections + * Retrieve topic agenda selections for a user + */ +export async function handle_GET_topicAgenda_selections(req: Request & { user?: any }, res: Response) { + try { + const conversation_id = req.query.conversation_id as string; + + if (!conversation_id) { + return res.status(400).json({ + status: "error", + message: "conversation_id is required", + }); + } + + if (!req.user || !req.user.uid) { + return res.status(401).json({ + status: "error", + message: "Authentication required", + }); + } + + // Convert conversation_id to zid + const zid = await Conversation.getZidFromConversationId(conversation_id); + const zidStr = zid.toString(); + + // Get participant ID + const pid = await getPidPromise(zidStr, req.user.uid); + const pidStr = pid.toString(); + + // Retrieve from DynamoDB + const getParams = { + TableName: TABLE_NAME, + Key: { + conversation_id: zidStr, + participant_id: pidStr, + }, + }; + + const result = await docClient.send(new GetCommand(getParams)); + + if (!result.Item) { + return res.json({ + status: "success", + message: "No selections found", + data: null, + }); + } + + logger.info(`Retrieved topic agenda selections for user ${pidStr} in conversation ${zidStr}`); + + res.json({ + status: "success", + data: result.Item, + }); + } catch (error) { + logger.error("Error retrieving topic agenda selections", error); + res.status(500).json({ + status: "error", + message: "Failed to retrieve topic agenda selections", + }); + } +} + +/** + * PUT /api/v3/topicAgenda/selections + * Update topic agenda selections for a user + */ +export async function handle_PUT_topicAgenda_selections(req: Request & { user?: any }, res: Response) { + try { + const { conversation_id, selections } = req.body; + + if (!conversation_id || !selections) { + return res.status(400).json({ + status: "error", + message: "conversation_id and selections are required", + }); + } + + if (!req.user || !req.user.uid) { + return 
res.status(401).json({ + status: "error", + message: "Authentication required", + }); + } + + // Convert conversation_id to zid + const zid = await Conversation.getZidFromConversationId(conversation_id); + const zidStr = zid.toString(); + + // Get participant ID + const pid = await getPidPromise(zidStr, req.user.uid); + const pidStr = pid.toString(); + + // Get current Delphi job ID + const jobId = await getCurrentDelphiJobId(zidStr); + + // Update in DynamoDB + const updateParams = { + TableName: TABLE_NAME, + Key: { + conversation_id: zidStr, + participant_id: pidStr, + }, + UpdateExpression: "SET archetypal_selections = :selections, metadata = :metadata", + ExpressionAttributeValues: { + ":selections": selections, + ":metadata": { + job_id: jobId, + created_at: new Date().toISOString(), // Keep original creation time if exists + updated_at: new Date().toISOString(), + version: 1, + total_selections: selections.length, + }, + }, + ReturnValues: "ALL_NEW" as const, + }; + + const result = await docClient.send(new UpdateCommand(updateParams)); + + logger.info(`Updated topic agenda selections for user ${pidStr} in conversation ${zidStr}`); + + res.json({ + status: "success", + message: "Topic agenda selections updated successfully", + data: { + conversation_id: zidStr, + participant_id: pidStr, + selections_count: selections.length, + job_id: jobId, + }, + }); + } catch (error) { + logger.error("Error updating topic agenda selections", error); + res.status(500).json({ + status: "error", + message: "Failed to update topic agenda selections", + }); + } +} + +/** + * DELETE /api/v3/topicAgenda/selections + * Delete topic agenda selections for a user + */ +export async function handle_DELETE_topicAgenda_selections(req: Request & { user?: any }, res: Response) { + try { + const conversation_id = req.query.conversation_id as string; + + if (!conversation_id) { + return res.status(400).json({ + status: "error", + message: "conversation_id is required", + }); + } + + if 
(!req.user || !req.user.uid) { + return res.status(401).json({ + status: "error", + message: "Authentication required", + }); + } + + // Convert conversation_id to zid + const zid = await Conversation.getZidFromConversationId(conversation_id); + const zidStr = zid.toString(); + + // Get participant ID + const pid = await getPidPromise(zidStr, req.user.uid); + const pidStr = pid.toString(); + + // Delete from DynamoDB + const deleteParams = { + TableName: TABLE_NAME, + Key: { + conversation_id: zidStr, + participant_id: pidStr, + }, + }; + + await docClient.send(new DeleteCommand(deleteParams)); + + logger.info(`Deleted topic agenda selections for user ${pidStr} in conversation ${zidStr}`); + + res.json({ + status: "success", + message: "Topic agenda selections deleted successfully", + }); + } catch (error) { + logger.error("Error deleting topic agenda selections", error); + res.status(500).json({ + status: "error", + message: "Failed to delete topic agenda selections", + }); + } +} \ No newline at end of file diff --git a/server/src/routes/delphi/topicMod.ts b/server/src/routes/delphi/topicMod.ts new file mode 100644 index 0000000000..973d554678 --- /dev/null +++ b/server/src/routes/delphi/topicMod.ts @@ -0,0 +1,875 @@ +import { Request, Response } from "express"; +import logger from "../../utils/logger"; +import { DynamoDBClient } from "@aws-sdk/client-dynamodb"; +import { DynamoDBDocumentClient, QueryCommand, PutCommand, UpdateCommand } from "@aws-sdk/lib-dynamodb"; +import Config from "../../config"; +import { queryP as pgQueryP } from "../../db/pg-query"; +import Conversation from "../../conversation"; + +// DynamoDB configuration (reuse from topics.ts) +const dynamoDBConfig: any = { + region: Config.AWS_REGION || "us-east-1", +}; + +if (Config.dynamoDbEndpoint) { + dynamoDBConfig.endpoint = Config.dynamoDbEndpoint; + dynamoDBConfig.credentials = { + accessKeyId: "DUMMYIDEXAMPLE", + secretAccessKey: "DUMMYEXAMPLEKEY", + }; +} else { + if 
(Config.AWS_ACCESS_KEY_ID && Config.AWS_SECRET_ACCESS_KEY) { + dynamoDBConfig.credentials = { + accessKeyId: Config.AWS_ACCESS_KEY_ID, + secretAccessKey: Config.AWS_SECRET_ACCESS_KEY, + }; + } +} + +const client = new DynamoDBClient(dynamoDBConfig); +const docClient = DynamoDBDocumentClient.from(client, { + marshallOptions: { + convertEmptyValues: true, + removeUndefinedValues: true, + }, +}); + +/** + * GET /api/v3/topicMod/topics + * Retrieves topics with moderation status + */ +export async function handle_GET_topicMod_topics(req: Request, res: Response) { + try { + const conversation_id = req.query.conversation_id as string; + const job_id = req.query.job_id as string; + + if (!conversation_id) { + return res.json({ + status: "error", + message: "conversation_id is required", + }); + } + + // Get zid from conversation_id (which could be a zinvite) + const zid = await Conversation.getZidFromConversationId(conversation_id); + if (!zid) { + return res.json({ + status: "error", + message: "Could not find conversation for conversation_id", + }); + } + + const conversation_zid = zid.toString(); + logger.info(`Fetching TopicMod topics for zid: ${conversation_zid}`); + + // Query topics from existing table + const topicsParams = { + TableName: "Delphi_CommentClustersLLMTopicNames", + KeyConditionExpression: "conversation_id = :cid", + ExpressionAttributeValues: { + ":cid": conversation_zid, + }, + }; + + // Filter by job_id if provided + if (job_id) { + topicsParams.KeyConditionExpression += " AND begins_with(topic_key, :job_id)"; + topicsParams.ExpressionAttributeValues[":job_id"] = `${job_id}#`; + } + + const topicsData = await docClient.send(new QueryCommand(topicsParams)); + + if (!topicsData.Items || topicsData.Items.length === 0) { + return res.json({ + status: "success", + message: "No topics found for this conversation", + topics: [], + }); + } + + // Query moderation status for each topic + const moderationParams = { + TableName: 
"Delphi_TopicModerationStatus", + KeyConditionExpression: "conversation_id = :cid", + ExpressionAttributeValues: { + ":cid": conversation_zid, + }, + }; + + let moderationData; + try { + moderationData = await docClient.send(new QueryCommand(moderationParams)); + } catch (err: any) { + // Moderation table might not exist yet - that's okay + logger.info("Moderation status table not found, using default status"); + moderationData = { Items: [] }; + } + + // Create moderation status map + const moderationMap = new Map(); + moderationData.Items?.forEach((item) => { + moderationMap.set(item.topic_key, { + status: item.moderation_status, + moderator: item.moderator, + moderated_at: item.moderated_at, + comment_count: item.comment_count || 0, + }); + }); + + // Combine topics with moderation status + const topicsWithStatus = topicsData.Items.map((topic) => { + const moderation = moderationMap.get(topic.topic_key) || { + status: "pending", + moderator: null, + moderated_at: null, + comment_count: 0, + }; + + return { + topic_name: topic.topic_name, + model_name: topic.model_name, + created_at: topic.created_at, + topic_key: topic.topic_key, + layer_id: topic.layer_id, + cluster_id: topic.cluster_id, + moderation: moderation, + }; + }); + + // Group by layer for hierarchical display + const topicsByLayer: Record = {}; + topicsWithStatus.forEach((topic) => { + const layerId = topic.layer_id || "0"; + if (!topicsByLayer[layerId]) { + topicsByLayer[layerId] = []; + } + topicsByLayer[layerId].push(topic); + }); + + // Sort topics within each layer by cluster_id + Object.keys(topicsByLayer).forEach((layerId) => { + topicsByLayer[layerId].sort((a, b) => { + return parseInt(a.cluster_id || "0") - parseInt(b.cluster_id || "0"); + }); + }); + + return res.json({ + status: "success", + message: "Topics retrieved successfully", + topics_by_layer: topicsByLayer, + total_topics: topicsWithStatus.length, + }); + } catch (err: any) { + logger.error(`Error in handle_GET_topicMod_topics: 
${err.message}`); + return res.json({ + status: "error", + message: "Error retrieving topics", + error: err.message, + }); + } +} + +/** + * GET /api/v3/topicMod/topics/:topicKey/comments + * Retrieves comments for a specific topic + */ +export async function handle_GET_topicMod_comments(req: Request, res: Response) { + try { + const conversation_id = req.query.conversation_id as string; + const topic_key = req.params.topicKey; + + if (!conversation_id || !topic_key) { + return res.json({ + status: "error", + message: "conversation_id and topic_key are required", + }); + } + + const zid = await Conversation.getZidFromConversationId(conversation_id); + if (!zid) { + return res.json({ + status: "error", + message: "Could not find conversation for conversation_id", + }); + } + + const comment_conversation_id = zid.toString(); + logger.info(`Fetching comments for topic ${topic_key} in conversation ${comment_conversation_id}`); + + // Query comments from topic clusters table + const params = { + TableName: "Delphi_CommentClusters", + KeyConditionExpression: "conversation_id = :cid AND topic_key = :tk", + ExpressionAttributeValues: { + ":cid": comment_conversation_id, + ":tk": topic_key, + }, + }; + + const data = await docClient.send(new QueryCommand(params)); + + if (!data.Items || data.Items.length === 0) { + return res.json({ + status: "success", + message: "No comments found for this topic", + comments: [], + }); + } + + // Get comment details from main comments table + const comments = data.Items.map((item) => ({ + comment_id: item.comment_id, + comment_text: item.comment_text, + umap_x: item.umap_x, + umap_y: item.umap_y, + cluster_id: item.cluster_id, + layer_id: item.layer_id, + moderation_status: item.moderation_status || "pending", + })); + + return res.json({ + status: "success", + message: "Comments retrieved successfully", + comments: comments, + total_comments: comments.length, + }); + } catch (err: any) { + logger.error(`Error in 
handle_GET_topicMod_comments: ${err.message}`); + return res.json({ + status: "error", + message: "Error retrieving comments", + error: err.message, + }); + } +} + +/** + * POST /api/v3/topicMod/moderate + * Applies moderation actions to topics or individual comments + */ +export async function handle_POST_topicMod_moderate(req: Request, res: Response) { + try { + const { conversation_id, topic_key, comment_ids, action, moderator } = req.body; + + if (!conversation_id || !action || !moderator) { + return res.json({ + status: "error", + message: "conversation_id, action, and moderator are required", + }); + } + + if (!["accept", "reject", "meta"].includes(action)) { + return res.json({ + status: "error", + message: "action must be 'accept', 'reject', or 'meta'", + }); + } + + const zid = await Conversation.getZidFromConversationId(conversation_id); + if (!zid) { + return res.json({ + status: "error", + message: "Could not find conversation for conversation_id", + }); + } + + const moderate_conversation_id = zid.toString(); + const now = new Date().toISOString(); + + // If topic_key is provided, moderate entire topic + if (topic_key) { + logger.info(`Moderating entire topic ${topic_key} as ${action}`); + + // Update topic moderation status + const topicParams = { + TableName: "Delphi_TopicModerationStatus", + Key: { + conversation_id: moderate_conversation_id, + topic_key: topic_key, + }, + UpdateExpression: "SET moderation_status = :status, moderator = :mod, moderated_at = :time", + ExpressionAttributeValues: { + ":status": action, + ":mod": moderator, + ":time": now, + }, + ReturnValues: "ALL_NEW" as const, + }; + + try { + await docClient.send(new UpdateCommand(topicParams)); + } catch (err: any) { + if (err.name === "ResourceNotFoundException") { + // Create the record if it doesn't exist + const putParams = { + TableName: "Delphi_TopicModerationStatus", + Item: { + conversation_id: moderate_conversation_id, + topic_key: topic_key, + moderation_status: action, + 
moderator: moderator, + moderated_at: now, + }, + }; + await docClient.send(new PutCommand(putParams)); + } else { + throw err; + } + } + + // Update individual comments in the topic + const commentsParams = { + TableName: "Delphi_CommentClusters", + KeyConditionExpression: "conversation_id = :cid AND topic_key = :tk", + ExpressionAttributeValues: { + ":cid": moderate_conversation_id, + ":tk": topic_key, + }, + }; + + const commentsData = await docClient.send(new QueryCommand(commentsParams)); + + if (commentsData.Items && commentsData.Items.length > 0) { + // Update moderation status in main comments table + const moderationStatus = action === "accept" ? 1 : (action === "reject" ? -1 : 0); + const isMeta = action === "meta" ? true : false; + + for (const comment of commentsData.Items) { + const comment_id = comment.comment_id; + + // Update in comments table + await pgQueryP( + "UPDATE comments SET mod = ($1), is_meta = ($2) WHERE zid = ($3) AND tid = ($4)", + [moderationStatus, isMeta, zid, comment_id] + ); + } + } + } + + // If comment_ids are provided, moderate individual comments + if (comment_ids && Array.isArray(comment_ids)) { + logger.info(`Moderating ${comment_ids.length} individual comments as ${action}`); + + const moderationStatus = action === "accept" ? 1 : (action === "reject" ? -1 : 0); + const isMeta = action === "meta" ? 
true : false; + + for (const comment_id of comment_ids) { + await pgQueryP( + "UPDATE comments SET mod = ($1), is_meta = ($2) WHERE zid = ($3) AND tid = ($4)", + [moderationStatus, isMeta, zid, comment_id] + ); + } + } + + return res.json({ + status: "success", + message: `Moderation action '${action}' applied successfully`, + moderated_at: now, + }); + } catch (err: any) { + logger.error(`Error in handle_POST_topicMod_moderate: ${err.message}`); + return res.json({ + status: "error", + message: "Error applying moderation action", + error: err.message, + }); + } +} + +/** + * GET /api/v3/topicMod/proximity + * Retrieves UMAP proximity data for visualization + */ +export async function handle_GET_topicMod_proximity(req: Request, res: Response) { + try { + const conversation_id = req.query.conversation_id as string; + const layer_id = req.query.layer_id as string || "all"; + + if (!conversation_id) { + return res.json({ + status: "error", + message: "conversation_id is required", + }); + } + + const zid = await Conversation.getZidFromConversationId(conversation_id); + if (!zid) { + return res.json({ + status: "error", + message: "Could not find conversation for conversation_id", + }); + } + + const proximity_conversation_id = zid.toString(); + logger.info(`Fetching proximity data for conversation ${proximity_conversation_id}, layer ${layer_id}`); + + // Get ALL UMAP coordinates from Delphi_UMAPGraph + // Node positions are stored where source_id = target_id + const umapParams = { + TableName: "Delphi_UMAPGraph", + KeyConditionExpression: "conversation_id = :cid", + FilterExpression: "source_id = target_id", // Only nodes, not edges + ExpressionAttributeValues: { + ":cid": proximity_conversation_id, + }, + }; + + const umapData = await docClient.send(new QueryCommand(umapParams)); + + if (!umapData.Items || umapData.Items.length === 0) { + return res.json({ + status: "success", + message: "No UMAP coordinates found", + proximity_data: [], + }); + } + + 
logger.info(`Found ${umapData.Items.length} UMAP coordinate points`); + + // If layer_id is "all", return all coordinates with cluster info from all layers + if (layer_id === "all") { + // Get ALL cluster assignments for all layers + const clusterParams = { + TableName: "Delphi_CommentHierarchicalClusterAssignments", + KeyConditionExpression: "conversation_id = :cid", + ExpressionAttributeValues: { + ":cid": proximity_conversation_id, + }, + }; + + let clusterData; + try { + clusterData = await docClient.send(new QueryCommand(clusterParams)); + logger.info(`Found ${clusterData.Items?.length || 0} cluster assignments`); + } catch (err: any) { + logger.error(`Error fetching cluster assignments: ${err.message}`); + clusterData = { Items: [] }; + } + + // Create a map of comment_id to cluster assignments for all layers + const commentToClustersByLayer = new Map(); + if (clusterData.Items && clusterData.Items.length > 0) { + logger.info(`CLUSTER DEBUG: Processing ${clusterData.Items.length} cluster assignment items`); + + // Debug: Show structure of first few cluster items + clusterData.Items.slice(0, 3).forEach((item, i) => { + logger.info(`CLUSTER DEBUG: Item ${i} full structure:`, JSON.stringify(item, null, 2)); + logger.info(`CLUSTER DEBUG: Item ${i} keys:`, Object.keys(item)); + }); + + clusterData.Items.forEach((item, index) => { + const commentId = item.comment_id; + + if (index < 5) { + logger.info(`CLUSTER DEBUG: Processing item ${index}: comment_id=${commentId}, layer0=${item.layer0_cluster_id}, layer1=${item.layer1_cluster_id}, layer2=${item.layer2_cluster_id}, layer3=${item.layer3_cluster_id}`); + } + + if (!commentToClustersByLayer.has(commentId)) { + commentToClustersByLayer.set(commentId, {}); + } + + // Add cluster assignments for each layer that has a value + const clustersByLayer = commentToClustersByLayer.get(commentId); + if (item.layer0_cluster_id !== null && item.layer0_cluster_id !== undefined) { + clustersByLayer['0'] = item.layer0_cluster_id; + 
} + if (item.layer1_cluster_id !== null && item.layer1_cluster_id !== undefined) { + clustersByLayer['1'] = item.layer1_cluster_id; + } + if (item.layer2_cluster_id !== null && item.layer2_cluster_id !== undefined) { + clustersByLayer['2'] = item.layer2_cluster_id; + } + if (item.layer3_cluster_id !== null && item.layer3_cluster_id !== undefined) { + clustersByLayer['3'] = item.layer3_cluster_id; + } + }); + + logger.info(`CLUSTER DEBUG: Created cluster assignments for ${commentToClustersByLayer.size} comments`); + + // Debug: Show sample assignments for first few comments + const firstFewCommentIds = Array.from(commentToClustersByLayer.keys()).slice(0, 3); + firstFewCommentIds.forEach(commentId => { + const assignments = commentToClustersByLayer.get(commentId); + logger.info(`CLUSTER DEBUG: Comment ${commentId} assignments:`, JSON.stringify(assignments)); + }); + } else { + logger.warn("CLUSTER DEBUG: No cluster assignment data found in Delphi_CommentHierarchicalClusterAssignments"); + } + + // Return ALL comment coordinates with cluster info for all layers + logger.info(`RESPONSE DEBUG: Starting to process ${umapData.Items.length} UMAP items`); + + const validUmapItems = umapData.Items.filter(item => { + // Filter out items with invalid positions + const x = item.position?.x; + const y = item.position?.y; + const isValid = x !== null && x !== undefined && !isNaN(x) && + y !== null && y !== undefined && !isNaN(y) && + isFinite(x) && isFinite(y); + return isValid; + }); + + logger.info(`RESPONSE DEBUG: ${validUmapItems.length} items have valid UMAP coordinates`); + + const proximityData = validUmapItems.map((item, index) => { + const commentId = item.source_id; + const clusterInfo = commentToClustersByLayer.get(commentId) || {}; + + if (index < 5) { + logger.info(`RESPONSE DEBUG: Processing UMAP item ${index}: comment_id=${commentId}, clusters=${JSON.stringify(clusterInfo)}`); + } + + const responseItem = { + comment_id: commentId, + umap_x: item.position.x, + 
umap_y: item.position.y, + weight: item.weight || 1, + clusters: clusterInfo, // cluster_id for each layer + }; + + if (index < 3) { + logger.info(`RESPONSE DEBUG: Response item ${index}:`, JSON.stringify(responseItem)); + } + + return responseItem; + }); + + // Count how many items actually have cluster assignments + const itemsWithClusters = proximityData.filter(item => Object.keys(item.clusters).length > 0); + logger.info(`RESPONSE DEBUG: ${itemsWithClusters.length} out of ${proximityData.length} response items have cluster assignments`); + + return res.json({ + status: "success", + message: "All proximity data retrieved successfully", + proximity_data: proximityData, + total_points: proximityData.length, + }); + } + + // If specific layer is requested, filter by that layer + const clusterParams = { + TableName: "Delphi_CommentHierarchicalClusterAssignments", + KeyConditionExpression: "conversation_id = :cid", + FilterExpression: "layer_id = :lid", + ExpressionAttributeValues: { + ":cid": proximity_conversation_id, + ":lid": parseInt(layer_id), + }, + }; + + const clusterData = await docClient.send(new QueryCommand(clusterParams)); + + // Create a map of comment_id to cluster_id for the specified layer + const commentToCluster = new Map(); + if (clusterData.Items) { + clusterData.Items.forEach(item => { + commentToCluster.set(item.comment_id, item.cluster_id); + }); + } + + // Filter UMAP coordinates to only include comments in the specified layer + const proximityData = umapData.Items + .filter(item => { + const commentId = item.source_id; // For nodes, source_id = target_id = comment_id + // Check for valid position data + const x = item.position?.x; + const y = item.position?.y; + const hasValidPosition = x !== null && x !== undefined && !isNaN(x) && + y !== null && y !== undefined && !isNaN(y) && + isFinite(x) && isFinite(y); + + return commentToCluster.has(commentId) && hasValidPosition; + }) + .map((item) => { + const commentId = item.source_id; + const 
clusterId = commentToCluster.get(commentId); + + return { + comment_id: commentId, + cluster_id: clusterId, + layer_id: parseInt(layer_id), + umap_x: item.position.x, + umap_y: item.position.y, + weight: item.weight || 1, + }; + }); + + return res.json({ + status: "success", + message: "Proximity data retrieved successfully", + proximity_data: proximityData, + total_points: proximityData.length, + }); + } catch (err: any) { + logger.error(`Error in handle_GET_topicMod_proximity: ${err.message}`); + return res.json({ + status: "error", + message: "Error retrieving proximity data", + error: err.message, + }); + } +} + +/** + * GET /api/v3/topicMod/hierarchy + * Retrieves hierarchical cluster structure for circle pack visualization + */ +export async function handle_GET_topicMod_hierarchy(req: Request, res: Response) { + try { + const conversation_id = req.query.conversation_id as string; + + if (!conversation_id) { + return res.json({ + status: "error", + message: "conversation_id is required", + }); + } + + const zid = await Conversation.getZidFromConversationId(conversation_id); + if (!zid) { + return res.json({ + status: "error", + message: "Could not find conversation for conversation_id", + }); + } + + const hierarchy_conversation_id = zid.toString(); + logger.info(`Fetching hierarchy data for conversation ${hierarchy_conversation_id}`); + + // Query cluster structure from DynamoDB + const params = { + TableName: "Delphi_CommentClustersStructureKeywords", + KeyConditionExpression: "conversation_id = :cid", + ExpressionAttributeValues: { + ":cid": hierarchy_conversation_id, + }, + }; + + const data = await docClient.send(new QueryCommand(params)); + + if (!data.Items || data.Items.length === 0) { + return res.json({ + status: "success", + message: "No hierarchy data found", + hierarchy: null, + }); + } + + // Process and structure the hierarchy data + const clusters = data.Items; + logger.info(`Found ${clusters.length} clusters in DynamoDB`); + + // Debug: log 
layer distribution + const layerCounts = {}; + clusters.forEach(cluster => { + const layer = cluster.layer_id; + layerCounts[layer] = (layerCounts[layer] || 0) + 1; + }); + logger.info(`Layer distribution:`, layerCounts); + + // Debug: log sample clusters from each layer + Object.keys(layerCounts).forEach(layer => { + const sampleCluster = clusters.find(c => c.layer_id.toString() === layer.toString()); + if (sampleCluster) { + logger.info(`Sample Layer ${layer} cluster:`, { + cluster_key: sampleCluster.cluster_key, + layer_id: sampleCluster.layer_id, + cluster_id: sampleCluster.cluster_id, + size: sampleCluster.size, + has_parent: !!sampleCluster.parent_cluster, + has_children: !!(sampleCluster.child_clusters && sampleCluster.child_clusters.length > 0), + parent_cluster: sampleCluster.parent_cluster, + child_clusters: sampleCluster.child_clusters + }); + } else { + logger.error(`No sample cluster found for layer ${layer}`); + } + }); + + const hierarchyMap = new Map(); + const layers = new Map(); + + // First pass: create all nodes + clusters.forEach((cluster) => { + const key = cluster.cluster_key; + const layerId = cluster.layer_id; + const clusterId = cluster.cluster_id; + + if (!layers.has(layerId)) { + layers.set(layerId, []); + } + + const node = { + id: key, + name: `Layer ${layerId} Cluster ${clusterId}`, + layer: layerId, + clusterId: clusterId, + size: cluster.size || 0, + topic_name: cluster.topic_name || cluster.llm_topic_name || cluster.keywords_string, + children: [], + parentId: null, + data: cluster + }; + + hierarchyMap.set(key, node); + layers.get(layerId).push(node); + }); + + // Second pass: INVERT DynamoDB parent-child relationships for circle pack + // DynamoDB: Layer 3 has parent Layer 2 (Layer 3 merges INTO Layer 2) + // Circle pack needs: Layer 2 CONTAINS Layer 3 (Layer 2 is bigger circle containing Layer 3) + clusters.forEach((cluster) => { + const key = cluster.cluster_key; + const node = hierarchyMap.get(key); + + // If this cluster HAS 
a parent in DynamoDB, make that parent contain THIS cluster as a child + if (cluster.parent_cluster && cluster.parent_cluster.layer_id !== undefined && cluster.parent_cluster.cluster_id !== undefined) { + const parentKey = `layer${cluster.parent_cluster.layer_id}_${cluster.parent_cluster.cluster_id}`; + const parentNode = hierarchyMap.get(parentKey); + if (parentNode) { + // INVERTED: The "parent" in DynamoDB becomes the container in circle pack + parentNode.children.push(node); + node.parentId = parentKey; + } else { + logger.warn(`Parent node ${parentKey} not found for child ${key}`); + } + } + }); + + // For circle pack: find ALL clusters that have no parents (roots at any level) + // Some Layer 3, some Layer 2, some Layer 1, and some Layer 0 clusters may be top-level + // In EVōC: smaller clusters are "parents" of larger (they merge into larger ones) + // For visualization: we want mixed-level roots showing the true hierarchy + const roots = Array.from(hierarchyMap.values()).filter(node => !node.parentId); + + + + + // Remove parent references to avoid circular JSON and clean up for D3 + const cleanNode = (node) => { + const cleaned = { + id: node.id, + name: node.name, + layer: node.layer, + clusterId: node.clusterId, + size: node.size, + topic_name: node.topic_name, + children: node.children.map(cleanNode) + }; + return cleaned; + }; + + // Create the hierarchy structure for D3 + const hierarchy = { + name: "Topics Hierarchy", + children: roots.map(cleanNode), + totalClusters: clusters.length, + layerCounts: Object.fromEntries( + Array.from(layers.entries()).map(([layerId, nodes]) => [layerId, nodes.length]) + ) + }; + + return res.json({ + status: "success", + message: "Hierarchy data retrieved successfully", + hierarchy: hierarchy, + totalClusters: clusters.length, + layers: Array.from(layers.keys()).sort(), + }); + } catch (err: any) { + logger.error(`Error in handle_GET_topicMod_hierarchy: ${err.message}`); + return res.json({ + status: "error", + 
message: "Error retrieving hierarchy data", + error: err.message, + }); + } +} + +/** + * GET /api/v3/topicMod/stats + * Retrieves moderation statistics + */ +export async function handle_GET_topicMod_stats(req: Request, res: Response) { + try { + const conversation_id = req.query.conversation_id as string; + + if (!conversation_id) { + return res.json({ + status: "error", + message: "conversation_id is required", + }); + } + + const zid = await Conversation.getZidFromConversationId(conversation_id); + if (!zid) { + return res.json({ + status: "error", + message: "Could not find conversation for conversation_id", + }); + } + + const stats_conversation_id = zid.toString(); + logger.info(`Fetching moderation stats for conversation ${stats_conversation_id}`); + + // Get moderation status for all topics + const params = { + TableName: "Delphi_TopicModerationStatus", + KeyConditionExpression: "conversation_id = :cid", + ExpressionAttributeValues: { + ":cid": stats_conversation_id, + }, + }; + + let data; + try { + data = await docClient.send(new QueryCommand(params)); + } catch (err: any) { + if (err.name === "ResourceNotFoundException") { + // No moderation data yet + return res.json({ + status: "success", + message: "No moderation data available yet", + stats: { + total_topics: 0, + pending: 0, + accepted: 0, + rejected: 0, + meta: 0, + }, + }); + } + throw err; + } + + // Calculate statistics + const stats = { + total_topics: data.Items?.length || 0, + pending: 0, + accepted: 0, + rejected: 0, + meta: 0, + }; + + data.Items?.forEach((item) => { + const status = item.moderation_status; + if (status === "accept") stats.accepted++; + else if (status === "reject") stats.rejected++; + else if (status === "meta") stats.meta++; + else stats.pending++; + }); + + return res.json({ + status: "success", + message: "Moderation statistics retrieved successfully", + stats: stats, + }); + } catch (err: any) { + logger.error(`Error in handle_GET_topicMod_stats: ${err.message}`); + 
return res.json({ + status: "error", + message: "Error retrieving moderation statistics", + error: err.message, + }); + } +} \ No newline at end of file diff --git a/server/src/server.ts b/server/src/server.ts index ca0ab9403d..72f9b2e557 100644 --- a/server/src/server.ts +++ b/server/src/server.ts @@ -6753,11 +6753,19 @@ Email verified! You can close this tab or hit the back button. getConversationHasMetadata(zid), _.isUndefined(uid) ? Promise.resolve({}) : getUserInfoForUid2(uid), getConversationTranslationsMinimal(zid, lang), + // Check if user is a moderator for this conversation + _.isUndefined(uid) + ? Promise.resolve([]) + : pgQueryP_readOnly( + "select * from moderators where zid = ($1) and uid = ($2);", + [zid, uid] + ), ]).then(function (results: any[]) { let conv = results[0] && results[0][0]; let convHasMetadata = results[1]; let requestingUserInfo = results[2]; let translations = results[3]; + let moderatorResult = results[4]; conv.auth_opt_allow_3rdparty = ifDefinedFirstElseSecond( conv.auth_opt_allow_3rdparty, @@ -6776,8 +6784,16 @@ Email verified! You can close this tab or hit the back button. if (!_.isUndefined(ownername) && conv.context !== "hongkong2014") { conv.ownername = ownername; } - conv.is_mod = conv.site_id === requestingUserInfo.site_id; - conv.is_owner = conv.owner === uid; + // Check if user is moderator: either same site_id OR explicitly added as moderator + const isExplicitModerator = + moderatorResult && moderatorResult.length > 0; + conv.is_mod = + conv.site_id === requestingUserInfo.site_id || + isExplicitModerator || + (JSON.parse(Config.adminUIDs) as Array).includes(uid); + conv.is_owner = + conv.owner === uid || + (JSON.parse(Config.adminUIDs) as Array).includes(uid); delete conv.uid; // conv.owner is what you want, uid shouldn't be returned. return conv; });