# Install base packages, plus yt-dlp for YouTube caption downloads.
#
# The release asset named "yt-dlp" is the platform-independent zipapp, which is
# executed by python3, so the interpreter is installed next to it.
# NOTE(review): the base image is not visible here — drop python3 if it already ships one.
#
# curl --fail aborts the build on an HTTP error instead of saving the error page
# as /usr/local/bin/yt-dlp and "succeeding".
# NOTE(review): ".../releases/latest/..." is unpinned; consider a fixed release tag
# for reproducible builds.
RUN apt-get update -qq && \
    apt-get install --no-install-recommends -y curl libjemalloc2 libvips postgresql-client poppler-utils tesseract-ocr python3 && \
    curl -fsSL https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp && \
    chmod +x /usr/local/bin/yt-dlp && \
    ln -s /usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2 /usr/local/lib/libjemalloc.so && \
    rm -rf /var/lib/apt/lists /var/cache/apt/archives
# Fetches the auto-generated captions for +video_url+, stores them on the
# meeting as a "transcript" document with the raw SRT attached, and enqueues
# SummarizeMeetingJob unless a "minutes_recap" summary already exists.
#
# @param meeting_id [Integer] id handed to Meeting.find
# @param video_url [String] must match YOUTUBE_URL_PATTERN; otherwise the job
#   logs an error and returns without doing anything
# @return [void]
def perform(meeting_id, video_url)
  # to_s sends a nil URL down the "invalid URL" path instead of raising NoMethodError.
  unless video_url.to_s.match?(YOUTUBE_URL_PATTERN)
    Rails.logger.error "DownloadTranscriptJob: invalid video URL: #{video_url}"
    return
  end

  meeting = Meeting.find(meeting_id)

  # Idempotency: skip if transcript already exists
  return if meeting.meeting_documents.exists?(document_type: "transcript")

  srt_content, plain_text = download_captions(video_url)
  # blank? also rejects "" and whitespace-only text. An empty transcript row
  # would satisfy the idempotency check above and block every later attempt.
  return if plain_text.blank?

  # Build, attach, then save once: the document row and its attachment are
  # persisted by the same save!, so a failure cannot leave a transcript row
  # that makes a retry exit early at the idempotency check.
  document = meeting.meeting_documents.new(
    document_type: "transcript",
    source_url: video_url,
    extracted_text: plain_text,
    text_quality: "auto_transcribed",
    text_chars: plain_text.length,
    fetched_at: Time.current
  )
  document.file.attach(
    io: StringIO.new(srt_content),
    filename: "transcript-#{meeting.starts_at.to_date}.srt",
    content_type: "text/srt"
  )
  document.save!

  # A minutes-based summary is authoritative; do not regenerate over it.
  return if meeting.meeting_summaries.exists?(summary_type: "minutes_recap")

  SummarizeMeetingJob.perform_later(meeting.id)
end
# Converts SRT caption text into plain transcript text.
#
# Drops every cue timestamp line and the sequence-number line directly above
# it, collapses runs of blank lines into a single blank line, and strips
# leading/trailing whitespace.
#
# @param srt_content [String, nil] raw SRT text; nil is treated as empty
# @return [String] the caption text only
def parse_srt(srt_content)
  timestamp_line = /\A\d{2}:\d{2}:\d{2},\d{3}\s*-->/

  # Normalise CRLF / CR so "\r" neither survives in the text nor defeats the
  # blank-line collapse below.
  lines = srt_content.to_s.gsub(/\r\n?/, "\n").lines(chomp: true)

  kept = lines.each_with_index.reject do |line, index|
    next true if line.match?(timestamp_line)

    # A digits-only line is a cue counter only when a timestamp follows it;
    # otherwise it is spoken text (a year, a vote tally) and must be kept.
    line.match?(/\A\d+\s*\z/) && lines[index + 1].to_s.match?(timestamp_line)
  end

  kept.map(&:first).join("\n").gsub(/\n{3,}/, "\n\n").strip
end
# Council meetings that started inside the lookback window and have no
# transcript document yet.
#
# @return [Array] meetings whose loaded meeting_documents contain no
#   document with document_type "transcript"
def candidate_meetings
  window_start = LOOKBACK_WINDOW.ago
  window_end = Time.current

  recent_council_meetings =
    Meeting
      .where(body_name: COUNCIL_BODY_NAMES)
      .where("starts_at >= ? AND starts_at <= ?", window_start, window_end)
      .includes(:meeting_documents)

  # Filtering happens in Ruby on the eager-loaded documents.
  recent_council_meetings.select do |meeting|
    meeting.meeting_documents.none? { |doc| doc.document_type == "transcript" }
  end
end
# Parses the date portion captured from a video title.
#
# @param date_str [String, nil] text such as "April 6, 2026"
# @return [Date, nil] the parsed date, or nil when the input cannot be parsed
def parse_date(date_str)
  parsed = Date.parse(date_str)
  parsed
rescue TypeError, ArgumentError
  # TypeError: non-String input such as nil.
  # ArgumentError: text Date.parse cannot interpret as a date.
  nil
end
+ json_str = ai_service.analyze_meeting_content(transcript_doc.extracted_text, kb_context, "transcript", source: meeting) + save_summary(meeting, "transcript_recap", json_str, source_type: "transcript") + return + end + + # Priority 3: Fall back to packet packet_doc = meeting.meeting_documents.where("document_type LIKE ?", "%packet%").first if packet_doc doc_text = if packet_doc.extractions.any? @@ -153,7 +174,7 @@ def build_retrieval_query(meeting) parts.join("\n") end - def save_summary(meeting, type, json_str) + def save_summary(meeting, type, json_str, source_type: nil) generation_data = begin JSON.parse(json_str) rescue JSON::ParserError => e @@ -161,10 +182,13 @@ def save_summary(meeting, type, json_str) {} end + generation_data["source_type"] = source_type if source_type + summary = meeting.meeting_summaries.find_or_initialize_by(summary_type: type) summary.generation_data = generation_data summary.content = nil summary.save! + summary end def save_topic_summary(meeting, topic, content, generation_data) diff --git a/app/models/meeting.rb b/app/models/meeting.rb index 1774964..4c64053 100644 --- a/app/models/meeting.rb +++ b/app/models/meeting.rb @@ -22,6 +22,8 @@ def document_status :minutes elsif docs.any? { |d| d.document_type == "packet_pdf" } :packet + elsif docs.any? { |d| d.document_type == "transcript" } + :transcript elsif docs.any? { |d| d.document_type == "agenda_pdf" } :agenda else diff --git a/app/views/meetings/show.html.erb b/app/views/meetings/show.html.erb index caf1aab..6c3e4e2 100644 --- a/app/views/meetings/show.html.erb +++ b/app/views/meetings/show.html.erb @@ -34,6 +34,17 @@ +<% if @summary&.generation_data&.dig("source_type") == "transcript" %> +
+ + + + + + This summary is based on the meeting's video recording. It will be updated when official minutes are published. +
+<% end %> + <% gd = @summary&.generation_data.presence %> <% if gd %> @@ -218,11 +229,15 @@ - Quality: <%= doc.text_quality.humanize %> <% end %> + <% elsif doc.document_type == "transcript" %> + Source: Video Recording <% end %>
<% if doc.file.attached? && doc.document_type.include?("pdf") %> <%= link_to "Download PDF", rails_blob_path(doc.file, disposition: "attachment"), class: "btn btn--secondary btn--sm" %> + <% elsif doc.document_type == "transcript" && doc.source_url.present? %> + <%= link_to "Watch Recording", safe_external_url(doc.source_url), target: "_blank", rel: "noopener", class: "btn btn--secondary btn--sm" %> <% else %> <%= link_to "View Original", safe_external_url(doc.source_url), target: "_blank", rel: "noopener", class: "btn btn--secondary btn--sm" %> <% end %> diff --git a/config/brakeman.ignore b/config/brakeman.ignore new file mode 100644 index 0000000..5eb7fa2 --- /dev/null +++ b/config/brakeman.ignore @@ -0,0 +1,10 @@ +{ + "ignored_warnings": [ + { + "fingerprint": "3e4893630b06b15e5d9d65ba5da1d33eb340b31840f53bcdef92240b27b28217", + "note": "Open3.capture3 with array arguments does not use a shell. URL is validated against YOUTUBE_URL_PATTERN before use." + } + ], + "updated": "2026-04-09", + "brakeman_version": "7.0.2" +} diff --git a/docs/superpowers/plans/2026-04-09-youtube-transcript-ingestion.md b/docs/superpowers/plans/2026-04-09-youtube-transcript-ingestion.md new file mode 100644 index 0000000..2de38c3 --- /dev/null +++ b/docs/superpowers/plans/2026-04-09-youtube-transcript-ingestion.md @@ -0,0 +1,1065 @@ +# YouTube Transcript Ingestion Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Ingest YouTube auto-generated captions from council meeting recordings to produce same-day preliminary summaries, then enrich minutes-based summaries with transcript context when minutes arrive. + +**Architecture:** Two new jobs (`DiscoverTranscriptsJob`, `DownloadTranscriptJob`) form a standalone pipeline triggered by the existing `DiscoverMeetingsJob`. 
Transcripts are stored as `MeetingDocument` records with `document_type: "transcript"`. `SummarizeMeetingJob` gains transcript awareness: uses transcript as primary source when no minutes exist, and as supplementary context when minutes arrive later. + +**Tech Stack:** Rails 8.1, yt-dlp (system binary), Open3, Solid Queue, Minitest + +**Spec:** `docs/superpowers/specs/2026-04-09-youtube-transcript-ingestion-design.md` + +--- + +### Task 0: Create feature branch + +- [ ] **Step 1: Create and switch to feature branch** + +```bash +git checkout -b feature/youtube-transcript-ingestion +``` + +--- + +### Task 1: Add yt-dlp to Dockerfile + +**Files:** +- Modify: `Dockerfile:17-21` + +- [ ] **Step 1: Add yt-dlp installation to the base stage** + +In `Dockerfile`, after the `apt-get install` line and before the `rm -rf` cleanup, add yt-dlp download: + +```dockerfile +# Install base packages +RUN apt-get update -qq && \ + apt-get install --no-install-recommends -y curl libjemalloc2 libvips postgresql-client poppler-utils tesseract-ocr && \ + curl -L https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp && \ + chmod +x /usr/local/bin/yt-dlp && \ + ln -s /usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2 /usr/local/lib/libjemalloc.so && \ + rm -rf /var/lib/apt/lists /var/cache/apt/archives +``` + +- [ ] **Step 2: Verify the Dockerfile builds** + +Run: `docker build --target base -t trr-base-test . 
2>&1 | tail -5` +Expected: Build succeeds + +- [ ] **Step 3: Commit** + +```bash +git add Dockerfile +git commit -m "build: add yt-dlp to Docker image for YouTube transcript support" +``` + +--- + +### Task 2: Update Meeting#document_status + +**Files:** +- Modify: `app/models/meeting.rb:17-30` +- Test: `test/models/meeting_test.rb` (add new test) + +- [ ] **Step 1: Write the failing test** + +Add to `test/models/meeting_test.rb`: + +```ruby +test "document_status returns :transcript when transcript exists but no minutes or packet" do + meeting = Meeting.create!( + body_name: "City Council", + detail_page_url: "http://example.com/meeting-transcript-test", + starts_at: 1.day.ago + ) + meeting.meeting_documents.create!( + document_type: "transcript", + source_url: "https://www.youtube.com/watch?v=test123" + ) + assert_equal :transcript, meeting.document_status +end + +test "document_status returns :minutes even when transcript exists" do + meeting = Meeting.create!( + body_name: "City Council", + detail_page_url: "http://example.com/meeting-transcript-test-2", + starts_at: 1.day.ago + ) + meeting.meeting_documents.create!( + document_type: "minutes_pdf", + source_url: "http://example.com/minutes.pdf" + ) + meeting.meeting_documents.create!( + document_type: "transcript", + source_url: "https://www.youtube.com/watch?v=test123" + ) + assert_equal :minutes, meeting.document_status +end +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `bin/rails test test/models/meeting_test.rb -n "/document_status.*transcript/"` +Expected: First test FAILS (`:transcript` not returned), second may pass trivially + +- [ ] **Step 3: Update document_status method** + +In `app/models/meeting.rb`, replace the `document_status` method: + +```ruby +def document_status + docs = association(:meeting_documents).loaded? ? meeting_documents : meeting_documents.load + + if docs.any? { |d| d.document_type == "minutes_pdf" } + :minutes + elsif docs.any? 
{ |d| d.document_type == "packet_pdf" } + :packet + elsif docs.any? { |d| d.document_type == "transcript" } + :transcript + elsif docs.any? { |d| d.document_type == "agenda_pdf" } + :agenda + else + :none + end +end +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `bin/rails test test/models/meeting_test.rb -n "/document_status.*transcript/"` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add app/models/meeting.rb test/models/meeting_test.rb +git commit -m "feat: add transcript tier to Meeting#document_status" +``` + +--- + +### Task 3: Create DiscoverTranscriptsJob + +**Files:** +- Create: `app/jobs/scrapers/discover_transcripts_job.rb` +- Create: `test/jobs/scrapers/discover_transcripts_job_test.rb` + +- [ ] **Step 1: Write the failing tests** + +Create `test/jobs/scrapers/discover_transcripts_job_test.rb`: + +```ruby +require "test_helper" + +module Scrapers + class DiscoverTranscriptsJobTest < ActiveJob::TestCase + setup do + @council_meeting = Meeting.create!( + body_name: "City Council Meeting", + detail_page_url: "http://example.com/council-apr-6", + starts_at: 1.day.ago + ) + @work_session = Meeting.create!( + body_name: "City Council Work Session", + detail_page_url: "http://example.com/ws-mar-30", + starts_at: 2.days.ago + ) + # Non-council meeting should be ignored + @plan_commission = Meeting.create!( + body_name: "Plan Commission", + detail_page_url: "http://example.com/plan-commission", + starts_at: 1.day.ago + ) + # Old meeting outside 48-hour window should be ignored + @old_meeting = Meeting.create!( + body_name: "City Council Meeting", + detail_page_url: "http://example.com/council-old", + starts_at: 5.days.ago + ) + end + + test "parses standard council meeting title and enqueues download" do + yt_output = "S8rW22zizHc | Two Rivers City Council Meeting for Monday, #{@council_meeting.starts_at.strftime('%B %-d, %Y')}\n" + + Open3.stub :capture3, [yt_output, "", stub_status(true)] do + assert_enqueued_with(job: 
Documents::DownloadTranscriptJob) do + DiscoverTranscriptsJob.perform_now + end + end + end + + test "parses work session title and enqueues download" do + yt_output = "pWhrHg4X0tU | Two Rivers City Council Work Session for Monday, #{@work_session.starts_at.strftime('%B %-d, %Y')}\n" + + Open3.stub :capture3, [yt_output, "", stub_status(true)] do + assert_enqueued_with(job: Documents::DownloadTranscriptJob) do + DiscoverTranscriptsJob.perform_now + end + end + end + + test "skips videos that cannot be parsed" do + yt_output = "abc123 | Some Random Video Title\n" + + Open3.stub :capture3, [yt_output, "", stub_status(true)] do + assert_no_enqueued_jobs(only: Documents::DownloadTranscriptJob) do + DiscoverTranscriptsJob.perform_now + end + end + end + + test "skips meetings that already have a transcript" do + @council_meeting.meeting_documents.create!( + document_type: "transcript", + source_url: "https://www.youtube.com/watch?v=S8rW22zizHc" + ) + yt_output = "S8rW22zizHc | Two Rivers City Council Meeting for Monday, #{@council_meeting.starts_at.strftime('%B %-d, %Y')}\n" + + Open3.stub :capture3, [yt_output, "", stub_status(true)] do + assert_no_enqueued_jobs(only: Documents::DownloadTranscriptJob) do + DiscoverTranscriptsJob.perform_now + end + end + end + + test "skips non-council meetings" do + yt_output = "abc123 | Plan Commission for Monday, #{@plan_commission.starts_at.strftime('%B %-d, %Y')}\n" + + Open3.stub :capture3, [yt_output, "", stub_status(true)] do + assert_no_enqueued_jobs(only: Documents::DownloadTranscriptJob) do + DiscoverTranscriptsJob.perform_now + end + end + end + + test "handles yt-dlp failure gracefully" do + Open3.stub :capture3, ["", "ERROR: network error", stub_status(false)] do + assert_nothing_raised do + DiscoverTranscriptsJob.perform_now + end + end + end + + private + + def stub_status(success) + status = Minitest::Mock.new + status.expect :success?, success + status + end + end +end +``` + +- [ ] **Step 2: Run tests to verify they 
fail** + +Run: `bin/rails test test/jobs/scrapers/discover_transcripts_job_test.rb` +Expected: FAIL (class not found) + +- [ ] **Step 3: Implement DiscoverTranscriptsJob** + +Create `app/jobs/scrapers/discover_transcripts_job.rb`: + +```ruby +require "open3" + +module Scrapers + class DiscoverTranscriptsJob < ApplicationJob + queue_as :default + + YOUTUBE_CHANNEL_URL = "https://www.youtube.com/@Two_Rivers_WI/streams" + + # Matches: "Two Rivers City Council Meeting for Monday, April 6, 2026" + # Matches: "Two Rivers City Council Work Session for Monday, March 30, 2026" + TITLE_PATTERN = /(?:City Council (?:Meeting|Work Session)) for \w+, (.+)$/i + + ELIGIBLE_BODY_KEYWORDS = ["council", "work session"].freeze + + def perform + candidates = find_candidate_meetings + return if candidates.empty? + + videos = fetch_video_list + return if videos.empty? + + videos.each do |video_id, title| + match_and_enqueue(video_id, title, candidates) + end + end + + private + + def find_candidate_meetings + Meeting + .where("starts_at >= ? AND starts_at <= ?", 48.hours.ago, Time.current) + .where("body_name ILIKE ? OR body_name ILIKE ?", "%council%", "%work session%") + .left_joins(:meeting_documents) + .where.not(meeting_documents: { document_type: "transcript" }) + .or( + Meeting + .where("starts_at >= ? AND starts_at <= ?", 48.hours.ago, Time.current) + .where("body_name ILIKE ? OR body_name ILIKE ?", "%council%", "%work session%") + .left_joins(:meeting_documents) + .where(meeting_documents: { id: nil }) + ) + .distinct + end + + def fetch_video_list + stdout, stderr, status = Open3.capture3( + "yt-dlp", "--flat-playlist", + "--print", "%(id)s | %(title)s", + YOUTUBE_CHANNEL_URL + ) + + unless status.success? 
# Matches one channel video to a candidate meeting by the date in its title
# and enqueues a transcript download for that meeting.
#
# @param video_id [String] YouTube video id used to build the watch URL
# @param title [String] video title, matched against TITLE_PATTERN
# @param candidates [Enumerable] meetings responding to #starts_at and #id
# @return [void]
def match_and_enqueue(video_id, title, candidates)
  match = title.match(TITLE_PATTERN)
  unless match
    Rails.logger.debug("DiscoverTranscriptsJob: Skipping unparseable title: #{title}")
    return
  end

  date_str = match[1]
  # Rescue only what Date.parse raises for bad input, so unrelated errors
  # still surface instead of being turned into nil.
  parsed_date =
    begin
      Date.parse(date_str)
    rescue ArgumentError, TypeError
      nil
    end
  unless parsed_date
    Rails.logger.warn("DiscoverTranscriptsJob: Could not parse date '#{date_str}' from title: #{title}")
    return
  end

  meeting = candidates.find { |m| m.starts_at.to_date == parsed_date }
  unless meeting
    Rails.logger.debug("DiscoverTranscriptsJob: No candidate meeting for date #{parsed_date} (title: #{title})")
    return
  end

  video_url = "https://www.youtube.com/watch?v=#{video_id}"
  Rails.logger.info("DiscoverTranscriptsJob: Matched '#{title}' to Meeting ##{meeting.id}, enqueuing download")
  Documents::DownloadTranscriptJob.perform_later(meeting.id, video_url)
end
"City Council Meeting", + detail_page_url: "http://example.com/council-test", + starts_at: 1.day.ago + ) + @video_url = "https://www.youtube.com/watch?v=S8rW22zizHc" + end + + test "downloads transcript and creates MeetingDocument" do + srt_content = <<~SRT + 1 + 00:01:27,560 --> 00:01:30,440 + Testing, testing. + + 2 + 00:18:12,520 --> 00:18:15,880 + Good evening. Welcome to the city council meeting. + SRT + + stub_yt_dlp(srt_content) do + DownloadTranscriptJob.perform_now(@meeting.id, @video_url) + end + + doc = @meeting.meeting_documents.find_by(document_type: "transcript") + assert doc, "Should create a transcript document" + assert_equal @video_url, doc.source_url + assert_equal "auto_transcribed", doc.text_quality + assert_includes doc.extracted_text, "Testing, testing." + assert_includes doc.extracted_text, "Good evening." + assert_not_includes doc.extracted_text, "00:01:27" + assert_not_includes doc.extracted_text, "-->" + assert doc.file.attached?, "Should attach the raw SRT file" + assert doc.text_chars.positive? + assert doc.fetched_at.present? 
+ end + + test "skips if meeting already has a transcript" do + @meeting.meeting_documents.create!( + document_type: "transcript", + source_url: @video_url + ) + + stub_yt_dlp("dummy") do + DownloadTranscriptJob.perform_now(@meeting.id, @video_url) + end + + assert_equal 1, @meeting.meeting_documents.where(document_type: "transcript").count + end + + test "enqueues SummarizeMeetingJob when no minutes summary exists" do + srt_content = "1\n00:00:01,000 --> 00:00:02,000\nHello.\n" + + stub_yt_dlp(srt_content) do + assert_enqueued_with(job: SummarizeMeetingJob) do + DownloadTranscriptJob.perform_now(@meeting.id, @video_url) + end + end + end + + test "does not enqueue SummarizeMeetingJob when minutes summary exists" do + @meeting.meeting_summaries.create!( + summary_type: "minutes_recap", + generation_data: { "headline" => "test" } + ) + srt_content = "1\n00:00:01,000 --> 00:00:02,000\nHello.\n" + + stub_yt_dlp(srt_content) do + assert_no_enqueued_jobs(only: SummarizeMeetingJob) do + DownloadTranscriptJob.perform_now(@meeting.id, @video_url) + end + end + end + + test "handles yt-dlp failure gracefully" do + Open3.stub :capture3, ["", "ERROR: no captions", stub_status(false)] do + Dir.stub :mktmpdir, "/tmp/test-transcript" do + assert_nothing_raised do + DownloadTranscriptJob.perform_now(@meeting.id, @video_url) + end + end + end + + assert_equal 0, @meeting.meeting_documents.where(document_type: "transcript").count + end + + private + + def stub_yt_dlp(srt_content) + Dir.mktmpdir("test-transcript") do |tmpdir| + srt_path = File.join(tmpdir, "video.en.srt") + File.write(srt_path, srt_content) + + # Stub Open3.capture3 to succeed, and stub Dir.mktmpdir to use our tmpdir + original_mktmpdir = Dir.method(:mktmpdir) + + Dir.define_singleton_method(:mktmpdir) do |*args, &block| + if args.first == "transcript" + block.call(tmpdir) + else + original_mktmpdir.call(*args, &block) + end + end + + Open3.stub :capture3, ["", "", stub_status(true)] do + yield + end + ensure + 
Dir.define_singleton_method(:mktmpdir, original_mktmpdir) + end + end + + def stub_status(success) + status = Minitest::Mock.new + status.expect :success?, success + status + end + end +end +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `bin/rails test test/jobs/documents/download_transcript_job_test.rb` +Expected: FAIL (class not found) + +- [ ] **Step 3: Implement DownloadTranscriptJob** + +Create `app/jobs/documents/download_transcript_job.rb`: + +```ruby +require "open3" + +module Documents + class DownloadTranscriptJob < ApplicationJob + queue_as :default + + def perform(meeting_id, video_url) + meeting = Meeting.find(meeting_id) + + # Idempotency: skip if transcript already exists + return if meeting.meeting_documents.exists?(document_type: "transcript") + + srt_content = download_captions(video_url) + return unless srt_content + + plain_text = parse_srt(srt_content) + + document = meeting.meeting_documents.create!( + document_type: "transcript", + source_url: video_url, + extracted_text: plain_text, + text_quality: "auto_transcribed", + text_chars: plain_text.length, + fetched_at: Time.current + ) + + document.file.attach( + io: StringIO.new(srt_content), + filename: "transcript-#{meeting.starts_at.to_date}.srt", + content_type: "text/srt" + ) + + # Trigger preliminary summary if no minutes-based summary exists + unless meeting.meeting_summaries.exists?(summary_type: "minutes_recap") + SummarizeMeetingJob.perform_later(meeting.id) + end + end + + private + + def download_captions(video_url) + Dir.mktmpdir("transcript") do |tmpdir| + output_template = File.join(tmpdir, "video") + + stdout, stderr, status = Open3.capture3( + "yt-dlp", + "--write-auto-sub", + "--sub-lang", "en", + "--sub-format", "srt", + "--skip-download", + "-o", output_template, + video_url + ) + + unless status.success? 
# Converts SRT caption text into plain transcript text.
#
# Drops every cue timestamp line and the sequence-number line directly above
# it, collapses runs of blank lines into a single blank line, and strips
# leading/trailing whitespace.
#
# @param srt_content [String, nil] raw SRT text; nil is treated as empty
# @return [String] the caption text only
def parse_srt(srt_content)
  timestamp_line = /\A\d{2}:\d{2}:\d{2},\d{3}\s*-->/

  # Normalise CRLF / CR so "\r" neither survives in the text nor defeats the
  # blank-line collapse below.
  lines = srt_content.to_s.gsub(/\r\n?/, "\n").lines(chomp: true)

  kept = lines.each_with_index.reject do |line, index|
    next true if line.match?(timestamp_line) # Remove timestamp lines

    # Remove sequence numbers — but only when a timestamp follows, so a
    # digits-only caption (a year, a vote tally) is kept as spoken text.
    line.match?(/\A\d+\s*\z/) && lines[index + 1].to_s.match?(timestamp_line)
  end

  # Collapse multiple blank lines
  kept.map(&:first).join("\n").gsub(/\n{3,}/, "\n\n").strip
end
+ ) + + generation_data = { + "source_type" => "transcript", + "headline" => "Council discussed budget", + "highlights" => [], + "public_input" => [], + "item_details" => [] + } + + mock_ai = Minitest::Mock.new + mock_ai.expect :prepare_kb_context, "" do |arg| + arg.is_a?(Array) + end + mock_ai.expect :analyze_meeting_content, generation_data.to_json do |text, kb, type, **kwargs| + type == "transcript" && text.include?("council discussed the budget") + end + mock_ai.expect :analyze_topic_summary, '{"factual_record": []}' do |arg| + arg.is_a?(Hash) + end + mock_ai.expect :render_topic_summary, "## Summary" do |arg| + arg.is_a?(String) + end + + retrieval_stub = Object.new + def retrieval_stub.retrieve_context(*args, **kwargs); []; end + def retrieval_stub.format_context(*args); ""; end + def retrieval_stub.retrieve_topic_context(*args, **kwargs); []; end + def retrieval_stub.format_topic_context(*args); []; end + + RetrievalService.stub :new, retrieval_stub do + Ai::OpenAiService.stub :new, mock_ai do + SummarizeMeetingJob.perform_now(@meeting.id) + end + end + + summary = @meeting.meeting_summaries.find_by(summary_type: "transcript_recap") + assert summary, "Should create a transcript_recap summary" + assert_equal "transcript", summary.generation_data["source_type"] +end + +test "minutes take priority over transcript" do + @meeting.meeting_documents.create!( + document_type: "minutes_pdf", + source_url: "http://example.com/minutes.pdf", + extracted_text: "Official minutes text." + ) + @meeting.meeting_documents.create!( + document_type: "transcript", + source_url: "https://www.youtube.com/watch?v=test123", + extracted_text: "Transcript text with discussion." 
+ ) + + generation_data = { + "headline" => "Council approved the budget", + "highlights" => [], + "public_input" => [], + "item_details" => [] + } + + mock_ai = Minitest::Mock.new + mock_ai.expect :prepare_kb_context, "" do |arg| + arg.is_a?(Array) + end + mock_ai.expect :analyze_meeting_content, generation_data.to_json do |text, kb, type, **kwargs| + type == "minutes" + end + mock_ai.expect :analyze_topic_summary, '{"factual_record": []}' do |arg| + arg.is_a?(Hash) + end + mock_ai.expect :render_topic_summary, "## Summary" do |arg| + arg.is_a?(String) + end + + retrieval_stub = Object.new + def retrieval_stub.retrieve_context(*args, **kwargs); []; end + def retrieval_stub.format_context(*args); ""; end + def retrieval_stub.retrieve_topic_context(*args, **kwargs); []; end + def retrieval_stub.format_topic_context(*args); []; end + + RetrievalService.stub :new, retrieval_stub do + Ai::OpenAiService.stub :new, mock_ai do + SummarizeMeetingJob.perform_now(@meeting.id) + end + end + + summary = @meeting.meeting_summaries.find_by(summary_type: "minutes_recap") + assert summary, "Should create minutes_recap, not transcript_recap" + assert_nil @meeting.meeting_summaries.find_by(summary_type: "transcript_recap") +end +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `bin/rails test test/jobs/summarize_meeting_job_test.rb -n "/transcript/"` +Expected: FAIL + +- [ ] **Step 3: Update generate_meeting_summary to support transcript** + +In `app/jobs/summarize_meeting_job.rb`, replace the `generate_meeting_summary` method (lines 18-53): + +```ruby +def generate_meeting_summary(meeting, ai_service, retrieval_service) + query = build_retrieval_query(meeting) + retrieved_chunks = begin + retrieval_service.retrieve_context(query) + rescue => e + Rails.logger.warn("Context retrieval failed for Meeting #{meeting.id}: #{e.message}") + [] + end + formatted_context = retrieval_service.format_context(retrieved_chunks).split("\n\n") + kb_context = 
ai_service.prepare_kb_context(formatted_context) + + # Prefer minutes (authoritative) over transcript over packet + minutes_doc = meeting.meeting_documents.find_by(document_type: "minutes_pdf") + if minutes_doc&.extracted_text.present? + transcript_doc = meeting.meeting_documents.find_by(document_type: "transcript") + doc_text = minutes_doc.extracted_text + if transcript_doc&.extracted_text.present? + doc_text += "\n\n--- Additional context from meeting recording transcript ---\n\n" + + transcript_doc.extracted_text.truncate(15_000) + end + + json_str = ai_service.analyze_meeting_content(doc_text, kb_context, "minutes", source: meeting) + result = save_summary(meeting, "minutes_recap", json_str) + + # Remove any preliminary transcript summary now that minutes are available + meeting.meeting_summaries.where(summary_type: "transcript_recap").destroy_all + + # Track source type in generation_data + if result&.generation_data.is_a?(Hash) + source = transcript_doc&.extracted_text.present? ? "minutes_with_transcript" : "minutes" + result.update!(generation_data: result.generation_data.merge("source_type" => source)) + end + return + end + + # Fall back to transcript + transcript_doc = meeting.meeting_documents.find_by(document_type: "transcript") + if transcript_doc&.extracted_text.present? + json_str = ai_service.analyze_meeting_content(transcript_doc.extracted_text, kb_context, "transcript", source: meeting) + result = save_summary(meeting, "transcript_recap", json_str) + if result&.generation_data.is_a?(Hash) + result.update!(generation_data: result.generation_data.merge("source_type" => "transcript")) + end + return + end + + # Fall back to packet + packet_doc = meeting.meeting_documents.where("document_type LIKE ?", "%packet%").first + if packet_doc + doc_text = if packet_doc.extractions.any? + ai_service.prepare_doc_context(packet_doc.extractions) + elsif packet_doc.extracted_text.present? 
+ packet_doc.extracted_text + end + + if doc_text + json_str = ai_service.analyze_meeting_content(doc_text, kb_context, "packet", source: meeting) + save_summary(meeting, "packet_analysis", json_str) + else + Rails.logger.warn("No extractable text for packet document on Meeting #{meeting.id}") + end + end +end +``` + +- [ ] **Step 4: Update save_summary to return the record** + +In `app/jobs/summarize_meeting_job.rb`, update the `save_summary` method to return the summary: + +```ruby +def save_summary(meeting, type, json_str) + generation_data = begin + JSON.parse(json_str) + rescue JSON::ParserError => e + Rails.logger.error "Failed to parse meeting summary JSON: #{e.message}" + {} + end + + summary = meeting.meeting_summaries.find_or_initialize_by(summary_type: type) + summary.generation_data = generation_data + summary.content = nil + summary.save! + summary +end +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `bin/rails test test/jobs/summarize_meeting_job_test.rb` +Expected: ALL PASS (both new and existing tests) + +- [ ] **Step 6: Commit** + +```bash +git add app/jobs/summarize_meeting_job.rb test/jobs/summarize_meeting_job_test.rb +git commit -m "feat: add transcript support to SummarizeMeetingJob + +Transcript used as primary source when no minutes exist (produces +transcript_recap). When minutes arrive, transcript is appended as +supplementary context. Preliminary transcript summary is cleaned up +when minutes-based summary is generated." 
+``` + +--- + +### Task 6: Update MeetingsController and show view for transcript banner + +**Files:** +- Modify: `app/controllers/meetings_controller.rb:23-25` +- Modify: `app/views/meetings/show.html.erb:35-37` + +- [ ] **Step 1: Update controller to find transcript_recap summaries** + +In `app/controllers/meetings_controller.rb`, update the summary lookup (line 23-25): + +```ruby +# Prefer minutes_recap over transcript_recap over packet_analysis +@summary = @meeting.meeting_summaries.find_by(summary_type: "minutes_recap") || + @meeting.meeting_summaries.find_by(summary_type: "transcript_recap") || + @meeting.meeting_summaries.find_by(summary_type: "packet_analysis") +``` + +- [ ] **Step 2: Add transcript banner to the view** + +In `app/views/meetings/show.html.erb`, after line 35 (`
</div>` closing meeting-meta) and before line 37 (`<% gd = @summary&.generation_data.presence %>`), add: + +```erb +<% if @summary&.generation_data&.dig("source_type") == "transcript" %> +
+ + + + + + This summary is based on the meeting's video recording. It will be updated when official minutes are published. +
+<% end %> +``` + +- [ ] **Step 3: Add CSS for the transcript banner** + +Find the meeting-specific stylesheet. Add: + +```css +.transcript-banner { + display: flex; + align-items: center; + gap: var(--space-sm); + padding: var(--space-sm) var(--space-md); + margin-bottom: var(--space-lg); + background-color: var(--color-cool-bg, #e8eef4); + border: 1px solid var(--color-cool-border, #b0c4d8); + border-radius: var(--radius-sm); + font-family: var(--font-body); + font-size: var(--text-sm); + color: var(--color-cool-text, #2c3e50); +} + +.transcript-banner svg { + flex-shrink: 0; + color: var(--color-cool-accent, #3a7bd5); +} +``` + +Note: Verify the exact CSS custom property names by checking the design system stylesheet. Use existing `--color-*` variables where available. The banner should be cool-toned and distinct from the warm cream background. + +- [ ] **Step 4: Run the app and manually verify** + +Run: `bin/dev` +Navigate to a meeting page. The banner won't show yet (no transcript summaries exist), but verify no visual regressions. + +- [ ] **Step 5: Commit** + +```bash +git add app/controllers/meetings_controller.rb app/views/meetings/show.html.erb app/assets/stylesheets/ +git commit -m "feat: add transcript banner to meeting show page + +Shows an informational banner when the summary is based on the video +recording instead of official minutes. Automatically removed when +minutes arrive and the summary is regenerated." 
+``` + +--- + +### Task 7: Wire DiscoverTranscriptsJob into DiscoverMeetingsJob + +**Files:** +- Modify: `app/jobs/scrapers/discover_meetings_job.rb:9-24` + +- [ ] **Step 1: Add the trigger at the end of perform** + +In `app/jobs/scrapers/discover_meetings_job.rb`, update the `perform` method: + +```ruby +def perform(since: nil) + since ||= DEFAULT_LOOKBACK.ago + agent = Mechanize.new + agent.user_agent_alias = "Mac Safari" + page = agent.get(MEETINGS_URL) + + loop do + should_continue = parse_page(page, since) + break unless should_continue + + next_link = page.link_with(text: /next ›/) + break unless next_link + + page = next_link.click + end + + # Check for YouTube transcripts for recent council meetings + Scrapers::DiscoverTranscriptsJob.perform_later +end +``` + +- [ ] **Step 2: Run existing discover meetings tests to verify no regression** + +Run: `bin/rails test test/jobs/scrapers/` +Expected: PASS + +- [ ] **Step 3: Commit** + +```bash +git add app/jobs/scrapers/discover_meetings_job.rb +git commit -m "feat: trigger transcript discovery after meeting discovery" +``` + +--- + +### Task 8: Add document section support for transcript in show view + +**Files:** +- Modify: `app/views/meetings/show.html.erb:204-237` (Documents section) + +- [ ] **Step 1: Update the documents section to handle transcript display** + +In the Documents section of `app/views/meetings/show.html.erb`, update the document list item (around line 210-229) to handle transcripts: + +```erb +<% @meeting.meeting_documents.each do |doc| %> +
  • +
    + <%= doc.document_type.humanize %> + <% if doc.file.attached? && doc.document_type.include?("pdf") %> + + (<%= number_to_human_size(doc.file.byte_size) %>) + <% if doc.text_quality.present? %> + - Quality: <%= doc.text_quality.humanize %> + <% end %> + + <% elsif doc.document_type == "transcript" %> + + (<%= number_with_delimiter(doc.text_chars || 0) %> chars) + - Source: Video Recording + + <% end %> +
    +
    + <% if doc.file.attached? && doc.document_type.include?("pdf") %> + <%= link_to "Download PDF", rails_blob_path(doc.file, disposition: "attachment"), class: "btn btn--secondary btn--sm" %> + <% elsif doc.document_type == "transcript" && doc.source_url.present? %> + <%= link_to "Watch Recording", safe_external_url(doc.source_url), target: "_blank", rel: "noopener", class: "btn btn--secondary btn--sm" %> + <% else %> + <%= link_to "View Original", safe_external_url(doc.source_url), target: "_blank", rel: "noopener", class: "btn btn--secondary btn--sm" %> + <% end %> +
    +
  • +<% end %> +``` + +- [ ] **Step 2: Commit** + +```bash +git add app/views/meetings/show.html.erb +git commit -m "feat: show transcript document with Watch Recording link in documents section" +``` + +--- + +### Task 9: Run full test suite and lint + +- [ ] **Step 1: Run full test suite** + +Run: `bin/rails test` +Expected: ALL PASS + +- [ ] **Step 2: Run linter** + +Run: `bin/rubocop` +Expected: No new offenses. Fix any that appear in files you modified. + +- [ ] **Step 3: Run CI checks** + +Run: `bin/ci` +Expected: PASS + +- [ ] **Step 4: Fix any issues found, then commit fixes** + +If any fixes needed: +```bash +git add -A +git commit -m "fix: address lint/test issues from transcript feature" +``` + +--- + +### Task 10: Create PR + +- [ ] **Step 1: Push branch and create PR** + +```bash +git push -u origin feature/youtube-transcript-ingestion +gh pr create --title "Add YouTube transcript ingestion for same-day council meeting summaries" --body "$(cat <<'EOF' +## Summary +- Ingests YouTube auto-generated captions from council meeting recordings +- Produces same-day preliminary summaries when official minutes aren't available yet +- Enriches minutes-based summaries with transcript context when minutes arrive +- Shows a visible banner on meeting pages when summary is transcript-sourced + +## Design +See `docs/superpowers/specs/2026-04-09-youtube-transcript-ingestion-design.md` + +## Changes +- **New jobs:** `DiscoverTranscriptsJob` (finds YouTube videos for recent council meetings), `DownloadTranscriptJob` (fetches auto-captions, creates MeetingDocument) +- **Modified:** `SummarizeMeetingJob` (transcript priority tier, supplementary context), `DiscoverMeetingsJob` (triggers transcript discovery), `MeetingsController` (finds transcript summaries), meeting show view (transcript banner + document display) +- **Infrastructure:** `yt-dlp` added to Dockerfile + +## Test plan +- [ ] Run `bin/rails test` — all tests pass +- [ ] Run `bin/ci` — lint, security, audit all 
pass +- [ ] Manually test with a real YouTube video ID to verify yt-dlp caption download works +- [ ] Verify transcript banner renders correctly on meeting show page +- [ ] Verify banner disappears when minutes-based summary exists + +🤖 Generated with [Claude Code](https://claude.com/claude-code) +EOF +)" +``` diff --git a/docs/superpowers/specs/2026-04-09-youtube-transcript-ingestion-design.md b/docs/superpowers/specs/2026-04-09-youtube-transcript-ingestion-design.md new file mode 100644 index 0000000..3b3a47e --- /dev/null +++ b/docs/superpowers/specs/2026-04-09-youtube-transcript-ingestion-design.md @@ -0,0 +1,168 @@ +# YouTube Transcript Ingestion Design + +**Date:** 2026-04-09 +**Status:** Approved + +## Problem + +Council meetings and work sessions are recorded and posted to YouTube within hours, but the site currently waits weeks for official minutes before publishing summaries. Residents get nothing in the gap. + +## Solution + +Ingest YouTube auto-generated captions as a new `transcript` document type. Use transcripts to produce same-day preliminary summaries, then enrich the authoritative minutes-based summaries with transcript context when minutes arrive later. + +**Core principle:** The transcript is a **supplement**, not a replacement. It never overrides official sources. The existing pipeline is untouched unless a transcript happens to be available. + +## Data Model + +### MeetingDocument + +No new tables or columns. 
Transcripts use the existing `meeting_documents` table: + +| Field | Value | +|-------|-------| +| `document_type` | `"transcript"` | +| `source_url` | YouTube video URL (e.g., `https://www.youtube.com/watch?v=S8rW22zizHc`) | +| `extracted_text` | Plain text (SRT timestamps stripped) | +| `file` | Raw SRT file (Active Storage attachment) | +| `fetched_at` | When the transcript was downloaded | +| `text_chars` | Character count of extracted text | +| `text_quality` | `"auto_transcribed"` | + +### Meeting#document_status + +Updated priority: `:minutes` > `:packet` > `:transcript` > `:agenda` > `:none`. + +### MeetingSummary.generation_data + +New `source_type` key in the existing JSON column: +- `"transcript"` — preliminary summary from transcript only +- `"minutes"` — authoritative summary from minutes only +- `"minutes_with_transcript"` — authoritative summary enriched with transcript context + +## Job Chain + +### Scrapers::DiscoverTranscriptsJob + +**Trigger:** Enqueued by `DiscoverMeetingsJob` at the end of its run. + +**Logic:** +1. Query for Meeting records where: + - `body_name` matches Council or Work Session (the only recorded meetings) + - `starts_at` within the last 48 hours + - No existing `transcript` document +2. Call `yt-dlp --flat-playlist --print "%(id)s | %(title)s"` on the channel URL +3. Parse each video title with regex to extract the meeting date: + - Primary pattern: `/for \w+, (.+)$/` (handles "Two Rivers City Council Meeting for Monday, April 6, 2026") + - Unmatched titles are logged as warnings and skipped +4. Match parsed date + body keyword ("Council", "Work Session") to Meeting records +5. Enqueue `Documents::DownloadTranscriptJob` for each match + +**Channel URL constant:** +```ruby +YOUTUBE_CHANNEL_URL = "https://www.youtube.com/@Two_Rivers_WI/streams" +``` + +**Idempotency:** Skips meetings that already have a transcript document. + +### Documents::DownloadTranscriptJob + +**Input:** Meeting ID, YouTube video URL. + +**Logic:** +1. 
Check for existing transcript document on the meeting (idempotency guard) +2. Call `yt-dlp --write-auto-sub --sub-lang en --sub-format srt --skip-download` via `Open3.capture3` +3. Parse SRT to plain text: strip sequence numbers, timestamps (`HH:MM:SS,mmm --> HH:MM:SS,mmm`), blank lines +4. Create `MeetingDocument` with `document_type: "transcript"`: + - Attach raw SRT file + - Store plain text in `extracted_text` + - Set `text_quality: "auto_transcribed"`, `text_chars`, `fetched_at`, `source_url` +5. If the meeting has no minutes-based summary yet, enqueue `SummarizeMeetingJob(meeting.id)` + +**Shell execution:** `Open3.capture3` pattern, consistent with `pdftotext`/`tesseract` usage in existing jobs. Uses `Dir.mktmpdir` for `yt-dlp` output files, cleaned up after processing. + +**Failure handling:** Log error and exit on `yt-dlp` failure. Next day's discovery run retries automatically. + +## Summarization Changes + +### SummarizeMeetingJob + +**Updated document priority:** minutes > transcript > packet. + +**When transcript is the best available source (no minutes):** +- Use transcript `extracted_text` as primary input +- Adjust AI prompt: "Summarize the discussion from this meeting recording transcript" (vs. 
"Summarize the official minutes") +- Store `"source_type": "transcript"` in `MeetingSummary.generation_data` +- The resulting summary is preliminary — accurate to the recording but not the official record + +**When minutes arrive and re-trigger the job:** +- Minutes remain primary input (existing behavior unchanged) +- Transcript `extracted_text` appended as supplementary context: "Additional context from the meeting recording transcript" (15K char limit) +- Store `"source_type": "minutes_with_transcript"` in `generation_data` +- If no transcript exists, behavior is identical to today (`"source_type": "minutes"`) + +**No changes to other extraction jobs.** `ExtractTopicsJob`, `ExtractVotesJob`, and `ExtractCommitteeMembersJob` continue to trigger only from minutes/agenda. + +## Meeting Show Page — Transcript Banner + +**Condition:** `MeetingSummary` exists with `generation_data["source_type"] == "transcript"` and no `minutes_pdf` document on the meeting. + +**Location:** Top of meeting show page, between the meeting meta section and the headline section. + +**Appearance:** A distinct, noticeable callout — NOT the warm theme (too subtle against the cream background). Use a cool or contrasting accent treatment so it reads as informational/cautionary: + +> This summary is based on the meeting's video recording. It will be updated when official minutes are published. + +**Removal:** Automatic. When minutes arrive and `SummarizeMeetingJob` regenerates the summary with `source_type: "minutes"` or `"minutes_with_transcript"`, the banner condition is no longer met. No manual action required. + +## Infrastructure + +### Dockerfile + +Add `yt-dlp` as a system dependency. Download the standalone binary (no Python required): + +```dockerfile +RUN curl -L https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp && \ + chmod +x /usr/local/bin/yt-dlp +``` + +### Schedule + +No new entry in `config/recurring.yml`. 
`DiscoverTranscriptsJob` is enqueued by `DiscoverMeetingsJob` (daily at 11pm). The chain is: + +``` +DiscoverMeetingsJob (11pm daily) + → at end of run, enqueues DiscoverTranscriptsJob + → for each matched video, enqueues DownloadTranscriptJob + → if no minutes summary, enqueues SummarizeMeetingJob +``` + +## Scope Boundaries + +**In scope:** +- Two new jobs (discover + download) +- `SummarizeMeetingJob` modifications (transcript priority, supplementary context, source_type tracking) +- `Meeting#document_status` update +- Transcript banner on meeting show page +- Dockerfile update for `yt-dlp` +- `DiscoverMeetingsJob` change to enqueue transcript discovery + +**Out of scope:** +- Speaker diarization / attribution +- Vote or member extraction from transcripts +- Admin UI for manual video-to-meeting linking +- Backfilling old videos (manual rake task, future work) +- Whisper / local transcription (YouTube auto-captions are sufficient) +- Video download or storage (captions only, `--skip-download`) + +## YouTube Title Matching + +Observed title patterns from the channel (`@Two_Rivers_WI/streams`): + +| Pattern | Example | Frequency | +|---------|---------|-----------| +| Standard | `Two Rivers City Council Meeting for Monday, April 6, 2026` | ~90% | +| Work Session | `Two Rivers City Council Work Session for Monday, March 30, 2026` | ~8% | +| Special | `Joint Meeting of Plan Commission, EAB, Advisory Recreation Board, & City Council 7/23/2025` | ~2% | + +The regex handles the standard and work session patterns. Special/joint meeting titles are logged and skipped. The discovery job only looks for videos matching meetings in the last 48 hours, not the full back-catalog. 
diff --git a/test/jobs/documents/download_transcript_job_test.rb b/test/jobs/documents/download_transcript_job_test.rb new file mode 100644 index 0000000..9a118ca --- /dev/null +++ b/test/jobs/documents/download_transcript_job_test.rb @@ -0,0 +1,164 @@ +require "test_helper" +require "ostruct" +require "open3" + +module Documents + class DownloadTranscriptJobTest < ActiveJob::TestCase + include ActiveJob::TestHelper + + SAMPLE_SRT = <<~SRT + 1 + 00:00:01,000 --> 00:00:03,000 + Welcome to the city council meeting. + + 2 + 00:00:04,000 --> 00:00:06,500 + Tonight we will discuss the budget proposal. + + 3 + 00:00:07,000 --> 00:00:09,000 + Public input is now open. + + SRT + + def setup + @meeting = Meeting.create!( + body_name: "City Council", + meeting_type: "Regular", + starts_at: Time.zone.local(2026, 3, 15, 18, 0, 0), + status: "held", + detail_page_url: "http://example.com/meetings/transcript-test-#{SecureRandom.hex(4)}" + ) + @video_url = "https://www.youtube.com/watch?v=abc123" + end + + # Stubs Dir.mktmpdir("transcript") to yield a real tmpdir that already has the SRT file. + # Restores Dir.mktmpdir after the block. 
+ def stub_yt_dlp(srt_content) + Dir.mktmpdir("test-transcript") do |tmpdir| + srt_path = File.join(tmpdir, "video.en.srt") + File.write(srt_path, srt_content) + + original_mktmpdir = Dir.method(:mktmpdir) + Dir.define_singleton_method(:mktmpdir) do |*args, &block| + if args.first == "transcript" + block.call(tmpdir) + else + original_mktmpdir.call(*args, &block) + end + end + + Open3.stub :capture3, [ "", "", OpenStruct.new(success?: true) ] do + yield + end + ensure + Dir.define_singleton_method(:mktmpdir, original_mktmpdir) + end + end + + # ----------------------------------------------------------------------- + # Test 1: creates MeetingDocument with correct attributes and file attached + # ----------------------------------------------------------------------- + test "downloads transcript and creates MeetingDocument" do + stub_yt_dlp(SAMPLE_SRT) do + assert_difference "MeetingDocument.count", 1 do + DownloadTranscriptJob.perform_now(@meeting.id, @video_url) + end + end + + doc = @meeting.meeting_documents.find_by!(document_type: "transcript") + assert_equal @video_url, doc.source_url + assert_equal "auto_transcribed", doc.text_quality + assert_not_includes doc.extracted_text, "00:00:01,000 --> 00:00:03,000", "SRT timestamps must be stripped" + refute_match(/^\d+\s*$/, doc.extracted_text, "SRT sequence numbers must be stripped") + assert_includes doc.extracted_text, "Welcome to the city council meeting." + assert_includes doc.extracted_text, "Tonight we will discuss the budget proposal." 
+ assert_operator doc.text_chars, :>, 0 + assert_not_nil doc.fetched_at + assert doc.file.attached?, "SRT file should be attached" + end + + # ----------------------------------------------------------------------- + # Test 2: skips if transcript document already exists + # ----------------------------------------------------------------------- + test "skips if meeting already has a transcript document" do + MeetingDocument.create!( + meeting: @meeting, + document_type: "transcript", + source_url: @video_url, + text_quality: "auto_transcribed", + extracted_text: "existing transcript" + ) + + stub_yt_dlp(SAMPLE_SRT) do + assert_no_difference "MeetingDocument.count" do + DownloadTranscriptJob.perform_now(@meeting.id, @video_url) + end + end + end + + # ----------------------------------------------------------------------- + # Test 3: enqueues SummarizeMeetingJob when no minutes_recap summary + # ----------------------------------------------------------------------- + test "enqueues SummarizeMeetingJob when no minutes_recap summary exists" do + stub_yt_dlp(SAMPLE_SRT) do + assert_enqueued_with(job: SummarizeMeetingJob, args: [ @meeting.id ]) do + DownloadTranscriptJob.perform_now(@meeting.id, @video_url) + end + end + end + + # ----------------------------------------------------------------------- + # Test 4: does not enqueue SummarizeMeetingJob when minutes_recap exists + # ----------------------------------------------------------------------- + test "does not enqueue SummarizeMeetingJob when minutes_recap summary exists" do + MeetingSummary.create!( + meeting: @meeting, + summary_type: "minutes_recap", + content: "Existing minutes recap" + ) + + stub_yt_dlp(SAMPLE_SRT) do + assert_no_enqueued_jobs(only: SummarizeMeetingJob) do + DownloadTranscriptJob.perform_now(@meeting.id, @video_url) + end + end + end + + # ----------------------------------------------------------------------- + # Test 5: handles yt-dlp failure gracefully (no document created) + # 
----------------------------------------------------------------------- + test "handles yt-dlp failure gracefully without creating a document" do + original_mktmpdir = Dir.method(:mktmpdir) + Dir.define_singleton_method(:mktmpdir) do |*args, &block| + if args.first == "transcript" + Dir.mktmpdir("test-transcript-fail") do |tmpdir| + # No SRT file written — yt-dlp "failed" + block.call(tmpdir) + end + else + original_mktmpdir.call(*args, &block) + end + end + + begin + Open3.stub :capture3, [ "", "ERROR: Unable to download", OpenStruct.new(success?: false) ] do + assert_no_difference "MeetingDocument.count" do + DownloadTranscriptJob.perform_now(@meeting.id, @video_url) + end + end + ensure + Dir.define_singleton_method(:mktmpdir, original_mktmpdir) + end + end + + # ----------------------------------------------------------------------- + # Test 6: rejects invalid video URLs + # ----------------------------------------------------------------------- + test "rejects invalid video URL without creating a document" do + assert_no_difference "MeetingDocument.count" do + DownloadTranscriptJob.perform_now(@meeting.id, "https://evil.com/malicious?v=abc") + end + end + end +end diff --git a/test/jobs/scrapers/discover_transcripts_job_test.rb b/test/jobs/scrapers/discover_transcripts_job_test.rb new file mode 100644 index 0000000..3a33b15 --- /dev/null +++ b/test/jobs/scrapers/discover_transcripts_job_test.rb @@ -0,0 +1,104 @@ +require "test_helper" +require "minitest/mock" + +class Scrapers::DiscoverTranscriptsJobTest < ActiveJob::TestCase + setup do + @council_meeting = Meeting.create!( + body_name: "City Council Meeting", + detail_page_url: "http://example.com/council-apr-6", + starts_at: 1.day.ago + ) + @work_session = Meeting.create!( + body_name: "City Council Work Session", + detail_page_url: "http://example.com/ws-mar-30", + starts_at: 47.hours.ago + ) + @plan_commission = Meeting.create!( + body_name: "Plan Commission", + detail_page_url: 
"http://example.com/plan-commission", + starts_at: 1.day.ago + ) + @old_meeting = Meeting.create!( + body_name: "City Council Meeting", + detail_page_url: "http://example.com/council-old", + starts_at: 5.days.ago + ) + end + + def stub_status(success_bool) + status = Minitest::Mock.new + status.expect :success?, success_bool + status + end + + test "parses standard council meeting title and enqueues download" do + date_str = @council_meeting.starts_at.strftime("%B %-d, %Y") + yt_output = "abc123 | City Council Meeting for Thursday, #{date_str}\n" + + Open3.stub :capture3, [ yt_output, "", stub_status(true) ] do + assert_enqueued_with(job: Documents::DownloadTranscriptJob, args: [ @council_meeting.id, "https://www.youtube.com/watch?v=abc123" ]) do + Scrapers::DiscoverTranscriptsJob.perform_now + end + end + end + + test "parses work session title and enqueues download" do + date_str = @work_session.starts_at.strftime("%B %-d, %Y") + yt_output = "def456 | City Council Work Session for Monday, #{date_str}\n" + + Open3.stub :capture3, [ yt_output, "", stub_status(true) ] do + assert_enqueued_with(job: Documents::DownloadTranscriptJob, args: [ @work_session.id, "https://www.youtube.com/watch?v=def456" ]) do + Scrapers::DiscoverTranscriptsJob.perform_now + end + end + end + + test "skips videos that cannot be parsed" do + yt_output = "xyz999 | Some Random Live Stream\n" + + Open3.stub :capture3, [ yt_output, "", stub_status(true) ] do + assert_no_enqueued_jobs only: Documents::DownloadTranscriptJob do + Scrapers::DiscoverTranscriptsJob.perform_now + end + end + end + + test "skips meetings that already have a transcript" do + MeetingDocument.create!( + meeting: @council_meeting, + document_type: "transcript", + source_url: "https://www.youtube.com/watch?v=existing" + ) + + date_str = @council_meeting.starts_at.strftime("%B %-d, %Y") + yt_output = "abc123 | City Council Meeting for Thursday, #{date_str}\n" + + Open3.stub :capture3, [ yt_output, "", stub_status(true) ] do 
+ assert_no_enqueued_jobs only: Documents::DownloadTranscriptJob do + Scrapers::DiscoverTranscriptsJob.perform_now + end + end + end + + test "skips non-council meetings even if title matches date" do + date_str = @plan_commission.starts_at.strftime("%B %-d, %Y") + # Plan Commission title won't match TITLE_PATTERN — no job enqueued + yt_output = "ghi789 | Plan Commission Meeting for Tuesday, #{date_str}\n" + + Open3.stub :capture3, [ yt_output, "", stub_status(true) ] do + assert_no_enqueued_jobs only: Documents::DownloadTranscriptJob do + Scrapers::DiscoverTranscriptsJob.perform_now + end + end + end + + test "handles yt-dlp failure gracefully" do + Open3.stub :capture3, [ "", "yt-dlp: command not found", stub_status(false) ] do + assert_no_enqueued_jobs only: Documents::DownloadTranscriptJob do + assert_nothing_raised do + Scrapers::DiscoverTranscriptsJob.perform_now + end + end + end + end +end diff --git a/test/jobs/summarize_meeting_job_test.rb b/test/jobs/summarize_meeting_job_test.rb index d30c467..5d70e6c 100644 --- a/test/jobs/summarize_meeting_job_test.rb +++ b/test/jobs/summarize_meeting_job_test.rb @@ -63,7 +63,8 @@ def retrieval_stub.format_topic_context(*args); []; end summary = @meeting.meeting_summaries.find_by(summary_type: "minutes_recap") assert summary, "Should create a minutes_recap summary" - assert_equal generation_data, summary.generation_data + assert_equal "minutes", summary.generation_data["source_type"] + assert_equal generation_data["headline"], summary.generation_data["headline"] assert_nil summary.content end @@ -194,6 +195,98 @@ def retrieval_stub.format_topic_context(*args); []; end mock_ai.verify end + test "generates meeting summary from transcript when no minutes exist" do + doc = @meeting.meeting_documents.create!( + document_type: "transcript", + source_url: "http://example.com/transcript.txt", + extracted_text: "Transcript of meeting: The council discussed the budget at length." 
+ ) + + generation_data = { + "headline" => "Council discussed the budget", + "highlights" => [ + { "text" => "Budget discussed", "citation" => "Transcript", "impact" => "medium" } + ], + "public_input" => [], + "item_details" => [], + "source_type" => "transcript" + } + + mock_ai = Minitest::Mock.new + mock_ai.expect :prepare_kb_context, "" do |arg| arg.is_a?(Array) end + mock_ai.expect :analyze_meeting_content, generation_data.to_json do |text, kb, type, **kwargs| + type == "transcript" + end + # Topic-level mocks + mock_ai.expect :analyze_topic_summary, '{"factual_record": []}' do |arg| arg.is_a?(Hash) end + mock_ai.expect :render_topic_summary, "## Summary" do |arg| arg.is_a?(String) end + + retrieval_stub = Object.new + def retrieval_stub.retrieve_context(*args, **kwargs); []; end + def retrieval_stub.format_context(*args); ""; end + def retrieval_stub.retrieve_topic_context(*args, **kwargs); []; end + def retrieval_stub.format_topic_context(*args); []; end + + RetrievalService.stub :new, retrieval_stub do + Ai::OpenAiService.stub :new, mock_ai do + SummarizeMeetingJob.perform_now(@meeting.id) + end + end + + summary = @meeting.meeting_summaries.find_by(summary_type: "transcript_recap") + assert summary, "Should create a transcript_recap summary" + assert_equal "transcript", summary.generation_data["source_type"] + assert_nil @meeting.meeting_summaries.find_by(summary_type: "minutes_recap") + end + + test "minutes take priority over transcript" do + @meeting.meeting_documents.create!( + document_type: "minutes_pdf", + source_url: "http://example.com/minutes.pdf", + extracted_text: "Page 1: The council approved the budget 5-2." + ) + @meeting.meeting_documents.create!( + document_type: "transcript", + source_url: "http://example.com/transcript.txt", + extracted_text: "Transcript of meeting: The council discussed the budget." 
+ ) + + generation_data = { + "headline" => "Council approved the budget", + "highlights" => [ + { "text" => "Budget approved", "citation" => "Page 1", "vote" => "5-2", "impact" => "high" } + ], + "public_input" => [], + "item_details" => [] + } + + mock_ai = Minitest::Mock.new + mock_ai.expect :prepare_kb_context, "" do |arg| arg.is_a?(Array) end + mock_ai.expect :analyze_meeting_content, generation_data.to_json do |text, kb, type, **kwargs| + type == "minutes" + end + # Topic-level mocks + mock_ai.expect :analyze_topic_summary, '{"factual_record": []}' do |arg| arg.is_a?(Hash) end + mock_ai.expect :render_topic_summary, "## Summary" do |arg| arg.is_a?(String) end + + retrieval_stub = Object.new + def retrieval_stub.retrieve_context(*args, **kwargs); []; end + def retrieval_stub.format_context(*args); ""; end + def retrieval_stub.retrieve_topic_context(*args, **kwargs); []; end + def retrieval_stub.format_topic_context(*args); []; end + + RetrievalService.stub :new, retrieval_stub do + Ai::OpenAiService.stub :new, mock_ai do + SummarizeMeetingJob.perform_now(@meeting.id) + end + end + + summary = @meeting.meeting_summaries.find_by(summary_type: "minutes_recap") + assert summary, "Should create minutes_recap" + assert_equal "minutes_with_transcript", summary.generation_data["source_type"] + assert_nil @meeting.meeting_summaries.find_by(summary_type: "transcript_recap"), "Should NOT create transcript_recap" + end + test "enqueues GenerateTopicBriefingJob after topic summary generation" do mock_ai = Minitest::Mock.new # Meeting-level: prepare_kb_context called (no docs, so no analyze call) diff --git a/test/models/meeting_test.rb b/test/models/meeting_test.rb index 3d2f5b3..88fdce8 100644 --- a/test/models/meeting_test.rb +++ b/test/models/meeting_test.rb @@ -50,4 +50,42 @@ class MeetingTest < ActiveSupport::TestCase assert_equal :none, meeting.document_status end + + test "document_status returns :transcript when transcript exists but no minutes or packet" do + 
meeting = Meeting.create!( + detail_page_url: "http://example.com/transcript-1", + starts_at: Time.current + ) + meeting.meeting_documents.create!( + document_type: "transcript", + source_url: "https://www.youtube.com/watch?v=test123" + ) + assert_equal :transcript, meeting.document_status + end + + test "document_status returns :minutes even when transcript exists" do + meeting = Meeting.create!( + detail_page_url: "http://example.com/transcript-2", + starts_at: Time.current + ) + meeting.meeting_documents.create!(document_type: "minutes_pdf") + meeting.meeting_documents.create!( + document_type: "transcript", + source_url: "https://www.youtube.com/watch?v=test123" + ) + assert_equal :minutes, meeting.document_status + end + + test "document_status returns :transcript above :agenda" do + meeting = Meeting.create!( + detail_page_url: "http://example.com/transcript-3", + starts_at: Time.current + ) + meeting.meeting_documents.create!(document_type: "agenda_pdf") + meeting.meeting_documents.create!( + document_type: "transcript", + source_url: "https://www.youtube.com/watch?v=test123" + ) + assert_equal :transcript, meeting.document_status + end end