From 696de06b95f8eb2b2bc67bdf786fb2aae15b76d3 Mon Sep 17 00:00:00 2001 From: pretyflaco Date: Tue, 5 May 2026 12:39:03 +0300 Subject: [PATCH] fix(label): handle sensitive condenser mics in YOU detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dual-channel YOU/REMOTE labeling in _label_speakers_from_channels declares a speaker as YOU only if their mic-channel-energy ratio exceeds 0.5. Sensitive condenser mics (e.g. RODE NT-USB) pick up enough room audio that the local speaker's ratio sits below 0.5 even though they are clearly the most mic-dominant — so YOU never gets assigned and the local speaker becomes a REMOTE_X. Add a relative-margin check alongside the absolute one: if the top candidate's ratio is more than 0.1 above the average of all other speakers' ratios (and absolute ratio > 0.15 to avoid silent edge cases), assign YOU. Log the margin and average for debugging. Test: synthesize a 3-segment stereo WAV where the local speaker has ratio ~0.4 (below 0.5) but average of other speakers is ~0.05; assert YOU is assigned via the margin path. All 5 TestLabelSpeakersFromChannels tests pass. --- meet/transcribe.py | 20 ++++++++++++- tests/test_transcribe.py | 63 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/meet/transcribe.py b/meet/transcribe.py index c9e6b09..b7d06d4 100644 --- a/meet/transcribe.py +++ b/meet/transcribe.py @@ -1320,10 +1320,28 @@ def _label_speakers_from_channels( # The speaker with the highest mic ratio is YOU — but only if they # are actually mic-dominant. When no speaker exceeds the threshold # (e.g. only system audio was captured), label everyone as REMOTE. + # + # We use both an absolute check (ratio > 0.5) and a relative margin + # check. Sensitive condenser mics (e.g. RODE NT-USB) pick up room + # audio on the mic channel, which can push the local speaker's ratio + # below 0.5 even though they are clearly the most mic-dominant. 
The + # margin check catches this case: if the top speaker's ratio is well + # above the average of all other speakers, they are almost certainly + # the local mic user. you_speaker = max(speaker_mic_ratio, key=lambda s: speaker_mic_ratio[s]) + you_ratio = speaker_mic_ratio[you_speaker] + + other_ratios = [r for s, r in speaker_mic_ratio.items() if s != you_speaker] + avg_other = sum(other_ratios) / len(other_ratios) if other_ratios else 0.0 + margin = you_ratio - avg_other + + print( + f" Best candidate: {you_speaker} " + f"(ratio={you_ratio:.3f}, margin={margin:.3f} over avg={avg_other:.3f})" + ) label_map: dict[str, str] = {} - if speaker_mic_ratio[you_speaker] > 0.5: + if you_ratio > 0.5 or (margin > 0.1 and you_ratio > 0.15): # At least one speaker is mic-dominant label_map[you_speaker] = "YOU" remote_speakers = [s for s in sorted(speaker_mic_ratio) if s != you_speaker] diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index e2c8379..f5e93dc 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -227,6 +227,69 @@ def test_no_mic_dominant_speaker_all_remote(self, tmp_path): # All should be REMOTE variants assert all("REMOTE" in label for label in labels) + def test_sensitive_condenser_mic_assigns_you_via_margin(self, tmp_path): + """Sensitive condenser mics (e.g. RODE NT-USB) pick up enough room + audio that the local speaker's mic_ratio sits below 0.5, even though + they are clearly the most mic-dominant. The margin check (top + candidate >0.1 above the average of others, with absolute >0.15) + should still assign YOU in this case.""" + import numpy as np + import wave as wave_mod + from meet.transcribe import _label_speakers_from_channels + + sr = 16000 + # 9s of audio = three 3s segments back-to-back. + n_frames_each = int(3.0 * sr) + t_each = np.linspace(0, 3.0, n_frames_each, dtype=np.float32) + + # Speaker 0 (the local user, talks 0-3s). 
The mic channel is only + # somewhat quieter than the system channel because the condenser mic + # picks up the speakers' own bleed. ratio ~0.4 by amplitude (<0.5). + mic_seg0 = (5500 * np.sin(2 * np.pi * 440 * t_each)).astype(np.int16) + sys_seg0 = (8000 * np.sin(2 * np.pi * 880 * t_each)).astype(np.int16) + + # Speaker 1 (remote, talks 3-6s). System channel dominant. + mic_seg1 = (500 * np.sin(2 * np.pi * 220 * t_each)).astype(np.int16) + sys_seg1 = (20000 * np.sin(2 * np.pi * 1100 * t_each)).astype(np.int16) + + # Speaker 2 (remote, talks 6-9s). System channel dominant. + mic_seg2 = (500 * np.sin(2 * np.pi * 330 * t_each)).astype(np.int16) + sys_seg2 = (20000 * np.sin(2 * np.pi * 1320 * t_each)).astype(np.int16) + + mic = np.concatenate([mic_seg0, mic_seg1, mic_seg2]) + system = np.concatenate([sys_seg0, sys_seg1, sys_seg2]) + stereo = np.column_stack((mic, system)).flatten() + + wav_path = tmp_path / "condenser.wav" + with wave_mod.open(str(wav_path), "wb") as wf: + wf.setnchannels(2) + wf.setsampwidth(2) + wf.setframerate(sr) + wf.writeframes(stereo.tobytes()) + + segments = [ + Segment(start=0.0, end=3.0, text="local user", speaker="SPEAKER_00"), + Segment(start=3.0, end=6.0, text="remote a", speaker="SPEAKER_01"), + Segment(start=6.0, end=9.0, text="remote b", speaker="SPEAKER_02"), + ] + speakers = [ + Speaker(id="SPEAKER_00"), + Speaker(id="SPEAKER_01"), + Speaker(id="SPEAKER_02"), + ] + + new_segs, _ = _label_speakers_from_channels( + wav_path, segments, speakers, + ) + + labels = {s.speaker for s in new_segs} + # SPEAKER_00 should be labeled YOU even though its absolute ratio + # is below 0.5 — the margin over the average of the other two + # speakers' ratios is large enough. + assert "YOU" in labels, ( + f"expected YOU label via margin check, got labels={labels}" + ) + # ─── TranscriptionConfig validation ──────────────────────────────────────