From 696de06b95f8eb2b2bc67bdf786fb2aae15b76d3 Mon Sep 17 00:00:00 2001
From: pretyflaco <kemal@blinkbtc.com>
Date: Tue, 5 May 2026 12:39:03 +0300
Subject: [PATCH] fix(label): handle sensitive condenser mics in YOU detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dual-channel YOU/REMOTE labeling in _label_speakers_from_channels
declares a speaker as YOU only if their mic-channel-energy ratio
exceeds 0.5.  Sensitive condenser mics (e.g. RODE NT-USB) pick up
enough room audio that the local speaker's ratio sits below 0.5 even
though they are clearly the most mic-dominant — so YOU never gets
assigned and the local speaker becomes a REMOTE_X.

Add a relative-margin check alongside the absolute one: if the top
candidate's ratio is more than 0.1 above the average of all other
speakers' ratios (and absolute ratio > 0.15 to avoid silent edge
cases), assign YOU.  Log the margin and average for debugging.

Test: synthesize a 3-segment stereo WAV where the local speaker has
ratio ~0.4 (below 0.5) but average of other speakers is ~0.05;
assert YOU is assigned via the margin path.

All 5 TestLabelSpeakersFromChannels tests pass.
---
 meet/transcribe.py       | 20 ++++++++++++-
 tests/test_transcribe.py | 63 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/meet/transcribe.py b/meet/transcribe.py
index c9e6b09..b7d06d4 100644
--- a/meet/transcribe.py
+++ b/meet/transcribe.py
@@ -1320,10 +1320,28 @@ def _label_speakers_from_channels(
     # The speaker with the highest mic ratio is YOU — but only if they
     # are actually mic-dominant.  When no speaker exceeds the threshold
     # (e.g. only system audio was captured), label everyone as REMOTE.
+    #
+    # We use both an absolute check (ratio > 0.5) and a relative margin
+    # check.  Sensitive condenser mics (e.g. RODE NT-USB) pick up room
+    # audio on the mic channel, which can push the local speaker's ratio
+    # below 0.5 even though they are clearly the most mic-dominant.  The
+    # margin check catches this case: if the top speaker's ratio is well
+    # above the average of all other speakers, they are almost certainly
+    # the local mic user.
     you_speaker = max(speaker_mic_ratio, key=lambda s: speaker_mic_ratio[s])
+    you_ratio = speaker_mic_ratio[you_speaker]
+
+    other_ratios = [r for s, r in speaker_mic_ratio.items() if s != you_speaker]
+    avg_other = sum(other_ratios) / len(other_ratios) if other_ratios else 0.0
+    margin = you_ratio - avg_other
+
+    print(
+        f"    Best candidate: {you_speaker} "
+        f"(ratio={you_ratio:.3f}, margin={margin:.3f} over avg={avg_other:.3f})"
+    )
 
     label_map: dict[str, str] = {}
-    if speaker_mic_ratio[you_speaker] > 0.5:
+    if you_ratio > 0.5 or (margin > 0.1 and you_ratio > 0.15):
         # At least one speaker is mic-dominant
         label_map[you_speaker] = "YOU"
         remote_speakers = [s for s in sorted(speaker_mic_ratio) if s != you_speaker]
diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py
index e2c8379..f5e93dc 100644
--- a/tests/test_transcribe.py
+++ b/tests/test_transcribe.py
@@ -227,6 +227,69 @@ def test_no_mic_dominant_speaker_all_remote(self, tmp_path):
         # All should be REMOTE variants
         assert all("REMOTE" in label for label in labels)
 
+    def test_sensitive_condenser_mic_assigns_you_via_margin(self, tmp_path):
+        """Sensitive condenser mics (e.g. RODE NT-USB) pick up enough room
+        audio that the local speaker's mic_ratio sits below 0.5, even though
+        they are clearly the most mic-dominant.  The margin check (top
+        candidate >0.1 above the average of others, with absolute >0.15)
+        should still assign YOU in this case."""
+        import numpy as np
+        import wave as wave_mod
+        from meet.transcribe import _label_speakers_from_channels
+
+        sr = 16000
+        # 9s of audio = three 3s segments back-to-back.
+        n_frames_each = int(3.0 * sr)
+        t_each = np.linspace(0, 3.0, n_frames_each, dtype=np.float32)
+
+        # Speaker 0 (the local user, talks 0-3s).  The mic channel is the
+        # weaker one here: its share of the total is ~0.41 by amplitude
+        # (~0.32 by energy), i.e. below 0.5 but well above 0.15.
+        mic_seg0 = (5500 * np.sin(2 * np.pi * 440 * t_each)).astype(np.int16)
+        sys_seg0 = (8000 * np.sin(2 * np.pi * 880 * t_each)).astype(np.int16)
+
+        # Speaker 1 (remote, talks 3-6s).  System channel dominant.
+        mic_seg1 = (500 * np.sin(2 * np.pi * 220 * t_each)).astype(np.int16)
+        sys_seg1 = (20000 * np.sin(2 * np.pi * 1100 * t_each)).astype(np.int16)
+
+        # Speaker 2 (remote, talks 6-9s).  System channel dominant.
+        mic_seg2 = (500 * np.sin(2 * np.pi * 330 * t_each)).astype(np.int16)
+        sys_seg2 = (20000 * np.sin(2 * np.pi * 1320 * t_each)).astype(np.int16)
+
+        mic = np.concatenate([mic_seg0, mic_seg1, mic_seg2])
+        system = np.concatenate([sys_seg0, sys_seg1, sys_seg2])
+        stereo = np.column_stack((mic, system)).flatten()
+
+        wav_path = tmp_path / "condenser.wav"
+        with wave_mod.open(str(wav_path), "wb") as wf:
+            wf.setnchannels(2)
+            wf.setsampwidth(2)
+            wf.setframerate(sr)
+            wf.writeframes(stereo.tobytes())
+
+        segments = [
+            Segment(start=0.0, end=3.0, text="local user", speaker="SPEAKER_00"),
+            Segment(start=3.0, end=6.0, text="remote a",   speaker="SPEAKER_01"),
+            Segment(start=6.0, end=9.0, text="remote b",   speaker="SPEAKER_02"),
+        ]
+        speakers = [
+            Speaker(id="SPEAKER_00"),
+            Speaker(id="SPEAKER_01"),
+            Speaker(id="SPEAKER_02"),
+        ]
+
+        new_segs, _ = _label_speakers_from_channels(
+            wav_path, segments, speakers,
+        )
+
+        labels = {s.speaker for s in new_segs}
+        # SPEAKER_00 should be labeled YOU even though its absolute ratio
+        # is below 0.5 — the margin over the average of the other two
+        # speakers' ratios is large enough.
+        assert "YOU" in labels, (
+            f"expected YOU label via margin check, got labels={labels}"
+        )
+
 
 # ─── TranscriptionConfig validation ──────────────────────────────────────