From 3459c85a44dfb6c1f30bfb29d5c884755f6c8059 Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Mon, 28 Feb 2022 17:58:27 +0000
Subject: [PATCH 01/19] First attempt at batch vectorisation

---
 pystoi/stoi.py                | 62 ++++++++++++++++++++---------------
 pystoi/utils.py               | 60 ++++++++++++++++++---------------
 tests/test_overlap_and_add.py | 54 ++++++++++++++++++++++++++----
 3 files changed, 117 insertions(+), 59 deletions(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index e955afb..87aeb35 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -46,9 +46,13 @@ def stoi(x, y, fs_sig, extended=False):
             IEEE Transactions on Audio, Speech and Language Processing, 2016.
     """
     if x.shape != y.shape:
-        raise Exception('x and y should have the same length,' +
+        raise Exception('x and y should have the same shape,' +
                         'found {} and {}'.format(x.shape, y.shape))
 
+    if len(x.shape) == 1:  # Add a batch size if missing
+        x = x[None, :]
+        y = y[None, :]
+
     # Resample is fs_sig is different than fs
     if fs_sig != FS:
         x = utils.resample_oct(x, FS, fs_sig)
@@ -58,58 +62,62 @@ def stoi(x, y, fs_sig, extended=False):
     x, y = utils.remove_silent_frames(x, y, DYN_RANGE, N_FRAME, int(N_FRAME/2))
 
     # Take STFT
-    x_spec = utils.stft(x, N_FRAME, NFFT, overlap=2).transpose()
-    y_spec = utils.stft(y, N_FRAME, NFFT, overlap=2).transpose()
+    x_spec = utils.stft(x, N_FRAME, NFFT, overlap=2)
+    y_spec = utils.stft(y, N_FRAME, NFFT, overlap=2)
 
     # Ensure at least 30 frames for intermediate intelligibility
-    if x_spec.shape[-1] < N:
+    mask = ~np.all(x_spec == 0, axis=-1)
+    if np.any(np.sum(mask, axis=-1) < N):
         warnings.warn('Not enough STFT frames to compute intermediate '
                       'intelligibility measure after removing silent '
                       'frames. Returning 1e-5. Please check you wav files',
                       RuntimeWarning)
-        return 1e-5
+        return np.squeeze([1e-5 for _ in range(x)])
 
     # Apply OB matrix to the spectrograms as in Eq. (1)
-    x_tob = np.sqrt(np.matmul(OBM, np.square(np.abs(x_spec))))
-    y_tob = np.sqrt(np.matmul(OBM, np.square(np.abs(y_spec))))
+    x_tob = np.sqrt(np.matmul(np.square(np.abs(x_spec)), OBM.T))
+    y_tob = np.sqrt(np.matmul(np.square(np.abs(y_spec)), OBM.T))
 
-    # Take segments of x_tob, y_tob
+    # Take segments of x_tob, y_tob, shape (batch, num_segments, seg_size, bands)
     x_segments = np.array(
-        [x_tob[:, m - N:m] for m in range(N, x_tob.shape[1] + 1)])
+        [x_tob[:, m - N : m] for m in range(N, x_tob.shape[1] + 1)]
+    ).transpose([1, 0, 2, 3])
+    x_segments = x_segments * mask[:, N-1:, None, None]
     y_segments = np.array(
-        [y_tob[:, m - N:m] for m in range(N, x_tob.shape[1] + 1)])
+        [y_tob[:, m - N : m] for m in range(N, x_tob.shape[1] + 1)]
+    ).transpose([1, 0, 2, 3])
+    y_segments = y_segments * mask[:, N-1:, None, None]
 
-    if extended:
-        x_n = utils.row_col_normalize(x_segments)
-        y_n = utils.row_col_normalize(y_segments)
-        return np.sum(x_n * y_n / N) / x_n.shape[0]
+    if extended:  # TODO: Vectorialise this
+        x_n = np.array([utils.row_col_normalize(xi) for xi in x_segments])
+        y_n = np.array([utils.row_col_normalize(yi) for yi in y_segments])
+        return np.sum(x_n * y_n / N, axis=(1, 2, 3)) / x_n.shape[1]
 
     else:
         # Find normalization constants and normalize
-        normalization_consts = (
-            np.linalg.norm(x_segments, axis=2, keepdims=True) /
-            (np.linalg.norm(y_segments, axis=2, keepdims=True) + utils.EPS))
+        normalization_consts = np.linalg.norm(x_segments, axis=-1, keepdims=True) / (
+            np.linalg.norm(y_segments, axis=-1, keepdims=True) + utils.EPS
+        )
         y_segments_normalized = y_segments * normalization_consts
 
         # Clip as described in [1]
         clip_value = 10 ** (-BETA / 20)
-        y_primes = np.minimum(
-            y_segments_normalized, x_segments * (1 + clip_value))
+        y_primes = np.minimum(y_segments_normalized, x_segments * (1 + clip_value))
 
         # Subtract mean vectors
-        y_primes = y_primes - np.mean(y_primes, axis=2, keepdims=True)
-        x_segments = x_segments - np.mean(x_segments, axis=2, keepdims=True)
+        y_primes = y_primes - np.mean(y_primes, axis=-1, keepdims=True)
+        x_segments = x_segments - np.mean(x_segments, axis=-1, keepdims=True)
 
         # Divide by their norms
-        y_primes /= (np.linalg.norm(y_primes, axis=2, keepdims=True) + utils.EPS)
-        x_segments /= (np.linalg.norm(x_segments, axis=2, keepdims=True) + utils.EPS)
+        y_primes /= np.linalg.norm(y_primes, axis=-1, keepdims=True) + utils.EPS
+        x_segments /= np.linalg.norm(x_segments, axis=-1, keepdims=True) + utils.EPS
         # Find a matrix with entries summing to sum of correlations of vectors
-        correlations_components = y_primes * x_segments
+        correlations_components = np.sum(y_primes * x_segments, axis=-2)
 
         # J, M as in [1], eq.6
-        J = x_segments.shape[0]
+        J = x_segments.shape[2]
         M = x_segments.shape[1]
 
         # Find the mean of all correlations
-        d = np.sum(correlations_components) / (J * M)
-        return d
+        d = np.sum(correlations_components, axis=(-2, -1)) / (J * M)
+        return np.squeeze(d)
diff --git a/pystoi/utils.py b/pystoi/utils.py
index c1fdb3e..4eb6f15 100644
--- a/pystoi/utils.py
+++ b/pystoi/utils.py
@@ -47,7 +47,7 @@ def resample_oct(x, p, q):
     """Resampler that is compatible with Octave"""
     h = _resample_window_oct(p, q)
     window = h / np.sum(h)
-    return resample_poly(x, p, q, window=window)
+    return resample_poly(x, p, q, axis=-1, window=window)
 
 
 @functools.lru_cache(maxsize=None)
@@ -95,39 +95,45 @@ def stft(x, win_size, fft_size, overlap=4):
     """
     hop = int(win_size / overlap)
     w = np.hanning(win_size + 2)[1: -1]  # = matlab.hanning(win_size)
-    stft_out = np.array([np.fft.rfft(w * x[i:i + win_size], n=fft_size)
-                        for i in range(0, len(x) - win_size, hop)])
-    return stft_out
+    stft_out = np.array([np.fft.rfft(w * x[:, i:i + win_size], n=fft_size)
+                        for i in range(0, x.shape[-1] - win_size, hop)])
+    return stft_out.transpose([1, 0, 2])
 
 
 def _overlap_and_add(x_frames, hop):
-    num_frames, framelen = x_frames.shape
+    batch_size, num_frames, framelen = x_frames.shape
     # Compute the number of segments, per frame.
     segments = -(-framelen // hop)  # Divide and round up.
 
     # Pad the framelen dimension to segments * hop and add n=segments frames
-    signal = np.pad(x_frames, ((0, segments), (0, segments * hop - framelen)))
-
-    # Reshape to a 3D tensor, splitting the framelen dimension in two
-    signal = signal.reshape((num_frames + segments, segments, hop))
-    # Transpose dimensions so that signal.shape = (segments, frame+segments, hop)
-    signal = np.transpose(signal, [1, 0, 2])
-    # Reshape so that signal.shape = (segments * (frame+segments), hop)
-    signal = signal.reshape((-1, hop))
-
-    # Now behold the magic!! Remove the last n=segments elements from the first axis
-    signal = signal[:-segments]
-    # Reshape to (segments, frame+segments-1, hop)
-    signal = signal.reshape((segments, num_frames + segments - 1, hop))
+    signal = np.pad(x_frames, ((0, 0), (0, segments), (0, segments * hop - framelen)))
+
+    # Reshape to a 4D tensor, splitting the framelen dimension in two
+    signal = signal.reshape((batch_size, num_frames + segments, segments, hop))
+    # Transpose dimensions so that signal.shape = (batch, segments, frame+segments, hop)
+    signal = np.transpose(signal, [0, 2, 1, 3])
+    # Reshape so that signal.shape = (batch, segments * (frame+segments), hop)
+    signal = signal.reshape((batch_size, -1, hop))
+
+    # Now behold the magic!! Remove the last n=segments elements from the second axis
+    signal = signal[:, :-segments]
+    # Reshape to (batch, segments, frame+segments-1, hop)
+    signal = signal.reshape((batch_size, segments, num_frames + segments - 1, hop))
     # This has introduced a shift by one in all rows
 
     # Now, reduce over the columns and flatten the array to achieve the result
-    signal = np.sum(signal, axis=0)
-    end = (len(x_frames) - 1) * hop + framelen
-    signal = signal.reshape(-1)[:end]
+    signal = np.sum(signal, axis=1)
+    end = (num_frames - 1) * hop + framelen
+    signal = signal.reshape((batch_size, -1))[:end]
     return signal
 
 
+def _mask_audio(x, mask):
+    return np.array([
+        np.pad(xi[mi], ((0, len(xi) - np.sum(mi)), (0, 0))) for xi, mi in zip(x, mask)
+    ])
+
+
 def remove_silent_frames(x, y, dyn_range, framelen, hop):
     """ Remove silent frames of x and y based on x
     A frame is excluded if its energy is lower than max(energy) - dyn_range
@@ -146,20 +152,22 @@ def remove_silent_frames(x, y, dyn_range, framelen, hop):
     w = np.hanning(framelen + 2)[1:-1]
 
     x_frames = np.array(
-        [w * x[i:i + framelen] for i in range(0, len(x) - framelen, hop)])
+        [w * x[..., i : i + framelen] for i in range(0, x.shape[-1] - framelen, hop)]
+    ).transpose([1, 0, 2])
     y_frames = np.array(
-        [w * y[i:i + framelen] for i in range(0, len(x) - framelen, hop)])
+        [w * y[..., i : i + framelen] for i in range(0, x.shape[-1] - framelen, hop)]
+    ).transpose([1, 0, 2])
 
     # Compute energies in dB
-    x_energies = 20 * np.log10(np.linalg.norm(x_frames, axis=1) + EPS)
+    x_energies = 20 * np.log10(np.linalg.norm(x_frames, axis=-1) + EPS)
 
     # Find boolean mask of energies lower than dynamic_range dB
     # with respect to maximum clean speech energy frame
     mask = (np.max(x_energies) - dyn_range - x_energies) < 0
 
     # Remove silent frames by masking
-    x_frames = x_frames[mask]
-    y_frames = y_frames[mask]
+    x_frames = _mask_audio(x_frames, mask)
+    y_frames = _mask_audio(y_frames, mask)
 
     x_sil = _overlap_and_add(x_frames, hop)
     y_sil = _overlap_and_add(y_frames, hop)
diff --git a/tests/test_overlap_and_add.py b/tests/test_overlap_and_add.py
index c6137d6..dfa7670 100644
--- a/tests/test_overlap_and_add.py
+++ b/tests/test_overlap_and_add.py
@@ -1,7 +1,8 @@
 import numpy as np
+import pytest
 from numpy.testing import assert_allclose
 
-from pystoi.stoi import N_FRAME
+from pystoi.stoi import N_FRAME, stoi, FS
 from pystoi.utils import _overlap_and_add
 
 
@@ -15,12 +16,53 @@ def old_overlap_and_app(x_frames, hop):
             x_sil[range(i * hop, i * hop + framelen)] += x_frames[i, :]
         return x_sil
 
+    batch_size = 4
     # Initialize
-    x = np.random.randn(1000 * N_FRAME)
+    x = np.random.randn(batch_size, 1000 * N_FRAME)
     # Add silence segment
-    silence = np.zeros(10 * N_FRAME)
-    x = np.concatenate([x[: 500 * N_FRAME], silence, x[500 * N_FRAME :]])
-    x = x.reshape([-1, N_FRAME])
-    xs = old_overlap_and_app(x, N_FRAME // 2)
+    silence = np.zeros((batch_size, 10 * N_FRAME))
+    x = np.concatenate([x[:, : 500 * N_FRAME], silence, x[:, 500 * N_FRAME :]], axis=1)
+    x = x.reshape([batch_size, -1, N_FRAME])
+    xs = [old_overlap_and_app(xi, N_FRAME // 2) for xi in x]
     xs_vectorise = _overlap_and_add(x, N_FRAME // 2)
     assert_allclose(xs, xs_vectorise)
+
+
+@pytest.mark.parametrize("batch_size", [1, 4])
+@pytest.mark.parametrize("fs", [10000, 16000])
+@pytest.mark.parametrize("extended", [True, False])
+def test_pystoi_run(batch_size, fs, extended):
+    N = fs * 4  # 4 seconds of random audio
+    x = np.random.randn(batch_size, N)
+    res = stoi(x, x, fs, extended)
+    print(batch_size, fs, extended, res)
+    assert res.shape == () if batch_size == 1 else (batch_size,)
+
+
+@pytest.mark.parametrize("extended", [True, False])
+def test_pystoi_silence(extended):
+    rng = np.random.default_rng(seed=0)
+    batch_size = 4
+    fs = 16000
+    N = fs * 4  # 4 seconds of random audio
+    x = np.random.randn(batch_size, N)
+    silence = np.random.randn(int(N / 7))
+    audio = []
+    for i in range(batch_size):
+        t = int(rng.random() * N)
+        audio.append(np.concatenate([x[i, :t], silence, x[i, t:]]))
+    audio = np.array(audio)
+    res = stoi(audio, audio, fs, extended)
+    print(batch_size, fs, extended, res)
+    assert res.shape == () if batch_size == 1 else (batch_size,)
+
+
+def test_vectorisation():
+    # Initialize batch of data
+    batch_size = 4
+    x = np.random.random((batch_size, 100 * N_FRAME))
+    y = np.random.random((batch_size, 100 * N_FRAME))
+    res_vec = stoi(x, y, FS)
+    res_old = np.array([stoi(xi, yi, FS) for xi, yi in zip(x, y)])
+    assert res_vec.shape == (batch_size,)
+    assert np.allclose(res_old, res_vec)

From 5b9cb189994da1ad59431374ce7049e0a953ee71 Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Mon, 28 Feb 2022 17:59:38 +0000
Subject: [PATCH 02/19] Add a missing squeeze

---
 pystoi/stoi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 87aeb35..bbf58a3 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -91,7 +91,7 @@ def stoi(x, y, fs_sig, extended=False):
     if extended:  # TODO: Vectorialise this
         x_n = np.array([utils.row_col_normalize(xi) for xi in x_segments])
         y_n = np.array([utils.row_col_normalize(yi) for yi in y_segments])
-        return np.sum(x_n * y_n / N, axis=(1, 2, 3)) / x_n.shape[1]
+        return np.squeeze(np.sum(x_n * y_n / N, axis=(1, 2, 3)) / x_n.shape[1])
 
     else:
         # Find normalization constants and normalize

From f1eac1a584c415ee4b8bf7f66c6446a7565316a0 Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 16:22:44 +0000
Subject: [PATCH 03/19] Add some comments about shapes

---
 pystoi/stoi.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index bbf58a3..3c241a4 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -49,7 +49,7 @@ def stoi(x, y, fs_sig, extended=False):
         raise Exception('x and y should have the same shape,' +
                         'found {} and {}'.format(x.shape, y.shape))
 
-    if len(x.shape) == 1:  # Add a batch size if missing
+    if len(x.shape) == 1:  # Add a batch size if missing, shape (batch, num_samples)
         x = x[None, :]
         y = y[None, :]
 
@@ -61,12 +61,12 @@ def stoi(x, y, fs_sig, extended=False):
     # Remove silent frames
     x, y = utils.remove_silent_frames(x, y, DYN_RANGE, N_FRAME, int(N_FRAME/2))
 
-    # Take STFT
+    # Take STFT, shape (batch, num_frames, n_fft//2 + 1)
     x_spec = utils.stft(x, N_FRAME, NFFT, overlap=2)
     y_spec = utils.stft(y, N_FRAME, NFFT, overlap=2)
 
     # Ensure at least 30 frames for intermediate intelligibility
-    mask = ~np.all(x_spec == 0, axis=-1)
+    mask = ~np.all(x_spec == 0, axis=-1)  # Check for frames with non-zero values
     if np.any(np.sum(mask, axis=-1) < N):
         warnings.warn('Not enough STFT frames to compute intermediate '
                       'intelligibility measure after removing silent '
@@ -74,7 +74,7 @@ def stoi(x, y, fs_sig, extended=False):
                       RuntimeWarning)
         return np.squeeze([1e-5 for _ in range(x)])
 
-    # Apply OB matrix to the spectrograms as in Eq. (1)
+    # Apply OB matrix to the spectrograms as in Eq. (1), shape (batch, frames, bands)
     x_tob = np.sqrt(np.matmul(np.square(np.abs(x_spec)), OBM.T))
     y_tob = np.sqrt(np.matmul(np.square(np.abs(y_spec)), OBM.T))
 

From 9d2c2731ab768c65c0405ec648729392ca3beb3a Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 16:23:11 +0000
Subject: [PATCH 04/19] Method extraction

---
 pystoi/stoi.py  | 10 ++--------
 pystoi/utils.py |  7 +++++++
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 3c241a4..8a56484 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -79,14 +79,8 @@ def stoi(x, y, fs_sig, extended=False):
     y_tob = np.sqrt(np.matmul(np.square(np.abs(y_spec)), OBM.T))
 
     # Take segments of x_tob, y_tob, shape (batch, num_segments, seg_size, bands)
-    x_segments = np.array(
-        [x_tob[:, m - N : m] for m in range(N, x_tob.shape[1] + 1)]
-    ).transpose([1, 0, 2, 3])
-    x_segments = x_segments * mask[:, N-1:, None, None]
-    y_segments = np.array(
-        [y_tob[:, m - N : m] for m in range(N, x_tob.shape[1] + 1)]
-    ).transpose([1, 0, 2, 3])
-    y_segments = y_segments * mask[:, N-1:, None, None]
+    x_segments = utils._segment_frames(x_tob, mask, N)
+    y_segments = utils._segment_frames(y_tob, mask, N)
 
     if extended:  # TODO: Vectorialise this
         x_n = np.array([utils.row_col_normalize(xi) for xi in x_segments])
diff --git a/pystoi/utils.py b/pystoi/utils.py
index 4eb6f15..0ea9149 100644
--- a/pystoi/utils.py
+++ b/pystoi/utils.py
@@ -134,6 +134,13 @@ def _mask_audio(x, mask):
     ])
 
 
+def _segment_frames(x, mask, seg_size):
+    # TODO: Vectorise this!
+    segments = np.array([x[:, m - seg_size: m] for m in range(seg_size, x.shape[1] + 1)])
+    segments = segments.transpose([1, 0, 2, 3])  # put back batch in the first dimension
+    return segments * mask[:, seg_size - 1:, None, None]
+
+
 def remove_silent_frames(x, y, dyn_range, framelen, hop):
     """ Remove silent frames of x and y based on x
     A frame is excluded if its energy is lower than max(energy) - dyn_range

From 4a7a5bbd76928cef757b36cf914ef2588f1c7ba4 Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 16:44:17 +0000
Subject: [PATCH 05/19] Remove a wrong todo

---
 pystoi/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pystoi/utils.py b/pystoi/utils.py
index 0ea9149..b1d579a 100644
--- a/pystoi/utils.py
+++ b/pystoi/utils.py
@@ -135,7 +135,6 @@ def _mask_audio(x, mask):
 
 
 def _segment_frames(x, mask, seg_size):
-    # TODO: Vectorise this!
     segments = np.array([x[:, m - seg_size: m] for m in range(seg_size, x.shape[1] + 1)])
     segments = segments.transpose([1, 0, 2, 3])  # put back batch in the first dimension
     return segments * mask[:, seg_size - 1:, None, None]

From 15a8e212ebfa602e39591c6eb91af11a54d31ad2 Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 16:46:51 +0000
Subject: [PATCH 06/19] Stupid refactoring

---
 pystoi/stoi.py  | 4 ++--
 pystoi/utils.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 8a56484..8727e41 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -79,8 +79,8 @@ def stoi(x, y, fs_sig, extended=False):
     y_tob = np.sqrt(np.matmul(np.square(np.abs(y_spec)), OBM.T))
 
     # Take segments of x_tob, y_tob, shape (batch, num_segments, seg_size, bands)
-    x_segments = utils._segment_frames(x_tob, mask, N)
-    y_segments = utils._segment_frames(y_tob, mask, N)
+    x_segments = utils.segment_frames(x_tob, mask, N)
+    y_segments = utils.segment_frames(y_tob, mask, N)
 
     if extended:  # TODO: Vectorialise this
         x_n = np.array([utils.row_col_normalize(xi) for xi in x_segments])
diff --git a/pystoi/utils.py b/pystoi/utils.py
index b1d579a..83d9fd0 100644
--- a/pystoi/utils.py
+++ b/pystoi/utils.py
@@ -134,7 +134,7 @@ def _mask_audio(x, mask):
     ])
 
 
-def _segment_frames(x, mask, seg_size):
+def segment_frames(x, mask, seg_size):
     segments = np.array([x[:, m - seg_size: m] for m in range(seg_size, x.shape[1] + 1)])
     segments = segments.transpose([1, 0, 2, 3])  # put back batch in the first dimension
     return segments * mask[:, seg_size - 1:, None, None]

From e1a597d635be05541d650ba8ab673e5f737f26af Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 19:30:06 +0000
Subject: [PATCH 07/19] Vectorise row_col_normalize

---
 pystoi/stoi.py  |  9 +++++----
 pystoi/utils.py | 24 ++++++++----------------
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 8727e41..6f8d068 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -82,10 +82,11 @@ def stoi(x, y, fs_sig, extended=False):
     x_segments = utils.segment_frames(x_tob, mask, N)
     y_segments = utils.segment_frames(y_tob, mask, N)
 
-    if extended:  # TODO: Vectorialise this
-        x_n = np.array([utils.row_col_normalize(xi) for xi in x_segments])
-        y_n = np.array([utils.row_col_normalize(yi) for yi in y_segments])
-        return np.squeeze(np.sum(x_n * y_n / N, axis=(1, 2, 3)) / x_n.shape[1])
+    if extended:
+        x_n = utils.row_col_normalize(x_segments)
+        y_n = utils.row_col_normalize(y_segments)
+        d_n = np.mean(np.sum(x_n * y_n, axis=3), axis=2)
+        return np.squeeze(np.mean(d_n, axis=1))
 
     else:
         # Find normalization constants and normalize
diff --git a/pystoi/utils.py b/pystoi/utils.py
index 83d9fd0..ea012d0 100644
--- a/pystoi/utils.py
+++ b/pystoi/utils.py
@@ -181,25 +181,17 @@ def remove_silent_frames(x, y, dyn_range, framelen, hop):
     return x_sil, y_sil
 
 
-def vect_two_norm(x, axis=-1):
-    """ Returns an array of vectors of norms of the rows of matrices from 3D array """
-    return np.sum(np.square(x), axis=axis, keepdims=True)
-
-
 def row_col_normalize(x):
     """ Row and column mean and variance normalize an array of 2D segments """
+    # input shape (batch, num_segments, seg_size, bands)
     # Row mean and variance normalization
     x_normed = x + EPS * np.random.standard_normal(x.shape)
-    x_normed -= np.mean(x_normed, axis=-1, keepdims=True)
-    x_inv = 1. / np.sqrt(vect_two_norm(x_normed))
-    x_diags = np.array(
-        [np.diag(x_inv[i].reshape(-1)) for i in range(x_inv.shape[0])])
-    x_normed = np.matmul(x_diags, x_normed)
+    x_normed -= np.mean(x_normed, axis=-2, keepdims=True)
+    x_inv = 1. / np.linalg.norm(x_normed, axis=-2)
+    x_normed = x_normed * x_inv[..., None, :]
     # Column mean and variance normalization
-    x_normed += + EPS * np.random.standard_normal(x_normed.shape)
-    x_normed -= np.mean(x_normed, axis=1, keepdims=True)
-    x_inv = 1. / np.sqrt(vect_two_norm(x_normed, axis=1))
-    x_diags = np.array(
-        [np.diag(x_inv[i].reshape(-1)) for i in range(x_inv.shape[0])])
-    x_normed = np.matmul(x_normed, x_diags)
+    x_normed += EPS * np.random.standard_normal(x_normed.shape)
+    x_normed -= np.mean(x_normed, axis=-1, keepdims=True)
+    x_inv = 1. / np.linalg.norm(x_normed, axis=-1)
+    x_normed = x_normed * x_inv[..., None]
     return x_normed

From b64dc8ffd4653917ed61e1c8f868199d7755589c Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 19:31:25 +0000
Subject: [PATCH 08/19] Fix axes for sums

---
 pystoi/stoi.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 6f8d068..5f80c3c 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -90,8 +90,8 @@ def stoi(x, y, fs_sig, extended=False):
 
     else:
         # Find normalization constants and normalize
-        normalization_consts = np.linalg.norm(x_segments, axis=-1, keepdims=True) / (
-            np.linalg.norm(y_segments, axis=-1, keepdims=True) + utils.EPS
+        normalization_consts = np.linalg.norm(x_segments, axis=-2, keepdims=True) / (
+            np.linalg.norm(y_segments, axis=-2, keepdims=True) + utils.EPS
         )
         y_segments_normalized = y_segments * normalization_consts
 
@@ -100,19 +100,15 @@ def stoi(x, y, fs_sig, extended=False):
         y_primes = np.minimum(y_segments_normalized, x_segments * (1 + clip_value))
 
         # Subtract mean vectors
-        y_primes = y_primes - np.mean(y_primes, axis=-1, keepdims=True)
-        x_segments = x_segments - np.mean(x_segments, axis=-1, keepdims=True)
+        y_primes = y_primes - np.mean(y_primes, axis=-2, keepdims=True)
+        x_segments = x_segments - np.mean(x_segments, axis=-2, keepdims=True)
 
         # Divide by their norms
-        y_primes /= np.linalg.norm(y_primes, axis=-1, keepdims=True) + utils.EPS
-        x_segments /= np.linalg.norm(x_segments, axis=-1, keepdims=True) + utils.EPS
+        y_primes /= np.linalg.norm(y_primes, axis=-2, keepdims=True) + utils.EPS
+        x_segments /= np.linalg.norm(x_segments, axis=-2, keepdims=True) + utils.EPS
         # Find a matrix with entries summing to sum of correlations of vectors
-        correlations_components = np.sum(y_primes * x_segments, axis=-2)
-
-        # J, M as in [1], eq.6
-        J = x_segments.shape[2]
-        M = x_segments.shape[1]
+        correlations_components = np.sum(y_primes * x_segments, axis=-2, keepdims=True)
 
         # Find the mean of all correlations
-        d = np.sum(correlations_components, axis=(-2, -1)) / (J * M)
+        d = np.mean(correlations_components, axis=(-3, -1), keepdims=True)
         return np.squeeze(d)

From 39efe60ac63455f4dd617538ad4006901c437d53 Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 19:47:50 +0000
Subject: [PATCH 09/19] More understandable shape handling

---
 pystoi/stoi.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 5f80c3c..6d4232f 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -78,10 +78,11 @@ def stoi(x, y, fs_sig, extended=False):
     x_tob = np.sqrt(np.matmul(np.square(np.abs(x_spec)), OBM.T))
     y_tob = np.sqrt(np.matmul(np.square(np.abs(y_spec)), OBM.T))
 
-    # Take segments of x_tob, y_tob, shape (batch, num_segments, seg_size, bands)
+    # Take segments of x_tob, y_tob
     x_segments = utils.segment_frames(x_tob, mask, N)
     y_segments = utils.segment_frames(y_tob, mask, N)
 
+    # From now on the shape is always (batch, num_segments, seg_size, bands)
     if extended:
         x_n = utils.row_col_normalize(x_segments)
         y_n = utils.row_col_normalize(y_segments)
@@ -90,8 +91,8 @@ def stoi(x, y, fs_sig, extended=False):
 
     else:
         # Find normalization constants and normalize
-        normalization_consts = np.linalg.norm(x_segments, axis=-2, keepdims=True) / (
-            np.linalg.norm(y_segments, axis=-2, keepdims=True) + utils.EPS
+        normalization_consts = np.linalg.norm(x_segments, axis=2, keepdims=True) / (
+            np.linalg.norm(y_segments, axis=2, keepdims=True) + utils.EPS
         )
         y_segments_normalized = y_segments * normalization_consts
 
@@ -100,15 +101,15 @@ def stoi(x, y, fs_sig, extended=False):
         y_primes = np.minimum(y_segments_normalized, x_segments * (1 + clip_value))
 
         # Subtract mean vectors
-        y_primes = y_primes - np.mean(y_primes, axis=-2, keepdims=True)
-        x_segments = x_segments - np.mean(x_segments, axis=-2, keepdims=True)
+        y_primes = y_primes - np.mean(y_primes, axis=2, keepdims=True)
+        x_segments = x_segments - np.mean(x_segments, axis=2, keepdims=True)
 
         # Divide by their norms
-        y_primes /= np.linalg.norm(y_primes, axis=-2, keepdims=True) + utils.EPS
-        x_segments /= np.linalg.norm(x_segments, axis=-2, keepdims=True) + utils.EPS
+        y_primes /= np.linalg.norm(y_primes, axis=2, keepdims=True) + utils.EPS
+        x_segments /= np.linalg.norm(x_segments, axis=2, keepdims=True) + utils.EPS
         # Find a matrix with entries summing to sum of correlations of vectors
         correlations_components = np.sum(y_primes * x_segments, axis=-2, keepdims=True)
 
         # Find the mean of all correlations
-        d = np.mean(correlations_components, axis=(-3, -1), keepdims=True)
+        d = np.mean(correlations_components, axis=(1, 3), keepdims=True)
         return np.squeeze(d)

From dd804f75f20a330ab80f1e5d4328f1ba4acb9c26 Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 19:48:16 +0000
Subject: [PATCH 10/19] Fix the value for stoi with silence

---
 pystoi/stoi.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 6d4232f..3efcb00 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -112,4 +112,5 @@ def stoi(x, y, fs_sig, extended=False):
 
         # Find the mean of all correlations
         d = np.mean(correlations_components, axis=(1, 3), keepdims=True)
+        d *= np.mean(mask, axis=1, keepdims=True)[..., None, None]
         return np.squeeze(d)

From f47fd0fea34bc6dad1f066ffe7621fe4122e972b Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 19:50:49 +0000
Subject: [PATCH 11/19] Revert cosmetic change

---
 pystoi/stoi.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 3efcb00..0883bba 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -91,9 +91,9 @@ def stoi(x, y, fs_sig, extended=False):
 
     else:
         # Find normalization constants and normalize
-        normalization_consts = np.linalg.norm(x_segments, axis=2, keepdims=True) / (
-            np.linalg.norm(y_segments, axis=2, keepdims=True) + utils.EPS
-        )
+        normalization_consts = (
+            np.linalg.norm(x_segments, axis=2, keepdims=True) /
+            (np.linalg.norm(y_segments, axis=2, keepdims=True) + utils.EPS))
         y_segments_normalized = y_segments * normalization_consts
 
         # Clip as described in [1]

From 2473b98728809cd0c10a753cd2ebbfbccc89d7eb Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 19:56:07 +0000
Subject: [PATCH 12/19] Revert cosmetic change

---
 pystoi/stoi.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 0883bba..48b5f5b 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -105,8 +105,8 @@ def stoi(x, y, fs_sig, extended=False):
         x_segments = x_segments - np.mean(x_segments, axis=2, keepdims=True)
 
         # Divide by their norms
-        y_primes /= np.linalg.norm(y_primes, axis=2, keepdims=True) + utils.EPS
-        x_segments /= np.linalg.norm(x_segments, axis=2, keepdims=True) + utils.EPS
+        y_primes /= (np.linalg.norm(y_primes, axis=2, keepdims=True) + utils.EPS)
+        x_segments /= (np.linalg.norm(x_segments, axis=2, keepdims=True) + utils.EPS)
         # Find a matrix with entries summing to sum of correlations of vectors
         correlations_components = np.sum(y_primes * x_segments, axis=-2, keepdims=True)
 

From c66365b8453cab5ba2d27b637f2d0b9f04dcde42 Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 19:59:52 +0000
Subject: [PATCH 13/19] Fix docs

---
 pystoi/stoi.py  |  2 ++
 pystoi/utils.py | 18 +++++++++---------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 48b5f5b..521f30e 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -112,5 +112,7 @@ def stoi(x, y, fs_sig, extended=False):
 
         # Find the mean of all correlations
         d = np.mean(correlations_components, axis=(1, 3), keepdims=True)
+        # Exclude the contribution of silent frames from the calculation of the mean
         d *= np.mean(mask, axis=1, keepdims=True)[..., None, None]
+        # Return just a float if stoi was called with a single waveform
         return np.squeeze(d)
diff --git a/pystoi/utils.py b/pystoi/utils.py
index ea012d0..3ae3ae5 100644
--- a/pystoi/utils.py
+++ b/pystoi/utils.py
@@ -128,18 +128,18 @@ def _overlap_and_add(x_frames, hop):
     return signal
 
 
-def _mask_audio(x, mask):
-    return np.array([
-        np.pad(xi[mi], ((0, len(xi) - np.sum(mi)), (0, 0))) for xi, mi in zip(x, mask)
-    ])
-
-
 def segment_frames(x, mask, seg_size):
     segments = np.array([x[:, m - seg_size: m] for m in range(seg_size, x.shape[1] + 1)])
     segments = segments.transpose([1, 0, 2, 3])  # put back batch in the first dimension
     return segments * mask[:, seg_size - 1:, None, None]
 
 
+def _mask_audio(x, mask):
+    return np.array([
+        np.pad(xi[mi], ((0, len(xi) - np.sum(mi)), (0, 0))) for xi, mi in zip(x, mask)
+    ])
+
+
 def remove_silent_frames(x, y, dyn_range, framelen, hop):
     """ Remove silent frames of x and y based on x
     A frame is excluded if its energy is lower than max(energy) - dyn_range
@@ -151,8 +151,8 @@ def remove_silent_frames(x, y, dyn_range, framelen, hop):
         framelen : Window size for energy evaluation
         hop : Hop size for energy evaluation
     # Returns :
-        x without the silent frames
-        y without the silent frames (aligned to x)
+        x without the silent frames, zero-padded to the original length
+        y without the silent frames, zero-padded to the original length (aligned to x)
     """
     # Compute Mask
     w = np.hanning(framelen + 2)[1:-1]
@@ -171,7 +171,7 @@ def remove_silent_frames(x, y, dyn_range, framelen, hop):
     # with respect to maximum clean speech energy frame
     mask = (np.max(x_energies) - dyn_range - x_energies) < 0
 
-    # Remove silent frames by masking
+    # Remove silent frames and pad with zeroes
     x_frames = _mask_audio(x_frames, mask)
     y_frames = _mask_audio(y_frames, mask)
 

From da2967be8df16dd5c9946ae4493b8d5213c06bc4 Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 20:11:54 +0000
Subject: [PATCH 14/19] Fix output shapes if called with a single waveform

---
 pystoi/stoi.py                | 15 +++++++++------
 tests/test_overlap_and_add.py |  2 +-
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 521f30e..78423bc 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -19,7 +19,8 @@ def stoi(x, y, fs_sig, extended=False):
     Computes the STOI (See [1][2]) of a denoised signal compared to a clean
     signal, The output is expected to have a monotonic relation with the
     subjective speech-intelligibility, where a higher score denotes better
-    speech intelligibility.
+    speech intelligibility. Accepts either a single waveform or a batch of
+    waveforms stored in a numpy array.
 
     # Arguments
         x (np.ndarray): clean original speech
@@ -28,8 +29,9 @@ def stoi(x, y, fs_sig, extended=False):
         extended (bool): Boolean, whether to use the extended STOI described in [3]
 
     # Returns
-        float: Short time objective intelligibility measure between clean and
-        denoised speech
+        float or np.ndarray: Short time objective intelligibility measure between
+        clean and denoised speech. Returns float if called with a single waveform,
+        np.ndarray if called with a batch of waveforms
 
     # Raises
         AssertionError : if x and y have different lengths
@@ -49,6 +51,7 @@ def stoi(x, y, fs_sig, extended=False):
         raise Exception('x and y should have the same shape,' +
                         'found {} and {}'.format(x.shape, y.shape))
 
+    out_shape = x.shape[:-1]
     if len(x.shape) == 1:  # Add a batch size if missing, shape (batch, num_samples)
         x = x[None, :]
         y = y[None, :]
@@ -72,7 +75,7 @@ def stoi(x, y, fs_sig, extended=False):
                       'intelligibility measure after removing silent '
                       'frames. Returning 1e-5. Please check you wav files',
                       RuntimeWarning)
-        return np.squeeze([1e-5 for _ in range(x)])
+        return np.array([1e-5 for _ in range(x)]).reshape(out_shape)
 
     # Apply OB matrix to the spectrograms as in Eq. (1), shape (batch, frames, bands)
     x_tob = np.sqrt(np.matmul(np.square(np.abs(x_spec)), OBM.T))
@@ -87,7 +90,7 @@ def stoi(x, y, fs_sig, extended=False):
         x_n = utils.row_col_normalize(x_segments)
         y_n = utils.row_col_normalize(y_segments)
         d_n = np.mean(np.sum(x_n * y_n, axis=3), axis=2)
-        return np.squeeze(np.mean(d_n, axis=1))
+        return np.mean(d_n, axis=1).reshape(out_shape)
 
     else:
         # Find normalization constants and normalize
@@ -115,4 +118,4 @@ def stoi(x, y, fs_sig, extended=False):
         # Exclude the contribution of silent frames from the calculation of the mean
         d *= np.mean(mask, axis=1, keepdims=True)[..., None, None]
         # Return just a float if stoi was called with a single waveform
-        return np.squeeze(d)
+        return np.squeeze(d).reshape(out_shape)
diff --git a/tests/test_overlap_and_add.py b/tests/test_overlap_and_add.py
index dfa7670..437f315 100644
--- a/tests/test_overlap_and_add.py
+++ b/tests/test_overlap_and_add.py
@@ -36,7 +36,7 @@ def test_pystoi_run(batch_size, fs, extended):
     x = np.random.randn(batch_size, N)
     res = stoi(x, x, fs, extended)
     print(batch_size, fs, extended, res)
-    assert res.shape == () if batch_size == 1 else (batch_size,)
+    assert res.shape == x.shape[:-1]
 
 
 @pytest.mark.parametrize("extended", [True, False])

From d95b27da627f2f027d0dd8d5e8b0cab73a8dee73 Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 21:04:27 +0000
Subject: [PATCH 15/19] Remove two expensive (and maybe useless?) random
 sampling

---
 pystoi/utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pystoi/utils.py b/pystoi/utils.py
index 3ae3ae5..a0940fa 100644
--- a/pystoi/utils.py
+++ b/pystoi/utils.py
@@ -185,12 +185,10 @@ def row_col_normalize(x):
     """ Row and column mean and variance normalize an array of 2D segments """
     # input shape (batch, num_segments, seg_size, bands)
     # Row mean and variance normalization
-    x_normed = x + EPS * np.random.standard_normal(x.shape)
-    x_normed -= np.mean(x_normed, axis=-2, keepdims=True)
+    x_normed = x - np.mean(x, axis=-2, keepdims=True)
     x_inv = 1. / np.linalg.norm(x_normed, axis=-2)
     x_normed = x_normed * x_inv[..., None, :]
     # Column mean and variance normalization
-    x_normed += EPS * np.random.standard_normal(x_normed.shape)
     x_normed -= np.mean(x_normed, axis=-1, keepdims=True)
     x_inv = 1. / np.linalg.norm(x_normed, axis=-1)
     x_normed = x_normed * x_inv[..., None]

From e7a4026e99622268209e938c2f97495d9717791d Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 21:05:12 +0000
Subject: [PATCH 16/19] Tiny change

---
 pystoi/utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pystoi/utils.py b/pystoi/utils.py
index a0940fa..0088cc2 100644
--- a/pystoi/utils.py
+++ b/pystoi/utils.py
@@ -186,10 +186,10 @@ def row_col_normalize(x):
     # input shape (batch, num_segments, seg_size, bands)
     # Row mean and variance normalization
     x_normed = x - np.mean(x, axis=-2, keepdims=True)
-    x_inv = 1. / np.linalg.norm(x_normed, axis=-2)
-    x_normed = x_normed * x_inv[..., None, :]
+    x_inv = 1. / np.linalg.norm(x_normed, axis=-2, keepdims=True)
+    x_normed = x_normed * x_inv
     # Column mean and variance normalization
     x_normed -= np.mean(x_normed, axis=-1, keepdims=True)
-    x_inv = 1. / np.linalg.norm(x_normed, axis=-1)
-    x_normed = x_normed * x_inv[..., None]
+    x_inv = 1. / np.linalg.norm(x_normed, axis=-1, keepdims=True)
+    x_normed = x_normed * x_inv
     return x_normed

From 0f214e4510c02fba58c69c87d654c8419da2ec09 Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 21:10:27 +0000
Subject: [PATCH 17/19] Fix shapes in tests

---
 tests/test_overlap_and_add.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_overlap_and_add.py b/tests/test_overlap_and_add.py
index 437f315..8583c52 100644
--- a/tests/test_overlap_and_add.py
+++ b/tests/test_overlap_and_add.py
@@ -54,7 +54,7 @@ def test_pystoi_silence(extended):
     audio = np.array(audio)
     res = stoi(audio, audio, fs, extended)
     print(batch_size, fs, extended, res)
-    assert res.shape == () if batch_size == 1 else (batch_size,)
+    assert res.shape == x.shape[:-1]
 
 
 def test_vectorisation():
@@ -62,7 +62,7 @@ def test_vectorisation():
     batch_size = 4
     x = np.random.random((batch_size, 100 * N_FRAME))
     y = np.random.random((batch_size, 100 * N_FRAME))
+    res = np.array([stoi(xi, yi, FS) for xi, yi in zip(x, y)])
     res_vec = stoi(x, y, FS)
-    res_old = np.array([stoi(xi, yi, FS) for xi, yi in zip(x, y)])
-    assert res_vec.shape == (batch_size,)
-    assert np.allclose(res_old, res_vec)
+    assert res_vec.shape == x.shape[:-1]
+    assert np.allclose(res, res_vec)

From 959df2710bef6a06c93f187005a8667ef8d9183e Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Tue, 1 Mar 2022 21:10:45 +0000
Subject: [PATCH 18/19] Add test with bugfix

---
 pystoi/stoi.py                |  2 +-
 tests/test_overlap_and_add.py | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/pystoi/stoi.py b/pystoi/stoi.py
index 78423bc..920871b 100644
--- a/pystoi/stoi.py
+++ b/pystoi/stoi.py
@@ -75,7 +75,7 @@ def stoi(x, y, fs_sig, extended=False):
                       'intelligibility measure after removing silent '
                       'frames. Returning 1e-5. Please check you wav files',
                       RuntimeWarning)
-        return np.array([1e-5 for _ in range(x)]).reshape(out_shape)
+        return np.array([1e-5 for _ in range(x.shape[0])]).reshape(out_shape)
 
     # Apply OB matrix to the spectrograms as in Eq. (1), shape (batch, frames, bands)
     x_tob = np.sqrt(np.matmul(np.square(np.abs(x_spec)), OBM.T))
diff --git a/tests/test_overlap_and_add.py b/tests/test_overlap_and_add.py
index 8583c52..b86bd20 100644
--- a/tests/test_overlap_and_add.py
+++ b/tests/test_overlap_and_add.py
@@ -39,6 +39,17 @@ def test_pystoi_run(batch_size, fs, extended):
     assert res.shape == x.shape[:-1]
 
 
+@pytest.mark.parametrize("extended", [True, False])
+@pytest.mark.parametrize("batch_size", [1, 4])
+def test_pystoi_complete_silence(batch_size, extended):
+    fs = 16000
+    N = fs * 4  # 4 seconds of random audio
+    x = np.zeros((batch_size, N))
+    res = stoi(x, x, fs, extended)
+    print(batch_size, fs, extended, res)
+    assert res.shape == x.shape[:-1]
+
+
 @pytest.mark.parametrize("extended", [True, False])
 def test_pystoi_silence(extended):
     rng = np.random.default_rng(seed=0)

From 11eed46a31db7eaf89801b356eb7fbaca8aa192e Mon Sep 17 00:00:00 2001
From: giamic <gianluca.micchi@gmail.com>
Date: Wed, 2 Mar 2022 09:33:14 +0000
Subject: [PATCH 19/19] Minor rewriting

---
 pystoi/utils.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/pystoi/utils.py b/pystoi/utils.py
index 0088cc2..abc7660 100644
--- a/pystoi/utils.py
+++ b/pystoi/utils.py
@@ -184,12 +184,10 @@ def remove_silent_frames(x, y, dyn_range, framelen, hop):
 def row_col_normalize(x):
     """ Row and column mean and variance normalize an array of 2D segments """
     # input shape (batch, num_segments, seg_size, bands)
-    # Row mean and variance normalization
+    # Row mean and variance normalization -- axis: seg_size
     x_normed = x - np.mean(x, axis=-2, keepdims=True)
-    x_inv = 1. / np.linalg.norm(x_normed, axis=-2, keepdims=True)
-    x_normed = x_normed * x_inv
-    # Column mean and variance normalization
+    x_normed = x_normed / (np.linalg.norm(x_normed, axis=-2, keepdims=True) + EPS)
+    # Column mean and variance normalization -- axis: bands
     x_normed -= np.mean(x_normed, axis=-1, keepdims=True)
-    x_inv = 1. / np.linalg.norm(x_normed, axis=-1, keepdims=True)
-    x_normed = x_normed * x_inv
+    x_normed = x_normed / (np.linalg.norm(x_normed, axis=-1, keepdims=True) + EPS)
     return x_normed