From 3459c85a44dfb6c1f30bfb29d5c884755f6c8059 Mon Sep 17 00:00:00 2001 From: giamic Date: Mon, 28 Feb 2022 17:58:27 +0000 Subject: [PATCH 01/19] First attempt at batch vectorisation --- pystoi/stoi.py | 62 ++++++++++++++++++++--------------- pystoi/utils.py | 60 ++++++++++++++++++--------------- tests/test_overlap_and_add.py | 54 ++++++++++++++++++++++++++---- 3 files changed, 117 insertions(+), 59 deletions(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index e955afb..87aeb35 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -46,9 +46,13 @@ def stoi(x, y, fs_sig, extended=False): IEEE Transactions on Audio, Speech and Language Processing, 2016. """ if x.shape != y.shape: - raise Exception('x and y should have the same length,' + + raise Exception('x and y should have the same shape,' + 'found {} and {}'.format(x.shape, y.shape)) + if len(x.shape) == 1: # Add a batch size if missing + x = x[None, :] + y = y[None, :] + # Resample is fs_sig is different than fs if fs_sig != FS: x = utils.resample_oct(x, FS, fs_sig) @@ -58,58 +62,62 @@ def stoi(x, y, fs_sig, extended=False): x, y = utils.remove_silent_frames(x, y, DYN_RANGE, N_FRAME, int(N_FRAME/2)) # Take STFT - x_spec = utils.stft(x, N_FRAME, NFFT, overlap=2).transpose() - y_spec = utils.stft(y, N_FRAME, NFFT, overlap=2).transpose() + x_spec = utils.stft(x, N_FRAME, NFFT, overlap=2) + y_spec = utils.stft(y, N_FRAME, NFFT, overlap=2) # Ensure at least 30 frames for intermediate intelligibility - if x_spec.shape[-1] < N: + mask = ~np.all(x_spec == 0, axis=-1) + if np.any(np.sum(mask, axis=-1) < N): warnings.warn('Not enough STFT frames to compute intermediate ' 'intelligibility measure after removing silent ' 'frames. Returning 1e-5. Please check you wav files', RuntimeWarning) - return 1e-5 + return np.squeeze([1e-5 for _ in range(x)]) # Apply OB matrix to the spectrograms as in Eq. (1) - x_tob = np.sqrt(np.matmul(OBM, np.square(np.abs(x_spec)))) - y_tob = np.sqrt(np.matmul(OBM, np.square(np.abs(y_spec)))) + x_tob = np.sqrt(np.matmul(np.square(np.abs(x_spec)), OBM.T)) + y_tob = np.sqrt(np.matmul(np.square(np.abs(y_spec)), OBM.T)) - # Take segments of x_tob, y_tob + # Take segments of x_tob, y_tob, shape (batch, num_segments, seg_size, bands) x_segments = np.array( - [x_tob[:, m - N:m] for m in range(N, x_tob.shape[1] + 1)]) + [x_tob[:, m - N : m] for m in range(N, x_tob.shape[1] + 1)] + ).transpose([1, 0, 2, 3]) + x_segments = x_segments * mask[:, N-1:, None, None] y_segments = np.array( - [y_tob[:, m - N:m] for m in range(N, x_tob.shape[1] + 1)]) + [y_tob[:, m - N : m] for m in range(N, x_tob.shape[1] + 1)] + ).transpose([1, 0, 2, 3]) + y_segments = y_segments * mask[:, N-1:, None, None] - if extended: - x_n = utils.row_col_normalize(x_segments) - y_n = utils.row_col_normalize(y_segments) - return np.sum(x_n * y_n / N) / x_n.shape[0] + if extended: # TODO: Vectorialise this + x_n = np.array([utils.row_col_normalize(xi) for xi in x_segments]) + y_n = np.array([utils.row_col_normalize(yi) for yi in y_segments]) + return np.sum(x_n * y_n / N, axis=(1, 2, 3)) / x_n.shape[1] else: # Find normalization constants and normalize - normalization_consts = ( - np.linalg.norm(x_segments, axis=2, keepdims=True) / - (np.linalg.norm(y_segments, axis=2, keepdims=True) + utils.EPS)) + normalization_consts = np.linalg.norm(x_segments, axis=-1, keepdims=True) / ( + np.linalg.norm(y_segments, axis=-1, keepdims=True) + utils.EPS + ) y_segments_normalized = y_segments * normalization_consts # Clip as described in [1] clip_value = 10 ** (-BETA / 20) - y_primes = np.minimum( - y_segments_normalized, x_segments * (1 + clip_value)) + y_primes = np.minimum(y_segments_normalized, x_segments * (1 + clip_value)) # Subtract mean vectors - y_primes = y_primes - np.mean(y_primes, axis=2, keepdims=True) - x_segments = x_segments - np.mean(x_segments, axis=2, keepdims=True) + y_primes = y_primes - np.mean(y_primes, axis=-1, keepdims=True) + x_segments = x_segments - np.mean(x_segments, axis=-1, keepdims=True) # Divide by their norms - y_primes /= (np.linalg.norm(y_primes, axis=2, keepdims=True) + utils.EPS) - x_segments /= (np.linalg.norm(x_segments, axis=2, keepdims=True) + utils.EPS) + y_primes /= np.linalg.norm(y_primes, axis=-1, keepdims=True) + utils.EPS + x_segments /= np.linalg.norm(x_segments, axis=-1, keepdims=True) + utils.EPS # Find a matrix with entries summing to sum of correlations of vectors - correlations_components = y_primes * x_segments + correlations_components = np.sum(y_primes * x_segments, axis=-2) # J, M as in [1], eq.6 - J = x_segments.shape[0] + J = x_segments.shape[2] M = x_segments.shape[1] # Find the mean of all correlations - d = np.sum(correlations_components) / (J * M) - return d + d = np.sum(correlations_components, axis=(-2, -1)) / (J * M) + return np.squeeze(d) diff --git a/pystoi/utils.py b/pystoi/utils.py index c1fdb3e..4eb6f15 100644 --- a/pystoi/utils.py +++ b/pystoi/utils.py @@ -47,7 +47,7 @@ def resample_oct(x, p, q): """Resampler that is compatible with Octave""" h = _resample_window_oct(p, q) window = h / np.sum(h) - return resample_poly(x, p, q, window=window) + return resample_poly(x, p, q, axis=-1, window=window) @functools.lru_cache(maxsize=None) @@ -95,39 +95,45 @@ def stft(x, win_size, fft_size, overlap=4): """ hop = int(win_size / overlap) w = np.hanning(win_size + 2)[1: -1] # = matlab.hanning(win_size) - stft_out = np.array([np.fft.rfft(w * x[i:i + win_size], n=fft_size) - for i in range(0, len(x) - win_size, hop)]) - return stft_out + stft_out = np.array([np.fft.rfft(w * x[:, i:i + win_size], n=fft_size) + for i in range(0, x.shape[-1] - win_size, hop)]) + return stft_out.transpose([1, 0, 2]) def _overlap_and_add(x_frames, hop): - num_frames, framelen = x_frames.shape + batch_size, num_frames, framelen = x_frames.shape # Compute the number of segments, per frame. segments = -(-framelen // hop) # Divide and round up. # Pad the framelen dimension to segments * hop and add n=segments frames - signal = np.pad(x_frames, ((0, segments), (0, segments * hop - framelen))) - - # Reshape to a 3D tensor, splitting the framelen dimension in two - signal = signal.reshape((num_frames + segments, segments, hop)) - # Transpose dimensions so that signal.shape = (segments, frame+segments, hop) - signal = np.transpose(signal, [1, 0, 2]) - # Reshape so that signal.shape = (segments * (frame+segments), hop) - signal = signal.reshape((-1, hop)) - - # Now behold the magic!! Remove the last n=segments elements from the first axis - signal = signal[:-segments] - # Reshape to (segments, frame+segments-1, hop) - signal = signal.reshape((segments, num_frames + segments - 1, hop)) + signal = np.pad(x_frames, ((0, 0), (0, segments), (0, segments * hop - framelen))) + + # Reshape to a 4D tensor, splitting the framelen dimension in two + signal = signal.reshape((batch_size, num_frames + segments, segments, hop)) + # Transpose dimensions so that signal.shape = (batch, segments, frame+segments, hop) + signal = np.transpose(signal, [0, 2, 1, 3]) + # Reshape so that signal.shape = (batch, segments * (frame+segments), hop) + signal = signal.reshape((batch_size, -1, hop)) + + # Now behold the magic!! Remove the last n=segments elements from the second axis + signal = signal[:, :-segments] + # Reshape to (batch, segments, frame+segments-1, hop) + signal = signal.reshape((batch_size, segments, num_frames + segments - 1, hop)) # This has introduced a shift by one in all rows # Now, reduce over the columns and flatten the array to achieve the result - signal = np.sum(signal, axis=0) - end = (len(x_frames) - 1) * hop + framelen - signal = signal.reshape(-1)[:end] + signal = np.sum(signal, axis=1) + end = (num_frames - 1) * hop + framelen + signal = signal.reshape((batch_size, -1))[:end] return signal +def _mask_audio(x, mask): + return np.array([ + np.pad(xi[mi], ((0, len(xi) - np.sum(mi)), (0, 0))) for xi, mi in zip(x, mask) + ]) + + def remove_silent_frames(x, y, dyn_range, framelen, hop): """ Remove silent frames of x and y based on x A frame is excluded if its energy is lower than max(energy) - dyn_range @@ -146,20 +152,22 @@ def remove_silent_frames(x, y, dyn_range, framelen, hop): w = np.hanning(framelen + 2)[1:-1] x_frames = np.array( - [w * x[i:i + framelen] for i in range(0, len(x) - framelen, hop)]) + [w * x[..., i : i + framelen] for i in range(0, x.shape[-1] - framelen, hop)] + ).transpose([1, 0, 2]) y_frames = np.array( - [w * y[i:i + framelen] for i in range(0, len(x) - framelen, hop)]) + [w * y[..., i : i + framelen] for i in range(0, x.shape[-1] - framelen, hop)] + ).transpose([1, 0, 2]) # Compute energies in dB - x_energies = 20 * np.log10(np.linalg.norm(x_frames, axis=1) + EPS) + x_energies = 20 * np.log10(np.linalg.norm(x_frames, axis=-1) + EPS) # Find boolean mask of energies lower than dynamic_range dB # with respect to maximum clean speech energy frame mask = (np.max(x_energies) - dyn_range - x_energies) < 0 # Remove silent frames by masking - x_frames = x_frames[mask] - y_frames = y_frames[mask] + x_frames = _mask_audio(x_frames, mask) + y_frames = _mask_audio(y_frames, mask) x_sil = _overlap_and_add(x_frames, hop) y_sil = _overlap_and_add(y_frames, hop) diff --git a/tests/test_overlap_and_add.py b/tests/test_overlap_and_add.py index c6137d6..dfa7670 100644 --- a/tests/test_overlap_and_add.py +++ b/tests/test_overlap_and_add.py @@ -1,7 +1,8 @@ import numpy as np +import pytest from numpy.testing import assert_allclose -from pystoi.stoi import N_FRAME +from pystoi.stoi import N_FRAME, stoi, FS from pystoi.utils import _overlap_and_add @@ -15,12 +16,53 @@ def old_overlap_and_app(x_frames, hop): x_sil[range(i * hop, i * hop + framelen)] += x_frames[i, :] return x_sil + batch_size = 4 # Initialize - x = np.random.randn(1000 * N_FRAME) + x = np.random.randn(batch_size, 1000 * N_FRAME) # Add silence segment - silence = np.zeros(10 * N_FRAME) - x = np.concatenate([x[: 500 * N_FRAME], silence, x[500 * N_FRAME :]]) - x = x.reshape([-1, N_FRAME]) - xs = old_overlap_and_app(x, N_FRAME // 2) + silence = np.zeros((batch_size, 10 * N_FRAME)) + x = np.concatenate([x[:, : 500 * N_FRAME], silence, x[:, 500 * N_FRAME :]], axis=1) + x = x.reshape([batch_size, -1, N_FRAME]) + xs = [old_overlap_and_app(xi, N_FRAME // 2) for xi in x] xs_vectorise = _overlap_and_add(x, N_FRAME // 2) assert_allclose(xs, xs_vectorise) + + +@pytest.mark.parametrize("batch_size", [1, 4]) +@pytest.mark.parametrize("fs", [10000, 16000]) +@pytest.mark.parametrize("extended", [True, False]) +def test_pystoi_run(batch_size, fs, extended): + N = fs * 4 # 4 seconds of random audio + x = np.random.randn(batch_size, N) + res = stoi(x, x, fs, extended) + print(batch_size, fs, extended, res) + assert res.shape == () if batch_size == 1 else (batch_size,) + + +@pytest.mark.parametrize("extended", [True, False]) +def test_pystoi_silence(extended): + rng = np.random.default_rng(seed=0) + batch_size = 4 + fs = 16000 + N = fs * 4 # 4 seconds of random audio + x = np.random.randn(batch_size, N) + silence = np.random.randn(int(N / 7)) + audio = [] + for i in range(batch_size): + t = int(rng.random() * N) + audio.append(np.concatenate([x[i, :t], silence, x[i, t:]])) + audio = np.array(audio) + res = stoi(audio, audio, fs, extended) + print(batch_size, fs, extended, res) + assert res.shape == () if batch_size == 1 else (batch_size,) + + +def test_vectorisation(): + # Initialize batch of data + batch_size = 4 + x = np.random.random((batch_size, 100 * N_FRAME)) + y = np.random.random((batch_size, 100 * N_FRAME)) + res_vec = stoi(x, y, FS) + res_old = np.array([stoi(xi, yi, FS) for xi, yi in zip(x, y)]) + assert res_vec.shape == (batch_size,) + assert np.allclose(res_old, res_vec) From 5b9cb189994da1ad59431374ce7049e0a953ee71 Mon Sep 17 00:00:00 2001 From: giamic Date: Mon, 28 Feb 2022 17:59:38 +0000 Subject: [PATCH 02/19] Add a missing squeeze --- pystoi/stoi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 87aeb35..bbf58a3 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -91,7 +91,7 @@ def stoi(x, y, fs_sig, extended=False): if extended: # TODO: Vectorialise this x_n = np.array([utils.row_col_normalize(xi) for xi in x_segments]) y_n = np.array([utils.row_col_normalize(yi) for yi in y_segments]) - return np.sum(x_n * y_n / N, axis=(1, 2, 3)) / x_n.shape[1] + return np.squeeze(np.sum(x_n * y_n / N, axis=(1, 2, 3)) / x_n.shape[1]) else: # Find normalization constants and normalize From f1eac1a584c415ee4b8bf7f66c6446a7565316a0 Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 16:22:44 +0000 Subject: [PATCH 03/19] Add some comments about shapes --- pystoi/stoi.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index bbf58a3..3c241a4 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -49,7 +49,7 @@ def stoi(x, y, fs_sig, extended=False): raise Exception('x and y should have the same shape,' + 'found {} and {}'.format(x.shape, y.shape)) - if len(x.shape) == 1: # Add a batch size if missing + if len(x.shape) == 1: # Add a batch size if missing, shape (batch, num_samples) x = x[None, :] y = y[None, :] @@ -61,12 +61,12 @@ def stoi(x, y, fs_sig, extended=False): # Remove silent frames x, y = utils.remove_silent_frames(x, y, DYN_RANGE, N_FRAME, int(N_FRAME/2)) - # Take STFT + # Take STFT, shape (batch, num_frames, n_fft//2 + 1) x_spec = utils.stft(x, N_FRAME, NFFT, overlap=2) y_spec = utils.stft(y, N_FRAME, NFFT, overlap=2) # Ensure at least 30 frames for intermediate intelligibility - mask = ~np.all(x_spec == 0, axis=-1) + mask = ~np.all(x_spec == 0, axis=-1) # Check for frames with non-zero values if np.any(np.sum(mask, axis=-1) < N): warnings.warn('Not enough STFT frames to compute intermediate ' 'intelligibility measure after removing silent ' @@ -74,7 +74,7 @@ def stoi(x, y, fs_sig, extended=False): RuntimeWarning) return np.squeeze([1e-5 for _ in range(x)]) - # Apply OB matrix to the spectrograms as in Eq. (1) + # Apply OB matrix to the spectrograms as in Eq. (1), shape (batch, frames, bands) x_tob = np.sqrt(np.matmul(np.square(np.abs(x_spec)), OBM.T)) y_tob = np.sqrt(np.matmul(np.square(np.abs(y_spec)), OBM.T)) From 9d2c2731ab768c65c0405ec648729392ca3beb3a Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 16:23:11 +0000 Subject: [PATCH 04/19] Method extraction --- pystoi/stoi.py | 10 ++-------- pystoi/utils.py | 7 +++++++ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 3c241a4..8a56484 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -79,14 +79,8 @@ def stoi(x, y, fs_sig, extended=False): y_tob = np.sqrt(np.matmul(np.square(np.abs(y_spec)), OBM.T)) # Take segments of x_tob, y_tob, shape (batch, num_segments, seg_size, bands) - x_segments = np.array( - [x_tob[:, m - N : m] for m in range(N, x_tob.shape[1] + 1)] - ).transpose([1, 0, 2, 3]) - x_segments = x_segments * mask[:, N-1:, None, None] - y_segments = np.array( - [y_tob[:, m - N : m] for m in range(N, x_tob.shape[1] + 1)] - ).transpose([1, 0, 2, 3]) - y_segments = y_segments * mask[:, N-1:, None, None] + x_segments = utils._segment_frames(x_tob, mask, N) + y_segments = utils._segment_frames(y_tob, mask, N) if extended: # TODO: Vectorialise this x_n = np.array([utils.row_col_normalize(xi) for xi in x_segments]) diff --git a/pystoi/utils.py b/pystoi/utils.py index 4eb6f15..0ea9149 100644 --- a/pystoi/utils.py +++ b/pystoi/utils.py @@ -134,6 +134,13 @@ def _mask_audio(x, mask): ]) +def _segment_frames(x, mask, seg_size): + # TODO: Vectorise this! + segments = np.array([x[:, m - seg_size: m] for m in range(seg_size, x.shape[1] + 1)]) + segments = segments.transpose([1, 0, 2, 3]) # put back batch in the first dimension + return segments * mask[:, seg_size - 1:, None, None] + + def remove_silent_frames(x, y, dyn_range, framelen, hop): """ Remove silent frames of x and y based on x A frame is excluded if its energy is lower than max(energy) - dyn_range From 4a7a5bbd76928cef757b36cf914ef2588f1c7ba4 Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 16:44:17 +0000 Subject: [PATCH 05/19] Remove a wrong todo --- pystoi/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pystoi/utils.py b/pystoi/utils.py index 0ea9149..b1d579a 100644 --- a/pystoi/utils.py +++ b/pystoi/utils.py @@ -135,7 +135,6 @@ def _mask_audio(x, mask): def _segment_frames(x, mask, seg_size): - # TODO: Vectorise this! segments = np.array([x[:, m - seg_size: m] for m in range(seg_size, x.shape[1] + 1)]) segments = segments.transpose([1, 0, 2, 3]) # put back batch in the first dimension return segments * mask[:, seg_size - 1:, None, None] From 15a8e212ebfa602e39591c6eb91af11a54d31ad2 Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 16:46:51 +0000 Subject: [PATCH 06/19] Stupid refactoring --- pystoi/stoi.py | 4 ++-- pystoi/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 8a56484..8727e41 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -79,8 +79,8 @@ def stoi(x, y, fs_sig, extended=False): y_tob = np.sqrt(np.matmul(np.square(np.abs(y_spec)), OBM.T)) # Take segments of x_tob, y_tob, shape (batch, num_segments, seg_size, bands) - x_segments = utils._segment_frames(x_tob, mask, N) - y_segments = utils._segment_frames(y_tob, mask, N) + x_segments = utils.segment_frames(x_tob, mask, N) + y_segments = utils.segment_frames(y_tob, mask, N) if extended: # TODO: Vectorialise this x_n = np.array([utils.row_col_normalize(xi) for xi in x_segments]) diff --git a/pystoi/utils.py b/pystoi/utils.py index b1d579a..83d9fd0 100644 --- a/pystoi/utils.py +++ b/pystoi/utils.py @@ -134,7 +134,7 @@ def _mask_audio(x, mask): ]) -def _segment_frames(x, mask, seg_size): +def segment_frames(x, mask, seg_size): segments = np.array([x[:, m - seg_size: m] for m in range(seg_size, x.shape[1] + 1)]) segments = segments.transpose([1, 0, 2, 3]) # put back batch in the first dimension return segments * mask[:, seg_size - 1:, None, None] From e1a597d635be05541d650ba8ab673e5f737f26af Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 19:30:06 +0000 Subject: [PATCH 07/19] Vectorise row_col_normalize --- pystoi/stoi.py | 9 +++++---- pystoi/utils.py | 24 ++++++++---------------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 8727e41..6f8d068 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -82,10 +82,11 @@ def stoi(x, y, fs_sig, extended=False): x_segments = utils.segment_frames(x_tob, mask, N) y_segments = utils.segment_frames(y_tob, mask, N) - if extended: # TODO: Vectorialise this - x_n = np.array([utils.row_col_normalize(xi) for xi in x_segments]) - y_n = np.array([utils.row_col_normalize(yi) for yi in y_segments]) - return np.squeeze(np.sum(x_n * y_n / N, axis=(1, 2, 3)) / x_n.shape[1]) + if extended: + x_n = utils.row_col_normalize(x_segments) + y_n = utils.row_col_normalize(y_segments) + d_n = np.mean(np.sum(x_n * y_n, axis=3), axis=2) + return np.squeeze(np.mean(d_n, axis=1)) else: # Find normalization constants and normalize diff --git a/pystoi/utils.py b/pystoi/utils.py index 83d9fd0..ea012d0 100644 --- a/pystoi/utils.py +++ b/pystoi/utils.py @@ -181,25 +181,17 @@ def remove_silent_frames(x, y, dyn_range, framelen, hop): return x_sil, y_sil -def vect_two_norm(x, axis=-1): - """ Returns an array of vectors of norms of the rows of matrices from 3D array """ - return np.sum(np.square(x), axis=axis, keepdims=True) - - def row_col_normalize(x): """ Row and column mean and variance normalize an array of 2D segments """ + # input shape (batch, num_segments, seg_size, bands) # Row mean and variance normalization x_normed = x + EPS * np.random.standard_normal(x.shape) - x_normed -= np.mean(x_normed, axis=-1, keepdims=True) - x_inv = 1. / np.sqrt(vect_two_norm(x_normed)) - x_diags = np.array( - [np.diag(x_inv[i].reshape(-1)) for i in range(x_inv.shape[0])]) - x_normed = np.matmul(x_diags, x_normed) + x_normed -= np.mean(x_normed, axis=-2, keepdims=True) + x_inv = 1. / np.linalg.norm(x_normed, axis=-2) + x_normed = x_normed * x_inv[..., None, :] # Column mean and variance normalization - x_normed += + EPS * np.random.standard_normal(x_normed.shape) - x_normed -= np.mean(x_normed, axis=1, keepdims=True) - x_inv = 1. / np.sqrt(vect_two_norm(x_normed, axis=1)) - x_diags = np.array( - [np.diag(x_inv[i].reshape(-1)) for i in range(x_inv.shape[0])]) - x_normed = np.matmul(x_normed, x_diags) + x_normed += EPS * np.random.standard_normal(x_normed.shape) + x_normed -= np.mean(x_normed, axis=-1, keepdims=True) + x_inv = 1. / np.linalg.norm(x_normed, axis=-1) + x_normed = x_normed * x_inv[..., None] return x_normed From b64dc8ffd4653917ed61e1c8f868199d7755589c Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 19:31:25 +0000 Subject: [PATCH 08/19] Fix axes for sums --- pystoi/stoi.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 6f8d068..5f80c3c 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -90,8 +90,8 @@ def stoi(x, y, fs_sig, extended=False): else: # Find normalization constants and normalize - normalization_consts = np.linalg.norm(x_segments, axis=-1, keepdims=True) / ( - np.linalg.norm(y_segments, axis=-1, keepdims=True) + utils.EPS + normalization_consts = np.linalg.norm(x_segments, axis=-2, keepdims=True) / ( + np.linalg.norm(y_segments, axis=-2, keepdims=True) + utils.EPS ) y_segments_normalized = y_segments * normalization_consts @@ -100,19 +100,15 @@ def stoi(x, y, fs_sig, extended=False): y_primes = np.minimum(y_segments_normalized, x_segments * (1 + clip_value)) # Subtract mean vectors - y_primes = y_primes - np.mean(y_primes, axis=-1, keepdims=True) - x_segments = x_segments - np.mean(x_segments, axis=-1, keepdims=True) + y_primes = y_primes - np.mean(y_primes, axis=-2, keepdims=True) + x_segments = x_segments - np.mean(x_segments, axis=-2, keepdims=True) # Divide by their norms - y_primes /= np.linalg.norm(y_primes, axis=-1, keepdims=True) + utils.EPS - x_segments /= np.linalg.norm(x_segments, axis=-1, keepdims=True) + utils.EPS + y_primes /= np.linalg.norm(y_primes, axis=-2, keepdims=True) + utils.EPS + x_segments /= np.linalg.norm(x_segments, axis=-2, keepdims=True) + utils.EPS # Find a matrix with entries summing to sum of correlations of vectors - correlations_components = np.sum(y_primes * x_segments, axis=-2) - - # J, M as in [1], eq.6 - J = x_segments.shape[2] - M = x_segments.shape[1] + correlations_components = np.sum(y_primes * x_segments, axis=-2, keepdims=True) # Find the mean of all correlations - d = np.sum(correlations_components, axis=(-2, -1)) / (J * M) + d = np.mean(correlations_components, axis=(-3, -1), keepdims=True) return np.squeeze(d) From 39efe60ac63455f4dd617538ad4006901c437d53 Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 19:47:50 +0000 Subject: [PATCH 09/19] More understandable shape handling --- pystoi/stoi.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 5f80c3c..6d4232f 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -78,10 +78,11 @@ def stoi(x, y, fs_sig, extended=False): x_tob = np.sqrt(np.matmul(np.square(np.abs(x_spec)), OBM.T)) y_tob = np.sqrt(np.matmul(np.square(np.abs(y_spec)), OBM.T)) - # Take segments of x_tob, y_tob, shape (batch, num_segments, seg_size, bands) + # Take segments of x_tob, y_tob x_segments = utils.segment_frames(x_tob, mask, N) y_segments = utils.segment_frames(y_tob, mask, N) + # From now on the shape is always (batch, num_segments, seg_size, bands) if extended: x_n = utils.row_col_normalize(x_segments) y_n = utils.row_col_normalize(y_segments) @@ -90,8 +91,8 @@ def stoi(x, y, fs_sig, extended=False): else: # Find normalization constants and normalize - normalization_consts = np.linalg.norm(x_segments, axis=-2, keepdims=True) / ( - np.linalg.norm(y_segments, axis=-2, keepdims=True) + utils.EPS + normalization_consts = np.linalg.norm(x_segments, axis=2, keepdims=True) / ( + np.linalg.norm(y_segments, axis=2, keepdims=True) + utils.EPS ) y_segments_normalized = y_segments * normalization_consts @@ -100,15 +101,15 @@ def stoi(x, y, fs_sig, extended=False): y_primes = np.minimum(y_segments_normalized, x_segments * (1 + clip_value)) # Subtract mean vectors - y_primes = y_primes - np.mean(y_primes, axis=-2, keepdims=True) - x_segments = x_segments - np.mean(x_segments, axis=-2, keepdims=True) + y_primes = y_primes - np.mean(y_primes, axis=2, keepdims=True) + x_segments = x_segments - np.mean(x_segments, axis=2, keepdims=True) # Divide by their norms - y_primes /= np.linalg.norm(y_primes, axis=-2, keepdims=True) + utils.EPS - x_segments /= np.linalg.norm(x_segments, axis=-2, keepdims=True) + utils.EPS + y_primes /= np.linalg.norm(y_primes, axis=2, keepdims=True) + utils.EPS + x_segments /= np.linalg.norm(x_segments, axis=2, keepdims=True) + utils.EPS # Find a matrix with entries summing to sum of correlations of vectors correlations_components = np.sum(y_primes * x_segments, axis=-2, keepdims=True) # Find the mean of all correlations - d = np.mean(correlations_components, axis=(-3, -1), keepdims=True) + d = np.mean(correlations_components, axis=(1, 3), keepdims=True) return np.squeeze(d) From dd804f75f20a330ab80f1e5d4328f1ba4acb9c26 Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 19:48:16 +0000 Subject: [PATCH 10/19] Fix the value for stoi with silence --- pystoi/stoi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 6d4232f..3efcb00 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -112,4 +112,5 @@ def stoi(x, y, fs_sig, extended=False): # Find the mean of all correlations d = np.mean(correlations_components, axis=(1, 3), keepdims=True) + d *= np.mean(mask, axis=1, keepdims=True)[..., None, None] return np.squeeze(d) From f47fd0fea34bc6dad1f066ffe7621fe4122e972b Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 19:50:49 +0000 Subject: [PATCH 11/19] Revert cosmetic change --- pystoi/stoi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 3efcb00..0883bba 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -91,9 +91,9 @@ def stoi(x, y, fs_sig, extended=False): else: # Find normalization constants and normalize - normalization_consts = np.linalg.norm(x_segments, axis=2, keepdims=True) / ( - np.linalg.norm(y_segments, axis=2, keepdims=True) + utils.EPS - ) + normalization_consts = ( + np.linalg.norm(x_segments, axis=2, keepdims=True) / + (np.linalg.norm(y_segments, axis=2, keepdims=True) + utils.EPS)) y_segments_normalized = y_segments * normalization_consts # Clip as described in [1] From 2473b98728809cd0c10a753cd2ebbfbccc89d7eb Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 19:56:07 +0000 Subject: [PATCH 12/19] Revert cosmetic change --- pystoi/stoi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 0883bba..48b5f5b 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -105,8 +105,8 @@ def stoi(x, y, fs_sig, extended=False): x_segments = x_segments - np.mean(x_segments, axis=2, keepdims=True) # Divide by their norms - y_primes /= np.linalg.norm(y_primes, axis=2, keepdims=True) + utils.EPS - x_segments /= np.linalg.norm(x_segments, axis=2, keepdims=True) + utils.EPS + y_primes /= (np.linalg.norm(y_primes, axis=2, keepdims=True) + utils.EPS) + x_segments /= (np.linalg.norm(x_segments, axis=2, keepdims=True) + utils.EPS) # Find a matrix with entries summing to sum of correlations of vectors correlations_components = np.sum(y_primes * x_segments, axis=-2, keepdims=True) From c66365b8453cab5ba2d27b637f2d0b9f04dcde42 Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 19:59:52 +0000 Subject: [PATCH 13/19] Fix docs --- pystoi/stoi.py | 2 ++ pystoi/utils.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 48b5f5b..521f30e 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -112,5 +112,7 @@ def stoi(x, y, fs_sig, extended=False): # Find the mean of all correlations d = np.mean(correlations_components, axis=(1, 3), keepdims=True) + # Exclude the contribution of silent frames from the calculation of the mean d *= np.mean(mask, axis=1, keepdims=True)[..., None, None] + # Return just a float if stoi was called with a single waveform return np.squeeze(d) diff --git a/pystoi/utils.py b/pystoi/utils.py index ea012d0..3ae3ae5 100644 --- a/pystoi/utils.py +++ b/pystoi/utils.py @@ -128,18 +128,18 @@ def _overlap_and_add(x_frames, hop): return signal -def _mask_audio(x, mask): - return np.array([ - np.pad(xi[mi], ((0, len(xi) - np.sum(mi)), (0, 0))) for xi, mi in zip(x, mask) - ]) - - def segment_frames(x, mask, seg_size): segments = np.array([x[:, m - seg_size: m] for m in range(seg_size, x.shape[1] + 1)]) segments = segments.transpose([1, 0, 2, 3]) # put back batch in the first dimension return segments * mask[:, seg_size - 1:, None, None] +def _mask_audio(x, mask): + return np.array([ + np.pad(xi[mi], ((0, len(xi) - np.sum(mi)), (0, 0))) for xi, mi in zip(x, mask) + ]) + + def remove_silent_frames(x, y, dyn_range, framelen, hop): """ Remove silent frames of x and y based on x A frame is excluded if its energy is lower than max(energy) - dyn_range @@ -151,8 +151,8 @@ def remove_silent_frames(x, y, dyn_range, framelen, hop): framelen : Window size for energy evaluation hop : Hop size for energy evaluation # Returns : - x without the silent frames - y without the silent frames (aligned to x) + x without the silent frames, zero-padded to the original length + y without the silent frames, zero-padded to the original length (aligned to x) """ # Compute Mask w = np.hanning(framelen + 2)[1:-1] @@ -171,7 +171,7 @@ def remove_silent_frames(x, y, dyn_range, framelen, hop): # with respect to maximum clean speech energy frame mask = (np.max(x_energies) - dyn_range - x_energies) < 0 - # Remove silent frames by masking + # Remove silent frames and pad with zeroes x_frames = _mask_audio(x_frames, mask) y_frames = _mask_audio(y_frames, mask) From da2967be8df16dd5c9946ae4493b8d5213c06bc4 Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 20:11:54 +0000 Subject: [PATCH 14/19] Fix output shapes if called with a single waveform --- pystoi/stoi.py | 15 +++++++++------ tests/test_overlap_and_add.py | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 521f30e..78423bc 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -19,7 +19,8 @@ def stoi(x, y, fs_sig, extended=False): Computes the STOI (See [1][2]) of a denoised signal compared to a clean signal, The output is expected to have a monotonic relation with the subjective speech-intelligibility, where a higher score denotes better - speech intelligibility. + speech intelligibility. Accepts either a single waveform or a batch of + waveforms stored in a numpy array. # Arguments x (np.ndarray): clean original speech @@ -28,8 +29,9 @@ def stoi(x, y, fs_sig, extended=False): extended (bool): Boolean, whether to use the extended STOI described in [3] # Returns - float: Short time objective intelligibility measure between clean and - denoised speech + float or np.ndarray: Short time objective intelligibility measure between + clean and denoised speech. Returns float if called with a single waveform, + np.ndarray if called with a batch of waveforms # Raises AssertionError : if x and y have different lengths @@ -49,6 +51,7 @@ def stoi(x, y, fs_sig, extended=False): raise Exception('x and y should have the same shape,' + 'found {} and {}'.format(x.shape, y.shape)) + out_shape = x.shape[:-1] if len(x.shape) == 1: # Add a batch size if missing, shape (batch, num_samples) x = x[None, :] y = y[None, :] @@ -72,7 +75,7 @@ def stoi(x, y, fs_sig, extended=False): 'intelligibility measure after removing silent ' 'frames. Returning 1e-5. Please check you wav files', RuntimeWarning) - return np.squeeze([1e-5 for _ in range(x)]) + return np.array([1e-5 for _ in range(x)]).reshape(out_shape) # Apply OB matrix to the spectrograms as in Eq. (1), shape (batch, frames, bands) x_tob = np.sqrt(np.matmul(np.square(np.abs(x_spec)), OBM.T)) @@ -87,7 +90,7 @@ def stoi(x, y, fs_sig, extended=False): x_n = utils.row_col_normalize(x_segments) y_n = utils.row_col_normalize(y_segments) d_n = np.mean(np.sum(x_n * y_n, axis=3), axis=2) - return np.squeeze(np.mean(d_n, axis=1)) + return np.mean(d_n, axis=1).reshape(out_shape) else: # Find normalization constants and normalize @@ -115,4 +118,4 @@ def stoi(x, y, fs_sig, extended=False): # Exclude the contribution of silent frames from the calculation of the mean d *= np.mean(mask, axis=1, keepdims=True)[..., None, None] # Return just a float if stoi was called with a single waveform - return np.squeeze(d) + return np.squeeze(d).reshape(out_shape) diff --git a/tests/test_overlap_and_add.py b/tests/test_overlap_and_add.py index dfa7670..437f315 100644 --- a/tests/test_overlap_and_add.py +++ b/tests/test_overlap_and_add.py @@ -36,7 +36,7 @@ def test_pystoi_run(batch_size, fs, extended): x = np.random.randn(batch_size, N) res = stoi(x, x, fs, extended) print(batch_size, fs, extended, res) - assert res.shape == () if batch_size == 1 else (batch_size,) + assert res.shape == x.shape[:-1] @pytest.mark.parametrize("extended", [True, False]) From d95b27da627f2f027d0dd8d5e8b0cab73a8dee73 Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 21:04:27 +0000 Subject: [PATCH 15/19] Remove two expensive (and maybe useless?) random sampling --- pystoi/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pystoi/utils.py b/pystoi/utils.py index 3ae3ae5..a0940fa 100644 --- a/pystoi/utils.py +++ b/pystoi/utils.py @@ -185,12 +185,10 @@ def row_col_normalize(x): """ Row and column mean and variance normalize an array of 2D segments """ # input shape (batch, num_segments, seg_size, bands) # Row mean and variance normalization - x_normed = x + EPS * np.random.standard_normal(x.shape) - x_normed -= np.mean(x_normed, axis=-2, keepdims=True) + x_normed = x - np.mean(x, axis=-2, keepdims=True) x_inv = 1. / np.linalg.norm(x_normed, axis=-2) x_normed = x_normed * x_inv[..., None, :] # Column mean and variance normalization - x_normed += EPS * np.random.standard_normal(x_normed.shape) x_normed -= np.mean(x_normed, axis=-1, keepdims=True) x_inv = 1. / np.linalg.norm(x_normed, axis=-1) x_normed = x_normed * x_inv[..., None] From e7a4026e99622268209e938c2f97495d9717791d Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 21:05:12 +0000 Subject: [PATCH 16/19] Tiny change --- pystoi/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pystoi/utils.py b/pystoi/utils.py index a0940fa..0088cc2 100644 --- a/pystoi/utils.py +++ b/pystoi/utils.py @@ -186,10 +186,10 @@ def row_col_normalize(x): # input shape (batch, num_segments, seg_size, bands) # Row mean and variance normalization x_normed = x - np.mean(x, axis=-2, keepdims=True) - x_inv = 1. / np.linalg.norm(x_normed, axis=-2) - x_normed = x_normed * x_inv[..., None, :] + x_inv = 1. / np.linalg.norm(x_normed, axis=-2, keepdims=True) + x_normed = x_normed * x_inv # Column mean and variance normalization x_normed -= np.mean(x_normed, axis=-1, keepdims=True) - x_inv = 1. / np.linalg.norm(x_normed, axis=-1) - x_normed = x_normed * x_inv[..., None] + x_inv = 1. / np.linalg.norm(x_normed, axis=-1, keepdims=True) + x_normed = x_normed * x_inv return x_normed From 0f214e4510c02fba58c69c87d654c8419da2ec09 Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 21:10:27 +0000 Subject: [PATCH 17/19] Fix shapes in tests --- tests/test_overlap_and_add.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_overlap_and_add.py b/tests/test_overlap_and_add.py index 437f315..8583c52 100644 --- a/tests/test_overlap_and_add.py +++ b/tests/test_overlap_and_add.py @@ -54,7 +54,7 @@ def test_pystoi_silence(extended): audio = np.array(audio) res = stoi(audio, audio, fs, extended) print(batch_size, fs, extended, res) - assert res.shape == () if batch_size == 1 else (batch_size,) + assert res.shape == x.shape[:-1] def test_vectorisation(): @@ -62,7 +62,7 @@ def test_vectorisation(): batch_size = 4 x = np.random.random((batch_size, 100 * N_FRAME)) y = np.random.random((batch_size, 100 * N_FRAME)) + res = np.array([stoi(xi, yi, FS) for xi, yi in zip(x, y)]) res_vec = stoi(x, y, FS) - res_old = np.array([stoi(xi, yi, FS) for xi, yi in zip(x, y)]) - assert res_vec.shape == (batch_size,) - assert np.allclose(res_old, res_vec) + assert res_vec.shape == x.shape[:-1] + assert np.allclose(res, res_vec) From 959df2710bef6a06c93f187005a8667ef8d9183e Mon Sep 17 00:00:00 2001 From: giamic Date: Tue, 1 Mar 2022 21:10:45 +0000 Subject: [PATCH 18/19] Add test with bugfix --- pystoi/stoi.py | 2 +- tests/test_overlap_and_add.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pystoi/stoi.py b/pystoi/stoi.py index 78423bc..920871b 100644 --- a/pystoi/stoi.py +++ b/pystoi/stoi.py @@ -75,7 +75,7 @@ def stoi(x, y, fs_sig, extended=False): 'intelligibility measure after removing silent ' 'frames. Returning 1e-5. Please check you wav files', RuntimeWarning) - return np.array([1e-5 for _ in range(x)]).reshape(out_shape) + return np.array([1e-5 for _ in range(x.shape[0])]).reshape(out_shape) # Apply OB matrix to the spectrograms as in Eq. (1), shape (batch, frames, bands) x_tob = np.sqrt(np.matmul(np.square(np.abs(x_spec)), OBM.T)) diff --git a/tests/test_overlap_and_add.py b/tests/test_overlap_and_add.py index 8583c52..b86bd20 100644 --- a/tests/test_overlap_and_add.py +++ b/tests/test_overlap_and_add.py @@ -39,6 +39,17 @@ def test_pystoi_run(batch_size, fs, extended): assert res.shape == x.shape[:-1] +@pytest.mark.parametrize("extended", [True, False]) +@pytest.mark.parametrize("batch_size", [1, 4]) +def test_pystoi_complete_silence(batch_size, extended): + fs = 16000 + N = fs * 4 # 4 seconds of random audio + x = np.zeros((batch_size, N)) + res = stoi(x, x, fs, extended) + print(batch_size, fs, extended, res) + assert res.shape == x.shape[:-1] + + @pytest.mark.parametrize("extended", [True, False]) def test_pystoi_silence(extended): rng = np.random.default_rng(seed=0) From 11eed46a31db7eaf89801b356eb7fbaca8aa192e Mon Sep 17 00:00:00 2001 From: giamic Date: Wed, 2 Mar 2022 09:33:14 +0000 Subject: [PATCH 19/19] Minor rewriting --- pystoi/utils.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pystoi/utils.py b/pystoi/utils.py index 0088cc2..abc7660 100644 --- a/pystoi/utils.py +++ b/pystoi/utils.py @@ -184,12 +184,10 @@ def remove_silent_frames(x, y, dyn_range, framelen, hop): def row_col_normalize(x): """ Row and column mean and variance normalize an array of 2D segments """ # input shape (batch, num_segments, seg_size, bands) - # Row mean and variance normalization + # Row mean and variance normalization -- axis: seg_size x_normed = x - np.mean(x, axis=-2, keepdims=True) - x_inv = 1. / np.linalg.norm(x_normed, axis=-2, keepdims=True) - x_normed = x_normed * x_inv - # Column mean and variance normalization + x_normed = x_normed / (np.linalg.norm(x_normed, axis=-2, keepdims=True) + EPS) + # Column mean and variance normalization -- axis: bands x_normed -= np.mean(x_normed, axis=-1, keepdims=True) - x_inv = 1. / np.linalg.norm(x_normed, axis=-1, keepdims=True) - x_normed = x_normed * x_inv + x_normed = x_normed / (np.linalg.norm(x_normed, axis=-1, keepdims=True) + EPS) return x_normed