From fc2ddb7bb185455cb9156304c1bf6d60cc6c21c2 Mon Sep 17 00:00:00 2001
From: Sanna Wager
Date: Wed, 29 Aug 2018 21:54:32 -0400
Subject: [PATCH 1/4] Added voiced versus unvoiced frame prediction using HMM model, similar to pYIN algorithm

---
 crepe/__init__.py |  2 +-
 crepe/cli.py      | 14 +++++++++++-
 crepe/core.py     | 58 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/crepe/__init__.py b/crepe/__init__.py
index 52eacea..f4d969c 100755
--- a/crepe/__init__.py
+++ b/crepe/__init__.py
@@ -1,2 +1,2 @@
 from .version import version as __version__
-from .core import get_activation, predict, process_file
+from .core import get_activation, predict, predict_voicing, process_file
diff --git a/crepe/cli.py b/crepe/cli.py
index aaa96de..e56fd06 100644
--- a/crepe/cli.py
+++ b/crepe/cli.py
@@ -10,7 +10,8 @@
 
 def run(filename, output=None, model_capacity='full', viterbi=False,
         save_activation=False, save_plot=False, plot_voicing=False,
-        no_centering=False, step_size=10, verbose=True):
+        apply_voicing=False, no_centering=False, step_size=10,
+        verbose=True):
     """
     Collect the WAV files to process and run the model
 
@@ -36,6 +37,10 @@ def run(filename, output=None, model_capacity='full', viterbi=False,
         Include a visual representation of the voicing activity detection
         in the plot of the output activation matrix. False by default, only
         relevant if save_plot is True.
+    apply_voicing : bool
+        Apply the Viterbi algorithm to predict for every frame whether it
+        was voiced or unvoiced. Zero out silent frames and save the
+        resulting frequency array to a .npy file.
     no_centering : bool
         Don't pad the signal, meaning frames will begin at their timestamp
         instead of being centered around their timestamp (which is the
@@ -81,6 +86,7 @@ def run(filename, output=None, model_capacity='full', viterbi=False,
                      save_activation=save_activation,
                      save_plot=save_plot,
                      plot_voicing=plot_voicing,
+                     apply_voicing=apply_voicing,
                      step_size=step_size,
                      verbose=verbose)
 
@@ -143,6 +149,11 @@ def main():
     parser.add_argument('--plot-voicing', '-v', action='store_true',
                         help='Plot the voicing prediction on top of the '
                              'output activation matrix plot')
+    parser.add_argument('--apply-voicing', '-P', action='store_true',
+                        help='Apply the Viterbi algorithm to predict for '
+                             'every frame whether it was voiced or unvoiced. '
+                             'Zero out silent frames and save the resulting '
+                             'frequency array to a .npy file.')
     parser.add_argument('--no-centering', '-n', action='store_true',
                         help="Don't pad the signal, meaning frames will begin "
                              "at their timestamp instead of being centered "
@@ -168,6 +179,7 @@ def main():
         save_activation=args.save_activation,
         save_plot=args.save_plot,
         plot_voicing=args.plot_voicing,
+        apply_voicing=args.apply_voicing,
         no_centering=args.no_centering,
         step_size=args.step_size,
         verbose=not args.quiet)
diff --git a/crepe/core.py b/crepe/core.py
index fc17c9c..266772d 100644
--- a/crepe/core.py
+++ b/crepe/core.py
@@ -152,6 +152,45 @@ def to_viterbi_cents(salience):
                      range(len(observations))])
 
 
+def predict_voicing(confidence):
+    """
+    Find the Viterbi path for voiced versus unvoiced frames.
+
+    Parameters
+    ----------
+    confidence : np.ndarray [shape=(N,)]
+        voicing confidence array, i.e. the confidence in the presence of
+        a pitch
+
+    Returns
+    -------
+    voicing_states : np.ndarray [shape=(N,)]
+        HMM predictions for each frame's state, 0 if unvoiced, 1 if
+        voiced
+    """
+    from hmmlearn import hmm
+
+    # uniform prior on the voicing confidence
+    starting = np.array([0.5, 0.5])
+
+    # transition probabilities inducing continuous voicing state
+    transition = np.array([[0.99, 0.01], [0.01, 0.99]])
+
+    # mean and variance for unvoiced and voiced states
+    means = np.array([[0.0], [1.0]])
+    vars = np.array([[0.25], [0.25]])
+
+    # fix the model parameters because we are not optimizing the model
+    model = hmm.GaussianHMM(n_components=2)
+    model.startprob_, model.covars_, model.transmat_, model.means_, model.n_features = \
+        starting, vars, transition, means, 1
+
+    # find the Viterbi path
+    voicing_states = model.predict(confidence.reshape(-1, 1), [len(confidence)])
+
+    return np.array(voicing_states)
+
+
 def get_activation(audio, sr, model_capacity='full', center=True, step_size=10,
                    verbose=1):
     """
@@ -271,7 +310,8 @@ def predict(audio, sr, model_capacity='full',
 
 def process_file(file, output=None, model_capacity='full', viterbi=False,
                  center=True, save_activation=False, save_plot=False,
-                 plot_voicing=False, step_size=10, verbose=True):
+                 plot_voicing=False, apply_voicing=False, step_size=10,
+                 verbose=True):
     """
     Use the input model to perform pitch estimation on the input file.
 
@@ -300,6 +340,10 @@ def process_file(file, output=None, model_capacity='full', viterbi=False,
         Include a visual representation of the voicing activity detection
         in the plot of the output activation matrix. False by default, only
         relevant if save_plot is True.
+    apply_voicing : bool
+        Apply the Viterbi algorithm to predict for every frame whether it
+        was voiced or unvoiced. Zero out silent frames and save the
+        resulting frequency array to a .npy file.
     step_size : int
         The step size in milliseconds for running pitch estimation.
     verbose : bool
@@ -323,6 +367,18 @@ def process_file(file, output=None, model_capacity='full', viterbi=False,
         step_size=step_size,
         verbose=1 * verbose)
 
+    # predict voiced and unvoiced states, zero out silent frames, and
+    # save the resulting frequency array to a .npy file
+    if apply_voicing:
+        is_voiced = predict_voicing(confidence)
+        voiced_frequency = frequency * is_voiced
+        voiced_frequency_path = output_path(file, ".voiced_frequency.npy",
+                                            output)
+        np.save(voiced_frequency_path, voiced_frequency)
+        if verbose:
+            print("CREPE: Saved the voiced frequency array at {}".format(
+                voiced_frequency_path))
+
     # write prediction as TSV
     f0_file = output_path(file, ".f0.csv", output)
     f0_data = np.vstack([time, frequency, confidence]).transpose()

From a1c52546507be76deaedf64408fcef28785a297a Mon Sep 17 00:00:00 2001
From: sannawag
Date: Wed, 29 Aug 2018 22:03:37 -0400
Subject: [PATCH 2/4] Update README.md

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index fcd98ee..032a799 100644
--- a/README.md
+++ b/README.md
@@ -125,6 +125,11 @@ from scipy.io import wavfile
 sr, audio = wavfile.read('/path/to/audiofile.wav')
 time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)
 ```
+The Viterbi algorithm can also be used to predict which frames are unvoiced. The following commands will set the frequency of such frames to zero:
+```python
+is_voiced = crepe.predict_voicing(confidence)
+frequency *= is_voiced
+```
 
 ## Argmax-local Weighted Averaging
 

From a29c8c66e115522d99f50e58dc22a0c34833816b Mon Sep 17 00:00:00 2001
From: Sanna Wager
Date: Thu, 30 Aug 2018 09:46:04 -0400
Subject: [PATCH 3/4] changed variable name

---
 crepe/core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crepe/core.py b/crepe/core.py
index 266772d..b61f004 100644
--- a/crepe/core.py
+++ b/crepe/core.py
@@ -178,12 +178,12 @@ def predict_voicing(confidence):
 
     # mean and variance for unvoiced and voiced states
     means = np.array([[0.0], [1.0]])
-    vars = np.array([[0.25], [0.25]])
+    variances = np.array([[0.25], [0.25]])
 
     # fix the model parameters because we are not optimizing the model
     model = hmm.GaussianHMM(n_components=2)
     model.startprob_, model.covars_, model.transmat_, model.means_, model.n_features = \
-        starting, vars, transition, means, 1
+        starting, variances, transition, means, 1
 
     # find the Viterbi path
     voicing_states = model.predict(confidence.reshape(-1, 1), [len(confidence)])

From adee92c11940f927e0e85ad3af5f6cb6c07c1867 Mon Sep 17 00:00:00 2001
From: sannawag
Date: Wed, 29 Aug 2018 22:03:37 -0400
Subject: [PATCH 4/4] Update README.md

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index fcd98ee..032a799 100644
--- a/README.md
+++ b/README.md
@@ -125,6 +125,11 @@ from scipy.io import wavfile
 sr, audio = wavfile.read('/path/to/audiofile.wav')
 time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)
 ```
+The Viterbi algorithm can also be used to predict which frames are unvoiced. The following commands will set the frequency of such frames to zero:
+```python
+is_voiced = crepe.predict_voicing(confidence)
+frequency *= is_voiced
+```
 
 ## Argmax-local Weighted Averaging
 
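Taken together, the series exposes a `crepe.predict_voicing` function in Python and an `--apply-voicing` (`-P`) flag on the command line. The snippet below is a minimal end-to-end sketch of the new workflow, assuming the patches above are applied; the audio path and the output filename are placeholders, and the first two lines repeat the existing README example.

```python
import numpy as np
from scipy.io import wavfile

import crepe

# run the standard CREPE pitch tracker, exactly as in the README example above
sr, audio = wavfile.read('/path/to/audiofile.wav')
time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)

# decode a 0/1 voicing state per frame from the confidence curve,
# then zero out the frequency of frames predicted as unvoiced
is_voiced = crepe.predict_voicing(confidence)
voiced_frequency = frequency * is_voiced

# persist the masked track, mirroring what the new --apply-voicing flag writes
np.save('audiofile.voiced_frequency.npy', voiced_frequency)
```

The CLI path produces the same array: running `crepe` with `--apply-voicing` saves it as `<input>.voiced_frequency.npy`, placed next to the input file unless an output directory is specified.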