From ae82619d6e42111462d79157e48ee1e6c471e04a Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Thu, 16 Jan 2025 13:25:40 -0500 Subject: [PATCH 01/16] added hubert processor + fairseq requirement --- environment.yml | 1 + shennong/processor/__init__.py | 1 + shennong/processor/hubert.py | 191 +++++++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+) create mode 100644 shennong/processor/hubert.py diff --git a/environment.yml b/environment.yml index 7ce6ea2..bd317ee 100644 --- a/environment.yml +++ b/environment.yml @@ -27,3 +27,4 @@ dependencies: - pip: - json-tricks==3.15.* - sox # pysox + - fairseq diff --git a/shennong/processor/__init__.py b/shennong/processor/__init__.py index 2511026..0f1af9f 100644 --- a/shennong/processor/__init__.py +++ b/shennong/processor/__init__.py @@ -3,6 +3,7 @@ from shennong.processor.bottleneck import BottleneckProcessor from shennong.processor.energy import EnergyProcessor from shennong.processor.filterbank import FilterbankProcessor +from shennong.processor.hubert import HubertProcessor from shennong.processor.mfcc import MfccProcessor from shennong.processor.onehot import FramedOneHotProcessor, OneHotProcessor from shennong.processor.pitch_crepe import ( diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py new file mode 100644 index 0000000..d5f63b1 --- /dev/null +++ b/shennong/processor/hubert.py @@ -0,0 +1,191 @@ +"""Extraction of HuBERT features from audio signals + + :class:`~shennong.audio.Audio` ---> HubertProcessor \ + ---> :class:`~shennong.features.Features` + +Examples +-------- + +>>> from shennong.audio import Audio +>>> from shennong.processor.hubert import HubertProcessor +>>> audio = Audio.load('./test/data/test.wav') +>>> processor = HubertProcessor(layer=3, model_path='/home/exp/mhubert-147') + +Compute the HuBERT features. the output is an +instance of :class:`~shennong.features.Features`: + +>>> hubert = processor.process(audio) +>>> type(hubert) + + +References +---------- + +.. [HuBERT] https://arxiv.org/abs/2106.07447 + +""" + +import torch +import fairseq + +import numpy as np + +from shennong import Features +from shennong.processor.base import FeaturesProcessor + +class HubertProcessor(FeaturesProcessor): + """HuBERT features from a pre-trained neural network + + Parameters + ---------- + layer : 1, 2, ..., 12 + The layer to extract features from + + model_path : The path to the pre-trained HuBERT model + + """ + + _LAYERS = tuple(i for i in range(1, 13)) + + _SEED = 3939 + + def __init__(self, layer=None, model_path=None): + super().__init__() + self.layer = layer + self.model_path = model_path + + @property + def name(self): + return 'hubert' + + @property + def layer(self): + """Layer to extract features from""" + return self._layer + + @layer.setter + def layer(self, value): + if int(value) not in self._LAYERS: + raise ValueError(f"Layer {value} does not exist in this model") + elif not value: + raise ValueError("No layers selected") + else: + self._layer = int(value) + + @property + def model_path(self): + """The path to the pretrained HuBERT model""" + return self._model_path + + @model_path.setter + def model_path(self, value): + self._model_path = str(value) + + @property + def ndims(self): + """The dimension of extracted frames + + Cannot be tuned because the underlying neural networks are + trained with this parameter. + + """ + return 768 + + @property + def sample_rate(self): + """Processing sample frequency in Hertz + + Cannot be tuned because the underlying neural networks are + trained with this parameter. + + """ + return 16000 + + @property + def frame_length(self): + """The length of extracted frames (in seconds) + + Cannot be tuned because the underlying neural networks are + trained with this parameter. + + """ + return 0.02 + + @property + def frame_shift(self): + """The time shift between two consecutive frames (in seconds) + + Cannot be tuned because the underlying neural networks are + trained with this parameter. + + """ + return 0.02 + + def process(self, signal): + """Computes HuBERT features with the specified options + + Use a pre-trained neural network to extract HuBERT + features. Features have a frame shift of 20 ms and frame + length of 20 ms. + + Parameters + ---------- + signal : Audio, shape = [nsamples, 1] + The input audio signal to compute the features on, must be + mono. The signal is up/down-sampled to 16 kHz during + processing. + + Returns + ------- + features : Features, shape = [nframes, 768] + The computed HuBERT features will have as many rows as + there are frames (depends on the `signal` duration, expect + about 50 frames per second), each frame with 768 + dimensions. + + Raises + ------ + ValueError + If the input `signal` has more than one channel (i.e. is + not mono). If `sample_rate` != `signal.sample_rate`. + + """ + + torch.manual_seed(self._SEED) + np.random.seed(self._SEED) + + # ensure the signal is correct + if signal.nchannels != 1: + raise ValueError( + 'signal must have one dimension, but it has {}' + .format(signal.nchannels)) + + # force resampling to 16 kHz and 32 bit floats + need_resample = ( + signal.sample_rate != 16000 or + signal.dtype is not np.dtype(np.float32)) + + if need_resample: + self.log.debug( + 'resampling audio from %dHz@%db to %dHz@%db', + signal.sample_rate, signal.dtype.itemsize * 8, 16000, 32) + signal = signal.resample(16000).astype(np.float32) + + signal = torch.unsqueeze(torch.from_numpy(signal.data), 0) + + model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0] + model.eval() + + out_dict = model(signal, features_only=True, mask=False, output_layer=self.layer) + + data = out_dict["features"][0].squeeze(1).detach().numpy() + + del out_dict + + # compute the timestamps for each output frame + times = np.vstack(( + np.arange(data.shape[0]) * self.frame_shift, + np.arange(data.shape[0]) * self.frame_shift + self.frame_length)).T + + return Features( + data, times, properties=self.get_properties()) \ No newline at end of file From 7169cf64daa170e6cea43319051fa0090f66be89 Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Thu, 16 Jan 2025 15:33:38 -0500 Subject: [PATCH 02/16] moved choice of layer into processing function --- shennong/processor/hubert.py | 43 +++++++++++++++--------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index d5f63b1..0550694 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -9,12 +9,12 @@ >>> from shennong.audio import Audio >>> from shennong.processor.hubert import HubertProcessor >>> audio = Audio.load('./test/data/test.wav') ->>> processor = HubertProcessor(layer=3, model_path='/home/exp/mhubert-147') +>>> processor = HubertProcessor(model_path='/home/exp/mhubert-147') Compute the HuBERT features. the output is an instance of :class:`~shennong.features.Features`: ->>> hubert = processor.process(audio) +>>> hubert = processor.process(audio, layer=3) >>> type(hubert) @@ -38,39 +38,19 @@ class HubertProcessor(FeaturesProcessor): Parameters ---------- - layer : 1, 2, ..., 12 - The layer to extract features from - model_path : The path to the pre-trained HuBERT model """ - _LAYERS = tuple(i for i in range(1, 13)) - _SEED = 3939 - def __init__(self, layer=None, model_path=None): + def __init__(self, model_path=None): super().__init__() - self.layer = layer self.model_path = model_path @property def name(self): return 'hubert' - - @property - def layer(self): - """Layer to extract features from""" - return self._layer - - @layer.setter - def layer(self, value): - if int(value) not in self._LAYERS: - raise ValueError(f"Layer {value} does not exist in this model") - elif not value: - raise ValueError("No layers selected") - else: - self._layer = int(value) @property def model_path(self): @@ -120,8 +100,14 @@ def frame_shift(self): """ return 0.02 + + def _check_layer(self, value, model): + if value not in range(len(model.encoder.layers) + 1): + raise ValueError(f"Layer {value} does not exist in this model") + elif not value: + raise ValueError("No layers selected") - def process(self, signal): + def process(self, signal, layer): """Computes HuBERT features with the specified options Use a pre-trained neural network to extract HuBERT @@ -135,6 +121,9 @@ def process(self, signal): mono. The signal is up/down-sampled to 16 kHz during processing. + layer : int + The layer to extract features from + Returns ------- features : Features, shape = [nframes, 768] @@ -149,6 +138,8 @@ def process(self, signal): If the input `signal` has more than one channel (i.e. is not mono). If `sample_rate` != `signal.sample_rate`. + ValueError + If the selected layer does not exist in the given model. """ torch.manual_seed(self._SEED) @@ -176,7 +167,9 @@ def process(self, signal): model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0] model.eval() - out_dict = model(signal, features_only=True, mask=False, output_layer=self.layer) + self._check_layer(layer, model) + + out_dict = model(signal, features_only=True, mask=False, output_layer=layer) data = out_dict["features"][0].squeeze(1).detach().numpy() From a34a13c65eb8d5ce48946ca28351881ad136ee09 Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Thu, 16 Jan 2025 16:24:14 -0500 Subject: [PATCH 03/16] rearranging --- shennong/processor/hubert.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index 0550694..35705b3 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -46,7 +46,12 @@ class HubertProcessor(FeaturesProcessor): def __init__(self, model_path=None): super().__init__() + torch.manual_seed(self._SEED) + np.random.seed(self._SEED) + self.model_path = model_path + self.model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0] + @property def name(self): @@ -142,8 +147,9 @@ def process(self, signal, layer): If the selected layer does not exist in the given model. """ - torch.manual_seed(self._SEED) - np.random.seed(self._SEED) + self._check_layer(layer, self.model) + + self.model.eval() # ensure the signal is correct if signal.nchannels != 1: @@ -164,12 +170,7 @@ def process(self, signal, layer): signal = torch.unsqueeze(torch.from_numpy(signal.data), 0) - model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0] - model.eval() - - self._check_layer(layer, model) - - out_dict = model(signal, features_only=True, mask=False, output_layer=layer) + out_dict = self.model(signal, features_only=True, mask=False, output_layer=layer) data = out_dict["features"][0].squeeze(1).detach().numpy() From 3f2b59593e50217c4139ccf46834e523b4b7f73d Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Wed, 29 Jan 2025 14:55:26 -0500 Subject: [PATCH 04/16] now able to work with huggingface models --- environment.yml | 5 +++-- shennong/processor/hubert.py | 36 +++++++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/environment.yml b/environment.yml index bd317ee..58346fd 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ channels: - conda-forge - coml dependencies: - - python>=3.6 + - python>=3.7 - shennong-pykaldi - ffmpeg - h5features>=1.3.2 @@ -25,6 +25,7 @@ dependencies: - sphinx_rtd_theme - tensorflow<2.5 - pip: + - fairseq - json-tricks==3.15.* - sox # pysox - - fairseq + - transformers==4.30.2 \ No newline at end of file diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index 35705b3..a33edc6 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -32,6 +32,7 @@ from shennong import Features from shennong.processor.base import FeaturesProcessor +from transformers import HubertForCTC class HubertProcessor(FeaturesProcessor): """HuBERT features from a pre-trained neural network @@ -39,6 +40,12 @@ class HubertProcessor(FeaturesProcessor): Parameters ---------- model_path : The path to the pre-trained HuBERT model + + Raises + ------ + RuntimeError + If the model path does not point to a HuBERT model + that can be loaded with either fairseq or huggingface """ @@ -50,8 +57,15 @@ def __init__(self, model_path=None): np.random.seed(self._SEED) self.model_path = model_path - self.model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0] - + try: + self.model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0] + self._model_type = 'fairseq' + except IsADirectoryError: + try: + self.model = HubertForCTC.from_pretrained(self.model_path) + self._model_type = 'huggingface' + except: + raise RuntimeError(f"The model at {self.model_path} cannot be loaded. Make sure that this is a fairseq model or huggingface model directory.") @property def name(self): @@ -107,7 +121,12 @@ def frame_shift(self): return 0.02 def _check_layer(self, value, model): - if value not in range(len(model.encoder.layers) + 1): + if self._model_type == 'fairseq': + layer_num = len(model.encoder.layers) + 1 + elif self._model_type == 'huggingface': + layer_num = model.config.num_hidden_layers + 1 + + if value not in range(layer_num): raise ValueError(f"Layer {value} does not exist in this model") elif not value: raise ValueError("No layers selected") @@ -170,10 +189,13 @@ def process(self, signal, layer): signal = torch.unsqueeze(torch.from_numpy(signal.data), 0) - out_dict = self.model(signal, features_only=True, mask=False, output_layer=layer) - - data = out_dict["features"][0].squeeze(1).detach().numpy() - + if self._model_type == 'fairseq': + out_dict = self.model(signal, features_only=True, mask=False, output_layer=layer) + data = out_dict["features"][0].squeeze(1).detach().numpy() + elif self._model_type == 'huggingface': + out_dict = self.model(signal, output_hidden_states=True) + data = out_dict["hidden_states"][layer][0].squeeze(1).detach().numpy() + del out_dict # compute the timestamps for each output frame From 4800f75138c4d65b9e8234f5a7c3068c216a4ca6 Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Thu, 30 Jan 2025 12:18:07 -0500 Subject: [PATCH 05/16] moved layer check to init --- shennong/processor/hubert.py | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index a33edc6..33a1c6d 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -41,17 +41,21 @@ class HubertProcessor(FeaturesProcessor): ---------- model_path : The path to the pre-trained HuBERT model + layer : The layer to extract features from + Raises ------ RuntimeError If the model path does not point to a HuBERT model that can be loaded with either fairseq or huggingface - + + ValueError + If the selected layer does not exist in the given model. """ _SEED = 3939 - def __init__(self, model_path=None): + def __init__(self, model_path=None, layer=0): super().__init__() torch.manual_seed(self._SEED) np.random.seed(self._SEED) @@ -66,6 +70,9 @@ def __init__(self, model_path=None): self._model_type = 'huggingface' except: raise RuntimeError(f"The model at {self.model_path} cannot be loaded. Make sure that this is a fairseq model or huggingface model directory.") + + self._check_layer(layer, self.model) + self.layer = layer @property def name(self): @@ -80,6 +87,15 @@ def model_path(self): def model_path(self, value): self._model_path = str(value) + @property + def layer(self): + """The layer to extract features from""" + return self._layer + + @layer.setter + def layer(self, value): + self._layer = int(value) + @property def ndims(self): """The dimension of extracted frames @@ -131,7 +147,7 @@ def _check_layer(self, value, model): elif not value: raise ValueError("No layers selected") - def process(self, signal, layer): + def process(self, signal): """Computes HuBERT features with the specified options Use a pre-trained neural network to extract HuBERT @@ -145,9 +161,6 @@ def process(self, signal, layer): mono. The signal is up/down-sampled to 16 kHz during processing. - layer : int - The layer to extract features from - Returns ------- features : Features, shape = [nframes, 768] @@ -161,13 +174,8 @@ def process(self, signal, layer): ValueError If the input `signal` has more than one channel (i.e. is not mono). If `sample_rate` != `signal.sample_rate`. - - ValueError - If the selected layer does not exist in the given model. """ - self._check_layer(layer, self.model) - self.model.eval() # ensure the signal is correct @@ -190,11 +198,11 @@ def process(self, signal, layer): signal = torch.unsqueeze(torch.from_numpy(signal.data), 0) if self._model_type == 'fairseq': - out_dict = self.model(signal, features_only=True, mask=False, output_layer=layer) + out_dict = self.model(signal, features_only=True, mask=False, output_layer=self.layer) data = out_dict["features"][0].squeeze(1).detach().numpy() elif self._model_type == 'huggingface': out_dict = self.model(signal, output_hidden_states=True) - data = out_dict["hidden_states"][layer][0].squeeze(1).detach().numpy() + data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy() del out_dict From 3e9a8006b8fb1abdf35716cfd386f8ff75a4388e Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Thu, 30 Jan 2025 13:59:42 -0500 Subject: [PATCH 06/16] model_path default changed to str + can now handle pulling models from huggingface --- shennong/processor/hubert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index 33a1c6d..1b0d9a4 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -55,7 +55,7 @@ class HubertProcessor(FeaturesProcessor): _SEED = 3939 - def __init__(self, model_path=None, layer=0): + def __init__(self, model_path="", layer=0): super().__init__() torch.manual_seed(self._SEED) np.random.seed(self._SEED) @@ -64,7 +64,7 @@ def __init__(self, model_path=None, layer=0): try: self.model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0] self._model_type = 'fairseq' - except IsADirectoryError: + except: try: self.model = HubertForCTC.from_pretrained(self.model_path) self._model_type = 'huggingface' From 5a19f38d67afc3c5704cf6847b83ec7030158507 Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Thu, 30 Jan 2025 14:03:19 -0500 Subject: [PATCH 07/16] cleanup --- shennong/processor/hubert.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index 1b0d9a4..c2b1fe5 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -138,11 +138,11 @@ def frame_shift(self): def _check_layer(self, value, model): if self._model_type == 'fairseq': - layer_num = len(model.encoder.layers) + 1 + layer_num = len(model.encoder.layers) elif self._model_type == 'huggingface': - layer_num = model.config.num_hidden_layers + 1 + layer_num = model.config.num_hidden_layers - if value not in range(layer_num): + if value not in range(layer_num + 1): raise ValueError(f"Layer {value} does not exist in this model") elif not value: raise ValueError("No layers selected") From 2f3c2da156ba3d3a17cf8a8d77d4a7a193c56792 Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Thu, 30 Jan 2025 14:27:18 -0500 Subject: [PATCH 08/16] layer path default changed to string --- shennong/processor/hubert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index c2b1fe5..c18118b 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -55,7 +55,7 @@ class HubertProcessor(FeaturesProcessor): _SEED = 3939 - def __init__(self, model_path="", layer=0): + def __init__(self, model_path="", layer=""): super().__init__() torch.manual_seed(self._SEED) np.random.seed(self._SEED) From dc0f0e771cd09214178e11726c7e90b20b4329fa Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Thu, 6 Feb 2025 13:47:33 -0500 Subject: [PATCH 09/16] added convolution layer extraction + correctly gets feature properties --- shennong/processor/hubert.py | 149 ++++++++++++++++++++++++++++------- 1 file changed, 121 insertions(+), 28 deletions(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index c18118b..f4adafe 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -9,12 +9,12 @@ >>> from shennong.audio import Audio >>> from shennong.processor.hubert import HubertProcessor >>> audio = Audio.load('./test/data/test.wav') ->>> processor = HubertProcessor(model_path='/home/exp/mhubert-147') +>>> processor = HubertProcessor(model_path='/home/exp/mhubert-147', layer=1, layer_type="convolutional") Compute the HuBERT features. the output is an instance of :class:`~shennong.features.Features`: ->>> hubert = processor.process(audio, layer=3) +>>> hubert = processor.process(audio) >>> type(hubert) @@ -30,6 +30,7 @@ import numpy as np +from ast import literal_eval from shennong import Features from shennong.processor.base import FeaturesProcessor from transformers import HubertForCTC @@ -43,6 +44,8 @@ class HubertProcessor(FeaturesProcessor): layer : The layer to extract features from + layer_type : The type of layer to extract features from (encoder or convolutional) + Raises ------ RuntimeError @@ -50,19 +53,22 @@ class HubertProcessor(FeaturesProcessor): that can be loaded with either fairseq or huggingface ValueError - If the selected layer does not exist in the given model. + If the selected layer does not exist in the given model + or if the given layer type does not exist. """ _SEED = 3939 - def __init__(self, model_path="", layer=""): + def __init__(self, model_path="", layer="", layer_type="encoder"): super().__init__() torch.manual_seed(self._SEED) np.random.seed(self._SEED) self.model_path = model_path try: - self.model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0] + self.model, self._cfg, self._task_cfg = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path]) + self._conv_list = self._parse_conv_str(self._cfg['model']['conv_feature_layers']) + self.model = self.model[0] self._model_type = 'fairseq' except: try: @@ -71,7 +77,8 @@ def __init__(self, model_path="", layer=""): except: raise RuntimeError(f"The model at {self.model_path} cannot be loaded. Make sure that this is a fairseq model or huggingface model directory.") - self._check_layer(layer, self.model) + self.layer_type = layer_type + self._check_layer(int(layer)) self.layer = layer @property @@ -95,6 +102,15 @@ def layer(self): @layer.setter def layer(self, value): self._layer = int(value) + + @property + def layer_type(self): + """The type of layer that features are extracted from""" + return self._layer_type + + @layer_type.setter + def layer_type(self, value): + self._layer_type = str(value) @property def ndims(self): @@ -104,7 +120,16 @@ def ndims(self): trained with this parameter. """ - return 768 + if self.layer_type == 'encoder': + if self._model_type == 'fairseq': + return self._cfg['model']['encoder_embed_dim'] + elif self._model_type == 'huggingface': + return self.model.config.hidden_size + elif self.layer_type == 'convolutional': + if self._model_type == 'fairseq': + return self._conv_list[self.layer - 1][0] + elif self._model_type == 'huggingface': + return self.model.config.conv_dim[self.layer - 1] @property def sample_rate(self): @@ -124,7 +149,30 @@ def frame_length(self): trained with this parameter. """ - return 0.02 + + def get_receptive_field_length(kernels, strides): + field_length = 1 + for i in range(len(kernels)): + field_length += (kernels[i] - 1) * np.prod(strides[:i]) + return field_length + + if self.layer_type == 'encoder': + # receptive field length of all convolutional layers + if self._model_type == 'fairseq': + _, kernels, strides = zip(*self._conv_list) + elif self._model_type == 'huggingface': + kernels = self.model.config.conv_kernel + strides = self.model.config.conv_stride + elif self.layer_type == 'convolutional': + # receptive field length of convolution layers up to selected layer + if self._model_type == 'fairseq': + _, kernels, strides = zip(*self._conv_list[:self.layer]) + elif self._model_type == 'huggingface': + kernels = self.model.config.conv_kernel[:self.layer] + strides = self.model.config.conv_stride[:self.layer] + + frame_length = get_receptive_field_length(kernels, strides) / self.sample_rate + return frame_length @property def frame_shift(self): @@ -134,19 +182,55 @@ def frame_shift(self): trained with this parameter. """ - return 0.02 + if self.layer_type == 'encoder': + # total stride length of all convolutional layers + if self._model_type == 'fairseq': + _, _, strides = zip(*self._conv_list) + elif self._model_type == 'huggingface': + strides = self.model.config.conv_stride + elif self.layer_type == 'convolutional': + # total stride length of convolution layers up to selected layer + if self._model_type == 'fairseq': + _, _, strides = zip(*self._conv_list[:self.layer]) + elif self._model_type == 'huggingface': + strides = self.model.config.conv_stride[:self.layer] + + total_stride = np.prod(strides) + frame_shift = total_stride / self.sample_rate + return frame_shift - def _check_layer(self, value, model): - if self._model_type == 'fairseq': - layer_num = len(model.encoder.layers) - elif self._model_type == 'huggingface': - layer_num = model.config.num_hidden_layers + def _check_layer(self, value): + if self.layer_type == 'encoder': + if self._model_type == 'fairseq': + layer_num = self._cfg['model']['encoder_layers'] + elif self._model_type == 'huggingface': + layer_num = self.model.config.num_hidden_layers + elif self.layer_type == 'convolutional': + if self._model_type == 'fairseq': + layer_num = len(self._conv_list) + elif self._model_type == 'huggingface': + layer_num = len(self.model.config.conv_dim) + else: + raise ValueError("Invalid layer type") if value not in range(layer_num + 1): - raise ValueError(f"Layer {value} does not exist in this model") + raise ValueError(f"There is no {self.layer_type} layer {value} in this model") elif not value: raise ValueError("No layers selected") + def _parse_conv_str(self, conv_str): + conv_list = [] + + for item in conv_str.split("+"): + item = item.strip() + if "*" in item: + feat, mult = item.split("*") + conv_list.extend(([literal_eval(item)[0] for item in (feat * int(mult)).split()])) + else: + conv_list.append(literal_eval(item)[0]) + + return conv_list + def process(self, signal): """Computes HuBERT features with the specified options @@ -163,11 +247,13 @@ def process(self, signal): Returns ------- - features : Features, shape = [nframes, 768] - The computed HuBERT features will have as many rows as - there are frames (depends on the `signal` duration, expect - about 50 frames per second), each frame with 768 - dimensions. + features : Features, shape = [nframes, ndim] + The computed HuBERT features will either: + have as many rows as there are frames (depends on the `signal` duration, expect + 50 frames per second) for encoder layers, + or have as many rows as there are samples divided by the product of the stride lengths + (depends on the `signal` duration and the stride lengths) for convolutional layers, + each frame with the number of dimensions in the layer. Raises ------ @@ -199,17 +285,24 @@ def process(self, signal): if self._model_type == 'fairseq': out_dict = self.model(signal, features_only=True, mask=False, output_layer=self.layer) - data = out_dict["features"][0].squeeze(1).detach().numpy() + if self.layer_type == 'encoder': + data = out_dict["features"][0].squeeze(1).detach().numpy() + elif self.layer_type == 'convolutional': + self._cfg['model']['conv_feature_layers'] = str(self._conv_list[:self.layer]) + self.model.feature_extractor = fairseq.models.hubert.HubertModel.build_model(self._cfg['model'], self._task_cfg).feature_extractor + data = self.model.forward_features(signal).transpose(1, 2).squeeze(0).detach().numpy() elif self._model_type == 'huggingface': out_dict = self.model(signal, output_hidden_states=True) - data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy() - + if self.layer_type == 'encoder': + data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy() + elif self.layer_type == 'convolutional': + self.model.hubert.config.num_feat_extract_layers = self.layer + self.model.hubert.feature_extractor = HubertForCTC(self.model.hubert.config).hubert.feature_extractor + data = self.model.hubert.feature_extractor(signal).transpose(1, 2).squeeze(0).detach().numpy() del out_dict - # compute the timestamps for each output frame - times = np.vstack(( - np.arange(data.shape[0]) * self.frame_shift, - np.arange(data.shape[0]) * self.frame_shift + self.frame_length)).T - + # compute the timestamps for the midpoint of each output frame + times = np.vstack((np.arange(data.shape[0]) * self.frame_shift + (self.frame_length / 2))).squeeze(1) + return Features( data, times, properties=self.get_properties()) \ No newline at end of file From 9fcd2e693ef7994fe81595fb0cafd184c4763483 Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Mon, 31 Mar 2025 12:08:03 -0400 Subject: [PATCH 10/16] combined layer and layer_type into layer_info --- shennong/processor/hubert.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index f4adafe..ed14273 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -42,9 +42,8 @@ class HubertProcessor(FeaturesProcessor): ---------- model_path : The path to the pre-trained HuBERT model - layer : The layer to extract features from - - layer_type : The type of layer to extract features from (encoder or convolutional) + layer_info : Tuple with the type of layer to extract features from (encoder or convolutional) + as the first element and the layer number as the second element Raises ------ @@ -59,7 +58,7 @@ class HubertProcessor(FeaturesProcessor): _SEED = 3939 - def __init__(self, model_path="", layer="", layer_type="encoder"): + def __init__(self, model_path="", layer_info=("encoder", "1")): super().__init__() torch.manual_seed(self._SEED) np.random.seed(self._SEED) @@ -77,6 +76,10 @@ def __init__(self, model_path="", layer="", layer_type="encoder"): except: raise RuntimeError(f"The model at {self.model_path} cannot be loaded. Make sure that this is a fairseq model or huggingface model directory.") + self.layer_info = layer_info + layer_type = layer_info[0] + layer = layer_info[1] + self.layer_type = layer_type self._check_layer(int(layer)) self.layer = layer From 907dcd53c0338315762d48675d2caf45728ce1ea Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Mon, 31 Mar 2025 12:49:28 -0400 Subject: [PATCH 11/16] timestamps are now at the beginning and end of each frame for consistency with other processors --- shennong/processor/hubert.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index ed14273..0e1610b 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -304,8 +304,10 @@ def process(self, signal): data = self.model.hubert.feature_extractor(signal).transpose(1, 2).squeeze(0).detach().numpy() del out_dict - # compute the timestamps for the midpoint of each output frame - times = np.vstack((np.arange(data.shape[0]) * self.frame_shift + (self.frame_length / 2))).squeeze(1) - + # compute the timestamps for each output frame + times = np.vstack(( + np.arange(data.shape[0]) * self.frame_shift, + np.arange(data.shape[0]) * self.frame_shift + self.frame_length)).T + return Features( data, times, properties=self.get_properties()) \ No newline at end of file From a157c3bb371d677326bc8f20462d5310b940608c Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Wed, 2 Jul 2025 14:48:03 -0400 Subject: [PATCH 12/16] adjusted example to match current class --- shennong/processor/hubert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index 0e1610b..4d3a9f7 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -9,7 +9,7 @@ >>> from shennong.audio import Audio >>> from shennong.processor.hubert import HubertProcessor >>> audio = Audio.load('./test/data/test.wav') ->>> processor = HubertProcessor(model_path='/home/exp/mhubert-147', layer=1, layer_type="convolutional") +>>> processor = HubertProcessor(model_path="facebook/hubert-large-ls960-ft", layer_info=("encoder", 1)) Compute the HuBERT features. the output is an instance of :class:`~shennong.features.Features`: From 337901012629def90673b4a5677a31737c8f5ee1 Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Thu, 3 Jul 2025 15:46:20 -0400 Subject: [PATCH 13/16] added feature extractor preprocessing step --- shennong/processor/hubert.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index 4d3a9f7..05db0bb 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -33,7 +33,7 @@ from ast import literal_eval from shennong import Features from shennong.processor.base import FeaturesProcessor -from transformers import HubertForCTC +from transformers import HubertForCTC, AutoFeatureExtractor class HubertProcessor(FeaturesProcessor): """HuBERT features from a pre-trained neural network @@ -284,10 +284,11 @@ def process(self, signal): signal.sample_rate, signal.dtype.itemsize * 8, 16000, 32) signal = signal.resample(16000).astype(np.float32) - signal = torch.unsqueeze(torch.from_numpy(signal.data), 0) - if self._model_type == 'fairseq': - out_dict = self.model(signal, features_only=True, mask=False, output_layer=self.layer) + signal = torch.unsqueeze(torch.from_numpy(signal.data), 0) + self.model.feature_extractor = fairseq.models.hubert.HubertModel.build_model(self._cfg['model'], self._task_cfg).feature_extractor + input_values = self.model.forward_features(signal).transpose(1, 2).squeeze(0).detach().numpy() + out_dict = self.model(input_values, features_only=True, mask=False, output_layer=self.layer) if self.layer_type == 'encoder': data = out_dict["features"][0].squeeze(1).detach().numpy() elif self.layer_type == 'convolutional': @@ -295,7 +296,14 @@ def process(self, signal): self.model.feature_extractor = fairseq.models.hubert.HubertModel.build_model(self._cfg['model'], self._task_cfg).feature_extractor data = self.model.forward_features(signal).transpose(1, 2).squeeze(0).detach().numpy() elif self._model_type == 'huggingface': - out_dict = self.model(signal, output_hidden_states=True) + feature_extractor = AutoFeatureExtractor.from_pretrained(self.model_path) + input_values = feature_extractor( + signal.data, + return_tensors="pt", + padding=True, + sampling_rate=16000, + ).input_values + out_dict = self.model(input_values, output_hidden_states=True) if self.layer_type == 'encoder': data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy() elif self.layer_type == 'convolutional': From 01a197855d44d66f40d0ef270bf7cc48223db173 Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Mon, 7 Jul 2025 11:31:29 -0400 Subject: [PATCH 14/16] added temporary test --- shennong/processor/hubert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index 05db0bb..7867817 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -303,6 +303,7 @@ def process(self, signal): padding=True, sampling_rate=16000, ).input_values + print("feature extractor") out_dict = self.model(input_values, output_hidden_states=True) if self.layer_type == 'encoder': data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy() From 920e0daf3ebc2ec21018d1e67d44e4f4177d2ac2 Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Mon, 7 Jul 2025 12:37:02 -0400 Subject: [PATCH 15/16] testing --- shennong/processor/hubert.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index 7867817..33ac2d6 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -303,7 +303,6 @@ def process(self, signal): padding=True, sampling_rate=16000, ).input_values - print("feature extractor") out_dict = self.model(input_values, output_hidden_states=True) if self.layer_type == 'encoder': data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy() @@ -319,4 +318,4 @@ def process(self, signal): np.arange(data.shape[0]) * self.frame_shift + self.frame_length)).T return Features( - data, times, properties=self.get_properties()) \ No newline at end of file + input_values.numpy(), times, properties=self.get_properties()) \ No newline at end of file From 60d09d7eb0339236afd81bf73a561ab37f0dc716 Mon Sep 17 00:00:00 2001 From: michaelong7 Date: Mon, 7 Jul 2025 15:13:59 -0400 Subject: [PATCH 16/16] removed testing code --- shennong/processor/hubert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py index 33ac2d6..05db0bb 100644 --- a/shennong/processor/hubert.py +++ b/shennong/processor/hubert.py @@ -318,4 +318,4 @@ def process(self, signal): np.arange(data.shape[0]) * self.frame_shift + self.frame_length)).T return Features( - input_values.numpy(), times, properties=self.get_properties()) \ No newline at end of file + data, times, properties=self.get_properties()) \ No newline at end of file