From ae82619d6e42111462d79157e48ee1e6c471e04a Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Thu, 16 Jan 2025 13:25:40 -0500
Subject: [PATCH 01/16] added hubert processor + fairseq requirement

---
 environment.yml                |   1 +
 shennong/processor/__init__.py |   1 +
 shennong/processor/hubert.py   | 191 +++++++++++++++++++++++++++++++++
 3 files changed, 193 insertions(+)
 create mode 100644 shennong/processor/hubert.py
diff --git a/environment.yml b/environment.yml
index 7ce6ea2..bd317ee 100644
--- a/environment.yml
+++ b/environment.yml
@@ -27,3 +27,4 @@ dependencies:
   - pip:
       - json-tricks==3.15.*
       - sox  # pysox
+      - fairseq
diff --git a/shennong/processor/__init__.py b/shennong/processor/__init__.py
index 2511026..0f1af9f 100644
--- a/shennong/processor/__init__.py
+++ b/shennong/processor/__init__.py
@@ -3,6 +3,7 @@
 from shennong.processor.bottleneck import BottleneckProcessor
 from shennong.processor.energy import EnergyProcessor
 from shennong.processor.filterbank import FilterbankProcessor
+from shennong.processor.hubert import HubertProcessor
 from shennong.processor.mfcc import MfccProcessor
 from shennong.processor.onehot import FramedOneHotProcessor, OneHotProcessor
 from shennong.processor.pitch_crepe import (
diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
new file mode 100644
index 0000000..d5f63b1
--- /dev/null
+++ b/shennong/processor/hubert.py
@@ -0,0 +1,191 @@
+"""Extraction of HuBERT features from audio signals
+
+    :class:`~shennong.audio.Audio` ---> HubertProcessor \
+    ---> :class:`~shennong.features.Features`
+
+Examples
+--------
+
+>>> from shennong.audio import Audio
+>>> from shennong.processor.hubert import HubertProcessor
+>>> audio = Audio.load('./test/data/test.wav')
+>>> processor = HubertProcessor(layer=3, model_path='/home/exp/mhubert-147')
+
+Compute the HuBERT features. the output is an
+instance of :class:`~shennong.features.Features`:
+
+>>> hubert = processor.process(audio)
+>>> type(hubert)
+<class 'shennong.features.Features'>
+
+References
+----------
+
+.. [HuBERT] https://arxiv.org/abs/2106.07447
+
+"""
+
+import torch
+import fairseq
+
+import numpy as np
+
+from shennong import Features
+from shennong.processor.base import FeaturesProcessor
+
+class HubertProcessor(FeaturesProcessor):
+    """HuBERT features from a pre-trained neural network
+
+    Parameters
+    ----------
+    layer : 1, 2, ..., 12
+        The layer to extract features from
+
+    model_path : The path to the pre-trained HuBERT model
+        
+    """
+
+    _LAYERS = tuple(i for i in range(1, 13))
+
+    _SEED = 3939
+
+    def __init__(self, layer=None, model_path=None):
+        super().__init__()
+        self.layer = layer
+        self.model_path = model_path
+
+    @property
+    def name(self):
+        return 'hubert'
+    
+    @property
+    def layer(self):
+        """Layer to extract features from"""
+        return self._layer
+
+    @layer.setter
+    def layer(self, value):
+        if int(value) not in self._LAYERS:
+            raise ValueError(f"Layer {value} does not exist in this model")
+        elif not value:
+            raise ValueError("No layers selected")
+        else:
+            self._layer = int(value)
+
+    @property
+    def model_path(self):
+        """The path to the pretrained HuBERT model"""
+        return self._model_path
+
+    @model_path.setter
+    def model_path(self, value):
+        self._model_path = str(value)
+
+    @property
+    def ndims(self):
+        """The dimension of extracted frames
+
+        Cannot be tuned because the underlying neural networks are
+        trained with this parameter.
+
+        """
+        return 768
+    
+    @property
+    def sample_rate(self):
+        """Processing sample frequency in Hertz
+
+        Cannot be tuned because the underlying neural networks are
+        trained with this parameter.
+
+        """
+        return 16000
+    
+    @property
+    def frame_length(self):
+        """The length of extracted frames (in seconds)
+
+        Cannot be tuned because the underlying neural networks are
+        trained with this parameter.
+
+        """
+        return 0.02
+
+    @property
+    def frame_shift(self):
+        """The time shift between two consecutive frames (in seconds)
+
+        Cannot be tuned because the underlying neural networks are
+        trained with this parameter.
+
+        """
+        return 0.02
+
+    def process(self, signal):
+        """Computes HuBERT features with the specified options
+
+        Use a pre-trained neural network to extract HuBERT
+        features. Features have a frame shift of 20 ms and frame
+        length of 20 ms.
+
+        Parameters
+        ----------
+        signal : Audio, shape = [nsamples, 1]
+            The input audio signal to compute the features on, must be
+            mono. The signal is up/down-sampled to 16 kHz during
+            processing.
+
+        Returns
+        -------
+        features : Features, shape = [nframes, 768]
+            The computed HuBERT features will have as many rows as
+            there are frames (depends on the `signal` duration, expect
+            about 50 frames per second), each frame with 768
+            dimensions.
+
+        Raises
+        ------
+        ValueError
+            If the input `signal` has more than one channel (i.e. is
+            not mono). If `sample_rate` != `signal.sample_rate`.
+
+        """
+
+        torch.manual_seed(self._SEED)
+        np.random.seed(self._SEED)
+
+        # ensure the signal is correct
+        if signal.nchannels != 1:
+            raise ValueError(
+                'signal must have one dimension, but it has {}'
+                .format(signal.nchannels))
+
+        # force resampling to 16 kHz and 32 bit floats
+        need_resample = (
+            signal.sample_rate != 16000 or
+            signal.dtype is not np.dtype(np.float32))
+
+        if need_resample:
+            self.log.debug(
+                'resampling audio from %dHz@%db to %dHz@%db',
+                signal.sample_rate, signal.dtype.itemsize * 8, 16000, 32)
+            signal = signal.resample(16000).astype(np.float32)
+
+        signal = torch.unsqueeze(torch.from_numpy(signal.data), 0)
+
+        model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0]
+        model.eval()
+
+        out_dict = model(signal, features_only=True, mask=False, output_layer=self.layer)
+
+        data = out_dict["features"][0].squeeze(1).detach().numpy()
+
+        del out_dict
+
+        # compute the timestamps for each output frame
+        times = np.vstack((
+            np.arange(data.shape[0]) * self.frame_shift,
+            np.arange(data.shape[0]) * self.frame_shift + self.frame_length)).T
+
+        return Features(
+            data, times, properties=self.get_properties())
\ No newline at end of file

From 7169cf64daa170e6cea43319051fa0090f66be89 Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Thu, 16 Jan 2025 15:33:38 -0500
Subject: [PATCH 02/16] moved choice of layer into processing function

---
 shennong/processor/hubert.py | 43 +++++++++++++++---------------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index d5f63b1..0550694 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -9,12 +9,12 @@
 >>> from shennong.audio import Audio
 >>> from shennong.processor.hubert import HubertProcessor
 >>> audio = Audio.load('./test/data/test.wav')
->>> processor = HubertProcessor(layer=3, model_path='/home/exp/mhubert-147')
+>>> processor = HubertProcessor(model_path='/home/exp/mhubert-147')
 
 Compute the HuBERT features. the output is an
 instance of :class:`~shennong.features.Features`:
 
->>> hubert = processor.process(audio)
+>>> hubert = processor.process(audio, layer=3)
 >>> type(hubert)
 <class 'shennong.features.Features'>
 
@@ -38,39 +38,19 @@ class HubertProcessor(FeaturesProcessor):
 
     Parameters
     ----------
-    layer : 1, 2, ..., 12
-        The layer to extract features from
-
     model_path : The path to the pre-trained HuBERT model
         
     """
 
-    _LAYERS = tuple(i for i in range(1, 13))
-
     _SEED = 3939
 
-    def __init__(self, layer=None, model_path=None):
+    def __init__(self, model_path=None):
         super().__init__()
-        self.layer = layer
         self.model_path = model_path
 
     @property
     def name(self):
         return 'hubert'
-    
-    @property
-    def layer(self):
-        """Layer to extract features from"""
-        return self._layer
-
-    @layer.setter
-    def layer(self, value):
-        if int(value) not in self._LAYERS:
-            raise ValueError(f"Layer {value} does not exist in this model")
-        elif not value:
-            raise ValueError("No layers selected")
-        else:
-            self._layer = int(value)
 
     @property
     def model_path(self):
@@ -120,8 +100,14 @@ def frame_shift(self):
 
         """
         return 0.02
+    
+    def _check_layer(self, value, model):
+        if value not in range(len(model.encoder.layers) + 1):
+            raise ValueError(f"Layer {value} does not exist in this model")
+        elif not value:
+            raise ValueError("No layers selected")
 
-    def process(self, signal):
+    def process(self, signal, layer):
         """Computes HuBERT features with the specified options
 
         Use a pre-trained neural network to extract HuBERT
@@ -135,6 +121,9 @@ def process(self, signal):
             mono. The signal is up/down-sampled to 16 kHz during
             processing.
 
+        layer : int
+            The layer to extract features from
+
         Returns
         -------
         features : Features, shape = [nframes, 768]
@@ -149,6 +138,8 @@ def process(self, signal):
             If the input `signal` has more than one channel (i.e. is
             not mono). If `sample_rate` != `signal.sample_rate`.
 
+        ValueError
+            If the selected layer does not exist in the given model.
         """
 
         torch.manual_seed(self._SEED)
@@ -176,7 +167,9 @@ def process(self, signal):
         model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0]
         model.eval()
 
-        out_dict = model(signal, features_only=True, mask=False, output_layer=self.layer)
+        self._check_layer(layer, model)
+
+        out_dict = model(signal, features_only=True, mask=False, output_layer=layer)
 
         data = out_dict["features"][0].squeeze(1).detach().numpy()
 

From a34a13c65eb8d5ce48946ca28351881ad136ee09 Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Thu, 16 Jan 2025 16:24:14 -0500
Subject: [PATCH 03/16] rearranging

---
 shennong/processor/hubert.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index 0550694..35705b3 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -46,7 +46,12 @@ class HubertProcessor(FeaturesProcessor):
 
     def __init__(self, model_path=None):
         super().__init__()
+        torch.manual_seed(self._SEED)
+        np.random.seed(self._SEED)
+
         self.model_path = model_path
+        self.model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0]
+        
 
     @property
     def name(self):
@@ -142,8 +147,9 @@ def process(self, signal, layer):
             If the selected layer does not exist in the given model.
         """
 
-        torch.manual_seed(self._SEED)
-        np.random.seed(self._SEED)
+        self._check_layer(layer, self.model)
+
+        self.model.eval()
 
         # ensure the signal is correct
         if signal.nchannels != 1:
@@ -164,12 +170,7 @@ def process(self, signal, layer):
 
         signal = torch.unsqueeze(torch.from_numpy(signal.data), 0)
 
-        model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0]
-        model.eval()
-
-        self._check_layer(layer, model)
-
-        out_dict = model(signal, features_only=True, mask=False, output_layer=layer)
+        out_dict = self.model(signal, features_only=True, mask=False, output_layer=layer)
 
         data = out_dict["features"][0].squeeze(1).detach().numpy()
 

From 3f2b59593e50217c4139ccf46834e523b4b7f73d Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Wed, 29 Jan 2025 14:55:26 -0500
Subject: [PATCH 04/16] now able to work with huggingface models

---
 environment.yml              |  5 +++--
 shennong/processor/hubert.py | 36 +++++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/environment.yml b/environment.yml
index bd317ee..58346fd 100644
--- a/environment.yml
+++ b/environment.yml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
   - coml
 dependencies:
-  - python>=3.6
+  - python>=3.7
   - shennong-pykaldi
   - ffmpeg
   - h5features>=1.3.2
@@ -25,6 +25,7 @@ dependencies:
   - sphinx_rtd_theme
   - tensorflow<2.5
   - pip:
+      - fairseq
       - json-tricks==3.15.*
       - sox  # pysox
-      - fairseq
+      - transformers==4.30.2
\ No newline at end of file
diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index 35705b3..a33edc6 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -32,6 +32,7 @@
 
 from shennong import Features
 from shennong.processor.base import FeaturesProcessor
+from transformers import HubertForCTC
 
 class HubertProcessor(FeaturesProcessor):
     """HuBERT features from a pre-trained neural network
@@ -39,6 +40,12 @@ class HubertProcessor(FeaturesProcessor):
     Parameters
     ----------
     model_path : The path to the pre-trained HuBERT model
+
+    Raises
+    ------
+    RuntimeError
+        If the model path does not point to a HuBERT model 
+        that can be loaded with either fairseq or huggingface
         
     """
 
@@ -50,8 +57,15 @@ def __init__(self, model_path=None):
         np.random.seed(self._SEED)
 
         self.model_path = model_path
-        self.model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0]
-        
+        try:
+            self.model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0]
+            self._model_type = 'fairseq'
+        except IsADirectoryError:
+            try:
+                self.model = HubertForCTC.from_pretrained(self.model_path)
+                self._model_type = 'huggingface'
+            except:
+                raise RuntimeError(f"The model at {self.model_path} cannot be loaded. Make sure that this is a fairseq model or huggingface model directory.")
 
     @property
     def name(self):
@@ -107,7 +121,12 @@ def frame_shift(self):
         return 0.02
     
     def _check_layer(self, value, model):
-        if value not in range(len(model.encoder.layers) + 1):
+        if self._model_type == 'fairseq':
+            layer_num = len(model.encoder.layers) + 1
+        elif self._model_type == 'huggingface':
+            layer_num = model.config.num_hidden_layers + 1
+
+        if value not in range(layer_num):
             raise ValueError(f"Layer {value} does not exist in this model")
         elif not value:
             raise ValueError("No layers selected")
@@ -170,10 +189,13 @@ def process(self, signal, layer):
 
         signal = torch.unsqueeze(torch.from_numpy(signal.data), 0)
 
-        out_dict = self.model(signal, features_only=True, mask=False, output_layer=layer)
-
-        data = out_dict["features"][0].squeeze(1).detach().numpy()
-
+        if self._model_type == 'fairseq':
+            out_dict = self.model(signal, features_only=True, mask=False, output_layer=layer)
+            data = out_dict["features"][0].squeeze(1).detach().numpy()
+        elif self._model_type == 'huggingface':
+            out_dict = self.model(signal, output_hidden_states=True)
+            data = out_dict["hidden_states"][layer][0].squeeze(1).detach().numpy()
+        
         del out_dict
 
         # compute the timestamps for each output frame

From 4800f75138c4d65b9e8234f5a7c3068c216a4ca6 Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Thu, 30 Jan 2025 12:18:07 -0500
Subject: [PATCH 05/16] moved layer check to init

---
 shennong/processor/hubert.py | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index a33edc6..33a1c6d 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -41,17 +41,21 @@ class HubertProcessor(FeaturesProcessor):
     ----------
     model_path : The path to the pre-trained HuBERT model
 
+    layer : The layer to extract features from
+
     Raises
     ------
     RuntimeError
         If the model path does not point to a HuBERT model 
         that can be loaded with either fairseq or huggingface
-        
+
+    ValueError
+        If the selected layer does not exist in the given model.
     """
 
     _SEED = 3939
 
-    def __init__(self, model_path=None):
+    def __init__(self, model_path=None, layer=0):
         super().__init__()
         torch.manual_seed(self._SEED)
         np.random.seed(self._SEED)
@@ -66,6 +70,9 @@ def __init__(self, model_path=None):
                 self._model_type = 'huggingface'
             except:
                 raise RuntimeError(f"The model at {self.model_path} cannot be loaded. Make sure that this is a fairseq model or huggingface model directory.")
+        
+        self._check_layer(layer, self.model)
+        self.layer = layer
 
     @property
     def name(self):
@@ -80,6 +87,15 @@ def model_path(self):
     def model_path(self, value):
         self._model_path = str(value)
 
+    @property
+    def layer(self):
+        """The layer to extract features from"""
+        return self._layer
+
+    @layer.setter
+    def layer(self, value):
+        self._layer = int(value)
+
     @property
     def ndims(self):
         """The dimension of extracted frames
@@ -131,7 +147,7 @@ def _check_layer(self, value, model):
         elif not value:
             raise ValueError("No layers selected")
 
-    def process(self, signal, layer):
+    def process(self, signal):
         """Computes HuBERT features with the specified options
 
         Use a pre-trained neural network to extract HuBERT
@@ -145,9 +161,6 @@ def process(self, signal, layer):
             mono. The signal is up/down-sampled to 16 kHz during
             processing.
 
-        layer : int
-            The layer to extract features from
-
         Returns
         -------
         features : Features, shape = [nframes, 768]
@@ -161,13 +174,8 @@ def process(self, signal, layer):
         ValueError
             If the input `signal` has more than one channel (i.e. is
             not mono). If `sample_rate` != `signal.sample_rate`.
-
-        ValueError
-            If the selected layer does not exist in the given model.
         """
 
-        self._check_layer(layer, self.model)
-
         self.model.eval()
 
         # ensure the signal is correct
@@ -190,11 +198,11 @@ def process(self, signal, layer):
         signal = torch.unsqueeze(torch.from_numpy(signal.data), 0)
 
         if self._model_type == 'fairseq':
-            out_dict = self.model(signal, features_only=True, mask=False, output_layer=layer)
+            out_dict = self.model(signal, features_only=True, mask=False, output_layer=self.layer)
             data = out_dict["features"][0].squeeze(1).detach().numpy()
         elif self._model_type == 'huggingface':
             out_dict = self.model(signal, output_hidden_states=True)
-            data = out_dict["hidden_states"][layer][0].squeeze(1).detach().numpy()
+            data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy()
         
         del out_dict
 

From 3e9a8006b8fb1abdf35716cfd386f8ff75a4388e Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Thu, 30 Jan 2025 13:59:42 -0500
Subject: [PATCH 06/16] model_path default changed to str + can now handle
 pulling models from huggingface

---
 shennong/processor/hubert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index 33a1c6d..1b0d9a4 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -55,7 +55,7 @@ class HubertProcessor(FeaturesProcessor):
 
     _SEED = 3939
 
-    def __init__(self, model_path=None, layer=0):
+    def __init__(self, model_path="", layer=0):
         super().__init__()
         torch.manual_seed(self._SEED)
         np.random.seed(self._SEED)
@@ -64,7 +64,7 @@ def __init__(self, model_path=None, layer=0):
         try:
             self.model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0]
             self._model_type = 'fairseq'
-        except IsADirectoryError:
+        except:
             try:
                 self.model = HubertForCTC.from_pretrained(self.model_path)
                 self._model_type = 'huggingface'

From 5a19f38d67afc3c5704cf6847b83ec7030158507 Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Thu, 30 Jan 2025 14:03:19 -0500
Subject: [PATCH 07/16] cleanup

---
 shennong/processor/hubert.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index 1b0d9a4..c2b1fe5 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -138,11 +138,11 @@ def frame_shift(self):
     
     def _check_layer(self, value, model):
         if self._model_type == 'fairseq':
-            layer_num = len(model.encoder.layers) + 1
+            layer_num = len(model.encoder.layers)
         elif self._model_type == 'huggingface':
-            layer_num = model.config.num_hidden_layers + 1
+            layer_num = model.config.num_hidden_layers
 
-        if value not in range(layer_num):
+        if value not in range(layer_num + 1):
             raise ValueError(f"Layer {value} does not exist in this model")
         elif not value:
             raise ValueError("No layers selected")

From 2f3c2da156ba3d3a17cf8a8d77d4a7a193c56792 Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Thu, 30 Jan 2025 14:27:18 -0500
Subject: [PATCH 08/16] layer default changed to string

---
 shennong/processor/hubert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index c2b1fe5..c18118b 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -55,7 +55,7 @@ class HubertProcessor(FeaturesProcessor):
 
     _SEED = 3939
 
-    def __init__(self, model_path="", layer=0):
+    def __init__(self, model_path="", layer=""):
         super().__init__()
         torch.manual_seed(self._SEED)
         np.random.seed(self._SEED)

From dc0f0e771cd09214178e11726c7e90b20b4329fa Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Thu, 6 Feb 2025 13:47:33 -0500
Subject: [PATCH 09/16] added convolution layer extraction + correctly gets
 feature properties

---
 shennong/processor/hubert.py | 149 ++++++++++++++++++++++++++++-------
 1 file changed, 121 insertions(+), 28 deletions(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index c18118b..f4adafe 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -9,12 +9,12 @@
 >>> from shennong.audio import Audio
 >>> from shennong.processor.hubert import HubertProcessor
 >>> audio = Audio.load('./test/data/test.wav')
->>> processor = HubertProcessor(model_path='/home/exp/mhubert-147')
+>>> processor = HubertProcessor(model_path='/home/exp/mhubert-147', layer=1, layer_type="convolutional")
 
 Compute the HuBERT features. the output is an
 instance of :class:`~shennong.features.Features`:
 
->>> hubert = processor.process(audio, layer=3)
+>>> hubert = processor.process(audio)
 >>> type(hubert)
 <class 'shennong.features.Features'>
 
@@ -30,6 +30,7 @@
 
 import numpy as np
 
+from ast import literal_eval
 from shennong import Features
 from shennong.processor.base import FeaturesProcessor
 from transformers import HubertForCTC
@@ -43,6 +44,8 @@ class HubertProcessor(FeaturesProcessor):
 
     layer : The layer to extract features from
 
+    layer_type : The type of layer to extract features from (encoder or convolutional)
+
     Raises
     ------
     RuntimeError
@@ -50,19 +53,22 @@ class HubertProcessor(FeaturesProcessor):
         that can be loaded with either fairseq or huggingface
 
     ValueError
-        If the selected layer does not exist in the given model.
+        If the selected layer does not exist in the given model
+        or if the given layer type does not exist.
     """
 
     _SEED = 3939
 
-    def __init__(self, model_path="", layer=""):
+    def __init__(self, model_path="", layer="", layer_type="encoder"):
         super().__init__()
         torch.manual_seed(self._SEED)
         np.random.seed(self._SEED)
 
         self.model_path = model_path
         try:
-            self.model = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])[0][0]
+            self.model, self._cfg, self._task_cfg = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.model_path])
+            self._conv_list = self._parse_conv_str(self._cfg['model']['conv_feature_layers'])
+            self.model = self.model[0]
             self._model_type = 'fairseq'
         except:
             try:
@@ -71,7 +77,8 @@ def __init__(self, model_path="", layer=""):
             except:
                 raise RuntimeError(f"The model at {self.model_path} cannot be loaded. Make sure that this is a fairseq model or huggingface model directory.")
         
-        self._check_layer(layer, self.model)
+        self.layer_type = layer_type
+        self._check_layer(int(layer))
         self.layer = layer
 
     @property
@@ -95,6 +102,15 @@ def layer(self):
     @layer.setter
     def layer(self, value):
         self._layer = int(value)
+    
+    @property
+    def layer_type(self):
+        """Type of layer features are extracted from: 'encoder' or 'convolutional'"""
+        return self._layer_type
+
+    @layer_type.setter
+    def layer_type(self, value):
+        self._layer_type = str(value)  # coerced to str only; not validated here
 
     @property
     def ndims(self):
@@ -104,7 +120,16 @@ def ndims(self):
         trained with this parameter.
 
         """
-        return 768
+        if self.layer_type == 'encoder':
+            if self._model_type == 'fairseq':
+                return self._cfg['model']['encoder_embed_dim']
+            elif self._model_type == 'huggingface':
+                return self.model.config.hidden_size
+        elif self.layer_type == 'convolutional':
+            if self._model_type == 'fairseq':
+                return self._conv_list[self.layer - 1][0]
+            elif self._model_type == 'huggingface':
+                return self.model.config.conv_dim[self.layer - 1]
     
     @property
     def sample_rate(self):
@@ -124,7 +149,30 @@ def frame_length(self):
         trained with this parameter.
 
         """
-        return 0.02
+
+        def get_receptive_field_length(kernels, strides):
+            # samples spanned by one output frame of the stacked convolutions:
+            # each layer widens the field by (kernel - 1) times its input hop
+            hops = (np.prod(strides[:i]) for i in range(len(kernels)))
+            return 1 + sum((k - 1) * hop for k, hop in zip(kernels, hops))
+
+        if self.layer_type == 'encoder':
+            # receptive field length of all convolutional layers
+            if self._model_type == 'fairseq':
+                _, kernels, strides = zip(*self._conv_list)
+            elif self._model_type == 'huggingface':
+                kernels = self.model.config.conv_kernel
+                strides = self.model.config.conv_stride
+        elif self.layer_type == 'convolutional':
+            # receptive field length of convolution layers up to selected layer
+            if self._model_type == 'fairseq':
+                _, kernels, strides = zip(*self._conv_list[:self.layer])
+            elif self._model_type == 'huggingface':
+                kernels = self.model.config.conv_kernel[:self.layer]
+                strides = self.model.config.conv_stride[:self.layer]
+        
+        frame_length = get_receptive_field_length(kernels, strides) / self.sample_rate
+        return frame_length
 
     @property
     def frame_shift(self):
@@ -134,19 +182,55 @@ def frame_shift(self):
         trained with this parameter.
 
         """
-        return 0.02
+        if self.layer_type == 'encoder':
+            # total stride length of all convolutional layers
+            if self._model_type == 'fairseq':
+                _, _, strides = zip(*self._conv_list)
+            elif self._model_type == 'huggingface':
+                strides = self.model.config.conv_stride
+        elif self.layer_type == 'convolutional':
+            # total stride length of convolution layers up to selected layer
+            if self._model_type == 'fairseq':
+                _, _, strides = zip(*self._conv_list[:self.layer])
+            elif self._model_type == 'huggingface':
+                strides = self.model.config.conv_stride[:self.layer]
+        
+        total_stride = np.prod(strides)
+        frame_shift = total_stride / self.sample_rate
+        return frame_shift
     
-    def _check_layer(self, value, model):
-        if self._model_type == 'fairseq':
-            layer_num = len(model.encoder.layers)
-        elif self._model_type == 'huggingface':
-            layer_num = model.config.num_hidden_layers
+    def _check_layer(self, value):
+        if self.layer_type == 'encoder':
+            if self._model_type == 'fairseq':
+                layer_num = self._cfg['model']['encoder_layers']
+            elif self._model_type == 'huggingface':
+                layer_num = self.model.config.num_hidden_layers
+        elif self.layer_type == 'convolutional':
+            if self._model_type == 'fairseq':
+                layer_num = len(self._conv_list)
+            elif self._model_type == 'huggingface':
+                layer_num = len(self.model.config.conv_dim)
+        else:
+             raise ValueError("Invalid layer type")
 
         if value not in range(layer_num + 1):
-            raise ValueError(f"Layer {value} does not exist in this model")
+            raise ValueError(f"There is no {self.layer_type} layer {value} in this model")
         elif not value:
             raise ValueError("No layers selected")
 
+    def _parse_conv_str(self, conv_str):
+        """Expand a fairseq ``conv_feature_layers`` string, for instance
+        ``"[(512,10,5)] + [(512,3,2)] * 4"``, into a flat list holding one
+        ``(dim, kernel, stride)`` tuple per convolutional layer."""
+        conv_list = []
+        for term in conv_str.split("+"):
+            # a term is a list literal optionally followed by "* repeat";
+            # evaluate the literal itself so inner spaces are harmless
+            spec, _, repeat = term.partition("*")
+            layers = literal_eval(spec.strip())
+            conv_list.extend(layers * int(repeat) if repeat.strip() else layers)
+        return conv_list
+
     def process(self, signal):
         """Computes HuBERT features with the specified options
 
@@ -163,11 +247,13 @@ def process(self, signal):
 
         Returns
         -------
-        features : Features, shape = [nframes, 768]
-            The computed HuBERT features will have as many rows as
-            there are frames (depends on the `signal` duration, expect
-            about 50 frames per second), each frame with 768
-            dimensions.
+        features : Features, shape = [nframes, ndim]
+            The computed HuBERT features will either:
+            have as many rows as there are frames (depends on the `signal` duration, expect
+            50 frames per second) for encoder layers,
+            or have as many rows as there are samples divided by the product of the stride lengths 
+            (depends on the `signal` duration and the stride lengths) for convolutional layers,
+            each frame with the number of dimensions in the layer.
 
         Raises
         ------
@@ -199,17 +285,24 @@ def process(self, signal):
 
         if self._model_type == 'fairseq':
             out_dict = self.model(signal, features_only=True, mask=False, output_layer=self.layer)
-            data = out_dict["features"][0].squeeze(1).detach().numpy()
+            if self.layer_type == 'encoder':
+                data = out_dict["features"][0].squeeze(1).detach().numpy()
+            elif self.layer_type == 'convolutional':
+                self._cfg['model']['conv_feature_layers'] = str(self._conv_list[:self.layer])
+                self.model.feature_extractor = fairseq.models.hubert.HubertModel.build_model(self._cfg['model'], self._task_cfg).feature_extractor
+                data = self.model.forward_features(signal).transpose(1, 2).squeeze(0).detach().numpy()
         elif self._model_type == 'huggingface':
             out_dict = self.model(signal, output_hidden_states=True)
-            data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy()
-        
+            if self.layer_type == 'encoder':
+                data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy()
+            elif self.layer_type == 'convolutional':
+                self.model.hubert.config.num_feat_extract_layers = self.layer
+                self.model.hubert.feature_extractor = HubertForCTC(self.model.hubert.config).hubert.feature_extractor
+                data = self.model.hubert.feature_extractor(signal).transpose(1, 2).squeeze(0).detach().numpy()
         del out_dict
 
-        # compute the timestamps for each output frame
-        times = np.vstack((
-            np.arange(data.shape[0]) * self.frame_shift,
-            np.arange(data.shape[0]) * self.frame_shift + self.frame_length)).T
-
+        # compute the timestamps for the midpoint of each output frame
+        times = np.vstack((np.arange(data.shape[0]) * self.frame_shift + (self.frame_length / 2))).squeeze(1)
+        
         return Features(
             data, times, properties=self.get_properties())
\ No newline at end of file

From 9fcd2e693ef7994fe81595fb0cafd184c4763483 Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Mon, 31 Mar 2025 12:08:03 -0400
Subject: [PATCH 10/16] combined layer and layer_type into layer_info

---
 shennong/processor/hubert.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index f4adafe..ed14273 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -42,9 +42,8 @@ class HubertProcessor(FeaturesProcessor):
     ----------
     model_path : The path to the pre-trained HuBERT model
 
-    layer : The layer to extract features from
-
-    layer_type : The type of layer to extract features from (encoder or convolutional)
+    layer_info : Tuple with the type of layer to extract features from (encoder or convolutional)
+                 as the first element and the layer number as the second element
 
     Raises
     ------
@@ -59,7 +58,7 @@ class HubertProcessor(FeaturesProcessor):
 
     _SEED = 3939
 
-    def __init__(self, model_path="", layer="", layer_type="encoder"):
+    def __init__(self, model_path="", layer_info=("encoder", "1")):
         super().__init__()
         torch.manual_seed(self._SEED)
         np.random.seed(self._SEED)
@@ -77,6 +76,10 @@ def __init__(self, model_path="", layer="", layer_type="encoder"):
             except:
                 raise RuntimeError(f"The model at {self.model_path} cannot be loaded. Make sure that this is a fairseq model or huggingface model directory.")
         
+        self.layer_info = layer_info
+        layer_type = layer_info[0]
+        layer = layer_info[1]
+
         self.layer_type = layer_type
         self._check_layer(int(layer))
         self.layer = layer

From 907dcd53c0338315762d48675d2caf45728ce1ea Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Mon, 31 Mar 2025 12:49:28 -0400
Subject: [PATCH 11/16] timestamps are now at the beginning and end of each
 frame for consistency with other processors

---
 shennong/processor/hubert.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index ed14273..0e1610b 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -304,8 +304,10 @@ def process(self, signal):
                 data = self.model.hubert.feature_extractor(signal).transpose(1, 2).squeeze(0).detach().numpy()
         del out_dict
 
-        # compute the timestamps for the midpoint of each output frame
-        times = np.vstack((np.arange(data.shape[0]) * self.frame_shift + (self.frame_length / 2))).squeeze(1)
-        
+        # compute the timestamps for each output frame
+        times = np.vstack((
+            np.arange(data.shape[0]) * self.frame_shift,
+            np.arange(data.shape[0]) * self.frame_shift + self.frame_length)).T
+
         return Features(
             data, times, properties=self.get_properties())
\ No newline at end of file

From a157c3bb371d677326bc8f20462d5310b940608c Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Wed, 2 Jul 2025 14:48:03 -0400
Subject: [PATCH 12/16] adjusted example to match current class

---
 shennong/processor/hubert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index 0e1610b..4d3a9f7 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -9,7 +9,7 @@
 >>> from shennong.audio import Audio
 >>> from shennong.processor.hubert import HubertProcessor
 >>> audio = Audio.load('./test/data/test.wav')
->>> processor = HubertProcessor(model_path='/home/exp/mhubert-147', layer=1, layer_type="convolutional")
+>>> processor = HubertProcessor(model_path="facebook/hubert-large-ls960-ft", layer_info=("encoder", 1))
 
 Compute the HuBERT features. the output is an
 instance of :class:`~shennong.features.Features`:

From 337901012629def90673b4a5677a31737c8f5ee1 Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Thu, 3 Jul 2025 15:46:20 -0400
Subject: [PATCH 13/16] added feature extractor preprocessing step

---
 shennong/processor/hubert.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index 4d3a9f7..05db0bb 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -33,7 +33,7 @@
 from ast import literal_eval
 from shennong import Features
 from shennong.processor.base import FeaturesProcessor
-from transformers import HubertForCTC
+from transformers import HubertForCTC, AutoFeatureExtractor
 
 class HubertProcessor(FeaturesProcessor):
     """HuBERT features from a pre-trained neural network
@@ -284,10 +284,11 @@ def process(self, signal):
                 signal.sample_rate, signal.dtype.itemsize * 8, 16000, 32)
             signal = signal.resample(16000).astype(np.float32)
 
-        signal = torch.unsqueeze(torch.from_numpy(signal.data), 0)
-
         if self._model_type == 'fairseq':
-            out_dict = self.model(signal, features_only=True, mask=False, output_layer=self.layer)
+            signal = torch.unsqueeze(torch.from_numpy(signal.data), 0)
+            self.model.feature_extractor = fairseq.models.hubert.HubertModel.build_model(self._cfg['model'], self._task_cfg).feature_extractor
+            input_values = self.model.forward_features(signal).transpose(1, 2).squeeze(0).detach().numpy()
+            out_dict = self.model(input_values, features_only=True, mask=False, output_layer=self.layer)
             if self.layer_type == 'encoder':
                 data = out_dict["features"][0].squeeze(1).detach().numpy()
             elif self.layer_type == 'convolutional':
@@ -295,7 +296,14 @@ def process(self, signal):
                 self.model.feature_extractor = fairseq.models.hubert.HubertModel.build_model(self._cfg['model'], self._task_cfg).feature_extractor
                 data = self.model.forward_features(signal).transpose(1, 2).squeeze(0).detach().numpy()
         elif self._model_type == 'huggingface':
-            out_dict = self.model(signal, output_hidden_states=True)
+            feature_extractor = AutoFeatureExtractor.from_pretrained(self.model_path)
+            input_values = feature_extractor(
+                signal.data,
+                return_tensors="pt",
+                padding=True,
+                sampling_rate=16000,
+            ).input_values
+            out_dict = self.model(input_values, output_hidden_states=True)
             if self.layer_type == 'encoder':
                 data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy()
             elif self.layer_type == 'convolutional':

From 01a197855d44d66f40d0ef270bf7cc48223db173 Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Mon, 7 Jul 2025 11:31:29 -0400
Subject: [PATCH 14/16] added temporary test

---
 shennong/processor/hubert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index 05db0bb..7867817 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -303,6 +303,7 @@ def process(self, signal):
                 padding=True,
                 sampling_rate=16000,
             ).input_values
+            print("feature extractor")
             out_dict = self.model(input_values, output_hidden_states=True)
             if self.layer_type == 'encoder':
                 data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy()

From 920e0daf3ebc2ec21018d1e67d44e4f4177d2ac2 Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Mon, 7 Jul 2025 12:37:02 -0400
Subject: [PATCH 15/16] testing

---
 shennong/processor/hubert.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index 7867817..33ac2d6 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -303,7 +303,6 @@ def process(self, signal):
                 padding=True,
                 sampling_rate=16000,
             ).input_values
-            print("feature extractor")
             out_dict = self.model(input_values, output_hidden_states=True)
             if self.layer_type == 'encoder':
                 data = out_dict["hidden_states"][self.layer][0].squeeze(1).detach().numpy()
@@ -319,4 +318,4 @@ def process(self, signal):
             np.arange(data.shape[0]) * self.frame_shift + self.frame_length)).T
 
         return Features(
-            data, times, properties=self.get_properties())
\ No newline at end of file
+            input_values.numpy(), times, properties=self.get_properties())
\ No newline at end of file

From 60d09d7eb0339236afd81bf73a561ab37f0dc716 Mon Sep 17 00:00:00 2001
From: michaelong7 <michael.ong@mail.utoronto.ca>
Date: Mon, 7 Jul 2025 15:13:59 -0400
Subject: [PATCH 16/16] removed testing code

---
 shennong/processor/hubert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shennong/processor/hubert.py b/shennong/processor/hubert.py
index 33ac2d6..05db0bb 100644
--- a/shennong/processor/hubert.py
+++ b/shennong/processor/hubert.py
@@ -318,4 +318,4 @@ def process(self, signal):
             np.arange(data.shape[0]) * self.frame_shift + self.frame_length)).T
 
         return Features(
-            input_values.numpy(), times, properties=self.get_properties())
\ No newline at end of file
+            data, times, properties=self.get_properties())
\ No newline at end of file