Python/Spectrogram at main · DracoRex679/Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
import os
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display


def squeeze(audio, labels):
    """Remove the last axis of ``audio`` and pass ``labels`` through untouched.

    Used with ``Dataset.map`` so each element keeps its labels while the
    audio loses its trailing axis (presumably the single audio channel --
    confirm against the dataset's element spec).

    Args:
        audio: Tensor whose final axis is to be squeezed out.
        labels: Labels paired with ``audio``; returned unchanged.

    Returns:
        A ``(audio, labels)`` tuple with the trailing axis of ``audio`` removed.
    """
    return tf.squeeze(audio, axis=-1), labels


def get_spectrogram(waveform):
    """Convert a waveform into a magnitude spectrogram with a trailing axis.

    Args:
        waveform: Audio samples accepted by ``tf.signal.stft``.

    Returns:
        The absolute value of the short-time Fourier transform of
        ``waveform`` (frame length 255, frame step 128) with one extra axis
        appended at the end.
    """
    stft = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    # Keep only the magnitude; the phase of the complex STFT is discarded.
    magnitude = tf.abs(stft)
    # Append a trailing axis (presumably a channel axis for the conv layers
    # that consume this -- confirm against the model input).
    return magnitude[..., tf.newaxis]


def plot_spectrogram(spectrogram, ax):
    """Draw the log-scaled, transposed ``spectrogram`` on ``ax``.

    Args:
        spectrogram: Array of rank 2, or rank 3 with a trailing axis of
            size 1 (which is squeezed away). Any other rank above 2 trips
            the assertion.
        ax: Object exposing ``pcolormesh(X, Y, C)``, e.g. a matplotlib Axes.

    Returns:
        None. The only effect is the ``ax.pcolormesh`` call.
    """
    rank = len(spectrogram.shape)
    if rank > 2:
        assert rank == 3
        spectrogram = np.squeeze(spectrogram, axis=-1)

    # Transpose so the second input axis runs along y; the machine epsilon
    # keeps the logarithm finite where the spectrogram is exactly zero.
    epsilon = np.finfo(float).eps
    log_spec = np.log(spectrogram.T + epsilon)
    n_rows, n_cols = log_spec.shape[0], log_spec.shape[1]

    # x coordinates are spread evenly from 0 to the total element count.
    x_coords = np.linspace(0, np.size(spectrogram), num=n_cols, dtype=int)
    y_coords = range(n_rows)
    ax.pcolormesh(x_coords, y_coords, log_spec)


def make_spectrogram(ds):
    """Map a dataset of ``(audio, label)`` pairs to ``(spectrogram, label)``.

    Args:
        ds: Object exposing ``map(map_func=..., num_parallel_calls=...)``,
            e.g. a ``tf.data.Dataset`` yielding ``(audio, label)`` elements.

    Returns:
        Whatever ``ds.map`` returns: the same elements with the audio
        replaced by ``get_spectrogram(audio)`` and the label left as is.
    """
    def _to_spectrogram(audio, label):
        return get_spectrogram(audio), label

    return ds.map(
        map_func=_to_spectrogram,
        num_parallel_calls=tf.data.AUTOTUNE,
    )


if __name__ == '__main__':
    # Location of the speech-commands dataset on disk.
    DATASET_PATH = 'data/mini_speech_commands'
    data_dir = pathlib.Path(DATASET_PATH)

    # Download and extract mini_speech_commands.zip only when the dataset
    # directory does not exist yet.
    if not data_dir.exists():
        tf.keras.utils.get_file(
            'mini_speech_commands.zip',
            origin="http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip",
            extract=True,
            cache_dir='.', cache_subdir='data')

    # List the dataset directory and drop the entries that are not commands.
    # The macOS metadata file is named '.DS_Store' (leading dot); filtering
    # on 'DS_Store' never matched it.
    commands = np.array(tf.io.gfile.listdir(str(data_dir)))
    commands = commands[(commands != 'README.md') & (commands != '.DS_Store')]
    print(f'Commands: {commands}')

    # Build the training and validation datasets from the directory:
    # batches of 64, 20% held out for validation, seed 0, and an output
    # sequence length of 16000 for every clip.
    train, values = tf.keras.utils.audio_dataset_from_directory(
        directory=data_dir,
        batch_size=64,
        validation_split=0.2,
        seed=0,
        output_sequence_length=16000,
        subset='both'
    )
    # Integer labels index into this array (see the plot titles below).
    label_names = np.array(train.class_names)
    print()
    print(f'label names:{label_names}')

    print(train.element_spec)

    # Drop the trailing audio axis from every element of both datasets.
    train = train.map(squeeze, tf.data.AUTOTUNE)
    values = values.map(squeeze, tf.data.AUTOTUNE)

    # Split the held-out data in two: shard 0 becomes the test set and
    # shard 1 stays as the validation set.
    test = values.shard(num_shards=2, index=0)
    values = values.shard(num_shards=2, index=1)

    # Grab one training batch and print the shapes of its audio and labels.
    # example_audio / example_labels stay bound after the loop and are
    # reused by the plots below.
    for example_audio, example_labels in train.take(1):
        print(example_audio.shape)
        print(example_labels.shape)

    # Plot the first nine waveforms of the example batch in a 3x3 grid.
    plt.figure(figsize=(16, 10))
    rows = 3
    cols = 3
    n = rows * cols
    for i in range(n):
        plt.subplot(rows, cols, i+1)
        audio_signal = example_audio[i]
        plt.plot(audio_signal)
        plt.title(label_names[example_labels[i]])
        plt.yticks(np.arange(-1.2, 1.2, 0.2))
        plt.ylim(-1.1, 1.1)
    plt.show()

    # For the first three examples, compute the spectrogram and report the
    # label and shapes. label / waveform / spectrogram keep the values of
    # the last iteration and are plotted right after the loop.
    for i in range(3):
        label = label_names[example_labels[i]]
        waveform = example_audio[i]
        spectrogram = get_spectrogram(waveform)

        print(f'Label: {label}')
        print(f'Waveform shape: {waveform.shape}')
        print(f'Spectrogram shape: {spectrogram.shape}')
        print('Audio playback')
        # Audio playback widget (per the original author, this only plays
        # inside a Jupyter notebook).
        display.display(display.Audio(waveform, rate=16000))

    # Show the last waveform from the loop above on top, with its
    # spectrogram underneath.
    fig, axes = plt.subplots(2, figsize=(12, 8))
    timescale = np.arange(waveform.shape[0])
    axes[0].plot(timescale, waveform.numpy())
    axes[0].set_title('Waveform')
    axes[0].set_xlim([0, 16000])

    plot_spectrogram((spectrogram.numpy()), axes[1])
    axes[1].set_title('Spectrogram')
    plt.suptitle(label.title())
    plt.show()

    # Convert the train, validation, and test datasets to spectrograms.
    train_spectrogram = make_spectrogram(train)
    values_spectrogram = make_spectrogram(values)
    test_spectrogram = make_spectrogram(test)

    # Pull a single batch of spectrograms; the loop exists only to bind
    # example_spectrograms / example_spect_labels.
    for example_spectrograms, example_spect_labels in train_spectrogram.take(1):
        break

    # Plot the first nine spectrograms of that batch in a 3x3 grid.
    rows = 3
    cols = 3
    n = rows * cols
    fig, axes = plt.subplots(rows, cols, figsize=(16, 9))

    for i in range(n):
        r = i // cols
        c = i % cols
        ax = axes[r][c]
        plot_spectrogram(example_spectrograms[i].numpy(), ax)
        ax.set_title(label_names[example_spect_labels[i].numpy()])

    plt.show()

    # Cache and prefetch every split; only the training split is shuffled.
    # The test pipeline must be built from test_spectrogram: deriving it
    # from values_spectrogram made the "test" results a repeat of the
    # validation data.
    train_spectrogram = train_spectrogram.cache().shuffle(10000).prefetch(tf.data.AUTOTUNE)
    values_spectrogram = values_spectrogram.cache().prefetch(tf.data.AUTOTUNE)
    test_spectrogram = test_spectrogram.cache().prefetch(tf.data.AUTOTUNE)

    # The model input shape is the example batch shape without its leading
    # (batch) axis; there is one output unit per label name.
    input_shape = example_spectrograms.shape[1:]
    print('Input shape: ', input_shape)
    num_labels = len(label_names)

    # Adapt the normalization layer on the training spectrograms only
    # (the labels are dropped by the map).
    norm_layer = layers.Normalization()
    norm_layer.adapt(data=train_spectrogram.map(map_func=lambda spec, label: spec))

    # Sequential network: resize to 32x32, normalize, two ReLU convolutions,
    # pooling and dropout, then a dense head. The last layer has no
    # activation, so the model outputs logits.
    model = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Resizing(32, 32),
        norm_layer,
        layers.Conv2D(32, 3, activation='relu'),
        layers.Conv2D(64, 3, activation='relu'),
        layers.MaxPooling2D(),
        layers.Dropout(0.25),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_labels),
    ])

    # Print the model summary, then compile with the Adam optimizer, sparse
    # categorical cross-entropy on logits, and the accuracy metric.
    model.summary()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy']
    )

    # Train for at most EPOCHS epochs, with an early-stopping callback
    # (patience of 2) attached.
    EPOCHS = 12
    history = model.fit(
        train_spectrogram,
        validation_data=values_spectrogram,
        epochs=EPOCHS,
        callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
    )

    # Plot training/validation loss (left) and accuracy in percent (right).
    metrics = history.history
    plt.figure(figsize=(16, 6))
    plt.subplot(1, 2, 1)
    plt.plot(history.epoch, metrics['loss'], metrics['val_loss'])
    plt.legend(['loss', 'val_loss'])
    plt.ylim([0, max(plt.ylim())])
    plt.xlabel('Epoch')
    plt.ylabel('Loss [CrossEntropy]')

    plt.subplot(1, 2, 2)
    plt.plot(history.epoch, 100*np.array(metrics['accuracy']), 100*np.array(metrics['val_accuracy']))
    plt.legend(['accuracy', 'val_accuracy'])
    plt.ylim([0, 100])
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy [%]')
    plt.show()

    # Evaluate the trained model on the test split.
    model.evaluate(test_spectrogram, return_dict=True)

    # Predicted class = argmax over the logits; the true labels are
    # gathered by concatenating the label batches of the test split.
    y_pred = model.predict(test_spectrogram)
    y_pred = tf.argmax(y_pred, axis=1)
    y_true = tf.concat(list(test_spectrogram.map(lambda s,lab: lab)), axis=0)

    # Show the confusion matrix of y_true against y_pred as a heatmap.
    confusion_matrix = tf.math.confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(confusion_matrix,
                xticklabels=label_names,
                yticklabels=label_names,
                annot=True, fmt='g')
    plt.xlabel('Prediction')
    plt.ylabel('Label')
    plt.show()

    # Load a single 'no' clip and prepare it the same way as the training
    # data: decode, squeeze the trailing axis, convert to a spectrogram, and
    # add a leading batch axis.
    x = data_dir/'no/01bb6a2a_nohash_0.wav'
    x = tf.io.read_file(str(x))
    x, sample_rate = tf.audio.decode_wav(x, desired_channels=1, desired_samples=16000)
    x = tf.squeeze(x, axis=-1)
    waveform = x
    x = get_spectrogram(x)
    x = x[tf.newaxis]

    # Plot the softmax of the model output as a bar chart. The outputs are
    # indexed the same way as the training labels, i.e. by label_names, so
    # those names (not a hand-written list in a different order) must label
    # the bars.
    prediction = model(x)
    x_labels = label_names
    plt.bar(x_labels, tf.nn.softmax(prediction[0]))
    plt.title('No')
    plt.show()

    # Audio playback of the clip that was just classified (per the original
    # author, this only plays inside a Jupyter notebook).
    display.display(display.Audio(waveform, rate=16000))