From 43dd712d51dcd6354673c7ed20853ccd90ebc15d Mon Sep 17 00:00:00 2001 From: kweonwoo-jung Date: Fri, 6 Apr 2018 17:52:02 +0900 Subject: [PATCH 1/4] fix label error for semi-train --- data.py | 42 ++++++++++++++++++++++++++++-------------- trainer.py | 6 +++--- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/data.py b/data.py index 3001b34..9b2d8bc 100644 --- a/data.py +++ b/data.py @@ -25,7 +25,8 @@ def __init__(self, mode, label_words_dict, wav_list, add_noise, preprocess_fun, """ self.mode = mode self.label_words_dict = label_words_dict - self.wav_list = wav_list + self.wav_list = wav_list[0] + self.label_list = wav_list[1] self.add_noise = add_noise self.sr = sr self.n_silence = int(len(wav_list) * 0.09) @@ -33,7 +34,7 @@ def __init__(self, mode, label_words_dict, wav_list, add_noise, preprocess_fun, self.preprocess_param = preprocess_param # read all background noise here - self.background_noises = [librosa.load(x, sr=self.sr)[0] for x in glob("../input/train/audio/_background_noise_/*.wav")] + self.background_noises = [librosa.load(x, sr=self.sr)[0] for x in glob("/home1/irteam/.kaggle/competitions/tensorflow-speech-recognition-challenge/train/audio/_background_noise_/*.wav")] self.resize_shape = resize_shape self.is_1d = is_1d @@ -100,8 +101,7 @@ def __getitem__(self, idx): if self.mode == 'test': return {'spec': wav_tensor, 'id': self.wav_list[idx]} - label = self.label_words_dict[self.wav_list[idx].split("/")[-2]] if self.wav_list[idx].split( - "/")[-2] in self.label_words_dict else len(self.label_words_dict) + label = self.label_words_dict.get(self.label_list[idx], len(self.label_words_dict)) return {'spec': wav_tensor, 'id': self.wav_list[idx], 'label': label} @@ -127,20 +127,23 @@ def get_label_dict(): def get_wav_list(words, unknown_ratio=0.2): - full_train_list = glob("../input/train/audio/*/*.wav") - full_test_list = glob("../input/test/audio/*.wav") + full_train_list = 
glob("/home1/irteam/.kaggle/competitions/tensorflow-speech-recognition-challenge/train/audio/*/*.wav") + full_test_list = glob("/home1/irteam/.kaggle/competitions/tensorflow-speech-recognition-challenge/test/audio/*.wav") # sample full train list sampled_train_list = [] + labels = [] for w in full_train_list: l = w.split("/")[-2] if l not in words: if random.random() < unknown_ratio: sampled_train_list.append(w) + labels.append('unknown') else: sampled_train_list.append(w) + labels.append(l) - return sampled_train_list, full_test_list + return sampled_train_list, labels def get_sub_list(num, sub_path): @@ -148,17 +151,28 @@ def get_sub_list(num, sub_path): df = pd.read_csv(sub_path) words = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown'] each_num = int(num * 0.085) + labels = [] for w in words: tmp = df['fname'][df['label'] == w].sample(each_num).tolist() - lst += ["../input/test/audio/" + x for x in tmp] - return lst + lst += ["/home1/irteam/.kaggle/competitions/tensorflow-speech-recognition-challenge/test/audio/" + x for x in tmp] + for _ in range(len(tmp)): + labels.append(w) + return lst, labels def get_semi_list(words, sub_path, unknown_ratio=0.2, test_ratio=0.2): - train_list, _ = get_wav_list(words=words, unknown_ratio=unknown_ratio) - test_list = get_sub_list(num=int(len(train_list) * test_ratio), sub_path=sub_path) - lst = train_list + test_list - return sample(lst, len(lst)) + train_list, train_labels = get_wav_list(words=words, unknown_ratio=unknown_ratio) + test_list, test_labels = get_sub_list(num=int(len(train_list) * test_ratio), sub_path=sub_path) + file_list = train_list + test_list + label_list = train_labels + test_labels + assert(len(file_list) == len(label_list)) + + random.seed(2018) + file_list = sample(file_list, len(file_list)) + random.seed(2018) + label_list = sample(label_list, len(label_list)) + + return file_list, label_list def preprocess_mfcc(wave): @@ -189,4 +203,4 @@ def 
preprocess_wav(wav, normalization=True): if normalization: mean = data.mean() data -= mean - return data \ No newline at end of file + return data diff --git a/trainer.py b/trainer.py index 7318aa7..ba3ca4e 100644 --- a/trainer.py +++ b/trainer.py @@ -65,18 +65,18 @@ def get_model(model=model_class, m=MGPU, pretrained=pretrained): optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, speechmodel.parameters()), lr=learning_rate, momentum=0.9, weight_decay=0.00001) speechmodel.train() if semi_train_path: - train_list = get_semi_list(words=label_to_int.keys(), sub_path=semi_train_path, + train_list, label_list = get_semi_list(words=label_to_int.keys(), sub_path=semi_train_path, test_ratio=choice([0.2, 0.25, 0.3, 0.35])) print("semi training list length: ", len(train_list)) else: - train_list, _ = get_wav_list(words=label_to_int.keys()) + train_list, label_list = get_wav_list(words=label_to_int.keys()) if pretraining: traindataset = PreDataset(label_words_dict=label_to_int, add_noise=True, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param, resize_shape=reshape_size, is_1d=is_1d) else: - traindataset = SpeechDataset(mode='train', label_words_dict=label_to_int, wav_list=train_list, + traindataset = SpeechDataset(mode='train', label_words_dict=label_to_int, wav_list=(train_list, label_list), add_noise=True, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param, resize_shape=reshape_size, is_1d=is_1d) trainloader = DataLoader(traindataset, BATCH_SIZE, shuffle=True) From a9e4b698f133212433e59ce28e44f2e18e8d6b07 Mon Sep 17 00:00:00 2001 From: kweonwoo-jung Date: Fri, 6 Apr 2018 17:53:59 +0900 Subject: [PATCH 2/4] fix data path --- data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/data.py b/data.py index 9b2d8bc..d5017e1 100644 --- a/data.py +++ b/data.py @@ -34,7 +34,7 @@ def __init__(self, mode, label_words_dict, wav_list, add_noise, preprocess_fun, self.preprocess_param = preprocess_param # read all 
background noise here - self.background_noises = [librosa.load(x, sr=self.sr)[0] for x in glob("/home1/irteam/.kaggle/competitions/tensorflow-speech-recognition-challenge/train/audio/_background_noise_/*.wav")] + self.background_noises = [librosa.load(x, sr=self.sr)[0] for x in glob("../input/train/audio/_background_noise_/*.wav")] self.resize_shape = resize_shape self.is_1d = is_1d @@ -127,8 +127,8 @@ def get_label_dict(): def get_wav_list(words, unknown_ratio=0.2): - full_train_list = glob("/home1/irteam/.kaggle/competitions/tensorflow-speech-recognition-challenge/train/audio/*/*.wav") - full_test_list = glob("/home1/irteam/.kaggle/competitions/tensorflow-speech-recognition-challenge/test/audio/*.wav") + full_train_list = glob("../input/train/audio/*/*.wav") + full_test_list = glob("../input/test/audio/*.wav") # sample full train list sampled_train_list = [] @@ -154,7 +154,7 @@ def get_sub_list(num, sub_path): labels = [] for w in words: tmp = df['fname'][df['label'] == w].sample(each_num).tolist() - lst += ["/home1/irteam/.kaggle/competitions/tensorflow-speech-recognition-challenge/test/audio/" + x for x in tmp] + lst += ["../input/test/audio/" + x for x in tmp] for _ in range(len(tmp)): labels.append(w) return lst, labels From bae1ecae07e3a9f6aa6dbb1cd44d4d7dc272c555 Mon Sep 17 00:00:00 2001 From: kweonwoo-jung Date: Sat, 7 Apr 2018 11:33:17 +0900 Subject: [PATCH 3/4] update for inference --- data.py | 10 +++++----- trainer.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/data.py b/data.py index d5017e1..cfcbed5 100644 --- a/data.py +++ b/data.py @@ -132,18 +132,18 @@ def get_wav_list(words, unknown_ratio=0.2): # sample full train list sampled_train_list = [] - labels = [] + sampled_train_labels = [] for w in full_train_list: l = w.split("/")[-2] if l not in words: if random.random() < unknown_ratio: sampled_train_list.append(w) - labels.append('unknown') + sampled_train_labels.append('unknown') else: sampled_train_list.append(w) -
labels.append(l) + sampled_train_labels.append(l) - return sampled_train_list, labels + return sampled_train_list, sampled_train_labels, full_test_list def get_sub_list(num, sub_path): @@ -161,7 +161,7 @@ def get_sub_list(num, sub_path): def get_semi_list(words, sub_path, unknown_ratio=0.2, test_ratio=0.2): - train_list, train_labels = get_wav_list(words=words, unknown_ratio=unknown_ratio) + train_list, train_labels, _ = get_wav_list(words=words, unknown_ratio=unknown_ratio) test_list, test_labels = get_sub_list(num=int(len(train_list) * test_ratio), sub_path=sub_path) file_list = train_list + test_list label_list = train_labels + test_labels diff --git a/trainer.py b/trainer.py index ba3ca4e..cb38705 100644 --- a/trainer.py +++ b/trainer.py @@ -69,7 +69,7 @@ def get_model(model=model_class, m=MGPU, pretrained=pretrained): test_ratio=choice([0.2, 0.25, 0.3, 0.35])) print("semi training list length: ", len(train_list)) else: - train_list, label_list = get_wav_list(words=label_to_int.keys()) + train_list, label_list, _ = get_wav_list(words=label_to_int.keys()) if pretraining: traindataset = PreDataset(label_words_dict=label_to_int, @@ -108,8 +108,8 @@ def get_model(model=model_class, m=MGPU, pretrained=pretrained): trained_models = ["model/model_%s_%s.pth" % (CODER, b) for b in range(bagging_num)] # prediction - _, test_list = get_wav_list(words=label_to_int.keys()) - testdataset = SpeechDataset(mode='test', label_words_dict=label_to_int, wav_list=test_list, + _, _, test_list = get_wav_list(words=label_to_int.keys()) + testdataset = SpeechDataset(mode='test', label_words_dict=label_to_int, wav_list=(test_list, []), add_noise=False, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param, resize_shape=reshape_size, is_1d=is_1d) testloader = DataLoader(testdataset, BATCH_SIZE, shuffle=False) From 0883228eefd55b9c28461c254e956d3233bd3c0f Mon Sep 17 00:00:00 2001 From: kweonwoo-jung Date: Wed, 11 Apr 2018 15:16:08 +0900 Subject: [PATCH 4/4] fix name --- data.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data.py b/data.py index cfcbed5..400693d 100644 --- a/data.py +++ b/data.py @@ -29,7 +29,7 @@ def __init__(self, mode, label_words_dict, wav_list, add_noise, preprocess_fun, self.label_list = wav_list[1] self.add_noise = add_noise self.sr = sr - self.n_silence = int(len(wav_list) * 0.09) + self.n_silence = int(len(self.wav_list) * 0.09) self.preprocess_fun = preprocess_fun self.preprocess_param = preprocess_param