diff --git a/README.md b/README.md
index 3021acc..3e4d36d 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+Forked from [avisingh599](https://github.com/avisingh599/visual-qa.git)
+
 # Deep Learning for Visual Question Answering
 
 [Click here](https://avisingh599.github.io/deeplearning/visual-qa/) to go to the accompanying blog post.
diff --git a/scripts/README.md b/scripts/README.md
old mode 100644
new mode 100755
diff --git a/scripts/demo_batch.py b/scripts/demo_batch.py
old mode 100644
new mode 100755
diff --git a/scripts/dumpText.py b/scripts/dumpText.py
old mode 100644
new mode 100755
diff --git a/scripts/evaluateLSTM.py b/scripts/evaluateLSTM.py
old mode 100644
new mode 100755
diff --git a/scripts/evaluateMLP.py b/scripts/evaluateMLP.py
old mode 100644
new mode 100755
index 483836e..413b329
--- a/scripts/evaluateMLP.py
+++ b/scripts/evaluateMLP.py
@@ -26,7 +26,7 @@ def main():
 		'r').read().decode('utf8').splitlines()
 	answers_val = open('../data/preprocessed/answers_val2014_all.txt',
 		'r').read().decode('utf8').splitlines()
-	images_val = open('../data/preprocessed/images_val2014.txt',
+	images_val = open('../data/preprocessed/images_val2014_all.txt',
 		'r').read().decode('utf8').splitlines()
 	vgg_model_path = '../features/coco/vgg_feats.mat'
 
diff --git a/scripts/extract_features.py b/scripts/extract_features.py
old mode 100644
new mode 100755
diff --git a/scripts/features.py b/scripts/features.py
old mode 100644
new mode 100755
diff --git a/scripts/get_started.sh b/scripts/get_started.sh
old mode 100644
new mode 100755
diff --git a/scripts/own_image.py b/scripts/own_image.py
old mode 100644
new mode 100755
diff --git a/scripts/trainLSTM_1.py b/scripts/trainLSTM_1.py
old mode 100644
new mode 100755
index 1b205e3..1b2a7b7
--- a/scripts/trainLSTM_1.py
+++ b/scripts/trainLSTM_1.py
@@ -4,7 +4,8 @@ import argparse
 
 from keras.models import Sequential
-from keras.layers.core import Dense, Activation, Merge, Dropout, Reshape
+from keras.layers.core import Dense, Activation, Dropout, Reshape
+from keras.layers import Merge
 from keras.layers.recurrent import LSTM
 from keras.utils import np_utils, generic_utils
 from keras.callbacks import ModelCheckpoint, RemoteMonitor
@@ -21,7 +22,7 @@ def main():
 
 	parser = argparse.ArgumentParser()
 	parser.add_argument('-num_hidden_units_mlp', type=int, default=1024)
-	parser.add_argument('-num_hidden_units_lstm', type=int, default=512)
+	parser.add_argument('-num_hidden_units_lstm', type=int, default=4096)
 	parser.add_argument('-num_hidden_layers_mlp', type=int, default=3)
 	parser.add_argument('-num_hidden_layers_lstm', type=int, default=1)
 	parser.add_argument('-dropout', type=float, default=0.5)
@@ -56,13 +57,14 @@ def main():
 	joblib.dump(labelencoder,'../models/labelencoder.pkl')
 
 	image_model = Sequential()
-	image_model.add(Reshape(input_shape = (img_dim,), dims=(img_dim,)))
+	#image_model.add(Reshape(input_shape = (img_dim,), dims=(img_dim,)))
+	image_model.add(Reshape((4096,), input_shape=(4096,)))#input_shape = (img_dim,), dims=(img_dim,)))
 
 	language_model = Sequential()
 	if args.num_hidden_layers_lstm == 1:
-		language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=False, input_shape=(max_len, word_vec_dim)))
+		language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=False, input_shape=(None, word_vec_dim)))
 	else:
-		language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=True, input_shape=(max_len, word_vec_dim)))
+		language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=True, input_shape=(None, word_vec_dim)))
 		for i in xrange(args.num_hidden_layers_lstm-2):
 			language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=True))
 		language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=False))
@@ -109,6 +111,7 @@ def main():
 			X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps)
 			X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
 			Y_batch = get_answers_matrix(an_batch, labelencoder)
+			print X_q_batch.shape
 			loss = model.train_on_batch([X_q_batch, X_i_batch], Y_batch)
 			progbar.add(args.batch_size, values=[("train loss", loss)])
 
@@ -119,4 +122,4 @@ def main():
 		model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k))
 
 if __name__ == "__main__":
-	main()
\ No newline at end of file
+	main()
diff --git a/scripts/trainLSTM_language.py b/scripts/trainLSTM_language.py
old mode 100644
new mode 100755
index 1c46648..e7559d0
--- a/scripts/trainLSTM_language.py
+++ b/scripts/trainLSTM_language.py
@@ -27,7 +27,7 @@ def main():
 	questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines()
 	questions_lengths_train = open('../data/preprocessed/questions_lengths_train2014.txt', 'r').read().decode('utf8').splitlines()
-	answers_train = open('../data/preprocessed/answers_train2014.txt', 'r').read().decode('utf8').splitlines()
+	answers_train = open('../data/preprocessed/answers_train2014_modal.txt', 'r').read().decode('utf8').splitlines()
 	images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines()
 	max_answers = 1000
 	questions_train, answers_train, images_train = selectFrequentAnswers(questions_train,answers_train,images_train, max_answers)
 
diff --git a/scripts/trainMLP.py b/scripts/trainMLP.py
old mode 100644
new mode 100755
index 42c7a4d..468ba69
--- a/scripts/trainMLP.py
+++ b/scripts/trainMLP.py
@@ -22,13 +22,13 @@ def main():
 
 	parser = argparse.ArgumentParser()
 	parser.add_argument('-num_hidden_units', type=int, default=1024)
-	parser.add_argument('-num_hidden_layers', type=int, default=3)
-	parser.add_argument('-dropout', type=float, default=0.5)
+	parser.add_argument('-num_hidden_layers', type=int, default=5)
+	parser.add_argument('-dropout', type=float, default=0.2)
 	parser.add_argument('-activation', type=str, default='tanh')
 	parser.add_argument('-language_only', type=bool, default= False)
-	parser.add_argument('-num_epochs', type=int, default=100)
+	parser.add_argument('-num_epochs', type=int, default=50)
 	parser.add_argument('-model_save_interval', type=int, default=10)
-	parser.add_argument('-batch_size', type=int, default=128)
+	parser.add_argument('-batch_size', type=int, default=256)
 	args = parser.parse_args()
 
 	questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines()
@@ -42,7 +42,7 @@ def main():
 	labelencoder = preprocessing.LabelEncoder()
 	labelencoder.fit(answers_train)
 	nb_classes = len(list(labelencoder.classes_))
-	joblib.dump(labelencoder,'../models/labelencoder.pkl')
+	joblib.dump(labelencoder,'../models3/labelencoder.pkl')
 
 	features_struct = scipy.io.loadmat(vgg_model_path)
 	VGGfeatures = features_struct['feats']
@@ -76,9 +76,9 @@ def main():
 
 	json_string = model.to_json()
 	if args.language_only:
-		model_file_name = '../models/mlp_language_only_num_hidden_units_' + str(args.num_hidden_units) + '_num_hidden_layers_' + str(args.num_hidden_layers)
+		model_file_name = '../models3/mlp_language_only_num_hidden_units_' + str(args.num_hidden_units) + '_num_hidden_layers_' + str(args.num_hidden_layers)
 	else:
-		model_file_name = '../models/mlp_num_hidden_units_' + str(args.num_hidden_units) + '_num_hidden_layers_' + str(args.num_hidden_layers)
+		model_file_name = '../models3/mlp_num_hidden_units_' + str(args.num_hidden_units) + '_num_hidden_layers_' + str(args.num_hidden_layers)
 	open(model_file_name + '.json', 'w').write(json_string)
 
 	print 'Compiling model...'
@@ -113,4 +113,4 @@ def main():
 		model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k))
 
 if __name__ == "__main__":
-	main()
\ No newline at end of file
+	main()
diff --git a/scripts/utils.py b/scripts/utils.py
old mode 100644
new mode 100755
diff --git a/scripts/vgg_features.prototxt b/scripts/vgg_features.prototxt
old mode 100644
new mode 100755