import numpy as np
import pandas as pd
import random
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from skimage.transform import resize
from keras.layers import *
from keras.models import *
from keras import layers

from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.applications.resnet50 import ResNet50
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.applications.xception import Xception

from keras.utils.data_utils import get_file
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam

# Magnification levels and tumor sub-type class names of the BreakHist dataset.
# cancer_list order defines the integer label of each class (benign first).
magnification_list = ['40X', '100X', '200X', '400X']
benign_list = ['adenosis', 'fibroadenoma', 'phyllodes_tumor', 'tubular_adenoma']
malignant_list = ['ductal_carcinoma', 'lobular_carcinoma', 'mucinous_carcinoma', 'papillary_carcinoma']
cancer_list = benign_list + malignant_list

# Root of the on-disk dataset; <root>/<magnification>/<Benign|Malignant>/<class>/...
DATASET_ROOT = r"D:/Machine Learning/Datasets/BreakHist_Dataset/"


def dense_to_one_hot(labels_dense, num_classes):
    """Convert a 1-D array of integer class labels to a one-hot matrix.

    Parameters
    ----------
    labels_dense : np.ndarray
        Integer labels, one per sample (values in [0, num_classes)).
    num_classes : int
        Number of columns of the resulting matrix.

    Returns
    -------
    np.ndarray of shape (len(labels_dense), num_classes) with a single 1
    per row, placed via flat indexing.
    """
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot


def data_split(magnification='40X', validation_percent=0.15, testing_percent=0.15, encoding='Yes'):
    """Load the BreakHist images for one magnification and split them.

    Walks DATASET_ROOT/<magnification>, reads every image, resizes it to
    115x175x3, and splits each class directory sequentially into
    training / validation / testing portions.

    Parameters
    ----------
    magnification : str
        One of magnification_list ('40X', '100X', '200X', '400X').
    validation_percent, testing_percent : float
        Per-class fractions reserved for validation and testing.
    encoding : str
        'Yes' (default) one-hot encodes the label arrays as float32.

    Returns
    -------
    (training_images, training_labels, validation_images, validation_labels,
     testing_images, testing_labels) as numpy arrays.
    """
    training_images, training_labels = [], []
    validation_images, validation_labels = [], []
    testing_images, testing_labels = [], []
    for root, dirnames, filenames in os.walk(DATASET_ROOT + magnification):
        if not filenames:
            continue
        # The class name is the deepest directory name.  This replaces the
        # original hard-coded string-offset parsing, which was tied to one
        # absolute path and broke if the dataset moved.
        # Assumes images live only in leaf <class> directories — matches the
        # layout the original offsets implied; TODO confirm on disk.
        name = os.path.basename(os.path.normpath(root))
        total_images = len(filenames)
        print(name, magnification, total_images)
        # Plain int(): np.int was deprecated in NumPy 1.20 and later removed.
        validation_size = int(total_images * validation_percent)
        testing_size = int(total_images * testing_percent)
        training_size = total_images - (validation_size + testing_size)
        print(training_size, validation_size, testing_size, total_images)
        label = cancer_list.index(name)
        for num, fname in enumerate(filenames):
            filepath = os.path.join(root, fname)
            image = mpimg.imread(filepath)
            image_resize = resize(image, (115, 175), mode='constant')
            # 0-based position comparisons: the original 1-based counter with
            # `num in range(training_size)` dropped one image from the
            # training split and one from the testing split of every class.
            if num < training_size:
                training_images.append(image_resize)
                training_labels.append(label)
            elif num < training_size + validation_size:
                validation_images.append(image_resize)
                validation_labels.append(label)
            else:
                testing_images.append(image_resize)
                testing_labels.append(label)

    training_images = np.asarray(training_images)
    validation_images = np.asarray(validation_images)
    testing_images = np.asarray(testing_images)

    training_labels = np.asarray(training_labels)
    validation_labels = np.asarray(validation_labels)
    testing_labels = np.asarray(testing_labels)

    if encoding == 'Yes':
        labels_count = np.unique(training_labels).shape[0]
        training_labels = dense_to_one_hot(training_labels, labels_count).astype(np.float32)
        validation_labels = dense_to_one_hot(validation_labels, labels_count).astype(np.float32)
        testing_labels = dense_to_one_hot(testing_labels, labels_count).astype(np.float32)

    print(training_images.shape[0], validation_images.shape[0], testing_images.shape[0])

    return (training_images, training_labels, validation_images, validation_labels,
            testing_images, testing_labels)


image_width, image_height = 210, 180  # 420, 360
num_classes = 8
dropout = 0.35


def _classification_head(base_model, model_name):
    """Attach the shared dense softmax head to a pretrained base and name it.

    Factored out of the six builders below, which all duplicated it; the
    final layer now uses num_classes — the originals for Xception and
    InceptionResNetV2 referenced an undefined name `classes` (NameError).
    """
    x = base_model.output
    x = Dense(1024, activation='relu')(x)
    x = Dense(256, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(16, activation='relu')(x)
    x = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=base_model.input, outputs=x)
    model.name = model_name
    return model


def _imagenet(load_weights):
    """Map the load_weights flag to the Keras `weights` argument.

    The original honored load_weights only in vgg19_model; the other five
    builders ignored it and always loaded ImageNet weights.
    """
    return 'imagenet' if load_weights else None


def xception_model(load_weights=True):
    """Xception base (max pooling) + shared classification head."""
    base_model = Xception(include_top=False, weights=_imagenet(load_weights), input_tensor=None,
                          input_shape=(image_width, image_height, 3), pooling='max')
    return _classification_head(base_model, 'xception')


def vgg16_model(load_weights=True):
    """VGG16 base (max pooling) + shared classification head."""
    base_model = VGG16(include_top=False, weights=_imagenet(load_weights), input_tensor=None,
                       input_shape=(image_width, image_height, 3), pooling='max')
    return _classification_head(base_model, 'vgg16')


def vgg19_model(load_weights=True):
    """VGG19 base (max pooling) + shared classification head."""
    base_model = VGG19(include_top=False, weights=_imagenet(load_weights), input_tensor=None,
                       input_shape=(image_width, image_height, 3), pooling='max')
    return _classification_head(base_model, 'vgg19')


def resnet50_model(load_weights=True):
    """ResNet50 base (average pooling) + shared classification head."""
    base_model = ResNet50(include_top=False, weights=_imagenet(load_weights), input_tensor=None,
                          input_shape=(image_width, image_height, 3), pooling='avg')
    return _classification_head(base_model, 'resnet')


def inception_model(load_weights=True):
    """InceptionV3 base (average pooling) + shared classification head."""
    base_model = InceptionV3(include_top=False, weights=_imagenet(load_weights), input_tensor=None,
                             input_shape=(image_width, image_height, 3), pooling='avg')
    return _classification_head(base_model, 'inception')


def inception_resnet_model(load_weights=True):
    """InceptionResNetV2 base (average pooling) + shared classification head."""
    base_model = InceptionResNetV2(include_top=False, weights=_imagenet(load_weights), input_tensor=None,
                                   input_shape=(image_width, image_height, 3), pooling='avg')
    return _classification_head(base_model, 'inception_resnet')
"source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "models = [vgg16_model, vgg19_model, xception_model, resnet50_model, inception_model, inception_resnet_model]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "def compile_n_fit(validation_percent, testing_percent, load_wt, image_width=175, image_height=115,dropout = 0.3, model_name = 'vgg16_model', magnification = '40X'):\n", + " training_images, training_labels, validation_images, validation_labels, testing_images, testing_labels = data_split(magnification = magnification, validation_percent = validation_percent, testing_percent = testing_percent)\n", + " for i in range(len(models)):\n", + " if models[i].__name__ == model_name:\n", + " model = models[i]\n", + " \n", + " model = model(load_weights = load_wt)\n", + " try:\n", + " model.load_weights(model_name + '_weight_1.h5')\n", + " print('Weights loaded!')\n", + " except:\n", + " print('No weights defined!')\n", + "# pass\n", + " \n", + " model.compile(loss=\"categorical_crossentropy\", optimizer=Adam(lr=0.0001), metrics=['accuracy'])\n", + " early_stopping = EarlyStopping(patience=10, verbose=2)\n", + " model_checkpoint = ModelCheckpoint(model_name + \"_combine\" +\".model\", save_best_only=True, verbose=2)\n", + " reduce_lr = ReduceLROnPlateau(factor=0.1, patience=5, verbose=2) #min_lr=0.00001,\n", + "\n", + " epochs = 100\n", + " batch_size = 32\n", + "\n", + " history = model.fit(training_images, training_labels,\n", + " validation_data=[validation_images, validation_labels], \n", + " epochs=epochs,\n", + " verbose = 0,\n", + " batch_size=batch_size,\n", + " callbacks=[early_stopping, model_checkpoint, reduce_lr])\n", + "\n", + " test_loss, test_acc = model.evaluate(testing_images, 
testing_labels)\n", + " \n", + " model.save_weights(model_name + '_weight_1.h5')\n", + " \n", + " print(\"\\nThe test accuracy for \" + model_name + \" with magnification \"+ magnification +\" is \", test_acc, \"\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "adenosis 40X 114\n", + "80 17 17 114\n", + "fibroadenoma 40X 253\n", + "179 37 37 253\n", + "phyllodes_tumor 40X 109\n", + "77 16 16 109\n", + "tubular_adenoma 40X 149\n", + "105 22 22 149\n", + "ductal_carcinoma 40X 864\n", + "606 129 129 864\n", + "lobular_carcinoma 40X 156\n", + "110 23 23 156\n", + "mucinous_carcinoma 40X 205\n", + "145 30 30 205\n", + "papillary_carcinoma 40X 145\n", + "103 21 21 145\n", + "1397 295 295\n", + "No weights defined!\n", + "\n", + "Epoch 00001: val_loss improved from inf to 1.89395, saving model to vgg19_model_combine.model\n", + "\n", + "Epoch 00002: val_loss improved from 1.89395 to 1.74137, saving model to vgg19_model_combine.model\n", + "\n", + "Epoch 00003: val_loss improved from 1.74137 to 1.71194, saving model to vgg19_model_combine.model\n", + "\n", + "Epoch 00004: val_loss did not improve from 1.71194\n", + "\n", + "Epoch 00005: val_loss did not improve from 1.71194\n", + "\n", + "Epoch 00006: val_loss did not improve from 1.71194\n", + "\n", + "Epoch 00007: val_loss did not improve from 1.71194\n", + "\n", + "Epoch 00008: val_loss did not improve from 1.71194\n", + "\n", + "Epoch 00008: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.\n", + "\n", + "Epoch 00009: val_loss did not improve from 1.71194\n", + "\n", + "Epoch 00010: val_loss did not improve from 1.71194\n", + "\n", + "Epoch 00011: val_loss did not improve from 1.71194\n", + "\n", + "Epoch 00012: val_loss did not improve from 1.71194\n", + "\n", + "Epoch 00013: val_loss did not improve from 1.71194\n", + "\n", + "Epoch 00013: ReduceLROnPlateau 
reducing learning rate to 9.999999747378752e-07.\n", + "Epoch 00013: early stopping\n", + "295/295 [==============================] - 2s 6ms/step\n", + "\n", + "The test accuracy for vgg19_model with magnification 40X is 0.49491525423728816 \n", + "\n", + "adenosis 100X 113\n", + "81 16 16 113\n", + "fibroadenoma 100X 260\n", + "182 39 39 260\n", + "phyllodes_tumor 100X 70\n", + "50 10 10 70\n", + "tubular_adenoma 100X 150\n", + "106 22 22 150\n", + "ductal_carcinoma 100X 903\n", + "633 135 135 903\n", + "lobular_carcinoma 100X 170\n", + "120 25 25 170\n", + "mucinous_carcinoma 100X 222\n", + "156 33 33 222\n", + "papillary_carcinoma 100X 142\n", + "100 21 21 142\n", + "1420 301 301\n", + "Weights loaded!\n", + "\n", + "Epoch 00001: val_loss improved from inf to 1.91977, saving model to vgg19_model_combine.model\n", + "\n", + "Epoch 00002: val_loss did not improve from 1.91977\n", + "\n", + "Epoch 00003: val_loss did not improve from 1.91977\n", + "\n", + "Epoch 00004: val_loss improved from 1.91977 to 1.85441, saving model to vgg19_model_combine.model\n", + "\n", + "Epoch 00005: val_loss did not improve from 1.85441\n", + "\n", + "Epoch 00006: val_loss did not improve from 1.85441\n", + "\n", + "Epoch 00007: val_loss did not improve from 1.85441\n", + "\n", + "Epoch 00008: val_loss did not improve from 1.85441\n", + "\n", + "Epoch 00009: val_loss did not improve from 1.85441\n", + "\n", + "Epoch 00009: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.\n", + "\n", + "Epoch 00010: val_loss did not improve from 1.85441\n", + "\n", + "Epoch 00011: val_loss did not improve from 1.85441\n", + "\n", + "Epoch 00012: val_loss did not improve from 1.85441\n", + "\n", + "Epoch 00013: val_loss did not improve from 1.85441\n", + "\n", + "Epoch 00014: val_loss did not improve from 1.85441\n", + "\n", + "Epoch 00014: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-07.\n", + "Epoch 00014: early stopping\n", + "301/301 
[==============================] - 2s 5ms/step\n", + "\n", + "The test accuracy for vgg19_model with magnification 100X is 0.6046511627906976 \n", + "\n", + "adenosis 200X 111\n", + "79 16 16 111\n", + "fibroadenoma 200X 264\n", + "186 39 39 264\n", + "phyllodes_tumor 200X 108\n", + "76 16 16 108\n", + "tubular_adenoma 200X 140\n", + "98 21 21 140\n", + "ductal_carcinoma 200X 896\n", + "628 134 134 896\n", + "lobular_carcinoma 200X 163\n", + "115 24 24 163\n", + "mucinous_carcinoma 200X 196\n", + "138 29 29 196\n", + "papillary_carcinoma 200X 135\n", + "95 20 20 135\n", + "1407 299 299\n", + "Weights loaded!\n", + "\n", + "Epoch 00001: val_loss improved from inf to 1.91302, saving model to vgg19_model_combine.model\n", + "\n", + "Epoch 00002: val_loss did not improve from 1.91302\n", + "\n", + "Epoch 00003: val_loss improved from 1.91302 to 1.74487, saving model to vgg19_model_combine.model\n", + "\n", + "Epoch 00004: val_loss did not improve from 1.74487\n", + "\n", + "Epoch 00005: val_loss did not improve from 1.74487\n", + "\n", + "Epoch 00006: val_loss did not improve from 1.74487\n", + "\n", + "Epoch 00007: val_loss did not improve from 1.74487\n", + "\n", + "Epoch 00008: val_loss did not improve from 1.74487\n", + "\n", + "Epoch 00008: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.\n", + "\n", + "Epoch 00009: val_loss did not improve from 1.74487\n", + "\n", + "Epoch 00010: val_loss did not improve from 1.74487\n", + "\n", + "Epoch 00011: val_loss did not improve from 1.74487\n", + "\n", + "Epoch 00012: val_loss did not improve from 1.74487\n", + "\n", + "Epoch 00013: val_loss did not improve from 1.74487\n", + "\n", + "Epoch 00013: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-07.\n", + "Epoch 00013: early stopping\n", + "299/299 [==============================] - 2s 6ms/step\n", + "\n", + "The test accuracy for vgg19_model with magnification 200X is 0.5551839465879677 \n", + "\n", + "adenosis 400X 106\n", + "76 15 15 
106\n", + "fibroadenoma 400X 237\n", + "167 35 35 237\n", + "phyllodes_tumor 400X 115\n", + "81 17 17 115\n", + "tubular_adenoma 400X 130\n", + "92 19 19 130\n", + "ductal_carcinoma 400X 788\n", + "552 118 118 788\n", + "lobular_carcinoma 400X 137\n", + "97 20 20 137\n", + "mucinous_carcinoma 400X 169\n", + "119 25 25 169\n", + "papillary_carcinoma 400X 138\n", + "98 20 20 138\n", + "1274 269 269\n", + "Weights loaded!\n", + "\n", + "Epoch 00001: val_loss improved from inf to 1.66334, saving model to vgg19_model_combine.model\n", + "\n", + "Epoch 00002: val_loss did not improve from 1.66334\n", + "\n", + "Epoch 00003: val_loss did not improve from 1.66334\n", + "\n", + "Epoch 00004: val_loss did not improve from 1.66334\n", + "\n", + "Epoch 00005: val_loss did not improve from 1.66334\n", + "\n", + "Epoch 00006: val_loss did not improve from 1.66334\n", + "\n", + "Epoch 00006: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.\n", + "\n", + "Epoch 00007: val_loss did not improve from 1.66334\n", + "\n", + "Epoch 00008: val_loss did not improve from 1.66334\n", + "\n", + "Epoch 00009: val_loss did not improve from 1.66334\n", + "\n", + "Epoch 00010: val_loss did not improve from 1.66334\n", + "\n", + "Epoch 00011: val_loss did not improve from 1.66334\n", + "\n", + "Epoch 00011: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-07.\n", + "Epoch 00011: early stopping\n", + "269/269 [==============================] - 1s 5ms/step\n", + "\n", + "The test accuracy for vgg19_model with magnification 400X is 0.5650557620817844 \n", + "\n" + ] + } + ], + "source": [ + "model_num = 1\n", + "name = models[model_num].__name__\n", + "iteration = 0\n", + "for types in magnification_list:\n", + " if iteration == 0:\n", + " load_wt = \"Yes\"\n", + " else:\n", + " load_wt = \"No\"\n", + " compile_n_fit(validation_percent=0.15, testing_percent=0.15,\n", + " image_width=175, image_height=115, dropout = 0.3,\n", + " load_wt=load_wt, model_name = name, 
magnification = types)\n", + " iteration += 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}