From cc0668df7089fab449af118f1c8fc89859e7d16f Mon Sep 17 00:00:00 2001 From: Jaskanwal Date: Wed, 29 Nov 2023 15:25:33 -0600 Subject: [PATCH 1/5] remove old dvc files --- .dvc/.gitignore | 3 --- .dvc/config | 0 2 files changed, 3 deletions(-) delete mode 100644 .dvc/.gitignore delete mode 100644 .dvc/config diff --git a/.dvc/.gitignore b/.dvc/.gitignore deleted file mode 100644 index 528f30c..0000000 --- a/.dvc/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/config.local -/tmp -/cache diff --git a/.dvc/config b/.dvc/config deleted file mode 100644 index e69de29..0000000 From cb73f5df111e002e94e53ca1fee38e5338c1fc2e Mon Sep 17 00:00:00 2001 From: Jaskanwal Date: Wed, 29 Nov 2023 15:31:28 -0600 Subject: [PATCH 2/5] Initializing dvc --- .dvc/.gitignore | 3 +++ .dvc/config | 0 2 files changed, 3 insertions(+) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..e69de29 From 51bc82b6f461a2426905ced89408754c026a0213 Mon Sep 17 00:00:00 2001 From: Jaskanwal Date: Wed, 29 Nov 2023 17:51:34 -0600 Subject: [PATCH 3/5] Create params.yaml --- params.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 params.yaml diff --git a/params.yaml b/params.yaml new file mode 100644 index 0000000..b3d4ad7 --- /dev/null +++ b/params.yaml @@ -0,0 +1,15 @@ +base: + seed: 42 + pokemon_type_train: "Water" + +data_preprocess: + source_directory: "data/external" + destination_directory: "data/processed" + dataset_labels: "stats/pokemon-gen-1-8.csv" + dataset_images: "images" + +train: + test_size: 0.2 + learning_rate: 0.001 + epochs: 15 + batch_size: 120 From 48544f88443ef3db204ce23ad71bd1a42c4d7b96 Mon Sep 17 00:00:00 2001 From: Jaskanwal Date: Wed, 29 Nov 2023 22:00:30 -0600 Subject: [PATCH 4/5] Experiments increase learning rate --- .dvc/config | 4 + data/.gitignore | 1 + data/external.dvc | 5 ++ data/processed/.gitignore | 9 ++ dvc.lock | 159 +++++++++++++++++++++++++++++++++ dvc.yaml | 58 ++++++++++++ outputs/.gitignore | 3 + outputs/metrics.yaml | 4 + params.yaml | 10 +-- src/data_load.py | 62 +++++++++++++ src/data_preprocess.py | 96 ++++++++++++++++++++ src/evaluate.py | 63 +++++++++++++ src/train.py | 117 ++++++++++++++++++++++++ src/utils/find_project_root.py | 15 ++++ 14 files changed, 601 insertions(+), 5 deletions(-) create mode 100644 data/.gitignore create mode 100644 data/external.dvc create mode 100644 data/processed/.gitignore create mode 100644 dvc.lock create mode 100644 dvc.yaml create mode 100644 outputs/.gitignore create mode 100644 outputs/metrics.yaml create mode 100644 src/data_load.py create mode 100644 src/data_preprocess.py create mode 100644 src/evaluate.py create mode 100644 src/train.py create mode 100644 src/utils/find_project_root.py diff --git a/.dvc/config b/.dvc/config index e69de29..1477538 100644 --- a/.dvc/config +++ b/.dvc/config @@ -0,0 +1,4 @@ +[core] + remote = driveremote +['remote "driveremote"'] + url = gdrive://11lAGBPn-ZVRMUzzyaHcCGiEgwf-wdiFg diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..6adea30 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +/external diff --git a/data/external.dvc b/data/external.dvc new file mode 100644 index 0000000..68cfa8d --- /dev/null +++ b/data/external.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 97f98670ff36fe230e610d174236c6eb.dir + size: 118339628 + nfiles: 906 + path: external diff --git a/data/processed/.gitignore b/data/processed/.gitignore new file mode 100644 index 0000000..d0e6e42 --- /dev/null +++ b/data/processed/.gitignore @@ -0,0 +1,9 @@ +/pokemon.csv +/pokemon-with-image-paths.csv +/pokemon +/X.pckl +/X_train.pckl +/X_test.pckl +/y.pckl +/y_train.pckl +/y_test.pckl diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..5037edc --- /dev/null +++ b/dvc.lock @@ -0,0 +1,159 @@ +schema: '2.0' +stages: + data_preprocess: + cmd: python src/data_preprocess.py --params params.yaml + deps: + - path: data/external/images + md5: c48566c030889e71a791e1412d7dddf7.dir + size: 118179012 + nfiles: 905 + - path: data/external/stats/pokemon-gen-1-8.csv + md5: b0b029c2dc01bd0e6f2dbf9eeae8c14e + size: 160616 + - path: src/data_preprocess.py + md5: 9044f1c3ce91734931cdcf353240c429 + size: 3167 + params: + params.yaml: + base: + seed: 42 + pokemon_type_train: Water + data_preprocess: + source_directory: data/external + destination_directory: data/processed + dataset_labels: stats/pokemon-gen-1-8.csv + dataset_images: images + outs: + - path: data/processed/pokemon + md5: c48566c030889e71a791e1412d7dddf7.dir + size: 118179012 + nfiles: 905 + - path: data/processed/pokemon-with-image-paths.csv + hash: md5 + md5: c52b925d3573270a7377e6e8bc37e90d + size: 109527 + - path: data/processed/pokemon.csv + hash: md5 + md5: 102a3d4e0350c5934c8fd2b24c90b028 + size: 47039 + data_load: + cmd: python src/data_load.py --params params.yaml + deps: + - path: data/processed/pokemon + md5: c48566c030889e71a791e1412d7dddf7.dir + size: 118179012 + nfiles: 905 + - path: data/processed/pokemon-with-image-paths.csv + hash: md5 + md5: c52b925d3573270a7377e6e8bc37e90d + size: 109527 + - path: src/data_load.py + md5: 3beae88ab58ae0cb8d355477e04f3574 + size: 2073 + params: + params.yaml: + base: + seed: 42 + pokemon_type_train: Water + data_preprocess: + source_directory: data/external + destination_directory: data/processed + dataset_labels: stats/pokemon-gen-1-8.csv + dataset_images: images + outs: + - path: data/processed/X.pckl + md5: 8f6555c6159e27739fd97a61ace395a4 + size: 2891610169 + - path: data/processed/X_test.pckl + hash: md5 + md5: 23c3aa551e6826615b71e592972997a8 + size: 581210168 + - path: data/processed/X_train.pckl + hash: md5 + md5: 364b2c4e8fe73f88a79ef5da6416abac + size: 2310400169 + - path: data/processed/y.pckl + hash: md5 + md5: f156295a269847ea043fff40f8cca0a7 + size: 7072 + - path: data/processed/y_test.pckl + hash: md5 + md5: 05695e9b10f3c84048b6c6d0af01bd30 + size: 3247 + - path: data/processed/y_train.pckl + hash: md5 + md5: 25b4a8bb684e918396014650bfcfeffd + size: 10913 + train: + cmd: python src/train.py --params params.yaml + deps: + - path: data/processed/X_test.pckl + md5: 23c3aa551e6826615b71e592972997a8 + size: 581210168 + - path: data/processed/X_train.pckl + md5: 364b2c4e8fe73f88a79ef5da6416abac + size: 2310400169 + - path: data/processed/y_test.pckl + md5: 05695e9b10f3c84048b6c6d0af01bd30 + size: 3247 + - path: data/processed/y_train.pckl + md5: 25b4a8bb684e918396014650bfcfeffd + size: 10913 + - path: src/train.py + hash: md5 + md5: c1c31fe2472322e96c64b5dd9bf5b2bc + size: 4078 + params: + params.yaml: + base: + seed: 42 + pokemon_type_train: Water + data_preprocess: + source_directory: data/external + destination_directory: data/processed + dataset_labels: stats/pokemon-gen-1-8.csv + dataset_images: images + train: + test_size: 0.2 + learning_rate: 0.001 + epochs: 15 + batch_size: 120 + outs: + - path: outputs/model + hash: md5 + md5: c9d0434e11c3cac21a55c1c2bf2fd93d.dir + size: 1507711 + nfiles: 4 + - path: outputs/train_history.png + hash: md5 + md5: 9ef932ad29a5107e9c219e3652c9d330 + size: 54241 + evaluate: + cmd: python src/evaluate.py --params params.yaml + deps: + - path: outputs/model/keras_metadata.pb + hash: md5 + md5: c7f12fbd11650bd6ea7c3fdd7b53e8f4 + size: 17311 + - path: outputs/model/saved_model.pb + hash: md5 + md5: 044e312dae1c89084aaf8af5d9551418 + size: 194467 + - path: outputs/train_history.png + hash: md5 + md5: 9ef932ad29a5107e9c219e3652c9d330 + size: 54241 + params: + params.yaml: + base: + seed: 42 + pokemon_type_train: Water + outs: + - path: outputs/confusion_matrix.png + hash: md5 + md5: e10bcae55c34bfe026a65ea00acda1c2 + size: 22285 + - path: outputs/metrics.yaml + hash: md5 + md5: a744d45ab0e17293c26ac8beb9e5c522 + size: 103 diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..73c7e00 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,58 @@ +stages: + data_preprocess: + cmd: python src/data_preprocess.py --params params.yaml + deps: + - src/data_preprocess.py + - data/external/images + - data/external/stats/pokemon-gen-1-8.csv + outs: + - data/processed/pokemon.csv + - data/processed/pokemon-with-image-paths.csv + - data/processed/pokemon + params: + - base + - data_preprocess + data_load: + cmd: python src/data_load.py --params params.yaml + deps: + - src/data_load.py + - data/processed/pokemon-with-image-paths.csv + - data/processed/pokemon + outs: + - data/processed/X.pckl + - data/processed/X_train.pckl + - data/processed/X_test.pckl + - data/processed/y.pckl + - data/processed/y_train.pckl + - data/processed/y_test.pckl + params: + - base + - data_preprocess + train: + cmd: python src/train.py --params params.yaml + deps: + - src/train.py + - data/processed/X_train.pckl + - data/processed/X_test.pckl + - data/processed/y_train.pckl + - data/processed/y_test.pckl + outs: + - outputs/model + - outputs/train_history.png + params: + - base + - data_preprocess + - train + evaluate: + cmd: python src/evaluate.py --params params.yaml + deps: + - outputs/model/keras_metadata.pb + - outputs/model/saved_model.pb + - outputs/train_history.png + outs: + - outputs/confusion_matrix.png + params: + - base + metrics: + - outputs/metrics.yaml: + cache: false diff --git a/outputs/.gitignore b/outputs/.gitignore new file mode 100644 index 0000000..3f0f98c --- /dev/null +++ b/outputs/.gitignore @@ -0,0 +1,3 @@ +/model +/train_history.png +/confusion_matrix.png diff --git a/outputs/metrics.yaml b/outputs/metrics.yaml new file mode 100644 index 0000000..64503ec --- /dev/null +++ b/outputs/metrics.yaml @@ -0,0 +1,4 @@ +acc: 0.9400749063670412 +f1: 0.823529411764706 +precision: 0.7943262411347518 +recall: 0.8549618320610687 diff --git a/params.yaml b/params.yaml index b3d4ad7..37c699c 100644 --- a/params.yaml +++ b/params.yaml @@ -1,12 +1,12 @@ base: seed: 42 - pokemon_type_train: "Water" + pokemon_type_train: Water data_preprocess: - source_directory: "data/external" - destination_directory: "data/processed" - dataset_labels: "stats/pokemon-gen-1-8.csv" - dataset_images: "images" + source_directory: data/external + destination_directory: data/processed + dataset_labels: stats/pokemon-gen-1-8.csv + dataset_images: images train: test_size: 0.2 diff --git a/src/data_load.py b/src/data_load.py new file mode 100644 index 0000000..ffd3ab8 --- /dev/null +++ b/src/data_load.py @@ -0,0 +1,62 @@ +import argparse +import pickle + +import numpy as np +import pandas as pd +import tensorflow as tf +import yaml +from sklearn.model_selection import train_test_split +from tqdm import tqdm + +from utils.find_project_root import find_project_root + + +# Load training images +def load_training_data(labels) -> np.array: + train_image = [] + + for i in tqdm(range(labels.shape[0])): + + img = tf.keras.utils.load_img(labels.iloc[i]["imagePath"], color_mode='rgba') + img = tf.keras.utils.img_to_array(img) + img = img/255 + train_image.append(img) + X = np.array(train_image) + + return(X) + +# Create labels +def create_labels(labels): + return(pokemon[["is" + POKEMON_TYPE_TRAIN]]) + +if __name__ == '__main__': + + args_parser = argparse.ArgumentParser() + args_parser.add_argument('--params', dest='params', required=True) + args = args_parser.parse_args() + + with open(args.params) as param_file: + params = yaml.safe_load(param_file) + + PROJECT_ROOT = find_project_root() + + SEED: str = params['base']['seed'] + POKEMON_TYPE_TRAIN: str = params['base']['pokemon_type_train'] + + DESTINATION_DIRECTORY: str = params['data_preprocess']['destination_directory'] + MODEL_TEST_SIZE: float = params['train']['test_size'] + + pokemon = pd.read_csv(PROJECT_ROOT / DESTINATION_DIRECTORY / "pokemon-with-image-paths.csv") + + X = load_training_data(pokemon) + y = create_labels(pokemon) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED, test_size=MODEL_TEST_SIZE, stratify=y) + + pickle.dump(X, open(PROJECT_ROOT / DESTINATION_DIRECTORY / "X.pckl", "wb")) + pickle.dump(X_train, open(PROJECT_ROOT / DESTINATION_DIRECTORY / "X_train.pckl", "wb")) + pickle.dump(X_test, open(PROJECT_ROOT / DESTINATION_DIRECTORY / "X_test.pckl", "wb")) + + pickle.dump(y, open(PROJECT_ROOT / DESTINATION_DIRECTORY / "y.pckl", "wb")) + pickle.dump(y_train, open(PROJECT_ROOT / DESTINATION_DIRECTORY / "y_train.pckl", "wb")) + pickle.dump(y_test, open(PROJECT_ROOT / DESTINATION_DIRECTORY / "y_test.pckl", "wb")) \ No newline at end of file diff --git a/src/data_preprocess.py b/src/data_preprocess.py new file mode 100644 index 0000000..567673b --- /dev/null +++ b/src/data_preprocess.py @@ -0,0 +1,96 @@ +import argparse +import os +import shutil + +import numpy as np +import pandas as pd +import yaml + +from utils.find_project_root import find_project_root + + +# Process Pokémon and one-hot encode the types +def preprocess_training_labels(dataset) -> pd.DataFrame: + pokemon = pd.read_csv(PROJECT_ROOT / SOURCE_DIRECTORY / dataset) + pokemon = pokemon[["pokedex_number", "name", "type1", "type2"]] + + # Create one-hot columns for each type + types = set(pokemon["type1"]) + for t in types: + pokemon["is" + str(t).capitalize()] = 0 + + # Iterate over Pokémon + for i, p in pokemon.iterrows(): + + # Set one-hot columns to 1 for relevant types + pokemon.loc[i, "is" + p["type1"].capitalize()] = 1 + + if not pd.isna(p["type2"]): + pokemon.loc[i, "is" + p["type2"].capitalize()] = 1 + + # Save output + pokemon.to_csv(PROJECT_ROOT / DESTINATION_DIRECTORY / 'pokemon.csv', index=False) + return(pokemon) + +# Process image data +def preprocess_training_data(dataset) -> pd.DataFrame: + + data_directory_images = PROJECT_ROOT / SOURCE_DIRECTORY / dataset + output_directory = PROJECT_ROOT / DESTINATION_DIRECTORY / "pokemon" + + pokemon = pd.read_csv(PROJECT_ROOT / DESTINATION_DIRECTORY / 'pokemon.csv') + pokemon["imagePath"] = np.nan + + # Remove processed folder and create empty new one + try: + shutil.rmtree(output_directory) + os.mkdir(output_directory) + except: + os.mkdir(output_directory) + + # Copy images to processed folder + for image in os.listdir(data_directory_images): + pokemon_id = image.split('.')[0] + + # Add leading zeroes to ID + while len(pokemon_id) < 3: + pokemon_id = "0" + pokemon_id + + # Images with no variety (e.g. "211.png") + if pokemon_id.isnumeric(): + + # Copy to processed folder + src = data_directory_images / image + dst = os.path.join(output_directory, pokemon_id + ".png") + shutil.copyfile(src, dst) + + # Set image path in data frame + pokemon.loc[pokemon["pokedex_number"] == int(pokemon_id), 'imagePath'] = dst + + # Drop Pokemon without image path + pokemon = pokemon.dropna(subset=["imagePath"]) + + # Save pokemon.csv with image paths + pokemon.to_csv(PROJECT_ROOT / DESTINATION_DIRECTORY / 'pokemon-with-image-paths.csv', index=False) + + return(pokemon) + +if __name__ == '__main__': + + args_parser = argparse.ArgumentParser() + args_parser.add_argument('--params', dest='params', required=True) + args = args_parser.parse_args() + + with open(args.params) as param_file: + params = yaml.safe_load(param_file) + + PROJECT_ROOT = find_project_root() + SOURCE_DIRECTORY: str = params['data_preprocess']['source_directory'] + DESTINATION_DIRECTORY: str = params['data_preprocess']['destination_directory'] + TRAIN_DATA_LABELS: str = params['data_preprocess']['dataset_labels'] + TRAIN_DATA_IMAGES: str = params['data_preprocess']['dataset_images'] + + pokemon = preprocess_training_labels(TRAIN_DATA_LABELS) + pokemon = preprocess_training_data(TRAIN_DATA_IMAGES) + + print(pokemon.head()) \ No newline at end of file diff --git a/src/evaluate.py b/src/evaluate.py new file mode 100644 index 0000000..31373ac --- /dev/null +++ b/src/evaluate.py @@ -0,0 +1,63 @@ +import argparse +import pickle + +import matplotlib.pyplot as plt +import yaml +from sklearn.metrics import (ConfusionMatrixDisplay, accuracy_score, + classification_report, confusion_matrix, f1_score, + log_loss, precision_score, recall_score) +from tensorflow import keras + +from utils.find_project_root import find_project_root + +if __name__ == '__main__': + + args_parser = argparse.ArgumentParser() + args_parser.add_argument('--params', dest='params', required=True) + args = args_parser.parse_args() + + with open(args.params) as param_file: + params = yaml.safe_load(param_file) + + PROJECT_ROOT = find_project_root() + DESTINATION_DIRECTORY: str = params['data_preprocess']['destination_directory'] + + # Load model + # estimator = pickle.loads((PROJECT_ROOT / "outputs" / "model.pckl").read_bytes()) + model = keras.models.load_model(PROJECT_ROOT / "outputs" / "model") + + # Load data + X = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "X.pckl").read_bytes()) + X_train = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "X_train.pckl").read_bytes()) + X_test = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "X_test.pckl").read_bytes()) + + y = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "y.pckl").read_bytes()) + y_train = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "y_train.pckl").read_bytes()) + y_test = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "y_test.pckl").read_bytes()) + + # Predict all Pokémon + predictions = model.predict(X) > 0.5 + + # Calculate metrics + metrics = {} + + metrics["acc"] = float(accuracy_score(y, predictions)) + metrics["precision"] = float(precision_score(y, predictions)) + metrics["recall"] = float(recall_score(y, predictions)) + metrics["f1"] = float(f1_score(y, predictions)) + + # Save metrics + with open(PROJECT_ROOT / "outputs" / "metrics.yaml", 'w') as file: + yaml.dump(metrics, file, default_flow_style=False) + + # Plot confusion matrix + cm = confusion_matrix(y, predictions) + + disp = ConfusionMatrixDisplay(confusion_matrix=cm) + disp.plot() + + # Save confusion matrix + plt.savefig(PROJECT_ROOT / "outputs" / "confusion_matrix.png", dpi=150, bbox_inches='tight', pad_inches=0) + + print(f"Evaluation done!") + print(metrics) \ No newline at end of file diff --git a/src/train.py b/src/train.py new file mode 100644 index 0000000..83c833b --- /dev/null +++ b/src/train.py @@ -0,0 +1,117 @@ +import argparse +import pickle + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import tensorflow as tf +import yaml +from keras import layers, regularizers +from keras.layers import (Activation, BatchNormalization, Conv2D, Dense, + Dropout, Flatten, MaxPooling2D) +from keras.models import Sequential +from sklearn.metrics import (ConfusionMatrixDisplay, accuracy_score, + classification_report, confusion_matrix, f1_score, + log_loss, precision_score, recall_score) +from sklearn.model_selection import train_test_split +from tensorflow import keras + +from utils.find_project_root import find_project_root + + +def compile_model(model_image_size_x, model_image_size_y): + img_input = layers.Input(shape=(model_image_size_x, model_image_size_y, 4)) + + model = Sequential() + + model.add(Conv2D(4, kernel_size=(5,5), activation='relu', kernel_regularizer=regularizers.l2(l=0.01), input_shape=(model_image_size_x, model_image_size_y, 4))) + model.add(MaxPooling2D(pool_size=(2, 2))) + + model.add(Dropout(0.2)) + + model.add(Conv2D(4, kernel_size=(5,5), activation='relu', kernel_regularizer=regularizers.l2(l=0.01))) + model.add(MaxPooling2D(pool_size=(2, 2))) + + model.add(Dense(8, activation="relu")) + + model.add(Dropout(0.2)) + model.add(Flatten()) + + model.add(Dense(1, activation="sigmoid")) + + optimizer = keras.optimizers.Adam(learning_rate=MODEL_LEARNING_RATE) #Adam, RMSprop or SGD + + model.compile( + loss='binary_crossentropy' + , optimizer=optimizer + , metrics=[keras.metrics.AUC()] + # , metrics=[keras.metrics.Recall()] + ) + + model.summary() + + return(model) + +def train_estimator(model): + def calculate_class_weights(y_train): + print(y_train) + ratio_true = sum(y_train.values == 1) / len(y_train) + ratio_false = sum(y_train.values != 1) / len(y_train) + + return {0: ratio_true, 1: ratio_false} + + + estimator = model.fit(X_train, y_train, + validation_data=(X_test, y_test), + class_weight= calculate_class_weights(y_train), + epochs=MODEL_EPOCHS, + batch_size=MODEL_BATCH_SIZE, + verbose=1) + + return(estimator) + +def save_estimator(estimator): + # Training history + plt.figure() + plt.ylabel('Loss / Accuracy') + plt.xlabel('Epoch') + + for k in estimator.history.keys(): + plt.plot(estimator.history[k], label = k) + plt.legend(loc='best') + + plt.savefig(PROJECT_ROOT / "outputs" / "train_history.png", dpi=150, bbox_inches='tight', pad_inches=0) + + # Save model itself + model.save(PROJECT_ROOT / "outputs" / "model") + +if __name__ == '__main__': + + args_parser = argparse.ArgumentParser() + args_parser.add_argument('--params', dest='params', required=True) + args = args_parser.parse_args() + + with open(args.params) as param_file: + params = yaml.safe_load(param_file) + + PROJECT_ROOT = find_project_root() + DESTINATION_DIRECTORY: str = params['data_preprocess']['destination_directory'] + + MODEL_LEARNING_RATE: float = params['train']['learning_rate'] + MODEL_EPOCHS: int = params['train']['epochs'] + MODEL_BATCH_SIZE: int = params['train']['batch_size'] + + X = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "X.pckl").read_bytes()) + X_train = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "X_train.pckl").read_bytes()) + X_test = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "X_test.pckl").read_bytes()) + + y = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "y.pckl").read_bytes()) + y_train = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "y_train.pckl").read_bytes()) + y_test = pickle.loads((PROJECT_ROOT / DESTINATION_DIRECTORY / "y_test.pckl").read_bytes()) + + model_image_size_x = len(X[1]) + model_image_size_y = len(X[2]) + + model = compile_model(model_image_size_x, model_image_size_y) + estimator = train_estimator(model) + save_estimator(estimator) \ No newline at end of file diff --git a/src/utils/find_project_root.py b/src/utils/find_project_root.py new file mode 100644 index 0000000..263a94c --- /dev/null +++ b/src/utils/find_project_root.py @@ -0,0 +1,15 @@ +from pathlib import Path +from typing import Optional + +def find_project_root() -> Optional[Path]: + current = Path(".").resolve() + + while True: + if (current / ".git").exists(): + return current + + if current.parent == current: + print("WARNING: No .git dir found") + return current + + current = current.parent \ No newline at end of file From a1b1f1374d3368a9efc309b6966e7bf746c493a0 Mon Sep 17 00:00:00 2001 From: Jaskanwal Date: Wed, 29 Nov 2023 22:14:24 -0600 Subject: [PATCH 5/5] Running experiments base.pokemon_type_train=Dragon --- README.md | 1 + dvc.lock | 68 +++++++++++++++++++++++--------------------- outputs/metrics.yaml | 8 +++--- params.yaml | 2 +- src/train.py | 1 - 5 files changed, 42 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 786d582..4b31f5f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # [Data Talks Club] GitOps for ML: Converting Notebooks to Reproducible Pipelines +https://www.youtube.com/watch?v=6x6GwtNeYdI In this hands-on workshop, we’ll take a prototype in a Jupyter Notebook and transform it into a DVC pipeline. We’ll then use that pipeline locally to run diff --git a/dvc.lock b/dvc.lock index 5037edc..11ceb9f 100644 --- a/dvc.lock +++ b/dvc.lock @@ -17,7 +17,7 @@ stages: params.yaml: base: seed: 42 - pokemon_type_train: Water + pokemon_type_train: Dragon data_preprocess: source_directory: data/external destination_directory: data/processed @@ -30,11 +30,11 @@ stages: nfiles: 905 - path: data/processed/pokemon-with-image-paths.csv hash: md5 - md5: c52b925d3573270a7377e6e8bc37e90d + md5: d83c5cef1016e07c094d368a93d35bca size: 109527 - path: data/processed/pokemon.csv hash: md5 - md5: 102a3d4e0350c5934c8fd2b24c90b028 + md5: 8c9953d4fb1dcb85871f9158f32eff10 size: 47039 data_load: cmd: python src/data_load.py --params params.yaml @@ -45,7 +45,7 @@ stages: nfiles: 905 - path: data/processed/pokemon-with-image-paths.csv hash: md5 - md5: c52b925d3573270a7377e6e8bc37e90d + md5: d83c5cef1016e07c094d368a93d35bca size: 109527 - path: src/data_load.py md5: 3beae88ab58ae0cb8d355477e04f3574 @@ -54,7 +54,7 @@ stages: params.yaml: base: seed: 42 - pokemon_type_train: Water + pokemon_type_train: Dragon data_preprocess: source_directory: data/external destination_directory: data/processed @@ -66,48 +66,52 @@ stages: size: 2891610169 - path: data/processed/X_test.pckl hash: md5 - md5: 23c3aa551e6826615b71e592972997a8 + md5: 4e11c60b9a929fc30f25b7a478ca60b6 size: 581210168 - path: data/processed/X_train.pckl hash: md5 - md5: 364b2c4e8fe73f88a79ef5da6416abac + md5: e7bcb4d9113bce537fc454db4bb4a5f4 size: 2310400169 - path: data/processed/y.pckl hash: md5 - md5: f156295a269847ea043fff40f8cca0a7 - size: 7072 + md5: 36a6a2c5b8ee9201257c99762d105ab2 + size: 7073 - path: data/processed/y_test.pckl hash: md5 - md5: 05695e9b10f3c84048b6c6d0af01bd30 - size: 3247 + md5: 763d8277cd61dc8347fd3e6449ea7efd + size: 3248 - path: data/processed/y_train.pckl hash: md5 - md5: 25b4a8bb684e918396014650bfcfeffd - size: 10913 + md5: c8135ca15906dfb1766cbae114320985 + size: 10914 train: cmd: python src/train.py --params params.yaml deps: - path: data/processed/X_test.pckl - md5: 23c3aa551e6826615b71e592972997a8 + hash: md5 + md5: 4e11c60b9a929fc30f25b7a478ca60b6 size: 581210168 - path: data/processed/X_train.pckl - md5: 364b2c4e8fe73f88a79ef5da6416abac + hash: md5 + md5: e7bcb4d9113bce537fc454db4bb4a5f4 size: 2310400169 - path: data/processed/y_test.pckl - md5: 05695e9b10f3c84048b6c6d0af01bd30 - size: 3247 + hash: md5 + md5: 763d8277cd61dc8347fd3e6449ea7efd + size: 3248 - path: data/processed/y_train.pckl - md5: 25b4a8bb684e918396014650bfcfeffd - size: 10913 + hash: md5 + md5: c8135ca15906dfb1766cbae114320985 + size: 10914 - path: src/train.py hash: md5 - md5: c1c31fe2472322e96c64b5dd9bf5b2bc - size: 4078 + md5: d2a33e6d884a2c4e7699c5fd411c0300 + size: 4055 params: params.yaml: base: seed: 42 - pokemon_type_train: Water + pokemon_type_train: Dragon data_preprocess: source_directory: data/external destination_directory: data/processed @@ -121,13 +125,13 @@ stages: outs: - path: outputs/model hash: md5 - md5: c9d0434e11c3cac21a55c1c2bf2fd93d.dir + md5: 21321212d5bbe167c20b881440f5bfe6.dir size: 1507711 nfiles: 4 - path: outputs/train_history.png hash: md5 - md5: 9ef932ad29a5107e9c219e3652c9d330 - size: 54241 + md5: b85d410f992d05bf2ef3b08163f7117a + size: 53206 evaluate: cmd: python src/evaluate.py --params params.yaml deps: @@ -141,19 +145,19 @@ stages: size: 194467 - path: outputs/train_history.png hash: md5 - md5: 9ef932ad29a5107e9c219e3652c9d330 - size: 54241 + md5: b85d410f992d05bf2ef3b08163f7117a + size: 53206 params: params.yaml: base: seed: 42 - pokemon_type_train: Water + pokemon_type_train: Dragon outs: - path: outputs/confusion_matrix.png hash: md5 - md5: e10bcae55c34bfe026a65ea00acda1c2 - size: 22285 + md5: 0331572c77ef147c729e88a7b6e16fb1 + size: 23113 - path: outputs/metrics.yaml hash: md5 - md5: a744d45ab0e17293c26ac8beb9e5c522 - size: 103 + md5: 5d08a92ee9b5b19b7090dd8c42072eef + size: 90 diff --git a/outputs/metrics.yaml b/outputs/metrics.yaml index 64503ec..028c2d3 100644 --- a/outputs/metrics.yaml +++ b/outputs/metrics.yaml @@ -1,4 +1,4 @@ -acc: 0.9400749063670412 -f1: 0.823529411764706 -precision: 0.7943262411347518 -recall: 0.8549618320610687 +acc: 0.9725343320848939 +f1: 0.7659574468085107 +precision: 0.72 +recall: 0.8181818181818182 diff --git a/params.yaml b/params.yaml index 37c699c..e16c282 100644 --- a/params.yaml +++ b/params.yaml @@ -1,6 +1,6 @@ base: seed: 42 - pokemon_type_train: Water + pokemon_type_train: Dragon data_preprocess: source_directory: data/external diff --git a/src/train.py b/src/train.py index 83c833b..7f5eae4 100644 --- a/src/train.py +++ b/src/train.py @@ -54,7 +54,6 @@ def compile_model(model_image_size_x, model_image_size_y): def train_estimator(model): def calculate_class_weights(y_train): - print(y_train) ratio_true = sum(y_train.values == 1) / len(y_train) ratio_false = sum(y_train.values != 1) / len(y_train)