Skip to content
This repository was archived by the owner on Sep 25, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .dvc/config
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[core]
remote = driveremote
['remote "driveremote"']
url = gdrive://11lAGBPn-ZVRMUzzyaHcCGiEgwf-wdiFg
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# [Data Talks Club] GitOps for ML: Converting Notebooks to Reproducible Pipelines
https://www.youtube.com/watch?v=6x6GwtNeYdI

In this hands-on workshop, we’ll take a prototype in a Jupyter Notebook and
transform it into a DVC pipeline. We’ll then use that pipeline locally to run
Expand Down
1 change: 1 addition & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/external
5 changes: 5 additions & 0 deletions data/external.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: 97f98670ff36fe230e610d174236c6eb.dir
size: 118339628
nfiles: 906
path: external
9 changes: 9 additions & 0 deletions data/processed/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/pokemon.csv
/pokemon-with-image-paths.csv
/pokemon
/X.pckl
/X_train.pckl
/X_test.pckl
/y.pckl
/y_train.pckl
/y_test.pckl
163 changes: 163 additions & 0 deletions dvc.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
schema: '2.0'
stages:
data_preprocess:
cmd: python src/data_preprocess.py --params params.yaml
deps:
- path: data/external/images
md5: c48566c030889e71a791e1412d7dddf7.dir
size: 118179012
nfiles: 905
- path: data/external/stats/pokemon-gen-1-8.csv
md5: b0b029c2dc01bd0e6f2dbf9eeae8c14e
size: 160616
- path: src/data_preprocess.py
md5: 9044f1c3ce91734931cdcf353240c429
size: 3167
params:
params.yaml:
base:
seed: 42
pokemon_type_train: Dragon
data_preprocess:
source_directory: data/external
destination_directory: data/processed
dataset_labels: stats/pokemon-gen-1-8.csv
dataset_images: images
outs:
- path: data/processed/pokemon
md5: c48566c030889e71a791e1412d7dddf7.dir
size: 118179012
nfiles: 905
- path: data/processed/pokemon-with-image-paths.csv
hash: md5
md5: d83c5cef1016e07c094d368a93d35bca
size: 109527
- path: data/processed/pokemon.csv
hash: md5
md5: 8c9953d4fb1dcb85871f9158f32eff10
size: 47039
data_load:
cmd: python src/data_load.py --params params.yaml
deps:
- path: data/processed/pokemon
md5: c48566c030889e71a791e1412d7dddf7.dir
size: 118179012
nfiles: 905
- path: data/processed/pokemon-with-image-paths.csv
hash: md5
md5: d83c5cef1016e07c094d368a93d35bca
size: 109527
- path: src/data_load.py
md5: 3beae88ab58ae0cb8d355477e04f3574
size: 2073
params:
params.yaml:
base:
seed: 42
pokemon_type_train: Dragon
data_preprocess:
source_directory: data/external
destination_directory: data/processed
dataset_labels: stats/pokemon-gen-1-8.csv
dataset_images: images
outs:
- path: data/processed/X.pckl
md5: 8f6555c6159e27739fd97a61ace395a4
size: 2891610169
- path: data/processed/X_test.pckl
hash: md5
md5: 4e11c60b9a929fc30f25b7a478ca60b6
size: 581210168
- path: data/processed/X_train.pckl
hash: md5
md5: e7bcb4d9113bce537fc454db4bb4a5f4
size: 2310400169
- path: data/processed/y.pckl
hash: md5
md5: 36a6a2c5b8ee9201257c99762d105ab2
size: 7073
- path: data/processed/y_test.pckl
hash: md5
md5: 763d8277cd61dc8347fd3e6449ea7efd
size: 3248
- path: data/processed/y_train.pckl
hash: md5
md5: c8135ca15906dfb1766cbae114320985
size: 10914
train:
cmd: python src/train.py --params params.yaml
deps:
- path: data/processed/X_test.pckl
hash: md5
md5: 4e11c60b9a929fc30f25b7a478ca60b6
size: 581210168
- path: data/processed/X_train.pckl
hash: md5
md5: e7bcb4d9113bce537fc454db4bb4a5f4
size: 2310400169
- path: data/processed/y_test.pckl
hash: md5
md5: 763d8277cd61dc8347fd3e6449ea7efd
size: 3248
- path: data/processed/y_train.pckl
hash: md5
md5: c8135ca15906dfb1766cbae114320985
size: 10914
- path: src/train.py
hash: md5
md5: d2a33e6d884a2c4e7699c5fd411c0300
size: 4055
params:
params.yaml:
base:
seed: 42
pokemon_type_train: Dragon
data_preprocess:
source_directory: data/external
destination_directory: data/processed
dataset_labels: stats/pokemon-gen-1-8.csv
dataset_images: images
train:
test_size: 0.2
learning_rate: 0.001
epochs: 15
batch_size: 120
outs:
- path: outputs/model
hash: md5
md5: 21321212d5bbe167c20b881440f5bfe6.dir
size: 1507711
nfiles: 4
- path: outputs/train_history.png
hash: md5
md5: b85d410f992d05bf2ef3b08163f7117a
size: 53206
evaluate:
cmd: python src/evaluate.py --params params.yaml
deps:
- path: outputs/model/keras_metadata.pb
hash: md5
md5: c7f12fbd11650bd6ea7c3fdd7b53e8f4
size: 17311
- path: outputs/model/saved_model.pb
hash: md5
md5: 044e312dae1c89084aaf8af5d9551418
size: 194467
- path: outputs/train_history.png
hash: md5
md5: b85d410f992d05bf2ef3b08163f7117a
size: 53206
params:
params.yaml:
base:
seed: 42
pokemon_type_train: Dragon
outs:
- path: outputs/confusion_matrix.png
hash: md5
md5: 0331572c77ef147c729e88a7b6e16fb1
size: 23113
- path: outputs/metrics.yaml
hash: md5
md5: 5d08a92ee9b5b19b7090dd8c42072eef
size: 90
58 changes: 58 additions & 0 deletions dvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# DVC pipeline: preprocess raw data -> build train/test arrays -> train -> evaluate.
stages:
  data_preprocess:
    cmd: python src/data_preprocess.py --params params.yaml
    deps:
      - src/data_preprocess.py
      - data/external/images
      - data/external/stats/pokemon-gen-1-8.csv
    outs:
      - data/processed/pokemon.csv
      - data/processed/pokemon-with-image-paths.csv
      - data/processed/pokemon
    params:
      - base
      - data_preprocess
  data_load:
    cmd: python src/data_load.py --params params.yaml
    deps:
      - src/data_load.py
      - data/processed/pokemon-with-image-paths.csv
      - data/processed/pokemon
    outs:
      - data/processed/X.pckl
      - data/processed/X_train.pckl
      - data/processed/X_test.pckl
      - data/processed/y.pckl
      - data/processed/y_train.pckl
      - data/processed/y_test.pckl
    params:
      - base
      - data_preprocess
      # src/data_load.py reads train.test_size for the train/test split, so a
      # change to it must invalidate this stage and regenerate the splits.
      - train.test_size
  train:
    cmd: python src/train.py --params params.yaml
    deps:
      - src/train.py
      - data/processed/X_train.pckl
      - data/processed/X_test.pckl
      - data/processed/y_train.pckl
      - data/processed/y_test.pckl
    outs:
      - outputs/model
      - outputs/train_history.png
    params:
      - base
      - data_preprocess
      - train
  evaluate:
    cmd: python src/evaluate.py --params params.yaml
    deps:
      - outputs/model/keras_metadata.pb
      - outputs/model/saved_model.pb
      - outputs/train_history.png
    outs:
      - outputs/confusion_matrix.png
    params:
      - base
    metrics:
      # Metrics file is small and tracked by Git rather than the DVC cache.
      - outputs/metrics.yaml:
          cache: false
3 changes: 3 additions & 0 deletions outputs/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/model
/train_history.png
/confusion_matrix.png
4 changes: 4 additions & 0 deletions outputs/metrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
acc: 0.9725343320848939
f1: 0.7659574468085107
precision: 0.72
recall: 0.8181818181818182
15 changes: 15 additions & 0 deletions params.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
base:
seed: 42
pokemon_type_train: Dragon

data_preprocess:
source_directory: data/external
destination_directory: data/processed
dataset_labels: stats/pokemon-gen-1-8.csv
dataset_images: images

train:
test_size: 0.2
learning_rate: 0.001
epochs: 15
batch_size: 120
62 changes: 62 additions & 0 deletions src/data_load.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import argparse
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
import yaml
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from utils.find_project_root import find_project_root


# Load training images
def load_training_data(labels) -> np.ndarray:
    """Load every image referenced by ``labels`` into one NumPy array.

    Args:
        labels: Table with an ``"imagePath"`` column holding one image path
            per row (the script passes a pandas DataFrame).

    Returns:
        ``np.array`` built from the list of per-image arrays, in row order.
        Each image is read in RGBA mode and divided by 255.
        NOTE(review): presumably all images share the same dimensions so the
        result is a regular numeric array -- confirm against the dataset.
    """
    train_image = []

    # Iterate the column directly instead of indexing row-by-row with iloc.
    for image_path in tqdm(labels["imagePath"]):
        img = tf.keras.utils.load_img(image_path, color_mode='rgba')
        img = tf.keras.utils.img_to_array(img)
        train_image.append(img / 255)

    return np.array(train_image)

# Create labels
def create_labels(labels, pokemon_type=None):
    """Select the target column for the chosen Pokemon type from ``labels``.

    Args:
        labels: DataFrame that contains a column named ``"is" + pokemon_type``.
        pokemon_type: Type name used to build the column name. When omitted,
            the module-level ``POKEMON_TYPE_TRAIN`` set by the script entry
            point is used.

    Returns:
        A single-column DataFrame, ``labels[["is" + pokemon_type]]``.

    Raises:
        KeyError: If ``labels`` has no such column.
    """
    if pokemon_type is None:
        # Fall back to the constant read from params.yaml in the script body.
        pokemon_type = POKEMON_TYPE_TRAIN
    # Use the argument, not the global ``pokemon`` frame, so the function
    # works on whatever table the caller passes in.
    return labels[["is" + pokemon_type]]

if __name__ == '__main__':
    # Script entry point: read params, load images and labels, split them,
    # and pickle the full and split arrays into the destination directory.

    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('--params', dest='params', required=True)
    args = args_parser.parse_args()

    with open(args.params) as param_file:
        params = yaml.safe_load(param_file)

    PROJECT_ROOT = find_project_root()

    # These stay module-level names: create_labels() reads POKEMON_TYPE_TRAIN.
    SEED: int = params['base']['seed']
    POKEMON_TYPE_TRAIN: str = params['base']['pokemon_type_train']

    DESTINATION_DIRECTORY: str = params['data_preprocess']['destination_directory']
    MODEL_TEST_SIZE: float = params['train']['test_size']

    output_dir = PROJECT_ROOT / DESTINATION_DIRECTORY

    pokemon = pd.read_csv(output_dir / "pokemon-with-image-paths.csv")

    X = load_training_data(pokemon)
    y = create_labels(pokemon)

    # Stratify on y so train and test keep the same label proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=SEED, test_size=MODEL_TEST_SIZE, stratify=y
    )

    # File name -> object, written in the same order as before.
    artifacts = {
        "X.pckl": X,
        "X_train.pckl": X_train,
        "X_test.pckl": X_test,
        "y.pckl": y,
        "y_train.pckl": y_train,
        "y_test.pckl": y_test,
    }
    for file_name, artifact in artifacts.items():
        # Context manager closes (and flushes) each file as soon as it is written.
        with open(output_dir / file_name, "wb") as out_file:
            pickle.dump(artifact, out_file)
Loading