From 9aa6c6cf33f991d3d4f2921f7d33f1c51fb90707 Mon Sep 17 00:00:00 2001 From: Harish Allakonda Date: Wed, 12 Apr 2023 04:26:10 +0000 Subject: [PATCH 1/3] Updated configurations --- src/workshop/core/scoring/deployment.yml | 2 +- src/workshop/core/scoring/endpoint.yml | 2 +- src/workshop/data/linear_regression.joblib | Bin 5731 -> 5883 bytes 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/workshop/core/scoring/deployment.yml b/src/workshop/core/scoring/deployment.yml index 29c3500c..8f8adb6f 100644 --- a/src/workshop/core/scoring/deployment.yml +++ b/src/workshop/core/scoring/deployment.yml @@ -1,6 +1,6 @@ $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json name: green -endpoint_name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml +endpoint_name: mlops-h1-endpoint-910157 #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml model: azureml:nyc_fare_prediction:1 code_configuration: code: ./ diff --git a/src/workshop/core/scoring/endpoint.yml b/src/workshop/core/scoring/endpoint.yml index 611e0721..6dbd60b1 100644 --- a/src/workshop/core/scoring/endpoint.yml +++ b/src/workshop/core/scoring/endpoint.yml @@ -1,3 +1,3 @@ $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json -name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique +name: mlops-h1-endpoint-910157 #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique auth_mode: key diff --git a/src/workshop/data/linear_regression.joblib b/src/workshop/data/linear_regression.joblib index d6bd059086fc6a30a98af61118b41c2f0c08a9c8..8ec65470a26799b8a02c5a083d4424f1e8dc4d21 100644 GIT binary patch delta 1371 zcmZuwe{54#6z+qr>#uEW-4A4JD{O-;&cdKEQf0Tp%9dSL2oh71!Q*wk>wRhaUf=60 zWB8+}Y+=wPu=}PYj3}E4sL@CmMuPZ^iO~dg2quK7Q8I|h47h=02ngQ334-zdaZkSU zeJAIXNhtM$!eN;TE$-6h`q#ZbUV|fLNXTbVmVpn z=pE(5LJSEC8|+e0LWOfkTW%*)LL_?=S)6q^Wic2GF;P%3OUtBr0XxMZS}1m~dRi(C z*#1p*A{3G4>lrL2<&27or82#&T}%p!+{LzwAtaRtOtcKkY=Rf0ihH14Oh`^0gt~O* zi3RsihFTaf>%A1D@G&G+1_~(@L^(!3V;gD~qde{A=O|T04w@XE>VTCB;xcOI;~O#+ zi}cYH6pJI7Q<5^07VDWAi4VmUEY%=IswG!UWpoSnY;RK`EP}-_2j;>&I0xY+8`TwSX?SE;N zX(L%V&p(_#pR&uL(aU+yGXE%M{jY+Y&uLyh zkFL6vlNDZYu1P8w$xg@%VP}i%a{8zT3Iawd@QVVH9CX8lLDxWL7UzR=bdu8r0p?{= z>qs!m$N@){r!*0XB2E@uQC^^tdM<>bE>`A4VMHCiUXMhreMNmqtx0Eb(KzPRmbZEs z1G(XFZ%CVC+*X!X=x^Xp&sM~!+62}GThC0rb31KHh|w*Szzu10Tukr^zXhqiX}in` zAu+}#6iz{^AMzgJA+_O{ibQp@#A0; z_I+=e*r**2jBDlYelUFL!0(a1*ReMdd9K! 
zi{Cuj_}YCrk*FE;|K%F_b=0d(NNp$16<28Me(9@Hvuql^v-k2kcO^)j_~eC-O|92H z-F~qmR{Cy@HaO99I`FL*q|leSw!W2M^nLXD{?Fao?$J|`iQQ)!TJmo@i#}QfCc)~F z+N&NgHn-PP_93I~ug@Jfi1pfM9TQzY#p^)*(zQd=HWxU1zWk>7(p)f>H{=^P1K^pS z-3Q)VseQ0uLu$Bx$uruO6?^RY@m1P>^<>rb*-EgY_wKRqg(YD8YFyZ1d_>!KO^=}4 zbRjZ(3_djwr0&a{bhe-yf>r&*UhJ;)j4a==_mIo0_NePs?8FO_!5$UiN4?j5_m181?)G+X4Xq+H_01*%}#Q;r>j!x9x5z5;g zP)@)(cc7o2pTo5q8hI|xAoqc7tX-hFc!NShHt#!SciFfEL$HL+#!8B{P(GSVG^iG+ z3JCXeNj`>VUA)&NnYkst&rGb#F7TC^*8B^0<-XfiNS>X%Bc2hm_Lhyh_B z94rEANIzXHYJeY50u_h^QIIb3VgkrO00;zf5Cjw;7=(aO5CL5t6=kO^(3j*CK`eSx z{<;j(^il72pc6qugb5u~=!sHvNwHi;*Sbp4jH2KPzeWdItK=4(D^)Fo2HjSr{hQj1 zx`MY72DCV2_X0f?vX5ATs72ACfi)sc(IwI#JPw=;t@=ky`j7a9Dts3)r=nwO<@_Ie z{*QVNaS={JhsGjOo{w|xw@PDV*IF67RHYd)L#vi5H9Dc_3E%1?cfv9rI-Lw_7m=8D zNPJb)pxnp|H@&0K6Xal>jKGvb@hHszohQJ=LJK9*FU|SrJ@j6>80a3a_XzE6Q`VLb z|7u^qjr=3~)AZs2rGC6>*s-ViBL2!hu}hcs9`yiQ$MyeT*FF2R^gjuy+s&zqvm_$?1``0>B&BPYFGK;yx@Mk z?)!9wZejx0#?F@QZ5<^`W{Shc2Rleb>!><=-!$3%)@AdV7YFc3*mXbobSJs4Hn-!3 zb%uN;rJE{k_?ZkkEjG`NKOj4LsyDoG=PoJwH+`PO+{ec^u6c01_a=U~vwBBbW*=GK z5${gF>i-LFNw{?3WL+*|M&(Gk@jUR1~`KA-EPg=nWXGh7`>axC69R3aOjGxWV zyfcGkeA};0MJbo(Yok> Date: Wed, 12 Apr 2023 04:58:49 +0000 Subject: [PATCH 2/3] configurations update --- .github/workflows/workshop_unit_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/workshop_unit_test.yml b/.github/workflows/workshop_unit_test.yml index 3c1382c3..708eab4d 100644 --- a/.github/workflows/workshop_unit_test.yml +++ b/.github/workflows/workshop_unit_test.yml @@ -31,7 +31,7 @@ jobs: - name: Install AZ ML and tools run: | # SETUP line 34 to point to your own AML workspace az extension add -n ml -y --version 2.2.1 - az configure --defaults group=azureml workspace=ws01ent location=westus2 + az configure --defaults group=mlops-rg-910157 workspace=aml910157 location=eastus - name: Run Feature Engineering uses: ./.github/actions/aml-job-create with: From b0fdb6bb5dd1c205110bf0301ffe65f1ac5d0805 Mon Sep 17 00:00:00 2001 From: Harish Allakonda Date: Wed, 12 Apr 2023 07:14:14 +0000 Subject: [PATCH 3/3] Changed alpha from 100000 to 100 for improvement --- src/workshop/core/training/.amlignore | 6 + src/workshop/core/training/.amlignore.amltmp | 6 + src/workshop/core/training/ml_training.py | 2 +- .../core/training/ml_training.py.amltmp | 103 ++++++++++++++++++ .../core/training/ml_training.py.save | 103 ++++++++++++++++++ 5 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 src/workshop/core/training/.amlignore create mode 100644 src/workshop/core/training/.amlignore.amltmp create mode 100644 src/workshop/core/training/ml_training.py.amltmp create mode 100644 src/workshop/core/training/ml_training.py.save diff --git a/src/workshop/core/training/.amlignore b/src/workshop/core/training/.amlignore new file mode 100644 index 00000000..0621f9fc --- /dev/null +++ b/src/workshop/core/training/.amlignore @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. 
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/training/.amlignore.amltmp b/src/workshop/core/training/.amlignore.amltmp new file mode 100644 index 00000000..0621f9fc --- /dev/null +++ b/src/workshop/core/training/.amlignore.amltmp @@ -0,0 +1,6 @@ +## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. +## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots + +.ipynb_aml_checkpoints/ +*.amltmp +*.amltemp \ No newline at end of file diff --git a/src/workshop/core/training/ml_training.py b/src/workshop/core/training/ml_training.py index 6f59dcdd..2f9aaf62 100644 --- a/src/workshop/core/training/ml_training.py +++ b/src/workshop/core/training/ml_training.py @@ -43,7 +43,7 @@ def createClassModel(algo_name, catg, nums): #--------------------------------------------- #setup: Update alpha value #--------------------------------------------- - model = Ridge(alpha=100000) #setup + model = Ridge(alpha=100) #setup elif algo_name == 'random_forest': model = RandomForestRegressor() else: diff --git a/src/workshop/core/training/ml_training.py.amltmp b/src/workshop/core/training/ml_training.py.amltmp new file mode 100644 index 00000000..2f9aaf62 --- /dev/null +++ b/src/workshop/core/training/ml_training.py.amltmp @@ -0,0 +1,103 @@ +import pandas as pd +import numpy as np +import os +import argparse +import mlflow +import mlflow.sklearn +from azureml.core import Run, Dataset,Datastore, Workspace +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Ridge +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.compose import ColumnTransformer +from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error +import joblib +def parse_args(): + # arg parser + parser = argparse.ArgumentParser() + + parser.add_argument("--prep_data", default="data", type=str, help="Path to prepped data, default to local folder") + parser.add_argument("--model_folder", type=str,default="data", help="Path of model ouput folder, default to local folder") + parser.add_argument("--input_file_name", type=str, default="final_df.parquet") + parser.add_argument("--run_mode", type=str, default="local") + + + # parse args + args = parser.parse_args() + + # return args + return args + + +def createClassModel(algo_name, catg, nums): + numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]) + + categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="MISSING")), ('onehot', OneHotEncoder(handle_unknown='ignore'))]) + + preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)]) + + if algo_name == 'linear_regression': + #--------------------------------------------- + #setup: Update alpha value + #--------------------------------------------- + model = Ridge(alpha=100) #setup + elif algo_name == 'random_forest': + model = RandomForestRegressor() + else: + pass + + 
ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), ("model", model)]) + + return ModelPipeline + +def main(args): + + # read in data + final_df = pd.read_parquet(os.path.join(args.prep_data,args.input_file_name)) + catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"] + num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"] + label = ["totalAmount"] + # make sure categorical columns are strings + final_df[catg_cols] = final_df[catg_cols].astype("str") + + # split data + X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222) + + # test 2 algorithms + os.makedirs(args.model_folder, exist_ok=True) + + algorithmname = "linear_regression" + fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline + fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeine + + y_pred = fitPipeline.predict(X_test) # score with fitted pipeline + + # Evaluate + r2 = r2_score(y_test, y_pred) + mape = mean_absolute_percentage_error(y_test, y_pred) + rmse = np.sqrt(mean_squared_error(y_test, y_pred)) + + + joblib.dump(fitPipeline,args.model_folder+"/"+algorithmname+".joblib") + + print("Training finished!. Metrics:") + print(f"R2_{algorithmname}", r2) + print(f"MAPE_{algorithmname}", mape) + print(f"RMSE_{algorithmname}", rmse) + print("Model",args.model_folder+"/"+algorithmname+".joblib","saved!") + + if args.run_mode == 'remote': + mlflow.log_metric(f"R2_{algorithmname}", r2) + mlflow.log_metric(f"MAPE_{algorithmname}", mape) + mlflow.log_metric(f"RMSE_{algorithmname}", rmse) + mlflow.sklearn.log_model(fitPipeline,f"{algorithmname}_model") + +# run script +if __name__ == "__main__": + # parse args + args = parse_args() + # run main function + main(args) \ No newline at end of file diff --git a/src/workshop/core/training/ml_training.py.save b/src/workshop/core/training/ml_training.py.save new file mode 100644 index 00000000..c85b8ad5 --- /dev/null +++ b/src/workshop/core/training/ml_training.py.save @@ -0,0 +1,103 @@ +import pandas as pd +import numpy as np +import os +import argparse +import mlflow +import mlflow.sklearn +from azureml.core import Run, Dataset,Datastore, Workspace +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Ridge +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.compose import ColumnTransformer +from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error +import joblib +def parse_args(): + # arg parser + parser = argparse.ArgumentParser() + + parser.add_argument("--prep_data", default="data", type=str, help="Path to prepped data, default to local folder") + parser.add_argument("--model_folder", type=str,default="data", help="Path of model ouput folder, default to local folder") + parser.add_argument("--input_file_name", type=str, default="final_df.parquet") + parser.add_argument("--run_mode", type=str, default="local") + + + # parse args + args = parser.parse_args() + + # return args + return args + + +def createClassModel(algo_name, catg, nums): + numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]) + + categorical_transformer = 
Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="MISSING")), ('onehot', OneHotEncoder(handle_unknown='ignore'))]) + + preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)]) + + if algo_name == 'linear_regression': + #--------------------------------------------- + #setup: Update alpha value + #--------------------------------------------- + model = Ridge(alpha=100) #setup + elif algo_name == 'random_forest': + model = RandomForestRegressor() + else: + pass + + ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), ("model", model)]) + + return ModelPipeline + +def main(args): + + # read in data + final_df = pd.read_parquet(os.path.join(args.prep_data,args.input_file_name)) + catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"] + num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"] + label = ["totalAmount"] + # make sure categorical columns are strings + final_df[catg_cols] = final_df[catg_cols].astype("str") + + # split data + X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222) + + # test 2 algorithms + os.makedirs(args.model_folder, exist_ok=True) + + algorithmname = "linear_regression" + fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline + fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeine + + y_pred = fitPipeline.predict(X_test) # score with fitted pipeline + + # Evaluate + r2 = r2_score(y_test, y_pred) + mape = mean_absolute_percentage_error(y_test, y_pred) + rmse = np.sqrt(mean_squared_error(y_test, y_pred)) + + + joblib.dump(fitPipeline,args.model_folder+"/"+algorithmname+".joblib") + + print("Training finished!. Metrics:") + print(f"R2_{algorithmname}", r2) + print(f"MAPE_{algorithmname}", mape) + print(f"RMSE_{algorithmname}", rmse) + print("Model",args.model_folder+"/"+algorithmname+".joblib","saved!") + + if args.run_mode == 'remote': + mlflow.log_metric(f"R2_{algorithmname}", r2) + mlflow.log_metric(f"MAPE_{algorithmname}", mape) + mlflow.log_metric(f"RMSE_{algorithmname}", rmse) + mlflow.sklearn.log_model(fitPipeline,f"{algorithmname}_model") + +# run script +if __name__ == "__main__": + # parse args + args = parse_args() + # run main function + main(args)
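
Note on patches 1 and 2: the endpoint rename in endpoint.yml/deployment.yml and the new workflow defaults can be sanity-checked together from the Azure ML Python SDK (azure-ai-ml). The sketch below is illustrative and not part of the patch series; the resource group, workspace, and endpoint names come from the diffs above, while the subscription ID is a placeholder.

# Minimal sketch, assuming azure-ai-ml and azure-identity are installed and the
# caller is authenticated; <subscription-id> is a placeholder, not a value from the patches.
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group_name="mlops-rg-910157",   # default group set in workshop_unit_test.yml
    workspace_name="aml910157",              # default workspace set in workshop_unit_test.yml
)

# Endpoint name from endpoint.yml / deployment.yml (patch 1); the name must be globally unique.
endpoint = ml_client.online_endpoints.get(name="mlops-h1-endpoint-910157")
print(endpoint.name, endpoint.provisioning_state, endpoint.scoring_uri)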
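
Note on patch 3: the Ridge alpha drops from 100000 to 100. One way to check that choice, rather than hard-coding it, is to cross-validate a few candidate values with the same preprocessing the script already uses. The sketch below is illustrative only; the local path to final_df.parquet and the use of GridSearchCV are assumptions, not part of ml_training.py.

# Cross-validation sketch (assumption: prepped data at data/final_df.parquet,
# matching the script's local defaults). Compares the old alpha (100000), the
# new alpha (100), and a few neighbours on cross-validated RMSE.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"]
num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature",
            "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]

final_df = pd.read_parquet("data/final_df.parquet")
final_df[catg_cols] = final_df[catg_cols].astype("str")
X, y = final_df.drop(columns=["totalAmount"]), final_df["totalAmount"]

# Same preprocessing structure as createClassModel in ml_training.py.
preprocessor = ColumnTransformer(transformers=[
    ("num", SimpleImputer(strategy="constant", fill_value=0), num_cols),
    ("cat", Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]), catg_cols),
])
pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", Ridge())])

search = GridSearchCV(
    pipe,
    param_grid={"model__alpha": [1, 10, 100, 1000, 100000]},
    scoring="neg_root_mean_squared_error",
    cv=5,
)
search.fit(X, y)
print("best alpha:", search.best_params_["model__alpha"], "CV RMSE:", -search.best_score_)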