From 75800d00c36dfadc3b8b76f9f32f70d543393688 Mon Sep 17 00:00:00 2001 From: Drew Minnear Date: Fri, 21 Nov 2025 09:56:32 -0500 Subject: [PATCH 1/2] try removing unnecessary things --- charts/rhods/templates/dsc.yaml | 15 +----------- charts/rhods/templates/dsci.yaml | 21 ----------------- values-global.yaml | 10 ++++---- values-hub.yaml => values-prod.yaml | 36 +---------------------------- values-secret.yaml.template | 3 --- 5 files changed, 6 insertions(+), 79 deletions(-) delete mode 100644 charts/rhods/templates/dsci.yaml rename values-hub.yaml => values-prod.yaml (59%) delete mode 100644 values-secret.yaml.template diff --git a/charts/rhods/templates/dsc.yaml b/charts/rhods/templates/dsc.yaml index 957b186..29d456d 100644 --- a/charts/rhods/templates/dsc.yaml +++ b/charts/rhods/templates/dsc.yaml @@ -4,15 +4,13 @@ metadata: name: default-dsc spec: components: - codeflare: - managementState: Removed dashboard: managementState: Managed datasciencepipelines: managementState: Managed kserve: managementState: Managed - defaultDeploymentMode: Serverless + defaultDeploymentMode: RawDeployment rawDeploymentServiceConfig: Headed serving: ingressGateway: @@ -21,18 +19,7 @@ spec: type: OpenshiftDefaultIngress managementState: Managed name: knative-serving - kueue: - managementState: Removed modelmeshserving: managementState: Managed - ray: - managementState: Removed - trainingoperator: - managementState: Removed - trustyai: - managementState: Removed workbenches: managementState: Managed - modelregistry: - managementState: Managed - registriesNamespace: rhoai-model-registries diff --git a/charts/rhods/templates/dsci.yaml b/charts/rhods/templates/dsci.yaml deleted file mode 100644 index a77d1d4..0000000 --- a/charts/rhods/templates/dsci.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: dscinitialization.opendatahub.io/v1 -kind: DSCInitialization -metadata: - name: default-dsci -spec: - applicationsNamespace: redhat-ods-applications - monitoring: - managementState: Managed - namespace: redhat-ods-monitoring - serviceMesh: - auth: - audiences: - - 'https://kubernetes.default.svc' - controlPlane: - metricsCollection: Istio - name: data-science-smcp - namespace: istio-system - managementState: Managed - trustedCABundle: - customCABundle: '' - managementState: Managed diff --git a/values-global.yaml b/values-global.yaml index 81aefe9..f705917 100644 --- a/values-global.yaml +++ b/values-global.yaml @@ -1,12 +1,10 @@ ---- global: pattern: mlops - options: - useCSV: false - syncPolicy: Automatic - installPlanApproval: Automatic + secretLoader: + disabled: true + main: - clusterGroupName: hub + clusterGroupName: prod multiSourceConfig: enabled: true clusterGroupChartVersion: "0.9.*" diff --git a/values-hub.yaml b/values-prod.yaml similarity index 59% rename from values-hub.yaml rename to values-prod.yaml index 27eee4c..b531791 100644 --- a/values-hub.yaml +++ b/values-prod.yaml @@ -1,10 +1,8 @@ clusterGroup: - name: hub + name: prod isHubCluster: true namespaces: - - vault - - golang-external-secrets - inferencing-app - fraud-detection: labels: @@ -17,68 +15,36 @@ clusterGroup: operatorGroup: true targetNamespaces: [] - projects: - - hub - - rhods - - inferencing-app - - fraud-detection - subscriptions: rhods: name: rhods-operator namespace: redhat-ods-operator - channel: stable servicemesh: name: servicemeshoperator namespace: openshift-operators - channel: stable serverless: name: serverless-operator namespace: openshift-serverless - channel: stable - - authorino: - name: authorino-operator - namespace: openshift-operators - channel: stable applications: - vault: - name: vault - namespace: vault - project: hub - chart: hashicorp-vault - chartVersion: 0.1.* - - golang-external-secrets: - name: golang-external-secrets - namespace: golang-external-secrets - project: hub - chart: golang-external-secrets - chartVersion: 0.1.* - rhods: name: rhods namespace: redhat-ods-operator - project: rhods path: charts/rhods inferencing-app: name: inferencing-app namespace: inferencing-app - project: inferencing-app path: charts/inferencing-app minio: name: minio-storage namespace: fraud-detection - project: fraud-detection path: charts/minio fraud-detection: name: fraud-detection namespace: fraud-detection - project: fraud-detection path: charts/fraud-detection diff --git a/values-secret.yaml.template b/values-secret.yaml.template deleted file mode 100644 index 751f784..0000000 --- a/values-secret.yaml.template +++ /dev/null @@ -1,3 +0,0 @@ -version: "2.0" - -secrets: From d0b5019a750bfd0eacbae7c7a2c1948f6f0db09a Mon Sep 17 00:00:00 2001 From: Drew Minnear Date: Fri, 21 Nov 2025 11:11:14 -0500 Subject: [PATCH 2/2] improve model --- .gitignore | 1 + charts/rhods/templates/dsci.yaml | 21 ++ src/kubeflow-pipelines/environment.yaml | 83 ++++++- src/kubeflow-pipelines/requirements.in | 8 + src/kubeflow-pipelines/requirements.txt | 215 ++++++++++++++++++ .../small-model/train_upload_model.py | 78 +++++-- .../small-model/train_upload_model.yaml | 112 +++++---- 7 files changed, 447 insertions(+), 71 deletions(-) create mode 100644 .gitignore create mode 100644 charts/rhods/templates/dsci.yaml create mode 100644 src/kubeflow-pipelines/requirements.in create mode 100644 src/kubeflow-pipelines/requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/charts/rhods/templates/dsci.yaml b/charts/rhods/templates/dsci.yaml new file mode 100644 index 0000000..a77d1d4 --- /dev/null +++ b/charts/rhods/templates/dsci.yaml @@ -0,0 +1,21 @@ +apiVersion: dscinitialization.opendatahub.io/v1 +kind: DSCInitialization +metadata: + name: default-dsci +spec: + applicationsNamespace: redhat-ods-applications + monitoring: + managementState: Managed + namespace: redhat-ods-monitoring + serviceMesh: + auth: + audiences: + - 'https://kubernetes.default.svc' + controlPlane: + metricsCollection: Istio + name: data-science-smcp + namespace: istio-system + managementState: Managed + trustedCABundle: + customCABundle: '' + managementState: Managed diff --git a/src/kubeflow-pipelines/environment.yaml b/src/kubeflow-pipelines/environment.yaml index 53f8bd1..5a82fa2 100644 --- a/src/kubeflow-pipelines/environment.yaml +++ b/src/kubeflow-pipelines/environment.yaml @@ -3,14 +3,79 @@ channels: - conda-forge - defaults dependencies: - - python=3.11 - - kfp + - python=3.12 + - ipykernel - pip + - pip-tools - pip: - - kfp-kubernetes - - numpy==2.2.5 - - pandas==2.2.3 - - torch==2.6.0 - - scikit-learn==1.6.1 - - boto3==1.37.38 - - botocore==1.37.38 + - boto3==1.41.1 + - botocore==1.41.1 + - cachetools==6.2.2 + - certifi==2025.11.12 + - charset-normalizer==3.4.4 + - click==8.1.8 + - click-option-group==0.5.7 + - docstring-parser==0.17.0 + - filelock==3.20.0 + - fsspec==2025.10.0 + - google-api-core==2.28.1 + - google-auth==2.43.0 + - google-cloud-core==2.5.0 + - google-cloud-storage==3.6.0 + - google-crc32c==1.7.1 + - google-resumable-media==2.8.0 + - googleapis-common-protos==1.72.0 + - idna==3.11 + - jinja2==3.1.6 + - jmespath==1.0.1 + - joblib==1.5.2 + - kfp==2.14.6 + - kfp-kubernetes==2.14.6 + - kfp-pipeline-spec==2.14.6 + - kfp-server-api==2.14.6 + - kubernetes==30.1.0 + - markupsafe==3.0.3 + - mpmath==1.3.0 + - networkx==3.5 + - numpy==2.3.5 + - nvidia-cublas-cu12==12.8.4.1 + - nvidia-cuda-cupti-cu12==12.8.90 + - nvidia-cuda-nvrtc-cu12==12.8.93 + - nvidia-cuda-runtime-cu12==12.8.90 + - nvidia-cudnn-cu12==9.10.2.21 + - nvidia-cufft-cu12==11.3.3.83 + - nvidia-cufile-cu12==1.13.1.3 + - nvidia-curand-cu12==10.3.9.90 + - nvidia-cusolver-cu12==11.7.3.90 + - nvidia-cusparse-cu12==12.5.8.93 + - nvidia-cusparselt-cu12==0.7.1 + - nvidia-nccl-cu12==2.27.5 + - nvidia-nvjitlink-cu12==12.8.93 + - nvidia-nvshmem-cu12==3.3.20 + - nvidia-nvtx-cu12==12.8.90 + - oauthlib==3.3.1 + - pandas==2.3.3 + - proto-plus==1.26.1 + - protobuf==6.33.1 + - pyasn1==0.6.1 + - pyasn1-modules==0.4.2 + - python-dateutil==2.9.0.post0 + - pytz==2025.2 + - pyyaml==6.0.3 + - requests==2.32.5 + - requests-oauthlib==2.0.0 + - requests-toolbelt==1.0.0 + - rsa==4.9.1 + - s3transfer==0.15.0 + - scikit-learn==1.7.2 + - scipy==1.16.3 + - six==1.17.0 + - sympy==1.14.0 + - tabulate==0.9.0 + - threadpoolctl==3.6.0 + - torch==2.9.1 + - triton==3.5.1 + - typing-extensions==4.15.0 + - tzdata==2025.2 + - urllib3==2.5.0 + - websocket-client==1.9.0 diff --git a/src/kubeflow-pipelines/requirements.in b/src/kubeflow-pipelines/requirements.in new file mode 100644 index 0000000..9f2f6be --- /dev/null +++ b/src/kubeflow-pipelines/requirements.in @@ -0,0 +1,8 @@ +kfp +kfp-kubernetes +numpy +pandas +torch +scikit-learn +boto3 +botocore diff --git a/src/kubeflow-pipelines/requirements.txt b/src/kubeflow-pipelines/requirements.txt new file mode 100644 index 0000000..bfebfc0 --- /dev/null +++ b/src/kubeflow-pipelines/requirements.txt @@ -0,0 +1,215 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile +# +boto3==1.41.1 + # via -r requirements.in +botocore==1.41.1 + # via + # -r requirements.in + # boto3 + # s3transfer +cachetools==6.2.2 + # via google-auth +certifi==2025.11.12 + # via + # kfp-server-api + # kubernetes + # requests +charset-normalizer==3.4.4 + # via requests +click==8.1.8 + # via + # click-option-group + # kfp +click-option-group==0.5.7 + # via kfp +docstring-parser==0.17.0 + # via kfp +filelock==3.20.0 + # via torch +fsspec==2025.10.0 + # via torch +google-api-core==2.28.1 + # via + # google-cloud-core + # google-cloud-storage + # kfp +google-auth==2.43.0 + # via + # google-api-core + # google-cloud-core + # google-cloud-storage + # kfp + # kubernetes +google-cloud-core==2.5.0 + # via google-cloud-storage +google-cloud-storage==3.6.0 + # via kfp +google-crc32c==1.7.1 + # via + # google-cloud-storage + # google-resumable-media +google-resumable-media==2.8.0 + # via google-cloud-storage +googleapis-common-protos==1.72.0 + # via google-api-core +idna==3.11 + # via requests +jinja2==3.1.6 + # via torch +jmespath==1.0.1 + # via + # boto3 + # botocore +joblib==1.5.2 + # via scikit-learn +kfp==2.14.6 + # via + # -r requirements.in + # kfp-kubernetes +kfp-kubernetes==2.14.6 + # via -r requirements.in +kfp-pipeline-spec==2.14.6 + # via kfp +kfp-server-api==2.14.6 + # via kfp +kubernetes==30.1.0 + # via kfp +markupsafe==3.0.3 + # via jinja2 +mpmath==1.3.0 + # via sympy +networkx==3.5 + # via torch +numpy==2.3.5 + # via + # -r requirements.in + # pandas + # scikit-learn + # scipy +nvidia-cublas-cu12==12.8.4.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.8.90 + # via torch +nvidia-cuda-nvrtc-cu12==12.8.93 + # via torch +nvidia-cuda-runtime-cu12==12.8.90 + # via torch +nvidia-cudnn-cu12==9.10.2.21 + # via torch +nvidia-cufft-cu12==11.3.3.83 + # via torch +nvidia-cufile-cu12==1.13.1.3 + # via torch +nvidia-curand-cu12==10.3.9.90 + # via torch +nvidia-cusolver-cu12==11.7.3.90 + # via torch +nvidia-cusparse-cu12==12.5.8.93 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.7.1 + # via torch +nvidia-nccl-cu12==2.27.5 + # via torch +nvidia-nvjitlink-cu12==12.8.93 + # via + # nvidia-cufft-cu12 + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvshmem-cu12==3.3.20 + # via torch +nvidia-nvtx-cu12==12.8.90 + # via torch +oauthlib==3.3.1 + # via + # kubernetes + # requests-oauthlib +pandas==2.3.3 + # via -r requirements.in +proto-plus==1.26.1 + # via google-api-core +protobuf==6.33.1 + # via + # google-api-core + # googleapis-common-protos + # kfp + # kfp-kubernetes + # kfp-pipeline-spec + # proto-plus +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +python-dateutil==2.9.0.post0 + # via + # botocore + # kfp-server-api + # kubernetes + # pandas +pytz==2025.2 + # via pandas +pyyaml==6.0.3 + # via + # kfp + # kubernetes +requests==2.32.5 + # via + # google-api-core + # google-cloud-storage + # kubernetes + # requests-oauthlib + # requests-toolbelt +requests-oauthlib==2.0.0 + # via kubernetes +requests-toolbelt==1.0.0 + # via kfp +rsa==4.9.1 + # via google-auth +s3transfer==0.15.0 + # via boto3 +scikit-learn==1.7.2 + # via -r requirements.in +scipy==1.16.3 + # via scikit-learn +six==1.17.0 + # via + # kfp-server-api + # kubernetes + # python-dateutil +sympy==1.14.0 + # via torch +tabulate==0.9.0 + # via kfp +threadpoolctl==3.6.0 + # via scikit-learn +torch==2.9.1 + # via -r requirements.in +triton==3.5.1 + # via torch +typing-extensions==4.15.0 + # via torch +tzdata==2025.2 + # via pandas +urllib3==2.5.0 + # via + # botocore + # kfp + # kfp-server-api + # kubernetes + # requests +websocket-client==1.9.0 + # via kubernetes + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/src/kubeflow-pipelines/small-model/train_upload_model.py b/src/kubeflow-pipelines/small-model/train_upload_model.py index 460fb61..d26314b 100644 --- a/src/kubeflow-pipelines/small-model/train_upload_model.py +++ b/src/kubeflow-pipelines/small-model/train_upload_model.py @@ -5,7 +5,7 @@ @dsl.component( - base_image="quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c" + base_image="quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554" ) def get_data( train_data_output_path: OutputPath(), validate_data_output_path: OutputPath() @@ -24,7 +24,7 @@ def get_data( @dsl.component( - base_image="quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c", + base_image="quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554", ) def train_model( train_data_input_path: InputPath(), @@ -40,6 +40,7 @@ def train_model( import torch.nn as nn from sklearn.preprocessing import StandardScaler from sklearn.utils import class_weight + from torch.utils.data import DataLoader, TensorDataset torch.set_default_dtype(torch.float32) @@ -92,25 +93,53 @@ def forward(self, x): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = FraudNetMedium(len(feature_cols)).to(device) - X_train_t = torch.tensor(X_train, device=device) - y_train_t = torch.tensor(y_train, device=device) + # Create data loaders for mini-batch training + train_dataset = TensorDataset( + torch.tensor(X_train, dtype=torch.float32), + torch.tensor(y_train, dtype=torch.float32), + ) + train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True) + X_val_t = torch.tensor(X_val, device=device) y_val_t = torch.tensor(y_val, device=device) - sample_weights = (y_train_t * (pos_weight[0] - 1) + 1).flatten() - criterion = nn.BCELoss(weight=sample_weights) - optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + # Use weighted BCELoss without pre-computed sample weights + criterion = nn.BCELoss(reduction="none") + optimizer = torch.optim.Adam(model.parameters(), lr=3e-4) # Lower learning rate - y_train_flat = y_train_t.flatten() + # Early stopping parameters + best_val_loss = float("inf") + best_model_state = None + patience = 5 + patience_counter = 0 + min_epochs = 10 + max_epochs = 30 - for epoch in range(3): + for epoch in range(max_epochs): model.train() - optimizer.zero_grad() - preds = model(X_train_t).flatten() - loss = criterion(preds, y_train_flat) - loss.backward() - optimizer.step() + train_loss = 0.0 + num_batches = 0 + + for batch_X, batch_y in train_loader: + batch_X = batch_X.to(device) + batch_y = batch_y.to(device) + + optimizer.zero_grad() + preds = model(batch_X).flatten() + + # Apply class weights per sample + batch_weights = batch_y.flatten() * (pos_weight[0] - 1) + 1 + loss = (criterion(preds, batch_y.flatten()) * batch_weights).mean() + + loss.backward() + optimizer.step() + train_loss += loss.item() + num_batches += 1 + + train_loss /= num_batches + + # Validation model.eval() with torch.no_grad(): val_preds = model(X_val_t).flatten() @@ -118,9 +147,26 @@ def forward(self, x): val_acc = ((val_preds > 0.5).float() == y_val_t.flatten()).float().mean() print( - f"Epoch {epoch + 1}: train loss {loss.item():.4f} | val loss {val_loss.item():.4f} | val acc {val_acc.item():.4f}" + f"Epoch {epoch + 1}: train loss {train_loss:.4f} | val loss {val_loss.item():.4f} | val acc {val_acc.item():.4f}" ) + # Early stopping logic + if val_loss < best_val_loss: + best_val_loss = val_loss + best_model_state = model.state_dict().copy() + patience_counter = 0 + else: + patience_counter += 1 + + if epoch >= min_epochs and patience_counter >= patience: + print(f"Early stopping at epoch {epoch + 1}") + break + + # Restore best model + if best_model_state is not None: + model.load_state_dict(best_model_state) + print(f"Restored best model with val loss {best_val_loss:.4f}") + dummy = torch.randn(1, len(feature_cols), dtype=torch.float32) torch.onnx.export( model.cpu(), @@ -134,7 +180,7 @@ def forward(self, x): @dsl.component( - base_image="quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c", + base_image="quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554", ) def upload_model(input_model_path: InputPath()): import os diff --git a/src/kubeflow-pipelines/small-model/train_upload_model.yaml b/src/kubeflow-pipelines/small-model/train_upload_model.yaml index 852233e..6547634 100644 --- a/src/kubeflow-pipelines/small-model/train_upload_model.yaml +++ b/src/kubeflow-pipelines/small-model/train_upload_model.yaml @@ -53,7 +53,7 @@ deploymentSpec: - -c - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.13.0'\ + \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.6'\ \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ $0\" \"$@\"\n" - sh @@ -75,7 +75,7 @@ deploymentSpec: \ url = \"https://raw.githubusercontent.com/rh-aiservices-bu/fraud-detection/main/data/validate.csv\"\ \n urllib.request.urlretrieve(url, validate_data_output_path)\n print(\"\ validation data downloaded\")\n\n" - image: quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c + image: quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554 exec-train-model: container: args: @@ -88,7 +88,7 @@ deploymentSpec: - -c - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.13.0'\ + \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.6'\ \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ $0\" \"$@\"\n" - sh @@ -106,50 +106,70 @@ deploymentSpec: \ InputPath(),\n model_output_path: OutputPath(),\n):\n import pickle\n\ \ from pathlib import Path\n\n import numpy as np\n import pandas\ \ as pd\n import torch\n import torch.nn as nn\n from sklearn.preprocessing\ - \ import StandardScaler\n from sklearn.utils import class_weight\n\n\ - \ torch.set_default_dtype(torch.float32)\n\n feature_cols = list(range(7))\n\ - \ label_col = 7\n\n df_train = pd.read_csv(train_data_input_path)\n\ - \ df_val = pd.read_csv(validate_data_input_path)\n\n X_train = df_train.iloc[:,\ - \ feature_cols].values\n y_train = df_train.iloc[:, label_col].values.reshape(-1,\ - \ 1)\n\n X_val = df_val.iloc[:, feature_cols].values\n y_val = df_val.iloc[:,\ - \ label_col].values.reshape(-1, 1)\n\n scaler = StandardScaler()\n \ - \ X_train = scaler.fit_transform(X_train).astype(\"float32\")\n X_val\ - \ = scaler.transform(X_val).astype(\"float32\")\n y_train = y_train.astype(\"\ - float32\")\n y_val = y_val.astype(\"float32\")\n\n Path(\"artifact\"\ - ).mkdir(parents=True, exist_ok=True)\n pickle.dump(scaler, open(\"artifact/scaler.pkl\"\ - , \"wb\"))\n\n cw = class_weight.compute_class_weight(\n \"balanced\"\ - , classes=np.unique(y_train), y=y_train.ravel()\n )\n pos_weight =\ - \ torch.tensor([cw[1] / cw[0]], dtype=torch.float32)\n\n class FraudNetMedium(nn.Module):\n\ - \ def __init__(self, input_dim):\n super().__init__()\n\ - \ self.net = nn.Sequential(\n nn.Linear(input_dim,\ - \ 128),\n nn.ReLU(),\n nn.Dropout(0.2),\n\ - \ nn.Linear(128, 128),\n nn.ReLU(),\n \ - \ nn.Dropout(0.2),\n nn.Linear(128, 64),\n \ - \ nn.ReLU(),\n nn.Dropout(0.2),\n \ - \ nn.Linear(64, 1),\n nn.Sigmoid(),\n )\n\n\ - \ def forward(self, x):\n return self.net(x)\n\n device\ - \ = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n \ - \ model = FraudNetMedium(len(feature_cols)).to(device)\n\n X_train_t\ - \ = torch.tensor(X_train, device=device)\n y_train_t = torch.tensor(y_train,\ - \ device=device)\n X_val_t = torch.tensor(X_val, device=device)\n \ - \ y_val_t = torch.tensor(y_val, device=device)\n\n sample_weights = (y_train_t\ - \ * (pos_weight[0] - 1) + 1).flatten()\n criterion = nn.BCELoss(weight=sample_weights)\n\ - \ optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n\n y_train_flat\ - \ = y_train_t.flatten()\n\n for epoch in range(3):\n model.train()\n\ - \ optimizer.zero_grad()\n preds = model(X_train_t).flatten()\n\ - \ loss = criterion(preds, y_train_flat)\n loss.backward()\n\ - \ optimizer.step()\n\n model.eval()\n with torch.no_grad():\n\ + \ import StandardScaler\n from sklearn.utils import class_weight\n \ + \ from torch.utils.data import DataLoader, TensorDataset\n\n torch.set_default_dtype(torch.float32)\n\ + \n feature_cols = list(range(7))\n label_col = 7\n\n df_train =\ + \ pd.read_csv(train_data_input_path)\n df_val = pd.read_csv(validate_data_input_path)\n\ + \n X_train = df_train.iloc[:, feature_cols].values\n y_train = df_train.iloc[:,\ + \ label_col].values.reshape(-1, 1)\n\n X_val = df_val.iloc[:, feature_cols].values\n\ + \ y_val = df_val.iloc[:, label_col].values.reshape(-1, 1)\n\n scaler\ + \ = StandardScaler()\n X_train = scaler.fit_transform(X_train).astype(\"\ + float32\")\n X_val = scaler.transform(X_val).astype(\"float32\")\n \ + \ y_train = y_train.astype(\"float32\")\n y_val = y_val.astype(\"float32\"\ + )\n\n Path(\"artifact\").mkdir(parents=True, exist_ok=True)\n pickle.dump(scaler,\ + \ open(\"artifact/scaler.pkl\", \"wb\"))\n\n cw = class_weight.compute_class_weight(\n\ + \ \"balanced\", classes=np.unique(y_train), y=y_train.ravel()\n \ + \ )\n pos_weight = torch.tensor([cw[1] / cw[0]], dtype=torch.float32)\n\ + \n class FraudNetMedium(nn.Module):\n def __init__(self, input_dim):\n\ + \ super().__init__()\n self.net = nn.Sequential(\n\ + \ nn.Linear(input_dim, 128),\n nn.ReLU(),\n\ + \ nn.Dropout(0.2),\n nn.Linear(128, 128),\n\ + \ nn.ReLU(),\n nn.Dropout(0.2),\n \ + \ nn.Linear(128, 64),\n nn.ReLU(),\n \ + \ nn.Dropout(0.2),\n nn.Linear(64, 1),\n \ + \ nn.Sigmoid(),\n )\n\n def forward(self, x):\n \ + \ return self.net(x)\n\n device = torch.device(\"cuda\" if torch.cuda.is_available()\ + \ else \"cpu\")\n model = FraudNetMedium(len(feature_cols)).to(device)\n\ + \n # Create data loaders for mini-batch training\n train_dataset =\ + \ TensorDataset(\n torch.tensor(X_train, dtype=torch.float32),\n\ + \ torch.tensor(y_train, dtype=torch.float32),\n )\n train_loader\ + \ = DataLoader(train_dataset, batch_size=128, shuffle=True)\n\n X_val_t\ + \ = torch.tensor(X_val, device=device)\n y_val_t = torch.tensor(y_val,\ + \ device=device)\n\n # Use weighted BCELoss without pre-computed sample\ + \ weights\n criterion = nn.BCELoss(reduction=\"none\")\n optimizer\ + \ = torch.optim.Adam(model.parameters(), lr=3e-4) # Lower learning rate\n\ + \n # Early stopping parameters\n best_val_loss = float(\"inf\")\n\ + \ best_model_state = None\n patience = 5\n patience_counter = 0\n\ + \ min_epochs = 10\n max_epochs = 30\n\n for epoch in range(max_epochs):\n\ + \ model.train()\n train_loss = 0.0\n num_batches =\ + \ 0\n\n for batch_X, batch_y in train_loader:\n batch_X\ + \ = batch_X.to(device)\n batch_y = batch_y.to(device)\n\n \ + \ optimizer.zero_grad()\n preds = model(batch_X).flatten()\n\ + \n # Apply class weights per sample\n batch_weights\ + \ = batch_y.flatten() * (pos_weight[0] - 1) + 1\n loss = (criterion(preds,\ + \ batch_y.flatten()) * batch_weights).mean()\n\n loss.backward()\n\ + \ optimizer.step()\n\n train_loss += loss.item()\n\ + \ num_batches += 1\n\n train_loss /= num_batches\n\n \ + \ # Validation\n model.eval()\n with torch.no_grad():\n\ \ val_preds = model(X_val_t).flatten()\n val_loss\ \ = nn.BCELoss()(val_preds, y_val_t.flatten())\n val_acc = ((val_preds\ \ > 0.5).float() == y_val_t.flatten()).float().mean()\n\n print(\n\ - \ f\"Epoch {epoch + 1}: train loss {loss.item():.4f} | val loss\ + \ f\"Epoch {epoch + 1}: train loss {train_loss:.4f} | val loss\ \ {val_loss.item():.4f} | val acc {val_acc.item():.4f}\"\n )\n\n\ - \ dummy = torch.randn(1, len(feature_cols), dtype=torch.float32)\n \ - \ torch.onnx.export(\n model.cpu(),\n dummy,\n model_output_path,\n\ - \ input_names=[\"dense_input\"],\n output_names=[\"output\"\ - ],\n dynamic_axes={\"dense_input\": {0: \"batch\"}, \"output\": {0:\ - \ \"batch\"}},\n opset_version=13,\n )\n\n" - image: quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c + \ # Early stopping logic\n if val_loss < best_val_loss:\n\ + \ best_val_loss = val_loss\n best_model_state = model.state_dict().copy()\n\ + \ patience_counter = 0\n else:\n patience_counter\ + \ += 1\n\n if epoch >= min_epochs and patience_counter >= patience:\n\ + \ print(f\"Early stopping at epoch {epoch + 1}\")\n \ + \ break\n\n # Restore best model\n if best_model_state is not None:\n\ + \ model.load_state_dict(best_model_state)\n print(f\"Restored\ + \ best model with val loss {best_val_loss:.4f}\")\n\n dummy = torch.randn(1,\ + \ len(feature_cols), dtype=torch.float32)\n torch.onnx.export(\n \ + \ model.cpu(),\n dummy,\n model_output_path,\n input_names=[\"\ + dense_input\"],\n output_names=[\"output\"],\n dynamic_axes={\"\ + dense_input\": {0: \"batch\"}, \"output\": {0: \"batch\"}},\n opset_version=13,\n\ + \ )\n\n" + image: quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554 exec-upload-model: container: args: @@ -162,7 +182,7 @@ deploymentSpec: - -c - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.13.0'\ + \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.6'\ \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ $0\" \"$@\"\n" - sh @@ -192,7 +212,7 @@ deploymentSpec: env: - name: S3_KEY value: models/fraud/1/model.onnx - image: quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c + image: quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554 pipelineInfo: name: train-upload-model root: @@ -240,7 +260,7 @@ root: taskInfo: name: upload-model schemaVersion: 2.1.0 -sdkVersion: kfp-2.13.0 +sdkVersion: kfp-2.14.6 --- platforms: kubernetes: