From 75800d00c36dfadc3b8b76f9f32f70d543393688 Mon Sep 17 00:00:00 2001
From: Drew Minnear <dminnear@redhat.com>
Date: Fri, 21 Nov 2025 09:56:32 -0500
Subject: [PATCH 1/2] Remove unused DSC components and Vault/ESO; rename hub to prod

---
 charts/rhods/templates/dsc.yaml     | 15 +-----------
 charts/rhods/templates/dsci.yaml    | 21 -----------------
 values-global.yaml                  | 10 ++++----
 values-hub.yaml => values-prod.yaml | 36 +----------------------------
 values-secret.yaml.template         |  3 ---
 5 files changed, 6 insertions(+), 79 deletions(-)
 delete mode 100644 charts/rhods/templates/dsci.yaml
 rename values-hub.yaml => values-prod.yaml (59%)
 delete mode 100644 values-secret.yaml.template

diff --git a/charts/rhods/templates/dsc.yaml b/charts/rhods/templates/dsc.yaml
index 957b186..29d456d 100644
--- a/charts/rhods/templates/dsc.yaml
+++ b/charts/rhods/templates/dsc.yaml
@@ -4,15 +4,13 @@ metadata:
   name: default-dsc
 spec:
   components:
-    codeflare:
-      managementState: Removed
     dashboard:
       managementState: Managed
     datasciencepipelines:
       managementState: Managed
     kserve:
       managementState: Managed
-      defaultDeploymentMode: Serverless
+      defaultDeploymentMode: RawDeployment
       rawDeploymentServiceConfig: Headed
       serving:
         ingressGateway:
@@ -21,18 +19,7 @@ spec:
             type: OpenshiftDefaultIngress
         managementState: Managed
         name: knative-serving
-    kueue:
-      managementState: Removed
     modelmeshserving:
       managementState: Managed
-    ray:
-      managementState: Removed
-    trainingoperator:
-      managementState: Removed
-    trustyai:
-      managementState: Removed
     workbenches:
       managementState: Managed
-    modelregistry:
-      managementState: Managed
-      registriesNamespace: rhoai-model-registries
diff --git a/charts/rhods/templates/dsci.yaml b/charts/rhods/templates/dsci.yaml
deleted file mode 100644
index a77d1d4..0000000
--- a/charts/rhods/templates/dsci.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-apiVersion: dscinitialization.opendatahub.io/v1
-kind: DSCInitialization
-metadata:
-  name: default-dsci
-spec:
-  applicationsNamespace: redhat-ods-applications
-  monitoring:
-    managementState: Managed
-    namespace: redhat-ods-monitoring
-  serviceMesh:
-    auth:
-      audiences:
-        - 'https://kubernetes.default.svc'
-    controlPlane:
-      metricsCollection: Istio
-      name: data-science-smcp
-      namespace: istio-system
-    managementState: Managed
-  trustedCABundle:
-    customCABundle: ''
-    managementState: Managed
diff --git a/values-global.yaml b/values-global.yaml
index 81aefe9..f705917 100644
--- a/values-global.yaml
+++ b/values-global.yaml
@@ -1,12 +1,10 @@
----
 global:
   pattern: mlops
-  options:
-    useCSV: false
-    syncPolicy: Automatic
-    installPlanApproval: Automatic
+  secretLoader:
+    disabled: true
+
 main:
-  clusterGroupName: hub
+  clusterGroupName: prod
   multiSourceConfig:
     enabled: true
     clusterGroupChartVersion: "0.9.*"
diff --git a/values-hub.yaml b/values-prod.yaml
similarity index 59%
rename from values-hub.yaml
rename to values-prod.yaml
index 27eee4c..b531791 100644
--- a/values-hub.yaml
+++ b/values-prod.yaml
@@ -1,10 +1,8 @@
 clusterGroup:
-  name: hub
+  name: prod
   isHubCluster: true
 
   namespaces:
-    - vault
-    - golang-external-secrets
     - inferencing-app
     - fraud-detection:
         labels:
@@ -17,68 +15,36 @@ clusterGroup:
         operatorGroup: true
         targetNamespaces: []
 
-  projects:
-    - hub
-    - rhods
-    - inferencing-app
-    - fraud-detection
-
   subscriptions:
     rhods:
       name: rhods-operator
       namespace: redhat-ods-operator
-      channel: stable
 
     servicemesh:
       name: servicemeshoperator
       namespace: openshift-operators
-      channel: stable
 
     serverless:
       name: serverless-operator
       namespace: openshift-serverless
-      channel: stable
-
-    authorino:
-      name: authorino-operator
-      namespace: openshift-operators
-      channel: stable
 
   applications:
-    vault:
-      name: vault
-      namespace: vault
-      project: hub
-      chart: hashicorp-vault
-      chartVersion: 0.1.*
-
-    golang-external-secrets:
-      name: golang-external-secrets
-      namespace: golang-external-secrets
-      project: hub
-      chart: golang-external-secrets
-      chartVersion: 0.1.*
-
     rhods:
       name: rhods
       namespace: redhat-ods-operator
-      project: rhods
       path: charts/rhods
 
     inferencing-app:
       name: inferencing-app
       namespace: inferencing-app
-      project: inferencing-app
       path: charts/inferencing-app
 
     minio:
       name: minio-storage
       namespace: fraud-detection
-      project: fraud-detection
       path: charts/minio
 
     fraud-detection:
       name: fraud-detection
       namespace: fraud-detection
-      project: fraud-detection
       path: charts/fraud-detection
diff --git a/values-secret.yaml.template b/values-secret.yaml.template
deleted file mode 100644
index 751f784..0000000
--- a/values-secret.yaml.template
+++ /dev/null
@@ -1,3 +0,0 @@
-version: "2.0"
-
-secrets:

From d0b5019a750bfd0eacbae7c7a2c1948f6f0db09a Mon Sep 17 00:00:00 2001
From: Drew Minnear <dminnear@redhat.com>
Date: Fri, 21 Nov 2025 11:11:14 -0500
Subject: [PATCH 2/2] Train fraud model in mini-batches with early stopping; pin py3.12 deps

---
 .gitignore                                    |   1 +
 charts/rhods/templates/dsci.yaml              |  21 ++
 src/kubeflow-pipelines/environment.yaml       |  83 ++++++-
 src/kubeflow-pipelines/requirements.in        |   8 +
 src/kubeflow-pipelines/requirements.txt       | 215 ++++++++++++++++++
 .../small-model/train_upload_model.py         |  78 +++++--
 .../small-model/train_upload_model.yaml       | 112 +++++----
 7 files changed, 447 insertions(+), 71 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 charts/rhods/templates/dsci.yaml
 create mode 100644 src/kubeflow-pipelines/requirements.in
 create mode 100644 src/kubeflow-pipelines/requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/charts/rhods/templates/dsci.yaml b/charts/rhods/templates/dsci.yaml
new file mode 100644
index 0000000..a77d1d4
--- /dev/null
+++ b/charts/rhods/templates/dsci.yaml
@@ -0,0 +1,21 @@
+apiVersion: dscinitialization.opendatahub.io/v1
+kind: DSCInitialization
+metadata:
+  name: default-dsci
+spec:
+  applicationsNamespace: redhat-ods-applications
+  monitoring:
+    managementState: Managed
+    namespace: redhat-ods-monitoring
+  serviceMesh:
+    auth:
+      audiences:
+        - 'https://kubernetes.default.svc'
+    controlPlane:
+      metricsCollection: Istio
+      name: data-science-smcp
+      namespace: istio-system
+    managementState: Managed
+  trustedCABundle:
+    customCABundle: ''
+    managementState: Managed
diff --git a/src/kubeflow-pipelines/environment.yaml b/src/kubeflow-pipelines/environment.yaml
index 53f8bd1..5a82fa2 100644
--- a/src/kubeflow-pipelines/environment.yaml
+++ b/src/kubeflow-pipelines/environment.yaml
@@ -3,14 +3,79 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - python=3.11
-  - kfp
+  - python=3.12
+  - ipykernel
   - pip
+  - pip-tools
   - pip:
-    - kfp-kubernetes
-    - numpy==2.2.5
-    - pandas==2.2.3
-    - torch==2.6.0
-    - scikit-learn==1.6.1
-    - boto3==1.37.38
-    - botocore==1.37.38
+    - boto3==1.41.1
+    - botocore==1.41.1
+    - cachetools==6.2.2
+    - certifi==2025.11.12
+    - charset-normalizer==3.4.4
+    - click==8.1.8
+    - click-option-group==0.5.7
+    - docstring-parser==0.17.0
+    - filelock==3.20.0
+    - fsspec==2025.10.0
+    - google-api-core==2.28.1
+    - google-auth==2.43.0
+    - google-cloud-core==2.5.0
+    - google-cloud-storage==3.6.0
+    - google-crc32c==1.7.1
+    - google-resumable-media==2.8.0
+    - googleapis-common-protos==1.72.0
+    - idna==3.11
+    - jinja2==3.1.6
+    - jmespath==1.0.1
+    - joblib==1.5.2
+    - kfp==2.14.6
+    - kfp-kubernetes==2.14.6
+    - kfp-pipeline-spec==2.14.6
+    - kfp-server-api==2.14.6
+    - kubernetes==30.1.0
+    - markupsafe==3.0.3
+    - mpmath==1.3.0
+    - networkx==3.5
+    - numpy==2.3.5
+    - nvidia-cublas-cu12==12.8.4.1
+    - nvidia-cuda-cupti-cu12==12.8.90
+    - nvidia-cuda-nvrtc-cu12==12.8.93
+    - nvidia-cuda-runtime-cu12==12.8.90
+    - nvidia-cudnn-cu12==9.10.2.21
+    - nvidia-cufft-cu12==11.3.3.83
+    - nvidia-cufile-cu12==1.13.1.3
+    - nvidia-curand-cu12==10.3.9.90
+    - nvidia-cusolver-cu12==11.7.3.90
+    - nvidia-cusparse-cu12==12.5.8.93
+    - nvidia-cusparselt-cu12==0.7.1
+    - nvidia-nccl-cu12==2.27.5
+    - nvidia-nvjitlink-cu12==12.8.93
+    - nvidia-nvshmem-cu12==3.3.20
+    - nvidia-nvtx-cu12==12.8.90
+    - oauthlib==3.3.1
+    - pandas==2.3.3
+    - proto-plus==1.26.1
+    - protobuf==6.33.1
+    - pyasn1==0.6.1
+    - pyasn1-modules==0.4.2
+    - python-dateutil==2.9.0.post0
+    - pytz==2025.2
+    - pyyaml==6.0.3
+    - requests==2.32.5
+    - requests-oauthlib==2.0.0
+    - requests-toolbelt==1.0.0
+    - rsa==4.9.1
+    - s3transfer==0.15.0
+    - scikit-learn==1.7.2
+    - scipy==1.16.3
+    - six==1.17.0
+    - sympy==1.14.0
+    - tabulate==0.9.0
+    - threadpoolctl==3.6.0
+    - torch==2.9.1
+    - triton==3.5.1
+    - typing-extensions==4.15.0
+    - tzdata==2025.2
+    - urllib3==2.5.0
+    - websocket-client==1.9.0
diff --git a/src/kubeflow-pipelines/requirements.in b/src/kubeflow-pipelines/requirements.in
new file mode 100644
index 0000000..9f2f6be
--- /dev/null
+++ b/src/kubeflow-pipelines/requirements.in
@@ -0,0 +1,8 @@
+kfp
+kfp-kubernetes
+numpy
+pandas
+torch
+scikit-learn
+boto3
+botocore
diff --git a/src/kubeflow-pipelines/requirements.txt b/src/kubeflow-pipelines/requirements.txt
new file mode 100644
index 0000000..bfebfc0
--- /dev/null
+++ b/src/kubeflow-pipelines/requirements.txt
@@ -0,0 +1,215 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile
+#
+boto3==1.41.1
+    # via -r requirements.in
+botocore==1.41.1
+    # via
+    #   -r requirements.in
+    #   boto3
+    #   s3transfer
+cachetools==6.2.2
+    # via google-auth
+certifi==2025.11.12
+    # via
+    #   kfp-server-api
+    #   kubernetes
+    #   requests
+charset-normalizer==3.4.4
+    # via requests
+click==8.1.8
+    # via
+    #   click-option-group
+    #   kfp
+click-option-group==0.5.7
+    # via kfp
+docstring-parser==0.17.0
+    # via kfp
+filelock==3.20.0
+    # via torch
+fsspec==2025.10.0
+    # via torch
+google-api-core==2.28.1
+    # via
+    #   google-cloud-core
+    #   google-cloud-storage
+    #   kfp
+google-auth==2.43.0
+    # via
+    #   google-api-core
+    #   google-cloud-core
+    #   google-cloud-storage
+    #   kfp
+    #   kubernetes
+google-cloud-core==2.5.0
+    # via google-cloud-storage
+google-cloud-storage==3.6.0
+    # via kfp
+google-crc32c==1.7.1
+    # via
+    #   google-cloud-storage
+    #   google-resumable-media
+google-resumable-media==2.8.0
+    # via google-cloud-storage
+googleapis-common-protos==1.72.0
+    # via google-api-core
+idna==3.11
+    # via requests
+jinja2==3.1.6
+    # via torch
+jmespath==1.0.1
+    # via
+    #   boto3
+    #   botocore
+joblib==1.5.2
+    # via scikit-learn
+kfp==2.14.6
+    # via
+    #   -r requirements.in
+    #   kfp-kubernetes
+kfp-kubernetes==2.14.6
+    # via -r requirements.in
+kfp-pipeline-spec==2.14.6
+    # via kfp
+kfp-server-api==2.14.6
+    # via kfp
+kubernetes==30.1.0
+    # via kfp
+markupsafe==3.0.3
+    # via jinja2
+mpmath==1.3.0
+    # via sympy
+networkx==3.5
+    # via torch
+numpy==2.3.5
+    # via
+    #   -r requirements.in
+    #   pandas
+    #   scikit-learn
+    #   scipy
+nvidia-cublas-cu12==12.8.4.1
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.8.90
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.8.93
+    # via torch
+nvidia-cuda-runtime-cu12==12.8.90
+    # via torch
+nvidia-cudnn-cu12==9.10.2.21
+    # via torch
+nvidia-cufft-cu12==11.3.3.83
+    # via torch
+nvidia-cufile-cu12==1.13.1.3
+    # via torch
+nvidia-curand-cu12==10.3.9.90
+    # via torch
+nvidia-cusolver-cu12==11.7.3.90
+    # via torch
+nvidia-cusparse-cu12==12.5.8.93
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cusparselt-cu12==0.7.1
+    # via torch
+nvidia-nccl-cu12==2.27.5
+    # via torch
+nvidia-nvjitlink-cu12==12.8.93
+    # via
+    #   nvidia-cufft-cu12
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+    #   torch
+nvidia-nvshmem-cu12==3.3.20
+    # via torch
+nvidia-nvtx-cu12==12.8.90
+    # via torch
+oauthlib==3.3.1
+    # via
+    #   kubernetes
+    #   requests-oauthlib
+pandas==2.3.3
+    # via -r requirements.in
+proto-plus==1.26.1
+    # via google-api-core
+protobuf==6.33.1
+    # via
+    #   google-api-core
+    #   googleapis-common-protos
+    #   kfp
+    #   kfp-kubernetes
+    #   kfp-pipeline-spec
+    #   proto-plus
+pyasn1==0.6.1
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.4.2
+    # via google-auth
+python-dateutil==2.9.0.post0
+    # via
+    #   botocore
+    #   kfp-server-api
+    #   kubernetes
+    #   pandas
+pytz==2025.2
+    # via pandas
+pyyaml==6.0.3
+    # via
+    #   kfp
+    #   kubernetes
+requests==2.32.5
+    # via
+    #   google-api-core
+    #   google-cloud-storage
+    #   kubernetes
+    #   requests-oauthlib
+    #   requests-toolbelt
+requests-oauthlib==2.0.0
+    # via kubernetes
+requests-toolbelt==1.0.0
+    # via kfp
+rsa==4.9.1
+    # via google-auth
+s3transfer==0.15.0
+    # via boto3
+scikit-learn==1.7.2
+    # via -r requirements.in
+scipy==1.16.3
+    # via scikit-learn
+six==1.17.0
+    # via
+    #   kfp-server-api
+    #   kubernetes
+    #   python-dateutil
+sympy==1.14.0
+    # via torch
+tabulate==0.9.0
+    # via kfp
+threadpoolctl==3.6.0
+    # via scikit-learn
+torch==2.9.1
+    # via -r requirements.in
+triton==3.5.1
+    # via torch
+typing-extensions==4.15.0
+    # via torch
+tzdata==2025.2
+    # via pandas
+urllib3==2.5.0
+    # via
+    #   botocore
+    #   kfp
+    #   kfp-server-api
+    #   kubernetes
+    #   requests
+websocket-client==1.9.0
+    # via kubernetes
+
+# The following packages are considered to be unsafe in a requirements file:
+# setuptools
diff --git a/src/kubeflow-pipelines/small-model/train_upload_model.py b/src/kubeflow-pipelines/small-model/train_upload_model.py
index 460fb61..d26314b 100644
--- a/src/kubeflow-pipelines/small-model/train_upload_model.py
+++ b/src/kubeflow-pipelines/small-model/train_upload_model.py
@@ -5,7 +5,7 @@
 
 
 @dsl.component(
-    base_image="quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c"
+    base_image="quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554"
 )
 def get_data(
     train_data_output_path: OutputPath(), validate_data_output_path: OutputPath()
@@ -24,7 +24,7 @@ def get_data(
 
 
 @dsl.component(
-    base_image="quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c",
+    base_image="quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554",
 )
 def train_model(
     train_data_input_path: InputPath(),
@@ -40,6 +40,7 @@ def train_model(
     import torch.nn as nn
     from sklearn.preprocessing import StandardScaler
     from sklearn.utils import class_weight
+    from torch.utils.data import DataLoader, TensorDataset
 
     torch.set_default_dtype(torch.float32)
 
@@ -92,25 +93,53 @@ def forward(self, x):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model = FraudNetMedium(len(feature_cols)).to(device)
 
-    X_train_t = torch.tensor(X_train, device=device)
-    y_train_t = torch.tensor(y_train, device=device)
+    # Create data loaders for mini-batch training
+    train_dataset = TensorDataset(
+        torch.tensor(X_train, dtype=torch.float32),
+        torch.tensor(y_train, dtype=torch.float32),
+    )
+    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
+
     X_val_t = torch.tensor(X_val, device=device)
     y_val_t = torch.tensor(y_val, device=device)
 
-    sample_weights = (y_train_t * (pos_weight[0] - 1) + 1).flatten()
-    criterion = nn.BCELoss(weight=sample_weights)
-    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+    # Use weighted BCELoss without pre-computed sample weights
+    criterion = nn.BCELoss(reduction="none")
+    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)  # Lower learning rate
 
-    y_train_flat = y_train_t.flatten()
+    # Early stopping parameters
+    best_val_loss = float("inf")
+    best_model_state = None
+    patience = 5
+    patience_counter = 0
+    min_epochs = 10
+    max_epochs = 30
 
-    for epoch in range(3):
+    for epoch in range(max_epochs):
         model.train()
-        optimizer.zero_grad()
-        preds = model(X_train_t).flatten()
-        loss = criterion(preds, y_train_flat)
-        loss.backward()
-        optimizer.step()
+        train_loss = 0.0
+        num_batches = 0
+
+        for batch_X, batch_y in train_loader:
+            batch_X = batch_X.to(device)
+            batch_y = batch_y.to(device)
+
+            optimizer.zero_grad()
+            preds = model(batch_X).flatten()
+
+            # Apply class weights per sample
+            batch_weights = batch_y.flatten() * (pos_weight[0] - 1) + 1
+            loss = (criterion(preds, batch_y.flatten()) * batch_weights).mean()
+
+            loss.backward()
+            optimizer.step()
 
+            train_loss += loss.item()
+            num_batches += 1
+
+        train_loss /= num_batches
+
+        # Validation
         model.eval()
         with torch.no_grad():
             val_preds = model(X_val_t).flatten()
@@ -118,9 +147,26 @@ def forward(self, x):
             val_acc = ((val_preds > 0.5).float() == y_val_t.flatten()).float().mean()
 
         print(
-            f"Epoch {epoch + 1}: train loss {loss.item():.4f} | val loss {val_loss.item():.4f} | val acc {val_acc.item():.4f}"
+            f"Epoch {epoch + 1}: train loss {train_loss:.4f} | val loss {val_loss.item():.4f} | val acc {val_acc.item():.4f}"
         )
 
+        # Early stopping: clone tensors, since state_dict().copy() is shallow and aliases live weights
+        if val_loss < best_val_loss:
+            best_val_loss = val_loss
+            best_model_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
+            patience_counter = 0
+        else:
+            patience_counter += 1
+
+        if epoch >= min_epochs and patience_counter >= patience:
+            print(f"Early stopping at epoch {epoch + 1}")
+            break
+
+    # Restore best model
+    if best_model_state is not None:
+        model.load_state_dict(best_model_state)
+        print(f"Restored best model with val loss {best_val_loss:.4f}")
+
     dummy = torch.randn(1, len(feature_cols), dtype=torch.float32)
     torch.onnx.export(
         model.cpu(),
@@ -134,7 +180,7 @@ def forward(self, x):
 
 
 @dsl.component(
-    base_image="quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c",
+    base_image="quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554",
 )
 def upload_model(input_model_path: InputPath()):
     import os
diff --git a/src/kubeflow-pipelines/small-model/train_upload_model.yaml b/src/kubeflow-pipelines/small-model/train_upload_model.yaml
index 852233e..6547634 100644
--- a/src/kubeflow-pipelines/small-model/train_upload_model.yaml
+++ b/src/kubeflow-pipelines/small-model/train_upload_model.yaml
@@ -53,7 +53,7 @@ deploymentSpec:
         - -c
         - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
           \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.13.0'\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.6'\
           \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
           $0\" \"$@\"\n"
         - sh
@@ -75,7 +75,7 @@ deploymentSpec:
           \   url = \"https://raw.githubusercontent.com/rh-aiservices-bu/fraud-detection/main/data/validate.csv\"\
           \n    urllib.request.urlretrieve(url, validate_data_output_path)\n    print(\"\
           validation data downloaded\")\n\n"
-        image: quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c
+        image: quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554
     exec-train-model:
       container:
         args:
@@ -88,7 +88,7 @@ deploymentSpec:
         - -c
         - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
           \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.13.0'\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.6'\
           \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
           $0\" \"$@\"\n"
         - sh
@@ -106,50 +106,70 @@ deploymentSpec:
           \ InputPath(),\n    model_output_path: OutputPath(),\n):\n    import pickle\n\
           \    from pathlib import Path\n\n    import numpy as np\n    import pandas\
           \ as pd\n    import torch\n    import torch.nn as nn\n    from sklearn.preprocessing\
-          \ import StandardScaler\n    from sklearn.utils import class_weight\n\n\
-          \    torch.set_default_dtype(torch.float32)\n\n    feature_cols = list(range(7))\n\
-          \    label_col = 7\n\n    df_train = pd.read_csv(train_data_input_path)\n\
-          \    df_val = pd.read_csv(validate_data_input_path)\n\n    X_train = df_train.iloc[:,\
-          \ feature_cols].values\n    y_train = df_train.iloc[:, label_col].values.reshape(-1,\
-          \ 1)\n\n    X_val = df_val.iloc[:, feature_cols].values\n    y_val = df_val.iloc[:,\
-          \ label_col].values.reshape(-1, 1)\n\n    scaler = StandardScaler()\n  \
-          \  X_train = scaler.fit_transform(X_train).astype(\"float32\")\n    X_val\
-          \ = scaler.transform(X_val).astype(\"float32\")\n    y_train = y_train.astype(\"\
-          float32\")\n    y_val = y_val.astype(\"float32\")\n\n    Path(\"artifact\"\
-          ).mkdir(parents=True, exist_ok=True)\n    pickle.dump(scaler, open(\"artifact/scaler.pkl\"\
-          , \"wb\"))\n\n    cw = class_weight.compute_class_weight(\n        \"balanced\"\
-          , classes=np.unique(y_train), y=y_train.ravel()\n    )\n    pos_weight =\
-          \ torch.tensor([cw[1] / cw[0]], dtype=torch.float32)\n\n    class FraudNetMedium(nn.Module):\n\
-          \        def __init__(self, input_dim):\n            super().__init__()\n\
-          \            self.net = nn.Sequential(\n                nn.Linear(input_dim,\
-          \ 128),\n                nn.ReLU(),\n                nn.Dropout(0.2),\n\
-          \                nn.Linear(128, 128),\n                nn.ReLU(),\n    \
-          \            nn.Dropout(0.2),\n                nn.Linear(128, 64),\n   \
-          \             nn.ReLU(),\n                nn.Dropout(0.2),\n           \
-          \     nn.Linear(64, 1),\n                nn.Sigmoid(),\n            )\n\n\
-          \        def forward(self, x):\n            return self.net(x)\n\n    device\
-          \ = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n \
-          \   model = FraudNetMedium(len(feature_cols)).to(device)\n\n    X_train_t\
-          \ = torch.tensor(X_train, device=device)\n    y_train_t = torch.tensor(y_train,\
-          \ device=device)\n    X_val_t = torch.tensor(X_val, device=device)\n   \
-          \ y_val_t = torch.tensor(y_val, device=device)\n\n    sample_weights = (y_train_t\
-          \ * (pos_weight[0] - 1) + 1).flatten()\n    criterion = nn.BCELoss(weight=sample_weights)\n\
-          \    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n\n    y_train_flat\
-          \ = y_train_t.flatten()\n\n    for epoch in range(3):\n        model.train()\n\
-          \        optimizer.zero_grad()\n        preds = model(X_train_t).flatten()\n\
-          \        loss = criterion(preds, y_train_flat)\n        loss.backward()\n\
-          \        optimizer.step()\n\n        model.eval()\n        with torch.no_grad():\n\
+          \ import StandardScaler\n    from sklearn.utils import class_weight\n  \
+          \  from torch.utils.data import DataLoader, TensorDataset\n\n    torch.set_default_dtype(torch.float32)\n\
+          \n    feature_cols = list(range(7))\n    label_col = 7\n\n    df_train =\
+          \ pd.read_csv(train_data_input_path)\n    df_val = pd.read_csv(validate_data_input_path)\n\
+          \n    X_train = df_train.iloc[:, feature_cols].values\n    y_train = df_train.iloc[:,\
+          \ label_col].values.reshape(-1, 1)\n\n    X_val = df_val.iloc[:, feature_cols].values\n\
+          \    y_val = df_val.iloc[:, label_col].values.reshape(-1, 1)\n\n    scaler\
+          \ = StandardScaler()\n    X_train = scaler.fit_transform(X_train).astype(\"\
+          float32\")\n    X_val = scaler.transform(X_val).astype(\"float32\")\n  \
+          \  y_train = y_train.astype(\"float32\")\n    y_val = y_val.astype(\"float32\"\
+          )\n\n    Path(\"artifact\").mkdir(parents=True, exist_ok=True)\n    pickle.dump(scaler,\
+          \ open(\"artifact/scaler.pkl\", \"wb\"))\n\n    cw = class_weight.compute_class_weight(\n\
+          \        \"balanced\", classes=np.unique(y_train), y=y_train.ravel()\n \
+          \   )\n    pos_weight = torch.tensor([cw[1] / cw[0]], dtype=torch.float32)\n\
+          \n    class FraudNetMedium(nn.Module):\n        def __init__(self, input_dim):\n\
+          \            super().__init__()\n            self.net = nn.Sequential(\n\
+          \                nn.Linear(input_dim, 128),\n                nn.ReLU(),\n\
+          \                nn.Dropout(0.2),\n                nn.Linear(128, 128),\n\
+          \                nn.ReLU(),\n                nn.Dropout(0.2),\n        \
+          \        nn.Linear(128, 64),\n                nn.ReLU(),\n             \
+          \   nn.Dropout(0.2),\n                nn.Linear(64, 1),\n              \
+          \  nn.Sigmoid(),\n            )\n\n        def forward(self, x):\n     \
+          \       return self.net(x)\n\n    device = torch.device(\"cuda\" if torch.cuda.is_available()\
+          \ else \"cpu\")\n    model = FraudNetMedium(len(feature_cols)).to(device)\n\
+          \n    # Create data loaders for mini-batch training\n    train_dataset =\
+          \ TensorDataset(\n        torch.tensor(X_train, dtype=torch.float32),\n\
+          \        torch.tensor(y_train, dtype=torch.float32),\n    )\n    train_loader\
+          \ = DataLoader(train_dataset, batch_size=128, shuffle=True)\n\n    X_val_t\
+          \ = torch.tensor(X_val, device=device)\n    y_val_t = torch.tensor(y_val,\
+          \ device=device)\n\n    # Use weighted BCELoss without pre-computed sample\
+          \ weights\n    criterion = nn.BCELoss(reduction=\"none\")\n    optimizer\
+          \ = torch.optim.Adam(model.parameters(), lr=3e-4)  # Lower learning rate\n\
+          \n    # Early stopping parameters\n    best_val_loss = float(\"inf\")\n\
+          \    best_model_state = None\n    patience = 5\n    patience_counter = 0\n\
+          \    min_epochs = 10\n    max_epochs = 30\n\n    for epoch in range(max_epochs):\n\
+          \        model.train()\n        train_loss = 0.0\n        num_batches =\
+          \ 0\n\n        for batch_X, batch_y in train_loader:\n            batch_X\
+          \ = batch_X.to(device)\n            batch_y = batch_y.to(device)\n\n   \
+          \         optimizer.zero_grad()\n            preds = model(batch_X).flatten()\n\
+          \n            # Apply class weights per sample\n            batch_weights\
+          \ = batch_y.flatten() * (pos_weight[0] - 1) + 1\n            loss = (criterion(preds,\
+          \ batch_y.flatten()) * batch_weights).mean()\n\n            loss.backward()\n\
+          \            optimizer.step()\n\n            train_loss += loss.item()\n\
+          \            num_batches += 1\n\n        train_loss /= num_batches\n\n \
+          \       # Validation\n        model.eval()\n        with torch.no_grad():\n\
           \            val_preds = model(X_val_t).flatten()\n            val_loss\
           \ = nn.BCELoss()(val_preds, y_val_t.flatten())\n            val_acc = ((val_preds\
           \ > 0.5).float() == y_val_t.flatten()).float().mean()\n\n        print(\n\
-          \            f\"Epoch {epoch + 1}: train loss {loss.item():.4f} | val loss\
+          \            f\"Epoch {epoch + 1}: train loss {train_loss:.4f} | val loss\
           \ {val_loss.item():.4f} | val acc {val_acc.item():.4f}\"\n        )\n\n\
-          \    dummy = torch.randn(1, len(feature_cols), dtype=torch.float32)\n  \
-          \  torch.onnx.export(\n        model.cpu(),\n        dummy,\n        model_output_path,\n\
-          \        input_names=[\"dense_input\"],\n        output_names=[\"output\"\
-          ],\n        dynamic_axes={\"dense_input\": {0: \"batch\"}, \"output\": {0:\
-          \ \"batch\"}},\n        opset_version=13,\n    )\n\n"
-        image: quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c
+          \        # Early stopping: clone tensors, since state_dict().copy() is shallow and aliases live weights\n        if val_loss < best_val_loss:\n\
+          \            best_val_loss = val_loss\n            best_model_state = {k: v.detach().clone() for k, v in model.state_dict().items()}\n\
+          \            patience_counter = 0\n        else:\n            patience_counter\
+          \ += 1\n\n        if epoch >= min_epochs and patience_counter >= patience:\n\
+          \            print(f\"Early stopping at epoch {epoch + 1}\")\n         \
+          \   break\n\n    # Restore best model\n    if best_model_state is not None:\n\
+          \        model.load_state_dict(best_model_state)\n        print(f\"Restored\
+          \ best model with val loss {best_val_loss:.4f}\")\n\n    dummy = torch.randn(1,\
+          \ len(feature_cols), dtype=torch.float32)\n    torch.onnx.export(\n    \
+          \    model.cpu(),\n        dummy,\n        model_output_path,\n        input_names=[\"\
+          dense_input\"],\n        output_names=[\"output\"],\n        dynamic_axes={\"\
+          dense_input\": {0: \"batch\"}, \"output\": {0: \"batch\"}},\n        opset_version=13,\n\
+          \    )\n\n"
+        image: quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554
     exec-upload-model:
       container:
         args:
@@ -162,7 +182,7 @@ deploymentSpec:
         - -c
         - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
           \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.13.0'\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.6'\
           \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
           $0\" \"$@\"\n"
         - sh
@@ -192,7 +212,7 @@ deploymentSpec:
         env:
         - name: S3_KEY
           value: models/fraud/1/model.onnx
-        image: quay.io/modh/runtime-images:runtime-cuda-pytorch-ubi9-python-3.11-20250501-8e41d5c
+        image: quay.io/modh/odh-pipeline-runtime-pytorch-cuda-py312-ubi9:rhoai-2.24-e8b7177ca2b6226a29d3aab458db7776d8fb0554
 pipelineInfo:
   name: train-upload-model
 root:
@@ -240,7 +260,7 @@ root:
         taskInfo:
           name: upload-model
 schemaVersion: 2.1.0
-sdkVersion: kfp-2.13.0
+sdkVersion: kfp-2.14.6
 ---
 platforms:
   kubernetes: