Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 39 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,47 @@
# base → shared dependencies (no Spark provider)
# pyspark → local PySpark execution (DEFAULT)
# databricks→ remote Databricks Connect execution
# VARIANT=cpu (default) or VARIANT=gpu
#
# Usage:
# docker build . # default: pyspark
# docker build --target databricks . # databricks-connect
# docker build --build-arg VARIANT=gpu . # GPU-enabled PySpark image (amd64 only)

# ============================================
# Stage: base selection (cpu/gpu)
# ============================================
# These ARGs are declared before the first FROM so they can parameterize the
# FROM lines themselves (e.g. `FROM base-${VARIANT}`). Note: ARGs declared
# here are NOT visible inside stages unless re-declared after each FROM.
ARG PYTHON_VERSION=3.12
ARG VARIANT=cpu

# CPU base: slim Debian (bookworm) image with the pinned Python preinstalled.
FROM python:${PYTHON_VERSION}-slim-bookworm AS base-cpu

# GPU base: NVIDIA CUDA runtime on Ubuntu 24.04 (amd64 only — see Usage above).
FROM nvidia/cuda:12.9.0-runtime-ubuntu24.04 AS base-gpu
# Re-declare ARG: values from before the first FROM do not cross stage boundaries.
ARG PYTHON_VERSION=3.12
# Install the pinned Python from the deadsnakes PPA so the GPU image matches the
# CPU image's interpreter version, point python3/python at it, then remove the
# EXTERNALLY-MANAGED marker (PEP 668) so plain `pip install` works system-wide.
# NOTE(review): assumes deadsnakes places the marker under
# /usr/lib/python${PYTHON_VERSION}/ — confirm if the base image is bumped.
RUN apt-get update && apt-get install -y --no-install-recommends \
software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update && apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-dev \
python3-pip \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 \
&& ln -sf /usr/bin/python3 /usr/bin/python \
&& rm -rf /var/lib/apt/lists/* \
&& rm -f /usr/lib/python${PYTHON_VERSION}/EXTERNALLY-MANAGED

# ============================================
# Stage: base (shared across all variants)
# ============================================
FROM base-${VARIANT} AS base
ARG TARGETARCH
ARG VARIANT=cpu
ARG PYTHON_VERSION=3.12
FROM python:${PYTHON_VERSION}-slim-bookworm AS base

# System dependencies
WORKDIR /code

# System dependencies
RUN apt-get update && apt-get install -y \
build-essential \
gcc \
Expand All @@ -29,14 +56,17 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Python tooling
RUN pip install --no-cache-dir --upgrade pip && \
RUN rm -rf /usr/lib/python3/dist-packages/*.dist-info 2>/dev/null; \
pip install --no-cache-dir pip && \
pip install --no-cache-dir poetry && \
poetry config virtualenvs.create false

# Install large stable dependencies before poetry to maximize build cache reuse.
# INSTALL_PYTORCH controls whether CPU-only PyTorch is installed.
# INSTALL_PYTORCH controls whether PyTorch is installed.
ARG INSTALL_PYTORCH="true"
RUN if [ "$INSTALL_PYTORCH" = "true" ]; then \
RUN if [ "$VARIANT" = "gpu" ] && [ "$INSTALL_PYTORCH" = "true" ]; then \
pip install --no-cache-dir torch==2.7.1; \
elif [ "$INSTALL_PYTORCH" = "true" ]; then \
pip install --no-cache-dir torch==2.7.1 \
--index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pypi.org/simple; \
Expand Down Expand Up @@ -101,6 +131,10 @@ RUN mkdir -p /opt/spark-jars && \

# Spark configuration for local mode
ARG PYTHON_VERSION=3.12
# GPU variant (Ubuntu) may install to dist-packages. Symlink ensures stable SPARK_HOME.
RUN mkdir -p /usr/local/lib/python${PYTHON_VERSION}/site-packages && \
ln -sf $(python3 -c "import pyspark; print(pyspark.__path__[0])") \
/usr/local/lib/python${PYTHON_VERSION}/site-packages/pyspark 2>/dev/null || true
ENV SPARK_HOME="/usr/local/lib/python${PYTHON_VERSION}/site-packages/pyspark"
ENV PYSPARK_PYTHON="python3"
ENV PYSPARK_DRIVER_PYTHON="python3"
Expand Down
22 changes: 18 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ help:
@echo ""
@echo "🏗️ Building:"
@echo " make build Build default image (PySpark)"
@echo " make build-gpu Build GPU variant (CUDA + GPU PyTorch, amd64)"
@echo " make build-databricks Build Databricks variant"
@echo ""
@echo "🧹 Cleanup:"
Expand All @@ -70,21 +71,23 @@ help:
.PHONY: test-integration
test-integration:
@echo "🧪 Running staged pytest integration suite..."
@echo "Using DATALOADER_WORKERS=$${DATALOADER_WORKERS:-0}"
@if [ -n "$(INTEGRATION_RUN_ID)" ]; then \
echo "Using integration run id: $(INTEGRATION_RUN_ID)"; \
PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" bash scripts/tests/run_integration_staged.sh; \
DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" bash scripts/tests/run_integration_staged.sh; \
else \
bash scripts/tests/run_integration_staged.sh; \
DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" bash scripts/tests/run_integration_staged.sh; \
fi

.PHONY: test-integration-verbose
test-integration-verbose:
@echo "🧪 Running staged pytest integration suite (verbose)..."
@echo "Using DATALOADER_WORKERS=$${DATALOADER_WORKERS:-0}"
@if [ -n "$(INTEGRATION_RUN_ID)" ]; then \
echo "Using integration run id: $(INTEGRATION_RUN_ID)"; \
PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
else \
PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
fi

# Fast sanity check - 1 iteration, minimal config
Expand Down Expand Up @@ -368,6 +371,17 @@ build:
-f Dockerfile .
@echo "✅ Build complete: plexe:py$(PYTHON_VERSION)"

# Build GPU variant (NVIDIA CUDA + CUDA-enabled PyTorch, amd64 only)
# NOTE(review): --platform linux/amd64 is forced because the CUDA base image is
# amd64-only; --output type=docker with --provenance=false keeps buildx emitting
# a plain image loadable by the local Docker engine — confirm against the
# project's buildx version if flags change.
.PHONY: build-gpu
build-gpu:
@echo "🏗️ Building GPU variant (Python $(PYTHON_VERSION), CUDA)..."
docker buildx build --platform linux/amd64 --output type=docker --provenance=false \
--build-arg PYTHON_VERSION=$(PYTHON_VERSION) \
--build-arg VARIANT=gpu \
-t plexe:py$(PYTHON_VERSION)-gpu \
-f Dockerfile .
@echo "✅ Build complete: plexe:py$(PYTHON_VERSION)-gpu"


# Build Databricks variant
.PHONY: build-databricks
Expand Down
4 changes: 2 additions & 2 deletions config.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@

# Default epochs for neural network training (Keras, PyTorch)
# Type: integer
# Default: 25
# nn_default_epochs: 25
# Default: 10
# nn_default_epochs: 10

# Maximum epochs for neural network training (Keras, PyTorch)
# Type: integer
Expand Down
42 changes: 31 additions & 11 deletions plexe/CODE_INDEX.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Code Index: plexe

> Generated on 2026-03-02 19:57:53
> Generated on 2026-03-02 22:03:39

Code structure and public interface documentation for the **plexe** package.

Expand Down Expand Up @@ -207,14 +207,14 @@ Local process runner - executes training in subprocess.

**`LocalProcessRunner`** - Runs training in local subprocess.
- `__init__(self, work_dir: str)`
- `run_training(self, template: str, model: Any, feature_pipeline: Pipeline, train_uri: str, val_uri: str, timeout: int, target_columns: list[str], optimizer: Any, loss: Any, epochs: int, batch_size: int, group_column: str | None) -> Path` - Execute training in subprocess.
- `run_training(self, template: str, model: Any, feature_pipeline: Pipeline, train_uri: str, val_uri: str, timeout: int, target_columns: list[str], task_type: str, optimizer: Any, loss: Any, epochs: int, batch_size: int, group_column: str | None, mixed_precision: bool, dataloader_workers: int) -> Path` - Execute training in subprocess.

---
## `execution/training/runner.py`
Training runner abstract base class.

**`TrainingRunner`** - Abstract base class for training execution environments.
- `run_training(self, template: str, model: Any, feature_pipeline: Pipeline, train_uri: str, val_uri: str, timeout: int, target_columns: list[str]) -> Path` - Execute model training and return path to artifacts.
- `run_training(self, template: str, model: Any, feature_pipeline: Pipeline, train_uri: str, val_uri: str, timeout: int, target_columns: list[str], task_type: str) -> Path` - Execute model training and return path to artifacts.

---
## `helpers.py`
Expand Down Expand Up @@ -312,6 +312,9 @@ Simple dataclasses for model building workflow.

**`DataLayout`** - Physical structure of dataset (not semantic meaning).

**`TaskType`** - Canonical ML task type determined during Phase 1.
- `is_classification(self) -> bool` - No description

**`Metric`** - Evaluation metric definition.

**`BuildContext`** - Context passed through workflow phases.
Expand Down Expand Up @@ -452,6 +455,7 @@ Standard Keras predictor - NO Plexe dependencies.
**`KerasPredictor`** - Standalone Keras predictor.
- `__init__(self, model_dir: str)`
- `predict(self, x: pd.DataFrame) -> pd.DataFrame` - Make predictions on input DataFrame.
- `predict_proba(self, x: pd.DataFrame) -> pd.DataFrame` - Predict per-class probabilities on input DataFrame.

---
## `templates/inference/lightgbm_predictor.py`
Expand All @@ -468,6 +472,7 @@ Standard PyTorch predictor - NO Plexe dependencies.
**`PyTorchPredictor`** - Standalone PyTorch predictor.
- `__init__(self, model_dir: str)`
- `predict(self, x: pd.DataFrame) -> pd.DataFrame` - Make predictions on input DataFrame.
- `predict_proba(self, x: pd.DataFrame) -> pd.DataFrame` - Predict per-class probabilities on input DataFrame.

---
## `templates/inference/xgboost_predictor.py`
Expand All @@ -489,37 +494,37 @@ Model card template generator.
Hardcoded robust CatBoost training loop.

**Functions:**
- `train_catboost(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str) -> dict` - Train CatBoost model directly (no Spark).
- `train_catboost(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, task_type: str | None) -> dict` - Train CatBoost model directly (no Spark).
- `main()` - No description

---
## `templates/training/train_keras.py`
Hardcoded robust Keras training loop.
Keras training template with streaming data loading, multi-GPU (MirroredStrategy), and mixed precision.

**Functions:**
- `train_keras(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, epochs: int, batch_size: int) -> dict` - Train Keras model directly.
- `train_keras(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, epochs: int, batch_size: int, use_multi_gpu: bool, use_mixed_precision: bool, task_type: str | None) -> dict` - Train Keras model with streaming data, optional multi-GPU, and mixed precision.

---
## `templates/training/train_lightgbm.py`
Hardcoded robust LightGBM training loop.

**Functions:**
- `train_lightgbm(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, group_column: str | None) -> dict` - Train LightGBM model directly (no Spark).
- `train_lightgbm(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, group_column: str | None, task_type: str | None) -> dict` - Train LightGBM model directly (no Spark).
- `main()` - No description

---
## `templates/training/train_pytorch.py`
Hardcoded robust PyTorch training loop.
PyTorch training template with streaming data loading, multi-GPU (DDP), and mixed precision.

**Functions:**
- `train_pytorch(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, epochs: int, batch_size: int) -> dict` - Train PyTorch model directly.
- `train_pytorch(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, epochs: int, batch_size: int, num_workers: int, use_ddp: bool, use_mixed_precision: bool, task_type: str | None) -> dict` - Train PyTorch model with streaming data, optional DDP, and mixed precision.

---
## `templates/training/train_xgboost.py`
Hardcoded robust XGBoost training loop.

**Functions:**
- `train_xgboost(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, group_column: str | None) -> dict` - Train XGBoost model directly (no Spark).
- `train_xgboost(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, group_column: str | None, task_type: str | None) -> dict` - Train XGBoost model directly (no Spark).
- `main()` - No description

---
Expand Down Expand Up @@ -624,7 +629,7 @@ Utility functions for dashboard data loading.
- `load_report(exp_path: Path, report_name: str) -> dict | None` - Load YAML report from DirNames.BUILD_DIR/reports/.
- `load_code_file(file_path: Path) -> str | None` - Load Python code file.
- `load_parquet_sample(uri: str, limit: int) -> pd.DataFrame | None` - Load first N rows from parquet file.
- `get_parquet_row_count(uri: str) -> int | None` - Get row count from parquet file.
- `get_parquet_row_count(uri: str) -> int | None` - Get row count from parquet metadata without reading data.
- `load_json_file(file_path: Path) -> dict | None` - Load JSON file.

---
Expand All @@ -636,6 +641,21 @@ LiteLLM model wrapper with retry logic and optional post-call hook.
- `generate(self)` - Generate with automatic retries, header injection, and post-call hook.
- `chat(self)` - Chat with automatic retries, header injection, and post-call hook.

---
## `utils/parquet_dataset.py`
Streaming parquet data loading utilities for large-dataset training.

**`ParquetIterableDataset`** - Streaming parquet dataset for PyTorch DataLoader.
- `__init__(self, uri: str, target_column: str, task_type: str)`
- `total_rows(self) -> int` - No description

**Functions:**
- `get_parquet_row_count(uri: str) -> int` - Get total row count from parquet metadata without reading data.
- `get_dataset_size_bytes(uri: str) -> int` - Get dataset size in bytes for a local file or directory of parquet files.
- `parquet_batch_generator(uri: str, target_column: str, batch_size: int, task_type: str | None) -> Iterator[tuple[np.ndarray, np.ndarray]]` - Streaming parquet batch generator for Keras/TensorFlow.
- `get_parquet_feature_count(uri: str, target_column: str) -> int` - Get number of feature columns (total columns minus target).
- `get_steps_per_epoch(uri: str, batch_size: int) -> int` - Compute number of steps per epoch for a parquet dataset.

---
## `utils/reporting.py`
Utilities for saving agent reports to disk.
Expand Down
11 changes: 10 additions & 1 deletion plexe/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,12 +309,21 @@ class Config(BaseSettings):
# Training settings
training_timeout: int = Field(default=1800, description="Timeout for training runs (seconds)", gt=0)
nn_default_epochs: int = Field(
default=25, description="Default epochs for neural network training (Keras, PyTorch)"
default=10, description="Default epochs for neural network training (Keras, PyTorch)"
)
nn_max_epochs: int = Field(default=50, description="Maximum epochs for neural network training (Keras, PyTorch)")
nn_default_batch_size: int = Field(
default=32, description="Default batch size for neural network training (Keras, PyTorch)"
)
nn_training_timeout: int = Field(
default=14400, description="Timeout for neural network training on full dataset (seconds)", gt=0
)
mixed_precision: bool = Field(
default=True, description="Use mixed precision (FP16) when GPU available (auto-disabled on CPU)"
)
dataloader_workers: int = Field(
default=4, description="Number of DataLoader worker processes for streaming data loading", ge=0
)

# LLM settings (per agent role)
statistical_analysis_llm: str = Field(
Expand Down
Loading