Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 39 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,47 @@
# base → shared dependencies (no Spark provider)
# pyspark → local PySpark execution (DEFAULT)
# databricks→ remote Databricks Connect execution
# VARIANT=cpu (default) or VARIANT=gpu
#
# Usage:
# docker build . # default: pyspark
# docker build --target databricks . # databricks-connect
# docker build --build-arg VARIANT=gpu . # GPU-enabled PySpark image (amd64 only)

# ============================================
# Stage: base selection (cpu/gpu)
# ============================================
# These ARGs are declared before the first FROM so they can parameterize the
# FROM lines themselves (e.g. `FROM base-${VARIANT}`). Note: ARGs declared
# here are NOT visible inside stages unless re-declared after each FROM.
ARG PYTHON_VERSION=3.12
ARG VARIANT=cpu

# CPU base: slim Debian (bookworm) image with the pinned Python preinstalled.
FROM python:${PYTHON_VERSION}-slim-bookworm AS base-cpu

# GPU base: NVIDIA CUDA runtime on Ubuntu 24.04 (amd64 only — see Usage above).
FROM nvidia/cuda:12.9.0-runtime-ubuntu24.04 AS base-gpu
# Re-declare ARG: values from before the first FROM do not cross stage boundaries.
ARG PYTHON_VERSION=3.12
# Install the pinned Python from the deadsnakes PPA so the GPU image matches the
# CPU image's interpreter version, point python3/python at it, then remove the
# EXTERNALLY-MANAGED marker (PEP 668) so plain `pip install` works system-wide.
# NOTE(review): assumes deadsnakes places the marker under
# /usr/lib/python${PYTHON_VERSION}/ — confirm if the base image is bumped.
RUN apt-get update && apt-get install -y --no-install-recommends \
software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update && apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-dev \
python3-pip \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 \
&& ln -sf /usr/bin/python3 /usr/bin/python \
&& rm -rf /var/lib/apt/lists/* \
&& rm -f /usr/lib/python${PYTHON_VERSION}/EXTERNALLY-MANAGED

# ============================================
# Stage: base (shared across all variants)
# ============================================
FROM base-${VARIANT} AS base
ARG TARGETARCH
ARG VARIANT=cpu
ARG PYTHON_VERSION=3.12
FROM python:${PYTHON_VERSION}-slim-bookworm AS base

# System dependencies
WORKDIR /code

# System dependencies
RUN apt-get update && apt-get install -y \
build-essential \
gcc \
Expand All @@ -29,14 +56,17 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Python tooling
RUN pip install --no-cache-dir --upgrade pip && \
RUN rm -rf /usr/lib/python3/dist-packages/*.dist-info 2>/dev/null; \
pip install --no-cache-dir pip && \
pip install --no-cache-dir poetry && \
poetry config virtualenvs.create false

# Install large stable dependencies before poetry to maximize build cache reuse.
# INSTALL_PYTORCH controls whether CPU-only PyTorch is installed.
# INSTALL_PYTORCH controls whether PyTorch is installed.
ARG INSTALL_PYTORCH="true"
RUN if [ "$INSTALL_PYTORCH" = "true" ]; then \
RUN if [ "$VARIANT" = "gpu" ] && [ "$INSTALL_PYTORCH" = "true" ]; then \
pip install --no-cache-dir torch==2.7.1; \
elif [ "$INSTALL_PYTORCH" = "true" ]; then \
pip install --no-cache-dir torch==2.7.1 \
--index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pypi.org/simple; \
Expand Down Expand Up @@ -101,6 +131,10 @@ RUN mkdir -p /opt/spark-jars && \

# Spark configuration for local mode
ARG PYTHON_VERSION=3.12
# GPU variant (Ubuntu) may install to dist-packages. Symlink ensures stable SPARK_HOME.
RUN mkdir -p /usr/local/lib/python${PYTHON_VERSION}/site-packages && \
ln -sf $(python3 -c "import pyspark; print(pyspark.__path__[0])") \
/usr/local/lib/python${PYTHON_VERSION}/site-packages/pyspark 2>/dev/null || true
ENV SPARK_HOME="/usr/local/lib/python${PYTHON_VERSION}/site-packages/pyspark"
ENV PYSPARK_PYTHON="python3"
ENV PYSPARK_DRIVER_PYTHON="python3"
Expand Down
22 changes: 18 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ help:
@echo ""
@echo "🏗️ Building:"
@echo " make build Build default image (PySpark)"
@echo " make build-gpu Build GPU variant (CUDA + GPU PyTorch, amd64)"
@echo " make build-databricks Build Databricks variant"
@echo ""
@echo "🧹 Cleanup:"
Expand All @@ -70,21 +71,23 @@ help:
.PHONY: test-integration
test-integration:
@echo "🧪 Running staged pytest integration suite..."
@echo "Using DATALOADER_WORKERS=$${DATALOADER_WORKERS:-0}"
@if [ -n "$(INTEGRATION_RUN_ID)" ]; then \
echo "Using integration run id: $(INTEGRATION_RUN_ID)"; \
PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" bash scripts/tests/run_integration_staged.sh; \
DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" bash scripts/tests/run_integration_staged.sh; \
else \
bash scripts/tests/run_integration_staged.sh; \
DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" bash scripts/tests/run_integration_staged.sh; \
fi

.PHONY: test-integration-verbose
test-integration-verbose:
@echo "🧪 Running staged pytest integration suite (verbose)..."
@echo "Using DATALOADER_WORKERS=$${DATALOADER_WORKERS:-0}"
@if [ -n "$(INTEGRATION_RUN_ID)" ]; then \
echo "Using integration run id: $(INTEGRATION_RUN_ID)"; \
PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
else \
PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
fi

# Fast sanity check - 1 iteration, minimal config
Expand Down Expand Up @@ -368,6 +371,17 @@ build:
-f Dockerfile .
@echo "✅ Build complete: plexe:py$(PYTHON_VERSION)"

# Build GPU variant (NVIDIA CUDA + CUDA-enabled PyTorch, amd64 only)
# NOTE(review): --platform linux/amd64 is forced because the CUDA base image is
# amd64-only; --output type=docker with --provenance=false keeps buildx emitting
# a plain image loadable by the local Docker engine — confirm against the
# project's buildx version if flags change.
.PHONY: build-gpu
build-gpu:
@echo "🏗️ Building GPU variant (Python $(PYTHON_VERSION), CUDA)..."
docker buildx build --platform linux/amd64 --output type=docker --provenance=false \
--build-arg PYTHON_VERSION=$(PYTHON_VERSION) \
--build-arg VARIANT=gpu \
-t plexe:py$(PYTHON_VERSION)-gpu \
-f Dockerfile .
@echo "✅ Build complete: plexe:py$(PYTHON_VERSION)-gpu"


# Build Databricks variant
.PHONY: build-databricks
Expand Down
4 changes: 2 additions & 2 deletions config.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@

# Default epochs for neural network training (Keras, PyTorch)
# Type: integer
# Default: 25
# nn_default_epochs: 25
# Default: 10
# nn_default_epochs: 10

# Maximum epochs for neural network training (Keras, PyTorch)
# Type: integer
Expand Down
42 changes: 31 additions & 11 deletions plexe/CODE_INDEX.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Code Index: plexe

> Generated on 2026-03-02 19:57:53
> Generated on 2026-03-02 22:03:39

Code structure and public interface documentation for the **plexe** package.

Expand Down Expand Up @@ -207,14 +207,14 @@ Local process runner - executes training in subprocess.

**`LocalProcessRunner`** - Runs training in local subprocess.
- `__init__(self, work_dir: str)`
- `run_training(self, template: str, model: Any, feature_pipeline: Pipeline, train_uri: str, val_uri: str, timeout: int, target_columns: list[str], optimizer: Any, loss: Any, epochs: int, batch_size: int, group_column: str | None) -> Path` - Execute training in subprocess.
- `run_training(self, template: str, model: Any, feature_pipeline: Pipeline, train_uri: str, val_uri: str, timeout: int, target_columns: list[str], task_type: str, optimizer: Any, loss: Any, epochs: int, batch_size: int, group_column: str | None, mixed_precision: bool, dataloader_workers: int) -> Path` - Execute training in subprocess.

---
## `execution/training/runner.py`
Training runner abstract base class.

**`TrainingRunner`** - Abstract base class for training execution environments.
- `run_training(self, template: str, model: Any, feature_pipeline: Pipeline, train_uri: str, val_uri: str, timeout: int, target_columns: list[str]) -> Path` - Execute model training and return path to artifacts.
- `run_training(self, template: str, model: Any, feature_pipeline: Pipeline, train_uri: str, val_uri: str, timeout: int, target_columns: list[str], task_type: str) -> Path` - Execute model training and return path to artifacts.

---
## `helpers.py`
Expand Down Expand Up @@ -312,6 +312,9 @@ Simple dataclasses for model building workflow.

**`DataLayout`** - Physical structure of dataset (not semantic meaning).

**`TaskType`** - Canonical ML task type determined during Phase 1.
- `is_classification(self) -> bool` - No description

**`Metric`** - Evaluation metric definition.

**`BuildContext`** - Context passed through workflow phases.
Expand Down Expand Up @@ -452,6 +455,7 @@ Standard Keras predictor - NO Plexe dependencies.
**`KerasPredictor`** - Standalone Keras predictor.
- `__init__(self, model_dir: str)`
- `predict(self, x: pd.DataFrame) -> pd.DataFrame` - Make predictions on input DataFrame.
- `predict_proba(self, x: pd.DataFrame) -> pd.DataFrame` - Predict per-class probabilities on input DataFrame.

---
## `templates/inference/lightgbm_predictor.py`
Expand All @@ -468,6 +472,7 @@ Standard PyTorch predictor - NO Plexe dependencies.
**`PyTorchPredictor`** - Standalone PyTorch predictor.
- `__init__(self, model_dir: str)`
- `predict(self, x: pd.DataFrame) -> pd.DataFrame` - Make predictions on input DataFrame.
- `predict_proba(self, x: pd.DataFrame) -> pd.DataFrame` - Predict per-class probabilities on input DataFrame.

---
## `templates/inference/xgboost_predictor.py`
Expand All @@ -489,37 +494,37 @@ Model card template generator.
Hardcoded robust CatBoost training loop.

**Functions:**
- `train_catboost(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str) -> dict` - Train CatBoost model directly (no Spark).
- `train_catboost(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, task_type: str | None) -> dict` - Train CatBoost model directly (no Spark).
- `main()` - No description

---
## `templates/training/train_keras.py`
Hardcoded robust Keras training loop.
Keras training template with streaming data loading, multi-GPU (MirroredStrategy), and mixed precision.

**Functions:**
- `train_keras(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, epochs: int, batch_size: int) -> dict` - Train Keras model directly.
- `train_keras(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, epochs: int, batch_size: int, use_multi_gpu: bool, use_mixed_precision: bool, task_type: str | None) -> dict` - Train Keras model with streaming data, optional multi-GPU, and mixed precision.

---
## `templates/training/train_lightgbm.py`
Hardcoded robust LightGBM training loop.

**Functions:**
- `train_lightgbm(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, group_column: str | None) -> dict` - Train LightGBM model directly (no Spark).
- `train_lightgbm(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, group_column: str | None, task_type: str | None) -> dict` - Train LightGBM model directly (no Spark).
- `main()` - No description

---
## `templates/training/train_pytorch.py`
Hardcoded robust PyTorch training loop.
PyTorch training template with streaming data loading, multi-GPU (DDP), and mixed precision.

**Functions:**
- `train_pytorch(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, epochs: int, batch_size: int) -> dict` - Train PyTorch model directly.
- `train_pytorch(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, epochs: int, batch_size: int, num_workers: int, use_ddp: bool, use_mixed_precision: bool, task_type: str | None) -> dict` - Train PyTorch model with streaming data, optional DDP, and mixed precision.

---
## `templates/training/train_xgboost.py`
Hardcoded robust XGBoost training loop.

**Functions:**
- `train_xgboost(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, group_column: str | None) -> dict` - Train XGBoost model directly (no Spark).
- `train_xgboost(untrained_model_path: Path, train_uri: str, val_uri: str, output_dir: Path, target_column: str, group_column: str | None, task_type: str | None) -> dict` - Train XGBoost model directly (no Spark).
- `main()` - No description

---
Expand Down Expand Up @@ -624,7 +629,7 @@ Utility functions for dashboard data loading.
- `load_report(exp_path: Path, report_name: str) -> dict | None` - Load YAML report from DirNames.BUILD_DIR/reports/.
- `load_code_file(file_path: Path) -> str | None` - Load Python code file.
- `load_parquet_sample(uri: str, limit: int) -> pd.DataFrame | None` - Load first N rows from parquet file.
- `get_parquet_row_count(uri: str) -> int | None` - Get row count from parquet file.
- `get_parquet_row_count(uri: str) -> int | None` - Get row count from parquet metadata without reading data.
- `load_json_file(file_path: Path) -> dict | None` - Load JSON file.

---
Expand All @@ -636,6 +641,21 @@ LiteLLM model wrapper with retry logic and optional post-call hook.
- `generate(self)` - Generate with automatic retries, header injection, and post-call hook.
- `chat(self)` - Chat with automatic retries, header injection, and post-call hook.

---
## `utils/parquet_dataset.py`
Streaming parquet data loading utilities for large-dataset training.

**`ParquetIterableDataset`** - Streaming parquet dataset for PyTorch DataLoader.
- `__init__(self, uri: str, target_column: str, task_type: str)`
- `total_rows(self) -> int` - No description

**Functions:**
- `get_parquet_row_count(uri: str) -> int` - Get total row count from parquet metadata without reading data.
- `get_dataset_size_bytes(uri: str) -> int` - Get dataset size in bytes for a local file or directory of parquet files.
- `parquet_batch_generator(uri: str, target_column: str, batch_size: int, task_type: str | None) -> Iterator[tuple[np.ndarray, np.ndarray]]` - Streaming parquet batch generator for Keras/TensorFlow.
- `get_parquet_feature_count(uri: str, target_column: str) -> int` - Get number of feature columns (total columns minus target).
- `get_steps_per_epoch(uri: str, batch_size: int) -> int` - Compute number of steps per epoch for a parquet dataset.

---
## `utils/reporting.py`
Utilities for saving agent reports to disk.
Expand Down
11 changes: 10 additions & 1 deletion plexe/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,12 +309,21 @@ class Config(BaseSettings):
# Training settings
training_timeout: int = Field(default=1800, description="Timeout for training runs (seconds)", gt=0)
nn_default_epochs: int = Field(
default=25, description="Default epochs for neural network training (Keras, PyTorch)"
default=10, description="Default epochs for neural network training (Keras, PyTorch)"
)
nn_max_epochs: int = Field(default=50, description="Maximum epochs for neural network training (Keras, PyTorch)")
nn_default_batch_size: int = Field(
default=32, description="Default batch size for neural network training (Keras, PyTorch)"
)
nn_training_timeout: int = Field(
default=14400, description="Timeout for neural network training on full dataset (seconds)", gt=0
)
mixed_precision: bool = Field(
default=True, description="Use mixed precision (FP16) when GPU available (auto-disabled on CPU)"
)
dataloader_workers: int = Field(
default=4, description="Number of DataLoader worker processes for streaming data loading", ge=0
)

# LLM settings (per agent role)
statistical_analysis_llm: str = Field(
Expand Down
Loading