
Commit df26414

Idate96 committed
Fix container extraction path - use project partition for extracted containers
1 parent b9e0a91 commit df26414

File tree

5 files changed (+337, -3 lines)


docs/container-workflow.md

Lines changed: 4 additions & 3 deletions
````diff
@@ -316,11 +316,12 @@ lquota
 For frequently used containers, consider keeping extracted versions:
 
 ```bash
-# One-time extraction
-tar -xzf container.tar.gz -C /cluster/work/rsl/$USER/containers/extracted/
+# One-time extraction (use project for extracted containers)
+tar -xzf /cluster/work/rsl/$USER/containers/container.tar.gz \
+  -C /cluster/project/rsl/$USER/containers/extracted/
 
 # In job script, just copy
-cp -r /cluster/work/rsl/$USER/containers/extracted/my-app.sif $TMPDIR/
+cp -r /cluster/project/rsl/$USER/containers/extracted/my-app.sif $TMPDIR/
 ```
 
 ### Automated Workflows
````
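For reference, a job script that consumes the pre-extracted image could look like the sketch below. It only illustrates the documented pattern and is not part of the commit; the image name my-app.sif, the bind mount, and the command run inside the container are placeholders.

```bash
#!/bin/bash
#SBATCH --job-name=run-extracted-container
#SBATCH --time=01:00:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --mem-per-cpu=2G
#SBATCH --tmp=20G

# Copy the pre-extracted image from the project partition to local scratch
cp -r /cluster/project/rsl/$USER/containers/extracted/my-app.sif $TMPDIR/

# Run the workload; the bind mount and inner command are placeholders
singularity exec --bind /cluster/work/rsl/$USER:/data $TMPDIR/my-app.sif \
    python /app/train.py --data-dir /data
```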

fake_train.py

Lines changed: 147 additions & 0 deletions
```python
#!/usr/bin/env python3
"""Fake training script to test Euler workflows."""

import argparse
import time
import os
import json
import random
import numpy as np

def print_gpu_info():
    """Print GPU information if available."""
    try:
        import torch
        if torch.cuda.is_available():
            print(f"PyTorch CUDA available: True")
            print(f"GPU count: {torch.cuda.device_count()}")
            print(f"GPU name: {torch.cuda.get_device_name(0)}")
            print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        else:
            print("No GPU detected, using CPU")
    except ImportError:
        print("PyTorch not installed, skipping GPU check")

def simulate_epoch(epoch, total_epochs, batch_size, lr):
    """Simulate one training epoch."""
    # Fake metrics that improve over time
    base_loss = 2.5
    loss = base_loss * (0.95 ** epoch) + random.uniform(-0.1, 0.1)

    base_acc = 0.1
    acc = min(0.95, base_acc + (0.85 * epoch / total_epochs) + random.uniform(-0.05, 0.05))

    # Simulate training time
    time.sleep(2)  # Pretend each epoch takes 2 seconds

    return loss, acc

def save_checkpoint(output_dir, epoch, loss, acc):
    """Save a fake checkpoint."""
    checkpoint_dir = os.path.join(output_dir, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)

    checkpoint = {
        "epoch": epoch,
        "loss": loss,
        "accuracy": acc,
        "model_state": "fake_model_weights_here"
    }

    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.json")
    with open(checkpoint_path, 'w') as f:
        json.dump(checkpoint, f, indent=2)

    return checkpoint_path

def main():
    parser = argparse.ArgumentParser(description='Fake ML Training Script')
    parser.add_argument('--data-dir', type=str, required=True, help='Data directory')
    parser.add_argument('--output-dir', type=str, required=True, help='Output directory')
    parser.add_argument('--epochs', type=int, default=10, help='Number of epochs')
    parser.add_argument('--batch-size', type=int, default=32, help='Batch size')
    parser.add_argument('--lr', type=float, default=0.001, help='Learning rate')
    parser.add_argument('--seed', type=int, default=42, help='Random seed')

    args = parser.parse_args()

    # Set random seed
    random.seed(args.seed)
    np.random.seed(args.seed)

    print("="*60)
    print("FAKE ML TRAINING SCRIPT")
    print("="*60)
    print(f"Data directory: {args.data_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"Epochs: {args.epochs}")
    print(f"Batch size: {args.batch_size}")
    print(f"Learning rate: {args.lr}")
    print(f"Random seed: {args.seed}")
    print("="*60)

    # Print GPU info
    print("\nSystem Information:")
    print_gpu_info()
    print()

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Simulate data loading
    print("Loading dataset...")
    if os.path.exists(args.data_dir):
        print(f"✓ Found data directory: {args.data_dir}")
    else:
        print(f"⚠ Data directory not found, using fake data")
    time.sleep(1)

    # Training loop
    print("\nStarting training...")
    best_loss = float('inf')

    for epoch in range(args.epochs):
        print(f"\nEpoch {epoch+1}/{args.epochs}")
        print("-" * 40)

        # Simulate training
        loss, acc = simulate_epoch(epoch, args.epochs, args.batch_size, args.lr)

        print(f"Loss: {loss:.4f}")
        print(f"Accuracy: {acc:.4f}")

        # Save checkpoint every 5 epochs or if best
        if (epoch + 1) % 5 == 0 or loss < best_loss:
            checkpoint_path = save_checkpoint(args.output_dir, epoch + 1, loss, acc)
            print(f"Saved checkpoint: {checkpoint_path}")

        if loss < best_loss:
            best_loss = loss
            best_checkpoint = os.path.join(args.output_dir, "checkpoints", "best_model.json")
            with open(best_checkpoint, 'w') as f:
                json.dump({"epoch": epoch + 1, "loss": loss, "accuracy": acc}, f)
            print(f"New best model saved!")

    # Save final results
    results = {
        "final_epoch": args.epochs,
        "final_loss": loss,
        "final_accuracy": acc,
        "best_loss": best_loss,
        "hyperparameters": vars(args)
    }

    results_path = os.path.join(args.output_dir, "training_results.json")
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)

    print("\n" + "="*60)
    print("TRAINING COMPLETED!")
    print(f"Final Loss: {loss:.4f}")
    print(f"Final Accuracy: {acc:.4f}")
    print(f"Best Loss: {best_loss:.4f}")
    print(f"Results saved to: {results_path}")
    print("="*60)

if __name__ == "__main__":
    main()
```
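Since fake_train.py needs only argparse, numpy, and the standard library (torch is optional), it can be smoke-tested outside Slurm before wiring it into the job scripts below. A minimal invocation, with illustrative local paths:

```bash
# Quick interactive smoke test; the paths here are illustrative
python fake_train.py \
    --data-dir /tmp/fake_data \
    --output-dir /tmp/fake_results \
    --epochs 2 --batch-size 16 --lr 0.01

# Inspect the summary the script writes at the end
cat /tmp/fake_results/training_results.json
```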

test_array_job.sh

Lines changed: 62 additions & 0 deletions
```bash
#!/bin/bash
#SBATCH --job-name=test-array
#SBATCH --output=logs/array_%A_%a.out
#SBATCH --error=logs/array_%A_%a.err
#SBATCH --time=00:10:00
#SBATCH --array=1-6
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1G

module load eth_proxy

echo "Array Job Test"
echo "=============="
echo "Array Job ID: $SLURM_ARRAY_JOB_ID"
echo "Array Task ID: $SLURM_ARRAY_TASK_ID"
echo "Running on: $(hostname)"
echo ""

# Define parameter arrays for hyperparameter search
learning_rates=(0.001 0.01 0.1)
batch_sizes=(16 32)

# Calculate indices for 2D parameter grid
# We have 3 LRs x 2 batch sizes = 6 total combinations
lr_index=$(( ($SLURM_ARRAY_TASK_ID - 1) / ${#batch_sizes[@]} ))
bs_index=$(( ($SLURM_ARRAY_TASK_ID - 1) % ${#batch_sizes[@]} ))

LR=${learning_rates[$lr_index]}
BS=${batch_sizes[$bs_index]}

echo "Testing parameters:"
echo "Learning Rate: $LR"
echo "Batch Size: $BS"
echo ""

# Activate environment
source /cluster/project/rsl/$USER/miniconda3/bin/activate test_env 2>/dev/null || echo "Using base environment"

# Create output directory for this parameter combination
OUTPUT_DIR=/cluster/project/rsl/$USER/hp_search/lr${LR}_bs${BS}
mkdir -p $OUTPUT_DIR

# Run fake training with these parameters
if [ -f /cluster/home/$USER/fake_train.py ]; then
    python /cluster/home/$USER/fake_train.py \
        --data-dir /tmp/fake_data \
        --output-dir $OUTPUT_DIR \
        --epochs 5 \
        --batch-size $BS \
        --lr $LR \
        --seed $SLURM_ARRAY_TASK_ID

    echo ""
    echo "Results saved to: $OUTPUT_DIR"
else
    echo "Training script not found, simulating results..."
    echo "{\"lr\": $LR, \"bs\": $BS, \"final_loss\": $(echo "scale=4; 1.5 - $SLURM_ARRAY_TASK_ID * 0.1" | bc)}" > $OUTPUT_DIR/results.json
fi

echo ""
echo "Task $SLURM_ARRAY_TASK_ID completed"
```
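Two practical notes on this array: Slurm does not create the logs/ directory named in --output/--error, so it must exist before submission, and each task writes its summary under the hp_search tree shown above. A sketch of submitting the sweep and skimming the per-combination results (plain grep is used rather than assuming jq is installed):

```bash
# The logs/ directory must exist before sbatch, or task output is lost
mkdir -p logs
sbatch test_array_job.sh

# After the array finishes, skim the summaries written by fake_train.py
for f in /cluster/project/rsl/$USER/hp_search/*/training_results.json; do
    echo "== $f =="
    grep -E '"final_loss"|"final_accuracy"' "$f"
done
```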

test_container_extraction.sh

Lines changed: 46 additions & 0 deletions
```bash
#!/bin/bash
#SBATCH --job-name=test-container-extract
#SBATCH --output=test_container_%j.out
#SBATCH --error=test_container_%j.err
#SBATCH --time=00:10:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --mem-per-cpu=2G
#SBATCH --tmp=20G

# Load required modules
module load eth_proxy

echo "Container Extraction Test"
echo "========================"
echo "Job started on $(hostname) at $(date)"
echo "Job ID: $SLURM_JOB_ID"
echo "Temp directory: $TMPDIR"
echo ""

# Check available space
echo "Available space in $TMPDIR:"
df -h $TMPDIR
echo ""

# Test extraction timing
echo "Extracting container to local scratch..."
time tar -xf /cluster/work/rsl/$USER/containers/euler-test.tar -C $TMPDIR

echo ""
echo "Extraction complete. Checking contents:"
ls -la $TMPDIR/
echo ""

# Check if it's a singularity image
if [ -f "$TMPDIR/euler-test.sif" ]; then
    echo "Found singularity image: euler-test.sif"
    echo "Image size: $(du -h $TMPDIR/euler-test.sif | cut -f1)"

    echo ""
    echo "Testing singularity exec:"
    singularity exec $TMPDIR/euler-test.sif echo "Hello from container!"
fi

echo ""
echo "Job completed at $(date)"
```
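The test expects a tarball at /cluster/work/rsl/$USER/containers/euler-test.tar containing euler-test.sif. One way to stage such a tarball is sketched below; the docker://ubuntu:22.04 source is purely illustrative, any small image works:

```bash
# Stage a small test image and wrap it in the tarball the job expects
mkdir -p /cluster/work/rsl/$USER/containers
cd /cluster/work/rsl/$USER/containers
singularity pull euler-test.sif docker://ubuntu:22.04   # illustrative source image
tar -cf euler-test.tar euler-test.sif

# Submit the extraction test
sbatch test_container_extraction.sh
```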

test_full_training_job.sh

Lines changed: 78 additions & 0 deletions
```bash
#!/bin/bash
#SBATCH --job-name=test-ml-training
#SBATCH --output=test_training_%j.out
#SBATCH --error=test_training_%j.err
#SBATCH --time=00:15:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=2G
#SBATCH --gpus=1
#SBATCH --tmp=50G

# Load modules
module load eth_proxy

echo "========================================="
echo "ML Training Job Test"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Running on: $(hostname)"
echo "Start time: $(date)"
echo "GPU allocation: $CUDA_VISIBLE_DEVICES"
echo "CPUs: $SLURM_CPUS_PER_TASK"
echo "Temp directory: $TMPDIR"
echo "========================================="

# Show GPU info
echo -e "\nGPU Information:"
nvidia-smi --query-gpu=name,memory.total --format=csv

# Create fake dataset in local scratch
echo -e "\nPreparing fake dataset..."
mkdir -p $TMPDIR/fake_dataset/{train,val}
echo "Dataset created in $TMPDIR/fake_dataset"

# Activate conda environment
echo -e "\nActivating conda environment..."
source /cluster/project/rsl/$USER/miniconda3/bin/activate
conda activate test_env || echo "test_env not found, using base environment"

# Copy training script
echo -e "\nCopying training script..."
cp /cluster/home/$USER/fake_train.py $TMPDIR/

# Set up output directory
OUTPUT_DIR=/cluster/project/rsl/$USER/results/test_$SLURM_JOB_ID
mkdir -p $OUTPUT_DIR

# Run training
echo -e "\nStarting training..."
cd $TMPDIR
python fake_train.py \
    --data-dir $TMPDIR/fake_dataset \
    --output-dir $OUTPUT_DIR \
    --epochs 10 \
    --batch-size 64 \
    --lr 0.001

# Check results
echo -e "\nTraining completed. Results:"
if [ -f "$OUTPUT_DIR/training_results.json" ]; then
    cat $OUTPUT_DIR/training_results.json
else
    echo "No results file found"
fi

echo -e "\nOutput files:"
ls -la $OUTPUT_DIR/

# Simulate copying important results back
if [ -d "$OUTPUT_DIR/checkpoints" ]; then
    echo -e "\nCheckpoints saved:"
    ls -la $OUTPUT_DIR/checkpoints/
fi

echo -e "\n========================================="
echo "Job completed at $(date)"
echo "Results saved to: $OUTPUT_DIR"
echo "========================================="
```
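To exercise the end-to-end test, the job can be submitted and followed from a login node with standard Slurm commands; the log file name follows the #SBATCH --output pattern above:

```bash
# Submit the full training test and follow its progress
jobid=$(sbatch --parsable test_full_training_job.sh)
squeue -j "$jobid"                        # queue state
tail -f test_training_${jobid}.out        # stream the log once the job starts

# After completion, inspect results on the project partition
ls -la /cluster/project/rsl/$USER/results/test_${jobid}/
```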
