Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ machine_rank: 0
main_training_function: main
mixed_precision: 'bf16'
num_machines: 1
num_processes: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
main_process_port: 0
82 changes: 82 additions & 0 deletions jobs/unkillable_diversity.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/bin/bash
#SBATCH --job-name=validate_static_diversity
#SBATCH --tasks=1
#SBATCH --cpus-per-task=6
#SBATCH --account=
#SBATCH --gres=gpu:a100:2
#SBATCH --mem=128G
#SBATCH --time=24:00:00
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err
#SBATCH --mail-type=ALL
#SBATCH --mail-user=

# Spin up a local vLLM embedding server, wait for it to come up, then run the
# AIF diversity validation over a fixed list of datasets.
#
# Abort on unhandled errors and on failures inside pipelines.
# (-u is deliberately omitted: venv activate/deactivate scripts may read
#  variables that are unset on first activation.)
set -eo pipefail

cd  # no argument: go to $HOME, where .venv/ and projects/ live
module load python/3.10
module load cuda/12.6
source .venv/bin/activate

echo "Starting vLLM server..."
uv run vllm serve Salesforce/SFR-Embedding-Mistral \
    --dtype bfloat16 \
    --api-key openai \
    --kv-cache-dtype fp8 \
    --task embed \
    --trust-remote-code \
    --tensor_parallel_size 2 \
    --max-model-len 4096 &

# Save server process ID and guarantee teardown on ANY exit path.
# Previously the server was left running after the validations finished
# (or after a failure), holding both GPUs until the job hit its walltime.
SERVER_PID=$!
trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT

echo "Waiting for server to start..."
while true; do
    echo "Checking if server is up..."
    # curl exits non-zero while the server is still booting; '|| true'
    # keeps 'set -e' from aborting the whole job during the poll.
    RESPONSE=$(curl -s http://localhost:8000/v1/models -H "Authorization: Bearer openai" 2>&1 || true)

    if [[ "$RESPONSE" == *"data"* ]]; then
        echo "Server is up and running!"
        break
    fi

    # Bail out early if the server process itself has died.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
        echo "Server process died unexpectedly"
        exit 1
    fi

    echo "Server not ready yet. Waiting 5 seconds..."
    sleep 5
done

deactivate
cd projects/AIF-Gen
source .env

echo "Starting validation process..."

# List of all sub-tasks (dataset basenames under data/) to validate.
tasks=(
    merged_qna
    merged_qna_summary
    hh
    ultra
    cppo-rl-sampled
    cppo-reward-sampled
)

for t in "${tasks[@]}"; do
    echo "Validating $t..."
    uv run aif validate \
        "data/$t.json" \
        "data/$t-validate-diversity-nvidia.json" \
        --no-validate-diversity \
        --no-validate-count \
        --no-validate-entropy \
        --no-validate-llm-judge \
        --embedding-model "Salesforce/SFR-Embedding-Mistral" \
        --embedding-batch-size 128 \
        --max_concurrency 8 \
        || { echo "Validation failed on $t"; exit 1; }
done

echo "All validations completed successfully."
149 changes: 22 additions & 127 deletions jobs/validate_all_static.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#SBATCH --job-name=validate_static_all
#SBATCH --partition=unkillable-cpu
#SBATCH --cpus-per-task=2
#SBATCH --mem=16G
#SBATCH --time=24:00:00
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err
Expand All @@ -10,130 +11,24 @@

source .env

uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_qna_direct/*/data.json \
data/70B_15_validation/70B/education_qna_direct/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_qna_eli5/*/data.json \
data/70B_15_validation/70B/education_qna_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_qna_expert/*/data.json \
data/70B_15_validation/70B/education_qna_expert/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_qna_hinted/*/data.json \
data/70B_15_validation/70B/education_qna_hinted/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_summary_eli5/*/data.json \
data/70B_15_validation/70B/education_summary_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_summary_expert/*/data.json \
data/70B_15_validation/70B/education_summary_expert/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_generate_long/*/data.json \
data/70B_15_validation/70B/politics_generate_long/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_generate_short/*/data.json \
data/70B_15_validation/70B/politics_generate_short/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_qna_eli5/*/data.json \
data/70B_15_validation/70B/politics_qna_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_qna_expert/*/data.json \
data/70B_15_validation/70B/politics_qna_expert/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_summary_eli5/*/data.json \
data/70B_15_validation/70B/politics_summary_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_summary_expert/*/data.json \
data/70B_15_validation/70B/politics_summary_expert/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_generate_short/*/data.json \
data/70B_15_validation/70B/politics_generate_short/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_qna_eli5/*/data.json \
data/70B_15_validation/70B/politics_qna_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_summary_eli5/*/data.json \
data/70B_15_validation/70B/politics_summary_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/tech_healthcare_summary_expert/*/data.json \
data/70B_15_validation/70B/tech_healthcare_summary_expert/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct
# Run the static (non-diversity, non-judge) validation over each dataset.
# NOTE: the old per-dataset MODEL variable was removed — every run below
# passes --no-validate-llm-judge, so no judge model is ever consumed.
MAX_CONC=256

# List of all sub-tasks (dataset basenames under data/) to validate.
tasks=(
    internal_subsampled_merged
)

for t in "${tasks[@]}"; do
    echo "Validating $t..."
    uv run aif validate \
        --max_concurrency "$MAX_CONC" \
        "data/$t.json" \
        "data/$t-validate-no-diversity.json" \
        --no-validate-diversity \
        --no-validate-embedding-diversity \
        --no-validate-llm-judge \
        || { echo "Validation failed on $t"; exit 1; }
done

echo "All validations completed successfully."
72 changes: 72 additions & 0 deletions jobs/validate_all_static_diversity.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/bash
#SBATCH --job-name=validate_static_all_diversity_external
#SBATCH --partition=main
#SBATCH --gres=gpu:a100l:1
#SBATCH --mem=48G
#SBATCH --cpus-per-task=8
#SBATCH --time=24:00:00
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err
#SBATCH --mail-type=ALL
#SBATCH --mail-user=

# Spin up a local vLLM embedding server and run the embedding-diversity
# validation over a fixed list of datasets.
#
# Abort on unhandled errors and on failures inside pipelines.
# (-u is deliberately omitted: venv activate/deactivate scripts may read
#  variables that are unset on first activation.)
set -eo pipefail

cd  # no argument: go to $HOME, where .venv/ and projects/ live
module load python/3.10
module load cuda/12.6.0
source .venv/bin/activate

echo "Starting vLLM server..."
# NOTE(review): "--quantization int8" does not look like a valid vLLM
# quantization method (known values include fp8/awq/gptq/bitsandbytes) —
# confirm against the installed vLLM version before relying on it.
uv run vllm serve intfloat/e5-mistral-7b-instruct --quantization int8 --api-key openai --task embed --trust-remote-code --max-model-len 4096 &

# Save server process ID and guarantee teardown on ANY exit path.
# Previously the server was left running after the validations finished
# (or after a failure), holding the GPU until the job hit its walltime.
SERVER_PID=$!
trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT

echo "Waiting for server to start..."
while true; do
    echo "Checking if server is up..."
    # curl exits non-zero while the server is still booting; '|| true'
    # keeps 'set -e' from aborting the whole job during the poll.
    RESPONSE=$(curl -s http://localhost:8000/v1/models -H "Authorization: Bearer openai" 2>&1 || true)

    if [[ "$RESPONSE" == *"data"* ]]; then
        echo "Server is up and running!"
        break
    fi

    # Bail out early if the server process itself has died.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
        echo "Server process died unexpectedly"
        exit 1
    fi

    echo "Server not ready yet. Waiting 5 seconds..."
    sleep 5
done

deactivate
cd projects/AIF-Gen
source .env

echo "Starting validation process..."

# List of all sub-tasks (dataset basenames under data/) to validate.
tasks=(
    merged_qna
    merged_qna_summary
    hh
    ultra
)

for t in "${tasks[@]}"; do
    echo "Validating $t..."
    # The failure handler below was previously commented out while the
    # preceding line still ended in '\', which both silently swallowed
    # validation failures and relied on fragile backslash-into-comment
    # joining. Restored so a failed dataset aborts the job.
    uv run aif validate \
        "data/$t.json" \
        "data/$t-validate-diversity-nvidia.json" \
        --no-validate-diversity \
        --no-validate-count \
        --no-validate-entropy \
        --no-validate-llm-judge \
        --embedding-model "intfloat/e5-mistral-7b-instruct" \
        --embedding-batch-size 64 \
        --max_concurrency 8 \
        || { echo "Validation failed on $t"; exit 1; }
done

echo "All validations completed successfully."
541 changes: 541 additions & 0 deletions notebooks/continuity.ipynb

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"pydantic>=2.10.4",
"pytest-asyncio>=0.25.3",
"pytest-mock>=3.14.0",
"setuptools>=75.8.2",
"torch==2.3.0",
"types-pyyaml>=6.0.12.20241230",
]
Expand All @@ -40,6 +41,9 @@ dev = [
"ruff>=0.7.3",
"sphinx>=7.4.7",
"sphinx-rtd-theme>=3.0.2",
"ipykernel>=6.29.5",
"matplotlib>=3.10.1",
"scipy>=1.15.2",
]
benchmarks-dpo = [
"datasets>=3.2.0",
Expand Down
Loading