Data Generation Second Phase - service.py cache fix and job scripts add #138
@@ -0,0 +1,89 @@
```python
import logging
import pathlib
from typing import Optional

import click

from aif_gen.dataset.continual_alignment_dataset import (
    ContinualAlignmentDataset,
)
from aif_gen.util.hf import download_from_hf, upload_to_hf
from aif_gen.util.seed import seed_everything


@click.command(context_settings={'show_default': True})
@click.argument(
    'input_data_file',
    type=click.Path(exists=True, dir_okay=False, path_type=pathlib.Path),
)
@click.argument(
    'output_data_file',
    type=click.Path(dir_okay=False, path_type=pathlib.Path),
)
@click.argument(
    'words',
    type=click.STRING,
)
@click.option(
    '--random_seed',
    type=int,
    help='Random seed for data generation.',
    default=0,
)
@click.option(
    '--hf-repo-id',
    type=click.STRING,
    default=None,
    help='If not None, push the generated input_dataset to a HuggingFace remote repository with the associated repo-id.',
)
def clean_dataset(
    input_data_file: pathlib.Path,
    output_data_file: pathlib.Path,
    words: str,
    random_seed: int,
    hf_repo_id: Optional[str],
) -> None:
    r"""Clean a ContinualAlignmentDataset given a space-separated string of words.

    INPUT_DATA_FILE: Path to the input dataset.
    OUTPUT_DATA_FILE: Path to the output dataset.
    WORDS: Space-separated string of words to clean the dataset.
    """
    if hf_repo_id is not None:
        input_data_file = download_from_hf(hf_repo_id, input_data_file)

    logging.info(f'Reading input_dataset from: {input_data_file}')
    input_dataset = ContinualAlignmentDataset.from_json(input_data_file)
    logging.info(f'Read {len(input_dataset)} samples from: {input_data_file}')

    if not len(input_dataset):
        logging.warning('No samples found in dataset, skipping clean up.')
        return

    logging.info(f'Using words: {words}')
    logging.info(f'Random seed: {random_seed}')
    seed_everything(random_seed)

    output_data_file.parent.mkdir(parents=True, exist_ok=True)

    words_list = words.split(' ')
    if len(words_list) == 0:
        logging.warning('No words found in words string, skipping clean up.')
        return

    # clean up each data point in the dataset
    for dataset in input_dataset.datasets:
        for sample in dataset.samples:
            for word in words_list:
                sample.prompt = sample.prompt.replace(word, '')
                sample.chosen = sample.chosen.replace(word, '')
                sample.rejected = sample.rejected.replace(word, '')

    logging.info(f'Finished cleaning dataset.')

    logging.info(f'Writing {len(dataset)} samples to {output_data_file}')
    input_dataset.to_json(output_data_file)
    logging.info(f'Wrote {len(dataset)} samples to {output_data_file}')
```
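A quick way to exercise the new command, sketched with click's test runner; the paths and word string below are illustrative, and the input file must already hold a valid ContinualAlignmentDataset since the argument is declared with `exists=True`:

```python
from click.testing import CliRunner

# `clean_dataset` is assumed to be imported from the module above;
# 'data/input.json' is a hypothetical pre-existing dataset file.
runner = CliRunner()
result = runner.invoke(
    clean_dataset,
    ['data/input.json', 'data/cleaned.json', 'foo bar', '--random_seed', '0'],
)
assert result.exit_code == 0, result.output
```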
Comment on lines +84 to +86
Suggested change:
```diff
-    logging.info(f'Writing {len(dataset)} samples to {output_data_file}')
-    input_dataset.to_json(output_data_file)
-    logging.info(f'Wrote {len(dataset)} samples to {output_data_file}')
+    total_samples = sum(len(dataset.samples) for dataset in input_dataset.datasets)
+    logging.info(f'Writing {total_samples} samples to {output_data_file}')
+    input_dataset.to_json(output_data_file)
+    logging.info(f'Wrote {total_samples} samples to {output_data_file}')
```
Copilot is right here
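For context, a minimal standalone sketch of why `len(dataset)` undercounts here (plain lists stand in for the real datasets):

```python
# After a `for dataset in ...:` loop ends, `dataset` still points at the
# *last* subset, so `len(dataset)` reports only that subset's size.
datasets = [[1, 2, 3], [4, 5]]  # stand-ins for input_dataset.datasets
for dataset in datasets:
    pass
print(len(dataset))                   # 2 -- the last subset only
print(sum(len(d) for d in datasets))  # 5 -- the total the log message intends
```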
Member: Assuming we are finalized on this approach for our first release and paper, we should move this to config (in a subsequent PR).
Member: Also, it could be worth fixing #140 while we're here, unless you'd rather do it separately for a clean git history.
@@ -0,0 +1,169 @@
```bash
#!/bin/bash
#SBATCH --job-name=generate_static_all_70B_final
#SBATCH --partition=main
#SBATCH --mem=48G
#SBATCH --cpus-per-task=6
#SBATCH --time=24:00:00
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err
#SBATCH --mail-type=ALL
#SBATCH --mail-user=

# set -euo pipefail
source .env

# 1) start the vllm server in the background
uvx vllm serve meta-llama/Meta-Llama-3-70B-Instruct \
    --dtype auto \
    --api-key openai \
    --tensor-parallel-size 2 &
SERVER_PID=$!
echo "⏳ Waiting for VLLM server (PID=$SERVER_PID) to come up…"

# replace fixed sleep with a health-check loop
export UV_VLLM_SERVER_URL="http://127.0.0.1:8000" # tell `uv run` where to send requests
for i in $(seq 1 600); do
    if curl -fs "${UV_VLLM_SERVER_URL}/health"; then
        echo "✅ VLLM up after $((i*5))s"
        break
    fi
    echo "…still waiting ($i/600)…"
    sleep 5
done

# helper to run one job
# NOTE: the function name was garbled in the diff rendering; `run_job` is
# assumed here. It is not referenced below, since each job is invoked directly.
run_job() { echo "➡️ $*"; eval "$*"; }

# 2) run all generation jobs sequentially
uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/education_qna_direct/data.json" \
    config/static_copy/education_qna_direct.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/education_qna_eli5/data.json" \
    config/static_copy/education_qna_eli5.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/education_qna_expert/data.json" \
    config/static_copy/education_qna_expert.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/education_qna_hinted/data.json" \
    config/static_copy/education_qna_hinted.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/education_summary_eli5/data.json" \
    config/static_copy/education_summary_eli5.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/education_summary_expert/data.json" \
    config/static_copy/education_summary_expert.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/politics_generate_formal/data.json" \
    config/static_copy/politics_generate_formal.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/politics_generate_rapper/data.json" \
    config/static_copy/politics_generate_rapper.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/politics_generate_shakespeare/data.json" \
    config/static_copy/politics_generate_shakespeare.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/politics_qna_eli5/data.json" \
    config/static_copy/politics_qna_eli5.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/politics_qna_expert/data.json" \
    config/static_copy/politics_qna_expert.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/politics_summary_eli5/data.json" \
    config/static_copy/politics_summary_eli5.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/politics_summary_expert/data.json" \
    config/static_copy/politics_summary_expert.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/tech_healthcare_qna_eli5/data.json" \
    config/static_copy/tech_healthcare_qna_eli5.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/tech_healthcare_qna_expert/data.json" \
    config/static_copy/tech_healthcare_qna_expert.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/tech_physics_summary_eli5/data.json" \
    config/static_copy/tech_physics_summary_eli5.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/tech_physics_summary_expert/data.json" \
    config/static_copy/tech_physics_summary_expert.yaml \
    Meta-Llama-3.1-70B-Instruct

uv run aif generate \
    --include-preference-axes \
    --max_concurrency 256 \
    --output_file "data/70B_generation/tech_physics_summary_highschool/data.json" \
    config/static_copy/tech_physics_summary_highschool.yaml \
    Meta-Llama-3.1-70B-Instruct

# 3) shutdown the server when done
echo "✅ All jobs finished. Shutting down VLLM server (PID=$SERVER_PID)…"
kill $SERVER_PID
wait $SERVER_PID 2>/dev/null || true
echo "🛑 Server stopped."
```
What's the use case?
To reiterate, is this necessary to merge, or just useful as a pre-processing step for our data? By the way, I would rather we not use this in excess (or at all), since it's unclear how removing specific words could alter the latent preference that we are aiming to model.
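On the word-removal concern: `str.replace` deletes substrings, not whole words, so the cleanup can also mangle unrelated tokens. If the step stays, a word-boundary regex would at least bound the damage; a hedged sketch (helper name illustrative):

```python
import re

def remove_words(text: str, words: list[str]) -> str:
    # \b anchors match whole words only, so removing 'cat' leaves 'category' intact.
    pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, words)) + r')\b')
    return pattern.sub('', text)

print('a cat in a category'.replace('cat', ''))      # 'a  in a egory'
print(remove_words('a cat in a category', ['cat']))  # 'a  in a category'
```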