diff --git a/README.md b/README.md index 96a5599f..c560b7de 100644 --- a/README.md +++ b/README.md @@ -11,31 +11,70 @@ of the model. This results in speedups (without any additional code optimization The code is arranged as a package `slicegpt` in `/src`, and scripts to replicate experiments from the paper are in `/experiments`. To install the `slicegpt` package, we recommend -`pip install -e .` +``` + pip install -e . +``` ## Running SliceGPT To run SliceGPT on `microsoft/phi-2`, from the `experiments` folder, run ``` - python run_slicegpt_perplexity.py \ + python run_slicegpt.py \ --model microsoft/phi-2 \ --save-dir dir/to/save/sliced_model/in \ --sparsity 0.25 \ - --no-wandb \ --device cuda:0 \ - --eval-baseline + --eval-baseline \ + --no-wandb ``` This will compress the `microsoft/phi-2` model and save the compressed model to the specified directory. Please consult the script for the full set of options. -The experiments folder also contains scripts for -- [finetuning](./experiments/run_finetuning.py) the compressed model to recover most of the quality lost during compression -- [zero-shot task evaluation](./experiments/run_zero_shot_tasks.py) of a dense, compressed or fine-tuned model - _Note:_ For models that require Hugging Face authentication, set the `--hf-token` argument manually or using a key vault. Alternatively, set the environment variable `HF_TOKEN`. +### Recovery fine-tuning + +To install additional dependencies required for post-slicing recovery fine-tuning (RFT): + +``` + pip install -e .[finetune] +``` + +The following replicates the experiments in the paper (the LoRA hyperparameters are valid for all Llama-2 and Phi-2 models): +``` + python run_finetuning.py \ + --model microsoft/phi-2 \ + --sliced-model-path path/to/sliced/model.pt \ + --save-dir dir/to/save/finetuned_model/in \ + --sparsity 0.25 \ + --device cuda:0 \ + --ppl-eval-dataset alpaca \ + --finetune-dataset alpaca \ + --finetune-train-nsamples 8000 \ + --finetune-train-seqlen 1024 \ + --finetune-train-batch-size 3 \ + --lora-alpha 10 \ + --lora-r 32 \ + --lora-dropout 0.05 \ + --lora-target-option attn_head_and_mlp \ + --eval-steps 16 \ + --save-steps 16 \ + --no-wandb +``` + +Note: the script [`bo_finetuning.py`](./experiments/bo_finetuning.py) can be used to run Bayesian optimization over the RFT hyperparameters. + +### Evaluation using the [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) +``` + python run_lm_eval.py \ + --model microsoft/phi-2 \ + --sliced-model-path path/to/sliced/model.pt \ + --tasks piqa \ + --no-wandb +``` + ## Supported models The following models from Hugging Face hub are currently supported @@ -75,7 +114,7 @@ and update `hf_utils.get_model_and_tokenizer` before slicing the new model. ([Phi-2](./src/slicegpt/adapters/phi2_adapter.py)). The `self.*_shortcut_Q` matrices are attached to the modules during slicing and are available in `forward()`. If the skip connection does not need modification, these matrices will be None, and the `forward()` method can follow the original workflow. For more details on this, - please read Section 3 [the paper](https://arxiv.org/abs/2401.15024). + please read Section 3 in [the paper](https://arxiv.org/abs/2401.15024). 
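+
+   As a minimal, hypothetical sketch (simplified from the real adapters; the
+   `self_attn` and `input_layernorm` submodule names are illustrative), a
+   compressed block's `forward()` can apply the attention shortcut rotation
+   like this:
+
+   ```
+   import torch
+   from torch import nn
+
+   class CompressedBlock(nn.Module):
+       def __init__(self, attn: nn.Module, norm: nn.Module, attn_shortcut_Q: torch.Tensor | None):
+           super().__init__()
+           self.self_attn = attn
+           self.input_layernorm = norm
+           # None when the skip connection needs no modification
+           self.attn_shortcut_Q = attn_shortcut_Q
+
+       def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+           residual = hidden_states
+           hidden_states = self.self_attn(self.input_layernorm(hidden_states))
+           if self.attn_shortcut_Q is not None:
+               # The skip connection was sliced: rotate the residual into the
+               # reduced basis so it matches the sliced attention output.
+               residual = torch.matmul(residual, self.attn_shortcut_Q)
+           return residual + hidden_states
+   ```
+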
Example: [llama_adapter.py](./src/slicegpt/adapters/llama_adapter.py) diff --git a/experiments/run_zero_shot_tasks.py b/experiments/run_lm_eval.py similarity index 91% rename from experiments/run_zero_shot_tasks.py rename to experiments/run_lm_eval.py index 4d999abf..ed093783 100644 --- a/experiments/run_zero_shot_tasks.py +++ b/experiments/run_lm_eval.py @@ -18,12 +18,14 @@ from slicegpt import gpu_utils, hf_utils, utils from slicegpt.config import config -utils.configure_logging() +# Use the logger from lm_eval, adding a file handler to write the log to file +logging = lm_eval_utils.eval_logger +logging.addHandler(utils.create_file_handler(log_dir="log")) os.environ["WANDB__SERVICE_WAIT"] = "300" -def parse_args() -> argparse.Namespace: +def argparser() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument( "--model", @@ -54,7 +56,7 @@ def parse_args() -> argparse.Namespace: help="Interval for rounding the weights (the best value may depend on your hardware)", ) parser.add_argument('--hf-token', type=str, default=os.getenv('HF_TOKEN', None)) - parser.add_argument("--batch-size", type=int, default=1, help="Batch size for evaluating with lm eval harness.") + parser.add_argument("--batch-size", type=int, default=64, help="Batch size for evaluating with lm eval harness.") parser.add_argument( "--distribute-model", action="store_true", @@ -69,14 +71,21 @@ def parse_args() -> argparse.Namespace: choices=lm_eval_utils.MultiChoice(tasks.ALL_TASKS), ) parser.add_argument('--num-fewshot', type=int, default=0, help="Number of fewshots for all tasks.") - return parser.parse_args() + + args = parser.parse_args() + + logging.info('Parsed arguments:') + for arg, value in vars(args).items(): + logging.info(f'{arg} = {value}') + + return args def main() -> None: - logging.info("Running SliceGPT zeroshot tasks experiment.") + logging.info("Running SliceGPT LM eval experiment.") initialize_tasks() - args = parse_args() + args = argparser() logging.info(f"PyTorch device: {config.device}") logging.info(f"Number of available cuda devices: {torch.cuda.device_count()}") diff --git a/experiments/run_slicegpt_perplexity.py b/experiments/run_slicegpt.py similarity index 98% rename from experiments/run_slicegpt_perplexity.py rename to experiments/run_slicegpt.py index e386940a..7008ff28 100755 --- a/experiments/run_slicegpt_perplexity.py +++ b/experiments/run_slicegpt.py @@ -54,7 +54,7 @@ def argparser() -> argparse.Namespace: help="Number of samples of the calibration data to load.", default=128, ) - parser.add_argument("--cal-batch-size", type=int, default=1, help="Batch size for loading the calibration data.") + parser.add_argument("--cal-batch-size", type=int, default=16, help="Batch size for loading the calibration data.") parser.add_argument( "--cal-max-seqlen", type=int, default=2048, help="Maximum sequence length for the calibration data." 
) @@ -128,7 +128,7 @@ def argparser() -> argparse.Namespace: def main() -> None: - logging.info("Running SliceGPT perplexity experiment") + logging.info("Running SliceGPT experiment.") args = argparser() diff --git a/src/slicegpt/utils.py b/src/slicegpt/utils.py index 10032b7d..1cce65b3 100644 --- a/src/slicegpt/utils.py +++ b/src/slicegpt/utils.py @@ -11,6 +11,18 @@ import torch +def create_file_handler(log_dir: str) -> logging.FileHandler: + path = pathlib.Path.cwd() / log_dir / f'{datetime.datetime.now():log_%Y-%m-%d-%H-%M-%S}.log' + path.parent.mkdir(parents=True, exist_ok=True) + file_handler = logging.FileHandler(path, encoding='utf-8') + file_handler.setLevel(logging.DEBUG) + formatter = logging.Formatter( + '%(asctime)s.%(msecs)04d\t%(levelname)s\t%(name)s\t%(message)s', datefmt='%Y-%m-%dT%H:%M:%S' + ) + file_handler.setFormatter(formatter) + return file_handler + + def configure_logging( log_to_console: bool = True, log_to_file: bool = True, @@ -27,15 +39,7 @@ def configure_logging( handlers.append(handler) if log_to_file: - path = pathlib.Path.cwd() / log_dir / f'{datetime.datetime.now():log_%Y-%m-%d-%H-%M-%S}.log' - path.parent.mkdir(parents=True, exist_ok=True) - file_handler = logging.FileHandler(path, encoding='utf-8') - file_handler.setLevel(logging.DEBUG) - formatter = logging.Formatter( - '%(asctime)s.%(msecs)04d\t%(levelname)s\t%(name)s\t%(message)s', datefmt='%Y-%m-%dT%H:%M:%S' - ) - file_handler.setFormatter(formatter) - handlers.append(file_handler) + handlers.append(create_file_handler(log_dir=log_dir)) logging.basicConfig( handlers=handlers, diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 19ab4256..41c91a6d 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -59,6 +59,15 @@ def run_shell_command(command: str, cmd_args: list[str]) -> str: return run_shell_command(python, py_args) +def check_task_acc_in_log(log: str, task: str, expected_acc: float) -> None: + """Verify that the log contains the expected accuracy for the provided task.""" + match = re.search(rf'"{task}": (\d+\.\d+)', log) + assert match, f'Expected to find task {task} in the log' + assert np.isclose( + float(match.group(1)), expected_acc, atol=1e-2, rtol=1e-2 + ), f'Expected {expected_acc} but got {match.group(1)}' + + def check_ppl_in_log(log: str, expected_ppl: float | None, expected_parameters: int | None) -> None: """Verify that the log contains the expected perplexity and parameters result.""" @@ -76,11 +85,33 @@ def verify(pattern: str, value: float | int) -> None: verify(r'Sliced model parameters: ([0-9,]+)', expected_parameters) -def verify_run_slicegpt_perplexity(model: str, sparsity: float, expected_ppl: float, expected_parameters: int) -> None: - """Test the run_slicegpt_perplexity.py script with the provided parameters.""" +def verify_run_lm_eval( + model: str, sparsity: float, task: str, expected_acc_dense: float, expected_acc_sliced: float +) -> None: + """Test the run_lm_eval.py script with the provided parameters.""" + # test lm eval of a dense model + tests_dir = get_test_dir() + script = tests_dir.parent / 'experiments' / 'run_lm_eval.py' + save_dir = tests_dir / 'test_model_data' + args = ['--no-wandb', '--model', str(model)] + + ext_args = ['--sparsity', str(sparsity), '--tasks', task] + log = run_python_script(script, args + ext_args) + + check_task_acc_in_log(log, task, expected_acc_dense) + + # test lm eval of a sliced model + model_path = save_dir / (model.split('/')[-1] + '_' + str(sparsity) + '.pt') + ext_args = 
['--sliced-model-path', str(model_path), '--tasks', task] + log = run_python_script(script, args + ext_args) + check_task_acc_in_log(log, task, expected_acc_sliced) + + +def verify_run_slicegpt(model: str, sparsity: float, expected_ppl: float, expected_parameters: int) -> None: + """Test the run_slicegpt.py script with the provided parameters.""" # test rotate, slice and save model tests_dir = get_test_dir() - script = tests_dir.parent / 'experiments' / 'run_slicegpt_perplexity.py' + script = tests_dir.parent / 'experiments' / 'run_slicegpt.py' save_dir = tests_dir / 'test_model_data' args = ['--no-wandb', '--model', str(model)] @@ -88,7 +119,7 @@ def verify_run_slicegpt_perplexity(model: str, sparsity: float, expected_ppl: fl log = run_python_script(script, args + ext_args) check_ppl_in_log(log, expected_ppl=expected_ppl, expected_parameters=expected_parameters) - # test load and slice model + # test loading a sliced model model_path = save_dir / (model.split('/')[-1] + '_' + str(sparsity) + '.pt') ext_args = ['--sliced-model-path', str(model_path)] log = run_python_script(script, args + ext_args) @@ -98,26 +129,48 @@ @pytest.mark.experiment @pytest.mark.gpu def test_opt_125m(): - """Test the run_slicegpt_perplexity.py script with the facebook/opt-125m model.""" + """Test run_slicegpt.py and run_lm_eval.py with the facebook/opt-125m model.""" assert torch.cuda.is_available() - verify_run_slicegpt_perplexity( - model='facebook/opt-125m', - sparsity=0.2, + model = 'facebook/opt-125m' + sparsity = 0.2 + + verify_run_slicegpt( + model=model, + sparsity=sparsity, expected_ppl=34.53, expected_parameters=147_250_880, ) + verify_run_lm_eval( + model=model, + sparsity=sparsity, + task='piqa', + expected_acc_dense=0.6208, + expected_acc_sliced=0.5762, + ) + @pytest.mark.experiment @pytest.mark.gpu def test_phi_2(): - """Test the run_slicegpt_perplexity.py script with the microsoft/phi-2 model.""" + """Test run_slicegpt.py and run_lm_eval.py with the microsoft/phi-2 model.""" assert torch.cuda.is_available() - verify_run_slicegpt_perplexity( - model='microsoft/phi-2', - sparsity=0.2, + model = 'microsoft/phi-2' + sparsity = 0.2 + + verify_run_slicegpt( + model=model, + sparsity=sparsity, expected_ppl=11.2691, expected_parameters=2_391_772_160, ) + + verify_run_lm_eval( + model=model, + sparsity=sparsity, + task='piqa', + expected_acc_dense=0.7911, + expected_acc_sliced=0.7187, + )
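
Note: the tests above assume that `run_slicegpt.py` saves the sliced checkpoint as `<model tail>_<sparsity>.pt` inside the save directory. A minimal sketch of that naming convention (the helper `sliced_model_path` is an illustration for this note, not a function in the repo):

```
from pathlib import Path

def sliced_model_path(save_dir: Path, model: str, sparsity: float) -> Path:
    # Mirrors the path built in the tests:
    #   save_dir / (model.split('/')[-1] + '_' + str(sparsity) + '.pt')
    return save_dir / f"{model.split('/')[-1]}_{sparsity}.pt"

# e.g. facebook/opt-125m at 0.2 sparsity -> test_model_data/opt-125m_0.2.pt
assert sliced_model_path(Path('test_model_data'), 'facebook/opt-125m', 0.2) == Path(
    'test_model_data/opt-125m_0.2.pt'
)
```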