57 changes: 48 additions & 9 deletions README.md
@@ -11,31 +11,70 @@ of the model. This results in speedups (without any additional code optimization
The code is arranged as a package `slicegpt` in `/src`, and scripts to replicate experiments from the paper are in
`/experiments`. To install the `slicegpt` package, we recommend

`pip install -e .`
```
pip install -e .
```

## Running SliceGPT

To run SliceGPT on `microsoft/phi-2`, from the `experiments` folder, run
```
python run_slicegpt_perplexity.py \
python run_slicegpt.py \
--model microsoft/phi-2 \
--save-dir dir/to/save/sliced_model/in \
--sparsity 0.25 \
--no-wandb \
--device cuda:0 \
--eval-baseline
--eval-baseline \
--no-wandb
```

This will compress the `microsoft/phi-2` model and save the compressed model to the specified directory. Please consult
the script for the full set of options.
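
If you later want to work with the saved checkpoint directly, the sketch below shows one way to locate and load it. The `<model>_<sparsity>.pt` filename pattern comes from this repo's tests; `hf_utils.load_sliced_model` and its signature are an assumption here, so check `hf_utils` for the actual loading API.

```python
import pathlib

from slicegpt import hf_utils

model_name = "microsoft/phi-2"
sparsity = 0.25
save_dir = pathlib.Path("dir/to/save/sliced_model/in")

# The checkpoint name follows the "<model>_<sparsity>.pt" pattern that
# tests/test_experiments.py also relies on, e.g. "phi-2_0.25.pt".
ckpt = save_dir / f"{model_name.split('/')[-1]}_{sparsity}.pt"

# Hypothetical loader call -- consult hf_utils for the real helper name
# and signature before using this.
model_adapter, tokenizer = hf_utils.load_sliced_model(
    model_name, str(ckpt), sparsity=sparsity
)
```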

The experiments folder also contains scripts for
- [finetuning](./experiments/run_finetuning.py) the compressed model to recover most of the quality lost during compression
- [zero-shot task evaluation](./experiments/run_zero_shot_tasks.py) of a dense, compressed or fine-tuned model

_Note:_ For models that require Hugging Face authentication, set the `--hf-token` argument manually or via a key vault. Alternatively, set the environment variable `HF_TOKEN`.

### Recovery fine-tuning

To install additional dependencies required for post-slicing recovery fine-tuning (RFT):

```
pip install -e .[finetune]
```

The following replicates the experiments in the paper (LoRA hyperparams valid for all Llama-2 and Phi-2 models):
> **Contributor:** Have you tried running this with the fixes that I added in my PR #101 yet? I expect better results than in the paper

> **Contributor:** We should do this, but it can come as a separate change, with another update to the readme if needed.

```
python run_finetuning.py \
--model microsoft/phi-2 \
--sliced-model-path path/to/sliced/model.pt \
--save-dir dir/to/save/finetuned_model/in \
--sparsity 0.25 \
--device cuda:0 \
--ppl-eval-dataset alpaca \
--finetune-dataset alpaca \
--finetune-train-nsamples 8000 \
--finetune-train-seqlen 1024 \
--finetune-train-batch-size 3 \
--lora-alpha 10 \
--lora-r 32 \
--lora-dropout 0.05 \
--lora-target-option attn_head_and_mlp \
--eval-steps 16 \
--save-steps 16 \
--no-wandb
```
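
For orientation, the `--lora-*` flags above map onto a PEFT `LoraConfig` roughly like the sketch below. The `target_modules` list is an illustrative guess at what `attn_head_and_mlp` expands to for Phi-2; the authoritative mapping lives in `run_finetuning.py`.

```python
from peft import LoraConfig

# Rough equivalent of the --lora-* flags above. The target_modules list is
# a hypothetical expansion of --lora-target-option attn_head_and_mlp for
# Phi-2; run_finetuning.py resolves the real mapping itself.
lora_config = LoraConfig(
    r=32,
    lora_alpha=10,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
)
```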

Note: the script [`bo_finetuning.py`](./experiments/bo_finetuning.py) can be used to run Bayesian optimization over the RFT hyperparameters.

### Evaluation using the [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness)
```
python run_lm_eval.py \
--model microsoft/phi-2 \
--sliced-model-path path/to/sliced/model.pt \
--tasks piqa \
--no-wandb
```
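
For the dense-model case this is roughly equivalent to calling the harness directly, as sketched below. Treat it as an approximation: the script constructs the model object itself so it can also evaluate sliced checkpoints.

```python
import lm_eval

# Dense-model case only; batch_size and num_fewshot mirror the script's
# defaults. The script builds the (possibly sliced) model itself before
# handing it to the harness.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=microsoft/phi-2",
    tasks=["piqa"],
    num_fewshot=0,
    batch_size=64,
)
print(results["results"]["piqa"])
```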

## Supported models

The following models from the Hugging Face hub are currently supported
@@ -75,7 +114,7 @@ and update `hf_utils.get_model_and_tokenizer` before slicing the new model.
([Phi-2](./src/slicegpt/adapters/phi2_adapter.py)). The `self.*_shortcut_Q` matrices are attached to the modules during
slicing and are available in `forward()`. If the skip connection does not need modification, these matrices will be None,
and the `forward()` method can follow the original workflow. For more details on this,
please read Section 3 [the paper](https://arxiv.org/abs/2401.15024).
please read Section 3 in [the paper](https://arxiv.org/abs/2401.15024).
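
As a minimal sketch of the pattern (assuming module names like `self_attn` and `attn_shortcut_Q`; the real adapters differ in detail):

```python
import torch

# Sketch only: module names (input_layernorm, self_attn, attn_shortcut_Q)
# follow the pattern described above, not the adapters' verbatim code.
def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
    residual = hidden_states
    hidden_states = self.self_attn(self.input_layernorm(hidden_states), **kwargs)[0]
    if self.attn_shortcut_Q is not None:
        # Slicing changed this block's output basis, so rotate the skip
        # connection into the same (reduced) basis before the residual add.
        residual = torch.matmul(residual, self.attn_shortcut_Q)
    return residual + hidden_states
```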

Example: [llama_adapter.py](./src/slicegpt/adapters/llama_adapter.py)

21 changes: 15 additions & 6 deletions experiments/run_zero_shot_tasks.py → experiments/run_lm_eval.py
@@ -18,12 +18,14 @@
from slicegpt import gpu_utils, hf_utils, utils
from slicegpt.config import config

utils.configure_logging()
# Use the logger from lm_eval, adding a file handler to write the log to a file
logging = lm_eval_utils.eval_logger
logging.addHandler(utils.create_file_handler(log_dir="log"))

os.environ["WANDB__SERVICE_WAIT"] = "300"


def parse_args() -> argparse.Namespace:
def argparser() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument(
"--model",
@@ -54,7 +56,7 @@ def parse_args() -> argparse.Namespace:
help="Interval for rounding the weights (the best value may depend on your hardware)",
)
parser.add_argument('--hf-token', type=str, default=os.getenv('HF_TOKEN', None))
parser.add_argument("--batch-size", type=int, default=1, help="Batch size for evaluating with lm eval harness.")
parser.add_argument("--batch-size", type=int, default=64, help="Batch size for evaluating with lm eval harness.")
parser.add_argument(
"--distribute-model",
action="store_true",
@@ -69,14 +71,21 @@
choices=lm_eval_utils.MultiChoice(tasks.ALL_TASKS),
)
parser.add_argument('--num-fewshot', type=int, default=0, help="Number of fewshots for all tasks.")
return parser.parse_args()

args = parser.parse_args()

logging.info(f'Parsed arguments:')
for arg, argv in vars(args).items():
logging.info(f'{arg} = {argv}')

return args


def main() -> None:
logging.info("Running SliceGPT zeroshot tasks experiment.")
logging.info("Running SliceGPT LM eval experiment.")

initialize_tasks()
args = parse_args()
args = argparser()

logging.info(f"PyTorch device: {config.device}")
logging.info(f"Number of available cuda devices: {torch.cuda.device_count()}")
experiments/run_slicegpt_perplexity.py → experiments/run_slicegpt.py
@@ -54,7 +54,7 @@ def argparser() -> argparse.Namespace:
help="Number of samples of the calibration data to load.",
default=128,
)
parser.add_argument("--cal-batch-size", type=int, default=1, help="Batch size for loading the calibration data.")
parser.add_argument("--cal-batch-size", type=int, default=16, help="Batch size for loading the calibration data.")
parser.add_argument(
"--cal-max-seqlen", type=int, default=2048, help="Maximum sequence length for the calibration data."
)
@@ -128,7 +128,7 @@ def argparser() -> argparse.Namespace:


def main() -> None:
logging.info("Running SliceGPT perplexity experiment")
logging.info("Running SliceGPT experiment.")

args = argparser()

22 changes: 13 additions & 9 deletions src/slicegpt/utils.py
@@ -11,6 +11,18 @@
import torch


def create_file_handler(log_dir: str) -> logging.FileHandler:
path = pathlib.Path.cwd() / log_dir / f'{datetime.datetime.now():log_%Y-%m-%d-%H-%M-%S}.log'
path.parent.mkdir(parents=True, exist_ok=True)
file_handler = logging.FileHandler(path, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
'%(asctime)s.%(msecs)04d\t%(levelname)s\t%(name)s\t%(message)s', datefmt='%Y-%m-%dT%H:%M:%S'
)
file_handler.setFormatter(formatter)
return file_handler
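
A minimal standalone usage sketch of this new helper (the `run_lm_eval.py` change above attaches it to lm_eval's logger in just this way):

```python
import logging

from slicegpt import utils

# Attach the file handler to any logger; it writes to
# log/log_<timestamp>.log under the current working directory.
logger = logging.getLogger("my_experiment")
logger.setLevel(logging.DEBUG)
logger.addHandler(utils.create_file_handler(log_dir="log"))
logger.info("hello, logged to file")
```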


def configure_logging(
log_to_console: bool = True,
log_to_file: bool = True,
@@ -27,15 +39,7 @@ def configure_logging(
handlers.append(handler)

if log_to_file:
path = pathlib.Path.cwd() / log_dir / f'{datetime.datetime.now():log_%Y-%m-%d-%H-%M-%S}.log'
path.parent.mkdir(parents=True, exist_ok=True)
file_handler = logging.FileHandler(path, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
'%(asctime)s.%(msecs)04d\t%(levelname)s\t%(name)s\t%(message)s', datefmt='%Y-%m-%dT%H:%M:%S'
)
file_handler.setFormatter(formatter)
handlers.append(file_handler)
handlers.append(create_file_handler(log_dir=log_dir))

logging.basicConfig(
handlers=handlers,
77 changes: 65 additions & 12 deletions tests/test_experiments.py
@@ -59,6 +59,15 @@ def run_shell_command(command: str, cmd_args: list[str]) -> str:
return run_shell_command(python, py_args)


def check_task_acc_in_log(log: str, task: str, expected_acc: float) -> None:
"""Verify that the log contains the expected accuracy for the provided task."""
match = re.search(rf'"{task}": (\d+\.\d+)', log)
assert match, f'Expected to find task {task} in the log'
assert np.isclose(
float(match.group(1)), expected_acc, atol=1e-2, rtol=1e-2
), f'Expected {expected_acc} but got {match.group(1)}'


def check_ppl_in_log(log: str, expected_ppl: float | None, expected_parameters: int | None) -> None:
"""Verify that the log contains the expected perplexity and parameters result."""

@@ -76,19 +85,41 @@ def verify(pattern: str, value: float | int) -> None:
verify(r'Sliced model parameters: ([0-9,]+)', expected_parameters)


def verify_run_slicegpt_perplexity(model: str, sparsity: float, expected_ppl: float, expected_parameters: int) -> None:
"""Test the run_slicegpt_perplexity.py script with the provided parameters."""
def verify_run_lm_eval(
model: str, sparsity: float, task: str, expected_acc_dense: float, expected_acc_sliced: float
) -> None:
"""Test the run_lm_eval.py script with the provided parameters."""
# test lm eval of a dense model
tests_dir = get_test_dir()
script = tests_dir.parent / 'experiments' / 'run_lm_eval.py'
save_dir = tests_dir / 'test_model_data'
args = ['--no-wandb', '--model', str(model)]

ext_args = ['--sparsity', str(sparsity), '--tasks', task]
log = run_python_script(script, args + ext_args)

check_task_acc_in_log(log, task, expected_acc_dense)

# test lm eval of a sliced model
model_path = save_dir / (model.split('/')[-1] + '_' + str(sparsity) + '.pt')
ext_args = ['--sliced-model-path', str(model_path), '--tasks', task]
log = run_python_script(script, args + ext_args)
check_task_acc_in_log(log, task, expected_acc_sliced)


def verify_run_slicegpt(model: str, sparsity: float, expected_ppl: float, expected_parameters: int) -> None:
"""Test the run_slicegpt.py script with the provided parameters."""
# test rotate, slice and save model
tests_dir = get_test_dir()
script = tests_dir.parent / 'experiments' / 'run_slicegpt_perplexity.py'
script = tests_dir.parent / 'experiments' / 'run_slicegpt.py'
save_dir = tests_dir / 'test_model_data'
args = ['--no-wandb', '--model', str(model)]

ext_args = ['--sparsity', str(sparsity), '--save-dir', str(save_dir)]
log = run_python_script(script, args + ext_args)
check_ppl_in_log(log, expected_ppl=expected_ppl, expected_parameters=expected_parameters)

# test load and slice model
# test load a sliced model
model_path = save_dir / (model.split('/')[-1] + '_' + str(sparsity) + '.pt')
ext_args = ['--sliced-model-path', str(model_path)]
log = run_python_script(script, args + ext_args)
@@ -98,26 +129,48 @@ def verify_run_slicegpt_perplexity(model: str, sparsity: float, expected_ppl: fl
@pytest.mark.experiment
@pytest.mark.gpu
def test_opt_125m():
"""Test the run_slicegpt_perplexity.py script with the facebook/opt-125m model."""
"""Test run_slicegpt.py and run_lm_eval.py with the facebook/opt-125m model."""
assert torch.cuda.is_available()

verify_run_slicegpt_perplexity(
model='facebook/opt-125m',
sparsity=0.2,
model = 'facebook/opt-125m'
sparsity = 0.2

verify_run_slicegpt(
model=model,
sparsity=sparsity,
expected_ppl=34.53,
expected_parameters=147_250_880,
)

verify_run_lm_eval(
model=model,
sparsity=sparsity,
task='piqa',
expected_acc_dense=0.6208,
expected_acc_sliced=0.5762,
)


@pytest.mark.experiment
@pytest.mark.gpu
def test_phi_2():
"""Test the run_slicegpt_perplexity.py script with the microsoft/phi-2 model."""
"""Test run_slicegpt.py and run_lm_eval.py with the microsoft/phi-2 model."""
assert torch.cuda.is_available()

verify_run_slicegpt_perplexity(
model='microsoft/phi-2',
sparsity=0.2,
model = 'microsoft/phi-2'
sparsity = 0.2

verify_run_slicegpt(
model=model,
sparsity=sparsity,
expected_ppl=11.2691,
expected_parameters=2_391_772_160,
)

verify_run_lm_eval(
model=model,
sparsity=sparsity,
task='piqa',
expected_acc_dense=0.7911,
expected_acc_sliced=0.7187,
)