57 changes: 48 additions & 9 deletions README.md
@@ -11,31 +11,70 @@ of the model. This results in speedups (without any additional code optimization
The code is arranged as a package `slicegpt` in `/src`, and scripts to replicate experiments from the paper are in
`/experiments`. To install the `slicegpt` package, we recommend

`pip install -e .`
```
pip install -e .
```

## Running SliceGPT

To run SliceGPT on `microsoft/phi-2`, from the `experiments` folder, run
```
python run_slicegpt_perplexity.py \
python run_slicegpt.py \
--model microsoft/phi-2 \
--save-dir dir/to/save/sliced_model/in \
--sparsity 0.25 \
--no-wandb \
--device cuda:0 \
--eval-baseline
--eval-baseline \
--no-wandb
```

This will compress the `microsoft/phi-2` model and save the compressed model to the specified directory. Please consult
the script for the full set of options.
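
If you later want to work with the saved checkpoint directly, the sketch below shows one way to locate and load it. The `<model>_<sparsity>.pt` filename pattern comes from this repo's tests; `hf_utils.load_sliced_model` and its signature are an assumption here, so check `hf_utils` for the actual loading API.

```python
import pathlib

from slicegpt import hf_utils

model_name = "microsoft/phi-2"
sparsity = 0.25
save_dir = pathlib.Path("dir/to/save/sliced_model/in")

# The checkpoint name follows the "<model>_<sparsity>.pt" pattern that
# tests/test_experiments.py also relies on, e.g. "phi-2_0.25.pt".
ckpt = save_dir / f"{model_name.split('/')[-1]}_{sparsity}.pt"

# Hypothetical loader call -- consult hf_utils for the real helper name
# and signature before using this.
model_adapter, tokenizer = hf_utils.load_sliced_model(
    model_name, str(ckpt), sparsity=sparsity
)
```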

The experiments folder also contains scripts for
- [finetuning](./experiments/run_finetuning.py) the compressed model to recover most of the quality lost during compression
- [zero-shot task evaluation](./experiments/run_zero_shot_tasks.py) of a dense, compressed or fine-tuned model

_Note:_ For models that require Hugging Face authentication, set the `--hf-token` argument manually or via a key vault. Alternatively, set the environment variable `HF_TOKEN`.

### Recovery fine-tuning

To install additional dependencies required for post-slicing recovery fine-tuning (RFT):

```
pip install -e .[finetune]
```

The following replicates the experiments in the paper (LoRA hyperparams valid for all Llama-2 and Phi-2 models):
> **Contributor:** Have you tried running this with the fixes that I added in my PR #101 yet? I expect better results than in the paper

> **Contributor:** We should do this, but it can come as a separate change, with another update to the readme if needed.

```
python run_finetuning.py \
--model microsoft/phi-2 \
--sliced-model-path path/to/sliced/model.pt \
--save-dir dir/to/save/finetuned_model/in \
--sparsity 0.25 \
--device cuda:0 \
--ppl-eval-dataset alpaca \
--finetune-dataset alpaca \
--finetune-train-nsamples 8000 \
--finetune-train-seqlen 1024 \
--finetune-train-batch-size 3 \
--lora-alpha 10 \
--lora-r 32 \
--lora-dropout 0.05 \
--lora-target-option attn_head_and_mlp \
--eval-steps 16 \
--save-steps 16 \
--no-wandb
```
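
For orientation, the `--lora-*` flags above map onto a PEFT `LoraConfig` roughly like the sketch below. The `target_modules` list is an illustrative guess at what `attn_head_and_mlp` expands to for Phi-2; the authoritative mapping lives in `run_finetuning.py`.

```python
from peft import LoraConfig

# Rough equivalent of the --lora-* flags above. The target_modules list is
# a hypothetical expansion of --lora-target-option attn_head_and_mlp for
# Phi-2; run_finetuning.py resolves the real mapping itself.
lora_config = LoraConfig(
    r=32,
    lora_alpha=10,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
)
```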

Note: the script [`bo_finetuning.py`](./experiments/bo_finetuning.py) can be used to run Bayesian optimization over the RFT hyperparameters.

### Evaluation using the [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness)
```
python run_lm_eval.py \
--model microsoft/phi-2 \
--sliced-model-path path/to/sliced/model.pt \
--tasks piqa \
--no-wandb
```
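
For the dense-model case this is roughly equivalent to calling the harness directly, as sketched below. Treat it as an approximation: the script constructs the model object itself so it can also evaluate sliced checkpoints.

```python
import lm_eval

# Dense-model case only; batch_size and num_fewshot mirror the script's
# defaults. The script builds the (possibly sliced) model itself before
# handing it to the harness.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=microsoft/phi-2",
    tasks=["piqa"],
    num_fewshot=0,
    batch_size=64,
)
print(results["results"]["piqa"])
```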

## Supported models

The following models from the Hugging Face hub are currently supported
@@ -75,7 +114,7 @@ and update `hf_utils.get_model_and_tokenizer` before slicing the new model.
([Phi-2](./src/slicegpt/adapters/phi2_adapter.py)). The `self.*_shortcut_Q` matrices are attached to the modules during
slicing and are available in `forward()`. If the skip connection does not need modification, these matrices will be None,
and the `forward()` method can follow the original workflow. For more details on this,
please read Section 3 [the paper](https://arxiv.org/abs/2401.15024).
please read Section 3 in [the paper](https://arxiv.org/abs/2401.15024).
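
As a minimal sketch of the pattern (assuming module names like `self_attn` and `attn_shortcut_Q`; the real adapters differ in detail):

```python
import torch

# Sketch only: module names (input_layernorm, self_attn, attn_shortcut_Q)
# follow the pattern described above, not the adapters' verbatim code.
def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
    residual = hidden_states
    hidden_states = self.self_attn(self.input_layernorm(hidden_states), **kwargs)[0]
    if self.attn_shortcut_Q is not None:
        # Slicing changed this block's output basis, so rotate the skip
        # connection into the same (reduced) basis before the residual add.
        residual = torch.matmul(residual, self.attn_shortcut_Q)
    return residual + hidden_states
```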

Example: [llama_adapter.py](./src/slicegpt/adapters/llama_adapter.py)

21 changes: 15 additions & 6 deletions experiments/run_zero_shot_tasks.py → experiments/run_lm_eval.py
@@ -18,12 +18,14 @@
from slicegpt import gpu_utils, hf_utils, utils
from slicegpt.config import config

utils.configure_logging()
# Use the logger from lm_eval, adding a file handler to write the log to a file
logging = lm_eval_utils.eval_logger
logging.addHandler(utils.create_file_handler(log_dir="log"))

os.environ["WANDB__SERVICE_WAIT"] = "300"


def parse_args() -> argparse.Namespace:
def argparser() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument(
"--model",
@@ -54,7 +56,7 @@ def parse_args() -> argparse.Namespace:
help="Interval for rounding the weights (the best value may depend on your hardware)",
)
parser.add_argument('--hf-token', type=str, default=os.getenv('HF_TOKEN', None))
parser.add_argument("--batch-size", type=int, default=1, help="Batch size for evaluating with lm eval harness.")
parser.add_argument("--batch-size", type=int, default=64, help="Batch size for evaluating with lm eval harness.")
parser.add_argument(
"--distribute-model",
action="store_true",
@@ -69,14 +71,21 @@
choices=lm_eval_utils.MultiChoice(tasks.ALL_TASKS),
)
parser.add_argument('--num-fewshot', type=int, default=0, help="Number of fewshots for all tasks.")
return parser.parse_args()

args = parser.parse_args()

logging.info(f'Parsed arguments:')
for arg, argv in vars(args).items():
logging.info(f'{arg} = {argv}')

return args


def main() -> None:
logging.info("Running SliceGPT zeroshot tasks experiment.")
logging.info("Running SliceGPT LM eval experiment.")

initialize_tasks()
args = parse_args()
args = argparser()

logging.info(f"PyTorch device: {config.device}")
logging.info(f"Number of available cuda devices: {torch.cuda.device_count()}")
experiments/run_slicegpt_perplexity.py → experiments/run_slicegpt.py
@@ -54,7 +54,7 @@ def argparser() -> argparse.Namespace:
help="Number of samples of the calibration data to load.",
default=128,
)
parser.add_argument("--cal-batch-size", type=int, default=1, help="Batch size for loading the calibration data.")
parser.add_argument("--cal-batch-size", type=int, default=16, help="Batch size for loading the calibration data.")
parser.add_argument(
"--cal-max-seqlen", type=int, default=2048, help="Maximum sequence length for the calibration data."
)
@@ -128,7 +128,7 @@ def argparser() -> argparse.Namespace:


def main() -> None:
logging.info("Running SliceGPT perplexity experiment")
logging.info("Running SliceGPT experiment.")

args = argparser()

22 changes: 13 additions & 9 deletions src/slicegpt/utils.py
@@ -11,6 +11,18 @@
import torch


def create_file_handler(log_dir: str) -> logging.FileHandler:
path = pathlib.Path.cwd() / log_dir / f'{datetime.datetime.now():log_%Y-%m-%d-%H-%M-%S}.log'
path.parent.mkdir(parents=True, exist_ok=True)
file_handler = logging.FileHandler(path, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
'%(asctime)s.%(msecs)04d\t%(levelname)s\t%(name)s\t%(message)s', datefmt='%Y-%m-%dT%H:%M:%S'
)
file_handler.setFormatter(formatter)
return file_handler
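
A minimal standalone usage sketch of this new helper (the `run_lm_eval.py` change above attaches it to lm_eval's logger in just this way):

```python
import logging

from slicegpt import utils

# Attach the file handler to any logger; it writes to
# log/log_<timestamp>.log under the current working directory.
logger = logging.getLogger("my_experiment")
logger.setLevel(logging.DEBUG)
logger.addHandler(utils.create_file_handler(log_dir="log"))
logger.info("hello, logged to file")
```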


def configure_logging(
log_to_console: bool = True,
log_to_file: bool = True,
@@ -27,15 +39,7 @@ def configure_logging(
handlers.append(handler)

if log_to_file:
path = pathlib.Path.cwd() / log_dir / f'{datetime.datetime.now():log_%Y-%m-%d-%H-%M-%S}.log'
path.parent.mkdir(parents=True, exist_ok=True)
file_handler = logging.FileHandler(path, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
'%(asctime)s.%(msecs)04d\t%(levelname)s\t%(name)s\t%(message)s', datefmt='%Y-%m-%dT%H:%M:%S'
)
file_handler.setFormatter(formatter)
handlers.append(file_handler)
handlers.append(create_file_handler(log_dir=log_dir))

logging.basicConfig(
handlers=handlers,
77 changes: 65 additions & 12 deletions tests/test_experiments.py
@@ -59,6 +59,15 @@ def run_shell_command(command: str, cmd_args: list[str]) -> str:
return run_shell_command(python, py_args)


def check_task_acc_in_log(log: str, task: str, expected_acc: float) -> None:
"""Verify that the log contains the expected accuracy for the provided task."""
match = re.search(rf'"{task}": (\d+\.\d+)', log)
assert match, f'Expected to find task {task} in the log'
assert np.isclose(
float(match.group(1)), expected_acc, atol=1e-2, rtol=1e-2
), f'Expected {expected_acc} but got {match.group(1)}'


def check_ppl_in_log(log: str, expected_ppl: float | None, expected_parameters: int | None) -> None:
"""Verify that the log contains the expected perplexity and parameters result."""

@@ -76,19 +85,41 @@ def verify(pattern: str, value: float | int) -> None:
verify(r'Sliced model parameters: ([0-9,]+)', expected_parameters)


def verify_run_slicegpt_perplexity(model: str, sparsity: float, expected_ppl: float, expected_parameters: int) -> None:
"""Test the run_slicegpt_perplexity.py script with the provided parameters."""
def verify_run_lm_eval(
model: str, sparsity: float, task: str, expected_acc_dense: float, expected_acc_sliced: float
) -> None:
"""Test the run_lm_eval.py script with the provided parameters."""
# test lm eval of a dense model
tests_dir = get_test_dir()
script = tests_dir.parent / 'experiments' / 'run_lm_eval.py'
save_dir = tests_dir / 'test_model_data'
args = ['--no-wandb', '--model', str(model)]

ext_args = ['--sparsity', str(sparsity), '--tasks', task]
log = run_python_script(script, args + ext_args)

check_task_acc_in_log(log, task, expected_acc_dense)

# test lm eval of a sliced model
model_path = save_dir / (model.split('/')[-1] + '_' + str(sparsity) + '.pt')
ext_args = ['--sliced-model-path', str(model_path), '--tasks', task]
log = run_python_script(script, args + ext_args)
check_task_acc_in_log(log, task, expected_acc_sliced)


def verify_run_slicegpt(model: str, sparsity: float, expected_ppl: float, expected_parameters: int) -> None:
"""Test the run_slicegpt.py script with the provided parameters."""
# test rotate, slice and save model
tests_dir = get_test_dir()
script = tests_dir.parent / 'experiments' / 'run_slicegpt_perplexity.py'
script = tests_dir.parent / 'experiments' / 'run_slicegpt.py'
save_dir = tests_dir / 'test_model_data'
args = ['--no-wandb', '--model', str(model)]

ext_args = ['--sparsity', str(sparsity), '--save-dir', str(save_dir)]
log = run_python_script(script, args + ext_args)
check_ppl_in_log(log, expected_ppl=expected_ppl, expected_parameters=expected_parameters)

# test load and slice model
# test load a sliced model
model_path = save_dir / (model.split('/')[-1] + '_' + str(sparsity) + '.pt')
ext_args = ['--sliced-model-path', str(model_path)]
log = run_python_script(script, args + ext_args)
@@ -98,26 +129,48 @@ def verify_run_slicegpt_perplexity(model: str, sparsity: float, expected_ppl: fl
@pytest.mark.experiment
@pytest.mark.gpu
def test_opt_125m():
"""Test the run_slicegpt_perplexity.py script with the facebook/opt-125m model."""
"""Test run_slicegpt.py and run_lm_eval.py with the facebook/opt-125m model."""
assert torch.cuda.is_available()

verify_run_slicegpt_perplexity(
model='facebook/opt-125m',
sparsity=0.2,
model = 'facebook/opt-125m'
sparsity = 0.2

verify_run_slicegpt(
model=model,
sparsity=sparsity,
expected_ppl=34.53,
expected_parameters=147_250_880,
)

verify_run_lm_eval(
model=model,
sparsity=sparsity,
task='piqa',
expected_acc_dense=0.6208,
expected_acc_sliced=0.5762,
)


@pytest.mark.experiment
@pytest.mark.gpu
def test_phi_2():
"""Test the run_slicegpt_perplexity.py script with the microsoft/phi-2 model."""
"""Test run_slicegpt.py and run_lm_eval.py with the microsoft/phi-2 model."""
assert torch.cuda.is_available()

verify_run_slicegpt_perplexity(
model='microsoft/phi-2',
sparsity=0.2,
model = 'microsoft/phi-2'
sparsity = 0.2

verify_run_slicegpt(
model=model,
sparsity=sparsity,
expected_ppl=11.2691,
expected_parameters=2_391_772_160,
)

verify_run_lm_eval(
model=model,
sparsity=sparsity,
task='piqa',
expected_acc_dense=0.7911,
expected_acc_sliced=0.7187,
)