Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions experiments/run_slicegpt_perplexity.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@ def argparser() -> argparse.Namespace:
parser.add_argument(
"--sparsity", type=float, default=0.0, help="A measure of how much slicing is applied (in the range [0, 1))"
)
parser.add_argument("--round-interval", type=int, default=8, help="Interval for rounding the weights (the best value may depend on your hardware)")
parser.add_argument(
"--round-interval",
type=int,
default=8,
help="Interval for rounding the weights (the best value may depend on your hardware)",
)
parser.add_argument("--eval-baseline", action="store_true", help="Evaluate the baseline model.")
parser.add_argument("--eval-fused-model", action="store_true", help="Evaluate the fused model.")
parser.add_argument("--ppl-only", action="store_true", help="Evaluate the loaded model without doing compression.")
Expand Down Expand Up @@ -203,8 +208,10 @@ def reset_model_device() -> None:
# compute new embedding dimension given the desired sparsity level
new_embedding_dimension = int((1 - args.sparsity) * model_adapter.hidden_size)
# round (down) to the nearest multiple of round_interval
new_embedding_dimension = new_embedding_dimension - (new_embedding_dimension % args.round_interval)
logging.info(f"New embedding dimension: {new_embedding_dimension} (sparsity {100*(1 - new_embedding_dimension / model_adapter.hidden_size):.4f} %)")
new_embedding_dimension = new_embedding_dimension - (new_embedding_dimension % args.round_interval)
logging.info(
f"New embedding dimension: {new_embedding_dimension} (sparsity {100*(1 - new_embedding_dimension / model_adapter.hidden_size):.4f} %)"
)

ignore_tokens = [tokenizer.pad_token_id]
rotate.rotate_and_slice(model_adapter, train_loader, new_embedding_dimension, ignore_tokens=ignore_tokens)
Expand Down
58 changes: 58 additions & 0 deletions tests/test_opt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import torch

from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate


def test_opt_125m(sparsity=0.2, ppl_upper_limit=36):
    """A complete test of OPT 125m, end-to-end. Includes:
    - loading the model
    - loading the calibration data
    - layernorm fusion & module replacement
    - slicing
    - ppl evaluation (ppl is verified to be close to published value)

    The intention of this test is that it should represent a complete run of SliceGPT,
    but on a model small enough to run on CPU in a few minutes.

    Args:
        sparsity: fraction of the embedding dimension to slice away (in [0, 1)).
        ppl_upper_limit: the test fails if the sliced model's wikitext2 perplexity
            exceeds this value.
    """
    # get model (float32 so the test runs on CPU)
    model_adapter, tokenizer = hf_utils.get_model_and_tokenizer("facebook/opt-125m", dtype=torch.float32)

    # prepare data: 128 calibration samples for slicing, full test split for ppl.
    # NOTE: use model_adapter.seqlen consistently for both loaders (the train
    # loader previously read the same value via model.seqlen).
    train_dataset, test_dataset = data_utils.get_dataset("wikitext2")
    train_loader = data_utils.prepare_dataloader(
        dataset=train_dataset,
        tokenizer=tokenizer,
        max_seqlen=model_adapter.seqlen,
        batch_size=1,
        nsamples=128,
        varied_seqlen=False,
        seed=42,
    )
    test_loader = data_utils.prepare_dataloader(
        dataset=test_dataset,
        tokenizer=tokenizer,
        max_seqlen=model_adapter.seqlen,
        batch_size=1,
        varied_seqlen=False,
        seed=42,
    )

    # replace modules with compressible equivalents
    layernorm_fusion.replace_layers(model_adapter)

    # fuse layernorms and add rotations to skip connections
    layernorm_fusion.fuse_modules(model_adapter)

    # compute new embedding dimension given the desired sparsity level
    new_embedding_dimension = int((1 - sparsity) * model_adapter.hidden_size)

    # run slicing; pad tokens are excluded from the calibration statistics
    ignore_tokens = [tokenizer.pad_token_id]
    rotate.rotate_and_slice(model_adapter, train_loader, new_embedding_dimension, ignore_tokens=ignore_tokens)

    # get ppl
    dataset_ppl = gpu_utils.evaluate_ppl(model_adapter, test_loader)

    # check that ppl is close to published value
    assert dataset_ppl < ppl_upper_limit