diff --git a/experiments/run_slicegpt_perplexity.py b/experiments/run_slicegpt_perplexity.py index 680773d8..8627633a 100755 --- a/experiments/run_slicegpt_perplexity.py +++ b/experiments/run_slicegpt_perplexity.py @@ -58,7 +58,12 @@ def argparser() -> argparse.Namespace: parser.add_argument( "--sparsity", type=float, default=0.0, help="A measure of how much slicing is applied (in the range [0, 1))" ) - parser.add_argument("--round-interval", type=int, default=8, help="Interval for rounding the weights (the best value may depend on your hardware)") + parser.add_argument( + "--round-interval", + type=int, + default=8, + help="Interval for rounding the weights (the best value may depend on your hardware)", + ) parser.add_argument("--eval-baseline", action="store_true", help="Evaluate the baseline model.") parser.add_argument("--eval-fused-model", action="store_true", help="Evaluate the fused model.") parser.add_argument("--ppl-only", action="store_true", help="Evaluate the loaded model without doing compression.") @@ -203,8 +208,10 @@ def reset_model_device() -> None: # compute new embedding dimension given the desired sparsity level new_embedding_dimension = int((1 - args.sparsity) * model_adapter.hidden_size) # round (down) to the nearest multiple of round_interval - new_embedding_dimension = new_embedding_dimension - (new_embedding_dimension % args.round_interval) - logging.info(f"New embedding dimension: {new_embedding_dimension} (sparsity {100*(1 - new_embedding_dimension / model_adapter.hidden_size):.4f} %)") + new_embedding_dimension = new_embedding_dimension - (new_embedding_dimension % args.round_interval) + logging.info( + f"New embedding dimension: {new_embedding_dimension} (sparsity {100*(1 - new_embedding_dimension / model_adapter.hidden_size):.4f} %)" + ) ignore_tokens = [tokenizer.pad_token_id] rotate.rotate_and_slice(model_adapter, train_loader, new_embedding_dimension, ignore_tokens=ignore_tokens) diff --git a/tests/test_opt.py b/tests/test_opt.py 
new file mode 100644
index 00000000..03278f18
--- /dev/null
+++ b/tests/test_opt.py
@@ -0,0 +1,64 @@
+import torch
+
+from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate
+
+
+def test_opt_125m(sparsity: float = 0.2, ppl_upper_limit: float = 36) -> None:
+    """Run SliceGPT end-to-end on OPT 125m and check the resulting perplexity.
+
+    Includes:
+    - loading the model
+    - loading the calibration data
+    - layernorm fusion & module replacement
+    - slicing
+    - ppl evaluation (ppl is asserted to be below `ppl_upper_limit`)
+
+    The intention of this test is that it should represent a complete run of SliceGPT,
+    but on a model small enough to run on CPU in a few minutes.
+
+    Args:
+        sparsity: fraction of the hidden size to slice away.
+        ppl_upper_limit: the test fails unless the measured ppl is strictly below this value.
+    """
+    # get model
+    model_adapter, tokenizer = hf_utils.get_model_and_tokenizer("facebook/opt-125m", dtype=torch.float32)
+
+    # prepare data; both loaders take their sequence length from the model adapter
+    train_dataset, test_dataset = data_utils.get_dataset("wikitext2")
+    train_loader = data_utils.prepare_dataloader(
+        dataset=train_dataset,
+        tokenizer=tokenizer,
+        max_seqlen=model_adapter.seqlen,
+        batch_size=1,
+        nsamples=128,
+        varied_seqlen=False,
+        seed=42,
+    )
+    test_loader = data_utils.prepare_dataloader(
+        dataset=test_dataset,
+        tokenizer=tokenizer,
+        max_seqlen=model_adapter.seqlen,
+        batch_size=1,
+        varied_seqlen=False,
+        seed=42,
+    )
+
+    # replace modules with compressible equivalents
+    layernorm_fusion.replace_layers(model_adapter)
+
+    # fuse layernorms and add rotations to skip connections
+    layernorm_fusion.fuse_modules(model_adapter)
+
+    # compute new embedding dimension given the desired sparsity level
+    # (unlike experiments/run_slicegpt_perplexity.py, no rounding to a multiple of --round-interval is applied)
+    new_embedding_dimension = int((1 - sparsity) * model_adapter.hidden_size)
+
+    # run slicing
+    ignore_tokens = [tokenizer.pad_token_id]
+    rotate.rotate_and_slice(model_adapter, train_loader, new_embedding_dimension, ignore_tokens=ignore_tokens)
+
+    # get ppl
+    dataset_ppl = gpu_utils.evaluate_ppl(model_adapter, test_loader)
+
+    # check that ppl stays below the upper limit
+    assert dataset_ppl < ppl_upper_limit, f"ppl {dataset_ppl} is not below the limit {ppl_upper_limit}"