From 2956a98f3bba1093040dd073ab71d037cc292c7e Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 21 Dec 2023 11:04:28 +0000 Subject: [PATCH 1/3] adding test --- experiments/run_slicegpt_perplexity.py | 13 ++++-- tests/test_opt.py | 56 ++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 3 deletions(-) create mode 100644 tests/test_opt.py diff --git a/experiments/run_slicegpt_perplexity.py b/experiments/run_slicegpt_perplexity.py index 680773d8..8627633a 100755 --- a/experiments/run_slicegpt_perplexity.py +++ b/experiments/run_slicegpt_perplexity.py @@ -58,7 +58,12 @@ def argparser() -> argparse.Namespace: parser.add_argument( "--sparsity", type=float, default=0.0, help="A measure of how much slicing is applied (in the range [0, 1))" ) - parser.add_argument("--round-interval", type=int, default=8, help="Interval for rounding the weights (the best value may depend on your hardware)") + parser.add_argument( + "--round-interval", + type=int, + default=8, + help="Interval for rounding the weights (the best value may depend on your hardware)", + ) parser.add_argument("--eval-baseline", action="store_true", help="Evaluate the baseline model.") parser.add_argument("--eval-fused-model", action="store_true", help="Evaluate the fused model.") parser.add_argument("--ppl-only", action="store_true", help="Evaluate the loaded model without doing compression.") @@ -203,8 +208,10 @@ def reset_model_device() -> None: # compute new embedding dimension given the desired sparsity level new_embedding_dimension = int((1 - args.sparsity) * model_adapter.hidden_size) # round (down) to the nearest multiple of round_interval - new_embedding_dimension = new_embedding_dimension - (new_embedding_dimension % args.round_interval) - logging.info(f"New embedding dimension: {new_embedding_dimension} (sparsity {100*(1 - new_embedding_dimension / model_adapter.hidden_size):.4f} %)") + new_embedding_dimension = new_embedding_dimension - (new_embedding_dimension % args.round_interval) + logging.info( + f"New embedding dimension: {new_embedding_dimension} (sparsity {100*(1 - new_embedding_dimension / model_adapter.hidden_size):.4f} %)" + ) ignore_tokens = [tokenizer.pad_token_id] rotate.rotate_and_slice(model_adapter, train_loader, new_embedding_dimension, ignore_tokens=ignore_tokens) diff --git a/tests/test_opt.py b/tests/test_opt.py new file mode 100644 index 00000000..b33b5e3a --- /dev/null +++ b/tests/test_opt.py @@ -0,0 +1,56 @@ +from slicegpt import hf_utils, layernorm_fusion, rotate, data_utils, gpu_utils + +def test_opt_125m(sparsity=0.2, ppl_upper_limit=36): + """A complete test of OPT 125m, end-to-end. Includes: + - loading the model + - loading the calibration data + - layernorm fusion & module replacement + - slicing + - ppl evaluation (ppl is verified to be close to published value) + + The intention of this test is that it should represent a complete run of SliceGPT, + but on a model small enough to run on CPU in a few minutes. + """ + # get model + model_adapter, tokenizer = hf_utils.get_model_and_tokenizer("facebook/opt-125m") + model = model_adapter.model + + # prepare data + train_dataset, test_dataset = data_utils.get_dataset("wikitext2") + train_loader = data_utils.prepare_dataloader( + dataset=train_dataset, + tokenizer=tokenizer, + max_seqlen=model.seqlen, + batch_size=1, + nsamples=128, + varied_seqlen=False, + seed=42, + ) + test_loader = data_utils.prepare_dataloader( + dataset=test_dataset, + tokenizer=tokenizer, + max_seqlen=model_adapter.seqlen, + batch_size=1, + varied_seqlen=False, + seed=42, + ) + + # replace modules with compressible equivalents + layernorm_fusion.replace_layers(model_adapter) + + # fuse layernorms and add rotations to skip connections + layernorm_fusion.fuse_modules(model_adapter) + + # compute new embedding dimension given the desired sparsity level + new_embedding_dimension = int((1 - sparsity) * model_adapter.hidden_size) + + # run slicing + ignore_tokens = [tokenizer.pad_token_id] + rotate.rotate_and_slice(model_adapter, train_loader, new_embedding_dimension, ignore_tokens=ignore_tokens) + + # get ppl + dataset_ppl = gpu_utils.evaluate_ppl(model_adapter, test_loader) + + # check that ppl is close to published value + assert dataset_ppl < ppl_upper_limit + \ No newline at end of file From e1bb8ec1c21630ba655c08bd1122fad7583a6669 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 21 Dec 2023 11:40:09 +0000 Subject: [PATCH 2/3] black --- tests/test_opt.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_opt.py b/tests/test_opt.py index b33b5e3a..8c72df54 100644 --- a/tests/test_opt.py +++ b/tests/test_opt.py @@ -1,4 +1,5 @@ -from slicegpt import hf_utils, layernorm_fusion, rotate, data_utils, gpu_utils +from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate + def test_opt_125m(sparsity=0.2, ppl_upper_limit=36): """A complete test of OPT 125m, end-to-end. Includes: @@ -7,14 +8,14 @@ def test_opt_125m(sparsity=0.2, ppl_upper_limit=36): - layernorm fusion & module replacement - slicing - ppl evaluation (ppl is verified to be close to published value) - - The intention of this test is that it should represent a complete run of SliceGPT, + + The intention of this test is that it should represent a complete run of SliceGPT, but on a model small enough to run on CPU in a few minutes. """ # get model model_adapter, tokenizer = hf_utils.get_model_and_tokenizer("facebook/opt-125m") model = model_adapter.model - + # prepare data train_dataset, test_dataset = data_utils.get_dataset("wikitext2") train_loader = data_utils.prepare_dataloader( @@ -34,23 +35,22 @@ def test_opt_125m(sparsity=0.2, ppl_upper_limit=36): varied_seqlen=False, seed=42, ) - + # replace modules with compressible equivalents layernorm_fusion.replace_layers(model_adapter) # fuse layernorms and add rotations to skip connections layernorm_fusion.fuse_modules(model_adapter) - + # compute new embedding dimension given the desired sparsity level new_embedding_dimension = int((1 - sparsity) * model_adapter.hidden_size) - + # run slicing ignore_tokens = [tokenizer.pad_token_id] rotate.rotate_and_slice(model_adapter, train_loader, new_embedding_dimension, ignore_tokens=ignore_tokens) - + # get ppl dataset_ppl = gpu_utils.evaluate_ppl(model_adapter, test_loader) - + # check that ppl is close to published value assert dataset_ppl < ppl_upper_limit - \ No newline at end of file From d2963f803ffe4a5520818f953eb780065aa5051f Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 21 Dec 2023 13:18:57 +0000 Subject: [PATCH 3/3] test in fp32 --- tests/test_opt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_opt.py b/tests/test_opt.py index 8c72df54..03278f18 100644 --- a/tests/test_opt.py +++ b/tests/test_opt.py @@ -1,3 +1,5 @@ +import torch + from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate @@ -13,7 +15,7 @@ def test_opt_125m(sparsity=0.2, ppl_upper_limit=36): but on a model small enough to run on CPU in a few minutes. """ # get model - model_adapter, tokenizer = hf_utils.get_model_and_tokenizer("facebook/opt-125m") + model_adapter, tokenizer = hf_utils.get_model_and_tokenizer("facebook/opt-125m", dtype=torch.float32) model = model_adapter.model # prepare data