From d56138089e74160d409ef1b49a69c094b5e84a8a Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Thu, 11 Apr 2024 13:13:24 +0000 Subject: [PATCH 01/33] Create a custom sliced class for phi2 --- src/slicegpt/adapters/hf_compatible_phi.py | 88 ++++++++++++++++++++++ src/slicegpt/adapters/phi2_adapter.py | 6 +- 2 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 src/slicegpt/adapters/hf_compatible_phi.py diff --git a/src/slicegpt/adapters/hf_compatible_phi.py b/src/slicegpt/adapters/hf_compatible_phi.py new file mode 100644 index 00000000..83ebf509 --- /dev/null +++ b/src/slicegpt/adapters/hf_compatible_phi.py @@ -0,0 +1,88 @@ +from transformers import PreTrainedModel +from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel +from slicegpt.adapters.phi2_adapter import CompressedPhiDecoderLayer +from slicegpt.modules import RMSN +import torch.nn as nn + +class SlicedPhi2Config(PhiConfig): + model_type = "sliced_phi2" + is_composition = True + + def __init__(self, sparsity, num_hidden_layers, hidden_size, new_hidden_dim, **kwargs): + super().__init__(**kwargs) + self.sparsity = sparsity + self.num_hidden_layers = num_hidden_layers + self.hidden_size = hidden_size + self.new_hidden_dim = new_hidden_dim + + def to_dict(self): + output = super().to_dict() + output.update({"sparsity": self.sparsity, "num_hidden_layers": self.num_hidden_layers, "new_hidden_dim": self.new_hidden_dim}) + return output + + @classmethod + def from_dict(cls, config_dict): + return cls(**config_dict) + +class SlicedPhi(PhiModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.layers = nn.ModuleList( + [CompressedPhiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.final_layernorm = RMSN(config.hidden_size) + +class SlicedPhiForCausalLM(PhiForCausalLM): + def __init__(self, config): + super().__init__(config) + self.model = SlicedPhi(config) + + self.layers = 
nn.ModuleList( + [CompressedPhiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.final_layernorm = RMSN(config.hidden_size) + + # TODO: apply slicing here according to the new_hidden_dim (or incorporate it into the CompressedPhiDecoderLayer) + +class SlicedPhi2(PreTrainedModel): + """Wrapper class around SlicedPhiForCausalLM so it can be registered as a HF model""" + config_class = SlicedPhi2Config + base_model_prefix = "sliced_phi2" + + def __init__(self, config): + super().__init__(config) + self.config = config + self.model = SlicedPhi(config) + + def forward(self, input_ids, **kwargs): + return input_ids + + def save_pretrained(self, save_directory): + self.config.save_pretrained(save_directory) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + config = SlicedPhi2.from_pretrained(pretrained_model_name_or_path) + return cls(config, *model_args, **kwargs) + +if __name__ == "__main__": + sparsity = 0.1 + hidden_size = 2560 + num_hidden_layers= 31 + round_interval = 8 + + new_embedding_dim = int((1 - sparsity) * hidden_size) + new_embedding_dim -= new_embedding_dim % round_interval + + sliced_model = SlicedPhi2Config(sparsity=sparsity, num_hidden_layers=num_hidden_layers, hidden_size=hidden_size, new_hidden_dim=new_embedding_dim) + sliced_model.save_pretrained("sliced_phi2") + + # load the config + config = SlicedPhi2Config.from_pretrained("sliced_phi2") + print(config) + + sliced_model = SlicedPhiForCausalLM(config) + print(sliced_model) + + \ No newline at end of file diff --git a/src/slicegpt/adapters/phi2_adapter.py b/src/slicegpt/adapters/phi2_adapter.py index 37cbf6fe..aef89d77 100644 --- a/src/slicegpt/adapters/phi2_adapter.py +++ b/src/slicegpt/adapters/phi2_adapter.py @@ -16,7 +16,7 @@ from transformers.models.phi.modeling_phi import PhiConfig, PhiDecoderLayer, PhiForCausalLM from slicegpt.model_adapter import LayerAdapter, ModelAdapter - +from slicegpt.modules 
import RMSN class CompressedPhiDecoderLayer(PhiDecoderLayer): """ @@ -24,6 +24,10 @@ class CompressedPhiDecoderLayer(PhiDecoderLayer): https://huggingface.co/microsoft/phi-2/blob/main/modeling_phi.py but with the addition of a shortcut_Q attribute. This attribute is used to rotate the residual tensors. """ + + def __init__(self, config: PhiConfig, layer_idx: int): + super().__init__(config, layer_idx) + self.input_layernorm = RMSN(config.hidden_size) def forward( self, From e017d979a1e98b19ca90e3eace1b6f5b146de1c8 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Thu, 11 Apr 2024 13:36:33 +0000 Subject: [PATCH 02/33] Verify model saving and loading --- src/slicegpt/adapters/hf_compatible_phi.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/slicegpt/adapters/hf_compatible_phi.py b/src/slicegpt/adapters/hf_compatible_phi.py index 83ebf509..2543382e 100644 --- a/src/slicegpt/adapters/hf_compatible_phi.py +++ b/src/slicegpt/adapters/hf_compatible_phi.py @@ -78,11 +78,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): sliced_model = SlicedPhi2Config(sparsity=sparsity, num_hidden_layers=num_hidden_layers, hidden_size=hidden_size, new_hidden_dim=new_embedding_dim) sliced_model.save_pretrained("sliced_phi2") - # load the config config = SlicedPhi2Config.from_pretrained("sliced_phi2") print(config) sliced_model = SlicedPhiForCausalLM(config) print(sliced_model) - \ No newline at end of file + sliced_model.save_pretrained("sliced_phi2_model") + sliced_model.from_pretrained("sliced_phi2_model") + + print(sliced_model) \ No newline at end of file From d071a9264aa9b86e47f3e6b8363618ce12dbbaa8 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Thu, 11 Apr 2024 18:33:47 +0000 Subject: [PATCH 03/33] Start adding slicing scheduler --- src/slicegpt/adapters/hf_compatible_phi.py | 64 +++++++++++++--------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/src/slicegpt/adapters/hf_compatible_phi.py 
b/src/slicegpt/adapters/hf_compatible_phi.py index 2543382e..a0de46d3 100644 --- a/src/slicegpt/adapters/hf_compatible_phi.py +++ b/src/slicegpt/adapters/hf_compatible_phi.py @@ -1,5 +1,7 @@ -from transformers import PreTrainedModel +import pathlib from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel +from slicegpt.slicing_scheduler import ConfigSlicingScheduler, SlicingScheduler +from slicegpt.model_adapter import SlicingConfig from slicegpt.adapters.phi2_adapter import CompressedPhiDecoderLayer from slicegpt.modules import RMSN import torch.nn as nn @@ -8,16 +10,15 @@ class SlicedPhi2Config(PhiConfig): model_type = "sliced_phi2" is_composition = True - def __init__(self, sparsity, num_hidden_layers, hidden_size, new_hidden_dim, **kwargs): + def __init__(self, sparsity, hidden_size, new_hidden_dim, **kwargs): super().__init__(**kwargs) self.sparsity = sparsity - self.num_hidden_layers = num_hidden_layers self.hidden_size = hidden_size self.new_hidden_dim = new_hidden_dim def to_dict(self): output = super().to_dict() - output.update({"sparsity": self.sparsity, "num_hidden_layers": self.num_hidden_layers, "new_hidden_dim": self.new_hidden_dim}) + output.update({"sparsity": self.sparsity, "new_hidden_dim": self.new_hidden_dim}) return output @classmethod @@ -34,38 +35,39 @@ def __init__(self, config): self.final_layernorm = RMSN(config.hidden_size) class SlicedPhiForCausalLM(PhiForCausalLM): - def __init__(self, config): + def __init__(self, config, scheduler: SlicingScheduler | None = None): super().__init__(config) self.model = SlicedPhi(config) self.layers = nn.ModuleList( [CompressedPhiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) + self.final_layernorm = RMSN(config.hidden_size) # TODO: apply slicing here according to the new_hidden_dim (or incorporate it into the CompressedPhiDecoderLayer) - -class SlicedPhi2(PreTrainedModel): - """Wrapper class around SlicedPhiForCausalLM so it can be 
registered as a HF model""" - config_class = SlicedPhi2Config - base_model_prefix = "sliced_phi2" - - def __init__(self, config): - super().__init__(config) - self.config = config - self.model = SlicedPhi(config) - - def forward(self, input_ids, **kwargs): - return input_ids - - def save_pretrained(self, save_directory): - self.config.save_pretrained(save_directory) - + if not scheduler: + print("Slicing scheduler is not prodided. No slicing is applied") + else: + self.slice(scheduler) + @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - config = SlicedPhi2.from_pretrained(pretrained_model_name_or_path) - return cls(config, *model_args, **kwargs) + """Overrides the from_pretrained method to accept the scheduler and return the sliced model""" + scheduler = kwargs.pop("slicing_scheduler", None) + model = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + model = cls(model.config, scheduler) + model.load_state_dict(model.state_dict()) + return model + def slice(self, scheduler: SlicingScheduler): + self.slice_embeddings(scheduler.get_embedding_dimensions()) + + def slice_embeddings(self, new_hidden_dim): + for i, W in enumerate([self.model.get_input_embeddings()]): + W.weight.data = W.weight.data[:, : new_hidden_dim[i]].contiguous() + W.embedding_dim = new_hidden_dim[i] + if __name__ == "__main__": sparsity = 0.1 hidden_size = 2560 @@ -75,16 +77,24 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): new_embedding_dim = int((1 - sparsity) * hidden_size) new_embedding_dim -= new_embedding_dim % round_interval - sliced_model = SlicedPhi2Config(sparsity=sparsity, num_hidden_layers=num_hidden_layers, hidden_size=hidden_size, new_hidden_dim=new_embedding_dim) + config_path = pathlib.Path("/home/t-lmikaelyan/new_2/TransformerCompression/sliced_phi_0.1/phi-2_0.1.json") + + slicing_conf = SlicingConfig.from_json_string(config_path.read_text()) + + slicing_scheduler = 
ConfigSlicingScheduler(slicing_conf) + + sliced_model = SlicedPhi2Config(sparsity=sparsity, hidden_size=hidden_size, new_hidden_dim=new_embedding_dim) sliced_model.save_pretrained("sliced_phi2") config = SlicedPhi2Config.from_pretrained("sliced_phi2") print(config) - sliced_model = SlicedPhiForCausalLM(config) + sliced_model = SlicedPhiForCausalLM(config, slicing_scheduler) print(sliced_model) sliced_model.save_pretrained("sliced_phi2_model") - sliced_model.from_pretrained("sliced_phi2_model") + + sliced_model = SlicedPhiForCausalLM(config, slicing_scheduler) + sliced_model.from_pretrained("sliced_phi2_model", slicing_scheduler) print(sliced_model) \ No newline at end of file From c27e3d5763528b9e5fdc3a692b8bcf67e68be453 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Fri, 12 Apr 2024 13:06:12 +0000 Subject: [PATCH 04/33] Correct model architecture and slice --- src/slicegpt/adapters/hf_compatible_phi.py | 57 ++++++++++++++-------- src/slicegpt/rotate.py | 12 ++--- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/src/slicegpt/adapters/hf_compatible_phi.py b/src/slicegpt/adapters/hf_compatible_phi.py index a0de46d3..60233088 100644 --- a/src/slicegpt/adapters/hf_compatible_phi.py +++ b/src/slicegpt/adapters/hf_compatible_phi.py @@ -1,9 +1,12 @@ import pathlib +from slicegpt.rotate import slice_attention_inputs, slice_attention_output, slice_embeddings, slice_head, slice_mlp_input, slice_mlp_output from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel from slicegpt.slicing_scheduler import ConfigSlicingScheduler, SlicingScheduler from slicegpt.model_adapter import SlicingConfig from slicegpt.adapters.phi2_adapter import CompressedPhiDecoderLayer from slicegpt.modules import RMSN +from slicegpt.adapters.phi2_adapter import Phi2ModelAdapter +import torch import torch.nn as nn class SlicedPhi2Config(PhiConfig): @@ -38,46 +41,60 @@ class SlicedPhiForCausalLM(PhiForCausalLM): def __init__(self, config, scheduler: 
SlicingScheduler | None = None): super().__init__(config) self.model = SlicedPhi(config) - - self.layers = nn.ModuleList( - [CompressedPhiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) + self.model_adapter = Phi2ModelAdapter(self) - self.final_layernorm = RMSN(config.hidden_size) - - # TODO: apply slicing here according to the new_hidden_dim (or incorporate it into the CompressedPhiDecoderLayer) - if not scheduler: - print("Slicing scheduler is not prodided. No slicing is applied") - else: + if scheduler: self.slice(scheduler) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - """Overrides the from_pretrained method to accept the scheduler and return the sliced model""" + """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" scheduler = kwargs.pop("slicing_scheduler", None) model = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) model = cls(model.config, scheduler) model.load_state_dict(model.state_dict()) return model - + def slice(self, scheduler: SlicingScheduler): - self.slice_embeddings(scheduler.get_embedding_dimensions()) + slice_embeddings(self.model_adapter, scheduler.get_embedding_dimensions()) + + layers = self.model_adapter.get_layers() + + hidden_size = self.model_adapter.hidden_size + for layer_adapter in layers: + if not self.model_adapter.parallel_blocks: + layer_adapter.layer.mlp_shortcut_Q = torch.nn.Parameter( + torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() + ) + layer_adapter.layer.attn_shortcut_Q = torch.nn.Parameter( + torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() + ) + + for idx, layer_adapter in enumerate(layers): + slice_attention_inputs(layer_adapter, slicing_scheduler.get_attention_input_dimension(idx)) + slice_mlp_input(layer_adapter, slicing_scheduler.get_attention_input_dimension(idx)) + + slice_mlp_output(layer_adapter, 
slicing_scheduler.get_mlp_output_dimension(idx)) + slice_attention_output(layer_adapter, slicing_scheduler.get_mlp_output_dimension(idx)) + + layer_adapter.layer.attn_shortcut_Q = nn.Parameter( + layer_adapter.layer.attn_shortcut_Q[:, : slicing_scheduler.get_mlp_output_dimension(idx)].contiguous() + ) - def slice_embeddings(self, new_hidden_dim): - for i, W in enumerate([self.model.get_input_embeddings()]): - W.weight.data = W.weight.data[:, : new_hidden_dim[i]].contiguous() - W.embedding_dim = new_hidden_dim[i] + if slicing_scheduler.do_slice_head: + slice_head(self.model_adapter, slicing_scheduler.get_head_dimension()) if __name__ == "__main__": sparsity = 0.1 hidden_size = 2560 - num_hidden_layers= 31 + num_hidden_layers= 32 round_interval = 8 + config_path = "" new_embedding_dim = int((1 - sparsity) * hidden_size) new_embedding_dim -= new_embedding_dim % round_interval - config_path = pathlib.Path("/home/t-lmikaelyan/new_2/TransformerCompression/sliced_phi_0.1/phi-2_0.1.json") + config_path = pathlib.Path(config_path) slicing_conf = SlicingConfig.from_json_string(config_path.read_text()) @@ -93,8 +110,6 @@ def slice_embeddings(self, new_hidden_dim): print(sliced_model) sliced_model.save_pretrained("sliced_phi2_model") - - sliced_model = SlicedPhiForCausalLM(config, slicing_scheduler) sliced_model.from_pretrained("sliced_phi2_model", slicing_scheduler) print(sliced_model) \ No newline at end of file diff --git a/src/slicegpt/rotate.py b/src/slicegpt/rotate.py index c5333b3f..147b6823 100644 --- a/src/slicegpt/rotate.py +++ b/src/slicegpt/rotate.py @@ -26,7 +26,7 @@ def rotate_attention_inputs(layer_adapter: LayerAdapter, Q: torch.Tensor) -> Non def slice_attention_inputs(layer_adapter: LayerAdapter, new_embedding_dimension: int) -> None: # Slice the WQ, WK and WV matrices of the self-attention layer. 
for W in layer_adapter.get_attention_inputs(): - W.weight.data = W.weight.data[:, :new_embedding_dimension] + W.weight.data = W.weight.data[:, :new_embedding_dimension].contiguous() W.in_features = new_embedding_dimension layer_adapter.layer.attn_shortcut_Q = nn.Parameter(layer_adapter.layer.attn_shortcut_Q[:new_embedding_dimension, :]) @@ -47,7 +47,7 @@ def rotate_attention_output(layer_adapter: LayerAdapter, Q: torch.Tensor) -> Non def slice_attention_output(layer_adapter: LayerAdapter, new_embedding_dimension: int) -> None: # Slice output matrix of the self-attention layer. W = layer_adapter.get_attention_output() - W.weight.data = W.weight.data[:new_embedding_dimension, :] + W.weight.data = W.weight.data[:new_embedding_dimension, :].contiguous() if W.bias is not None: W.bias.data = W.bias.data[:new_embedding_dimension] W.out_features = new_embedding_dimension @@ -64,7 +64,7 @@ def rotate_mlp_input(layer_adapter: LayerAdapter, Q: torch.Tensor) -> None: def slice_mlp_input(layer_adapter: LayerAdapter, new_embedding_dimension: int) -> None: # Slice the MLP input weights. for W in layer_adapter.get_mlp_inputs(): - W.weight.data = W.weight.data[:, :new_embedding_dimension] + W.weight.data = W.weight.data[:, :new_embedding_dimension].contiguous() W.in_features = new_embedding_dimension @@ -82,7 +82,7 @@ def rotate_mlp_output(layer_adapter: LayerAdapter, Q: torch.Tensor) -> None: def slice_mlp_output(layer_adapter: LayerAdapter, new_embedding_dimension: int) -> None: # Slice the MLP output weights and bias. 
W = layer_adapter.get_mlp_output() - W.weight.data = W.weight.data[:new_embedding_dimension, :] + W.weight.data = W.weight.data[:new_embedding_dimension, :].contiguous() if W.bias is not None: W.bias.data = W.bias.data[:new_embedding_dimension] W.out_features = new_embedding_dimension @@ -102,7 +102,7 @@ def rotate_embeddings(model_adapter: ModelAdapter, Q: torch.Tensor) -> None: def slice_embeddings(model_adapter: ModelAdapter, new_embedding_dimensions: dict[int, int]) -> None: # Slice the embeddings. for i, W in enumerate(model_adapter.get_embeddings()): - W.weight.data = W.weight.data[:, : new_embedding_dimensions[i]] + W.weight.data = W.weight.data[:, : new_embedding_dimensions[i]].contiguous() W.embedding_dim = new_embedding_dimensions[i] @@ -117,7 +117,7 @@ def rotate_head(model_adapter: ModelAdapter, Q: torch.Tensor) -> None: def slice_head(model_adapter: ModelAdapter, new_embedding_dimension: int) -> None: # Slice the head. lm_head = model_adapter.get_lm_head() - lm_head.weight.data = lm_head.weight.data[:, :new_embedding_dimension] + lm_head.weight.data = lm_head.weight.data[:, :new_embedding_dimension].contiguous() lm_head.in_features = new_embedding_dimension From 94a57206290369091019b83dcd4d1d07e1e5d113 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Fri, 12 Apr 2024 15:13:23 +0000 Subject: [PATCH 05/33] Add additional intermediate hidden size to config --- src/slicegpt/adapters/phi2_adapter.py | 9 +++++++-- src/slicegpt/model_adapter.py | 9 +++++++++ src/slicegpt/rotate.py | 10 +++++----- src/slicegpt/slicing_scheduler.py | 3 ++- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/slicegpt/adapters/phi2_adapter.py b/src/slicegpt/adapters/phi2_adapter.py index aef89d77..3b5e4d41 100644 --- a/src/slicegpt/adapters/phi2_adapter.py +++ b/src/slicegpt/adapters/phi2_adapter.py @@ -25,9 +25,10 @@ class CompressedPhiDecoderLayer(PhiDecoderLayer): but with the addition of a shortcut_Q attribute. 
This attribute is used to rotate the residual tensors. """ - def __init__(self, config: PhiConfig, layer_idx: int): + def __init__(self, config: PhiConfig, layer_idx: int, replace_layernorm: bool = False): super().__init__(config, layer_idx) - self.input_layernorm = RMSN(config.hidden_size) + if replace_layernorm: + self.input_layernorm = RMSN(config.hidden_size) def forward( self, @@ -154,6 +155,10 @@ def seqlen(self) -> int: @property def hidden_size(self) -> int: return self.config.hidden_size + + @property + def intermediate_size(self) -> int: + return self.config.intermediate_size @property def should_bake_mean_into_linear(self) -> bool: diff --git a/src/slicegpt/model_adapter.py b/src/slicegpt/model_adapter.py index f9b0f8bc..6e43b5ff 100644 --- a/src/slicegpt/model_adapter.py +++ b/src/slicegpt/model_adapter.py @@ -163,6 +163,14 @@ def hidden_size(self) -> int: The hidden size of the model """ raise NotImplementedError + + @property + @abstractmethod + def intermediate_size(self) -> int: + """ + The intermediate hidden size of MLP + """ + raise NotImplementedError @property @abstractmethod @@ -434,6 +442,7 @@ class SlicingConfig: """Slicing configuration such as individual layer dimensions and whether to slice head.""" hidden_size: int = 0 + intermediate_size: int = 0 layers_num: int = 0 do_slice_head: bool = False parallel_blocks: bool = False diff --git a/src/slicegpt/rotate.py b/src/slicegpt/rotate.py index 147b6823..7d59b444 100644 --- a/src/slicegpt/rotate.py +++ b/src/slicegpt/rotate.py @@ -163,7 +163,7 @@ def rotate_and_slice_sequential( ignore_masks.append(batch["attention_mask"]) layers = model_adapter.get_layers() - slicing_scheduler.setup(hidden_size=model_adapter.hidden_size, layers_num=len(layers), parallel_blocks=False) + slicing_scheduler.setup(hidden_size=model_adapter.hidden_size, intermediate_size=model_adapter.model.intermediate_size, layers_num=len(layers), parallel_blocks=True) # rotate and slice embeddings eig_val, Q = pca_calc(inps, 
ignore_masks) @@ -277,7 +277,7 @@ def rotate_and_slice_parallel( ignore_masks.append(batch["attention_mask"]) layers = model_adapter.get_layers() - slicing_scheduler.setup(hidden_size=model_adapter.hidden_size, layers_num=len(layers), parallel_blocks=True) + slicing_scheduler.setup(hidden_size=model_adapter.hidden_size, intermediate_size=model_adapter.intermediate_size, layers_num=len(layers), parallel_blocks=True) # rotate and slice embeddings _, Q = pca_calc(inps, ignore_masks) @@ -465,17 +465,17 @@ def slice_rotated_model(model_adapter: ModelAdapter, slicing_scheduler: SlicingS if model_adapter.parallel_blocks: # parallel case layer.attn_shortcut_Q = nn.Parameter( - layer.attn_shortcut_Q[:, : slicing_scheduler.get_attention_output_dimension(i, match_head_dim=True)] + layer.attn_shortcut_Q[:, : slicing_scheduler.get_attention_output_dimension(i, match_head_dim=True)].contiguous() ) slice_attention_output( layer_adapter, slicing_scheduler.get_attention_output_dimension(i, match_head_dim=True) ) else: # sequential case layer.attn_shortcut_Q = nn.Parameter( - layer.attn_shortcut_Q[:, : slicing_scheduler.get_attention_output_dimension(i, match_head_dim=False)] + layer.attn_shortcut_Q[:, : slicing_scheduler.get_attention_output_dimension(i, match_head_dim=False)].contiguous() ) layer.mlp_shortcut_Q = nn.Parameter( - layer.mlp_shortcut_Q[:, : slicing_scheduler.get_mlp_output_dimension(i)] + layer.mlp_shortcut_Q[:, : slicing_scheduler.get_mlp_output_dimension(i)].contiguous() ) # slice attention weights 1st dimension diff --git a/src/slicegpt/slicing_scheduler.py b/src/slicegpt/slicing_scheduler.py index e06b78f0..7bf1667e 100644 --- a/src/slicegpt/slicing_scheduler.py +++ b/src/slicegpt/slicing_scheduler.py @@ -40,9 +40,10 @@ def parallel_blocks(self) -> bool: """Return whether working with a parallel blocks models.""" return self.slicing_conf.parallel_blocks - def setup(self, *, hidden_size: int, layers_num: int, parallel_blocks: bool) -> None: + def setup(self, *, 
hidden_size: int, intermediate_size:int, layers_num: int, parallel_blocks: bool) -> None: """Set up the slicing scheduler with the given model parameters.""" self.slicing_conf.hidden_size = hidden_size + self.slicing_conf.intermediate_size = intermediate_size self.slicing_conf.layers_num = layers_num self.slicing_conf.parallel_blocks = parallel_blocks From 2b6b7f94420c5fb1243090f13ca5cd9fd4f7f0e1 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Fri, 12 Apr 2024 15:35:35 +0000 Subject: [PATCH 06/33] Clean up and update intermediate hidden layer dim --- src/slicegpt/adapters/hf_compatible_phi.py | 107 +++++++++++++-------- 1 file changed, 69 insertions(+), 38 deletions(-) diff --git a/src/slicegpt/adapters/hf_compatible_phi.py b/src/slicegpt/adapters/hf_compatible_phi.py index 60233088..643d98bb 100644 --- a/src/slicegpt/adapters/hf_compatible_phi.py +++ b/src/slicegpt/adapters/hf_compatible_phi.py @@ -1,5 +1,7 @@ +import argparse import pathlib -from slicegpt.rotate import slice_attention_inputs, slice_attention_output, slice_embeddings, slice_head, slice_mlp_input, slice_mlp_output +from slicegpt import hf_utils +from slicegpt.rotate import slice_rotated_model from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel from slicegpt.slicing_scheduler import ConfigSlicingScheduler, SlicingScheduler from slicegpt.model_adapter import SlicingConfig @@ -8,20 +10,22 @@ from slicegpt.adapters.phi2_adapter import Phi2ModelAdapter import torch import torch.nn as nn +import os class SlicedPhi2Config(PhiConfig): model_type = "sliced_phi2" is_composition = True - def __init__(self, sparsity, hidden_size, new_hidden_dim, **kwargs): + def __init__(self, sparsity, num_layers, hidden_size, intermediate_size, **kwargs): super().__init__(**kwargs) self.sparsity = sparsity + self.num_layers = num_layers self.hidden_size = hidden_size - self.new_hidden_dim = new_hidden_dim + self.intermediate_size = intermediate_size def to_dict(self): output = 
super().to_dict() - output.update({"sparsity": self.sparsity, "new_hidden_dim": self.new_hidden_dim}) + output.update({"sparsity": self.sparsity}) return output @classmethod @@ -33,7 +37,7 @@ def __init__(self, config): super().__init__(config) self.config = config self.layers = nn.ModuleList( - [CompressedPhiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + [CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True) for layer_idx in range(config.num_layers)] ) self.final_layernorm = RMSN(config.hidden_size) @@ -44,7 +48,7 @@ def __init__(self, config, scheduler: SlicingScheduler | None = None): self.model_adapter = Phi2ModelAdapter(self) if scheduler: - self.slice(scheduler) + self.update_dims(scheduler) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): @@ -55,9 +59,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model.load_state_dict(model.state_dict()) return model - def slice(self, scheduler: SlicingScheduler): - slice_embeddings(self.model_adapter, scheduler.get_embedding_dimensions()) - + def update_dims(self, scheduler: SlicingScheduler) -> None: layers = self.model_adapter.get_layers() hidden_size = self.model_adapter.hidden_size @@ -70,46 +72,75 @@ def slice(self, scheduler: SlicingScheduler): torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() ) - for idx, layer_adapter in enumerate(layers): - slice_attention_inputs(layer_adapter, slicing_scheduler.get_attention_input_dimension(idx)) - slice_mlp_input(layer_adapter, slicing_scheduler.get_attention_input_dimension(idx)) - - slice_mlp_output(layer_adapter, slicing_scheduler.get_mlp_output_dimension(idx)) - slice_attention_output(layer_adapter, slicing_scheduler.get_mlp_output_dimension(idx)) - - layer_adapter.layer.attn_shortcut_Q = nn.Parameter( - layer_adapter.layer.attn_shortcut_Q[:, : slicing_scheduler.get_mlp_output_dimension(idx)].contiguous() - ) - - if 
slicing_scheduler.do_slice_head: - slice_head(self.model_adapter, slicing_scheduler.get_head_dimension()) + slice_rotated_model(self.model_adapter, scheduler) + +def arg_parser() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", + type=str, + default="microsoft/phi-2", + help="Model to load", + ) + parser.add_argument( + "--save-model-path", + type=str, + default=None, + help="Path to save the final model to", + ) + parser.add_argument( + "--sparsity", type=float, default=0.1, help="A measure of how much slicing is applied (in the range [0, 1))" + ) + parser.add_argument( + "--round-interval", + type=int, + default=8, + help="Interval for rounding the weights (the best value may depend on your hardware)", + ) + parser.add_argument( + "--sliced-model-path", + type=str, + help="Path to load the sliced model to copy the weights from", + default="", + ) + parser.add_argument( + "--sliced-model-config-path", + type=str, + help="Path to load the config of the sliced model from", + default="", + ) + parser.add_argument('--hf-token', type=str, default=os.getenv('HF_TOKEN', None)) + + return parser.parse_args() if __name__ == "__main__": - sparsity = 0.1 - hidden_size = 2560 - num_hidden_layers= 32 - round_interval = 8 - config_path = "" - new_embedding_dim = int((1 - sparsity) * hidden_size) - new_embedding_dim -= new_embedding_dim % round_interval - - config_path = pathlib.Path(config_path) + args = arg_parser() + + config_path = pathlib.Path(args.sliced_model_config_path) slicing_conf = SlicingConfig.from_json_string(config_path.read_text()) slicing_scheduler = ConfigSlicingScheduler(slicing_conf) - sliced_model = SlicedPhi2Config(sparsity=sparsity, hidden_size=hidden_size, new_hidden_dim=new_embedding_dim) - sliced_model.save_pretrained("sliced_phi2") + sliced_model_conf = SlicedPhi2Config(sparsity=args.sparsity, hidden_size=slicing_conf.hidden_size, intermediate_size=slicing_conf.intermediate_size, 
num_layers=slicing_conf.layers_num) + sliced_model_conf.save_pretrained("sliced_phi2") config = SlicedPhi2Config.from_pretrained("sliced_phi2") - print(config) - + sliced_model = SlicedPhiForCausalLM(config, slicing_scheduler) - print(sliced_model) - + sliced_model.save_pretrained("sliced_phi2_model") sliced_model.from_pretrained("sliced_phi2_model", slicing_scheduler) - print(sliced_model) \ No newline at end of file + # load the saved sliced model + model_adapter, tokenizer = hf_utils.load_sliced_model( + args.model, + args.sliced_model_path, + sparsity=args.sparsity, + round_interval=args.round_interval, + token=args.hf_token, + ) + + sliced_model.load_state_dict(model_adapter.model.state_dict()) + print("Model loaded successfully!") \ No newline at end of file From a9a3488021e4a13efca0dd3d365907c0baffc077 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Tue, 16 Apr 2024 22:07:32 +0000 Subject: [PATCH 07/33] Small fixes and perplexity computation --- src/slicegpt/adapters/hf_compatible_phi.py | 29 ++++++++++++++++------ src/slicegpt/hf_utils.py | 3 ++- src/slicegpt/rotate.py | 1 + 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/slicegpt/adapters/hf_compatible_phi.py b/src/slicegpt/adapters/hf_compatible_phi.py index 643d98bb..55f8ba20 100644 --- a/src/slicegpt/adapters/hf_compatible_phi.py +++ b/src/slicegpt/adapters/hf_compatible_phi.py @@ -1,6 +1,6 @@ import argparse import pathlib -from slicegpt import hf_utils +from slicegpt import data_utils, gpu_utils, hf_utils from slicegpt.rotate import slice_rotated_model from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel from slicegpt.slicing_scheduler import ConfigSlicingScheduler, SlicingScheduler @@ -37,7 +37,7 @@ def __init__(self, config): super().__init__(config) self.config = config self.layers = nn.ModuleList( - [CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True) for layer_idx in range(config.num_layers)] + 
[CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True).to(config.torch_dtype) for layer_idx in range(config.num_layers)] ) self.final_layernorm = RMSN(config.hidden_size) @@ -51,9 +51,9 @@ def __init__(self, config, scheduler: SlicingScheduler | None = None): self.update_dims(scheduler) @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path, scheduler, *model_args, **kwargs): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" - scheduler = kwargs.pop("slicing_scheduler", None) + #scheduler = kwargs.pop("slicing_scheduler", None) model = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) model = cls(model.config, scheduler) model.load_state_dict(model.state_dict()) @@ -128,10 +128,11 @@ def arg_parser() -> argparse.Namespace: config = SlicedPhi2Config.from_pretrained("sliced_phi2") + config.torch_dtype = torch.float16 sliced_model = SlicedPhiForCausalLM(config, slicing_scheduler) - sliced_model.save_pretrained("sliced_phi2_model") - sliced_model.from_pretrained("sliced_phi2_model", slicing_scheduler) + #sliced_model.save_pretrained("sliced_phi2_model") + #sliced_model.from_pretrained("sliced_phi2_model", slicing_scheduler, config) # load the saved sliced model model_adapter, tokenizer = hf_utils.load_sliced_model( @@ -143,4 +144,18 @@ def arg_parser() -> argparse.Namespace: ) sliced_model.load_state_dict(model_adapter.model.state_dict()) - print("Model loaded successfully!") \ No newline at end of file + print("Model loaded successfully!") + + dataset = data_utils.get_dataset("wikitext2") + train_dataset, test_dataset = dataset["train"], dataset["test"] + + test_loader = data_utils.prepare_test_dataloader( + dataset=test_dataset, tokenizer=tokenizer, batch_size=8 + ) + + sliced_model.to("cuda") + + dataset_ppl = gpu_utils.evaluate_ppl(sliced_model, tokenizer.pad_token_id, test_loader) 
+ print(f'Loaded model perplexity: {dataset_ppl}') + + sliced_model.save_pretrained(args.save_model_path) \ No newline at end of file diff --git a/src/slicegpt/hf_utils.py b/src/slicegpt/hf_utils.py index 7a9167b5..da7753ae 100755 --- a/src/slicegpt/hf_utils.py +++ b/src/slicegpt/hf_utils.py @@ -116,7 +116,7 @@ def load_sliced_model( sliced_model_path: str, *, token: str | None = None, - lora_config: LoraConfig = None, + lora_config: LoraConfig | None = None, sparsity: float | None = None, round_interval: int | None = 1, ) -> tuple[ModelAdapter, PreTrainedTokenizerBase]: @@ -135,6 +135,7 @@ def load_sliced_model( uninitialized=True, token=token, ) + replace_layers(model_adapter) fuse_modules(model_adapter) diff --git a/src/slicegpt/rotate.py b/src/slicegpt/rotate.py index 7d59b444..fbc5d3ab 100644 --- a/src/slicegpt/rotate.py +++ b/src/slicegpt/rotate.py @@ -437,6 +437,7 @@ def slice_rotated_model(model_adapter: ModelAdapter, slicing_scheduler: SlicingS slicing_scheduler = ConstSlicingScheduler(model_adapter.slicing_conf.const_dimension) slicing_scheduler.setup( hidden_size=model_adapter.hidden_size, + intermediate_size=model_adapter.intermediate_size, layers_num=len(layers), parallel_blocks=model_adapter.parallel_blocks, ) From 7f4adda404e9e44001e04d521e69a82610665419 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Wed, 17 Apr 2024 12:56:30 +0000 Subject: [PATCH 08/33] Fix from_pretrained function --- src/slicegpt/adapters/hf_compatible_phi.py | 37 ++++++++++++++-------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/slicegpt/adapters/hf_compatible_phi.py b/src/slicegpt/adapters/hf_compatible_phi.py index 55f8ba20..c863fcfd 100644 --- a/src/slicegpt/adapters/hf_compatible_phi.py +++ b/src/slicegpt/adapters/hf_compatible_phi.py @@ -37,12 +37,12 @@ def __init__(self, config): super().__init__(config) self.config = config self.layers = nn.ModuleList( - [CompressedPhiDecoderLayer(config, layer_idx, 
replace_layernorm=True).to(config.torch_dtype) for layer_idx in range(config.num_layers)] + [CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True)for layer_idx in range(config.num_layers)] ) self.final_layernorm = RMSN(config.hidden_size) class SlicedPhiForCausalLM(PhiForCausalLM): - def __init__(self, config, scheduler: SlicingScheduler | None = None): + def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_args, **kwargs): super().__init__(config) self.model = SlicedPhi(config) self.model_adapter = Phi2ModelAdapter(self) @@ -51,11 +51,11 @@ def __init__(self, config, scheduler: SlicingScheduler | None = None): self.update_dims(scheduler) @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, scheduler, *model_args, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path, scheduler: SlicingScheduler, config_path, *model_args, **kwargs): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" - #scheduler = kwargs.pop("slicing_scheduler", None) - model = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - model = cls(model.config, scheduler) + config = SlicedPhi2Config.from_pretrained(config_path) + #model = cls(config, scheduler) + model = super().from_pretrained(pretrained_model_name_or_path, scheduler, config) model.load_state_dict(model.state_dict()) return model @@ -130,9 +130,6 @@ def arg_parser() -> argparse.Namespace: config.torch_dtype = torch.float16 sliced_model = SlicedPhiForCausalLM(config, slicing_scheduler) - - #sliced_model.save_pretrained("sliced_phi2_model") - #sliced_model.from_pretrained("sliced_phi2_model", slicing_scheduler, config) # load the saved sliced model model_adapter, tokenizer = hf_utils.load_sliced_model( @@ -142,9 +139,6 @@ def arg_parser() -> argparse.Namespace: round_interval=args.round_interval, token=args.hf_token, ) - - sliced_model.load_state_dict(model_adapter.model.state_dict()) - 
print("Model loaded successfully!") dataset = data_utils.get_dataset("wikitext2") train_dataset, test_dataset = dataset["train"], dataset["test"] @@ -153,9 +147,24 @@ def arg_parser() -> argparse.Namespace: dataset=test_dataset, tokenizer=tokenizer, batch_size=8 ) + # evaluate original perplexity + dataset_ppl = gpu_utils.evaluate_ppl(model_adapter.model.to("cuda"), tokenizer.pad_token_id, test_loader) + print(f'Loaded sliced model perplexity: {dataset_ppl}') + + + sliced_model = sliced_model.to(torch.float16) + sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) + print("Model loaded successfully!") + + sliced_model.to("cuda") dataset_ppl = gpu_utils.evaluate_ppl(sliced_model, tokenizer.pad_token_id, test_loader) - print(f'Loaded model perplexity: {dataset_ppl}') + print(f'Loaded new sliced model perplexity: {dataset_ppl}') + + sliced_model.save_pretrained("new_sliced_phi2_model") + sliced_model_new = sliced_model.from_pretrained("new_sliced_phi2_model", slicing_scheduler, "sliced_phi2") + sliced_model_new = sliced_model_new.to(torch.float16) - sliced_model.save_pretrained(args.save_model_path) \ No newline at end of file + dataset_ppl = gpu_utils.evaluate_ppl(sliced_model_new.to("cuda"), tokenizer.pad_token_id, test_loader) + print(f'Loaded new sliced model perplexity: {dataset_ppl}') \ No newline at end of file From dd58719d740969f3190a39b97f18ed18da226031 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Fri, 19 Apr 2024 15:29:15 +0000 Subject: [PATCH 09/33] Add tests and fix config --- src/slicegpt/adapters/hf_compatible_phi.py | 51 ++++++++++++---------- tests/test_slicing.py | 43 +++++++++++++++++- 2 files changed, 71 insertions(+), 23 deletions(-) diff --git a/src/slicegpt/adapters/hf_compatible_phi.py b/src/slicegpt/adapters/hf_compatible_phi.py index c863fcfd..f4bcba56 100644 --- a/src/slicegpt/adapters/hf_compatible_phi.py +++ b/src/slicegpt/adapters/hf_compatible_phi.py @@ -37,24 +37,23 @@ def __init__(self, 
config): super().__init__(config) self.config = config self.layers = nn.ModuleList( - [CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True)for layer_idx in range(config.num_layers)] + [CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True)for layer_idx in range(config.num_hidden_layers)] ) self.final_layernorm = RMSN(config.hidden_size) - + class SlicedPhiForCausalLM(PhiForCausalLM): def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_args, **kwargs): super().__init__(config) self.model = SlicedPhi(config) self.model_adapter = Phi2ModelAdapter(self) - + if scheduler: self.update_dims(scheduler) @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, scheduler: SlicingScheduler, config_path, *model_args, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" config = SlicedPhi2Config.from_pretrained(config_path) - #model = cls(config, scheduler) model = super().from_pretrained(pretrained_model_name_or_path, scheduler, config) model.load_state_dict(model.state_dict()) return model @@ -85,7 +84,7 @@ def arg_parser() -> argparse.Namespace: parser.add_argument( "--save-model-path", type=str, - default=None, + default="sliced_HF_model", help="Path to save the final model to", ) parser.add_argument( @@ -109,10 +108,24 @@ def arg_parser() -> argparse.Namespace: help="Path to load the config of the sliced model from", default="", ) + parser.add_argument( + "--cal-dataset", + type=str, + help="Dataset that sliced model was calibrated on. 
Also used to compute perplexity", + choices=["wikitext2", "ptb", "c4", "alpaca"], + default="wikitext2", + ) + parser.add_argument("--device", type=str, default="cuda") parser.add_argument('--hf-token', type=str, default=os.getenv('HF_TOKEN', None)) return parser.parse_args() +def compare_weights(model1, model2): + for p1, p2 in zip(model1.parameters(), model2.parameters()): + if not torch.equal(p1.data, p2.data): + return False + return True + if __name__ == "__main__": args = arg_parser() @@ -126,10 +139,12 @@ def arg_parser() -> argparse.Namespace: sliced_model_conf = SlicedPhi2Config(sparsity=args.sparsity, hidden_size=slicing_conf.hidden_size, intermediate_size=slicing_conf.intermediate_size, num_layers=slicing_conf.layers_num) sliced_model_conf.save_pretrained("sliced_phi2") - config = SlicedPhi2Config.from_pretrained("sliced_phi2") + # sliced config should be the same as the original model's config + orig_config = PhiConfig.from_pretrained( + "microsoft/phi-2", torch_dtype=torch.float16, + ) - config.torch_dtype = torch.float16 - sliced_model = SlicedPhiForCausalLM(config, slicing_scheduler) + sliced_model = SlicedPhiForCausalLM(orig_config, slicing_scheduler, slicing_conf) # load the saved sliced model model_adapter, tokenizer = hf_utils.load_sliced_model( @@ -144,27 +159,19 @@ def arg_parser() -> argparse.Namespace: train_dataset, test_dataset = dataset["train"], dataset["test"] test_loader = data_utils.prepare_test_dataloader( - dataset=test_dataset, tokenizer=tokenizer, batch_size=8 + dataset=test_dataset, tokenizer=tokenizer, batch_size=1 ) - # evaluate original perplexity - dataset_ppl = gpu_utils.evaluate_ppl(model_adapter.model.to("cuda"), tokenizer.pad_token_id, test_loader) + dataset_ppl = gpu_utils.evaluate_ppl(model_adapter.model.to(args.device), tokenizer.pad_token_id, test_loader) print(f'Loaded sliced model perplexity: {dataset_ppl}') - sliced_model = sliced_model.to(torch.float16) sliced_model.load_state_dict(model_adapter.model.state_dict(), 
strict=True, assign=True) print("Model loaded successfully!") - - sliced_model.to("cuda") - - dataset_ppl = gpu_utils.evaluate_ppl(sliced_model, tokenizer.pad_token_id, test_loader) - print(f'Loaded new sliced model perplexity: {dataset_ppl}') - - sliced_model.save_pretrained("new_sliced_phi2_model") - sliced_model_new = sliced_model.from_pretrained("new_sliced_phi2_model", slicing_scheduler, "sliced_phi2") + sliced_model.save_pretrained(args.save_model_path) + sliced_model_new = sliced_model.from_pretrained(args.save_model_path, slicing_scheduler, "sliced_phi2") sliced_model_new = sliced_model_new.to(torch.float16) - dataset_ppl = gpu_utils.evaluate_ppl(sliced_model_new.to("cuda"), tokenizer.pad_token_id, test_loader) + dataset_ppl = gpu_utils.evaluate_ppl(sliced_model_new.to(args.device), tokenizer.pad_token_id, test_loader) print(f'Loaded new sliced model perplexity: {dataset_ppl}') \ No newline at end of file diff --git a/tests/test_slicing.py b/tests/test_slicing.py index 79830c27..ddbf5767 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -1,8 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-from slicegpt import hf_utils, layernorm_fusion +from slicegpt.slicing_scheduler import ConstSlicingScheduler +from transformers.models.phi.modeling_phi import PhiConfig +from slicegpt import hf_utils, layernorm_fusion, rotate from slicegpt.adapters.opt_adapter import OPTModelAdapter +from slicegpt.adapters.hf_compatible_phi import SlicedPhiForCausalLM +import torch def test_layernorm_fusion_replaces_modules() -> None: @@ -21,3 +25,40 @@ def test_layernorm_fusion_replaces_modules() -> None: def get_module_names(model) -> list[str]: return [name for name, _ in model.named_parameters()] + + +def test_HF_model(): + """Check that the HF model weights are equivalent to the sliced model weights after layernorm fusion""" + model_name = "microsoft/phi-2" + model_adapter, _ = hf_utils.get_model_and_tokenizer(model_name) + + layernorm_fusion.replace_layers(model_adapter) + layernorm_fusion.fuse_modules(model_adapter) + + config = PhiConfig.from_pretrained( + "microsoft/phi-2", torch_dtype=torch.float16, + ) + + sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) + sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) + + assert compare_weights(model_adapter.model, sliced_model.model) + +def test_save_and_load_HF_model(): + """Check that the HF model weights are equivalent to the sliced model weights after layernorm fusion""" + config = PhiConfig.from_pretrained( + "microsoft/phi-2", torch_dtype=torch.float16, + ) + + sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) + sliced_model.save_pretrained("sliced_model") + sliced_model = SlicedPhiForCausalLM.from_pretrained("sliced_model", None, "microsoft/phi-2") + +def compare_weights(model1, model2): + for p1, p2 in zip(model1.parameters(), model2.parameters()): + if not torch.equal(p1.data, p2.data): + return False + return True + +if __name__ == "__main__": + test_HF_model() \ No newline at end of file From 4cb10a789a4eb8a4cf40ba3179a002a2ae1dc826 Mon Sep 17 00:00:00 2001 
From: Liana Mikaelyan Date: Fri, 19 Apr 2024 15:29:42 +0000 Subject: [PATCH 10/33] Style --- src/slicegpt/adapters/hf_compatible_phi.py | 90 +++++++++++++--------- tests/test_slicing.py | 26 ++++--- 2 files changed, 69 insertions(+), 47 deletions(-) diff --git a/src/slicegpt/adapters/hf_compatible_phi.py b/src/slicegpt/adapters/hf_compatible_phi.py index f4bcba56..ee600035 100644 --- a/src/slicegpt/adapters/hf_compatible_phi.py +++ b/src/slicegpt/adapters/hf_compatible_phi.py @@ -1,16 +1,18 @@ import argparse +import os import pathlib -from slicegpt import data_utils, gpu_utils, hf_utils -from slicegpt.rotate import slice_rotated_model + +import torch +import torch.nn as nn from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel -from slicegpt.slicing_scheduler import ConfigSlicingScheduler, SlicingScheduler + +from slicegpt import data_utils, gpu_utils, hf_utils +from slicegpt.adapters.phi2_adapter import CompressedPhiDecoderLayer, Phi2ModelAdapter from slicegpt.model_adapter import SlicingConfig -from slicegpt.adapters.phi2_adapter import CompressedPhiDecoderLayer from slicegpt.modules import RMSN -from slicegpt.adapters.phi2_adapter import Phi2ModelAdapter -import torch -import torch.nn as nn -import os +from slicegpt.rotate import slice_rotated_model +from slicegpt.slicing_scheduler import ConfigSlicingScheduler, SlicingScheduler + class SlicedPhi2Config(PhiConfig): model_type = "sliced_phi2" @@ -22,7 +24,7 @@ def __init__(self, sparsity, num_layers, hidden_size, intermediate_size, **kwarg self.num_layers = num_layers self.hidden_size = hidden_size self.intermediate_size = intermediate_size - + def to_dict(self): output = super().to_dict() output.update({"sparsity": self.sparsity}) @@ -32,15 +34,20 @@ def to_dict(self): def from_dict(cls, config_dict): return cls(**config_dict) + class SlicedPhi(PhiModel): def __init__(self, config): super().__init__(config) self.config = config self.layers = nn.ModuleList( - 
[CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True)for layer_idx in range(config.num_hidden_layers)] + [ + CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True) + for layer_idx in range(config.num_hidden_layers) + ] ) self.final_layernorm = RMSN(config.hidden_size) + class SlicedPhiForCausalLM(PhiForCausalLM): def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_args, **kwargs): super().__init__(config) @@ -49,9 +56,11 @@ def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_arg if scheduler: self.update_dims(scheduler) - + @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs): + def from_pretrained( + cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs + ): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" config = SlicedPhi2Config.from_pretrained(config_path) model = super().from_pretrained(pretrained_model_name_or_path, scheduler, config) @@ -60,7 +69,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, scheduler: SlicingSchedu def update_dims(self, scheduler: SlicingScheduler) -> None: layers = self.model_adapter.get_layers() - + hidden_size = self.model_adapter.hidden_size for layer_adapter in layers: if not self.model_adapter.parallel_blocks: @@ -70,9 +79,10 @@ def update_dims(self, scheduler: SlicingScheduler) -> None: layer_adapter.layer.attn_shortcut_Q = torch.nn.Parameter( torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() ) - + slice_rotated_model(self.model_adapter, scheduler) + def arg_parser() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument( @@ -120,51 +130,57 @@ def arg_parser() -> argparse.Namespace: return parser.parse_args() + def compare_weights(model1, model2): for p1, p2 in zip(model1.parameters(), 
model2.parameters()): if not torch.equal(p1.data, p2.data): return False return True + if __name__ == "__main__": - + args = arg_parser() - + config_path = pathlib.Path(args.sliced_model_config_path) slicing_conf = SlicingConfig.from_json_string(config_path.read_text()) - + slicing_scheduler = ConfigSlicingScheduler(slicing_conf) - sliced_model_conf = SlicedPhi2Config(sparsity=args.sparsity, hidden_size=slicing_conf.hidden_size, intermediate_size=slicing_conf.intermediate_size, num_layers=slicing_conf.layers_num) + sliced_model_conf = SlicedPhi2Config( + sparsity=args.sparsity, + hidden_size=slicing_conf.hidden_size, + intermediate_size=slicing_conf.intermediate_size, + num_layers=slicing_conf.layers_num, + ) sliced_model_conf.save_pretrained("sliced_phi2") - + # sliced config should be the same as the original model's config orig_config = PhiConfig.from_pretrained( - "microsoft/phi-2", torch_dtype=torch.float16, - ) + "microsoft/phi-2", + torch_dtype=torch.float16, + ) sliced_model = SlicedPhiForCausalLM(orig_config, slicing_scheduler, slicing_conf) - + # load the saved sliced model model_adapter, tokenizer = hf_utils.load_sliced_model( - args.model, - args.sliced_model_path, - sparsity=args.sparsity, - round_interval=args.round_interval, - token=args.hf_token, - ) - + args.model, + args.sliced_model_path, + sparsity=args.sparsity, + round_interval=args.round_interval, + token=args.hf_token, + ) + dataset = data_utils.get_dataset("wikitext2") train_dataset, test_dataset = dataset["train"], dataset["test"] - - test_loader = data_utils.prepare_test_dataloader( - dataset=test_dataset, tokenizer=tokenizer, batch_size=1 - ) - + + test_loader = data_utils.prepare_test_dataloader(dataset=test_dataset, tokenizer=tokenizer, batch_size=1) + dataset_ppl = gpu_utils.evaluate_ppl(model_adapter.model.to(args.device), tokenizer.pad_token_id, test_loader) print(f'Loaded sliced model perplexity: {dataset_ppl}') - + sliced_model = sliced_model.to(torch.float16) 
sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) print("Model loaded successfully!") @@ -172,6 +188,6 @@ def compare_weights(model1, model2): sliced_model.save_pretrained(args.save_model_path) sliced_model_new = sliced_model.from_pretrained(args.save_model_path, slicing_scheduler, "sliced_phi2") sliced_model_new = sliced_model_new.to(torch.float16) - + dataset_ppl = gpu_utils.evaluate_ppl(sliced_model_new.to(args.device), tokenizer.pad_token_id, test_loader) - print(f'Loaded new sliced model perplexity: {dataset_ppl}') \ No newline at end of file + print(f'Loaded new sliced model perplexity: {dataset_ppl}') diff --git a/tests/test_slicing.py b/tests/test_slicing.py index ddbf5767..a36fa2a7 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -1,12 +1,13 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from slicegpt.slicing_scheduler import ConstSlicingScheduler +import torch from transformers.models.phi.modeling_phi import PhiConfig + from slicegpt import hf_utils, layernorm_fusion, rotate -from slicegpt.adapters.opt_adapter import OPTModelAdapter from slicegpt.adapters.hf_compatible_phi import SlicedPhiForCausalLM -import torch +from slicegpt.adapters.opt_adapter import OPTModelAdapter +from slicegpt.slicing_scheduler import ConstSlicingScheduler def test_layernorm_fusion_replaces_modules() -> None: @@ -34,31 +35,36 @@ def test_HF_model(): layernorm_fusion.replace_layers(model_adapter) layernorm_fusion.fuse_modules(model_adapter) - + config = PhiConfig.from_pretrained( - "microsoft/phi-2", torch_dtype=torch.float16, - ) + "microsoft/phi-2", + torch_dtype=torch.float16, + ) sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) - + assert compare_weights(model_adapter.model, sliced_model.model) + def test_save_and_load_HF_model(): """Check that the HF model weights are equivalent to the 
sliced model weights after layernorm fusion""" config = PhiConfig.from_pretrained( - "microsoft/phi-2", torch_dtype=torch.float16, - ) + "microsoft/phi-2", + torch_dtype=torch.float16, + ) sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) sliced_model.save_pretrained("sliced_model") sliced_model = SlicedPhiForCausalLM.from_pretrained("sliced_model", None, "microsoft/phi-2") + def compare_weights(model1, model2): for p1, p2 in zip(model1.parameters(), model2.parameters()): if not torch.equal(p1.data, p2.data): return False return True + if __name__ == "__main__": - test_HF_model() \ No newline at end of file + test_HF_model() From 85b3782e1ca067e5314dd6597dec328feadd3d1d Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Sun, 21 Apr 2024 12:27:01 +0000 Subject: [PATCH 11/33] Add Llama class and clean up --- src/slicegpt/adapters/llama_adapter.py | 10 ++++++++++ src/slicegpt/adapters/phi2_adapter.py | 5 +++-- src/slicegpt/rotate.py | 22 ++++++++++++++++++---- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/slicegpt/adapters/llama_adapter.py b/src/slicegpt/adapters/llama_adapter.py index 78e60a2b..5cf232b1 100644 --- a/src/slicegpt/adapters/llama_adapter.py +++ b/src/slicegpt/adapters/llama_adapter.py @@ -14,6 +14,7 @@ from transformers.models.llama.modeling_llama import LlamaConfig, LlamaDecoderLayer, LlamaForCausalLM, LlamaRMSNorm from slicegpt.model_adapter import LayerAdapter, ModelAdapter +from slicegpt.modules import RMSN class CompressedLlamaDecoderLayer(LlamaDecoderLayer): @@ -23,6 +24,11 @@ class CompressedLlamaDecoderLayer(LlamaDecoderLayer): but with the addition of a shortcut_Q attribute. This attribute is used to rotate the residual tensors. 
""" + def __init__(self, config: LlamaConfig, layer_idx: int, replace_layernorm: bool = False): + super().__init__(config, layer_idx) + if replace_layernorm: + self.input_layernorm = RMSN(config.hidden_size) + def forward( self, hidden_states: Tensor, @@ -154,6 +160,10 @@ def seqlen(self) -> int: def hidden_size(self) -> int: return self.config.hidden_size + @property + def intermediate_size(self) -> int: + return self.config.intermediate_size + @property def should_bake_mean_into_linear(self) -> bool: return False diff --git a/src/slicegpt/adapters/phi2_adapter.py b/src/slicegpt/adapters/phi2_adapter.py index 3b5e4d41..867fd78c 100644 --- a/src/slicegpt/adapters/phi2_adapter.py +++ b/src/slicegpt/adapters/phi2_adapter.py @@ -18,13 +18,14 @@ from slicegpt.model_adapter import LayerAdapter, ModelAdapter from slicegpt.modules import RMSN + class CompressedPhiDecoderLayer(PhiDecoderLayer): """ This class simulates the PhiDecoderlayer class from PhiModel (PhiForCausalLM) https://huggingface.co/microsoft/phi-2/blob/main/modeling_phi.py but with the addition of a shortcut_Q attribute. This attribute is used to rotate the residual tensors. 
""" - + def __init__(self, config: PhiConfig, layer_idx: int, replace_layernorm: bool = False): super().__init__(config, layer_idx) if replace_layernorm: @@ -155,7 +156,7 @@ def seqlen(self) -> int: @property def hidden_size(self) -> int: return self.config.hidden_size - + @property def intermediate_size(self) -> int: return self.config.intermediate_size diff --git a/src/slicegpt/rotate.py b/src/slicegpt/rotate.py index fbc5d3ab..75b830c0 100644 --- a/src/slicegpt/rotate.py +++ b/src/slicegpt/rotate.py @@ -163,7 +163,12 @@ def rotate_and_slice_sequential( ignore_masks.append(batch["attention_mask"]) layers = model_adapter.get_layers() - slicing_scheduler.setup(hidden_size=model_adapter.hidden_size, intermediate_size=model_adapter.model.intermediate_size, layers_num=len(layers), parallel_blocks=True) + slicing_scheduler.setup( + hidden_size=model_adapter.hidden_size, + intermediate_size=model_adapter.model.intermediate_size, + layers_num=len(layers), + parallel_blocks=True, + ) # rotate and slice embeddings eig_val, Q = pca_calc(inps, ignore_masks) @@ -277,7 +282,12 @@ def rotate_and_slice_parallel( ignore_masks.append(batch["attention_mask"]) layers = model_adapter.get_layers() - slicing_scheduler.setup(hidden_size=model_adapter.hidden_size, intermediate_size=model_adapter.intermediate_size, layers_num=len(layers), parallel_blocks=True) + slicing_scheduler.setup( + hidden_size=model_adapter.hidden_size, + intermediate_size=model_adapter.intermediate_size, + layers_num=len(layers), + parallel_blocks=True, + ) # rotate and slice embeddings _, Q = pca_calc(inps, ignore_masks) @@ -466,14 +476,18 @@ def slice_rotated_model(model_adapter: ModelAdapter, slicing_scheduler: SlicingS if model_adapter.parallel_blocks: # parallel case layer.attn_shortcut_Q = nn.Parameter( - layer.attn_shortcut_Q[:, : slicing_scheduler.get_attention_output_dimension(i, match_head_dim=True)].contiguous() + layer.attn_shortcut_Q[ + :, : slicing_scheduler.get_attention_output_dimension(i, 
match_head_dim=True) + ].contiguous() ) slice_attention_output( layer_adapter, slicing_scheduler.get_attention_output_dimension(i, match_head_dim=True) ) else: # sequential case layer.attn_shortcut_Q = nn.Parameter( - layer.attn_shortcut_Q[:, : slicing_scheduler.get_attention_output_dimension(i, match_head_dim=False)].contiguous() + layer.attn_shortcut_Q[ + :, : slicing_scheduler.get_attention_output_dimension(i, match_head_dim=False) + ].contiguous() ) layer.mlp_shortcut_Q = nn.Parameter( layer.mlp_shortcut_Q[:, : slicing_scheduler.get_mlp_output_dimension(i)].contiguous() From 1f6c01998694e85d1b9c3b90ac1544d3a93a5db9 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Sun, 21 Apr 2024 12:28:25 +0000 Subject: [PATCH 12/33] Add tests and update loading in run_slicegpt --- experiments/run_slicegpt.py | 19 ++++++++++--- tests/test_slicing.py | 54 +++++++++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/experiments/run_slicegpt.py b/experiments/run_slicegpt.py index e02332c3..d79c4491 100755 --- a/experiments/run_slicegpt.py +++ b/experiments/run_slicegpt.py @@ -8,9 +8,11 @@ import shutil import torch -import wandb +from transformers.models.phi.modeling_phi import PhiConfig +import wandb from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate, utils +from slicegpt.adapters.hf_compatible_phi import SlicedPhiForCausalLM from slicegpt.config import config from slicegpt.slicing_scheduler import ConstSlicingScheduler @@ -230,8 +232,19 @@ def reset_model_device() -> None: sliced_model_name = sliced_model_dir / f'{pathlib.Path(args.model).name}_{args.sparsity}.pt' - # Save the sliced model - torch.save(model.state_dict(), sliced_model_name) + # Save the sliced model in HF format + if args.model == "microsoft/phi-2": + config_to_save = PhiConfig.from_pretrained( + "microsoft/phi-2", + torch_dtype=torch.float16, + ) + + sliced_model = SlicedPhiForCausalLM(config_to_save, scheduler).to(config.dtype) + 
sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) + sliced_model.save_pretrained(sliced_model_dir) + else: + # Save the sliced model for other models types + torch.save(model.state_dict(), sliced_model_name) # Save the slicing config config_path = sliced_model_name.with_suffix('.json') diff --git a/tests/test_slicing.py b/tests/test_slicing.py index a36fa2a7..7e6f7d00 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -2,11 +2,14 @@ # Licensed under the MIT license. import torch +from transformers import AutoTokenizer +from transformers.models.llama.modeling_llama import LlamaConfig from transformers.models.phi.modeling_phi import PhiConfig -from slicegpt import hf_utils, layernorm_fusion, rotate -from slicegpt.adapters.hf_compatible_phi import SlicedPhiForCausalLM +from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate from slicegpt.adapters.opt_adapter import OPTModelAdapter +from slicegpt.adapters.sliced_llama import SlicedLlamaForCausalLM +from slicegpt.adapters.sliced_phi import SlicedPhiForCausalLM from slicegpt.slicing_scheduler import ConstSlicingScheduler @@ -29,9 +32,10 @@ def get_module_names(model) -> list[str]: def test_HF_model(): - """Check that the HF model weights are equivalent to the sliced model weights after layernorm fusion""" + """Check that the HF model weights are equivalent to the sliced model weights""" model_name = "microsoft/phi-2" - model_adapter, _ = hf_utils.get_model_and_tokenizer(model_name) + model_adapter, tokenizer = hf_utils.get_model_and_tokenizer(model_name) + sparsity = 0.1 layernorm_fusion.replace_layers(model_adapter) layernorm_fusion.fuse_modules(model_adapter) @@ -41,14 +45,39 @@ def test_HF_model(): torch_dtype=torch.float16, ) + new_embedding_dimension = int((1 - sparsity) * model_adapter.hidden_size) + new_embedding_dimension -= new_embedding_dimension % 8 + sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) 
sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) + # The sliced model weights should be identical to the HF model weights after layer norm fusion assert compare_weights(model_adapter.model, sliced_model.model) + dataset = data_utils.get_dataset("wikitext2") + train_dataset, test_dataset = dataset["train"], dataset["test"] + + train_loader = data_utils.prepare_dataloader(dataset=train_dataset, tokenizer=tokenizer) + + test_loader = data_utils.prepare_test_dataloader(dataset=test_dataset, tokenizer=tokenizer) + + scheduler = ConstSlicingScheduler(new_embedding_dimension) + rotate.rotate_and_slice(model_adapter, train_loader, scheduler, final_orientation="random") + + sliced_ppl = gpu_utils.evaluate_ppl(model_adapter.model.to("cuda"), tokenizer.pad_token_id, test_loader) + + sliced_model = SlicedPhiForCausalLM(config, scheduler).to(torch.float16) + sliced_model = sliced_model.to(torch.float16) + sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) + + new_model_ppl = gpu_utils.evaluate_ppl(sliced_model.to("cuda"), tokenizer.pad_token_id, test_loader) + + # The perplexity of the sliced model should be the same as the HF model + assert sliced_ppl == new_model_ppl + def test_save_and_load_HF_model(): - """Check that the HF model weights are equivalent to the sliced model weights after layernorm fusion""" + """Test HF model saving and loading""" config = PhiConfig.from_pretrained( "microsoft/phi-2", torch_dtype=torch.float16, @@ -58,6 +87,21 @@ def test_save_and_load_HF_model(): sliced_model.save_pretrained("sliced_model") sliced_model = SlicedPhiForCausalLM.from_pretrained("sliced_model", None, "microsoft/phi-2") + assert isinstance(sliced_model, SlicedPhiForCausalLM) + assert sliced_model.model.config == config + + +def test_save_and_load_sliced_llama(): + """Test HF model saving and loading""" + config = LlamaConfig.from_pretrained( + "openlm-research/open_llama_7b_v2", + 
torch_dtype=torch.float16, + ) + + sliced_model = SlicedLlamaForCausalLM(config).to(torch.float16) + sliced_model.save_pretrained("sliced_model") + sliced_model = SlicedLlamaForCausalLM.from_pretrained("sliced_model", None, "openlm-research/open_llama_7b_v2") + def compare_weights(model1, model2): for p1, p2 in zip(model1.parameters(), model2.parameters()): From c6ff900c00b0fa7727b3e56cb1acd0b1d1ce973d Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Sun, 21 Apr 2024 12:34:32 +0000 Subject: [PATCH 13/33] Clean up tests and unnecessary script --- tests/test_slicing.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/tests/test_slicing.py b/tests/test_slicing.py index 7e6f7d00..34d83a0c 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -41,7 +41,7 @@ def test_HF_model(): layernorm_fusion.fuse_modules(model_adapter) config = PhiConfig.from_pretrained( - "microsoft/phi-2", + model_name, torch_dtype=torch.float16, ) @@ -78,31 +78,20 @@ def test_HF_model(): def test_save_and_load_HF_model(): """Test HF model saving and loading""" + base_model_name = "microsoft/phi-2" config = PhiConfig.from_pretrained( - "microsoft/phi-2", + base_model_name, torch_dtype=torch.float16, ) sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) sliced_model.save_pretrained("sliced_model") - sliced_model = SlicedPhiForCausalLM.from_pretrained("sliced_model", None, "microsoft/phi-2") + sliced_model = SlicedPhiForCausalLM.from_pretrained("sliced_model", None, base_model_name) assert isinstance(sliced_model, SlicedPhiForCausalLM) assert sliced_model.model.config == config -def test_save_and_load_sliced_llama(): - """Test HF model saving and loading""" - config = LlamaConfig.from_pretrained( - "openlm-research/open_llama_7b_v2", - torch_dtype=torch.float16, - ) - - sliced_model = SlicedLlamaForCausalLM(config).to(torch.float16) - sliced_model.save_pretrained("sliced_model") - sliced_model = 
SlicedLlamaForCausalLM.from_pretrained("sliced_model", None, "openlm-research/open_llama_7b_v2") - - def compare_weights(model1, model2): for p1, p2 in zip(model1.parameters(), model2.parameters()): if not torch.equal(p1.data, p2.data): From 1a68d002f863e82d48fb85e7e1aa9f71338aa6d2 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Sun, 21 Apr 2024 12:36:28 +0000 Subject: [PATCH 14/33] Remove unnecessary script --- src/slicegpt/adapters/hf_compatible_phi.py | 193 --------------------- 1 file changed, 193 deletions(-) delete mode 100644 src/slicegpt/adapters/hf_compatible_phi.py diff --git a/src/slicegpt/adapters/hf_compatible_phi.py b/src/slicegpt/adapters/hf_compatible_phi.py deleted file mode 100644 index ee600035..00000000 --- a/src/slicegpt/adapters/hf_compatible_phi.py +++ /dev/null @@ -1,193 +0,0 @@ -import argparse -import os -import pathlib - -import torch -import torch.nn as nn -from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel - -from slicegpt import data_utils, gpu_utils, hf_utils -from slicegpt.adapters.phi2_adapter import CompressedPhiDecoderLayer, Phi2ModelAdapter -from slicegpt.model_adapter import SlicingConfig -from slicegpt.modules import RMSN -from slicegpt.rotate import slice_rotated_model -from slicegpt.slicing_scheduler import ConfigSlicingScheduler, SlicingScheduler - - -class SlicedPhi2Config(PhiConfig): - model_type = "sliced_phi2" - is_composition = True - - def __init__(self, sparsity, num_layers, hidden_size, intermediate_size, **kwargs): - super().__init__(**kwargs) - self.sparsity = sparsity - self.num_layers = num_layers - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - - def to_dict(self): - output = super().to_dict() - output.update({"sparsity": self.sparsity}) - return output - - @classmethod - def from_dict(cls, config_dict): - return cls(**config_dict) - - -class SlicedPhi(PhiModel): - def __init__(self, config): - super().__init__(config) - self.config = config - 
self.layers = nn.ModuleList( - [ - CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True) - for layer_idx in range(config.num_hidden_layers) - ] - ) - self.final_layernorm = RMSN(config.hidden_size) - - -class SlicedPhiForCausalLM(PhiForCausalLM): - def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_args, **kwargs): - super().__init__(config) - self.model = SlicedPhi(config) - self.model_adapter = Phi2ModelAdapter(self) - - if scheduler: - self.update_dims(scheduler) - - @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs - ): - """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" - config = SlicedPhi2Config.from_pretrained(config_path) - model = super().from_pretrained(pretrained_model_name_or_path, scheduler, config) - model.load_state_dict(model.state_dict()) - return model - - def update_dims(self, scheduler: SlicingScheduler) -> None: - layers = self.model_adapter.get_layers() - - hidden_size = self.model_adapter.hidden_size - for layer_adapter in layers: - if not self.model_adapter.parallel_blocks: - layer_adapter.layer.mlp_shortcut_Q = torch.nn.Parameter( - torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() - ) - layer_adapter.layer.attn_shortcut_Q = torch.nn.Parameter( - torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() - ) - - slice_rotated_model(self.model_adapter, scheduler) - - -def arg_parser() -> argparse.Namespace: - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - type=str, - default="microsoft/phi-2", - help="Model to load", - ) - parser.add_argument( - "--save-model-path", - type=str, - default="sliced_HF_model", - help="Path to save the final model to", - ) - parser.add_argument( - "--sparsity", type=float, default=0.1, help="A measure of how much slicing is applied (in the range [0, 1))" - ) - 
parser.add_argument( - "--round-interval", - type=int, - default=8, - help="Interval for rounding the weights (the best value may depend on your hardware)", - ) - parser.add_argument( - "--sliced-model-path", - type=str, - help="Path to load the sliced model to copy the weights from", - default="", - ) - parser.add_argument( - "--sliced-model-config-path", - type=str, - help="Path to load the config of the sliced model from", - default="", - ) - parser.add_argument( - "--cal-dataset", - type=str, - help="Dataset that sliced model was calibrated on. Also used to compute perplexity", - choices=["wikitext2", "ptb", "c4", "alpaca"], - default="wikitext2", - ) - parser.add_argument("--device", type=str, default="cuda") - parser.add_argument('--hf-token', type=str, default=os.getenv('HF_TOKEN', None)) - - return parser.parse_args() - - -def compare_weights(model1, model2): - for p1, p2 in zip(model1.parameters(), model2.parameters()): - if not torch.equal(p1.data, p2.data): - return False - return True - - -if __name__ == "__main__": - - args = arg_parser() - - config_path = pathlib.Path(args.sliced_model_config_path) - - slicing_conf = SlicingConfig.from_json_string(config_path.read_text()) - - slicing_scheduler = ConfigSlicingScheduler(slicing_conf) - - sliced_model_conf = SlicedPhi2Config( - sparsity=args.sparsity, - hidden_size=slicing_conf.hidden_size, - intermediate_size=slicing_conf.intermediate_size, - num_layers=slicing_conf.layers_num, - ) - sliced_model_conf.save_pretrained("sliced_phi2") - - # sliced config should be the same as the original model's config - orig_config = PhiConfig.from_pretrained( - "microsoft/phi-2", - torch_dtype=torch.float16, - ) - - sliced_model = SlicedPhiForCausalLM(orig_config, slicing_scheduler, slicing_conf) - - # load the saved sliced model - model_adapter, tokenizer = hf_utils.load_sliced_model( - args.model, - args.sliced_model_path, - sparsity=args.sparsity, - round_interval=args.round_interval, - token=args.hf_token, - ) - - 
dataset = data_utils.get_dataset("wikitext2") - train_dataset, test_dataset = dataset["train"], dataset["test"] - - test_loader = data_utils.prepare_test_dataloader(dataset=test_dataset, tokenizer=tokenizer, batch_size=1) - - dataset_ppl = gpu_utils.evaluate_ppl(model_adapter.model.to(args.device), tokenizer.pad_token_id, test_loader) - print(f'Loaded sliced model perplexity: {dataset_ppl}') - - sliced_model = sliced_model.to(torch.float16) - sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) - print("Model loaded successfully!") - - sliced_model.save_pretrained(args.save_model_path) - sliced_model_new = sliced_model.from_pretrained(args.save_model_path, slicing_scheduler, "sliced_phi2") - sliced_model_new = sliced_model_new.to(torch.float16) - - dataset_ppl = gpu_utils.evaluate_ppl(sliced_model_new.to(args.device), tokenizer.pad_token_id, test_loader) - print(f'Loaded new sliced model perplexity: {dataset_ppl}') From e6e63f556cda99d3781f433b0fe4d783396d8006 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Sun, 21 Apr 2024 12:39:55 +0000 Subject: [PATCH 15/33] Move sliced models to new files --- src/slicegpt/adapters/sliced_llama.py | 49 +++++++++++++++++++++++++++ src/slicegpt/adapters/sliced_phi.py | 49 +++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 src/slicegpt/adapters/sliced_llama.py create mode 100644 src/slicegpt/adapters/sliced_phi.py diff --git a/src/slicegpt/adapters/sliced_llama.py b/src/slicegpt/adapters/sliced_llama.py new file mode 100644 index 00000000..66e3f6e7 --- /dev/null +++ b/src/slicegpt/adapters/sliced_llama.py @@ -0,0 +1,49 @@ +from slicegpt.rotate import slice_rotated_model +from transformers.models.llama.modeling_llama import LlamaConfig, LlamaForCausalLM, LlamaModel +from slicegpt.slicing_scheduler import SlicingScheduler +from slicegpt.adapters.llama_adapter import CompressedLlamaDecoderLayer +from slicegpt.modules import RMSN +from 
slicegpt.adapters.llama_adapter import LlamaModelAdapter +import torch +import torch.nn as nn + +class SlicedLlama(LlamaModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.layers = nn.ModuleList( + [CompressedLlamaDecoderLayer(config, layer_idx, replace_layernorm=True) for layer_idx in range(config.num_hidden_layers)] + ) + self.final_layernorm = RMSN(config.hidden_size) + +class SlicedLlamaForCausalLM(LlamaForCausalLM): + def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_args, **kwargs): + super().__init__(config) + self.model = SlicedLlama(config) + self.model_adapter = LlamaModelAdapter(self) + + if scheduler: + self.update_dims(scheduler) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs): + """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" + config = LlamaConfig.from_pretrained(config_path) + model = super().from_pretrained(pretrained_model_name_or_path, scheduler, config) + model.load_state_dict(model.state_dict()) + return model + + def update_dims(self, scheduler: SlicingScheduler) -> None: + layers = self.model_adapter.get_layers() + + hidden_size = self.model_adapter.hidden_size + for layer_adapter in layers: + if not self.model_adapter.parallel_blocks: + layer_adapter.layer.mlp_shortcut_Q = torch.nn.Parameter( + torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() + ) + layer_adapter.layer.attn_shortcut_Q = torch.nn.Parameter( + torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() + ) + + slice_rotated_model(self.model_adapter, scheduler) \ No newline at end of file diff --git a/src/slicegpt/adapters/sliced_phi.py b/src/slicegpt/adapters/sliced_phi.py new file mode 100644 index 00000000..ffe964bd --- /dev/null +++ b/src/slicegpt/adapters/sliced_phi.py @@ -0,0 +1,49 @@ +from slicegpt.rotate import 
slice_rotated_model +from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel +from slicegpt.slicing_scheduler import SlicingScheduler +from slicegpt.adapters.phi2_adapter import CompressedPhiDecoderLayer +from slicegpt.modules import RMSN +from slicegpt.adapters.phi2_adapter import Phi2ModelAdapter +import torch +import torch.nn as nn + +class SlicedPhi(PhiModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.layers = nn.ModuleList( + [CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True) for layer_idx in range(config.num_hidden_layers)] + ) + self.final_layernorm = RMSN(config.hidden_size) + +class SlicedPhiForCausalLM(PhiForCausalLM): + def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_args, **kwargs): + super().__init__(config) + self.model = SlicedPhi(config) + self.model_adapter = Phi2ModelAdapter(self) + + if scheduler: + self.update_dims(scheduler) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs): + """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" + config = PhiConfig.from_pretrained(config_path) + model = super().from_pretrained(pretrained_model_name_or_path, scheduler, config) + model.load_state_dict(model.state_dict()) + return model + + def update_dims(self, scheduler: SlicingScheduler) -> None: + layers = self.model_adapter.get_layers() + + hidden_size = self.model_adapter.hidden_size + for layer_adapter in layers: + if not self.model_adapter.parallel_blocks: + layer_adapter.layer.mlp_shortcut_Q = torch.nn.Parameter( + torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() + ) + layer_adapter.layer.attn_shortcut_Q = torch.nn.Parameter( + torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() + ) + + slice_rotated_model(self.model_adapter, scheduler) \ No newline 
at end of file From 125cd58611a0ebc8a987f8e370b877deeebe9e4d Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Sun, 21 Apr 2024 17:02:58 +0000 Subject: [PATCH 16/33] Apply style --- experiments/run_slicegpt.py | 17 ++++++++++++--- src/slicegpt/adapters/sliced_llama.py | 31 ++++++++++++++++----------- src/slicegpt/adapters/sliced_phi.py | 31 ++++++++++++++++----------- tests/test_slicing.py | 3 --- 4 files changed, 52 insertions(+), 30 deletions(-) diff --git a/experiments/run_slicegpt.py b/experiments/run_slicegpt.py index d79c4491..ca68355c 100755 --- a/experiments/run_slicegpt.py +++ b/experiments/run_slicegpt.py @@ -8,11 +8,13 @@ import shutil import torch +from transformers.models.llama.modeling_llama import LlamaConfig from transformers.models.phi.modeling_phi import PhiConfig import wandb from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate, utils from slicegpt.adapters.hf_compatible_phi import SlicedPhiForCausalLM +from slicegpt.adapters.sliced_llama import SlicedLlamaForCausalLM from slicegpt.config import config from slicegpt.slicing_scheduler import ConstSlicingScheduler @@ -232,16 +234,25 @@ def reset_model_device() -> None: sliced_model_name = sliced_model_dir / f'{pathlib.Path(args.model).name}_{args.sparsity}.pt' - # Save the sliced model in HF format + # Save the sliced model in HF format for Phi and Llama if args.model == "microsoft/phi-2": config_to_save = PhiConfig.from_pretrained( - "microsoft/phi-2", - torch_dtype=torch.float16, + args.model, + torch_dtype=config.dtype, ) sliced_model = SlicedPhiForCausalLM(config_to_save, scheduler).to(config.dtype) sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) sliced_model.save_pretrained(sliced_model_dir) + elif "meta-llama" in args.model: + config_to_save = LlamaConfig.from_pretrained( + args.model, + torch_dtype=config.dtype, + ) + + sliced_model = SlicedLlamaForCausalLM(config_to_save, scheduler).to(config.dtype) + 
sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) + sliced_model.save_pretrained(sliced_model_dir) else: # Save the sliced model for other models types torch.save(model.state_dict(), sliced_model_name) diff --git a/src/slicegpt/adapters/sliced_llama.py b/src/slicegpt/adapters/sliced_llama.py index 66e3f6e7..b0a7d4d5 100644 --- a/src/slicegpt/adapters/sliced_llama.py +++ b/src/slicegpt/adapters/sliced_llama.py @@ -1,21 +1,26 @@ -from slicegpt.rotate import slice_rotated_model -from transformers.models.llama.modeling_llama import LlamaConfig, LlamaForCausalLM, LlamaModel -from slicegpt.slicing_scheduler import SlicingScheduler -from slicegpt.adapters.llama_adapter import CompressedLlamaDecoderLayer -from slicegpt.modules import RMSN -from slicegpt.adapters.llama_adapter import LlamaModelAdapter import torch import torch.nn as nn +from transformers.models.llama.modeling_llama import LlamaConfig, LlamaForCausalLM, LlamaModel + +from slicegpt.adapters.llama_adapter import CompressedLlamaDecoderLayer, LlamaModelAdapter +from slicegpt.modules import RMSN +from slicegpt.rotate import slice_rotated_model +from slicegpt.slicing_scheduler import SlicingScheduler + class SlicedLlama(LlamaModel): def __init__(self, config): super().__init__(config) self.config = config self.layers = nn.ModuleList( - [CompressedLlamaDecoderLayer(config, layer_idx, replace_layernorm=True) for layer_idx in range(config.num_hidden_layers)] + [ + CompressedLlamaDecoderLayer(config, layer_idx, replace_layernorm=True) + for layer_idx in range(config.num_hidden_layers) + ] ) self.final_layernorm = RMSN(config.hidden_size) + class SlicedLlamaForCausalLM(LlamaForCausalLM): def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_args, **kwargs): super().__init__(config) @@ -24,9 +29,11 @@ def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_arg if scheduler: self.update_dims(scheduler) - + @classmethod - def 
from_pretrained(cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs): + def from_pretrained( + cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs + ): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" config = LlamaConfig.from_pretrained(config_path) model = super().from_pretrained(pretrained_model_name_or_path, scheduler, config) @@ -35,7 +42,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, scheduler: SlicingSchedu def update_dims(self, scheduler: SlicingScheduler) -> None: layers = self.model_adapter.get_layers() - + hidden_size = self.model_adapter.hidden_size for layer_adapter in layers: if not self.model_adapter.parallel_blocks: @@ -45,5 +52,5 @@ def update_dims(self, scheduler: SlicingScheduler) -> None: layer_adapter.layer.attn_shortcut_Q = torch.nn.Parameter( torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() ) - - slice_rotated_model(self.model_adapter, scheduler) \ No newline at end of file + + slice_rotated_model(self.model_adapter, scheduler) diff --git a/src/slicegpt/adapters/sliced_phi.py b/src/slicegpt/adapters/sliced_phi.py index ffe964bd..ee7ce11f 100644 --- a/src/slicegpt/adapters/sliced_phi.py +++ b/src/slicegpt/adapters/sliced_phi.py @@ -1,21 +1,26 @@ -from slicegpt.rotate import slice_rotated_model -from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel -from slicegpt.slicing_scheduler import SlicingScheduler -from slicegpt.adapters.phi2_adapter import CompressedPhiDecoderLayer -from slicegpt.modules import RMSN -from slicegpt.adapters.phi2_adapter import Phi2ModelAdapter import torch import torch.nn as nn +from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel + +from slicegpt.adapters.phi2_adapter import CompressedPhiDecoderLayer, Phi2ModelAdapter +from slicegpt.modules import RMSN +from 
slicegpt.rotate import slice_rotated_model +from slicegpt.slicing_scheduler import SlicingScheduler + class SlicedPhi(PhiModel): def __init__(self, config): super().__init__(config) self.config = config self.layers = nn.ModuleList( - [CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True) for layer_idx in range(config.num_hidden_layers)] + [ + CompressedPhiDecoderLayer(config, layer_idx, replace_layernorm=True) + for layer_idx in range(config.num_hidden_layers) + ] ) self.final_layernorm = RMSN(config.hidden_size) + class SlicedPhiForCausalLM(PhiForCausalLM): def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_args, **kwargs): super().__init__(config) @@ -24,9 +29,11 @@ def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_arg if scheduler: self.update_dims(scheduler) - + @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs): + def from_pretrained( + cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs + ): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" config = PhiConfig.from_pretrained(config_path) model = super().from_pretrained(pretrained_model_name_or_path, scheduler, config) @@ -35,7 +42,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, scheduler: SlicingSchedu def update_dims(self, scheduler: SlicingScheduler) -> None: layers = self.model_adapter.get_layers() - + hidden_size = self.model_adapter.hidden_size for layer_adapter in layers: if not self.model_adapter.parallel_blocks: @@ -45,5 +52,5 @@ def update_dims(self, scheduler: SlicingScheduler) -> None: layer_adapter.layer.attn_shortcut_Q = torch.nn.Parameter( torch.zeros(hidden_size, hidden_size).to(dtype=torch.float16).contiguous() ) - - slice_rotated_model(self.model_adapter, scheduler) \ No newline at end of file + + 
slice_rotated_model(self.model_adapter, scheduler) diff --git a/tests/test_slicing.py b/tests/test_slicing.py index 34d83a0c..d62c3817 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -2,13 +2,10 @@ # Licensed under the MIT license. import torch -from transformers import AutoTokenizer -from transformers.models.llama.modeling_llama import LlamaConfig from transformers.models.phi.modeling_phi import PhiConfig from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate from slicegpt.adapters.opt_adapter import OPTModelAdapter -from slicegpt.adapters.sliced_llama import SlicedLlamaForCausalLM from slicegpt.adapters.sliced_phi import SlicedPhiForCausalLM from slicegpt.slicing_scheduler import ConstSlicingScheduler From e9bb4175f8579bf4607cf31dc37dd1289c81a56d Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Sun, 21 Apr 2024 20:12:40 +0000 Subject: [PATCH 17/33] Add configs and move model saving to hf_utils --- experiments/run_slicegpt.py | 32 +--------------- src/slicegpt/adapters/sliced_llama.py | 22 +++++++++-- src/slicegpt/adapters/sliced_phi.py | 20 ++++++++-- src/slicegpt/hf_utils.py | 54 +++++++++++++++++++++++++++ tests/test_slicing.py | 43 +++++++++++++++------ 5 files changed, 120 insertions(+), 51 deletions(-) diff --git a/experiments/run_slicegpt.py b/experiments/run_slicegpt.py index ca68355c..f5ee71e3 100755 --- a/experiments/run_slicegpt.py +++ b/experiments/run_slicegpt.py @@ -8,13 +8,9 @@ import shutil import torch -from transformers.models.llama.modeling_llama import LlamaConfig -from transformers.models.phi.modeling_phi import PhiConfig import wandb from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate, utils -from slicegpt.adapters.hf_compatible_phi import SlicedPhiForCausalLM -from slicegpt.adapters.sliced_llama import SlicedLlamaForCausalLM from slicegpt.config import config from slicegpt.slicing_scheduler import ConstSlicingScheduler @@ -232,34 +228,8 @@ def reset_model_device() -> 
None: sliced_model_dir = pathlib.Path(args.save_dir) sliced_model_dir.mkdir(parents=True, exist_ok=True) - sliced_model_name = sliced_model_dir / f'{pathlib.Path(args.model).name}_{args.sparsity}.pt' - # Save the sliced model in HF format for Phi and Llama - if args.model == "microsoft/phi-2": - config_to_save = PhiConfig.from_pretrained( - args.model, - torch_dtype=config.dtype, - ) - - sliced_model = SlicedPhiForCausalLM(config_to_save, scheduler).to(config.dtype) - sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) - sliced_model.save_pretrained(sliced_model_dir) - elif "meta-llama" in args.model: - config_to_save = LlamaConfig.from_pretrained( - args.model, - torch_dtype=config.dtype, - ) - - sliced_model = SlicedLlamaForCausalLM(config_to_save, scheduler).to(config.dtype) - sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) - sliced_model.save_pretrained(sliced_model_dir) - else: - # Save the sliced model for other models types - torch.save(model.state_dict(), sliced_model_name) - - # Save the slicing config - config_path = sliced_model_name.with_suffix('.json') - config_path.write_text(model_adapter.slicing_conf.to_json_string()) + hf_utils.save_sliced_model(args.model, config.dtype, model, scheduler, sliced_model_dir, args.sparsity, new_embedding_dimension, model_adapter.slicing_conf) # If slicing a local model, also save HF config files in sliced model dir if args.model_path: diff --git a/src/slicegpt/adapters/sliced_llama.py b/src/slicegpt/adapters/sliced_llama.py index b0a7d4d5..e9338e74 100644 --- a/src/slicegpt/adapters/sliced_llama.py +++ b/src/slicegpt/adapters/sliced_llama.py @@ -8,6 +8,20 @@ from slicegpt.slicing_scheduler import SlicingScheduler +class SlicedLlamaConfig(LlamaConfig): + model_type = "sliced_llama" + is_composition = True + + def __init__(self, sparsity = 0.1, new_hidden_size = 1024, **kwargs): + self.sparsity = sparsity + self.new_hidden_size = 
new_hidden_size + super().__init__(**kwargs) + + @classmethod + def from_pretrained(cls, config_path, sparsity, new_hidden_size): + return super().from_pretrained(config_path, sparsity, new_hidden_size) + + class SlicedLlama(LlamaModel): def __init__(self, config): super().__init__(config) @@ -22,7 +36,7 @@ def __init__(self, config): class SlicedLlamaForCausalLM(LlamaForCausalLM): - def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_args, **kwargs): + def __init__(self, config, scheduler: SlicingScheduler | None = None, sparsity: float = 0.0, new_hidden_size: int = 1024, *model_args, **kwargs): super().__init__(config) self.model = SlicedLlama(config) self.model_adapter = LlamaModelAdapter(self) @@ -32,11 +46,11 @@ def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_arg @classmethod def from_pretrained( - cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs + cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, sparsity: float, new_hidden_size: int, config_path: str, *model_args, **kwargs ): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" - config = LlamaConfig.from_pretrained(config_path) - model = super().from_pretrained(pretrained_model_name_or_path, scheduler, config) + config = SlicedLlamaConfig.from_pretrained(config_path, sparsity, new_hidden_size) + model = super().from_pretrained(pretrained_model_name_or_path, config=config) model.load_state_dict(model.state_dict()) return model diff --git a/src/slicegpt/adapters/sliced_phi.py b/src/slicegpt/adapters/sliced_phi.py index ee7ce11f..f600eafb 100644 --- a/src/slicegpt/adapters/sliced_phi.py +++ b/src/slicegpt/adapters/sliced_phi.py @@ -7,6 +7,18 @@ from slicegpt.rotate import slice_rotated_model from slicegpt.slicing_scheduler import SlicingScheduler +class SlicedPhi2Config(PhiConfig): + model_type = "sliced_phi2" + is_composition = True + 
+ def __init__(self, sparsity = 0.1, new_hidden_size = 1024, **kwargs): + self.sparsity = sparsity + self.new_hidden_size = new_hidden_size + super().__init__(**kwargs) + + @classmethod + def from_pretrained(cls, config_path, sparsity, new_hidden_size): + return super().from_pretrained(config_path, sparsity, new_hidden_size) class SlicedPhi(PhiModel): def __init__(self, config): @@ -22,7 +34,7 @@ def __init__(self, config): class SlicedPhiForCausalLM(PhiForCausalLM): - def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_args, **kwargs): + def __init__(self, config, scheduler: SlicingScheduler | None = None, sparsity: float = 0.0, new_hidden_size: int = 1024, *model_args, **kwargs): super().__init__(config) self.model = SlicedPhi(config) self.model_adapter = Phi2ModelAdapter(self) @@ -32,11 +44,11 @@ def __init__(self, config, scheduler: SlicingScheduler | None = None, *model_arg @classmethod def from_pretrained( - cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, config_path, *model_args, **kwargs + cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, sparsity: float, new_hidden_size: int, config_path: str, *model_args, **kwargs ): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" - config = PhiConfig.from_pretrained(config_path) - model = super().from_pretrained(pretrained_model_name_or_path, scheduler, config) + config = SlicedPhi2Config.from_pretrained(config_path, sparsity, new_hidden_size) + model = super().from_pretrained(pretrained_model_name_or_path, config=config) model.load_state_dict(model.state_dict()) return model diff --git a/src/slicegpt/hf_utils.py b/src/slicegpt/hf_utils.py index da7753ae..f33f1aeb 100755 --- a/src/slicegpt/hf_utils.py +++ b/src/slicegpt/hf_utils.py @@ -4,6 +4,7 @@ import logging import pathlib +from slicegpt.slicing_scheduler import SlicingScheduler import torch from peft import LoraConfig, get_peft_model from 
transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -11,6 +12,10 @@ from .layernorm_fusion import fuse_modules, replace_layers from .model_adapter import ModelAdapter, SlicingConfig from .rotate import slice_rotated_model +from slicegpt.adapters.sliced_phi import SlicedPhi2Config, SlicedPhiForCausalLM +from slicegpt.adapters.sliced_llama import SlicedLlamaConfig, SlicedLlamaForCausalLM +from transformers.models.llama.modeling_llama import LlamaConfig +from transformers.models.phi.modeling_phi import PhiConfig def do_not_initialize(func): @@ -174,3 +179,52 @@ def load_sliced_model( model_adapter.model.eval() return model_adapter, tokenizer + +def save_sliced_model( + model_name: str, + dtype: torch.dtype, + model: torch.nn.Module, + scheduler: SlicingScheduler, + save_sliced_model_dir: str | pathlib.Path, + sparsity: float, + new_hidden_size: int, + slicing_conf: SlicingConfig, +): + if model_name == "microsoft/phi-2": + config = PhiConfig.from_pretrained( + model_name, + torch_dtype=torch.float16, + ) + + config.save_pretrained("phi_config") + config_to_save = SlicedPhi2Config.from_pretrained( + config_path="phi_config", + sparsity=sparsity, + new_hidden_size=new_hidden_size + ) + + sliced_model = SlicedPhiForCausalLM(config_to_save, scheduler).to(dtype) + sliced_model.load_state_dict(model.state_dict(), strict=True, assign=True) + sliced_model.save_pretrained(save_sliced_model_dir) + + elif "meta-llama" in model_name: + config = LlamaConfig.from_pretrained( + model_name, + torch_dtype=torch.float16, + ) + config.save_pretrained("llama_config") + config_to_save = SlicedLlamaConfig.from_pretrained( + config_path="llama_config", + sparsity=sparsity, + new_hidden_size=new_hidden_size, + ) + + sliced_model = SlicedLlamaForCausalLM(config_to_save, scheduler).to(dtype) + sliced_model.load_state_dict(model.state_dict(), strict=True, assign=True) + sliced_model.save_pretrained(save_sliced_model_dir) + else: + # Save the sliced model for other models types + 
sliced_model_name = save_sliced_model_dir / f'{pathlib.Path(model_name).name}_{sparsity}.pt' + torch.save(model.state_dict(), sliced_model_name) + config_path = sliced_model_name.with_suffix('.json') + config_path.write_text(slicing_conf.to_json_string()) diff --git a/tests/test_slicing.py b/tests/test_slicing.py index d62c3817..f1811b18 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -1,12 +1,13 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +import pytest import torch from transformers.models.phi.modeling_phi import PhiConfig from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate from slicegpt.adapters.opt_adapter import OPTModelAdapter -from slicegpt.adapters.sliced_phi import SlicedPhiForCausalLM +from slicegpt.adapters.sliced_phi import SlicedPhi2Config, SlicedPhiForCausalLM from slicegpt.slicing_scheduler import ConstSlicingScheduler @@ -28,22 +29,25 @@ def get_module_names(model) -> list[str]: return [name for name, _ in model.named_parameters()] +@pytest.mark.experiment +@pytest.mark.gpu def test_HF_model(): """Check that the HF model weights are equivalent to the sliced model weights""" model_name = "microsoft/phi-2" model_adapter, tokenizer = hf_utils.get_model_and_tokenizer(model_name) sparsity = 0.1 + new_hidden_size = 2304 layernorm_fusion.replace_layers(model_adapter) layernorm_fusion.fuse_modules(model_adapter) - config = PhiConfig.from_pretrained( + phi_config = PhiConfig.from_pretrained( model_name, torch_dtype=torch.float16, ) - new_embedding_dimension = int((1 - sparsity) * model_adapter.hidden_size) - new_embedding_dimension -= new_embedding_dimension % 8 + phi_config.save_pretrained("phi_config") + config = SlicedPhi2Config.from_pretrained(config_path="phi_config", sparsity=sparsity, new_hidden_size=new_hidden_size) sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) @@ -58,7 
+62,7 @@ def test_HF_model(): test_loader = data_utils.prepare_test_dataloader(dataset=test_dataset, tokenizer=tokenizer) - scheduler = ConstSlicingScheduler(new_embedding_dimension) + scheduler = ConstSlicingScheduler(new_hidden_size) rotate.rotate_and_slice(model_adapter, train_loader, scheduler, final_orientation="random") sliced_ppl = gpu_utils.evaluate_ppl(model_adapter.model.to("cuda"), tokenizer.pad_token_id, test_loader) @@ -75,18 +79,33 @@ def test_HF_model(): def test_save_and_load_HF_model(): """Test HF model saving and loading""" - base_model_name = "microsoft/phi-2" - config = PhiConfig.from_pretrained( - base_model_name, - torch_dtype=torch.float16, + sparsity = 0.0 + new_hidden_size = 2506 + config_name = "sliced_model_config" + model_name = "sliced_model" + + config = SlicedPhi2Config(sparsity, new_hidden_size) + config.save_pretrained(config_name) + + config = SlicedPhi2Config.from_pretrained( + config_name, + sparsity, + new_hidden_size ) sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) - sliced_model.save_pretrained("sliced_model") - sliced_model = SlicedPhiForCausalLM.from_pretrained("sliced_model", None, base_model_name) + sliced_model.save_pretrained(model_name) + sliced_model = SlicedPhiForCausalLM.from_pretrained( + model_name, + scheduler=None, + config_path=config_name, + sparsity=sparsity, + new_hidden_size=new_hidden_size + ) assert isinstance(sliced_model, SlicedPhiForCausalLM) - assert sliced_model.model.config == config + assert sliced_model.config.sparsity == sparsity + assert sliced_model.config.new_hidden_size == new_hidden_size def compare_weights(model1, model2): From be7b7677fb20de8aa86582c840b5c133bc5f50ee Mon Sep 17 00:00:00 2001 From: Pashmina Cameron Date: Tue, 23 Apr 2024 12:28:27 +0100 Subject: [PATCH 18/33] Formatting --- experiments/run_slicegpt.py | 13 +++++++++++-- src/slicegpt/adapters/sliced_llama.py | 23 +++++++++++++++++++---- src/slicegpt/adapters/sliced_phi.py | 23 ++++++++++++++++++++--- 
src/slicegpt/hf_utils.py | 18 +++++++++--------- src/slicegpt/model_adapter.py | 2 +- src/slicegpt/slicing_scheduler.py | 2 +- tests/test_slicing.py | 20 +++++++------------- 7 files changed, 68 insertions(+), 33 deletions(-) diff --git a/experiments/run_slicegpt.py b/experiments/run_slicegpt.py index f5ee71e3..11e682f9 100755 --- a/experiments/run_slicegpt.py +++ b/experiments/run_slicegpt.py @@ -8,8 +8,8 @@ import shutil import torch - import wandb + from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate, utils from slicegpt.config import config from slicegpt.slicing_scheduler import ConstSlicingScheduler @@ -229,7 +229,16 @@ def reset_model_device() -> None: sliced_model_dir.mkdir(parents=True, exist_ok=True) # Save the sliced model in HF format for Phi and Llama - hf_utils.save_sliced_model(args.model, config.dtype, model, scheduler, sliced_model_dir, args.sparsity, new_embedding_dimension, model_adapter.slicing_conf) + hf_utils.save_sliced_model( + args.model, + config.dtype, + model, + scheduler, + sliced_model_dir, + args.sparsity, + new_embedding_dimension, + model_adapter.slicing_conf, + ) # If slicing a local model, also save HF config files in sliced model dir if args.model_path: diff --git a/src/slicegpt/adapters/sliced_llama.py b/src/slicegpt/adapters/sliced_llama.py index e9338e74..7e893df9 100644 --- a/src/slicegpt/adapters/sliced_llama.py +++ b/src/slicegpt/adapters/sliced_llama.py @@ -12,7 +12,7 @@ class SlicedLlamaConfig(LlamaConfig): model_type = "sliced_llama" is_composition = True - def __init__(self, sparsity = 0.1, new_hidden_size = 1024, **kwargs): + def __init__(self, sparsity=0.1, new_hidden_size=1024, **kwargs): self.sparsity = sparsity self.new_hidden_size = new_hidden_size super().__init__(**kwargs) @@ -20,7 +20,7 @@ def __init__(self, sparsity = 0.1, new_hidden_size = 1024, **kwargs): @classmethod def from_pretrained(cls, config_path, sparsity, new_hidden_size): return super().from_pretrained(config_path, 
sparsity, new_hidden_size) - + class SlicedLlama(LlamaModel): def __init__(self, config): @@ -36,7 +36,15 @@ def __init__(self, config): class SlicedLlamaForCausalLM(LlamaForCausalLM): - def __init__(self, config, scheduler: SlicingScheduler | None = None, sparsity: float = 0.0, new_hidden_size: int = 1024, *model_args, **kwargs): + def __init__( + self, + config, + scheduler: SlicingScheduler | None = None, + sparsity: float = 0.0, + new_hidden_size: int = 1024, + *model_args, + **kwargs, + ): super().__init__(config) self.model = SlicedLlama(config) self.model_adapter = LlamaModelAdapter(self) @@ -46,7 +54,14 @@ def __init__(self, config, scheduler: SlicingScheduler | None = None, sparsity: @classmethod def from_pretrained( - cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, sparsity: float, new_hidden_size: int, config_path: str, *model_args, **kwargs + cls, + pretrained_model_name_or_path, + scheduler: SlicingScheduler | None, + sparsity: float, + new_hidden_size: int, + config_path: str, + *model_args, + **kwargs, ): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" config = SlicedLlamaConfig.from_pretrained(config_path, sparsity, new_hidden_size) diff --git a/src/slicegpt/adapters/sliced_phi.py b/src/slicegpt/adapters/sliced_phi.py index f600eafb..89b5fb19 100644 --- a/src/slicegpt/adapters/sliced_phi.py +++ b/src/slicegpt/adapters/sliced_phi.py @@ -7,11 +7,12 @@ from slicegpt.rotate import slice_rotated_model from slicegpt.slicing_scheduler import SlicingScheduler + class SlicedPhi2Config(PhiConfig): model_type = "sliced_phi2" is_composition = True - def __init__(self, sparsity = 0.1, new_hidden_size = 1024, **kwargs): + def __init__(self, sparsity=0.1, new_hidden_size=1024, **kwargs): self.sparsity = sparsity self.new_hidden_size = new_hidden_size super().__init__(**kwargs) @@ -20,6 +21,7 @@ def __init__(self, sparsity = 0.1, new_hidden_size = 1024, **kwargs): def from_pretrained(cls, 
config_path, sparsity, new_hidden_size): return super().from_pretrained(config_path, sparsity, new_hidden_size) + class SlicedPhi(PhiModel): def __init__(self, config): super().__init__(config) @@ -34,7 +36,15 @@ def __init__(self, config): class SlicedPhiForCausalLM(PhiForCausalLM): - def __init__(self, config, scheduler: SlicingScheduler | None = None, sparsity: float = 0.0, new_hidden_size: int = 1024, *model_args, **kwargs): + def __init__( + self, + config, + scheduler: SlicingScheduler | None = None, + sparsity: float = 0.0, + new_hidden_size: int = 1024, + *model_args, + **kwargs, + ): super().__init__(config) self.model = SlicedPhi(config) self.model_adapter = Phi2ModelAdapter(self) @@ -44,7 +54,14 @@ def __init__(self, config, scheduler: SlicingScheduler | None = None, sparsity: @classmethod def from_pretrained( - cls, pretrained_model_name_or_path, scheduler: SlicingScheduler | None, sparsity: float, new_hidden_size: int, config_path: str, *model_args, **kwargs + cls, + pretrained_model_name_or_path, + scheduler: SlicingScheduler | None, + sparsity: float, + new_hidden_size: int, + config_path: str, + *model_args, + **kwargs, ): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" config = SlicedPhi2Config.from_pretrained(config_path, sparsity, new_hidden_size) diff --git a/src/slicegpt/hf_utils.py b/src/slicegpt/hf_utils.py index f33f1aeb..fbd2e90d 100755 --- a/src/slicegpt/hf_utils.py +++ b/src/slicegpt/hf_utils.py @@ -4,18 +4,19 @@ import logging import pathlib -from slicegpt.slicing_scheduler import SlicingScheduler import torch from peft import LoraConfig, get_peft_model from transformers import AutoTokenizer, PreTrainedTokenizerBase +from transformers.models.llama.modeling_llama import LlamaConfig +from transformers.models.phi.modeling_phi import PhiConfig + +from slicegpt.adapters.sliced_llama import SlicedLlamaConfig, SlicedLlamaForCausalLM +from slicegpt.adapters.sliced_phi import SlicedPhi2Config, 
SlicedPhiForCausalLM +from slicegpt.slicing_scheduler import SlicingScheduler from .layernorm_fusion import fuse_modules, replace_layers from .model_adapter import ModelAdapter, SlicingConfig from .rotate import slice_rotated_model -from slicegpt.adapters.sliced_phi import SlicedPhi2Config, SlicedPhiForCausalLM -from slicegpt.adapters.sliced_llama import SlicedLlamaConfig, SlicedLlamaForCausalLM -from transformers.models.llama.modeling_llama import LlamaConfig -from transformers.models.phi.modeling_phi import PhiConfig def do_not_initialize(func): @@ -180,6 +181,7 @@ def load_sliced_model( return model_adapter, tokenizer + def save_sliced_model( model_name: str, dtype: torch.dtype, @@ -198,15 +200,13 @@ def save_sliced_model( config.save_pretrained("phi_config") config_to_save = SlicedPhi2Config.from_pretrained( - config_path="phi_config", - sparsity=sparsity, - new_hidden_size=new_hidden_size + config_path="phi_config", sparsity=sparsity, new_hidden_size=new_hidden_size ) sliced_model = SlicedPhiForCausalLM(config_to_save, scheduler).to(dtype) sliced_model.load_state_dict(model.state_dict(), strict=True, assign=True) sliced_model.save_pretrained(save_sliced_model_dir) - + elif "meta-llama" in model_name: config = LlamaConfig.from_pretrained( model_name, diff --git a/src/slicegpt/model_adapter.py b/src/slicegpt/model_adapter.py index 6e43b5ff..f2075fab 100644 --- a/src/slicegpt/model_adapter.py +++ b/src/slicegpt/model_adapter.py @@ -163,7 +163,7 @@ def hidden_size(self) -> int: The hidden size of the model """ raise NotImplementedError - + @property @abstractmethod def intermediate_size(self) -> int: diff --git a/src/slicegpt/slicing_scheduler.py b/src/slicegpt/slicing_scheduler.py index 7bf1667e..a7089ee9 100644 --- a/src/slicegpt/slicing_scheduler.py +++ b/src/slicegpt/slicing_scheduler.py @@ -40,7 +40,7 @@ def parallel_blocks(self) -> bool: """Return whether working with a parallel blocks models.""" return self.slicing_conf.parallel_blocks - def setup(self, *, 
hidden_size: int, intermediate_size:int, layers_num: int, parallel_blocks: bool) -> None: + def setup(self, *, hidden_size: int, intermediate_size: int, layers_num: int, parallel_blocks: bool) -> None: """Set up the slicing scheduler with the given model parameters.""" self.slicing_conf.hidden_size = hidden_size self.slicing_conf.intermediate_size = intermediate_size diff --git a/tests/test_slicing.py b/tests/test_slicing.py index f1811b18..492a689a 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -47,7 +47,9 @@ def test_HF_model(): ) phi_config.save_pretrained("phi_config") - config = SlicedPhi2Config.from_pretrained(config_path="phi_config", sparsity=sparsity, new_hidden_size=new_hidden_size) + config = SlicedPhi2Config.from_pretrained( + config_path="phi_config", sparsity=sparsity, new_hidden_size=new_hidden_size + ) sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) @@ -83,24 +85,16 @@ def test_save_and_load_HF_model(): new_hidden_size = 2506 config_name = "sliced_model_config" model_name = "sliced_model" - + config = SlicedPhi2Config(sparsity, new_hidden_size) config.save_pretrained(config_name) - - config = SlicedPhi2Config.from_pretrained( - config_name, - sparsity, - new_hidden_size - ) + + config = SlicedPhi2Config.from_pretrained(config_name, sparsity, new_hidden_size) sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) sliced_model.save_pretrained(model_name) sliced_model = SlicedPhiForCausalLM.from_pretrained( - model_name, - scheduler=None, - config_path=config_name, - sparsity=sparsity, - new_hidden_size=new_hidden_size + model_name, scheduler=None, config_path=config_name, sparsity=sparsity, new_hidden_size=new_hidden_size ) assert isinstance(sliced_model, SlicedPhiForCausalLM) From 40a7661dda40f03e4995f7e48ef30c9ce337ca64 Mon Sep 17 00:00:00 2001 From: Pashmina Cameron Date: Tue, 23 Apr 2024 12:57:31 +0100 Subject: [PATCH 
19/33] Add module imports --- src/slicegpt/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/slicegpt/__init__.py b/src/slicegpt/__init__.py index a2e75aff..b2d9da05 100644 --- a/src/slicegpt/__init__.py +++ b/src/slicegpt/__init__.py @@ -4,6 +4,8 @@ from .adapters.llama_adapter import LlamaModelAdapter from .adapters.opt_adapter import OPTModelAdapter from .adapters.phi2_adapter import Phi2ModelAdapter +from .adapters.sliced_phi import SlicedPhi2Config, SlicedPhi, SlicedPhiForCausalLM +from .adapters.sliced_llama import SlicedLlamaConfig, SlicedLlama, SlicedLlamaForCausalLM from .data_utils import get_dataset, prepare_dataloader from .gpu_utils import benchmark, distribute_model, evaluate_ppl from .hf_utils import get_model_and_tokenizer, load_sliced_model From 3719480f7024864838cc13efa3af067a5631e33a Mon Sep 17 00:00:00 2001 From: Pashmina Cameron Date: Tue, 23 Apr 2024 13:40:35 +0100 Subject: [PATCH 20/33] Formatting module imports --- src/slicegpt/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/slicegpt/__init__.py b/src/slicegpt/__init__.py index b2d9da05..dc494cd2 100644 --- a/src/slicegpt/__init__.py +++ b/src/slicegpt/__init__.py @@ -4,8 +4,8 @@ from .adapters.llama_adapter import LlamaModelAdapter from .adapters.opt_adapter import OPTModelAdapter from .adapters.phi2_adapter import Phi2ModelAdapter -from .adapters.sliced_phi import SlicedPhi2Config, SlicedPhi, SlicedPhiForCausalLM -from .adapters.sliced_llama import SlicedLlamaConfig, SlicedLlama, SlicedLlamaForCausalLM +from .adapters.sliced_llama import SlicedLlama, SlicedLlamaConfig, SlicedLlamaForCausalLM +from .adapters.sliced_phi import SlicedPhi, SlicedPhi2Config, SlicedPhiForCausalLM from .data_utils import get_dataset, prepare_dataloader from .gpu_utils import benchmark, distribute_model, evaluate_ppl from .hf_utils import get_model_and_tokenizer, load_sliced_model From f63a0381e672cf6bab60c02dec8e79d27ec2b11f Mon Sep 17 00:00:00 2001 From: 
Pashmina Cameron Date: Tue, 23 Apr 2024 14:44:47 +0100 Subject: [PATCH 21/33] src/slicegpt/hf_utils.py --- src/slicegpt/adapters/__init__.py | 0 src/slicegpt/hf_utils.py | 7 +++---- 2 files changed, 3 insertions(+), 4 deletions(-) create mode 100644 src/slicegpt/adapters/__init__.py diff --git a/src/slicegpt/adapters/__init__.py b/src/slicegpt/adapters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/slicegpt/hf_utils.py b/src/slicegpt/hf_utils.py index fbd2e90d..7f03de27 100755 --- a/src/slicegpt/hf_utils.py +++ b/src/slicegpt/hf_utils.py @@ -10,13 +10,12 @@ from transformers.models.llama.modeling_llama import LlamaConfig from transformers.models.phi.modeling_phi import PhiConfig -from slicegpt.adapters.sliced_llama import SlicedLlamaConfig, SlicedLlamaForCausalLM -from slicegpt.adapters.sliced_phi import SlicedPhi2Config, SlicedPhiForCausalLM -from slicegpt.slicing_scheduler import SlicingScheduler - +from .adapters.sliced_llama import SlicedLlamaConfig, SlicedLlamaForCausalLM +from .adapters.sliced_phi import SlicedPhi2Config, SlicedPhiForCausalLM from .layernorm_fusion import fuse_modules, replace_layers from .model_adapter import ModelAdapter, SlicingConfig from .rotate import slice_rotated_model +from .slicing_scheduler import SlicingScheduler def do_not_initialize(func): From 6210fc1953d9f73356c3c4ea37e2f578eed1b090 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Tue, 23 Apr 2024 16:32:47 +0000 Subject: [PATCH 22/33] Add intermediate_size to OPT adapter to fix tests --- src/slicegpt/adapters/opt_adapter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/slicegpt/adapters/opt_adapter.py b/src/slicegpt/adapters/opt_adapter.py index bd62da1c..5ae8ff38 100644 --- a/src/slicegpt/adapters/opt_adapter.py +++ b/src/slicegpt/adapters/opt_adapter.py @@ -176,6 +176,10 @@ def seqlen(self) -> int: @property def hidden_size(self) -> int: return self.config.hidden_size + + @property + def intermediate_size(self) -> int: + return 
self.config.intermediate_size @property def should_bake_mean_into_linear(self) -> bool: From fcd5fa7c39ffef51b65aea3e55c16091f0751dd4 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Tue, 23 Apr 2024 17:35:09 +0000 Subject: [PATCH 23/33] Make sparsity and new_hidden_size mandatory, fix intermediate_size in opt --- src/slicegpt/adapters/sliced_llama.py | 9 +++++---- src/slicegpt/adapters/sliced_phi.py | 9 +++++---- src/slicegpt/hf_utils.py | 4 ++-- src/slicegpt/rotate.py | 2 +- tests/test_model_adapter.py | 1 + tests/test_slicing.py | 6 +++--- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/slicegpt/adapters/sliced_llama.py b/src/slicegpt/adapters/sliced_llama.py index 7e893df9..8a0474ef 100644 --- a/src/slicegpt/adapters/sliced_llama.py +++ b/src/slicegpt/adapters/sliced_llama.py @@ -1,6 +1,7 @@ import torch import torch.nn as nn from transformers.models.llama.modeling_llama import LlamaConfig, LlamaForCausalLM, LlamaModel +from transformers.configuration_utils import PretrainedConfig from slicegpt.adapters.llama_adapter import CompressedLlamaDecoderLayer, LlamaModelAdapter from slicegpt.modules import RMSN @@ -12,13 +13,13 @@ class SlicedLlamaConfig(LlamaConfig): model_type = "sliced_llama" is_composition = True - def __init__(self, sparsity=0.1, new_hidden_size=1024, **kwargs): + def __init__(self, sparsity: float, new_hidden_size: int, **kwargs) -> None: self.sparsity = sparsity self.new_hidden_size = new_hidden_size super().__init__(**kwargs) @classmethod - def from_pretrained(cls, config_path, sparsity, new_hidden_size): + def from_pretrained(cls, config_path: str, sparsity: float, new_hidden_size: int) -> PretrainedConfig: return super().from_pretrained(config_path, sparsity, new_hidden_size) @@ -39,9 +40,9 @@ class SlicedLlamaForCausalLM(LlamaForCausalLM): def __init__( self, config, + sparsity: float, + new_hidden_size: int, scheduler: SlicingScheduler | None = None, - sparsity: float = 0.0, - new_hidden_size: int = 1024, 
*model_args, **kwargs, ): diff --git a/src/slicegpt/adapters/sliced_phi.py b/src/slicegpt/adapters/sliced_phi.py index 89b5fb19..567d5b82 100644 --- a/src/slicegpt/adapters/sliced_phi.py +++ b/src/slicegpt/adapters/sliced_phi.py @@ -1,6 +1,7 @@ import torch import torch.nn as nn from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel +from transformers.configuration_utils import PretrainedConfig from slicegpt.adapters.phi2_adapter import CompressedPhiDecoderLayer, Phi2ModelAdapter from slicegpt.modules import RMSN @@ -12,13 +13,13 @@ class SlicedPhi2Config(PhiConfig): model_type = "sliced_phi2" is_composition = True - def __init__(self, sparsity=0.1, new_hidden_size=1024, **kwargs): + def __init__(self, sparsity: float, new_hidden_size: int, **kwargs) -> None: self.sparsity = sparsity self.new_hidden_size = new_hidden_size super().__init__(**kwargs) @classmethod - def from_pretrained(cls, config_path, sparsity, new_hidden_size): + def from_pretrained(cls, config_path: str, sparsity: float, new_hidden_size: int) -> PretrainedConfig: return super().from_pretrained(config_path, sparsity, new_hidden_size) @@ -39,9 +40,9 @@ class SlicedPhiForCausalLM(PhiForCausalLM): def __init__( self, config, + sparsity: float, + new_hidden_size: int, scheduler: SlicingScheduler | None = None, - sparsity: float = 0.0, - new_hidden_size: int = 1024, *model_args, **kwargs, ): diff --git a/src/slicegpt/hf_utils.py b/src/slicegpt/hf_utils.py index 7f03de27..96a87d51 100755 --- a/src/slicegpt/hf_utils.py +++ b/src/slicegpt/hf_utils.py @@ -202,7 +202,7 @@ def save_sliced_model( config_path="phi_config", sparsity=sparsity, new_hidden_size=new_hidden_size ) - sliced_model = SlicedPhiForCausalLM(config_to_save, scheduler).to(dtype) + sliced_model = SlicedPhiForCausalLM(config_to_save, sparsity, new_hidden_size, scheduler).to(dtype) sliced_model.load_state_dict(model.state_dict(), strict=True, assign=True) sliced_model.save_pretrained(save_sliced_model_dir) @@ 
-218,7 +218,7 @@ def save_sliced_model( new_hidden_size=new_hidden_size, ) - sliced_model = SlicedLlamaForCausalLM(config_to_save, scheduler).to(dtype) + sliced_model = SlicedLlamaForCausalLM(config_to_save, sparsity, new_hidden_size, scheduler).to(dtype) sliced_model.load_state_dict(model.state_dict(), strict=True, assign=True) sliced_model.save_pretrained(save_sliced_model_dir) else: diff --git a/src/slicegpt/rotate.py b/src/slicegpt/rotate.py index 75b830c0..01e3b201 100644 --- a/src/slicegpt/rotate.py +++ b/src/slicegpt/rotate.py @@ -165,7 +165,7 @@ def rotate_and_slice_sequential( layers = model_adapter.get_layers() slicing_scheduler.setup( hidden_size=model_adapter.hidden_size, - intermediate_size=model_adapter.model.intermediate_size, + intermediate_size=model_adapter.intermediate_size, layers_num=len(layers), parallel_blocks=True, ) diff --git a/tests/test_model_adapter.py b/tests/test_model_adapter.py index c4379e4a..b4514e17 100644 --- a/tests/test_model_adapter.py +++ b/tests/test_model_adapter.py @@ -102,6 +102,7 @@ def create_adapter(self) -> OPTModelAdapter: config = OPTConfig( vocab_size=32, hidden_size=8, + intermediate_size=32, num_hidden_layers=2, ffn_dim=32, max_position_embeddings=16, diff --git a/tests/test_slicing.py b/tests/test_slicing.py index 492a689a..281eb0db 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -51,7 +51,7 @@ def test_HF_model(): config_path="phi_config", sparsity=sparsity, new_hidden_size=new_hidden_size ) - sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) + sliced_model = SlicedPhiForCausalLM(config, sparsity, new_hidden_size).to(torch.float16) sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) # The sliced model weights should be identical to the HF model weights after layer norm fusion @@ -69,7 +69,7 @@ def test_HF_model(): sliced_ppl = gpu_utils.evaluate_ppl(model_adapter.model.to("cuda"), tokenizer.pad_token_id, test_loader) - sliced_model = 
SlicedPhiForCausalLM(config, scheduler).to(torch.float16) + sliced_model = SlicedPhiForCausalLM(config, sparsity, new_hidden_size, scheduler).to(torch.float16) sliced_model = sliced_model.to(torch.float16) sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) @@ -91,7 +91,7 @@ def test_save_and_load_HF_model(): config = SlicedPhi2Config.from_pretrained(config_name, sparsity, new_hidden_size) - sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) + sliced_model = SlicedPhiForCausalLM(config, sparsity, new_hidden_size).to(torch.float16) sliced_model.save_pretrained(model_name) sliced_model = SlicedPhiForCausalLM.from_pretrained( model_name, scheduler=None, config_path=config_name, sparsity=sparsity, new_hidden_size=new_hidden_size From 20d1ef7207a6e4fb13d99fa1c8fecc4feedb2a8c Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Tue, 23 Apr 2024 18:30:54 +0000 Subject: [PATCH 24/33] Fix config inputs --- src/slicegpt/adapters/sliced_llama.py | 9 +++++---- src/slicegpt/adapters/sliced_phi.py | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/slicegpt/adapters/sliced_llama.py b/src/slicegpt/adapters/sliced_llama.py index 8a0474ef..58ac941a 100644 --- a/src/slicegpt/adapters/sliced_llama.py +++ b/src/slicegpt/adapters/sliced_llama.py @@ -13,14 +13,15 @@ class SlicedLlamaConfig(LlamaConfig): model_type = "sliced_llama" is_composition = True - def __init__(self, sparsity: float, new_hidden_size: int, **kwargs) -> None: - self.sparsity = sparsity - self.new_hidden_size = new_hidden_size + def __init__(self, **kwargs) -> None: + self.sparsity = kwargs.pop("sparsity", None) + self.new_hidden_size = kwargs.pop("new_hidden_size", None) super().__init__(**kwargs) @classmethod def from_pretrained(cls, config_path: str, sparsity: float, new_hidden_size: int) -> PretrainedConfig: - return super().from_pretrained(config_path, sparsity, new_hidden_size) + kwargs = {"sparsity": sparsity, "new_hidden_size": 
new_hidden_size} + return super().from_pretrained(config_path, **kwargs) class SlicedLlama(LlamaModel): diff --git a/src/slicegpt/adapters/sliced_phi.py b/src/slicegpt/adapters/sliced_phi.py index 567d5b82..25dfacf3 100644 --- a/src/slicegpt/adapters/sliced_phi.py +++ b/src/slicegpt/adapters/sliced_phi.py @@ -13,14 +13,15 @@ class SlicedPhi2Config(PhiConfig): model_type = "sliced_phi2" is_composition = True - def __init__(self, sparsity: float, new_hidden_size: int, **kwargs) -> None: - self.sparsity = sparsity - self.new_hidden_size = new_hidden_size + def __init__(self, **kwargs) -> None: + self.sparsity = kwargs.pop("sparsity", None) + self.new_hidden_size = kwargs.pop("new_hidden_size", None) super().__init__(**kwargs) @classmethod def from_pretrained(cls, config_path: str, sparsity: float, new_hidden_size: int) -> PretrainedConfig: - return super().from_pretrained(config_path, sparsity, new_hidden_size) + kwargs = {"sparsity": sparsity, "new_hidden_size": new_hidden_size} + return super().from_pretrained(config_path, **kwargs) class SlicedPhi(PhiModel): From 6c1f0df10ea3bb489b53750d2c4dc42814c6a2cf Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Tue, 23 Apr 2024 18:44:11 +0000 Subject: [PATCH 25/33] Fix slicing tests --- src/slicegpt/adapters/sliced_llama.py | 2 -- src/slicegpt/adapters/sliced_phi.py | 2 -- tests/test_slicing.py | 6 ++++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/slicegpt/adapters/sliced_llama.py b/src/slicegpt/adapters/sliced_llama.py index 58ac941a..d55a4d67 100644 --- a/src/slicegpt/adapters/sliced_llama.py +++ b/src/slicegpt/adapters/sliced_llama.py @@ -41,8 +41,6 @@ class SlicedLlamaForCausalLM(LlamaForCausalLM): def __init__( self, config, - sparsity: float, - new_hidden_size: int, scheduler: SlicingScheduler | None = None, *model_args, **kwargs, diff --git a/src/slicegpt/adapters/sliced_phi.py b/src/slicegpt/adapters/sliced_phi.py index 25dfacf3..94b66644 100644 --- a/src/slicegpt/adapters/sliced_phi.py +++ 
b/src/slicegpt/adapters/sliced_phi.py @@ -41,8 +41,6 @@ class SlicedPhiForCausalLM(PhiForCausalLM): def __init__( self, config, - sparsity: float, - new_hidden_size: int, scheduler: SlicingScheduler | None = None, *model_args, **kwargs, diff --git a/tests/test_slicing.py b/tests/test_slicing.py index 281eb0db..71ea7069 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -86,7 +86,9 @@ def test_save_and_load_HF_model(): config_name = "sliced_model_config" model_name = "sliced_model" - config = SlicedPhi2Config(sparsity, new_hidden_size) + kwargs = {"sparsity": sparsity, "new_hidden_size": new_hidden_size} + + config = SlicedPhi2Config(**kwargs) config.save_pretrained(config_name) config = SlicedPhi2Config.from_pretrained(config_name, sparsity, new_hidden_size) @@ -110,4 +112,4 @@ def compare_weights(model1, model2): if __name__ == "__main__": - test_HF_model() + test_save_and_load_HF_model() From 9821f0887610f270924e7c1994efc9ee2c3e9724 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Tue, 23 Apr 2024 19:28:39 +0000 Subject: [PATCH 26/33] Fix model saving --- src/slicegpt/hf_utils.py | 4 ++-- tests/test_slicing.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/slicegpt/hf_utils.py b/src/slicegpt/hf_utils.py index 96a87d51..7f03de27 100755 --- a/src/slicegpt/hf_utils.py +++ b/src/slicegpt/hf_utils.py @@ -202,7 +202,7 @@ def save_sliced_model( config_path="phi_config", sparsity=sparsity, new_hidden_size=new_hidden_size ) - sliced_model = SlicedPhiForCausalLM(config_to_save, sparsity, new_hidden_size, scheduler).to(dtype) + sliced_model = SlicedPhiForCausalLM(config_to_save, scheduler).to(dtype) sliced_model.load_state_dict(model.state_dict(), strict=True, assign=True) sliced_model.save_pretrained(save_sliced_model_dir) @@ -218,7 +218,7 @@ def save_sliced_model( new_hidden_size=new_hidden_size, ) - sliced_model = SlicedLlamaForCausalLM(config_to_save, sparsity, new_hidden_size, scheduler).to(dtype) + sliced_model =
SlicedLlamaForCausalLM(config_to_save, scheduler).to(dtype) sliced_model.load_state_dict(model.state_dict(), strict=True, assign=True) sliced_model.save_pretrained(save_sliced_model_dir) else: diff --git a/tests/test_slicing.py b/tests/test_slicing.py index 71ea7069..56e7a62b 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -51,7 +51,7 @@ def test_HF_model(): config_path="phi_config", sparsity=sparsity, new_hidden_size=new_hidden_size ) - sliced_model = SlicedPhiForCausalLM(config, sparsity, new_hidden_size).to(torch.float16) + sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) # The sliced model weights should be identical to the HF model weights after layer norm fusion @@ -69,7 +69,7 @@ def test_HF_model(): sliced_ppl = gpu_utils.evaluate_ppl(model_adapter.model.to("cuda"), tokenizer.pad_token_id, test_loader) - sliced_model = SlicedPhiForCausalLM(config, sparsity, new_hidden_size, scheduler).to(torch.float16) + sliced_model = SlicedPhiForCausalLM(config, scheduler).to(torch.float16) sliced_model = sliced_model.to(torch.float16) sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) @@ -93,7 +93,7 @@ def test_save_and_load_HF_model(): config = SlicedPhi2Config.from_pretrained(config_name, sparsity, new_hidden_size) - sliced_model = SlicedPhiForCausalLM(config, sparsity, new_hidden_size).to(torch.float16) + sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) sliced_model.save_pretrained(model_name) sliced_model = SlicedPhiForCausalLM.from_pretrained( model_name, scheduler=None, config_path=config_name, sparsity=sparsity, new_hidden_size=new_hidden_size From 322016c6abbe5602661f5d6c78ebc56b51f9b82c Mon Sep 17 00:00:00 2001 From: Pashmina Cameron Date: Wed, 24 Apr 2024 10:50:02 +0100 Subject: [PATCH 27/33] Use ffn_dim in OPT. Don't set intermediate_size in OPTConfig.
--- src/slicegpt/adapters/opt_adapter.py | 4 ++-- src/slicegpt/adapters/sliced_llama.py | 2 +- src/slicegpt/adapters/sliced_phi.py | 2 +- tests/test_model_adapter.py | 1 - tests/test_slicing.py | 4 ++-- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/slicegpt/adapters/opt_adapter.py b/src/slicegpt/adapters/opt_adapter.py index 5ae8ff38..18decc70 100644 --- a/src/slicegpt/adapters/opt_adapter.py +++ b/src/slicegpt/adapters/opt_adapter.py @@ -176,10 +176,10 @@ def seqlen(self) -> int: @property def hidden_size(self) -> int: return self.config.hidden_size - + @property def intermediate_size(self) -> int: - return self.config.intermediate_size + return self.config.ffn_dim @property def should_bake_mean_into_linear(self) -> bool: diff --git a/src/slicegpt/adapters/sliced_llama.py b/src/slicegpt/adapters/sliced_llama.py index d55a4d67..020807b9 100644 --- a/src/slicegpt/adapters/sliced_llama.py +++ b/src/slicegpt/adapters/sliced_llama.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn -from transformers.models.llama.modeling_llama import LlamaConfig, LlamaForCausalLM, LlamaModel from transformers.configuration_utils import PretrainedConfig +from transformers.models.llama.modeling_llama import LlamaConfig, LlamaForCausalLM, LlamaModel from slicegpt.adapters.llama_adapter import CompressedLlamaDecoderLayer, LlamaModelAdapter from slicegpt.modules import RMSN diff --git a/src/slicegpt/adapters/sliced_phi.py b/src/slicegpt/adapters/sliced_phi.py index 94b66644..27066e72 100644 --- a/src/slicegpt/adapters/sliced_phi.py +++ b/src/slicegpt/adapters/sliced_phi.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn -from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel from transformers.configuration_utils import PretrainedConfig +from transformers.models.phi.modeling_phi import PhiConfig, PhiForCausalLM, PhiModel from slicegpt.adapters.phi2_adapter import CompressedPhiDecoderLayer, Phi2ModelAdapter from slicegpt.modules import 
RMSN diff --git a/tests/test_model_adapter.py b/tests/test_model_adapter.py index b4514e17..c4379e4a 100644 --- a/tests/test_model_adapter.py +++ b/tests/test_model_adapter.py @@ -102,7 +102,6 @@ def create_adapter(self) -> OPTModelAdapter: config = OPTConfig( vocab_size=32, hidden_size=8, - intermediate_size=32, num_hidden_layers=2, ffn_dim=32, max_position_embeddings=16, diff --git a/tests/test_slicing.py b/tests/test_slicing.py index 56e7a62b..6d6639ae 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -69,7 +69,7 @@ def test_HF_model(): sliced_ppl = gpu_utils.evaluate_ppl(model_adapter.model.to("cuda"), tokenizer.pad_token_id, test_loader) - sliced_model = SlicedPhiForCausalLM(config, scheduler).to(torch.float16) + sliced_model = SlicedPhiForCausalLM(config, scheduler).to(torch.float16) sliced_model = sliced_model.to(torch.float16) sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) @@ -93,7 +93,7 @@ def test_save_and_load_HF_model(): config = SlicedPhi2Config.from_pretrained(config_name, sparsity, new_hidden_size) - sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) + sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) sliced_model.save_pretrained(model_name) sliced_model = SlicedPhiForCausalLM.from_pretrained( model_name, scheduler=None, config_path=config_name, sparsity=sparsity, new_hidden_size=new_hidden_size From d1e589996aaf543c37badf32f4a1a6ebc14a6050 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Wed, 24 Apr 2024 14:34:27 +0000 Subject: [PATCH 28/33] Remove unnecessary params and fix scheduler when model loading --- src/slicegpt/adapters/llama_adapter.py | 4 ---- src/slicegpt/adapters/phi2_adapter.py | 4 ---- src/slicegpt/adapters/sliced_llama.py | 3 ++- src/slicegpt/adapters/sliced_phi.py | 5 +++-- src/slicegpt/model_adapter.py | 11 ++--------- src/slicegpt/rotate.py | 3 --- src/slicegpt/slicing_scheduler.py | 3 +-- tests/test_model_adapter.py | 4 +--- tests/test_slicing.py 
| 7 +++++-- 9 files changed, 14 insertions(+), 30 deletions(-) diff --git a/src/slicegpt/adapters/llama_adapter.py b/src/slicegpt/adapters/llama_adapter.py index 5cf232b1..9cc4e028 100644 --- a/src/slicegpt/adapters/llama_adapter.py +++ b/src/slicegpt/adapters/llama_adapter.py @@ -160,10 +160,6 @@ def seqlen(self) -> int: def hidden_size(self) -> int: return self.config.hidden_size - @property - def intermediate_size(self) -> int: - return self.config.intermediate_size - @property def should_bake_mean_into_linear(self) -> bool: return False diff --git a/src/slicegpt/adapters/phi2_adapter.py b/src/slicegpt/adapters/phi2_adapter.py index 867fd78c..9c856ef0 100644 --- a/src/slicegpt/adapters/phi2_adapter.py +++ b/src/slicegpt/adapters/phi2_adapter.py @@ -157,10 +157,6 @@ def seqlen(self) -> int: def hidden_size(self) -> int: return self.config.hidden_size - @property - def intermediate_size(self) -> int: - return self.config.intermediate_size - @property def should_bake_mean_into_linear(self) -> bool: return True diff --git a/src/slicegpt/adapters/sliced_llama.py b/src/slicegpt/adapters/sliced_llama.py index d55a4d67..f3f245bf 100644 --- a/src/slicegpt/adapters/sliced_llama.py +++ b/src/slicegpt/adapters/sliced_llama.py @@ -65,7 +65,8 @@ def from_pretrained( ): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" config = SlicedLlamaConfig.from_pretrained(config_path, sparsity, new_hidden_size) - model = super().from_pretrained(pretrained_model_name_or_path, config=config) + kwargs = {"scheduler": scheduler} + model = super().from_pretrained(pretrained_model_name_or_path, config=config, **kwargs) model.load_state_dict(model.state_dict()) return model diff --git a/src/slicegpt/adapters/sliced_phi.py b/src/slicegpt/adapters/sliced_phi.py index 94b66644..6b6855e9 100644 --- a/src/slicegpt/adapters/sliced_phi.py +++ b/src/slicegpt/adapters/sliced_phi.py @@ -21,7 +21,7 @@ def __init__(self, **kwargs) -> None: @classmethod def 
from_pretrained(cls, config_path: str, sparsity: float, new_hidden_size: int) -> PretrainedConfig: kwargs = {"sparsity": sparsity, "new_hidden_size": new_hidden_size} - return super().from_pretrained(config_path, **kwargs) + return super().from_pretrained(config_path, local_files_only=True, **kwargs) class SlicedPhi(PhiModel): @@ -65,7 +65,8 @@ def from_pretrained( ): """Overrides the from_pretrained method to accept the scheduler and returns the sliced model""" config = SlicedPhi2Config.from_pretrained(config_path, sparsity, new_hidden_size) - model = super().from_pretrained(pretrained_model_name_or_path, config=config) + kwargs = {"scheduler": scheduler} + model = super().from_pretrained(pretrained_model_name_or_path, config=config, **kwargs) model.load_state_dict(model.state_dict()) return model diff --git a/src/slicegpt/model_adapter.py b/src/slicegpt/model_adapter.py index f2075fab..9ab0cb0f 100644 --- a/src/slicegpt/model_adapter.py +++ b/src/slicegpt/model_adapter.py @@ -113,7 +113,7 @@ class ModelAdapter(ABC): To implement a new model adapter, implement the interface defined in this class """ - def __init__(self): + def __init__(self) -> None: self.slicing_conf: SlicingConfig | None = None @property @@ -163,14 +163,7 @@ def hidden_size(self) -> int: The hidden size of the model """ raise NotImplementedError - - @property - @abstractmethod - def intermediate_size(self) -> int: - """ - The intermediate hidden size of MLP - """ - raise NotImplementedError + @property @abstractmethod diff --git a/src/slicegpt/rotate.py b/src/slicegpt/rotate.py index 01e3b201..256aefe6 100644 --- a/src/slicegpt/rotate.py +++ b/src/slicegpt/rotate.py @@ -165,7 +165,6 @@ def rotate_and_slice_sequential( layers = model_adapter.get_layers() slicing_scheduler.setup( hidden_size=model_adapter.hidden_size, - intermediate_size=model_adapter.intermediate_size, layers_num=len(layers), parallel_blocks=True, ) @@ -284,7 +283,6 @@ def rotate_and_slice_parallel( layers = 
model_adapter.get_layers() slicing_scheduler.setup( hidden_size=model_adapter.hidden_size, - intermediate_size=model_adapter.intermediate_size, layers_num=len(layers), parallel_blocks=True, ) @@ -447,7 +445,6 @@ def slice_rotated_model(model_adapter: ModelAdapter, slicing_scheduler: SlicingS slicing_scheduler = ConstSlicingScheduler(model_adapter.slicing_conf.const_dimension) slicing_scheduler.setup( hidden_size=model_adapter.hidden_size, - intermediate_size=model_adapter.intermediate_size, layers_num=len(layers), parallel_blocks=model_adapter.parallel_blocks, ) diff --git a/src/slicegpt/slicing_scheduler.py b/src/slicegpt/slicing_scheduler.py index a7089ee9..e06b78f0 100644 --- a/src/slicegpt/slicing_scheduler.py +++ b/src/slicegpt/slicing_scheduler.py @@ -40,10 +40,9 @@ def parallel_blocks(self) -> bool: """Return whether working with a parallel blocks models.""" return self.slicing_conf.parallel_blocks - def setup(self, *, hidden_size: int, intermediate_size: int, layers_num: int, parallel_blocks: bool) -> None: + def setup(self, *, hidden_size: int, layers_num: int, parallel_blocks: bool) -> None: """Set up the slicing scheduler with the given model parameters.""" self.slicing_conf.hidden_size = hidden_size - self.slicing_conf.intermediate_size = intermediate_size self.slicing_conf.layers_num = layers_num self.slicing_conf.parallel_blocks = parallel_blocks diff --git a/tests/test_model_adapter.py b/tests/test_model_adapter.py index b4514e17..f57ad1db 100644 --- a/tests/test_model_adapter.py +++ b/tests/test_model_adapter.py @@ -102,7 +102,6 @@ def create_adapter(self) -> OPTModelAdapter: config = OPTConfig( vocab_size=32, hidden_size=8, - intermediate_size=32, num_hidden_layers=2, ffn_dim=32, max_position_embeddings=16, @@ -117,7 +116,6 @@ def create_adapter(self) -> LlamaModelAdapter: config = LlamaConfig( vocab_size=32, hidden_size=8, - intermediate_size=32, num_hidden_layers=2, num_attention_heads=2, max_position_embeddings=16, @@ -130,7 +128,7 @@ class 
TestPhi2Adapter(ModelAdapterTestBase): def create_adapter(self) -> Phi2ModelAdapter: # a tiny phi, just to test adapter. config = PhiConfig( - vocab_size=32, hidden_size=8, intermediate_size=32, num_hidden_layers=2, num_attention_heads=2 + vocab_size=32, hidden_size=8, num_hidden_layers=2, num_attention_heads=2 ) model = PhiForCausalLM(config) return Phi2ModelAdapter(model) diff --git a/tests/test_slicing.py b/tests/test_slicing.py index 56e7a62b..31399a31 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -72,6 +72,7 @@ def test_HF_model(): sliced_model = SlicedPhiForCausalLM(config, scheduler).to(torch.float16) sliced_model = sliced_model.to(torch.float16) sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) + sliced_model.save_pretrained("sliced_phi2_model") new_model_ppl = gpu_utils.evaluate_ppl(sliced_model.to("cuda"), tokenizer.pad_token_id, test_loader) @@ -95,8 +96,10 @@ def test_save_and_load_HF_model(): sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) sliced_model.save_pretrained(model_name) + + scheduler = ConstSlicingScheduler(new_hidden_size) sliced_model = SlicedPhiForCausalLM.from_pretrained( - model_name, scheduler=None, config_path=config_name, sparsity=sparsity, new_hidden_size=new_hidden_size + model_name, scheduler=scheduler, config_path=config_name, sparsity=sparsity, new_hidden_size=new_hidden_size ) assert isinstance(sliced_model, SlicedPhiForCausalLM) @@ -112,4 +115,4 @@ def compare_weights(model1, model2): if __name__ == "__main__": - test_save_and_load_HF_model() + test_HF_model() From 16cb0822842356e8749c8df7976762196fe23666 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Wed, 24 Apr 2024 15:08:32 +0000 Subject: [PATCH 29/33] Fix tests --- src/slicegpt/adapters/opt_adapter.py | 9 +++++---- tests/test_slicing.py | 11 ++++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/slicegpt/adapters/opt_adapter.py b/src/slicegpt/adapters/opt_adapter.py index 
5ae8ff38..9d8ede61 100644 --- a/src/slicegpt/adapters/opt_adapter.py +++ b/src/slicegpt/adapters/opt_adapter.py @@ -6,6 +6,7 @@ # Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. from typing import cast +from slicegpt.modules import RMSN import torch from torch import FloatTensor, Tensor, matmul from torch.nn import LayerNorm, Linear, Module @@ -22,6 +23,10 @@ class CompressedOPTDecoderLayer(OPTDecoderLayer): but with the addition of a shortcut_Q attributes. We also support the input rotation and mean subtraction in this class (if needed). """ + def __init__(self, config: OPTConfig, replace_layernorm: bool = False): + super().__init__(config) + if replace_layernorm: + self.input_layernorm = RMSN(config.hidden_size) def forward( self, @@ -176,10 +181,6 @@ def seqlen(self) -> int: @property def hidden_size(self) -> int: return self.config.hidden_size - - @property - def intermediate_size(self) -> int: - return self.config.intermediate_size @property def should_bake_mean_into_linear(self) -> bool: diff --git a/tests/test_slicing.py b/tests/test_slicing.py index 31399a31..b59151af 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -78,7 +78,16 @@ def test_HF_model(): # The perplexity of the sliced model should be the same as the HF model assert sliced_ppl == new_model_ppl - + + # load the sliced model back + sliced_model = SlicedPhiForCausalLM.from_pretrained( + "sliced_phi2_model", scheduler=scheduler, config_path="sliced_phi2_model", sparsity=sparsity, new_hidden_size=new_hidden_size + ) + + assert sliced_model is not None + assert isinstance(sliced_model, SlicedPhiForCausalLM) + assert sliced_model.config.sparsity == sparsity + assert sliced_model.config.new_hidden_size == new_hidden_size def test_save_and_load_HF_model(): """Test HF model saving and loading""" From d2f4d2e43e57861a122355975a226b9b8e1674ef Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Wed, 24 Apr 2024 15:24:00 +0000 Subject: [PATCH 30/33] 
Update model loading --- experiments/run_slicegpt.py | 7 +++---- src/slicegpt/hf_utils.py | 30 ++++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/experiments/run_slicegpt.py b/experiments/run_slicegpt.py index 11e682f9..2c9af4e2 100755 --- a/experiments/run_slicegpt.py +++ b/experiments/run_slicegpt.py @@ -8,8 +8,8 @@ import shutil import torch -import wandb +import wandb from slicegpt import data_utils, gpu_utils, hf_utils, layernorm_fusion, rotate, utils from slicegpt.config import config from slicegpt.slicing_scheduler import ConstSlicingScheduler @@ -137,7 +137,7 @@ def slicing_main(args: argparse.Namespace) -> None: if args.sliced_model_path: # load the model from sliced_model_path to compute perplexity and skip rotation and slicing - model_adapter, tokenizer = hf_utils.load_sliced_model( + model, tokenizer = hf_utils.load_sliced_model( args.model, args.sliced_model_path, sparsity=args.sparsity, @@ -149,8 +149,7 @@ def slicing_main(args: argparse.Namespace) -> None: model_adapter, tokenizer = hf_utils.get_model_and_tokenizer( args.model, args.model_path, token=args.hf_token, dtype=config.dtype ) - - model = model_adapter.model + model = model_adapter.model def reset_model_device() -> None: if args.distribute_model: diff --git a/src/slicegpt/hf_utils.py b/src/slicegpt/hf_utils.py index 7f03de27..a8eb0453 100755 --- a/src/slicegpt/hf_utils.py +++ b/src/slicegpt/hf_utils.py @@ -15,7 +15,7 @@ from .layernorm_fusion import fuse_modules, replace_layers from .model_adapter import ModelAdapter, SlicingConfig from .rotate import slice_rotated_model -from .slicing_scheduler import SlicingScheduler +from .slicing_scheduler import ConstSlicingScheduler, SlicingScheduler def do_not_initialize(func): @@ -107,7 +107,7 @@ def get_model_and_tokenizer( model.eval() # This switches off dropout. 
model_adapter.use_cache = False - tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, token=token, local_files_only=local_model) + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=token, local_files_only=local_model) model_adapter.post_init(tokenizer) logging.info("Loading model done") @@ -115,7 +115,6 @@ def get_model_and_tokenizer( return model_adapter, tokenizer -@do_not_initialize def load_sliced_model( model_name: str, sliced_model_path: str, @@ -124,22 +123,37 @@ def load_sliced_model( lora_config: LoraConfig | None = None, sparsity: float | None = None, round_interval: int | None = 1, -) -> tuple[ModelAdapter, PreTrainedTokenizerBase]: +) -> tuple[ModelAdapter | torch.nn.Module, PreTrainedTokenizerBase]: """ Load the sliced model and the tokenizer from the given path. If lora_config is supplied as an arg then this function will return a PEFT model (post-slicing finetuned model). The corresponding model adapter class must be imported before calling this method. 
""" - my_model_suffix = pathlib.Path(model_name).name - my_sliced_model_name = f"{my_model_suffix}_{sparsity}.pt" - my_sliced_model_config = f"{my_model_suffix}_{sparsity}.json" - + model_adapter, tokenizer = get_model_and_tokenizer( model_name, model_path=sliced_model_path, uninitialized=True, token=token, ) + + # handle loading sliced HF compatible models + if model_name.startswith("microsoft") or model_name.startswith("llama"): + new_embedding_dimension = int((1 - sparsity) * model_adapter.hidden_size) + new_embedding_dimension -= new_embedding_dimension % round_interval + + scheduler = ConstSlicingScheduler(new_embedding_dimension) + + sliced_model = SlicedPhiForCausalLM.from_pretrained( + sliced_model_path, scheduler=scheduler, config_path=sliced_model_path, sparsity=sparsity, new_hidden_size=new_embedding_dimension + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=token, local_files_only=True) + return sliced_model, tokenizer + + my_model_suffix = pathlib.Path(model_name).name + my_sliced_model_name = f"{my_model_suffix}_{sparsity}.pt" + my_sliced_model_config = f"{my_model_suffix}_{sparsity}.json" replace_layers(model_adapter) fuse_modules(model_adapter) From 981615bca97778a8a2840b81fe8aa23b8214c33d Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Wed, 24 Apr 2024 20:54:11 +0000 Subject: [PATCH 31/33] Update scheduler params when loading sliced model --- src/slicegpt/hf_utils.py | 31 +++++++++++++++++++++++++------ src/slicegpt/model_adapter.py | 2 -- tests/test_model_adapter.py | 4 +--- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/slicegpt/hf_utils.py b/src/slicegpt/hf_utils.py index a8eb0453..91c8a4f9 100755 --- a/src/slicegpt/hf_utils.py +++ b/src/slicegpt/hf_utils.py @@ -129,28 +129,39 @@ def load_sliced_model( function will return a PEFT model (post-slicing finetuned model). The corresponding model adapter class must be imported before calling this method. 
""" - + model_adapter, tokenizer = get_model_and_tokenizer( model_name, model_path=sliced_model_path, uninitialized=True, token=token, ) - + # handle loading sliced HF compatible models if model_name.startswith("microsoft") or model_name.startswith("llama"): new_embedding_dimension = int((1 - sparsity) * model_adapter.hidden_size) new_embedding_dimension -= new_embedding_dimension % round_interval - + scheduler = ConstSlicingScheduler(new_embedding_dimension) + + layers = model_adapter.get_layers() + scheduler.setup( + hidden_size=model_adapter.hidden_size, + layers_num=len(layers), + parallel_blocks=True, + ) sliced_model = SlicedPhiForCausalLM.from_pretrained( - sliced_model_path, scheduler=scheduler, config_path=sliced_model_path, sparsity=sparsity, new_hidden_size=new_embedding_dimension + sliced_model_path, + scheduler=scheduler, + config_path=sliced_model_path, + sparsity=sparsity, + new_hidden_size=new_embedding_dimension, ) - + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=token, local_files_only=True) return sliced_model, tokenizer - + my_model_suffix = pathlib.Path(model_name).name my_sliced_model_name = f"{my_model_suffix}_{sparsity}.pt" my_sliced_model_config = f"{my_model_suffix}_{sparsity}.json" @@ -219,6 +230,14 @@ def save_sliced_model( sliced_model = SlicedPhiForCausalLM(config_to_save, scheduler).to(dtype) sliced_model.load_state_dict(model.state_dict(), strict=True, assign=True) sliced_model.save_pretrained(save_sliced_model_dir) + + sliced_model = SlicedPhiForCausalLM.from_pretrained( + save_sliced_model_dir, + scheduler=scheduler, + config_path=save_sliced_model_dir, + sparsity=sparsity, + new_hidden_size=new_hidden_size, + ) elif "meta-llama" in model_name: config = LlamaConfig.from_pretrained( diff --git a/src/slicegpt/model_adapter.py b/src/slicegpt/model_adapter.py index 9ab0cb0f..6c4052e6 100644 --- a/src/slicegpt/model_adapter.py +++ b/src/slicegpt/model_adapter.py @@ -163,7 +163,6 @@ def hidden_size(self) -> 
int: The hidden size of the model """ raise NotImplementedError - @property @abstractmethod @@ -435,7 +434,6 @@ class SlicingConfig: """Slicing configuration such as individual layer dimensions and whether to slice head.""" hidden_size: int = 0 - intermediate_size: int = 0 layers_num: int = 0 do_slice_head: bool = False parallel_blocks: bool = False diff --git a/tests/test_model_adapter.py b/tests/test_model_adapter.py index f57ad1db..645a7997 100644 --- a/tests/test_model_adapter.py +++ b/tests/test_model_adapter.py @@ -127,8 +127,6 @@ def create_adapter(self) -> LlamaModelAdapter: class TestPhi2Adapter(ModelAdapterTestBase): def create_adapter(self) -> Phi2ModelAdapter: # a tiny phi, just to test adapter. - config = PhiConfig( - vocab_size=32, hidden_size=8, num_hidden_layers=2, num_attention_heads=2 - ) + config = PhiConfig(vocab_size=32, hidden_size=8, num_hidden_layers=2, num_attention_heads=2) model = PhiForCausalLM(config) return Phi2ModelAdapter(model) From 84d775af665ae66822a35f14786fc6016f0fe18a Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Wed, 24 Apr 2024 20:56:39 +0000 Subject: [PATCH 32/33] Update test with model loading --- tests/test_slicing.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/test_slicing.py b/tests/test_slicing.py index 0919b838..cf908134 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -78,16 +78,25 @@ def test_HF_model(): # The perplexity of the sliced model should be the same as the HF model assert sliced_ppl == new_model_ppl - + # load the sliced model back sliced_model = SlicedPhiForCausalLM.from_pretrained( - "sliced_phi2_model", scheduler=scheduler, config_path="sliced_phi2_model", sparsity=sparsity, new_hidden_size=new_hidden_size + "sliced_phi2_model", + scheduler=scheduler, + config_path="sliced_phi2_model", + sparsity=sparsity, + new_hidden_size=new_hidden_size, ) - + sliced_model = sliced_model.to(torch.float16) + assert sliced_model is not None assert 
isinstance(sliced_model, SlicedPhiForCausalLM) assert sliced_model.config.sparsity == sparsity assert sliced_model.config.new_hidden_size == new_hidden_size + + loaded_model_ppl = gpu_utils.evaluate_ppl(sliced_model.to("cuda"), tokenizer.pad_token_id, test_loader) + assert loaded_model_ppl == new_model_ppl + def test_save_and_load_HF_model(): """Test HF model saving and loading""" @@ -105,10 +114,10 @@ def test_save_and_load_HF_model(): sliced_model = SlicedPhiForCausalLM(config).to(torch.float16) sliced_model.save_pretrained(model_name) - + scheduler = ConstSlicingScheduler(new_hidden_size) sliced_model = SlicedPhiForCausalLM.from_pretrained( - model_name, scheduler=scheduler, config_path=config_name, sparsity=sparsity, new_hidden_size=new_hidden_size + model_name, scheduler=scheduler, config_path=model_name, sparsity=sparsity, new_hidden_size=new_hidden_size ) assert isinstance(sliced_model, SlicedPhiForCausalLM) From e6787174a055b90bfaeb06ce85660acf6102c344 Mon Sep 17 00:00:00 2001 From: Liana Mikaelyan Date: Thu, 25 Apr 2024 15:10:23 +0000 Subject: [PATCH 33/33] Update lm eval and fix rotate --- experiments/run_lm_eval.py | 16 +++++++++++----- experiments/run_slicegpt.py | 1 + src/slicegpt/adapters/opt_adapter.py | 3 ++- src/slicegpt/hf_utils.py | 10 +--------- src/slicegpt/rotate.py | 2 +- tests/test_slicing.py | 10 +++------- 6 files changed, 19 insertions(+), 23 deletions(-) diff --git a/experiments/run_lm_eval.py b/experiments/run_lm_eval.py index 15b2f9f3..0e769f6f 100644 --- a/experiments/run_lm_eval.py +++ b/experiments/run_lm_eval.py @@ -7,14 +7,15 @@ import os import lm_eval +from slicegpt.model_adapter import ModelAdapter import torch -import wandb from lm_eval import tasks from lm_eval import utils as lm_eval_utils from lm_eval.api.registry import ALL_TASKS from lm_eval.models.huggingface import HFLM from lm_eval.tasks import initialize_tasks +import wandb from slicegpt import gpu_utils, hf_utils, utils from slicegpt.config import config @@ -120,29 
+121,34 @@ def eval_main(args: argparse.Namespace) -> None: if args.sliced_model_path: # load the sliced model logging.info(f"Loading sliced {args.model} model from {args.sliced_model_path} with sparsity {args.sparsity}") - model_adapter, tokenizer = hf_utils.load_sliced_model( + model, tokenizer = hf_utils.load_sliced_model( args.model, args.sliced_model_path, sparsity=args.sparsity, token=args.hf_token, round_interval=args.round_interval, ) + if isinstance(model, ModelAdapter): + model = model.model + else: + model = model.to(config.dtype) else: # load the original model logging.info(f"Loading {args.model} model") model_adapter, tokenizer = hf_utils.get_model_and_tokenizer(args.model, args.model_path, token=args.hf_token) + model = model_adapter.model # the lm eval harness ties the weights, but this should not be done for sliced models unless the lm_head was sliced - model_adapter.model.tie_weights = lambda: None + model.model.tie_weights = lambda: None if args.distribute_model: # distribute model across available GPUs gpu_utils.distribute_model(model_adapter) else: - model_adapter.model.to(config.device) + model.to(config.device) ### LM Eval Harness ### - hflm = HFLM(pretrained=model_adapter.model, tokenizer=tokenizer, batch_size=args.batch_size) + hflm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=args.batch_size) if args.tasks is None: task_names = tasks.ALL_TASKS diff --git a/experiments/run_slicegpt.py b/experiments/run_slicegpt.py index 2c9af4e2..fcec8dff 100755 --- a/experiments/run_slicegpt.py +++ b/experiments/run_slicegpt.py @@ -144,6 +144,7 @@ def slicing_main(args: argparse.Namespace) -> None: round_interval=args.round_interval, token=args.hf_token, ) + model = model.to(config.dtype) else: # load one of the pre-trained models model_adapter, tokenizer = hf_utils.get_model_and_tokenizer( diff --git a/src/slicegpt/adapters/opt_adapter.py b/src/slicegpt/adapters/opt_adapter.py index 9d8ede61..2219e446 100644 --- 
a/src/slicegpt/adapters/opt_adapter.py +++ b/src/slicegpt/adapters/opt_adapter.py @@ -6,7 +6,6 @@ # Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. from typing import cast -from slicegpt.modules import RMSN import torch from torch import FloatTensor, Tensor, matmul from torch.nn import LayerNorm, Linear, Module @@ -15,6 +14,7 @@ from transformers.models.opt.modeling_opt import OPTConfig, OPTDecoderLayer, OPTForCausalLM from slicegpt.model_adapter import LayerAdapter, ModelAdapter +from slicegpt.modules import RMSN class CompressedOPTDecoderLayer(OPTDecoderLayer): @@ -23,6 +23,7 @@ class CompressedOPTDecoderLayer(OPTDecoderLayer): but with the addition of a shortcut_Q attributes. We also support the input rotation and mean subtraction in this class (if needed). """ + def __init__(self, config: OPTConfig, replace_layernorm: bool = False): super().__init__(config) if replace_layernorm: diff --git a/src/slicegpt/hf_utils.py b/src/slicegpt/hf_utils.py index 91c8a4f9..a4ce3d05 100755 --- a/src/slicegpt/hf_utils.py +++ b/src/slicegpt/hf_utils.py @@ -143,7 +143,7 @@ def load_sliced_model( new_embedding_dimension -= new_embedding_dimension % round_interval scheduler = ConstSlicingScheduler(new_embedding_dimension) - + layers = model_adapter.get_layers() scheduler.setup( hidden_size=model_adapter.hidden_size, @@ -230,14 +230,6 @@ def save_sliced_model( sliced_model = SlicedPhiForCausalLM(config_to_save, scheduler).to(dtype) sliced_model.load_state_dict(model.state_dict(), strict=True, assign=True) sliced_model.save_pretrained(save_sliced_model_dir) - - sliced_model = SlicedPhiForCausalLM.from_pretrained( - save_sliced_model_dir, - scheduler=scheduler, - config_path=save_sliced_model_dir, - sparsity=sparsity, - new_hidden_size=new_hidden_size, - ) elif "meta-llama" in model_name: config = LlamaConfig.from_pretrained( diff --git a/src/slicegpt/rotate.py b/src/slicegpt/rotate.py index 256aefe6..5d31953e 100644 --- a/src/slicegpt/rotate.py 
+++ b/src/slicegpt/rotate.py @@ -166,7 +166,7 @@ def rotate_and_slice_sequential( slicing_scheduler.setup( hidden_size=model_adapter.hidden_size, layers_num=len(layers), - parallel_blocks=True, + parallel_blocks=False, ) # rotate and slice embeddings diff --git a/tests/test_slicing.py b/tests/test_slicing.py index cf908134..d108469b 100644 --- a/tests/test_slicing.py +++ b/tests/test_slicing.py @@ -72,7 +72,7 @@ def test_HF_model(): sliced_model = SlicedPhiForCausalLM(config, scheduler).to(torch.float16) sliced_model = sliced_model.to(torch.float16) sliced_model.load_state_dict(model_adapter.model.state_dict(), strict=True, assign=True) - sliced_model.save_pretrained("sliced_phi2_model") + sliced_model.save_pretrained("sliced_phi2_model123") new_model_ppl = gpu_utils.evaluate_ppl(sliced_model.to("cuda"), tokenizer.pad_token_id, test_loader) @@ -93,7 +93,7 @@ def test_HF_model(): assert isinstance(sliced_model, SlicedPhiForCausalLM) assert sliced_model.config.sparsity == sparsity assert sliced_model.config.new_hidden_size == new_hidden_size - + loaded_model_ppl = gpu_utils.evaluate_ppl(sliced_model.to("cuda"), tokenizer.pad_token_id, test_loader) assert loaded_model_ppl == new_model_ppl @@ -129,8 +129,4 @@ def compare_weights(model1, model2): for p1, p2 in zip(model1.parameters(), model2.parameters()): if not torch.equal(p1.data, p2.data): return False - return True - - -if __name__ == "__main__": - test_HF_model() + return True \ No newline at end of file