From 35bf7fdca540d2c38572533a28744fba3f422afa Mon Sep 17 00:00:00 2001
From: Tarun Suresh
Date: Wed, 2 Apr 2025 21:55:43 +0000
Subject: [PATCH 1/3] enable support for device map auto

---
 README.md                 |  2 ++
 syncode/common.py         | 14 ++++++++++----
 syncode/infer.py          |  5 ++++-
 syncode/language_model.py |  2 +-
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index d6b8c232..d4e2f848 100644
--- a/README.md
+++ b/README.md
@@ -207,6 +207,7 @@ export HF_ACCESS_TOKEN="your_huggingface_api_key"
 
 - `task_id` (int, optional): Problem task id for selecting a problem from a Dataset.
 
+- `use_auto` (bool, optional): Use auto device mapping. Defaults to False.
 - `kwargs`(void, optional): Currently supported `kwargs` are `max_length`, `max_new_tokens`, `min_length`, `min_new_tokens`, `early_stopping`, `do_sample`, `num_beams`, `use_cache`, `temperature`, `top_k`, `top_p`, `num_return_sequences`, `pad_token_id`, and `eos_token_id`. Refer to the [HuggingFace Text Generation Documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation) for more information.
 
@@ -237,6 +238,7 @@ python3 syncode/infer.py
     --new_mask_store [True, False]
     --parser ["lr", "lalr"]
     --task_id [task_id]
+    --use_auto [True, False]
 ```
 
diff --git a/syncode/common.py b/syncode/common.py
index baf5370f..86d6b77d 100644
--- a/syncode/common.py
+++ b/syncode/common.py
@@ -11,16 +11,22 @@
 HF_ACCESS_TOKEN = os.environ['HF_ACCESS_TOKEN'] if 'HF_ACCESS_TOKEN' in os.environ else None
 
-def load_model(model_name, device, quantize):
+def load_model(model_name, device, quantize, use_auto = False):
     if model_name == 'test':
         model = AutoModelForCausalLM.from_pretrained('bigcode/tiny_starcoder_py').to(device)
     elif model_name == 'test-instruct':
         model = AutoModelForCausalLM.from_pretrained("rahuldshetty/tiny-starcoder-instruct")
     else:
-        if (quantize):
-            model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
+        if use_auto:
+            if (quantize):
+                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = 'auto').eval()
+            else:
+                model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = 'auto').eval()
         else:
-            model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
+            if (quantize):
+                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
+            else:
+                model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
     return model
 
 def load_tokenizer(model_name):
diff --git a/syncode/infer.py b/syncode/infer.py
index 8bc68d0a..23f3a0c2 100644
--- a/syncode/infer.py
+++ b/syncode/infer.py
@@ -50,6 +50,7 @@ def __init__(
         parser: Literal["lr", "lalr"] = "lalr",
         seed: Optional[int] = None,
         opp: bool = True,
+        use_auto: bool = False,
         **kwargs
     ):
         # Check inputs
@@ -85,7 +86,7 @@ def __init__(
         self.grammar = Grammar(grammar) if self._is_grammar_mode() else None
 
         # Load model and tokenizer
-        model = common.load_model(self.model_name, device, quantize)
+        model = common.load_model(self.model_name, device, quantize, use_auto)
         tokenizer = common.load_tokenizer(self.model_name)
 
         # Initialize grammar decoder if needed
@@ -259,6 +260,7 @@ def main(
     parse_output_only: bool = True,
     prompt_type: str = 'original',
     format_tabs: bool = False,
+    use_auto: bool = False,
     **kwargs
 ):
     """Run Syncode with the specified configuration.
@@ -309,6 +311,7 @@ def main(
         seed=seed,
         opp=opp,
         parse_output_only=parse_output_only,
+        use_auto=use_auto,
         **kwargs
     )
 
diff --git a/syncode/language_model.py b/syncode/language_model.py
index 4b1607c2..2db3c11d 100644
--- a/syncode/language_model.py
+++ b/syncode/language_model.py
@@ -50,7 +50,7 @@ def __init__(
         self.prompt_template = prompt_template
         self.model: PreTrainedModel = model
         self.tokenizer = tokenizer
-        self.device = device
+        self.device = self.model.device
         self.best_of = best_of
         self._before_prediction_hook = before_prediction_hook
         self.logits_processor = grammar_decoder
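For context, a minimal sketch of the two loading paths this first patch introduces (the model name is a stand-in; `device_map='auto'` requires the `accelerate` package to be installed). With auto device mapping, weight placement is delegated to Accelerate, which is why `.to(device)` is dropped on that path: moving an already-dispatched model conflicts with its device map. The same reasoning motivates the `language_model.py` change, which reads the device back from the loaded model rather than trusting the `device` argument.

```python
# Sketch only: illustrates the two paths in load_model() after PATCH 1/3.
# "bigcode/tiny_starcoder_py" is a placeholder model name.
import torch
from transformers import AutoModelForCausalLM

model_name = "bigcode/tiny_starcoder_py"

# use_auto=True: Accelerate places (and, if needed, shards) the weights
# across available devices; no explicit .to(device) call.
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
).eval()

# use_auto=False: the original single-device behavior.
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16
).eval().to("cuda")

# Works on both paths: reports where the model's weights landed,
# which is what downstream code needs to place its input tensors.
print(model.device)
```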
From eb9a4a77136fe12151bb926287ec66acaf4c5343 Mon Sep 17 00:00:00 2001
From: Tarun Suresh
Date: Wed, 2 Apr 2025 22:00:52 +0000
Subject: [PATCH 2/3] enable support for device map auto

---
 README.md         | 4 ++--
 syncode/common.py | 4 ++--
 syncode/infer.py  | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index d4e2f848..67bc63ff 100644
--- a/README.md
+++ b/README.md
@@ -207,7 +207,7 @@ export HF_ACCESS_TOKEN="your_huggingface_api_key"
 
 - `task_id` (int, optional): Problem task id for selecting a problem from a Dataset.
 
-- `use_auto` (bool, optional): Use auto device mapping. Defaults to False.
+- `use_auto_device_mapping` (bool, optional): Use auto device mapping. Defaults to False.
 - `kwargs`(void, optional): Currently supported `kwargs` are `max_length`, `max_new_tokens`, `min_length`, `min_new_tokens`, `early_stopping`, `do_sample`, `num_beams`, `use_cache`, `temperature`, `top_k`, `top_p`, `num_return_sequences`, `pad_token_id`, and `eos_token_id`. Refer to the [HuggingFace Text Generation Documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation) for more information.
 
@@ -238,7 +238,7 @@ python3 syncode/infer.py
     --new_mask_store [True, False]
     --parser ["lr", "lalr"]
     --task_id [task_id]
-    --use_auto [True, False]
+    --use_auto_device_mapping [True, False]
 ```
 
diff --git a/syncode/common.py b/syncode/common.py
index 86d6b77d..5522a1fd 100644
--- a/syncode/common.py
+++ b/syncode/common.py
@@ -11,13 +11,13 @@
 HF_ACCESS_TOKEN = os.environ['HF_ACCESS_TOKEN'] if 'HF_ACCESS_TOKEN' in os.environ else None
 
-def load_model(model_name, device, quantize, use_auto = False):
+def load_model(model_name, device, quantize, use_auto_device_mapping = False):
     if model_name == 'test':
         model = AutoModelForCausalLM.from_pretrained('bigcode/tiny_starcoder_py').to(device)
     elif model_name == 'test-instruct':
         model = AutoModelForCausalLM.from_pretrained("rahuldshetty/tiny-starcoder-instruct")
     else:
-        if use_auto:
+        if use_auto_device_mapping:
             if (quantize):
                 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = 'auto').eval()
             else:
diff --git a/syncode/infer.py b/syncode/infer.py
index 23f3a0c2..f44a8693 100644
--- a/syncode/infer.py
+++ b/syncode/infer.py
@@ -50,7 +50,7 @@ def __init__(
         parser: Literal["lr", "lalr"] = "lalr",
         seed: Optional[int] = None,
         opp: bool = True,
-        use_auto: bool = False,
+        use_auto_device_mapping: bool = False,
         **kwargs
     ):
         # Check inputs
@@ -86,7 +86,7 @@ def __init__(
         self.grammar = Grammar(grammar) if self._is_grammar_mode() else None
 
         # Load model and tokenizer
-        model = common.load_model(self.model_name, device, quantize, use_auto)
+        model = common.load_model(self.model_name, device, quantize, use_auto_device_mapping)
         tokenizer = common.load_tokenizer(self.model_name)
 
         # Initialize grammar decoder if needed
@@ -260,7 +260,7 @@ def main(
     parse_output_only: bool = True,
     prompt_type: str = 'original',
     format_tabs: bool = False,
-    use_auto: bool = False,
+    use_auto_device_mapping: bool = False,
     **kwargs
 ):
     """Run Syncode with the specified configuration.
@@ -311,7 +311,7 @@ def main(
         seed=seed,
         opp=opp,
         parse_output_only=parse_output_only,
-        use_auto=use_auto,
+        use_auto_device_mapping=use_auto_device_mapping,
         **kwargs
     )
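The second patch is a pure rename (`use_auto` becomes `use_auto_device_mapping`); behavior is unchanged. A hypothetical CLI invocation with the renamed flag, using only options that appear in the README snippet above (other arguments elided; `0` is a placeholder task id, and the flag is renamed again in the next patch):

```
python3 syncode/infer.py --parser "lalr" --task_id 0 --use_auto_device_mapping True
```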
From 0243367f9574af87c1194b2ae69d2af775461040 Mon Sep 17 00:00:00 2001
From: Tarun Suresh
Date: Wed, 2 Apr 2025 22:02:37 +0000
Subject: [PATCH 3/3] enable support for device map auto

---
 README.md         |  4 ++--
 syncode/common.py |  8 ++++----
 syncode/infer.py  |  8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 67bc63ff..44cc50d6 100644
--- a/README.md
+++ b/README.md
@@ -207,7 +207,7 @@ export HF_ACCESS_TOKEN="your_huggingface_api_key"
 
 - `task_id` (int, optional): Problem task id for selecting a problem from a Dataset.
 
-- `use_auto_device_mapping` (bool, optional): Use auto device mapping. Defaults to False.
+- `device_map` (str, optional): Device map for the model. Defaults to None.
 - `kwargs`(void, optional): Currently supported `kwargs` are `max_length`, `max_new_tokens`, `min_length`, `min_new_tokens`, `early_stopping`, `do_sample`, `num_beams`, `use_cache`, `temperature`, `top_k`, `top_p`, `num_return_sequences`, `pad_token_id`, and `eos_token_id`. Refer to the [HuggingFace Text Generation Documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation) for more information.
 
@@ -238,7 +238,7 @@ python3 syncode/infer.py
     --new_mask_store [True, False]
     --parser ["lr", "lalr"]
     --task_id [task_id]
-    --use_auto_device_mapping [True, False]
+    --device_map [device_map]
 ```
 
diff --git a/syncode/common.py b/syncode/common.py
index 5522a1fd..cbf06662 100644
--- a/syncode/common.py
+++ b/syncode/common.py
@@ -11,17 +11,17 @@
 HF_ACCESS_TOKEN = os.environ['HF_ACCESS_TOKEN'] if 'HF_ACCESS_TOKEN' in os.environ else None
 
-def load_model(model_name, device, quantize, use_auto_device_mapping = False):
+def load_model(model_name, device, quantize, device_map = None):
     if model_name == 'test':
         model = AutoModelForCausalLM.from_pretrained('bigcode/tiny_starcoder_py').to(device)
     elif model_name == 'test-instruct':
         model = AutoModelForCausalLM.from_pretrained("rahuldshetty/tiny-starcoder-instruct")
     else:
-        if use_auto_device_mapping:
+        if device_map is not None:
             if (quantize):
-                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = 'auto').eval()
+                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = device_map).eval()
             else:
-                model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = 'auto').eval()
+                model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = device_map).eval()
         else:
             if (quantize):
                 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
diff --git a/syncode/infer.py b/syncode/infer.py
index f44a8693..deaa0bac 100644
--- a/syncode/infer.py
+++ b/syncode/infer.py
@@ -50,7 +50,7 @@ def __init__(
         parser: Literal["lr", "lalr"] = "lalr",
         seed: Optional[int] = None,
         opp: bool = True,
-        use_auto_device_mapping: bool = False,
+        device_map: Optional[str] = None,
         **kwargs
     ):
         # Check inputs
@@ -86,7 +86,7 @@ def __init__(
         self.grammar = Grammar(grammar) if self._is_grammar_mode() else None
 
         # Load model and tokenizer
-        model = common.load_model(self.model_name, device, quantize, use_auto_device_mapping)
+        model = common.load_model(self.model_name, device, quantize, device_map)
         tokenizer = common.load_tokenizer(self.model_name)
 
         # Initialize grammar decoder if needed
@@ -260,7 +260,7 @@ def main(
     parse_output_only: bool = True,
     prompt_type: str = 'original',
     format_tabs: bool = False,
-    use_auto_device_mapping: bool = False,
+    device_map: Optional[str] = None,
     **kwargs
 ):
     """Run Syncode with the specified configuration.
@@ -311,7 +311,7 @@ def main(
         seed=seed,
         opp=opp,
         parse_output_only=parse_output_only,
-        use_auto_device_mapping=use_auto_device_mapping,
+        device_map=device_map,
         **kwargs
     )
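After the third patch, the knob is a general `device_map` string forwarded verbatim to `from_pretrained`, so any value Transformers accepts (`'auto'`, `'balanced'`, `'cuda:0'`, ...) should work, while `None` (the default) preserves the original `.to(device)` behavior. A hedged usage sketch: only `load_model`'s signature is taken from the patch; the model name, device values, and the `from syncode import common` import path are assumptions.

```python
# Sketch of the final API from PATCH 3/3 (placeholders noted inline).
from syncode import common  # assumed import path for syncode/common.py

# device_map="auto": Accelerate shards the model across visible devices.
model = common.load_model(
    "bigcode/tiny_starcoder_py",  # placeholder model_name
    device="cuda",                # unused when device_map is set
    quantize=True,                # loads in bfloat16, per the patch
    device_map="auto",
)

# device_map=None (the default) falls back to .eval().to(device).
cpu_model = common.load_model(
    "bigcode/tiny_starcoder_py", device="cpu", quantize=False
)
```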