From 35bf7fdca540d2c38572533a28744fba3f422afa Mon Sep 17 00:00:00 2001
From: Tarun Suresh
Date: Wed, 2 Apr 2025 21:55:43 +0000
Subject: [PATCH 1/3] enable support for device map auto

---
 README.md                 |  2 ++
 syncode/common.py         | 14 ++++++++++----
 syncode/infer.py          |  5 ++++-
 syncode/language_model.py |  2 +-
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index d6b8c232..d4e2f848 100644
--- a/README.md
+++ b/README.md
@@ -207,6 +207,7 @@ export HF_ACCESS_TOKEN="your_huggingface_api_key"
 
 - `task_id` (int, optional): Problem task id for selecting a problem from a Dataset.
 
+- `use_auto` (bool, optional): Use auto device mapping. Defaults to False.
 - `kwargs`(void, optional): Currently supported `kwargs` are `max_length`, `max_new_tokens`, `min_length`, `min_new_tokens`, `early_stopping`, `do_sample`, `num_beams`, `use_cache`, `temperature`, `top_k`, `top_p`, `num_return_sequences`, `pad_token_id`, and `eos_token_id`. Refer to the [HuggingFace Text Generation Documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation) for more information.
 
@@ -237,6 +238,7 @@ python3 syncode/infer.py
     --new_mask_store [True, False]
     --parser ["lr", "lalr"]
     --task_id [task_id]
+    --use_auto [True, False]
 ```
 
diff --git a/syncode/common.py b/syncode/common.py
index baf5370f..86d6b77d 100644
--- a/syncode/common.py
+++ b/syncode/common.py
@@ -11,16 +11,22 @@
 HF_ACCESS_TOKEN = os.environ['HF_ACCESS_TOKEN'] if 'HF_ACCESS_TOKEN' in os.environ else None
 
-def load_model(model_name, device, quantize):
+def load_model(model_name, device, quantize, use_auto = False):
     if model_name == 'test':
         model = AutoModelForCausalLM.from_pretrained('bigcode/tiny_starcoder_py').to(device)
     elif model_name == 'test-instruct':
         model = AutoModelForCausalLM.from_pretrained("rahuldshetty/tiny-starcoder-instruct")
     else:
-        if (quantize):
-            model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
+        if use_auto:
+            if (quantize):
+                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = 'auto').eval()
+            else:
+                model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = 'auto').eval()
         else:
-            model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
+            if (quantize):
+                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
+            else:
+                model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
     return model
 
 def load_tokenizer(model_name):
diff --git a/syncode/infer.py b/syncode/infer.py
index 8bc68d0a..23f3a0c2 100644
--- a/syncode/infer.py
+++ b/syncode/infer.py
@@ -50,6 +50,7 @@ def __init__(
         parser: Literal["lr", "lalr"] = "lalr",
         seed: Optional[int] = None,
         opp: bool = True,
+        use_auto: bool = False,
         **kwargs
     ):
         # Check inputs
@@ -85,7 +86,7 @@ def __init__(
         self.grammar = Grammar(grammar) if self._is_grammar_mode() else None
 
         # Load model and tokenizer
-        model = common.load_model(self.model_name, device, quantize)
+        model = common.load_model(self.model_name, device, quantize, use_auto)
         tokenizer = common.load_tokenizer(self.model_name)
 
         # Initialize grammar decoder if needed
@@ -259,6 +260,7 @@ def main(
     parse_output_only: bool = True,
     prompt_type: str = 'original',
     format_tabs: bool = False,
+    use_auto: bool = False,
     **kwargs
 ):
     """Run Syncode with the specified configuration.
@@ -309,6 +311,7 @@ def main(
         seed=seed,
         opp=opp,
         parse_output_only=parse_output_only,
+        use_auto=use_auto,
         **kwargs
     )
 
diff --git a/syncode/language_model.py b/syncode/language_model.py
index 4b1607c2..2db3c11d 100644
--- a/syncode/language_model.py
+++ b/syncode/language_model.py
@@ -50,7 +50,7 @@ def __init__(
         self.prompt_template = prompt_template
         self.model: PreTrainedModel = model
         self.tokenizer = tokenizer
-        self.device = device
+        self.device = self.model.device
         self.best_of = best_of
         self._before_prediction_hook = before_prediction_hook
         self.logits_processor = grammar_decoder
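For context, a minimal sketch of the two loading paths this first patch introduces (the model name is a stand-in; `device_map='auto'` requires the `accelerate` package to be installed). With auto device mapping, weight placement is delegated to Accelerate, which is why `.to(device)` is dropped on that path: moving an already-dispatched model conflicts with its device map. The same reasoning motivates the `language_model.py` change, which reads the device back from the loaded model rather than trusting the `device` argument.

```python
# Sketch only: illustrates the two paths in load_model() after PATCH 1/3.
# "bigcode/tiny_starcoder_py" is a placeholder model name.
import torch
from transformers import AutoModelForCausalLM

model_name = "bigcode/tiny_starcoder_py"

# use_auto=True: Accelerate places (and, if needed, shards) the weights
# across available devices; no explicit .to(device) call.
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
).eval()

# use_auto=False: the original single-device behavior.
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16
).eval().to("cuda")

# Works on both paths: reports where the model's weights landed,
# which is what downstream code needs to place its input tensors.
print(model.device)
```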
From eb9a4a77136fe12151bb926287ec66acaf4c5343 Mon Sep 17 00:00:00 2001
From: Tarun Suresh
Date: Wed, 2 Apr 2025 22:00:52 +0000
Subject: [PATCH 2/3] enable support for device map auto

---
 README.md         | 4 ++--
 syncode/common.py | 4 ++--
 syncode/infer.py  | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index d4e2f848..67bc63ff 100644
--- a/README.md
+++ b/README.md
@@ -207,7 +207,7 @@ export HF_ACCESS_TOKEN="your_huggingface_api_key"
 
 - `task_id` (int, optional): Problem task id for selecting a problem from a Dataset.
 
-- `use_auto` (bool, optional): Use auto device mapping. Defaults to False.
+- `use_auto_device_mapping` (bool, optional): Use auto device mapping. Defaults to False.
 - `kwargs`(void, optional): Currently supported `kwargs` are `max_length`, `max_new_tokens`, `min_length`, `min_new_tokens`, `early_stopping`, `do_sample`, `num_beams`, `use_cache`, `temperature`, `top_k`, `top_p`, `num_return_sequences`, `pad_token_id`, and `eos_token_id`. Refer to the [HuggingFace Text Generation Documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation) for more information.
 
@@ -238,7 +238,7 @@ python3 syncode/infer.py
     --new_mask_store [True, False]
     --parser ["lr", "lalr"]
     --task_id [task_id]
-    --use_auto [True, False]
+    --use_auto_device_mapping [True, False]
 ```
 
diff --git a/syncode/common.py b/syncode/common.py
index 86d6b77d..5522a1fd 100644
--- a/syncode/common.py
+++ b/syncode/common.py
@@ -11,13 +11,13 @@
 HF_ACCESS_TOKEN = os.environ['HF_ACCESS_TOKEN'] if 'HF_ACCESS_TOKEN' in os.environ else None
 
-def load_model(model_name, device, quantize, use_auto = False):
+def load_model(model_name, device, quantize, use_auto_device_mapping = False):
     if model_name == 'test':
         model = AutoModelForCausalLM.from_pretrained('bigcode/tiny_starcoder_py').to(device)
     elif model_name == 'test-instruct':
         model = AutoModelForCausalLM.from_pretrained("rahuldshetty/tiny-starcoder-instruct")
     else:
-        if use_auto:
+        if use_auto_device_mapping:
             if (quantize):
                 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = 'auto').eval()
             else:
diff --git a/syncode/infer.py b/syncode/infer.py
index 23f3a0c2..f44a8693 100644
--- a/syncode/infer.py
+++ b/syncode/infer.py
@@ -50,7 +50,7 @@ def __init__(
         parser: Literal["lr", "lalr"] = "lalr",
         seed: Optional[int] = None,
         opp: bool = True,
-        use_auto: bool = False,
+        use_auto_device_mapping: bool = False,
         **kwargs
     ):
         # Check inputs
@@ -86,7 +86,7 @@ def __init__(
         self.grammar = Grammar(grammar) if self._is_grammar_mode() else None
 
         # Load model and tokenizer
-        model = common.load_model(self.model_name, device, quantize, use_auto)
+        model = common.load_model(self.model_name, device, quantize, use_auto_device_mapping)
         tokenizer = common.load_tokenizer(self.model_name)
 
         # Initialize grammar decoder if needed
@@ -260,7 +260,7 @@ def main(
     parse_output_only: bool = True,
     prompt_type: str = 'original',
     format_tabs: bool = False,
-    use_auto: bool = False,
+    use_auto_device_mapping: bool = False,
     **kwargs
 ):
     """Run Syncode with the specified configuration.
@@ -311,7 +311,7 @@ def main(
         seed=seed,
         opp=opp,
         parse_output_only=parse_output_only,
-        use_auto=use_auto,
+        use_auto_device_mapping=use_auto_device_mapping,
         **kwargs
     )
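The second patch is a pure rename (`use_auto` becomes `use_auto_device_mapping`); behavior is unchanged. A hypothetical CLI invocation with the renamed flag, using only options that appear in the README snippet above (other arguments elided; `0` is a placeholder task id, and the flag is renamed again in the next patch):

```
python3 syncode/infer.py --parser "lalr" --task_id 0 --use_auto_device_mapping True
```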
From 0243367f9574af87c1194b2ae69d2af775461040 Mon Sep 17 00:00:00 2001
From: Tarun Suresh
Date: Wed, 2 Apr 2025 22:02:37 +0000
Subject: [PATCH 3/3] enable support for device map auto

---
 README.md         |  4 ++--
 syncode/common.py |  8 ++++----
 syncode/infer.py  |  8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 67bc63ff..44cc50d6 100644
--- a/README.md
+++ b/README.md
@@ -207,7 +207,7 @@ export HF_ACCESS_TOKEN="your_huggingface_api_key"
 
 - `task_id` (int, optional): Problem task id for selecting a problem from a Dataset.
 
-- `use_auto_device_mapping` (bool, optional): Use auto device mapping. Defaults to False.
+- `device_map` (str, optional): Device map for the model. Defaults to None.
 - `kwargs`(void, optional): Currently supported `kwargs` are `max_length`, `max_new_tokens`, `min_length`, `min_new_tokens`, `early_stopping`, `do_sample`, `num_beams`, `use_cache`, `temperature`, `top_k`, `top_p`, `num_return_sequences`, `pad_token_id`, and `eos_token_id`. Refer to the [HuggingFace Text Generation Documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation) for more information.
 
@@ -238,7 +238,7 @@ python3 syncode/infer.py
     --new_mask_store [True, False]
     --parser ["lr", "lalr"]
     --task_id [task_id]
-    --use_auto_device_mapping [True, False]
+    --device_map [device_map]
 ```
 
diff --git a/syncode/common.py b/syncode/common.py
index 5522a1fd..cbf06662 100644
--- a/syncode/common.py
+++ b/syncode/common.py
@@ -11,17 +11,17 @@
 HF_ACCESS_TOKEN = os.environ['HF_ACCESS_TOKEN'] if 'HF_ACCESS_TOKEN' in os.environ else None
 
-def load_model(model_name, device, quantize, use_auto_device_mapping = False):
+def load_model(model_name, device, quantize, device_map = None):
     if model_name == 'test':
         model = AutoModelForCausalLM.from_pretrained('bigcode/tiny_starcoder_py').to(device)
     elif model_name == 'test-instruct':
         model = AutoModelForCausalLM.from_pretrained("rahuldshetty/tiny-starcoder-instruct")
     else:
-        if use_auto_device_mapping:
+        if device_map is not None:
             if (quantize):
-                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = 'auto').eval()
+                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = device_map).eval()
             else:
-                model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = 'auto').eval()
+                model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = device_map).eval()
         else:
             if (quantize):
                 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
diff --git a/syncode/infer.py b/syncode/infer.py
index f44a8693..deaa0bac 100644
--- a/syncode/infer.py
+++ b/syncode/infer.py
@@ -50,7 +50,7 @@ def __init__(
         parser: Literal["lr", "lalr"] = "lalr",
         seed: Optional[int] = None,
         opp: bool = True,
-        use_auto_device_mapping: bool = False,
+        device_map: Optional[str] = None,
         **kwargs
     ):
         # Check inputs
@@ -86,7 +86,7 @@ def __init__(
         self.grammar = Grammar(grammar) if self._is_grammar_mode() else None
 
         # Load model and tokenizer
-        model = common.load_model(self.model_name, device, quantize, use_auto_device_mapping)
+        model = common.load_model(self.model_name, device, quantize, device_map)
         tokenizer = common.load_tokenizer(self.model_name)
 
         # Initialize grammar decoder if needed
@@ -260,7 +260,7 @@ def main(
     parse_output_only: bool = True,
     prompt_type: str = 'original',
     format_tabs: bool = False,
-    use_auto_device_mapping: bool = False,
+    device_map: Optional[str] = None,
     **kwargs
 ):
     """Run Syncode with the specified configuration.
@@ -311,7 +311,7 @@ def main(
         seed=seed,
         opp=opp,
         parse_output_only=parse_output_only,
-        use_auto_device_mapping=use_auto_device_mapping,
+        device_map=device_map,
         **kwargs
     )
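After the third patch, the knob is a general `device_map` string forwarded verbatim to `from_pretrained`, so any value Transformers accepts (`'auto'`, `'balanced'`, `'cuda:0'`, ...) should work, while `None` (the default) preserves the original `.to(device)` behavior. A hedged usage sketch: only `load_model`'s signature is taken from the patch; the model name, device values, and the `from syncode import common` import path are assumptions.

```python
# Sketch of the final API from PATCH 3/3 (placeholders noted inline).
from syncode import common  # assumed import path for syncode/common.py

# device_map="auto": Accelerate shards the model across visible devices.
model = common.load_model(
    "bigcode/tiny_starcoder_py",  # placeholder model_name
    device="cuda",                # unused when device_map is set
    quantize=True,                # loads in bfloat16, per the patch
    device_map="auto",
)

# device_map=None (the default) falls back to .eval().to(device).
cpu_model = common.load_model(
    "bigcode/tiny_starcoder_py", device="cpu", quantize=False
)
```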