From 3c1b20e15ab80c76a3910525d4da9b5c0502ad23 Mon Sep 17 00:00:00 2001
From: Harry Yang
Date: Fri, 27 Sep 2024 16:51:01 +0000
Subject: [PATCH 1/3] Fix bug to use all CPU cores

1. On my VM, running inference on the CPU was using only one core instead of
   all 16. Deleting `torch.set_default_tensor_type(torch.BFloat16Tensor)`
   makes it use all 16 cores.
2. Reduce the default max_seq_len to 128.
---
 inference.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/inference.py b/inference.py
index 4492aed..4bf9afa 100644
--- a/inference.py
+++ b/inference.py
@@ -42,8 +42,6 @@ def build(checkpoints_dir: str, tokenizer_path: str, load_model: bool, max_seq_l
 
         if device == "cuda":
             torch.set_default_tensor_type(torch.cuda.HalfTensor)
-        else:
-            torch.set_default_tensor_type(torch.BFloat16Tensor)
 
         model = Transformer(model_args).to(device)
 
@@ -156,7 +154,7 @@ def _sample_top_p(self, probs, p):
         checkpoints_dir='llama-2-7b/',
         tokenizer_path='tokenizer.model',
         load_model=True,
-        max_seq_len=1024,
+        max_seq_len=128,
         max_batch_size=len(prompts),
         device=device
     )

From b3c12a105eff4ee980e7b8d1da3728d6583a5cea Mon Sep 17 00:00:00 2001
From: Harry Yang
Date: Fri, 27 Sep 2024 20:38:04 +0000
Subject: [PATCH 2/3] Define device in text_completion

* `device` was undefined in `text_completion`; define it from `self.args.device`.
---
 inference.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/inference.py b/inference.py
index 4bf9afa..bde373a 100644
--- a/inference.py
+++ b/inference.py
@@ -54,6 +54,7 @@ def build(checkpoints_dir: str, tokenizer_path: str, load_model: bool, max_seq_l
         return LLaMA(model, tokenizer, model_args)
 
     def text_completion(self, prompts: list[str], temperature: float = 0.6, top_p: float = 0.9, max_gen_len: Optional[int] = None):
+        device = self.args.device
         if max_gen_len is None:
             max_gen_len = self.args.max_seq_len - 1
         # Convert each prompt into tokens

From 498bd5b865a6e48067d1b0598db02a32e95e49e9 Mon Sep 17 00:00:00 2001
From: Harry Yang
Date: Fri, 27 Sep 2024 20:50:29 +0000
Subject: [PATCH 3/3] Refactor to pass cmd args

---
 inference.py     | 22 +++++++++++++++-------
 requirements.txt |  3 ++-
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/inference.py b/inference.py
index bde373a..9524cc0 100644
--- a/inference.py
+++ b/inference.py
@@ -1,3 +1,4 @@
+import fire
 from typing import Optional
 import torch
 import time
@@ -127,11 +128,15 @@ def _sample_top_p(self, probs, p):
 
         return next_token
 
-
-if __name__ == '__main__':
+def main(
+    checkpoints_dir: str ='llama-2-7b/',
+    tokenizer_path: str ='tokenizer.model',
+    max_seq_len: int = 128,
+    max_batch_size: int = 4,
+    allow_cuda: bool = False
+):
     torch.manual_seed(0)
 
-    allow_cuda = False
     device = 'cuda' if torch.cuda.is_available() and allow_cuda else 'cpu'
 
     prompts = [
@@ -152,11 +157,11 @@ def _sample_top_p(self, probs, p):
     ]
 
     model = LLaMA.build(
-        checkpoints_dir='llama-2-7b/',
-        tokenizer_path='tokenizer.model',
+        checkpoints_dir=checkpoints_dir,
+        tokenizer_path=tokenizer_path,
+        max_seq_len=max_seq_len,
+        max_batch_size=max_batch_size,
         load_model=True,
-        max_seq_len=128,
-        max_batch_size=len(prompts),
         device=device
     )
 
@@ -166,3 +171,6 @@ def _sample_top_p(self, probs, p):
         print(f'{out_texts[i]}')
         print('-' * 50)
 
+
+if __name__ == '__main__':
+    fire.Fire(main)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 6e76252..90887d7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 torch
 sentencepiece
-tqdm
\ No newline at end of file
+tqdm
+fire
\ No newline at end of file
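
For reference, a minimal, self-contained sketch of the `fire.Fire` pattern that PATCH 3/3 applies to `main()`. The `demo` function below is hypothetical (it is not part of inference.py); it only illustrates how python-fire turns keyword parameters into command-line flags:

    import fire

    def demo(max_seq_len: int = 128, allow_cuda: bool = False):
        # python-fire exposes each keyword parameter as a CLI flag, e.g.:
        #   python demo.py --max_seq_len=256 --allow_cuda=True
        print(f"max_seq_len={max_seq_len}, allow_cuda={allow_cuda}")

    if __name__ == '__main__':
        fire.Fire(demo)

With the patch applied, inference.py is invoked the same way, e.g. `python inference.py --max_seq_len=256 --allow_cuda=True`; any flag left unspecified falls back to the default declared on `main()`.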