From 3c1b20e15ab80c76a3910525d4da9b5c0502ad23 Mon Sep 17 00:00:00 2001
From: Harry Yang
Date: Fri, 27 Sep 2024 16:51:01 +0000
Subject: [PATCH 1/3] Fix bug to use all CPU cores

1. On my VM, running inference on the CPU was using only one core instead of
   all 16. Deleting `torch.set_default_tensor_type(torch.BFloat16Tensor)`
   makes it use all 16 cores.
2. Reduce the default max_seq_len to 128.
---
 inference.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/inference.py b/inference.py
index 4492aed..4bf9afa 100644
--- a/inference.py
+++ b/inference.py
@@ -42,8 +42,6 @@ def build(checkpoints_dir: str, tokenizer_path: str, load_model: bool, max_seq_l
 
         if device == "cuda":
             torch.set_default_tensor_type(torch.cuda.HalfTensor)
-        else:
-            torch.set_default_tensor_type(torch.BFloat16Tensor)
 
         model = Transformer(model_args).to(device)
 
@@ -156,7 +154,7 @@ def _sample_top_p(self, probs, p):
         checkpoints_dir='llama-2-7b/',
         tokenizer_path='tokenizer.model',
         load_model=True,
-        max_seq_len=1024,
+        max_seq_len=128,
         max_batch_size=len(prompts),
         device=device
     )

From b3c12a105eff4ee980e7b8d1da3728d6583a5cea Mon Sep 17 00:00:00 2001
From: Harry Yang
Date: Fri, 27 Sep 2024 20:38:04 +0000
Subject: [PATCH 2/3] Define device in text_completion

* `device` was undefined in `text_completion`; define it from `self.args.device`.
---
 inference.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/inference.py b/inference.py
index 4bf9afa..bde373a 100644
--- a/inference.py
+++ b/inference.py
@@ -54,6 +54,7 @@ def build(checkpoints_dir: str, tokenizer_path: str, load_model: bool, max_seq_l
         return LLaMA(model, tokenizer, model_args)
 
     def text_completion(self, prompts: list[str], temperature: float = 0.6, top_p: float = 0.9, max_gen_len: Optional[int] = None):
+        device = self.args.device
         if max_gen_len is None:
             max_gen_len = self.args.max_seq_len - 1
         # Convert each prompt into tokens

From 498bd5b865a6e48067d1b0598db02a32e95e49e9 Mon Sep 17 00:00:00 2001
From: Harry Yang
Date: Fri, 27 Sep 2024 20:50:29 +0000
Subject: [PATCH 3/3] Refactor to pass cmd args

---
 inference.py     | 22 +++++++++++++++-------
 requirements.txt |  3 ++-
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/inference.py b/inference.py
index bde373a..9524cc0 100644
--- a/inference.py
+++ b/inference.py
@@ -1,3 +1,4 @@
+import fire
 from typing import Optional
 import torch
 import time
@@ -127,11 +128,15 @@ def _sample_top_p(self, probs, p):
 
         return next_token
 
-
-if __name__ == '__main__':
+def main(
+    checkpoints_dir: str ='llama-2-7b/',
+    tokenizer_path: str ='tokenizer.model',
+    max_seq_len: int = 128,
+    max_batch_size: int = 4,
+    allow_cuda: bool = False
+):
     torch.manual_seed(0)
 
-    allow_cuda = False
     device = 'cuda' if torch.cuda.is_available() and allow_cuda else 'cpu'
 
     prompts = [
@@ -152,11 +157,11 @@ def _sample_top_p(self, probs, p):
     ]
 
     model = LLaMA.build(
-        checkpoints_dir='llama-2-7b/',
-        tokenizer_path='tokenizer.model',
+        checkpoints_dir=checkpoints_dir,
+        tokenizer_path=tokenizer_path,
+        max_seq_len=max_seq_len,
+        max_batch_size=max_batch_size,
         load_model=True,
-        max_seq_len=128,
-        max_batch_size=len(prompts),
         device=device
     )
 
@@ -166,3 +171,6 @@ def _sample_top_p(self, probs, p):
         print(f'{out_texts[i]}')
         print('-' * 50)
 
+
+if __name__ == '__main__':
+    fire.Fire(main)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 6e76252..90887d7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 torch
 sentencepiece
-tqdm
\ No newline at end of file
+tqdm
+fire
\ No newline at end of file
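
For reference, a minimal, self-contained sketch of the `fire.Fire` pattern that PATCH 3/3 applies to `main()`. The `demo` function below is hypothetical (it is not part of inference.py); it only illustrates how python-fire turns keyword parameters into command-line flags:

    import fire

    def demo(max_seq_len: int = 128, allow_cuda: bool = False):
        # python-fire exposes each keyword parameter as a CLI flag, e.g.:
        #   python demo.py --max_seq_len=256 --allow_cuda=True
        print(f"max_seq_len={max_seq_len}, allow_cuda={allow_cuda}")

    if __name__ == '__main__':
        fire.Fire(demo)

With the patch applied, inference.py is invoked the same way, e.g. `python inference.py --max_seq_len=256 --allow_cuda=True`; any flag left unspecified falls back to the default declared on `main()`.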