Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions offline_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,16 @@ def main():
parser.add_argument(
"--model-path", type=str, required=True, help="Base path for model files"
)
parser.add_argument(
"--load-in-8bit", action="store_true", help="Base path for model files"
)
args = parser.parse_args()

model = StepAudio(
tokenizer_path=f"{args.model_path}/Step-Audio-Tokenizer",
tts_path=f"{args.model_path}/Step-Audio-TTS-3B",
llm_path=f"{args.model_path}/Step-Audio-Chat",
load_in_8bit=args.load_in_8bit
)

# example for text input
Expand Down
3 changes: 2 additions & 1 deletion requirements-vllm.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ sentencepiece
funasr>=1.1.3
protobuf==5.29.3
gradio>=5.16.0
vllm==0.7.2
vllm==0.7.2
bitsandbytes
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ sentencepiece
funasr>=1.1.3
protobuf==5.29.3
gradio>=5.16.0
bitsandbytes
34 changes: 8 additions & 26 deletions stepaudio.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,22 @@

import torch
import torchaudio
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from tokenizer import StepAudioTokenizer
from tts import StepAudioTTS
from utils import load_audio, load_optimus_ths_lib, speech_adjust, volumn_adjust


class StepAudio:
def __init__(self, tokenizer_path: str, tts_path: str, llm_path: str):
def __init__(self, tokenizer_path: str, tts_path: str, llm_path: str, load_in_8bit: bool=False):
load_optimus_ths_lib(os.path.join(llm_path, 'lib'))
q_config = None
if load_in_8bit:
q_config = BitsAndBytesConfig(load_in_8bit=True)
print(f"load in 8bit")
self.llm_tokenizer = AutoTokenizer.from_pretrained(
llm_path, trust_remote_code=True
llm_path, trust_remote_code=True, quantization_config=q_config,
)
self.encoder = StepAudioTokenizer(tokenizer_path)
self.decoder = StepAudioTTS(tts_path, self.encoder)
Expand All @@ -22,6 +26,7 @@ def __init__(self, tokenizer_path: str, tts_path: str, llm_path: str):
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True,
quantization_config=q_config,
)

def __call__(
Expand Down Expand Up @@ -73,26 +78,3 @@ def apply_chat_template(self, messages: list):
text_with_audio += "<|BOT|>assistant\n"
return text_with_audio


if __name__ == "__main__":
    # Smoke test: build the pipeline and run one text-query and one
    # audio-query round trip through the model.
    #
    # Fix: the keyword arguments must match StepAudio.__init__
    # (tokenizer_path / tts_path / llm_path).  The previous names
    # (encoder_path / decoder_path) do not exist in the signature and
    # raised TypeError before any model was loaded.
    model = StepAudio(
        tokenizer_path="/mnt/ys-shai-jfs/open-step1o-audio/step1o-audio-encoder",
        tts_path="/mnt/ys-shai-jfs/open-step1o-audio/step1o-audio-decoder",
        llm_path="/mnt/ys-shai-jfs/open-step1o-audio/step1o-audio-v18",
    )

    # Text in -> text + speech out (TQTA: text query, text+audio answer).
    text, audio, sr = model(
        [{"role": "user", "content": "你好,我是你的朋友,我叫小明,你叫什么名字?"}],
        "Tingting",
    )
    torchaudio.save("output/output_e2e_tqta.wav", audio, sr)

    # Audio in -> text + speech out (AQTA), reusing the file produced above.
    text, audio, sr = model(
        [
            {
                "role": "user",
                "content": {"type": "audio", "audio": "output/output_e2e_tqta.wav"},
            }
        ],
        "Tingting",
    )
    torchaudio.save("output/output_e2e_aqta.wav", audio, sr)