
Whisper Redesigned Solution #1229


Open: wants to merge 81 commits into base: main

Commits (81)
c2c8745
Rename Whisper encoder input to audio features
kunal-vaishnavi Sep 30, 2024
1d5f4f0
Initial commit for new export
kunal-vaishnavi Oct 22, 2024
5bf4628
Fix KV cache initialization and runtime bugs
kunal-vaishnavi Nov 2, 2024
3cb936e
Add another check for alignment heads input
kunal-vaishnavi Nov 5, 2024
b648f58
Dump logits in ORT GenAI
kunal-vaishnavi Nov 7, 2024
2a5b762
Fix cross QK update
kunal-vaishnavi Nov 14, 2024
e24db74
Fix finalize cross QK
kunal-vaishnavi Nov 15, 2024
e4c838e
Save checkpoint for working solution
kunal-vaishnavi Nov 15, 2024
3a548a1
Clean up code
kunal-vaishnavi Nov 17, 2024
4d9af67
Remove unneeded template instantiations
kunal-vaishnavi Nov 21, 2024
1d9161d
Fixes: update crossQK copy for first step;
mindest Nov 27, 2024
97be76a
Enable getting model inputs to user
kunal-vaishnavi Dec 4, 2024
1bcd264
Add additional check for cache indirection
kunal-vaishnavi Dec 6, 2024
c35a73d
Add audio processing unit test
kunal-vaishnavi Dec 18, 2024
1d5da61
Fix Whisper GenAI config
kunal-vaishnavi Dec 18, 2024
efd0199
Save checkpoint for working solution
kunal-vaishnavi Dec 21, 2024
fbebe68
Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi Dec 21, 2024
ef955e7
Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi Feb 5, 2025
e869d02
Squashed commit of the following:
kunal-vaishnavi Feb 6, 2025
32c48d2
Initial changes to work with main
kunal-vaishnavi Feb 17, 2025
e4a8b5f
Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi Feb 17, 2025
323028a
Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi Mar 24, 2025
7756a86
Resolving build errors after merging main
kunal-vaishnavi Mar 25, 2025
a167add
Fix prompt length and get input
kunal-vaishnavi Mar 28, 2025
8782b47
Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi Mar 28, 2025
c93a1ab
Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi Apr 25, 2025
c0efa93
Fix build issues after syncing with main
kunal-vaishnavi Apr 26, 2025
27ba626
Add gpt2 to list of LLMs
kunal-vaishnavi Apr 26, 2025
2eb198b
Cast from ORT float16 to uint16 and then uint16 to half
kunal-vaishnavi Apr 26, 2025
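Commit 2eb198b sidesteps a direct conversion between ORT's float16 type and CUDA's half by round-tripping values through their shared 16-bit pattern. The same idea can be sketched in pure Python with the standard `struct` module (`"e"` is IEEE 754 half-precision); the function names here are illustrative, not part of the PR, and the actual change is C++:

```python
import struct

def f16_to_bits(value: float) -> int:
    # Pack as IEEE 754 half-precision ("e"), then reinterpret the
    # same two bytes as an unsigned 16-bit integer ("H").
    return struct.unpack("<H", struct.pack("<e", value))[0]

def bits_to_f16(bits: int) -> float:
    # Inverse: treat the 16-bit pattern as a half-precision float.
    return struct.unpack("<e", struct.pack("<H", bits))[0]

print(hex(f16_to_bits(1.5)))  # 0x3e00
print(bits_to_f16(0x3E00))    # 1.5
```

Because both directions only reinterpret bytes, the round trip is lossless for every representable half value, which is what makes the uint16 hop safe as an intermediate type.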
d4e7446
Remove const casting
kunal-vaishnavi Apr 26, 2025
8558cab
Fix windows build errors
kunal-vaishnavi Apr 26, 2025
40a555a
Update processing for audio features
kunal-vaishnavi Apr 28, 2025
4c3752c
Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi May 7, 2025
5fa7300
Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi Jun 10, 2025
198db2d
Fix duplicate config names after merging main
kunal-vaishnavi Jun 10, 2025
3264244
Add comments to C API process methods
kunal-vaishnavi Jun 13, 2025
83a915d
Move SetInputs from params to generator
kunal-vaishnavi Jun 13, 2025
c876bd2
Use SetExtraInputs for all states
kunal-vaishnavi Jun 17, 2025
095d452
Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi Jun 17, 2025
67359da
Fix build errors after merging main
kunal-vaishnavi Jun 17, 2025
59ae78d
Align spacing for comment
kunal-vaishnavi Jun 17, 2025
c12d16b
Add extra inputs back to decoder only state
kunal-vaishnavi Jun 17, 2025
eec4bba
Always call SetExtraInputs
kunal-vaishnavi Jun 18, 2025
da6fc9a
Add audio processing APIs in other languages
kunal-vaishnavi Jun 18, 2025
1e29004
Comment out multi-prompt APIs for now
kunal-vaishnavi Jun 18, 2025
a0d5be7
Fix Java build issues with new audio classes
kunal-vaishnavi Jun 18, 2025
c474d62
Add missing Objective-C interfaces for new audio classes
kunal-vaishnavi Jun 18, 2025
eb88379
Fix variable names in setting inputs for Java API
kunal-vaishnavi Jun 18, 2025
502afb3
Update Java unit tests
kunal-vaishnavi Jun 18, 2025
c7a865b
Fix tensor unit test in Java
kunal-vaishnavi Jun 18, 2025
6edf8d3
Add C/C++ APIs to set batched input ids
kunal-vaishnavi Jun 19, 2025
aed1720
Start updating Whisper inference examples
kunal-vaishnavi Jun 19, 2025
eb285b9
Update Whisper examples and add Python pre-processing binding
kunal-vaishnavi Jun 20, 2025
7edc027
Update audio preprocessing unit tests
kunal-vaishnavi Jun 20, 2025
85689af
Add changes suggested by clang-format and CodeQL
kunal-vaishnavi Jun 20, 2025
29cf80b
Remove extra newline for clang-format
kunal-vaishnavi Jun 20, 2025
d1a7608
Add Python CI test for Whisper
kunal-vaishnavi Jun 20, 2025
fe27ea1
Fix cache indirection updating
kunal-vaishnavi Jun 21, 2025
8883492
Fix build warning in Windows CIs
kunal-vaishnavi Jun 21, 2025
1839220
Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi Jun 21, 2025
92b32d9
Remove commented out code
kunal-vaishnavi Jun 24, 2025
9bcc681
Use feature extraction instead of speech log mel
kunal-vaishnavi Jun 24, 2025
1a1ef86
Fix variable names based on PR feedback
kunal-vaishnavi Jun 24, 2025
9577e36
Fix import name for E2E unit tests
kunal-vaishnavi Jun 24, 2025
96251cd
Update ORT extensions commit
kunal-vaishnavi Jun 25, 2025
8e40be7
Only transpose K caches when DMMHA is used
kunal-vaishnavi Jun 25, 2025
17cc672
Fix extra inputs usage for pipeline and GPT models
kunal-vaishnavi Jun 26, 2025
8fb2f1b
Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi Jun 26, 2025
0aa011b
Move SetExtraInputs to the right state
kunal-vaishnavi Jun 26, 2025
2681f35
Access sessions field through model object
kunal-vaishnavi Jun 26, 2025
7f283a4
Rewrite batched preprocessing APIs
kunal-vaishnavi Jun 27, 2025
a7cbacc
Use different C++ API call for one prompt in preprocessing
kunal-vaishnavi Jun 27, 2025
9989264
Remove vector usage for C-only environment in Java bindings
kunal-vaishnavi Jun 27, 2025
4ca81bd
Cast pybind str to std string
kunal-vaishnavi Jun 27, 2025
04513a2
Remove OgaCheckResult from Java bindings
kunal-vaishnavi Jun 27, 2025
4a76e15
Fix typo in Java doc string
kunal-vaishnavi Jun 27, 2025
b316861
Fix NativeMethods function name
kunal-vaishnavi Jun 27, 2025
5141f00
Add changes suggested by linters
kunal-vaishnavi Jun 27, 2025
2e3560a
Change how strdup is defined
kunal-vaishnavi Jun 27, 2025
d977340
Add changes from PR feedback
kunal-vaishnavi Jun 28, 2025
363ca7c
Activate Whisper E2E CI tests
kunal-vaishnavi Jun 28, 2025
4 changes: 2 additions & 2 deletions .github/workflows/linux-gpu-x64-build.yml
@@ -143,7 +143,7 @@ jobs:
docker run \
--gpus all \
--rm \
--volume /data/ortgenai/pytorch:/data/ortgenai/pytorch \
--volume /data/ortgenai/:/data/ortgenai/ \
--volume $GITHUB_WORKSPACE:/ort_genai_src \
-e HF_TOKEN=$HF_TOKEN \
-w /ort_genai_src onnxruntimecudabuildx64 bash -c " \
@@ -170,6 +170,6 @@ jobs:
docker run \
--gpus all \
--rm \
--volume /data/ortgenai/pytorch:/data/ortgenai/pytorch \
--volume /data/ortgenai/:/data/ortgenai/ \
--volume $GITHUB_WORKSPACE:/ort_genai_src \
-w /ort_genai_src onnxruntimecudabuildx64 bash -c "ORTGENAI_LOG_ORT_LIB=1 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ort_genai_src/build/cuda/ /ort_genai_src/build/cuda/unit_tests"
2 changes: 1 addition & 1 deletion cmake/deps.txt
@@ -14,7 +14,7 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;5ea4b9b0683b83c1d6800eb332f37dcc76bb2e61
onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;a85fa861ee5e5300f16142bd969ede0eabc61c86

# These two dependencies are for the optional constrained decoding feature (USE_GUIDANCE)
llguidance;https://github.com/microsoft/llguidance.git;2d2f1de3c87e3289528affc346f734f7471216d9
2 changes: 1 addition & 1 deletion examples/c/src/phi3v.cpp
@@ -63,9 +63,9 @@ void CXX_API(const char* model_path, const char* execution_provider) {
std::cout << "Generating response..." << std::endl;
auto params = OgaGeneratorParams::Create(*model);
params->SetSearchOption("max_length", 7680);
params->SetInputs(*input_tensors);

auto generator = OgaGenerator::Create(*model, *params);
generator->SetInputs(*input_tensors);

while (!generator->IsDone()) {
generator->GenerateNextToken();
2 changes: 1 addition & 1 deletion examples/c/src/phi4-mm.cpp
@@ -94,9 +94,9 @@ void CXX_API(const char* model_path, const char* execution_provider) {
std::cout << "Generating response..." << std::endl;
auto params = OgaGeneratorParams::Create(*model);
params->SetSearchOption("max_length", 7680);
params->SetInputs(*input_tensors);

auto generator = OgaGenerator::Create(*model, *params);
generator->SetInputs(*input_tensors);

while (!generator->IsDone()) {
generator->GenerateNextToken();
58 changes: 23 additions & 35 deletions examples/c/src/whisper.cpp
@@ -15,8 +15,6 @@ void CXX_API(const char* model_path, int32_t num_beams) {
auto model = OgaModel::Create(model_path);
std::cout << "Creating multimodal processor..." << std::endl;
auto processor = OgaMultiModalProcessor::Create(*model);
std::cout << "Creating tokenizer..." << std::endl;
auto tokenizer = OgaTokenizer::Create(*model);

while (true) {
std::string audio_paths_str;
@@ -42,31 +40,24 @@
audios = OgaAudios::Load(audio_paths_c);
}

std::cout << "Processing audio..." << std::endl;
auto mel = processor->ProcessAudios(audios.get());
const std::vector<const char*> prompt_tokens = {"<|startoftranscript|>", "<|en|>", "<|transcribe|>",
"<|notimestamps|>"};
auto input_ids = OgaSequences::Create();
std::cout << "Processing inputs..." << std::endl;
const size_t batch_size = audio_paths.size();
for (size_t i = 0; i < batch_size; ++i) {
for (const auto& token : prompt_tokens) {
input_ids->Append(tokenizer->ToTokenId(token), i);
}
}
const char* prompt_tokens = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>";
const std::vector<const char*> prompts(batch_size, prompt_tokens);
auto inputs = processor->ProcessAudios(prompts, audios.get());

std::cout << "Generating response..." << std::endl;
auto params = OgaGeneratorParams::Create(*model);
params->SetSearchOption("max_length", 256);
params->SetSearchOption("batch_size", static_cast<double>(batch_size));
params->SetSearchOption("max_length", 448);
params->SetSearchOptionBool("do_sample", false);
params->SetSearchOption("num_beams", num_beams);
params->SetSearchOption("num_return_sequences", num_beams);
params->SetInputs(*mel);
params->SetInputSequences(*input_ids);

auto generator = OgaGenerator::Create(*model, *params);
generator->SetInputs(*inputs);

while (!generator->IsDone()) {
generator->ComputeLogits();
generator->GenerateNextToken();
}

@@ -133,36 +124,29 @@ void C_API(const char* model_path, int32_t num_beams) {
}

std::cout << "Processing audio..." << std::endl;
OgaNamedTensors* mel;
CheckResult(OgaProcessorProcessAudios(processor, audios, &mel));
const std::vector<const char*> prompt_tokens = {"<|startoftranscript|>", "<|en|>", "<|transcribe|>",
"<|notimestamps|>"};
OgaSequences* input_ids;
CheckResult(OgaCreateSequences(&input_ids));
OgaNamedTensors* inputs;
const size_t batch_size = audio_paths.size();
for (size_t i = 0; i < batch_size; ++i) {
for (const auto& token : prompt_tokens) {
int32_t token_id;
CheckResult(OgaTokenizerToTokenId(tokenizer, token, &token_id));
CheckResult(OgaAppendTokenToSequence(token_id, input_ids, i));
}
}
const char* prompt_tokens = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>";
std::vector<const char*> prompts(batch_size, prompt_tokens);
OgaStringArray* prompts_string_array;
CheckResult(OgaCreateStringArrayFromStrings(prompts.data(), prompts.size(), &prompts_string_array));
CheckResult(OgaProcessorProcessAudiosAndPrompts(processor, prompts_string_array, audios, &inputs));
OgaDestroyStringArray(prompts_string_array);

std::cout << "Generating response..." << std::endl;
OgaGeneratorParams* params;
CheckResult(OgaCreateGeneratorParams(model, &params));
CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 256));
CheckResult(OgaGeneratorParamsSetSearchNumber(params, "batch_size", static_cast<double>(batch_size)));
CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 448));
CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", false));
CheckResult(OgaGeneratorParamsSetSearchNumber(params, "num_beams", num_beams));
CheckResult(OgaGeneratorParamsSetSearchNumber(params, "num_return_sequences", num_beams));
CheckResult(OgaGeneratorParamsSetInputs(params, mel));
CheckResult(OgaGeneratorParamsSetInputSequences(params, input_ids));

OgaGenerator* generator;
CheckResult(OgaCreateGenerator(model, params, &generator));
CheckResult(OgaGenerator_SetInputs(generator, inputs));

while (!OgaGenerator_IsDone(generator)) {
CheckResult(OgaGenerator_ComputeLogits(generator));
CheckResult(OgaGenerator_GenerateNextToken(generator));
}

@@ -182,8 +166,7 @@

OgaDestroyGenerator(generator);
OgaDestroyGeneratorParams(params);
OgaDestroySequences(input_ids);
OgaDestroyNamedTensors(mel);
OgaDestroyNamedTensors(inputs);
OgaDestroyAudios(audios);
}

@@ -203,6 +186,11 @@ int main(int argc, char** argv) {
return -1;
}

// Uncomment for debugging purposes
// Oga::SetLogBool("enabled", true);
// Oga::SetLogBool("model_input_values", true);
// Oga::SetLogBool("model_output_values", true);

std::cout << "---------------" << std::endl;
std::cout << "Hello, Whisper!" << std::endl;
std::cout << "---------------" << std::endl;
2 changes: 1 addition & 1 deletion examples/csharp/HelloPhi3V/Program.cs
@@ -163,9 +163,9 @@ void PrintUsage()
Console.WriteLine("Generating response...");
using GeneratorParams generatorParams = new GeneratorParams(model);
generatorParams.SetSearchOption("max_length", 7680);
generatorParams.SetInputs(inputTensors);

using var generator = new Generator(model, generatorParams);
generator.SetInputs(inputTensors);
var watch = System.Diagnostics.Stopwatch.StartNew();
while (!generator.IsDone())
{
2 changes: 1 addition & 1 deletion examples/csharp/HelloPhi4MM/Program.cs
@@ -204,9 +204,9 @@ void PrintUsage()
Console.WriteLine("Generating response...");
using GeneratorParams generatorParams = new GeneratorParams(model);
generatorParams.SetSearchOption("max_length", 7680);
generatorParams.SetInputs(inputTensors);

using var generator = new Generator(model, generatorParams);
generator.SetInputs(inputTensors);
var watch = System.Diagnostics.Stopwatch.StartNew();
while (!generator.IsDone())
{
3 changes: 2 additions & 1 deletion examples/python/model-vision.py
@@ -8,6 +8,7 @@
from pathlib import Path

import onnxruntime_genai as og
# og.set_log_options(enabled=True, model_input_values=True, model_output_values=True)

def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name):
curr_path = Path(current_dir).absolute()
@@ -103,10 +104,10 @@ def run(args: argparse.Namespace):

print("Generating response...")
params = og.GeneratorParams(model)
params.set_inputs(inputs)
params.set_search_options(max_length=7680)

generator = og.Generator(model, params)
generator.set_inputs(inputs)
start_time = time.time()

while not generator.is_done():
2 changes: 1 addition & 1 deletion examples/python/phi4-mm.py
@@ -124,10 +124,10 @@ def run(args: argparse.Namespace):

print("Generating response...")
params = og.GeneratorParams(model)
params.set_inputs(inputs)
params.set_search_options(max_length=7680)

generator = og.Generator(model, params)
generator.set_inputs(inputs)
start_time = time.time()

while not generator.is_done():
56 changes: 46 additions & 10 deletions examples/python/whisper.py
@@ -7,7 +7,7 @@
import readline

import onnxruntime_genai as og

# og.set_log_options(enabled=True, model_input_values=True, model_output_values=True)

def _complete(text, state):
return (glob.glob(text + "*") + [None])[state]
@@ -20,15 +20,25 @@ class Format:

def run(args: argparse.Namespace):
print("Loading model...")
model = og.Model(args.model_path)
config = og.Config(args.model_path)
if args.execution_provider != "follow_config":
config.clear_providers()
if args.execution_provider != "cpu":
print(f"Setting model to {args.execution_provider}")
config.append_provider(args.execution_provider)
model = og.Model(config)
processor = model.create_multimodal_processor()
tokenizer = og.Tokenizer(model)

while True:
readline.set_completer_delims(" \t\n;")
readline.parse_and_bind("tab: complete")
readline.set_completer(_complete)
audio_paths = [audio_path.strip() for audio_path in input("Audio Paths (comma separated): ").split(",")]

if args.non_interactive:
audio_paths = [args.audio]
else:
audio_paths = [audio_path.strip() for audio_path in input("Audio Paths (comma separated): ").split(",")]
if len(audio_paths) == 0:
raise ValueError("No audio provided.")

@@ -39,28 +49,27 @@ def run(args: argparse.Namespace):
audios = og.Audios.open(*audio_paths)

print("Processing audio...")
mel = processor(audios=audios)
batch_size = len(audio_paths)
decoder_prompt_tokens = ["<|startoftranscript|>", "<|en|>", "<|transcribe|>", "<|notimestamps|>"]
prompts = ["".join(decoder_prompt_tokens)] * batch_size
inputs = processor(prompts, audios=audios)

params = og.GeneratorParams(model)
params.set_search_options(
do_sample=False,
num_beams=args.num_beams,
num_return_sequences=args.num_beams,
max_length=256,
max_length=448,
)

batch_size = len(audio_paths)
params.set_inputs(mel)
params.input_ids = [[tokenizer.to_token_id(token) for token in decoder_prompt_tokens]] * batch_size

generator = og.Generator(model, params)
generator.set_inputs(inputs)

while not generator.is_done():
generator.compute_logits()
generator.generate_next_token()

print()
transcriptions = []
for i in range(batch_size * args.num_beams):
tokens = generator.get_sequence(i)
transcription = processor.decode(tokens)
@@ -69,18 +78,45 @@
print(
f" {Format.underline}batch {i // args.num_beams}, beam {i % args.num_beams}{Format.end}: {transcription}"
)
transcriptions.append(transcription.strip())

for _ in range(3):
print()

if args.non_interactive:
args.output = args.output.strip()
matching = False
for transcription in transcriptions:
if transcription == args.output:
matching = True
break

if matching:
print("One of the model's transcriptions matches the expected transcription.")
return
raise Exception("None of the model's transcriptions match the expected transcription.")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-m", "--model_path", type=str, required=True, help="Path to the model"
)
parser.add_argument(
'-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "follow_config"],
help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead."
)
parser.add_argument(
"-b", "--num_beams", type=int, default=4, help="Number of beams"
)
parser.add_argument(
"-a", "--audio", type=str, default="", help="Path to audio file for CI testing purposes"
)
parser.add_argument(
"-o", "--output", type=str, default="", help="Expected transcribed output for CI testing purposes"
)
parser.add_argument(
"-ni", "--non_interactive", default=False, action="store_true", help="Non-interactive mode for CI testing purposes"
)
args = parser.parse_args()
run(args)
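The redesigned whisper.py above adds two bits of pure-Python logic: batched prompt construction (one concatenated decoder prompt per audio, passed to `processor(prompts, audios=audios)`) and a non-interactive CI check that passes if any beam's transcription equals the expected output. Isolated as a sketch (the helper names are illustrative; the example inlines this logic):

```python
DECODER_PROMPT_TOKENS = ["<|startoftranscript|>", "<|en|>", "<|transcribe|>", "<|notimestamps|>"]

def build_prompts(batch_size: int) -> list:
    # One concatenated decoder prompt per audio in the batch, matching
    # what the new batched preprocessing API expects.
    return ["".join(DECODER_PROMPT_TOKENS)] * batch_size

def matches_expected(transcriptions, expected: str) -> bool:
    # CI check from the --non_interactive path: pass if any returned
    # transcription equals the expected output after stripping whitespace.
    expected = expected.strip()
    return any(t.strip() == expected for t in transcriptions)

print(build_prompts(1)[0])  # <|startoftranscript|><|en|><|transcribe|><|notimestamps|>
print(matches_expected([" hello world "], "hello world"))  # True
```

Comparing against all `batch_size * num_beams` transcriptions, rather than just the top beam, keeps the CI test stable when beams differ only in ranking.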