Commit ef955e7

Merge branch 'main' into kvaishnavi/whisper
2 parents fbebe68 + 4d702e2 commit ef955e7

Showing 77 changed files with 1,435 additions and 558 deletions.


.github/workflows/linux-cpu-x64-build.yml

Lines changed: 1 addition & 4 deletions

@@ -10,6 +10,7 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
   ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"
   ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json"

@@ -84,10 +85,6 @@ jobs:
         python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
         python3 -m pip install --user --no-index --no-deps --find-links build/cpu/wheel onnxruntime_genai

-      - name: Use Dummy HuggingFace Token
-        run: |
-          echo "HF_TOKEN=12345" >> $GITHUB_ENV
-
       - name: Verify Build Artifacts
         if: always()
         continue-on-error: true
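This and the following workflow diffs all make the same change: the hard-coded dummy-token step (`HF_TOKEN=12345`) is dropped in favor of exporting the real `secrets.HF_TOKEN` at the workflow level. As a minimal sketch of how a test script can consume that variable — the `huggingface_hub` call is a standard API, but the repo id and the skip behavior are illustrative assumptions, not part of this commit:

```python
import os

from huggingface_hub import snapshot_download  # assumes huggingface_hub is installed

# The workflow-level `env: HF_TOKEN: ${{ secrets.HF_TOKEN }}` exposes the secret
# to every step, so tests can read it straight from the environment.
token = os.environ.get("HF_TOKEN")
if token:
    # Hypothetical gated test model; any repo id requiring auth works the same way.
    snapshot_download(repo_id="some-org/some-gated-model", token=token)
else:
    print("HF_TOKEN is not set; skipping tests that download gated models.")
```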

.github/workflows/linux-cpu-x64-nightly-build.yml

Lines changed: 1 addition & 4 deletions

@@ -12,6 +12,7 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   ort_dir: "onnxruntime-linux-x64-1.18.0"
   ort_zip: "onnxruntime-linux-x64-1.18.0.tgz"
   ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.18.0/onnxruntime-linux-x64-1.18.0.tgz"

@@ -55,10 +56,6 @@ jobs:
         python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
         python3 -m pip install build/cpu/wheel/onnxruntime_genai*.whl --no-deps

-      - name: Use Dummy HuggingFace Token
-        run: |
-          echo "HF_TOKEN=12345" >> $GITHUB_ENV
-
       - name: Run the python tests
         run: |
           python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models --e2e

.github/workflows/linux-gpu-x64-build.yml

Lines changed: 1 addition & 4 deletions

@@ -12,6 +12,7 @@ concurrency:
   cancel-in-progress: true

 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Linux&api-version=6.0-preview.1"
   ORT_PACKAGE_NAME: Microsoft.ML.OnnxRuntime.Gpu.Linux
   ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json"

@@ -109,10 +110,6 @@ jobs:
           bash -c " \
             /usr/bin/cmake --build --preset linux_gcc_cuda_release"

-      - name: Use Dummy HuggingFace Token
-        run: |
-          echo "HF_TOKEN=12345" >> $GITHUB_ENV
-
       - name: Install the onnxruntime-genai Python wheel and run python test
         run: |
           echo "Installing the onnxruntime-genai Python wheel and running the Python tests"

.github/workflows/mac-cpu-arm64-build.yml

Lines changed: 1 addition & 1 deletion

@@ -10,6 +10,7 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
   ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"
 jobs:

@@ -86,7 +87,6 @@ jobs:
       - name: Run the python tests
         run: |
           source genai-macos-venv/bin/activate
-          export HF_TOKEN="12345"
           export ORTGENAI_LOG_ORT_LIB=1
           python3 -m pip install requests
           python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models

.github/workflows/win-cpu-x64-build.yml

Lines changed: 1 addition & 4 deletions

@@ -11,6 +11,7 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   binaryDir: 'build/cpu/win-x64'
   ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
   ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"

@@ -91,10 +92,6 @@ jobs:
         python3 -m pip install -r test\python\cpu\ort\requirements.txt --user
         python3 -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps

-      - name: Use Dummy HuggingFace Token
-        run: |
-          Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
-
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models"

.github/workflows/win-cuda-x64-build.yml

Lines changed: 1 addition & 4 deletions

@@ -12,6 +12,7 @@ concurrency:
   cancel-in-progress: true

 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   AZCOPY_AUTO_LOGIN_TYPE: MSI
   AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
   cuda_dir: "${{ github.workspace }}\\cuda_sdk"

@@ -80,10 +81,6 @@ jobs:
         python -m pip install -r test\python\cuda\ort\requirements.txt
         python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps

-      - name: Use Dummy HuggingFace Token
-        run: |
-          Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
-
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e

.pipelines/nuget-publishing.yml

Lines changed: 6 additions & 0 deletions

@@ -48,6 +48,11 @@ parameters:
   type: boolean
   default: false

+- name: enable_win_qnn
+  displayName: 'Whether QNN nuget package should be built.'
+  type: boolean
+  default: false
+
 - name: ort_version
   displayName: 'OnnxRuntime version'
   type: string

@@ -122,6 +127,7 @@ stages:
       ort_cuda_version: ${{ parameters.ort_cuda_version }}
       ort_dml_version: ${{ parameters.ort_dml_version }}
       build_config: ${{ parameters.build_config }}
+      enable_win_qnn: ${{ parameters.enable_win_qnn }}

 - ${{ if eq(parameters.enable_post_packaging_validation, true) }}:
   - template: stages/nuget-validation-stage.yml

.pipelines/stages/jobs/nuget-packaging-job.yml

Lines changed: 21 additions & 3 deletions

@@ -47,6 +47,11 @@ parameters:
   type: boolean
   default: false

+- name: enable_win_qnn
+  displayName: 'Whether QNN nuget package should be built.'
+  type: boolean
+  default: false
+
 - name: ort_version
   type: string

@@ -55,7 +60,7 @@ parameters:
   default: 'release'

 jobs:
-- job: nuget_${{ parameters.ep }}_packaging
+- job: nuget_${{ parameters.ep }}_packaging_dep_qnn_${{ parameters.enable_win_qnn }}
   pool: 'onnxruntime-Win-CPU-2022'
   variables:
   - name: ep

@@ -81,20 +86,24 @@ jobs:
     value: 'onnxruntime-genai-${{ parameters.ep }}'

   - name: genai_nuget_package_name
-    ${{ if eq(parameters.ep, 'cpu') }}:
+    ${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, false)) }}:
       value: 'Microsoft.ML.OnnxRuntimeGenAI'
     ${{ if eq(parameters.ep, 'cuda') }}:
       value: 'Microsoft.ML.OnnxRuntimeGenAI.Cuda'
     ${{ if eq(parameters.ep, 'directml') }}:
       value: 'Microsoft.ML.OnnxRuntimeGenAI.DirectML'
+    ${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, true)) }}:
+      value: 'Microsoft.ML.OnnxRuntimeGenAI.QNN'

   - name: ort_nuget_package_name
-    ${{ if eq(parameters.ep, 'cpu') }}:
+    ${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, false)) }}:
       value: 'Microsoft.ML.OnnxRuntime'
     ${{ if eq(parameters.ep, 'cuda') }}:
       value: 'Microsoft.ML.OnnxRuntime.Gpu'
     ${{ if eq(parameters.ep, 'directml') }}:
       value: 'Microsoft.ML.OnnxRuntime.DirectML'
+    ${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, true)) }}:
+      value: 'Microsoft.ML.OnnxRuntime.QNN'

   steps:
   - ${{ if and(eq(parameters.enable_win_cpu, true), eq(parameters.ep, 'cpu')) }}:

@@ -181,6 +190,15 @@ jobs:

   - task: NuGetAuthenticate@1

+  - powershell: |
+      dotnet --info
+      dotnet workload install android
+      dotnet workload install ios
+      dotnet workload install maccatalyst
+      dotnet workload install macos
+    displayName: 'Install dependencies'
+    workingDirectory: '$(Build.Repository.LocalPath)\src\csharp'
+
   - powershell: |
       dotnet --info
       dotnet build Microsoft.ML.OnnxRuntimeGenAI.csproj -p:Configuration="$(buildConfig)" -p:IncludeMobileTargets=true --verbosity normal

.pipelines/stages/jobs/nuget-validation-job.yml

Lines changed: 2 additions & 2 deletions

@@ -100,9 +100,9 @@ jobs:

   - name: cuda_docker_image
     ${{ if eq(parameters.cuda_version, '11.8') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
     ${{ else }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1

   workspace:
     clean: all

.pipelines/stages/jobs/py-validation-job.yml

Lines changed: 2 additions & 2 deletions

@@ -109,9 +109,9 @@ jobs:

   - name: cuda_docker_image
     ${{ if eq(parameters.cuda_version, '11.8') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
     ${{ else }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1

 steps:
 - checkout: self

.pipelines/stages/jobs/steps/python-validation-step.yml

Lines changed: 3 additions & 0 deletions

@@ -35,6 +35,9 @@ steps:
         python -m pip install -r test/python/directml/torch/requirements.txt
         python -m pip install -r test/python/directml/ort/requirements.txt
       }
+      elseif ("$(arch)" -eq "arm64") {
+        python -m pip install onnxruntime-qnn
+      }
       else {
         python -m pip install -r test/python/cpu/torch/requirements.txt
         python -m pip install -r test/python/cpu/ort/requirements.txt
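On Windows arm64, the validation step now installs the `onnxruntime-qnn` package instead of the CPU requirements. A minimal sketch of how a validation script could confirm that the QNN execution provider actually loaded — this check is illustrative and not part of the commit:

```python
import onnxruntime as ort

# The onnxruntime-qnn wheel registers the QNN execution provider; if the wheel
# and its Qualcomm runtime libraries are installed correctly, it appears here.
providers = ort.get_available_providers()
print(providers)
assert "QNNExecutionProvider" in providers, "QNN execution provider not available"
```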

.pipelines/stages/nuget-packaging-stage.yml

Lines changed: 18 additions & 0 deletions

@@ -44,6 +44,11 @@ parameters:
   type: boolean
   default: true

+- name: enable_win_qnn
+  displayName: 'Whether QNN nuget package should be built.'
+  type: boolean
+  default: true
+
 - name: ort_version
   type: string
 - name: ort_cuda_version

@@ -85,3 +90,16 @@ stages:
       build_config: ${{ parameters.build_config }}
       enable_win_dml: ${{ parameters.enable_win_dml }}
       enable_win_arm64: ${{ parameters.enable_win_arm64 }}
+- ${{ if eq(parameters.enable_win_qnn, true) }}:
+  - template: jobs/nuget-packaging-job.yml
+    parameters:
+      ep: 'cpu'
+      ort_version: ${{ parameters.ort_version }}
+      build_config: ${{ parameters.build_config }}
+      enable_linux_cpu: false
+      enable_win_cpu: false
+      enable_win_arm64: true
+      enable_macos_cpu: false
+      enable_android: false
+      enable_apple_framework: false
+      enable_win_qnn: true

CMakeLists.txt

Lines changed: 0 additions & 2 deletions

@@ -41,8 +41,6 @@ include(cmake/check_cuda.cmake)
 include(cmake/check_rocm.cmake)
 # Checking if DML is supported
 include(cmake/check_dml.cmake)
-# Checking if WebGpu is supported
-include(cmake/check_webgpu.cmake)

 include(cmake/cxx_standard.cmake)
README.md

Lines changed: 28 additions & 4 deletions

@@ -14,13 +14,13 @@ It implements the generative AI loop for ONNX models, including pre and post pro
 See documentation at https://onnxruntime.ai/docs/genai.

 |Support matrix|Supported now|Under development|On the roadmap|
-|-|-|-|-|
-|Model architectures| Gemma <br/> Llama * <br/> Mistral + <br/>Phi (language + vision)<br/>Qwen <br/>Nemotron <br/>|Whisper|Stable diffusion|
+| -------------- | ------------- | ----------------- | -------------- |
+| Model architectures | Gemma <br/> Llama * <br/> Mistral + <br/> Phi (language + vision) <br/> Qwen <br/> Nemotron <br/> Granite <br/> AMD OLMo | Whisper | Stable diffusion |
 |API| Python <br/>C# <br/>C/C++ <br/> Java ^ |Objective-C||
 |Platform| Linux <br/> Windows <br/>Mac ^ <br/>Android ^ ||iOS |||
 |Architecture|x86 <br/> x64 <br/> Arm64 ~ ||||
 |Hardware Acceleration|CUDA<br/>DirectML<br/>|QNN <br/> OpenVINO <br/> ROCm ||
-|Features|| Interactive decoding <br/> Customization (fine-tuning)| Speculative decoding |
+|Features|MultiLoRA <br/> Continuous decoding (session continuation)^ | Constrained decoding | Speculative decoding |

 \* The Llama model architecture supports similar model families such as CodeLlama, Vicuna, Yi, and more.

@@ -32,7 +32,7 @@ See documentation at https://onnxruntime.ai/docs/genai.

 ## Installation

-See https://onnxruntime.ai/docs/genai/howto/install
+See [installation instructions](https://onnxruntime.ai/docs/genai/howto/install) or [build from source](https://onnxruntime.ai/docs/genai/howto/build-from-source.html)

 ## Sample code for Phi-3 in Python

@@ -143,6 +143,30 @@ See https://onnxruntime.ai/docs/genai/howto/install
   del generator
 ```

+### Choosing the Right Examples: Release vs. Main Branch
+
+Due to the evolving nature of this project and ongoing feature additions, examples in the `main` branch may not always align with the latest stable release. This section outlines how to keep the examples compatible with the version you installed. Most of the steps stay the same; only the package installation and the model example file change.
+
+### Stable version
+Install the package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). For example, if you installed version 0.5.2 of ONNX Runtime GenAI, the instructions would look like this:
+
+```bash
+# Clone the repo
+git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
+# Checkout the branch for the version you are using
+git checkout v0.5.2
+cd examples
+```
+
+### Nightly version (Main Branch)
+Build the package from source using these [instructions](https://onnxruntime.ai/docs/genai/howto/build-from-source.html), then go to the folder that contains the examples.
+
+```bash
+# Clone the repo
+git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
+cd examples
+```
+
 ## Roadmap

 See the [Discussions](https://github.com/microsoft/onnxruntime-genai/discussions) to request new features and up-vote existing requests.

benchmark/python/benchmark_e2e.py

Lines changed: 21 additions & 0 deletions

@@ -82,6 +82,14 @@ def generate_prompt(model, tokenizer, prompt_length, use_graph_capture) -> str:
         generator.generate_next_token()
     return tokenizer.decode(generator.get_sequence(0))

+# Use prompt length to get a pre-defined prompt
+def get_prompt_by_length(prompt_length):
+    json_path = "prompts.json"
+    with open(json_path) as prompts_file:
+        content = prompts_file.read()
+    data = json.loads(content)  # parse the JSON text read from the file
+    return data[f"{prompt_length}"]
+
 def get_target_pip_package_version(target_pip_package_name_list):
     # get package name and version
     import pkg_resources

@@ -231,6 +239,18 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
         # use random tokens instead of generating a prompt using the model and then tokenizing it
         tokens = np.random.randint(100, size=(batch_size, prompt_length))
         prompt = [tokenizer.decode(tokens[0])] * batch_size
+    elif args.use_prompt_set:
+        prompt = [get_prompt_by_length(prompt_length)] * batch_size
+        tokens = tokenizer.encode_batch(prompt)
+
+        if tokens.shape[1] > max_length:
+            # Shorten the inputs from (batch_size, tokenized_length) to (batch_size, requested_length)
+            tokens = tokens[:, :max_length]
+        elif tokens.shape[1] < max_length:
+            # Lengthen the inputs from (batch_size, tokenized_length) to (batch_size, requested_length)
+            tokens_first_col = tokens[:, 0:1]
+            for _ in range(max_length - tokens.shape[1]):
+                tokens = np.hstack((tokens_first_col, tokens))
     else:
         prompt = [generate_prompt(model, tokenizer, prompt_length, args.use_graph_capture)] * batch_size
         tokens = tokenizer.encode_batch(prompt)

@@ -416,6 +436,7 @@ def str2strlist(value):
     parser.add_argument('-mn', '--model_name', type=str, default='model_name', help='Model name defined by users')
     parser.add_argument('-pr', '--precision', type=str, default='fp16', help='Model precision for metrics info')
     parser.add_argument('--use_random_tokens', action='store_true', help='Use random tokens instead of generating a prompt')
+    parser.add_argument('--use_prompt_set', action='store_true', help='Use pre-generated prompt set instead of generating a prompt')
     args = parser.parse_args()

     # check max_lengths
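The new `--use_prompt_set` flag routes through `get_prompt_by_length`, which expects a `prompts.json` file keyed by stringified prompt lengths. That file is not part of this diff, so the layout below is an inferred, illustrative sketch of what it would contain:

```python
import json

# Hypothetical prompts.json layout inferred from get_prompt_by_length:
# keys are prompt lengths as strings, values are the pre-defined prompts.
example_prompts = {
    "16": "Briefly explain what an ONNX model is.",
    "32": "Summarize the main differences between CPU and GPU inference for language models.",
}

with open("prompts.json", "w") as f:
    json.dump(example_prompts, f, indent=2)

# get_prompt_by_length(16) would then return example_prompts["16"].
```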
