From e2915b34edb2f46b2fab6079482a62754b1f0500 Mon Sep 17 00:00:00 2001 From: Anthony Casagrande Date: Thu, 6 Nov 2025 09:53:49 -0800 Subject: [PATCH] feat: Add native ollama_generate endpoint type support --- docs/tutorials/ollama-endpoint.md | 233 ++++++++++++++++++++++ src/aiperf/common/enums/plugin_enums.py | 1 + src/aiperf/endpoints/__init__.py | 4 + src/aiperf/endpoints/ollama_generate.py | 118 +++++++++++ tests/endpoints/test_ollama_generate.py | 255 ++++++++++++++++++++++++ 5 files changed, 611 insertions(+) create mode 100644 docs/tutorials/ollama-endpoint.md create mode 100644 src/aiperf/endpoints/ollama_generate.py create mode 100644 tests/endpoints/test_ollama_generate.py diff --git a/docs/tutorials/ollama-endpoint.md b/docs/tutorials/ollama-endpoint.md new file mode 100644 index 000000000..2a114c6d9 --- /dev/null +++ b/docs/tutorials/ollama-endpoint.md @@ -0,0 +1,233 @@ + + +# Ollama Generate Endpoint + +The Ollama generate endpoint enables benchmarking of [Ollama](https://ollama.com/) models using the `/api/generate` endpoint. It supports both streaming and non-streaming text generation with full access to Ollama's configuration options. 
+ +## When to Use + +Use the `ollama_generate` endpoint when: +- Benchmarking models running on Ollama +- You need access to Ollama-specific features like system prompts, JSON formatting, or raw mode +- You want to test Ollama's streaming capabilities + +## Basic Example + +Benchmark an Ollama model with default settings: + +```bash +aiperf profile \ + --model llama2 \ + --url http://localhost:11434 \ + --endpoint-type ollama_generate \ + --synthetic-input-tokens-mean 100 \ + --output-tokens-mean 50 \ + --concurrency 4 \ + --request-count 20 +``` + +## Configuration + +Configure the endpoint using `--extra-inputs` for Ollama-specific options: + +### Top-Level Parameters + +- **`system`**: System prompt to guide model behavior +- **`format`**: Output format (`"json"` or a JSON schema) +- **`raw`**: Skip prompt templating (boolean) +- **`keep_alive`**: Model persistence duration (e.g., `"5m"`, `"1h"`) +- **`images`**: List of base64-encoded images for vision models + +### Model Options + +Pass model parameters using the `options` object: + +- **`temperature`**: Sampling temperature (0.0-2.0) +- **`top_p`**: Nucleus sampling threshold +- **`top_k`**: Top-k sampling limit +- **`seed`**: Random seed for reproducibility +- **`num_ctx`**: Context window size +- **`stop`**: Stop sequences + +## Examples + +### Basic Text Generation + +```bash +aiperf profile \ + --model llama2 \ + --url http://localhost:11434 \ + --endpoint-type ollama_generate \ + --synthetic-input-tokens-mean 200 \ + --output-tokens-mean 100 \ + --concurrency 8 \ + --request-count 50 +``` + +### With System Prompt + +```bash +aiperf profile \ + --model mistral \ + --url http://localhost:11434 \ + --endpoint-type ollama_generate \ + --extra-inputs system:"You are a helpful AI assistant" \ + --synthetic-input-tokens-mean 150 \ + --output-tokens-mean 75 \ + --concurrency 4 \ + --request-count 25 +``` + +### With Model Options + +```bash +aiperf profile \ + --model llama2 \ + --url http://localhost:11434 \ + 
--endpoint-type ollama_generate \ + --extra-inputs options:'{ + "temperature": 0.7, + "top_p": 0.9, + "top_k": 40, + "seed": 42 + }' \ + --synthetic-input-tokens-mean 100 \ + --output-tokens-mean 50 \ + --concurrency 6 \ + --request-count 30 +``` + +### JSON Mode + +Force structured JSON output: + +```bash +aiperf profile \ + --model llama2 \ + --url http://localhost:11434 \ + --endpoint-type ollama_generate \ + --extra-inputs format:json \ + --extra-inputs system:"Return responses as valid JSON" \ + --synthetic-input-tokens-mean 100 \ + --output-tokens-mean 50 \ + --concurrency 4 \ + --request-count 20 +``` + +### Streaming Mode + +Enable streaming for token-by-token generation: + +```bash +aiperf profile \ + --model llama2 \ + --url http://localhost:11434 \ + --endpoint-type ollama_generate \ + --streaming \ + --synthetic-input-tokens-mean 200 \ + --output-tokens-mean 150 \ + --concurrency 2 \ + --request-count 10 +``` + +### With Custom Keep-Alive + +Control how long the model stays in memory: + +```bash +aiperf profile \ + --model codellama \ + --url http://localhost:11434 \ + --endpoint-type ollama_generate \ + --extra-inputs keep_alive:10m \ + --synthetic-input-tokens-mean 500 \ + --output-tokens-mean 200 \ + --concurrency 4 \ + --request-count 15 +``` + +### Vision Model (with Images) + +Benchmark vision-capable models: + +```bash +aiperf profile \ + --model llava \ + --url http://localhost:11434 \ + --endpoint-type ollama_generate \ + --extra-inputs images:'["base64_encoded_image_data"]' \ + --synthetic-input-tokens-mean 100 \ + --output-tokens-mean 50 \ + --concurrency 2 \ + --request-count 10 +``` + +### Complete Configuration + +Combine multiple options: + +```bash +aiperf profile \ + --model mistral \ + --url http://localhost:11434 \ + --endpoint-type ollama_generate \ + --streaming \ + --extra-inputs system:"You are a technical documentation writer" \ + --extra-inputs format:json \ + --extra-inputs keep_alive:5m \ + --extra-inputs options:'{ + 
"temperature": 0.3, + "top_p": 0.95, + "seed": 123, + "num_ctx": 4096 + }' \ + --synthetic-input-tokens-mean 300 \ + --output-tokens-mean 200 \ + --concurrency 4 \ + --request-count 50 +``` + +## Response Handling + +The endpoint automatically: +- Extracts generated text from the `response` field +- Parses token counts when `done: true`: + - `prompt_eval_count` → `prompt_tokens` + - `eval_count` → `completion_tokens` + - Calculates `total_tokens` +- Handles streaming chunks progressively + +## Tips + +- **Use `--streaming`** to benchmark Ollama's streaming performance +- **Set `keep_alive`** to avoid model reload overhead between requests +- **Use `format:json`** with a system prompt for structured output +- **Set `raw:true`** to skip Ollama's automatic prompt templating +- **Use `-v` or `-vv`** to see detailed request/response logs +- **Check the run's output directory under `artifacts/`** for detailed metrics + +## Troubleshooting + +**Model not responding** +- Verify Ollama is running: `ollama list` +- Check the base URL is correct (default: `http://localhost:11434`) + +**Slow performance** +- Increase `keep_alive` to keep the model in memory +- Reduce concurrency if you're hitting resource limits + +**Invalid JSON responses** +- Add a system prompt when using `format:json` +- Not all models support JSON mode equally well + +**Token counts missing** +- Token counts only appear in the final response when `done: true` +- Check the model supports token counting + +## API Reference + +For complete Ollama API documentation, see: +- [Ollama Generate API](https://docs.ollama.com/api/generate) diff --git a/src/aiperf/common/enums/plugin_enums.py b/src/aiperf/common/enums/plugin_enums.py index 0264fd07f..06f283040 100644 --- a/src/aiperf/common/enums/plugin_enums.py +++ b/src/aiperf/common/enums/plugin_enums.py @@ -30,6 +30,7 @@ class EndpointType(CaseInsensitiveStrEnum): HF_TEI_RANKINGS = "hf_tei_rankings" HUGGINGFACE_GENERATE = "huggingface_generate" NIM_RANKINGS = "nim_rankings" + OLLAMA_GENERATE = 
@implements_protocol(EndpointProtocol)
@EndpointFactory.register(EndpointType.OLLAMA_GENERATE)
class OllamaGenerateEndpoint(BaseEndpoint):
    """Ollama Generate endpoint.

    Supports both streaming and non-streaming text generation using Ollama's
    /api/generate endpoint. This endpoint is designed for single-turn text
    generation with optional system prompts and advanced parameters.
    """

    @classmethod
    def metadata(cls) -> EndpointMetadata:
        """Return Ollama Generate endpoint metadata."""
        return EndpointMetadata(
            endpoint_path="/api/generate",
            supports_streaming=True,
            produces_tokens=True,
            tokenizes_input=True,
            metrics_title="LLM Metrics",
        )

    def format_payload(self, request_info: RequestInfo) -> dict[str, Any]:
        """Format payload for Ollama Generate request.

        Only the first turn is used. All non-empty text contents of that turn
        are joined with single spaces to form the prompt.

        Args:
            request_info: Request context including model endpoint, metadata, and turns

        Returns:
            Ollama Generate API payload

        Raises:
            ValueError: If ``request_info`` carries no turns.
        """
        if not request_info.turns:
            raise ValueError("Ollama Generate endpoint requires at least one turn.")

        turn = request_info.turns[0]
        model_endpoint = request_info.model_endpoint

        prompt = " ".join(
            content for text in turn.texts for content in text.contents if content
        )

        payload: dict[str, Any] = {
            "model": turn.model or model_endpoint.primary_model_name,
            "prompt": prompt,
            "stream": model_endpoint.endpoint.streaming,
        }

        # Ollama takes the generation length limit as options.num_predict.
        if turn.max_tokens is not None:
            payload.setdefault("options", {})["num_predict"] = turn.max_tokens

        if model_endpoint.endpoint.extra:
            # Work on a copy so the shared endpoint configuration is not mutated.
            extra = dict(model_endpoint.endpoint.extra)
            # "options" is merged key by key instead of replaced wholesale, so
            # a num_predict derived from max_tokens survives unless the user
            # sets num_predict explicitly. A None value is treated as "no options".
            extra_options = extra.pop("options", None) or {}

            # Remaining extras (system, format, raw, keep_alive, images, ...)
            # are top-level request fields and may override the base keys.
            payload.update(extra)

            if extra_options:
                payload.setdefault("options", {}).update(extra_options)

        self.debug(lambda: f"Formatted Ollama Generate payload: {payload}")
        return payload

    def parse_response(
        self, response: InferenceServerResponse
    ) -> ParsedResponse | None:
        """Parse Ollama Generate response.

        Handles both streaming and non-streaming modes. In streaming mode,
        each chunk contains incremental response text, and the final chunk
        (``done: true``) typically has an empty ``response`` while carrying
        the token counts. That final chunk is therefore still parsed for
        usage instead of being discarded for having no text.

        Args:
            response: Raw response from inference server

        Returns:
            Parsed response with extracted text and/or usage data, or None
            when the response has no JSON body or carries neither text nor
            usage.
        """
        json_obj = response.get_json()
        if not json_obj:
            return None

        text = json_obj.get("response")
        usage = self._extract_usage(json_obj)

        if not text and usage is None:
            self.debug(lambda: f"No 'response' field in Ollama response: {json_obj}")
            return None

        # NOTE(review): a usage-only chunk is emitted with data=None; this
        # assumes ParsedResponse accepts a missing data payload - confirm.
        data = self.make_text_response_data(text) if text else None

        return ParsedResponse(perf_ns=response.perf_ns, data=data, usage=usage)

    @staticmethod
    def _extract_usage(json_obj: dict[str, Any]) -> dict[str, Any] | None:
        """Build a usage dict from the token counts of a ``done`` response.

        Returns None unless ``done`` is truthy and at least one of
        ``prompt_eval_count`` / ``eval_count`` is present. ``total_tokens`` is
        only added when both counts are available.
        """
        if not json_obj.get("done"):
            return None

        prompt_eval_count = json_obj.get("prompt_eval_count")
        eval_count = json_obj.get("eval_count")
        if prompt_eval_count is None and eval_count is None:
            return None

        usage: dict[str, Any] = {
            "prompt_tokens": prompt_eval_count,
            "completion_tokens": eval_count,
        }
        if prompt_eval_count is not None and eval_count is not None:
            usage["total_tokens"] = prompt_eval_count + eval_count
        return usage
class TestOllamaGenerateEndpoint:
    """Unit tests for OllamaGenerateEndpoint."""

    @staticmethod
    def _build_payload(endpoint, model_endpoint, texts, **turn_fields):
        """Wrap ``texts`` in a single-turn request and return its formatted payload."""
        single_turn = Turn(texts=texts, **turn_fields)
        info = RequestInfo(model_endpoint=model_endpoint, turns=[single_turn])
        return endpoint.format_payload(info)

    @staticmethod
    def _fake_response(body, perf_ns=None):
        """Create a mocked server response whose ``get_json()`` yields ``body``."""
        fake = Mock(spec=InferenceServerResponse)
        fake.get_json.return_value = body
        if perf_ns is not None:
            fake.perf_ns = perf_ns
        return fake

    @pytest.fixture
    def model_endpoint(self):
        """Model endpoint info for a local Ollama server serving ``llama2``."""
        return ModelEndpointInfo(
            models=ModelListInfo(
                models=[ModelInfo(name="llama2")],
                model_selection_strategy=ModelSelectionStrategy.RANDOM,
            ),
            endpoint=EndpointInfo(
                type=EndpointType.OLLAMA_GENERATE,
                base_url="http://localhost:11434",
                custom_endpoint=None,
            ),
        )

    @pytest.fixture
    def endpoint(self, model_endpoint):
        """Endpoint under test with logging and response-data creation mocked."""
        instance = OllamaGenerateEndpoint(model_endpoint)
        instance.debug = Mock()
        instance.make_text_response_data = Mock(return_value={"text": "parsed"})
        return instance

    def test_metadata_values(self):
        """Metadata advertises the /api/generate path and token-producing LLM traits."""
        metadata = OllamaGenerateEndpoint.metadata()
        assert isinstance(metadata, EndpointMetadata)
        assert metadata.endpoint_path == "/api/generate"
        assert metadata.supports_streaming
        assert metadata.produces_tokens
        assert metadata.tokenizes_input
        assert metadata.metrics_title == "LLM Metrics"

    def test_format_payload_basic(self, endpoint, model_endpoint):
        """A plain turn yields model, prompt and a non-streaming flag."""
        body = self._build_payload(
            endpoint, model_endpoint, [{"contents": ["Hello world"]}]
        )
        assert body["model"] == "llama2"
        assert body["prompt"] == "Hello world"
        assert body["stream"] is False

    def test_format_payload_with_streaming(self, endpoint, model_endpoint):
        """The endpoint's streaming setting is forwarded as ``stream``."""
        model_endpoint.endpoint.streaming = True
        body = self._build_payload(
            endpoint, model_endpoint, [{"contents": ["Hi there"]}]
        )
        assert body["stream"] is True

    def test_format_payload_with_max_tokens(self, endpoint, model_endpoint):
        """``max_tokens`` on the turn becomes ``options.num_predict``."""
        body = self._build_payload(
            endpoint, model_endpoint, [{"contents": ["test"]}], max_tokens=100
        )
        assert body["options"]["num_predict"] == 100

    def test_format_payload_with_system_prompt(self, endpoint, model_endpoint):
        """``system`` is a top-level field, not a model option."""
        model_endpoint.endpoint.extra = {"system": "You are a helpful assistant"}
        body = self._build_payload(endpoint, model_endpoint, [{"contents": ["test"]}])
        assert body["system"] == "You are a helpful assistant"
        assert "system" not in body.get("options", {})

    def test_format_payload_with_format(self, endpoint, model_endpoint):
        """``format`` is a top-level field, not a model option."""
        model_endpoint.endpoint.extra = {"format": "json"}
        body = self._build_payload(endpoint, model_endpoint, [{"contents": ["test"]}])
        assert body["format"] == "json"
        assert "format" not in body.get("options", {})

    def test_format_payload_with_options(self, endpoint, model_endpoint):
        """Every entry of the extra ``options`` object reaches the payload."""
        model_endpoint.endpoint.extra = {
            "options": {
                "temperature": 0.7,
                "top_p": 0.9,
                "top_k": 40,
                "seed": 42,
            }
        }
        body = self._build_payload(endpoint, model_endpoint, [{"contents": ["test"]}])
        merged = body["options"]
        assert merged["temperature"] == 0.7
        assert merged["top_p"] == 0.9
        assert merged["top_k"] == 40
        assert merged["seed"] == 42

    def test_format_payload_with_max_tokens_and_options(self, endpoint, model_endpoint):
        """``num_predict`` and user-supplied options coexist in ``options``."""
        model_endpoint.endpoint.extra = {"options": {"temperature": 0.8}}
        body = self._build_payload(
            endpoint, model_endpoint, [{"contents": ["test"]}], max_tokens=100
        )
        assert body["options"]["num_predict"] == 100
        assert body["options"]["temperature"] == 0.8

    def test_format_payload_with_raw_flag(self, endpoint, model_endpoint):
        """``raw`` is a top-level field, not a model option."""
        model_endpoint.endpoint.extra = {"raw": True}
        body = self._build_payload(endpoint, model_endpoint, [{"contents": ["test"]}])
        assert body["raw"] is True
        assert "raw" not in body.get("options", {})

    def test_format_payload_with_keep_alive(self, endpoint, model_endpoint):
        """``keep_alive`` is a top-level field, not a model option."""
        model_endpoint.endpoint.extra = {"keep_alive": "5m"}
        body = self._build_payload(endpoint, model_endpoint, [{"contents": ["test"]}])
        assert body["keep_alive"] == "5m"
        assert "keep_alive" not in body.get("options", {})

    def test_format_payload_with_images(self, endpoint, model_endpoint):
        """``images`` is a top-level field, not a model option."""
        model_endpoint.endpoint.extra = {"images": ["base64_image_data"]}
        body = self._build_payload(endpoint, model_endpoint, [{"contents": ["test"]}])
        assert body["images"] == ["base64_image_data"]
        assert "images" not in body.get("options", {})

    def test_format_payload_multiple_texts(self, endpoint, model_endpoint):
        """Contents across all texts are space-joined into one prompt."""
        body = self._build_payload(
            endpoint,
            model_endpoint,
            [{"contents": ["Hello", "world"]}, {"contents": ["test"]}],
        )
        assert body["prompt"] == "Hello world test"

    def test_format_payload_custom_model(self, endpoint, model_endpoint):
        """A model named on the turn wins over the endpoint's primary model."""
        body = self._build_payload(
            endpoint, model_endpoint, [{"contents": ["test"]}], model="mistral"
        )
        assert body["model"] == "mistral"

    def test_format_payload_no_turns_raises(self, endpoint, model_endpoint):
        """Formatting a request without turns is rejected."""
        empty_request = RequestInfo(model_endpoint=model_endpoint, turns=[])
        with pytest.raises(ValueError, match="requires at least one turn"):
            endpoint.format_payload(empty_request)

    def test_parse_response_basic(self, endpoint):
        """The ``response`` text is handed to ``make_text_response_data``."""
        fake = self._fake_response(
            {"model": "llama2", "response": "Hello!", "done": False},
            perf_ns=123,
        )
        parsed = endpoint.parse_response(fake)
        assert isinstance(parsed, ParsedResponse)
        endpoint.make_text_response_data.assert_called_once_with("Hello!")

    def test_parse_response_with_done_and_usage(self, endpoint):
        """A finished response with both counts yields complete usage."""
        fake = self._fake_response(
            {
                "model": "llama2",
                "response": "Complete response",
                "done": True,
                "prompt_eval_count": 10,
                "eval_count": 20,
            },
            perf_ns=456,
        )
        parsed = endpoint.parse_response(fake)
        assert isinstance(parsed, ParsedResponse)
        assert parsed.usage.prompt_tokens == 10
        assert parsed.usage.completion_tokens == 20
        assert parsed.usage.total_tokens == 30

    def test_parse_response_done_without_token_counts(self, endpoint):
        """A finished response without counts carries no usage."""
        fake = self._fake_response(
            {"model": "llama2", "response": "Done", "done": True},
            perf_ns=789,
        )
        parsed = endpoint.parse_response(fake)
        assert isinstance(parsed, ParsedResponse)
        assert parsed.usage is None

    def test_parse_response_streaming_chunk(self, endpoint):
        """An intermediate streaming chunk yields text but no usage."""
        fake = self._fake_response(
            {"model": "llama2", "response": "Hi", "done": False},
            perf_ns=111,
        )
        parsed = endpoint.parse_response(fake)
        assert isinstance(parsed, ParsedResponse)
        endpoint.make_text_response_data.assert_called_once_with("Hi")
        assert parsed.usage is None

    def test_parse_response_no_response_field(self, endpoint):
        """A body without ``response`` is skipped and logged at debug level."""
        fake = self._fake_response({"model": "llama2", "done": False}, perf_ns=222)
        parsed = endpoint.parse_response(fake)
        assert parsed is None
        endpoint.debug.assert_called()

    def test_parse_response_empty_json(self, endpoint):
        """A response without a JSON body parses to None."""
        fake = self._fake_response(None)
        parsed = endpoint.parse_response(fake)
        assert parsed is None

    def test_parse_response_with_partial_usage(self, endpoint):
        """Only the counts that are present are reported."""
        fake = self._fake_response(
            {
                "model": "llama2",
                "response": "test",
                "done": True,
                "prompt_eval_count": 5,
            },
            perf_ns=333,
        )
        parsed = endpoint.parse_response(fake)
        assert isinstance(parsed, ParsedResponse)
        assert parsed.usage.prompt_tokens == 5
        assert parsed.usage.completion_tokens is None