From e2915b34edb2f46b2fab6079482a62754b1f0500 Mon Sep 17 00:00:00 2001
From: Anthony Casagrande <acasagrande@nvidia.com>
Date: Thu, 6 Nov 2025 09:53:49 -0800
Subject: [PATCH] feat: Add native ollama_generate endpoint type support

---
 docs/tutorials/ollama-endpoint.md       | 233 ++++++++++++++++++++++
 src/aiperf/common/enums/plugin_enums.py |   1 +
 src/aiperf/endpoints/__init__.py        |   4 +
 src/aiperf/endpoints/ollama_generate.py | 118 +++++++++++
 tests/endpoints/test_ollama_generate.py | 255 ++++++++++++++++++++++++
 5 files changed, 611 insertions(+)
 create mode 100644 docs/tutorials/ollama-endpoint.md
 create mode 100644 src/aiperf/endpoints/ollama_generate.py
 create mode 100644 tests/endpoints/test_ollama_generate.py

diff --git a/docs/tutorials/ollama-endpoint.md b/docs/tutorials/ollama-endpoint.md
new file mode 100644
index 000000000..2a114c6d9
--- /dev/null
+++ b/docs/tutorials/ollama-endpoint.md
@@ -0,0 +1,233 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+-->
+
+# Ollama Generate Endpoint
+
+The Ollama generate endpoint enables benchmarking of [Ollama](https://ollama.com/) models using the `/api/generate` endpoint. It supports both streaming and non-streaming text generation with full access to Ollama's configuration options.
+
+## When to Use
+
+Use the `ollama_generate` endpoint when:
+- You are benchmarking models running on Ollama
+- You need access to Ollama-specific features like system prompts, JSON formatting, or raw mode
+- You want to test Ollama's streaming capabilities
+
+## Basic Example
+
+Benchmark an Ollama model with default settings:
+
+```bash
+aiperf profile \
+  --model llama2 \
+  --url http://localhost:11434 \
+  --endpoint-type ollama_generate \
+  --synthetic-input-tokens-mean 100 \
+  --output-tokens-mean 50 \
+  --concurrency 4 \
+  --request-count 20
+```
+
+## Configuration
+
+Configure the endpoint using `--extra-inputs` for Ollama-specific options:
+
+### Top-Level Parameters
+
+- **`system`**: System prompt to guide model behavior
+- **`format`**: Output format (`"json"` or a JSON schema)
+- **`raw`**: Skip prompt templating (boolean)
+- **`keep_alive`**: Model persistence duration (e.g., `"5m"`, `"1h"`)
+- **`images`**: List of base64-encoded images for vision models
+
+### Model Options
+
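+Pass model parameters using the `options` object. It is merged with the `num_predict` limit AIPerf sets from each turn's `max_tokens`, and values you supply take precedence: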
+
+- **`temperature`**: Sampling temperature (0.0-2.0)
+- **`top_p`**: Nucleus sampling threshold
+- **`top_k`**: Top-k sampling limit
+- **`seed`**: Random seed for reproducibility
+- **`num_ctx`**: Context window size
+- **`stop`**: Stop sequences
+
+## Examples
+
+### Basic Text Generation
+
+```bash
+aiperf profile \
+  --model llama2 \
+  --url http://localhost:11434 \
+  --endpoint-type ollama_generate \
+  --synthetic-input-tokens-mean 200 \
+  --output-tokens-mean 100 \
+  --concurrency 8 \
+  --request-count 50
+```
+
+### With System Prompt
+
+```bash
+aiperf profile \
+  --model mistral \
+  --url http://localhost:11434 \
+  --endpoint-type ollama_generate \
+  --extra-inputs system:"You are a helpful AI assistant" \
+  --synthetic-input-tokens-mean 150 \
+  --output-tokens-mean 75 \
+  --concurrency 4 \
+  --request-count 25
+```
+
+### With Model Options
+
+```bash
+aiperf profile \
+  --model llama2 \
+  --url http://localhost:11434 \
+  --endpoint-type ollama_generate \
+  --extra-inputs options:'{
+    "temperature": 0.7,
+    "top_p": 0.9,
+    "top_k": 40,
+    "seed": 42
+  }' \
+  --synthetic-input-tokens-mean 100 \
+  --output-tokens-mean 50 \
+  --concurrency 6 \
+  --request-count 30
+```
+
+### JSON Mode
+
+Force structured JSON output:
+
+```bash
+aiperf profile \
+  --model llama2 \
+  --url http://localhost:11434 \
+  --endpoint-type ollama_generate \
+  --extra-inputs format:json \
+  --extra-inputs system:"Return responses as valid JSON" \
+  --synthetic-input-tokens-mean 100 \
+  --output-tokens-mean 50 \
+  --concurrency 4 \
+  --request-count 20
+```
+
+### Streaming Mode
+
+Enable streaming for token-by-token generation:
+
+```bash
+aiperf profile \
+  --model llama2 \
+  --url http://localhost:11434 \
+  --endpoint-type ollama_generate \
+  --streaming \
+  --synthetic-input-tokens-mean 200 \
+  --output-tokens-mean 150 \
+  --concurrency 2 \
+  --request-count 10
+```
+
+### With Custom Keep-Alive
+
+Control how long the model stays in memory:
+
+```bash
+aiperf profile \
+  --model codellama \
+  --url http://localhost:11434 \
+  --endpoint-type ollama_generate \
+  --extra-inputs keep_alive:10m \
+  --synthetic-input-tokens-mean 500 \
+  --output-tokens-mean 200 \
+  --concurrency 4 \
+  --request-count 15
+```
+
+### Vision Model (with Images)
+
+Benchmark vision-capable models:
+
+```bash
+aiperf profile \
+  --model llava \
+  --url http://localhost:11434 \
+  --endpoint-type ollama_generate \
+  --extra-inputs images:'["base64_encoded_image_data"]' \
+  --synthetic-input-tokens-mean 100 \
+  --output-tokens-mean 50 \
+  --concurrency 2 \
+  --request-count 10
+```
+
+### Complete Configuration
+
+Combine multiple options:
+
+```bash
+aiperf profile \
+  --model mistral \
+  --url http://localhost:11434 \
+  --endpoint-type ollama_generate \
+  --streaming \
+  --extra-inputs system:"You are a technical documentation writer" \
+  --extra-inputs format:json \
+  --extra-inputs keep_alive:5m \
+  --extra-inputs options:'{
+    "temperature": 0.3,
+    "top_p": 0.95,
+    "seed": 123,
+    "num_ctx": 4096
+  }' \
+  --synthetic-input-tokens-mean 300 \
+  --output-tokens-mean 200 \
+  --concurrency 4 \
+  --request-count 50
+```
+
+## Response Handling
+
+The endpoint automatically:
+- Extracts generated text from the `response` field
+- Parses token counts when `done: true`:
+  - `prompt_eval_count` → `prompt_tokens`
+  - `eval_count` → `completion_tokens`
+  - Calculates `total_tokens` as their sum (only when both counts are present)
+- Handles streaming chunks progressively
+
+## Tips
+
+- **Use `--streaming`** to benchmark Ollama's streaming performance
+- **Set `keep_alive`** to avoid model reload overhead between requests
+- **Use `format:json`** with a system prompt for structured output
+- **Set `raw:true`** to skip Ollama's automatic prompt templating
+- **Use `-v` or `-vv`** to see detailed request/response logs
+- **Check `artifacts/<run-name>/`** for detailed metrics
+
+## Troubleshooting
+
+**Model not responding**
+- Verify Ollama is running: `ollama list`
+- Check the base URL is correct (default: `http://localhost:11434`)
+
+**Slow performance**
+- Increase `keep_alive` to keep the model in memory
+- Reduce concurrency if you're hitting resource limits
+
+**Invalid JSON responses**
+- Add a system prompt when using `format:json`
+- Not all models support JSON mode equally well
+
+**Token counts missing**
+- Token counts only appear in the final response when `done: true`
+- Check the model supports token counting
+
+## API Reference
+
+For complete Ollama API documentation, see:
+- [Ollama Generate API](https://docs.ollama.com/api/generate)
diff --git a/src/aiperf/common/enums/plugin_enums.py b/src/aiperf/common/enums/plugin_enums.py
index 0264fd07f..06f283040 100644
--- a/src/aiperf/common/enums/plugin_enums.py
+++ b/src/aiperf/common/enums/plugin_enums.py
@@ -30,6 +30,7 @@ class EndpointType(CaseInsensitiveStrEnum):
     HF_TEI_RANKINGS = "hf_tei_rankings"
     HUGGINGFACE_GENERATE = "huggingface_generate"
     NIM_RANKINGS = "nim_rankings"
+    OLLAMA_GENERATE = "ollama_generate"
     SOLIDO_RAG = "solido_rag"
     TEMPLATE = "template"
 
diff --git a/src/aiperf/endpoints/__init__.py b/src/aiperf/endpoints/__init__.py
index b81e39f88..0eed7dbc1 100644
--- a/src/aiperf/endpoints/__init__.py
+++ b/src/aiperf/endpoints/__init__.py
@@ -19,6 +19,9 @@
 from aiperf.endpoints.nim_rankings import (
     NIMRankingsEndpoint,
 )
+from aiperf.endpoints.ollama_generate import (
+    OllamaGenerateEndpoint,
+)
 from aiperf.endpoints.openai_chat import (
     ChatEndpoint,
 )
@@ -45,6 +48,7 @@
     "HFTeiRankingsEndpoint",
     "HuggingFaceGenerateEndpoint",
     "NIMRankingsEndpoint",
+    "OllamaGenerateEndpoint",
     "SolidoEndpoint",
     "TemplateEndpoint",
 ]
diff --git a/src/aiperf/endpoints/ollama_generate.py b/src/aiperf/endpoints/ollama_generate.py
new file mode 100644
index 000000000..7c8e3755a
--- /dev/null
+++ b/src/aiperf/endpoints/ollama_generate.py
@@ -0,0 +1,118 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from typing import Any
+
+from aiperf.common.decorators import implements_protocol
+from aiperf.common.enums import EndpointType
+from aiperf.common.factories import EndpointFactory
+from aiperf.common.models import ParsedResponse
+from aiperf.common.models.metadata import EndpointMetadata
+from aiperf.common.models.record_models import RequestInfo
+from aiperf.common.protocols import EndpointProtocol, InferenceServerResponse
+from aiperf.endpoints.base_endpoint import BaseEndpoint
+
+
+@implements_protocol(EndpointProtocol)
+@EndpointFactory.register(EndpointType.OLLAMA_GENERATE)
+class OllamaGenerateEndpoint(BaseEndpoint):
+    """Ollama Generate endpoint.
+
+    Supports both streaming and non-streaming text generation using Ollama's
+    /api/generate endpoint. This endpoint is designed for single-turn text
+    generation with optional system prompts and advanced parameters.
+    """
+
+    @classmethod
+    def metadata(cls) -> EndpointMetadata:
+        """Return Ollama Generate endpoint metadata."""
+        return EndpointMetadata(
+            endpoint_path="/api/generate",
+            supports_streaming=True,
+            produces_tokens=True,
+            tokenizes_input=True,
+            metrics_title="LLM Metrics",
+        )
+
+    def format_payload(self, request_info: RequestInfo) -> dict[str, Any]:
+        """Format payload for Ollama Generate request.
+
+        Args:
+            request_info: Request context including model endpoint, metadata, and turns
+
+        Returns:
+            Ollama Generate API payload
+        """
+        if not request_info.turns:
+            raise ValueError("Ollama Generate endpoint requires at least one turn.")
+
+        turn = request_info.turns[0]
+        model_endpoint = request_info.model_endpoint
+
+        prompt = " ".join(
+            [content for text in turn.texts for content in text.contents if content]
+        )
+
+        payload: dict[str, Any] = {
+            "model": turn.model or model_endpoint.primary_model_name,
+            "prompt": prompt,
+            "stream": model_endpoint.endpoint.streaming,
+        }
+
+        if turn.max_tokens is not None:
+            payload.setdefault("options", {})["num_predict"] = turn.max_tokens
+
+        if model_endpoint.endpoint.extra:
+            extra = dict(model_endpoint.endpoint.extra)
+            extra_options = extra.pop("options", {})
+
+            payload.update(extra)
+
+            if extra_options:
+                payload.setdefault("options", {}).update(extra_options)
+
+        self.debug(lambda: f"Formatted Ollama Generate payload: {payload}")
+        return payload
+
+    def parse_response(
+        self, response: InferenceServerResponse
+    ) -> ParsedResponse | None:
+        """Parse Ollama Generate response.
+
+        Handles both streaming and non-streaming modes. In streaming mode,
+        each chunk contains incremental response text. In non-streaming mode,
+        the complete response is returned at once.
+
+        Args:
+            response: Raw response from inference server
+
+        Returns:
+            Parsed response with extracted text and usage data
+        """
+        json_obj = response.get_json()
+        if not json_obj:
+            return None
+
+        text = json_obj.get("response")
+        if not text:
+            self.debug(lambda: f"No 'response' field in Ollama response: {json_obj}")
+            return None
+
+        data = self.make_text_response_data(text)
+
+        usage = None
+        if json_obj.get("done"):
+            prompt_eval_count = json_obj.get("prompt_eval_count")
+            eval_count = json_obj.get("eval_count")
+
+            if prompt_eval_count is not None or eval_count is not None:
+                usage = {
+                    "prompt_tokens": prompt_eval_count,
+                    "completion_tokens": eval_count,
+                }
+                if prompt_eval_count is not None and eval_count is not None:
+                    usage["total_tokens"] = prompt_eval_count + eval_count
+
+        return ParsedResponse(perf_ns=response.perf_ns, data=data, usage=usage)
diff --git a/tests/endpoints/test_ollama_generate.py b/tests/endpoints/test_ollama_generate.py
new file mode 100644
index 000000000..cc7aa099f
--- /dev/null
+++ b/tests/endpoints/test_ollama_generate.py
@@ -0,0 +1,255 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from unittest.mock import Mock
+
+import pytest
+
+from aiperf.common.enums import EndpointType, ModelSelectionStrategy
+from aiperf.common.models import ParsedResponse
+from aiperf.common.models.metadata import EndpointMetadata
+from aiperf.common.models.model_endpoint_info import (
+    EndpointInfo,
+    ModelEndpointInfo,
+    ModelInfo,
+    ModelListInfo,
+)
+from aiperf.common.models.record_models import RequestInfo, Turn
+from aiperf.common.protocols import InferenceServerResponse
+from aiperf.endpoints.ollama_generate import OllamaGenerateEndpoint
+
+
+class TestOllamaGenerateEndpoint:
+    """Unit tests for OllamaGenerateEndpoint."""
+
+    @pytest.fixture
+    def model_endpoint(self):
+        endpoint_info = EndpointInfo(
+            type=EndpointType.OLLAMA_GENERATE,
+            base_url="http://localhost:11434",
+            custom_endpoint=None,
+        )
+        model_list = ModelListInfo(
+            models=[ModelInfo(name="llama2")],
+            model_selection_strategy=ModelSelectionStrategy.RANDOM,
+        )
+        return ModelEndpointInfo(models=model_list, endpoint=endpoint_info)
+
+    @pytest.fixture
+    def endpoint(self, model_endpoint):
+        ep = OllamaGenerateEndpoint(model_endpoint)
+        ep.debug = Mock()  # replaced so tests can assert on debug calls
+        ep.make_text_response_data = Mock(return_value={"text": "parsed"})  # stubbed; tests check the argument it receives
+        return ep
+
+    def test_metadata_values(self):
+        meta = OllamaGenerateEndpoint.metadata()
+        assert isinstance(meta, EndpointMetadata)
+        assert meta.endpoint_path == "/api/generate"
+        assert meta.supports_streaming
+        assert meta.produces_tokens
+        assert meta.tokenizes_input
+        assert meta.metrics_title == "LLM Metrics"
+
+    def test_format_payload_basic(self, endpoint, model_endpoint):
+        turn = Turn(texts=[{"contents": ["Hello world"]}])
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["model"] == "llama2"
+        assert payload["prompt"] == "Hello world"
+        assert payload["stream"] is False  # the fixture does not enable streaming
+
+    def test_format_payload_with_streaming(self, endpoint, model_endpoint):
+        model_endpoint.endpoint.streaming = True
+        turn = Turn(texts=[{"contents": ["Hi there"]}])
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["stream"] is True
+
+    def test_format_payload_with_max_tokens(self, endpoint, model_endpoint):
+        turn = Turn(texts=[{"contents": ["test"]}], max_tokens=100)
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["options"]["num_predict"] == 100  # max_tokens is sent as options.num_predict
+
+    def test_format_payload_with_system_prompt(self, endpoint, model_endpoint):
+        model_endpoint.endpoint.extra = {"system": "You are a helpful assistant"}
+        turn = Turn(texts=[{"contents": ["test"]}])
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["system"] == "You are a helpful assistant"
+        assert "system" not in payload.get("options", {})  # stays a top-level key, not a model option
+
+    def test_format_payload_with_format(self, endpoint, model_endpoint):
+        model_endpoint.endpoint.extra = {"format": "json"}
+        turn = Turn(texts=[{"contents": ["test"]}])
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["format"] == "json"
+        assert "format" not in payload.get("options", {})
+
+    def test_format_payload_with_options(self, endpoint, model_endpoint):
+        model_endpoint.endpoint.extra = {
+            "options": {
+                "temperature": 0.7,
+                "top_p": 0.9,
+                "top_k": 40,
+                "seed": 42,
+            }
+        }
+        turn = Turn(texts=[{"contents": ["test"]}])
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["options"]["temperature"] == 0.7
+        assert payload["options"]["top_p"] == 0.9
+        assert payload["options"]["top_k"] == 40
+        assert payload["options"]["seed"] == 42
+
+    def test_format_payload_with_max_tokens_and_options(self, endpoint, model_endpoint):
+        model_endpoint.endpoint.extra = {"options": {"temperature": 0.8}}
+        turn = Turn(texts=[{"contents": ["test"]}], max_tokens=100)
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["options"]["num_predict"] == 100  # num_predict survives the merge with user options
+        assert payload["options"]["temperature"] == 0.8
+
+    def test_format_payload_with_raw_flag(self, endpoint, model_endpoint):
+        model_endpoint.endpoint.extra = {"raw": True}
+        turn = Turn(texts=[{"contents": ["test"]}])
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["raw"] is True
+        assert "raw" not in payload.get("options", {})
+
+    def test_format_payload_with_keep_alive(self, endpoint, model_endpoint):
+        model_endpoint.endpoint.extra = {"keep_alive": "5m"}
+        turn = Turn(texts=[{"contents": ["test"]}])
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["keep_alive"] == "5m"
+        assert "keep_alive" not in payload.get("options", {})
+
+    def test_format_payload_with_images(self, endpoint, model_endpoint):
+        model_endpoint.endpoint.extra = {"images": ["base64_image_data"]}
+        turn = Turn(texts=[{"contents": ["test"]}])
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["images"] == ["base64_image_data"]
+        assert "images" not in payload.get("options", {})
+
+    def test_format_payload_multiple_texts(self, endpoint, model_endpoint):
+        turn = Turn(texts=[{"contents": ["Hello", "world"]}, {"contents": ["test"]}])
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["prompt"] == "Hello world test"  # all contents joined with single spaces
+
+    def test_format_payload_custom_model(self, endpoint, model_endpoint):
+        turn = Turn(texts=[{"contents": ["test"]}], model="mistral")
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[turn])
+
+        payload = endpoint.format_payload(request_info)
+        assert payload["model"] == "mistral"  # the turn's model wins over the endpoint's model
+
+    def test_format_payload_no_turns_raises(self, endpoint, model_endpoint):
+        request_info = RequestInfo(model_endpoint=model_endpoint, turns=[])
+        with pytest.raises(ValueError, match="requires at least one turn"):
+            endpoint.format_payload(request_info)
+
+    def test_parse_response_basic(self, endpoint):
+        response = Mock(spec=InferenceServerResponse)
+        response.get_json.return_value = {
+            "model": "llama2",
+            "response": "Hello!",
+            "done": False,
+        }
+        response.perf_ns = 123
+
+        result = endpoint.parse_response(response)
+        assert isinstance(result, ParsedResponse)
+        endpoint.make_text_response_data.assert_called_once_with("Hello!")
+
+    def test_parse_response_with_done_and_usage(self, endpoint):
+        response = Mock(spec=InferenceServerResponse)
+        response.get_json.return_value = {
+            "model": "llama2",
+            "response": "Complete response",
+            "done": True,
+            "prompt_eval_count": 10,
+            "eval_count": 20,
+        }
+        response.perf_ns = 456
+
+        result = endpoint.parse_response(response)
+        assert isinstance(result, ParsedResponse)
+        assert result.usage.prompt_tokens == 10
+        assert result.usage.completion_tokens == 20
+        assert result.usage.total_tokens == 30  # sum of the two counts above
+
+    def test_parse_response_done_without_token_counts(self, endpoint):
+        response = Mock(spec=InferenceServerResponse)
+        response.get_json.return_value = {
+            "model": "llama2",
+            "response": "Done",
+            "done": True,
+        }
+        response.perf_ns = 789
+
+        result = endpoint.parse_response(response)
+        assert isinstance(result, ParsedResponse)
+        assert result.usage is None  # done without any counts yields no usage
+
+    def test_parse_response_streaming_chunk(self, endpoint):
+        response = Mock(spec=InferenceServerResponse)
+        response.get_json.return_value = {
+            "model": "llama2",
+            "response": "Hi",
+            "done": False,
+        }
+        response.perf_ns = 111
+
+        result = endpoint.parse_response(response)
+        assert isinstance(result, ParsedResponse)
+        endpoint.make_text_response_data.assert_called_once_with("Hi")
+        assert result.usage is None  # a non-final chunk carries no usage
+
+    def test_parse_response_no_response_field(self, endpoint):
+        response = Mock(spec=InferenceServerResponse)
+        response.get_json.return_value = {"model": "llama2", "done": False}
+        response.perf_ns = 222
+
+        result = endpoint.parse_response(response)
+        assert result is None
+        endpoint.debug.assert_called()  # the missing field is reported via debug
+
+    def test_parse_response_empty_json(self, endpoint):
+        response = Mock(spec=InferenceServerResponse)
+        response.get_json.return_value = None
+
+        result = endpoint.parse_response(response)
+        assert result is None
+
+    def test_parse_response_with_partial_usage(self, endpoint):
+        response = Mock(spec=InferenceServerResponse)
+        response.get_json.return_value = {
+            "model": "llama2",
+            "response": "test",
+            "done": True,
+            "prompt_eval_count": 5,
+        }
+        response.perf_ns = 333
+
+        result = endpoint.parse_response(response)
+        assert isinstance(result, ParsedResponse)
+        assert result.usage.prompt_tokens == 5
+        assert result.usage.completion_tokens is None  # the absent eval_count stays None