diff --git a/declarative_poc/llmdbench/__init__.py b/declarative_poc/llmdbench/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/declarative_poc/llmdbench/llmdbench/__init__.py b/declarative_poc/llmdbench/llmdbench/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/declarative_poc/llmdbench/llmdbench/cli.py b/declarative_poc/llmdbench/llmdbench/cli.py
new file mode 100644
index 00000000..112f65e9
--- /dev/null
+++ b/declarative_poc/llmdbench/llmdbench/cli.py
@@ -0,0 +1,131 @@
+from llmdbench.parser.systemparser import SystemParser
+from llmdbench.logging.logger import get_logger, set_stage
+
+import os
+import argparse
+import yaml
+
+
+def cli():
+    """
+    Command-line interface for llmdbench.
+
+    Subcommands:
+    - plan: Merge and render YAMLs (previously 'configure')
+    - prepare: Prepare environment or data before execution
+    - execute: Run workloads or apply configurations
+    - destroy: Clean up or rollback resources
+    - report: Generate summary or benchmark reports
+    """
+    logger = get_logger("llmdbench.cli")
+
+    parser = argparse.ArgumentParser(
+        prog="llmdbench",
+        description="Manage and benchmark llmd configurations.",
+    )
+
+    parser.add_argument(
+        "--workspace",
+        default=".",
+        help="Workspace directory used as the root for configs, outputs, etc.",
+    )
+
+    # Every subcommand needs the experiment context to build a plan, so this
+    # lives on the top-level parser rather than on a single subparser.
+    parser.add_argument(
+        "--experiment",
+        required=True,
+        help="Path to the experiment file describing values, templates, and scenario.",
+    )
+
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    # --------------------------
+    # plan
+    # --------------------------
+    subparsers.add_parser(
+        "plan",
+        help="Merge charts/images and render templates into a versioned YAML plan.",
+    )
+
+    # --------------------------
+    # prepare
+    # --------------------------
+    prepare_parser = subparsers.add_parser(
+        "prepare", help="Prepare the environment or dependencies for execution."
+    )
+    prepare_parser.add_argument(
+        "--config", required=False, help="Optional path to configuration YAML."
+    )
+
+    # --------------------------
+    # execute
+    # --------------------------
+    execute_parser = subparsers.add_parser(
+        "execute", help="Execute the benchmark or deployment defined in the plan."
+    )
+    execute_parser.add_argument(
+        "--plan", required=True, help="Path to the planned YAML configuration."
+    )
+
+    # --------------------------
+    # destroy
+    # --------------------------
+    destroy_parser = subparsers.add_parser(
+        "destroy", help="Tear down or rollback any created resources."
+    )
+    destroy_parser.add_argument(
+        "--plan", required=False, help="Path to the plan used for deployment."
+    )
+
+    # --------------------------
+    # report
+    # --------------------------
+    report_parser = subparsers.add_parser(
+        "report", help="Generate a report or analysis from execution results."
+    )
+    report_parser.add_argument(
+        "--input", required=False, help="Path to execution results or metrics."
+    )
+    report_parser.add_argument(
+        "--output", default="report.yaml", help="Path to save the report output."
+    )
+
+    # --------------------------
+    # Parse and dispatch
+    # --------------------------
+    args = parser.parse_args()
+
+    with open(args.experiment, "r") as f:
+        data = yaml.safe_load(f)
+    default_file = os.path.join(args.workspace, data["values"]["path"].lstrip("/"))
+    template_path = os.path.join(args.workspace, data["templates"]["path"].lstrip("/"))
+    scenario_path = os.path.join(args.workspace, data["scenario"]["path"].lstrip("/"))
+
+    # Regardless of the subcommand we need to create a plan - otherwise we
+    # have no context of what to do - in the future we can "import" a
+    # context to "rerun" a plan.
+    system = SystemParser(default_file, template_path, args.workspace, scenario_path)
+    system.parse()
+
+    if args.command == "plan":
+        set_stage(logger, "🔧 PLAN")
+        logger.info("Creating execution and deployment plan...")
+        system.plan_to_yaml()
+        logger.info(f"Plan saved to {os.path.join(args.workspace, 'sut-plan-1')}")
+    elif args.command == "prepare":
+        set_stage(logger, "🔧 PREPARE")
+        logger.info("Preparing environment...")
+    elif args.command == "execute":
+        set_stage(logger, "🚀 EXECUTE")
+        logger.info(f"Executing plan: {args.plan}")
+    elif args.command == "destroy":
+        set_stage(logger, "🧹 DESTROY")
+        logger.info("Cleaning up resources...")
+    elif args.command == "report":
+        set_stage(logger, "📊 REPORT")
+        logger.info("Generating report...")
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/declarative_poc/llmdbench/llmdbench/logging/logger.py b/declarative_poc/llmdbench/llmdbench/logging/logger.py
new file mode 100644
index 00000000..c37115ce
--- /dev/null
+++ b/declarative_poc/llmdbench/llmdbench/logging/logger.py
@@ -0,0 +1,40 @@
+import logging
+import sys
+
+
+class StageFormatter(logging.Formatter):
+    def __init__(self, stage="RUN", fmt=None, datefmt=None):
+        self.stage = stage
+        super().__init__(fmt=fmt, datefmt=datefmt)
+
+    def format(self, record):
+        record.stage = getattr(record, "stage", self.stage)
+        return super().format(record)
+
+
+def get_logger(name="llmdbench", stage="RUN", level=logging.INFO):
+    logger = logging.getLogger(name)
+
+    if not logger.handlers:
+        handler = logging.StreamHandler(sys.stdout)
+        formatter = StageFormatter(
+            stage=stage,
+            fmt="%(asctime)s - %(levelname)-8s - %(name)s - %(stage)s - %(message)s",
+            datefmt="%H:%M:%S",
+        )
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+        logger.setLevel(level)
+        logger.propagate = False
+    else:
+        for handler in logger.handlers:
+            if isinstance(handler.formatter, StageFormatter):
+                handler.formatter.stage = stage
+
+    return logger
+
+
+def set_stage(logger, stage):
+    for handler in logger.handlers:
+        if isinstance(handler.formatter, StageFormatter):
+            handler.formatter.stage = stage
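A quick sketch of how `StageFormatter` behaves (hypothetical usage, assuming the package layout above is importable): the stage string is stamped into every record until `set_stage` swaps it on the existing handler.

```python
# Hypothetical usage sketch for logger.py.
from llmdbench.logging.logger import get_logger, set_stage

logger = get_logger("llmdbench.demo", stage="BOOT")
logger.info("formatter stamps the current stage into each record")
# -> 12:00:00 - INFO     - llmdbench.demo - BOOT - formatter stamps ...

set_stage(logger, "🔧 PLAN")
logger.info("same handler, new stage")
# -> 12:00:01 - INFO     - llmdbench.demo - 🔧 PLAN - same handler, new stage
```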
+ """ + path = re.sub(r"\[(\d+)\]", r".\1", path) + parts = path.split(".") + + current = data + + for part in parts: + if isinstance(current, dict): + current = current.get(part, {}) + elif isinstance(current, list): + if not part.isdigit(): + return {} + idx = int(part) + if idx < 0 or idx >= len(current): + return {} + current = current[idx] + else: + return {} + + return current + + def _render_values(self, key): + render = self._get_nested(self.defaults, key) + scenarios = self.scenario.get("scenario", []) + for i, _ in enumerate(scenarios): + path = f"scenario.{i}.{key}" + scenario_value = self._get_nested(self.scenario, path) + render = self._deep_merge(render, scenario_value) + + return render + + def _build_indexes(self): + """Build lookup dictionaries for all categories""" + self._indexes = {} + for category in [self._charts_key, self._images_key]: + data = getattr(self, category, {}) + self._indexes[category] = { + item["name"]: item for item in data.get("user-overrides", []) + } + + def _skopeo_list_tags(self, ref): + """ + Call: skopeo list-tags docker://ghcr.io/org/image + Return: list of tags (strings) + """ + try: + cmd = ["skopeo", "list-tags", f"docker://{ref}"] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + data = json.loads(result.stdout) + return data.get("Tags", []) + except Exception as e: + raise RuntimeError(f"Skopeo failed for {ref}: {e}") + + def _is_oci_repo(self, url: str) -> bool: + return url.startswith("oci://") + + def _helm_http_list_versions(self, url, chart_name): + """ + Given a Helm HTTP repo URL and chart name, return list of versions. + Uses index.yaml which lives at: /index.yaml + """ + index_url = url.rstrip("/") + "/index.yaml" + response = requests.get(index_url, timeout=10) + if response.status_code != 200: + raise RuntimeError(f"Failed to fetch {index_url}: {response.status_code}") + index = yaml.safe_load(response.text) + entries = index.get("entries", {}) + if chart_name not in entries: + raise RuntimeError(f"Chart '{chart_name}' not found at {index_url}") + versions = [entry["version"] for entry in entries[chart_name]] + return versions + + def _resolve_chart_auto_versions(self): + items = self.charts.get("user-overrides", []) + for item in items: + if str(item.get("version", "")) != ".auto": + continue + url = item["url"] + name = item["name"] + if self._is_oci_repo(url): + ref = url.replace("oci://", "") + tags = self._skopeo_list_tags(ref) + else: + tags = self._helm_http_list_versions(url, name) + if not tags: + raise RuntimeError(f"No chart versions found for {name}") + tags.sort() + latest = tags[-1] + item["version"] = latest + + def _resolve_image_auto_tags(self): + items = self.images.get("user-overrides", []) + for item in items: + if str(item.get("tag", "")) == ".auto": + registry = item["registry"] + repo = item["repo"] + image = item["image"] + ref = f"{registry}/{repo}/{image}" + tags = self._skopeo_list_tags(ref) + if not tags: + raise RuntimeError(f"No tags found for image {item['name']}") + tags.sort() + latest = tags[-1] + item["tag"] = latest + + def _literal_str_representer(self, dumper, data): + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + + def _convert_multiline_strings(self, obj): + if isinstance(obj, dict): + return {k: self._convert_multiline_strings(v) for k, v in obj.items()} + if isinstance(obj, list): + return [self._convert_multiline_strings(v) for v in obj] + if isinstance(obj, str) and "\n" in obj: + return LiteralStr(obj) + return obj + + def 
get_item_by_name(self, category, name): + """Generic method to get an item by name from any category""" + if category not in self._indexes: + raise ValueError(f"Unknown category: {category}") + return self._indexes[category].get(name) + + def get_chart_by_name(self, name): + return self.get_item_by_name(self._charts_key, name) + + def get_image_by_name(self, name): + return self.get_item_by_name(self._images_key, name) + + def plan_to_dict(self): + return { + self._charts_key: self.charts, + self._images_key: self.images, + self._system_stack_key: self.system_stack, + self._system_prepare_key: self.system_prepare, + self._system_harness_key: self.system_harness, + self._system_components_key: self.system_components, + } + + def plan_to_yaml(self): + plan = self.plan_to_dict() + plan = self._convert_multiline_strings(plan) + dir = abspath(self.workspace) + out_dir = os.path.join(self.workspace, "sut-plan-1") + output_file = os.path.join(out_dir, "sut-plan-1-defaults.yaml") + os.makedirs(dir, exist_ok=True) + print(output_file) + with open(output_file, "w") as f: + yaml.dump( + plan, + f, + default_flow_style=False, + sort_keys=False, + allow_unicode=True, + ) + + def render_components(self): + # Setup Jinja env + env = Environment( + loader=FileSystemLoader(self.templates_dir), + trim_blocks=True, + lstrip_blocks=True, + ) + + plan = self.plan_to_dict() + + # Make sure output directory exists + out_dir = os.path.join(self.workspace, "sut-plan-1") + os.makedirs(out_dir, exist_ok=True) + + # Walk through template directory + for root, _, files in os.walk(self.templates_dir): + for filename in files: + if filename.endswith(".j2"): + template_path = os.path.relpath( + os.path.join(root, filename), self.templates_dir + ) + + template = env.get_template(template_path) + + # Render with dict expansion so Jinja sees top-level keys + rendered = template.render(**plan) + + # Create output filename + output_filename = filename.replace(".yaml.j2", ".yaml") + output_path = os.path.join(out_dir, output_filename) + + with open(output_path, "w") as outfile: + outfile.write(rendered) + + def parse(self): + """Load defaults and apply overrides""" + yaml.add_representer(LiteralStr, self._literal_str_representer) + + self.charts = self._render_values(self._charts_key) + self.images = self._render_values(self._images_key) + self.system_stack = self._render_values(self._system_stack_key) + self.system_prepare = self._render_values(self._system_prepare_key) + self.system_harness = self._render_values(self._system_harness_key) + self.system_components = self._render_values(self._system_components_key) + + self._resolve_chart_auto_versions() + self._resolve_image_auto_tags() + self._build_indexes() + + self.render_components() + + return self.plan_to_dict() diff --git a/declarative_poc/llmdbench/llmdbench/plan/experiment.py b/declarative_poc/llmdbench/llmdbench/plan/experiment.py new file mode 100644 index 00000000..3e818027 --- /dev/null +++ b/declarative_poc/llmdbench/llmdbench/plan/experiment.py @@ -0,0 +1,3 @@ +class Experiment: + def __init__(self): + pass diff --git a/declarative_poc/llmdbench/llmdbench/plan/harness.py b/declarative_poc/llmdbench/llmdbench/plan/harness.py new file mode 100644 index 00000000..274f352c --- /dev/null +++ b/declarative_poc/llmdbench/llmdbench/plan/harness.py @@ -0,0 +1,4 @@ +class Harness: + def __init__(self, runner_name: str, runner_content: dict): + self.name = runner_name + self.runner = runner_content diff --git a/declarative_poc/llmdbench/llmdbench/plan/plan.py 
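The name-keyed list merge is the heart of scenario overriding. A minimal sketch of the behavior, using hypothetical file contents and calling the private helpers directly for illustration:

```python
# Sketch of SystemParser's deep-merge semantics (hypothetical data;
# assumes the llmdbench package is importable).
import os
import tempfile

import yaml
from llmdbench.parser.systemparser import SystemParser

defaults = {"system": {"user-overrides": [
    {"name": "default", "replicas": {"decode": 1, "prefill": 1}}]}}
scenario = {"scenario": [{"name": "sut-1", "system": {"user-overrides": [
    {"name": "default", "replicas": {"decode": 2}}]}}]}

with tempfile.TemporaryDirectory() as ws:
    d, s = os.path.join(ws, "d.yaml"), os.path.join(ws, "s.yaml")
    for path, data in ((d, defaults), (s, scenario)):
        with open(path, "w") as f:
            yaml.safe_dump(data, f)
    sp = SystemParser(d, ws, ws, s)
    merged = sp._render_values("system")
    # List items are matched by "name"; nested dicts merge recursively,
    # so the scenario only has to mention the keys it changes.
    assert merged["user-overrides"][0]["replicas"] == {"decode": 2, "prefill": 1}
    # Dotted lookups also traverse list indexes.
    assert sp._get_nested(merged, "user-overrides.0.name") == "default"
```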
diff --git a/declarative_poc/llmdbench/llmdbench/plan/experiment.py b/declarative_poc/llmdbench/llmdbench/plan/experiment.py
new file mode 100644
index 00000000..3e818027
--- /dev/null
+++ b/declarative_poc/llmdbench/llmdbench/plan/experiment.py
@@ -0,0 +1,3 @@
+class Experiment:
+    def __init__(self):
+        pass
diff --git a/declarative_poc/llmdbench/llmdbench/plan/harness.py b/declarative_poc/llmdbench/llmdbench/plan/harness.py
new file mode 100644
index 00000000..274f352c
--- /dev/null
+++ b/declarative_poc/llmdbench/llmdbench/plan/harness.py
@@ -0,0 +1,4 @@
+class Harness:
+    def __init__(self, runner_name: str, runner_content: dict):
+        self.name = runner_name
+        self.runner = runner_content
diff --git a/declarative_poc/llmdbench/llmdbench/plan/plan.py b/declarative_poc/llmdbench/llmdbench/plan/plan.py
new file mode 100644
index 00000000..1767f422
--- /dev/null
+++ b/declarative_poc/llmdbench/llmdbench/plan/plan.py
@@ -0,0 +1,3 @@
+class Plan:
+    def __init__(self):
+        pass
diff --git a/declarative_poc/llmdbench/llmdbench/plan/system.py b/declarative_poc/llmdbench/llmdbench/plan/system.py
new file mode 100644
index 00000000..0028d443
--- /dev/null
+++ b/declarative_poc/llmdbench/llmdbench/plan/system.py
@@ -0,0 +1,3 @@
+class System:
+    def __init__(self):
+        pass
diff --git a/declarative_poc/llmdbench/pyproject.toml b/declarative_poc/llmdbench/pyproject.toml
new file mode 100644
index 00000000..d8e1509b
--- /dev/null
+++ b/declarative_poc/llmdbench/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "llmdbench"
+version = "0.1.0"
+description = "A library for configuration discovery and benchmarking for llm-d."
+dependencies = [
+    "PyYAML",
+    "Jinja2",
+    "requests",
+    "packaging"
+]
+
+[project.scripts]
+llmdbench = "llmdbench.cli:cli"
diff --git a/declarative_poc/workspace/experiments/inference-scheduling.yaml b/declarative_poc/workspace/experiments/inference-scheduling.yaml
new file mode 100644
index 00000000..89a4e198
--- /dev/null
+++ b/declarative_poc/workspace/experiments/inference-scheduling.yaml
@@ -0,0 +1,6 @@
+values:
+  path: templates/default_system.yaml
+templates:
+  path: templates/
+scenario:
+  path: scenarios/inference-scheduling.yaml
diff --git a/declarative_poc/workspace/experiments/pd-disaggregation.yaml b/declarative_poc/workspace/experiments/pd-disaggregation.yaml
new file mode 100644
index 00000000..e2a105f3
--- /dev/null
+++ b/declarative_poc/workspace/experiments/pd-disaggregation.yaml
@@ -0,0 +1,6 @@
+values:
+  path: templates/default_system.yaml
+templates:
+  path: templates/
+scenario:
+  path: scenarios/pd-disaggregation.yaml
diff --git a/declarative_poc/workspace/scenarios/inference-scheduling.yaml b/declarative_poc/workspace/scenarios/inference-scheduling.yaml
new file mode 100644
index 00000000..a1ed88bf
--- /dev/null
+++ b/declarative_poc/workspace/scenarios/inference-scheduling.yaml
@@ -0,0 +1,40 @@
+scenario:
+  - name: "sut-1"
+    prepare:
+      user-overrides:
+        secrets:
+          - name: llm-d-hf-token
+            secret: HF_TOKEN
+            contents: REPLACE_HF_TOKEN
+        files:
+          - name: llm-d-benchmark-preprocesses
+            path: REPLACE_DIR_PATH
+    system:
+      user-overrides:
+        - name: "default"
+          inference-engine:
+            model:
+              - name: meta-llama/Llama-3.1-8B-Instruct
+                label: .auto
+                maxlen: 16384
+                blocksize: 64
+            replicas:
+              decode: 2
+              prefill: 0
+          volumes:
+            - name: model-storage
+              size: 1Ti
+          command:
+            decode:
+              type: vllmServe
+              args:
+                - "--enforce-eager"
+                - "--block-size"
+                - "64"
+                - "--kv-transfer-config"
+                - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+                - "--disable-log-requests"
+                - "--disable-uvicorn-access-log"
+                - "--max-model-len"
+                - "16384"
diff --git a/declarative_poc/workspace/scenarios/pd-disaggregation.yaml b/declarative_poc/workspace/scenarios/pd-disaggregation.yaml
new file mode 100644
index 00000000..77bba40b
--- /dev/null
+++ b/declarative_poc/workspace/scenarios/pd-disaggregation.yaml
@@ -0,0 +1,65 @@
+scenario:
+  - name: "sut-1"
+    prepare:
+      user-overrides:
+        secrets:
+          - name: llm-d-hf-token
+            secret: HF_TOKEN
+            contents: REPLACE_HF_TOKEN
+    system:
+      user-overrides:
+        - name: "default"
+          inference-engine:
+            model:
+              - name: meta-llama/Llama-3.1-8B-Instruct
+                label: .auto
+                maxlen: 16000
+                blocksize: 128
+            replicas:
+              decode: 2
+              prefill: 2
+            parallelism:
+              decode:
+                tensor: 1
+              prefill:
+                tensor: 1
+            resources:
+              decode:
+                memory: 128Gi
+                cpu: 32
+              prefill:
+                memory: 128Gi
+                cpu: 32
+          volumes:
+            - name: model-storage
+              size: 1Ti
+          command:
+            decode:
+              type: vllmServe
+              args:
+                - "--block-size"
+                - "128"
+                - "--kv-transfer-config"
+                - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+                - "--disable-log-requests"
+                - "--disable-uvicorn-access-log"
+                - "--max-model-len"
+                - "16000"
+            prefill:
+              type: vllmServe
+              args:
+                - "--block-size"
+                - "128"
+                - "--kv-transfer-config"
+                - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+                - "--disable-log-requests"
+                - "--disable-uvicorn-access-log"
+                - "--max-model-len"
+                - "16000"
+    harness:
+      user-overrides:
+        - name: default
+          namespace: harnessns
+          harness:
+            name: vllm-benchmark
+            profile: random_concurrent.yaml
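With the experiment and scenario files above in place, the plan step can be driven programmatically. A sketch of what `llmdbench --workspace declarative_poc/workspace --experiment experiments/inference-scheduling.yaml plan` does under the hood (hypothetical paths; resolving the `.auto` markers needs network access and the skopeo CLI):

```python
# Sketch of the plan flow for the files above; mirrors cli.py.
import os
import yaml

from llmdbench.parser.systemparser import SystemParser

workspace = "declarative_poc/workspace"
with open(os.path.join(workspace, "experiments/inference-scheduling.yaml")) as f:
    exp = yaml.safe_load(f)

# Experiment paths are workspace-relative; lstrip("/") guards against
# accidental absolute paths in the experiment file.
defaults = os.path.join(workspace, exp["values"]["path"].lstrip("/"))
templates = os.path.join(workspace, exp["templates"]["path"].lstrip("/"))
scenario = os.path.join(workspace, exp["scenario"]["path"].lstrip("/"))

system = SystemParser(defaults, templates, workspace, scenario)
system.parse()         # merge defaults + scenario, resolve .auto, render .j2 templates
system.plan_to_yaml()  # writes <workspace>/sut-plan-1/sut-plan-1-defaults.yaml
```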
diff --git a/declarative_poc/workspace/templates/default_system.yaml b/declarative_poc/workspace/templates/default_system.yaml
new file mode 100644
index 00000000..42106dd3
--- /dev/null
+++ b/declarative_poc/workspace/templates/default_system.yaml
@@ -0,0 +1,350 @@
+images:
+  user-overrides:
+    - name: llm-d-benchmark
+      registry: ghcr.io
+      repo: llm-d
+      image: llm-d-benchmark
+      tag: .auto # list all tags, pick the latest
+    - name: llm-d
+      registry: ghcr.io
+      repo: llm-d
+      image: llm-d-cuda
+      tag: .auto # list all tags, pick the latest
+    - name: llm-d-model-service
+      registry: ghcr.io
+      repo: llm-d
+      image: llm-d-model-service
+      tag: .auto # list all tags, pick the latest
+    - name: llm-d-inference-scheduler
+      registry: ghcr.io
+      repo: llm-d
+      image: llm-d-inference-scheduler
+      tag: .auto # list all tags, pick the latest
+    - name: llm-d-routing-sidecar
+      registry: ghcr.io
+      repo: llm-d
+      image: llm-d-routing-sidecar
+      tag: .auto # list all tags, pick the latest
+    - name: llm-d-inference-sim
+      registry: ghcr.io
+      repo: llm-d
+      image: llm-d-inference-sim
+      tag: .auto # list all tags, pick the latest
+    - name: vllm
+      registry: docker.io
+      repo: vllm
+      image: vllm-openai
+      tag: latest
+
+charts:
+  user-overrides:
+    - name: kgateway-crds
+      url: oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds
+      version: 2.0.3
+    - name: kgateway
+      url: oci://cr.kgateway.dev/kgateway-dev/charts/kgateway
+      version: 2.0.3
+    - name: istio
+      url: oci://gcr.io/istio-testing/charts
+      version: 1.28-alpha.89f30b26ba71bf5e538083a4720d0bc2d8c06401
+    - name: llm-d-infra
+      url: https://llm-d-incubation.github.io/llm-d-infra
+      version: 1.3.0
+    - name: gateway-api-inference-extension
+      url: oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+      version: .auto
+    - name: llm-d-modelservice
+      url: https://llm-d-incubation.github.io/llm-d-modelservice/
+      version: .auto
+
+prepare:
+  user-overrides:
+    gateway:
+      provider:
+        - name: kgateway
+          charts:
+            - name: kgateway-crds
+            - name: kgateway
+          deploy: true
+          check: true
+      api:
+        url: https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd
+        version: 1.3.0
+        deploy: true
+        check: true
+      inference_extension:
+        url: https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd
+        version: 1.0.1
+        deploy: true
+        check: true
+    wva:
+      namespace: workload-variant-autoscaler-system
+      charts:
+        - name: workload-variant-autoscaler
+      images:
+        - name: workload-variant-autoscaler
+      replicas: 1
+      version: 0.1.0
+      autoscaling:
+        enabled: true
+        slo:
+          tpot: 30
+          ttft: 1000
+        hpa:
+          enabled: true
+          max_replicas: 10
+          target_avg_value: 1
+      vllm:
+        enabled: true
+        node_port_min: 30000
+        node_port_max: 32767
+        interval: 15
+      workload_monitoring:
+        namespace: openshift-user-workload-monitoring
+        url: https://thanos-querier.openshift-monitoring.svc.cluster.local
+        port: 9091
+      deploy: true
+      check: true
+    storage:
+      - name: model-storage
+        namespace: stackbenchns
+        class: default
+        size: 300Gi
+        download:
+          url:
+          enabled: true
+          timeout: 3600
+        deploy: true
+        check: true
+      - name: replay-pvc
+        namespace: harnessns
+        class: default
+        size: 300Gi
+        download:
+          url: https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json
+          enabled: false
+          timeout: 3600
+        deploy: true
+        check: true
+      - name: workload-pvc
+        namespace: harnessns
+        class: default
+        size: 300Gi
+        deploy: true
+        check: true
+    secrets:
+      - name: llm-d-hf-token
+        secret: HF_TOKEN
+        contents: REPLACE_TOKEN_BASE64_CONTENTS
+    files:
+      - name: llm-d-benchmark-preprocesses
+        path: REPLACE_DIR_PATH
+
+system:
+  user-overrides:
+    - name: default
+      namespace: stackbenchns
+      release: stackbenchr
+      gateway:
+        type: kgateway
+      router:
+        plugins: default-plugins.yaml
+      volumes:
+        - name: model-storage
+          type: pvc
+          mount: model-storage
+          size: 300Gi
+        - name: dshm
+          type: Memory
+          mount: /dev/shm
+          size: 16Gi
+      inference-engine:
+        type: modelservice
+        model: []
+        accelerators:
+          standalone:
+            key: nvidia.com/gpu.product
+            value: NVIDIA-H100-80GB-HBM3
+          decode:
+            key: nvidia.com/gpu.product
+            value: NVIDIA-H100-80GB-HBM3
+          prefill:
+            key: nvidia.com/gpu.product
+            value: NVIDIA-H100-80GB-HBM3
+        replicas:
+          standalone: 1
+          decode: 1
+          prefill: 1
+        parallelism:
+          standalone:
+            data: 1
+            tensor: 1
+          decode:
+            data: 1
+            tensor: 1
+          prefill:
+            data: 1
+            tensor: 1
+        resources:
+          standalone:
+            memory: 40Gi
+            cpu: '4'
+            nvidia.com/gpu: '1'
+            ephemeral-storage: 20Gi
+          decode:
+            memory: 40Gi
+            cpu: '4'
+            nvidia.com/gpu: '1'
+            ephemeral-storage: 20Gi
+          prefill:
+            memory: 40Gi
+            cpu: '4'
+            nvidia.com/gpu: '1'
+            ephemeral-storage: 20Gi
+        annotations:
+          deployed-by: .auto
+          modelservice: llm-d-benchmark
+        ports:
+          service: 8000
+          extra: 9002
+          readiness: 8200
+          zmq: 5557
+          nixl: 5557
+        labels:
+          app: .auto
+          stood-up-by: .auto
+          stood-up-from: llm-d-benchmark
+          stood-up-via: .auto
+        env:
+          standalone:
+            - name: LLMDBENCH_VLLM_STANDALONE_MODEL
+              value: TEMPLATE_MODEL_NAME
+            - name: LLMDBENCH_VLLM_STANDALONE_VLLM_LOAD_FORMAT
+              value: auto
+            - name: LLMDBENCH_VLLM_STANDALONE_MODEL_LOADER_EXTRA_CONFIG
+              value: '{}'
+            - name: VLLM_LOGGING_LEVEL
+              value: INFO
+            - name: HF_HOME
+              value: /TEMPLATE_MODEL_STORAGE
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: TEMPLATE_REPLACE_SECRET_NAME
+                  key: TEMPLATE_REPLACE_SECRET
+            - name: VLLM_NIXL_SIDE_CHANNEL_HOST
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.podIP
+            - name: UCX_TLS
+              value: rc,sm,cuda_ipc,cuda_copy,tcp
+            - name: UCX_SOCKADDR_TLS_PRIORITY
+              value: tcp
+            - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+              value: TEMPLATE_REPLACE_NXL_PORT
+            - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+              value: '1'
+            - name: VLLM_SERVER_DEV_MODE
+              value: '1'
+          decode:
+            - name: VLLM_NIXL_SIDE_CHANNEL_HOST
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.podIP
+            - name: UCX_TLS
+              value: rc,sm,cuda_ipc,cuda_copy,tcp
+            - name: UCX_SOCKADDR_TLS_PRIORITY
+              value: tcp
+            - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+              value: TEMPLATE_REPLACE_NXL_PORT
+            - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+              value: '1'
+            - name: VLLM_SERVER_DEV_MODE
+              value: '1'
+          prefill:
+            - name: VLLM_NIXL_SIDE_CHANNEL_HOST
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.podIP
+            - name: UCX_TLS
+              value: rc,sm,cuda_ipc,cuda_copy,tcp
+            - name: UCX_SOCKADDR_TLS_PRIORITY
+              value: tcp
+            - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+              value: TEMPLATE_REPLACE_NXL_PORT
+            - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+              value: '1'
+            - name: VLLM_SERVER_DEV_MODE
+              value: '1'
+
+harness:
+  user-overrides:
+    - name: default
+      namespace: harnessns
+      harness:
+        name: inference-perf
+        profile: sanity_random.yaml
+        executable: llm-d-benchmark.sh
+        timeout: 3600
+      resources:
+        memory: 32Gi
+        cpu: "16"
+      volumes:
+        - name: workload-pvc
+          type: pvc
+          mount: /requests
+        - name: replay-pvc
+          type: pvc
+          mount: /data
+      env:
+        - name: LLMDBENCH_RUN_EXPERIMENT_LAUNCHER
+          value: "1"
+        - name: LLMDBENCH_RUN_DATASET_URL
+          value: ".prepare.dependencies.user-overrides.storage[]|select(name=replay-pvc).download.url"
+        - name: LLMDBENCH_RUN_WORKSPACE_DIR
+          value: "/workspace"
+        - name: LLMDBENCH_HARNESS_NAME
+          value: ".experiments[0].user-overrides.runners[0].harness.name"
+        - name: LLMDBENCH_HARNESS_NAMESPACE
+          value: ".experiments[0].user-overrides.runners[0].namespace"
+        - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL
+          value: "${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL}"
+        - name: LLMDBENCH_HARNESS_STACK_NAME
+          value: "${LLMDBENCH_HARNESS_SANITIZED_STACK_NAME}"
+        - name: LLMDBENCH_DEPLOY_METHODS
+          value: "${LLMDBENCH_DEPLOY_METHODS}"
+        - name: LLMDBENCH_MAGIC_ENVAR
+          value: "harness_pod"
+        - name: HF_TOKEN_SECRET
+          value: "${LLMDBENCH_VLLM_COMMON_HF_TOKEN_NAME}"
+        - name: .prepare.dependencies.user-overrides.secrets[0].secret
+          valueFrom:
+            secretKeyRef:
+              name: .prepare.dependencies.user-overrides.secrets[0].name
+              key: .prepare.dependencies.user-overrides.secrets[0].secret
+
+components:
+  user-overrides:
+    - name: infra
+      charts:
+        - name: llm-d-infra
+          contents:
+            gateway:
+              gatewayClassName: modelservice
+              service:
+                type: NodePort
+              gatewayParameters:
+                enabled: true
+    - name: router
+      contents:
+        loadfrom:
+          - gateway-api-inference-extension.yaml
+    - name: inference-engine
+      modelservice:
+        contents:
+          loadfrom:
+            - modelservice.yaml
+      standalone:
+        contents:
+          loadfrom:
+            - standalone.yaml
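The `.auto` markers above are resolved by listing tags and picking the latest. Picking "latest" needs version-aware comparison, which is why the parser leans on the `packaging` dependency declared in pyproject.toml; an illustrative sketch with made-up tags:

```python
# Why version-aware sorting matters for `.auto` resolution (illustrative tags).
from packaging.version import InvalidVersion, Version

tags = ["v0.2.0", "v0.10.0", "v0.9.1", "latest"]

def sort_key(tag):
    try:
        return (1, Version(tag))
    except InvalidVersion:
        return (0, tag)  # non-versions like "latest" rank below real versions

print(sorted(tags))             # ['latest', 'v0.10.0', 'v0.2.0', 'v0.9.1'] - lexicographic, wrong
print(max(tags, key=sort_key))  # v0.10.0 - what _latest_version picks
```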
diff --git a/declarative_poc/workspace/templates/gateway-api-inference-extension.yaml.j2 b/declarative_poc/workspace/templates/gateway-api-inference-extension.yaml.j2
new file mode 100644
index 00000000..9e34a262
--- /dev/null
+++ b/declarative_poc/workspace/templates/gateway-api-inference-extension.yaml.j2
@@ -0,0 +1,32 @@
+inferenceExtension:
+  replicas: 1
+  image:
+    name: llm-d-inference-scheduler
+    pullPolicy: Always
+  extProcPort: {{ system["user-overrides"][0]["inference-engine"].ports.extra }}
+  extraContainerPorts:
+    - name: zmq
+      containerPort: {{ system["user-overrides"][0]["inference-engine"].ports.zmq }}
+      protocol: TCP
+  extraServicePorts:
+    - name: zmq
+      port: {{ system["user-overrides"][0]["inference-engine"].ports.zmq }}
+      targetPort: {{ system["user-overrides"][0]["inference-engine"].ports.zmq }}
+      protocol: TCP
+  env:
+    - name: {{ prepare["user-overrides"].secrets[0].secret }}
+      valueFrom:
+        secretKeyRef:
+          name: {{ prepare["user-overrides"].secrets[0].name }}
+          key: {{ prepare["user-overrides"].secrets[0].secret }}
+  pluginsConfigFile: {{ system["user-overrides"][0]["router"].plugins }}
+inferencePool:
+  targetPortNumber: {{ system["user-overrides"][0]["inference-engine"].ports.service }}
+  modelServerType: vllm
+  apiVersion: "inference.networking.x-k8s.io/v1alpha2"
+  modelServers:
+    matchLabels:
+      llm-d.ai/inferenceServing: "true"
+      llm-d.ai/model: {{ system["user-overrides"][0]["inference-engine"]["model"][0].label }}
+provider:
+  name: {{ prepare["user-overrides"].gateway["provider"][0].name }}
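One pattern worth noting in these templates: keys such as `user-overrides` and `inference-engine` contain hyphens, so Jinja attribute access (`system.user-overrides`) would parse as a subtraction; the templates therefore use subscript syntax. A minimal illustration with made-up data:

```python
# Hyphenated keys force subscript access in Jinja (illustrative, not from the repo).
from jinja2 import Environment

env = Environment()
plan = {"system": {"user-overrides": [{"router": {"plugins": "default-plugins.yaml"}}]}}

tpl = env.from_string(
    'pluginsConfigFile: {{ system["user-overrides"][0]["router"].plugins }}'
)
print(tpl.render(**plan))  # pluginsConfigFile: default-plugins.yaml
```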
system["user-overrides"][0]["inference-engine"]["model"][0].label }} +provider: + name: {{ prepare["user-overrides"].gateway["provider"][0].name }} diff --git a/declarative_poc/workspace/templates/modelservice.yaml.j2 b/declarative_poc/workspace/templates/modelservice.yaml.j2 new file mode 100644 index 00000000..aecdc96d --- /dev/null +++ b/declarative_poc/workspace/templates/modelservice.yaml.j2 @@ -0,0 +1,173 @@ +fullnameOverride: {{ system["user-overrides"][0]["inference-engine"].model[0].label }} +multinode: False + +modelArtifacts: + uri: {{ system["user-overrides"][0].volumes[0].type }}://{{ system["user-overrides"][0].volumes[0].mount }}/models/{{ system["user-overrides"][0]["inference-engine"].model[0].name }} + + size: {{ prepare["user-overrides"].storage + | selectattr("name","equalto", system["user-overrides"][0].volumes[0].name) + | map(attribute="size") + | first }} + + authSecretName: {{ prepare["user-overrides"].secrets[0].name }} + + name: {{ system["user-overrides"][0]["inference-engine"].model[0].name }} + +routing: + servicePort: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-{{ system["user-overrides"][0].release }}-inference-gateway + + proxy: + image: "{{ images['user-overrides'] | selectattr('name','equalto','llm-d-routing-sidecar') | map(attribute='registry') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d-routing-sidecar') | map(attribute='repo') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d-routing-sidecar') | map(attribute='image') | first }}:{{ images['user-overrides'] | selectattr('name','equalto','llm-d-routing-sidecar') | map(attribute='tag') | first }}" + secure: false + connector: nixlv2 + debugLevel: 3 + + inferencePool: + create: false + name: {{ system["user-overrides"][0]["inference-engine"].model[0].label }} + + httpRoute: + create: true + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: {{ system["user-overrides"][0]["inference-engine"].model[0].label }} + port: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + + matches: + - path: + type: PathPrefix + value: /{{ system["user-overrides"][0]["inference-engine"].model[0].name }}/ + + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + + modelCommand: {{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("type", 0) }} + args: {{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("args", 0) }} + env: {{ system["user-overrides"][0]["inference-engine"].env.prefill | default(0) }} + {{ system["user-overrides"][0]["inference-engine"].env.get("prefill", {}) }} + + +decode: + create: {{ system["user-overrides"][0]["inference-engine"].replicas.get("decode", {}) }} + replicas: {{ system["user-overrides"][0]["inference-engine"].replicas.get("decode", {}) }} + + acceleratorTypes: + labelKey: {{ system["user-overrides"][0]["inference-engine"].accelerators.decode.key }} + labelValues: + - {{ system["user-overrides"][0]["inference-engine"].accelerators.decode.value }} + + parallelism: + data: {{ system["user-overrides"][0]["inference-engine"].parallelism.decode.data }} + tensor: {{ system["user-overrides"][0]["inference-engine"].parallelism.decode.tensor }} + + annotations: {{ system["user-overrides"][0]["inference-engine"].annotations }} + 
podAnnotations: {{ system["user-overrides"][0]["inference-engine"].annotations }} + + containers: + - name: "vllm" + mountModelVolume: true + image: "{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='registry') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='repo') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='image') | first }}:{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='tag') | first }}" + modelCommand: {{ system["user-overrides"][0].get("command", {}).get("decode", {}).get("type", 0) }} + args: {{ system["user-overrides"][0].get("command", {}).get("decode", {}).get("args", 0) }} + {{ system["user-overrides"][0]["inference-engine"].env.get("decode", {}) }} + + resources: + limits: {{ system["user-overrides"][0]["inference-engine"].resources.decode }} + requests: {{ system["user-overrides"][0]["inference-engine"].resources.decode }} + + extraConfig: + startupProbe: + httpGet: + path: /health + port: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + failureThreshold: 60 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + failureThreshold: 3 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: {{ system["user-overrides"][0]["inference-engine"].ports.readiness }} + failureThreshold: 3 + periodSeconds: 5 + +prefill: + create: {{ system["user-overrides"][0]["inference-engine"].replicas.prefill | default(0) }} + replicas: {{ system["user-overrides"][0]["inference-engine"].replicas.prefill | default(0) }} + + acceleratorTypes: + labelKey: {{ system["user-overrides"][0]["inference-engine"].accelerators.prefill.key }} + labelValues: + - {{ system["user-overrides"][0]["inference-engine"].accelerators.prefill.value }} + + parallelism: + data: {{ system["user-overrides"][0]["inference-engine"].parallelism.prefill.data }} + tensor: {{ system["user-overrides"][0]["inference-engine"].parallelism.prefill.tensor }} + + annotations: {{ system["user-overrides"][0]["inference-engine"].annotations }} + podAnnotations: {{ system["user-overrides"][0]["inference-engine"].annotations }} + + containers: + - name: "vllm" + mountModelVolume: true + image: "{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='registry') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='repo') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='image') | first }}:{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='tag') | first }}" + + modelCommand: {{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("type", 0) }} + args: {{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("args", 0) }} + env: {{ system["user-overrides"][0]["inference-engine"].env.prefill | default(0) }} + {{ system["user-overrides"][0]["inference-engine"].env.get("prefill", {}) }} + + resources: + limits: {{ system["user-overrides"][0]["inference-engine"].resources.prefill }} + requests: {{ system["user-overrides"][0]["inference-engine"].resources.prefill }} + + extraConfig: + startupProbe: + httpGet: + path: /health + port: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + failureThreshold: 60 + initialDelaySeconds: 30 + 
+          periodSeconds: 30
+          timeoutSeconds: 5
+        livenessProbe:
+          tcpSocket:
+            port: {{ system["user-overrides"][0]["inference-engine"].ports.service }}
+          failureThreshold: 3
+          periodSeconds: 5
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: {{ system["user-overrides"][0]["inference-engine"].ports.readiness }}
+          failureThreshold: 3
+          periodSeconds: 5
+
+      volumeMounts:
+        - name: {{ system["user-overrides"][0].volumes[1].name }}
+          mountPath: {{ system["user-overrides"][0].volumes[1].mount }}
+
+volumes:
+  - name: {{ system["user-overrides"][0].volumes[1].name }}
+    emptyDir:
+      medium: {{ system["user-overrides"][0].volumes[1].type }}
+      sizeLimit: {{ system["user-overrides"][0].volumes[1].size }}
diff --git a/declarative_poc/workspace/templates/standalone.yaml.j2 b/declarative_poc/workspace/templates/standalone.yaml.j2
new file mode 100644
index 00000000..5060bf55
--- /dev/null
+++ b/declarative_poc/workspace/templates/standalone.yaml.j2
@@ -0,0 +1,95 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", "") }}
+  labels: {{ system["user-overrides"][0].get("inference-engine", {}).get("labels", {}) }}
+  namespace: {{ system["user-overrides"][0].get("namespace", "default") }}
+spec:
+  replicas: {{ system["user-overrides"][0].get("inference-engine", {}).get("replicas", {}).get("standalone", 1) }}
+  selector:
+    matchLabels:
+      app: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", "") }}
+  template:
+    metadata:
+      labels: {{ system["user-overrides"][0].get("inference-engine", {}).get("labels", {}) }}
+      annotations: {{ system["user-overrides"][0].get("inference-engine", {}).get("annotations", {}) }}
+    spec:
+      schedulerName: default-scheduler
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: {{ system["user-overrides"][0].get("inference-engine", {}).get("accelerators", {}).get("standalone", {}).get("key", "") }}
+                    operator: In
+                    values:
+                      - {{ system["user-overrides"][0].get("inference-engine", {}).get("accelerators", {}).get("standalone", {}).get("value", "") }}
+      containers:
+        - name: vllm-standalone-{{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", "") }}
+          image: {{ (images["user-overrides"] | selectattr("name", "equalto", "vllm") | list)[0].get("registry", "") }}/{{ (images["user-overrides"] | selectattr("name", "equalto", "vllm") | list)[0].get("repo", "") }}/{{ (images["user-overrides"] | selectattr("name", "equalto", "vllm") | list)[0].get("image", "") }}:{{ (images["user-overrides"] | selectattr("name", "equalto", "vllm") | list)[0].get("tag", "latest") }}
+          imagePullPolicy: Always
+          command:
+            - /bin/bash
+            - "-c"
+          args: {{ system["user-overrides"][0].get("command", {}).get("standalone", []) }}
+          env: {{ system["user-overrides"][0].get("inference-engine", {}).get("env", {}).get("standalone", []) }}
+          ports:
+            - containerPort: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }}
+          startupProbe:
+            httpGet:
+              path: /health
+              port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }}
+            failureThreshold: 200
+            initialDelaySeconds: 30
+            periodSeconds: 30
+            timeoutSeconds: 5
+          livenessProbe:
+            tcpSocket:
+              port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }}
+            failureThreshold: 3
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }}
+            failureThreshold: 3
+            periodSeconds: 5
+          resources:
+            limits: {{ system["user-overrides"][0].get("inference-engine", {}).get("resources", {}).get("standalone", {}) }}
+            requests: {{ system["user-overrides"][0].get("inference-engine", {}).get("resources", {}).get("standalone", {}) }}
+          volumeMounts:
+            - name: preprocesses
+              mountPath: /setup/preprocess
+            - name: cache-volume
+              mountPath: {{ system["user-overrides"][0].get("volumes", [{}])[0].get("mount", "") }}
+            - name: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("name", "") }}
+              mountPath: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("mount", "") }}
+      volumes:
+        - name: preprocesses
+          configMap:
+            name: {{ prepare["user-overrides"].get("files", [{}])[0].get("name", "preprocess-config") }}
+            defaultMode: 0500
+        - name: cache-volume
+          persistentVolumeClaim:
+            claimName: {{ system["user-overrides"][0].get("volumes", [{}])[0].get("name", "") }}
+        - name: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("name", "") }}
+          emptyDir:
+            medium: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("type", "") }}
+            sizeLimit: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("size", "") }}
+
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", "") }}
+  namespace: {{ system["user-overrides"][0].get("namespace", "default") }}
+  labels: {{ system["user-overrides"][0].get("inference-engine", {}).get("labels", {}) }}
+spec:
+  ports:
+    - name: http
+      port: 80
+      targetPort: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }}
+  selector:
+    app: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", "") }}
+  type: ClusterIP