From 632ff021d09a9604f46c02b43b1bbcba87fb125f Mon Sep 17 00:00:00 2001 From: vezio Date: Tue, 2 Dec 2025 15:49:49 -0500 Subject: [PATCH 1/5] very rough and in-progress parser for declarative approach Signed-off-by: vezio --- .../experiments/inference-scheduling.yaml | 4 + .../experiments/pd-disaggregation.yaml | 4 + .../llmdbench/llmdbench/__init__.py | 0 declarative_poc/llmdbench/llmdbench/cli.py | 128 ++++++ .../llmdbench/llmdbench/logging/logger.py | 40 ++ .../llmdbench/llmdbench/parser/__init__.py | 0 .../llmdbench/parser/systemparser.py | 251 ++++++++++++ .../llmdbench/llmdbench/plan/experiment.py | 3 + .../llmdbench/llmdbench/plan/harness.py | 4 + .../llmdbench/llmdbench/plan/plan.py | 3 + .../llmdbench/llmdbench/plan/system.py | 3 + declarative_poc/llmdbench/pyproject.toml | 13 + .../scenarios/inference-scheduling.yaml | 40 ++ .../scenarios/pd-disaggregation.yaml | 65 +++ declarative_poc/templates/default_system.yaml | 357 +++++++++++++++++ .../experiments/inference-scheduling.yaml | 4 + .../experiments/pd-disaggregation.yaml | 4 + .../llmdbench/build/lib/llmdbench/__init__.py | 0 .../llmdbench/build/lib/llmdbench/cli.py | 128 ++++++ .../build/lib/llmdbench/logging/logger.py | 40 ++ .../build/lib/llmdbench/parser/__init__.py | 0 .../build/lib/llmdbench/parser/parse.py | 39 ++ .../lib/llmdbench/parser/systemparser.py | 258 ++++++++++++ .../build/lib/llmdbench/plan/experiment.py | 3 + .../build/lib/llmdbench/plan/harness.py | 4 + .../build/lib/llmdbench/plan/plan.py | 3 + .../build/lib/llmdbench/plan/system.py | 3 + .../templates/llmdbench/llmdbench/__init__.py | 0 .../templates/llmdbench/llmdbench/cli.py | 128 ++++++ .../llmdbench/llmdbench/logging/logger.py | 40 ++ .../llmdbench/llmdbench/parser/__init__.py | 0 .../llmdbench/parser/systemparser.py | 251 ++++++++++++ .../llmdbench/llmdbench/plan/experiment.py | 3 + .../llmdbench/llmdbench/plan/harness.py | 4 + .../llmdbench/llmdbench/plan/plan.py | 3 + .../llmdbench/llmdbench/plan/system.py | 3 + .../templates/llmdbench/pyproject.toml | 13 + .../scenarios/inference-scheduling.yaml | 40 ++ .../scenarios/pd-disaggregation.yaml | 65 +++ declarative_poc/templates/system_plan.yaml | 370 ++++++++++++++++++ 40 files changed, 2321 insertions(+) create mode 100644 declarative_poc/experiments/inference-scheduling.yaml create mode 100644 declarative_poc/experiments/pd-disaggregation.yaml create mode 100644 declarative_poc/llmdbench/llmdbench/__init__.py create mode 100644 declarative_poc/llmdbench/llmdbench/cli.py create mode 100644 declarative_poc/llmdbench/llmdbench/logging/logger.py create mode 100644 declarative_poc/llmdbench/llmdbench/parser/__init__.py create mode 100644 declarative_poc/llmdbench/llmdbench/parser/systemparser.py create mode 100644 declarative_poc/llmdbench/llmdbench/plan/experiment.py create mode 100644 declarative_poc/llmdbench/llmdbench/plan/harness.py create mode 100644 declarative_poc/llmdbench/llmdbench/plan/plan.py create mode 100644 declarative_poc/llmdbench/llmdbench/plan/system.py create mode 100644 declarative_poc/llmdbench/pyproject.toml create mode 100644 declarative_poc/scenarios/inference-scheduling.yaml create mode 100644 declarative_poc/scenarios/pd-disaggregation.yaml create mode 100644 declarative_poc/templates/default_system.yaml create mode 100644 declarative_poc/templates/experiments/inference-scheduling.yaml create mode 100644 declarative_poc/templates/experiments/pd-disaggregation.yaml create mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/__init__.py create mode 100644 
declarative_poc/templates/llmdbench/build/lib/llmdbench/cli.py create mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/logging/logger.py create mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/__init__.py create mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/parse.py create mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/systemparser.py create mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/experiment.py create mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/harness.py create mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/plan.py create mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/system.py create mode 100644 declarative_poc/templates/llmdbench/llmdbench/__init__.py create mode 100644 declarative_poc/templates/llmdbench/llmdbench/cli.py create mode 100644 declarative_poc/templates/llmdbench/llmdbench/logging/logger.py create mode 100644 declarative_poc/templates/llmdbench/llmdbench/parser/__init__.py create mode 100644 declarative_poc/templates/llmdbench/llmdbench/parser/systemparser.py create mode 100644 declarative_poc/templates/llmdbench/llmdbench/plan/experiment.py create mode 100644 declarative_poc/templates/llmdbench/llmdbench/plan/harness.py create mode 100644 declarative_poc/templates/llmdbench/llmdbench/plan/plan.py create mode 100644 declarative_poc/templates/llmdbench/llmdbench/plan/system.py create mode 100644 declarative_poc/templates/llmdbench/pyproject.toml create mode 100644 declarative_poc/templates/scenarios/inference-scheduling.yaml create mode 100644 declarative_poc/templates/scenarios/pd-disaggregation.yaml create mode 100644 declarative_poc/templates/system_plan.yaml diff --git a/declarative_poc/experiments/inference-scheduling.yaml b/declarative_poc/experiments/inference-scheduling.yaml new file mode 100644 index 00000000..8fb80a4d --- /dev/null +++ b/declarative_poc/experiments/inference-scheduling.yaml @@ -0,0 +1,4 @@ +template: + path: /Users/vezio/IBM/llmd/haul/templates/default_system.yaml +scenario: + path: /Users/vezio/IBM/llmd/haul/scenarios/inference-scheduling.yaml diff --git a/declarative_poc/experiments/pd-disaggregation.yaml b/declarative_poc/experiments/pd-disaggregation.yaml new file mode 100644 index 00000000..e2a105f3 --- /dev/null +++ b/declarative_poc/experiments/pd-disaggregation.yaml @@ -0,0 +1,4 @@ +template: + path: /Users/vezio/IBM/llmd/haul/templates/default_system.yaml +scenario: + path: /Users/vezio/IBM/llmd/haul/scenarios/pd-disaggregation.yaml \ No newline at end of file diff --git a/declarative_poc/llmdbench/llmdbench/__init__.py b/declarative_poc/llmdbench/llmdbench/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/declarative_poc/llmdbench/llmdbench/cli.py b/declarative_poc/llmdbench/llmdbench/cli.py new file mode 100644 index 00000000..cd49c405 --- /dev/null +++ b/declarative_poc/llmdbench/llmdbench/cli.py @@ -0,0 +1,128 @@ +from llmdbench.parser.systemparser import SystemParser +from llmdbench.logging.logger import get_logger, set_stage + +import json +import argparse +import yaml + + +def cli(): + """ + Command-line interface for llmdbench. 
+ + Subcommands: + - plan: Merge and render YAMLs (previously 'configure') + - prepare: Prepare environment or data before execution + - execute: Run workloads or apply configurations + - destroy: Clean up or rollback resources + - report: Generate summary or benchmark reports + """ + logger = get_logger("llmdbench.cli") + + parser = argparse.ArgumentParser( + prog="llmdbench", + description="Manage and benchmark llmd configurations.", + ) + + subparsers = parser.add_subparsers(dest="command", required=True) + + # -------------------------- + # plan + # -------------------------- + plan_parser = subparsers.add_parser( + "plan", + help="Merge charts/images and render templates into a versioned YAML plan.", + ) + plan_parser.add_argument( + "--experiment", + required=True, + help="Path to the experiment file to plan.", + ) + plan_parser.add_argument( + "--output", + default="system_plan.yaml", + help="Path to save the output experiment as a YAML file.", + ) + + # -------------------------- + # prepare + # -------------------------- + prepare_parser = subparsers.add_parser( + "prepare", help="Prepare the environment or dependencies for execution." + ) + prepare_parser.add_argument( + "--config", required=False, help="Optional path to configuration YAML." + ) + + # -------------------------- + # execute + # -------------------------- + execute_parser = subparsers.add_parser( + "execute", help="Execute the benchmark or deployment defined in the plan." + ) + execute_parser.add_argument( + "--plan", required=True, help="Path to the planned YAML configuration." + ) + + # -------------------------- + # destroy + # -------------------------- + destroy_parser = subparsers.add_parser( + "destroy", help="Tear down or rollback any created resources." + ) + destroy_parser.add_argument( + "--plan", required=False, help="Path to the plan used for deployment." + ) + + # -------------------------- + # report + # -------------------------- + report_parser = subparsers.add_parser( + "report", help="Generate a report or analysis from execution results." + ) + report_parser.add_argument( + "--input", required=False, help="Path to execution results or metrics." + ) + report_parser.add_argument( + "--output", default="report.yaml", help="Path to save the report output." + ) + + # -------------------------- + # Parse and dispatch + # -------------------------- + args = parser.parse_args() + + with open(args.experiment, "r") as f: + data = yaml.safe_load(f) + template_path = data["template"]["path"] + scenario_path = data["scenario"]["path"] + + # Regardless - we need create a plan - otherwise we won't have context of + # what to todo - in the future we can "import" a context to "rerun" a plan. 
+ system = SystemParser(template_path, args.output, scenario_path) + system.parse() + + if args.command == "plan": + set_stage(logger, "๐Ÿ”ง PLAN") + logger.info("Creating execution and deployment plan...") + logger.info(f"Plan saved to {args.output}") + # print(json.dumps(system.plan_to_dict(), indent=2)) + system.plan_to_yaml() + elif args.command == "prepare": + set_stage(logger, "๐Ÿ”ง PREPARE") + logger.info("Preparing environment...") + elif args.command == "execute": + set_stage(logger, "๐Ÿš€ EXECUTE") + logger.info(f"Executing plan: {args.plan}") + elif args.command == "destroy": + set_stage(logger, "๐Ÿงน DESTROY") + logger.info("Cleaning up resources...") + elif args.command == "report": + set_stage(logger, "๐Ÿ“Š REPORT") + logger.info("Generating report...") + else: + parser.print_help() + + +if __name__ == "__main__": + cli() diff --git a/declarative_poc/llmdbench/llmdbench/logging/logger.py b/declarative_poc/llmdbench/llmdbench/logging/logger.py new file mode 100644 index 00000000..c37115ce --- /dev/null +++ b/declarative_poc/llmdbench/llmdbench/logging/logger.py @@ -0,0 +1,40 @@ +import logging +import sys + + +class StageFormatter(logging.Formatter): + def __init__(self, stage="RUN", fmt=None, datefmt=None): + self.stage = stage + super().__init__(fmt=fmt, datefmt=datefmt) + + def format(self, record): + record.stage = getattr(record, "stage", self.stage) + return super().format(record) + + +def get_logger(name="llmdbench", stage="RUN", level=logging.INFO): + logger = logging.getLogger(name) + + if not logger.handlers: + handler = logging.StreamHandler(sys.stdout) + formatter = StageFormatter( + stage=stage, + fmt="%(asctime)s - %(levelname)-8s - %(name)s - %(stage)s - %(message)s", + datefmt="%H:%M:%S", + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(level) + logger.propagate = False + else: + for handler in logger.handlers: + if isinstance(handler.formatter, StageFormatter): + handler.formatter.stage = stage + + return logger + + +def set_stage(logger, stage): + for handler in logger.handlers: + if isinstance(handler.formatter, StageFormatter): + handler.formatter.stage = stage diff --git a/declarative_poc/llmdbench/llmdbench/parser/__init__.py b/declarative_poc/llmdbench/llmdbench/parser/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/declarative_poc/llmdbench/llmdbench/parser/systemparser.py b/declarative_poc/llmdbench/llmdbench/parser/systemparser.py new file mode 100644 index 00000000..15d32147 --- /dev/null +++ b/declarative_poc/llmdbench/llmdbench/parser/systemparser.py @@ -0,0 +1,251 @@ +import yaml +import copy +import json +import subprocess +import requests +import re + + +class LiteralStr(str): + pass + + +class SystemParser: + def __init__(self, defaults_file, output_file, scenario_file=None): + self.defaults = self._load_yaml(defaults_file) + self.output_file = output_file + self.scenario = self._load_yaml(scenario_file) if scenario_file else {} + + self._charts_key = "charts" + self.charts = {} + + self._images_key = "images" + self.images = {} + + self._system_stack_key = "system" + self.system_stack = {} + + self._system_prepare_key = "prepare" + self._system_prepare = {} + + self._system_harness_key = "harness" + self._system_harness = {} + + self.system_experiments_key = "experiments" + self.system_experiments = {} + + def _load_yaml(self, file_path): + """Load YAML file""" + with open(file_path, "r") as f: + return yaml.safe_load(f) or {} + + def _merge_lists(self, base_list, override_list): + """ 
+ Merge lists of dictionaries by 'name' field. + If items have 'name' field, merge by matching names. + Otherwise, replace the entire list. + """ + if base_list and isinstance(base_list[0], dict) and "name" in base_list[0]: + result = copy.deepcopy(base_list) + base_map = {item["name"]: idx for idx, item in enumerate(result)} + for override_item in override_list: + if "name" in override_item: + name = override_item["name"] + if name in base_map: + idx = base_map[name] + result[idx] = self._deep_merge(result[idx], override_item) + else: + result.append(copy.deepcopy(override_item)) + + return result + else: + return copy.deepcopy(override_list) + + def _deep_merge(self, base, overrides): + """ + Recursively merge overrides into base dictionary. + For lists of dicts with 'name' field, merge by matching names. + Overrides take precedence over base values. + """ + result = copy.deepcopy(base) + for key, value in overrides.items(): + if key in result: + if isinstance(result[key], dict) and isinstance(value, dict): + result[key] = self._deep_merge(result[key], value) + elif isinstance(result[key], list) and isinstance(value, list): + result[key] = self._merge_lists(result[key], value) + else: + result[key] = value + else: + result[key] = copy.deepcopy(value) + return result + + def _get_nested(self, data, path: str): + """ + Retrieves a nested structure using dotted paths. + Supports list indexes like key.0.name or key[0].name. + """ + path = re.sub(r"\[(\d+)\]", r".\1", path) + parts = path.split(".") + + current = data + + for part in parts: + if isinstance(current, dict): + current = current.get(part, {}) + elif isinstance(current, list): + if not part.isdigit(): + return {} + idx = int(part) + if idx < 0 or idx >= len(current): + return {} + current = current[idx] + else: + return {} + + return current + + def _render_template_attribute(self, key): + render = self._get_nested(self.defaults, key) + scenarios = self.scenario.get("scenario", []) + for i, _ in enumerate(scenarios): + path = f"scenario.{i}.{key}" + scenario_value = self._get_nested(self.scenario, path) + render = self._deep_merge(render, scenario_value) + + return render + + def _build_indexes(self): + """Build lookup dictionaries for all categories""" + self._indexes = {} + for category in [self._charts_key, self._images_key]: + data = getattr(self, category, {}) + self._indexes[category] = { + item["name"]: item for item in data.get("user-overrides", []) + } + + def _skopeo_list_tags(self, ref): + """ + Call: skopeo list-tags docker://ghcr.io/org/image + Return: list of tags (strings) + """ + try: + cmd = ["skopeo", "list-tags", f"docker://{ref}"] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + data = json.loads(result.stdout) + return data.get("Tags", []) + except Exception as e: + raise RuntimeError(f"Skopeo failed for {ref}: {e}") + + def _is_oci_repo(self, url: str) -> bool: + return url.startswith("oci://") + + def _helm_http_list_versions(self, url, chart_name): + """ + Given a Helm HTTP repo URL and chart name, return list of versions. 
+ Uses index.yaml which lives at: /index.yaml + """ + index_url = url.rstrip("/") + "/index.yaml" + response = requests.get(index_url, timeout=10) + if response.status_code != 200: + raise RuntimeError(f"Failed to fetch {index_url}: {response.status_code}") + index = yaml.safe_load(response.text) + entries = index.get("entries", {}) + if chart_name not in entries: + raise RuntimeError(f"Chart '{chart_name}' not found at {index_url}") + versions = [entry["version"] for entry in entries[chart_name]] + return versions + + def _resolve_chart_auto_versions(self): + items = self.charts.get("user-overrides", []) + for item in items: + if str(item.get("version", "")) != ".auto": + continue + url = item["url"] + name = item["name"] + if self._is_oci_repo(url): + ref = url.replace("oci://", "") + tags = self._skopeo_list_tags(ref) + else: + tags = self._helm_http_list_versions(url, name) + if not tags: + raise RuntimeError(f"No chart versions found for {name}") + tags.sort() + latest = tags[-1] + item["version"] = latest + + def _resolve_image_auto_tags(self): + items = self.images.get("user-overrides", []) + for item in items: + if str(item.get("tag", "")) == ".auto": + registry = item["registry"] + repo = item["repo"] + image = item["image"] + ref = f"{registry}/{repo}/{image}" + tags = self._skopeo_list_tags(ref) + if not tags: + raise RuntimeError(f"No tags found for image {item['name']}") + tags.sort() + latest = tags[-1] + item["tag"] = latest + + def _literal_str_representer(self, dumper, data): + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + + def _convert_multiline_strings(self, obj): + if isinstance(obj, dict): + return {k: self._convert_multiline_strings(v) for k, v in obj.items()} + if isinstance(obj, list): + return [self._convert_multiline_strings(v) for v in obj] + if isinstance(obj, str) and "\n" in obj: + return LiteralStr(obj) + return obj + + def get_item_by_name(self, category, name): + """Generic method to get an item by name from any category""" + if category not in self._indexes: + raise ValueError(f"Unknown category: {category}") + return self._indexes[category].get(name) + + def get_chart_by_name(self, name): + return self.get_item_by_name(self._charts_key, name) + + def get_image_by_name(self, name): + return self.get_item_by_name(self._images_key, name) + + def plan_to_dict(self): + return { + self._charts_key: self.charts, + self._images_key: self.images, + self._system_stack_key: self.system_stack, + self._system_prepare_key: self.system_prepare, + self._system_harness_key: self.system_harness, + } + + def plan_to_yaml(self): + plan = self.plan_to_dict() + plan = self._convert_multiline_strings(plan) + with open(self.output_file, "w") as f: + yaml.dump( + plan, + f, + default_flow_style=False, + sort_keys=False, + allow_unicode=True, + ) + + def parse(self): + """Load defaults and apply overrides""" + yaml.add_representer(LiteralStr, self._literal_str_representer) + + self.charts = self._render_template_attribute(self._charts_key) + self.images = self._render_template_attribute(self._images_key) + self.system_stack = self._render_template_attribute(self._system_stack_key) + self.system_prepare = self._render_template_attribute(self._system_prepare_key) + self.system_harness = self._render_template_attribute(self._system_harness_key) + + self._resolve_chart_auto_versions() + self._resolve_image_auto_tags() + self._build_indexes() + + return self.plan_to_dict() diff --git a/declarative_poc/llmdbench/llmdbench/plan/experiment.py 
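A minimal sketch of the merge-by-name rule that `_deep_merge` and `_merge_lists` implement when a scenario's `user-overrides` are layered onto the template defaults. The data values below are hypothetical and not taken from the patch; only the merge behaviour mirrors the parser above.

```python
import copy


def merge_lists(base_list, override_list):
    # Lists of dicts carrying a "name" key are merged item-by-item by name;
    # any other list is replaced wholesale (same rule as SystemParser._merge_lists).
    if base_list and isinstance(base_list[0], dict) and "name" in base_list[0]:
        result = copy.deepcopy(base_list)
        index = {item["name"]: i for i, item in enumerate(result)}
        for override_item in override_list:
            if "name" not in override_item:
                continue
            name = override_item["name"]
            if name in index:
                result[index[name]] = deep_merge(result[index[name]], override_item)
            else:
                result.append(copy.deepcopy(override_item))
        return result
    return copy.deepcopy(override_list)


def deep_merge(base, overrides):
    # Overrides win; nested dicts and "named" lists are merged recursively
    # (same rule as SystemParser._deep_merge).
    result = copy.deepcopy(base)
    for key, value in overrides.items():
        if isinstance(result.get(key), dict) and isinstance(value, dict):
            result[key] = deep_merge(result[key], value)
        elif isinstance(result.get(key), list) and isinstance(value, list):
            result[key] = merge_lists(result[key], value)
        else:
            result[key] = copy.deepcopy(value)
    return result


# Hypothetical template defaults vs. a scenario override, matched on "name":
template = {"user-overrides": [{"name": "default", "replicas": {"decode": 1, "prefill": 1}}]}
scenario = {"user-overrides": [{"name": "default", "replicas": {"decode": 2}}]}
print(deep_merge(template, scenario))
# {'user-overrides': [{'name': 'default', 'replicas': {'decode': 2, 'prefill': 1}}]}
```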
b/declarative_poc/llmdbench/llmdbench/plan/experiment.py new file mode 100644 index 00000000..3e818027 --- /dev/null +++ b/declarative_poc/llmdbench/llmdbench/plan/experiment.py @@ -0,0 +1,3 @@ +class Experiment: + def __init__(self): + pass diff --git a/declarative_poc/llmdbench/llmdbench/plan/harness.py b/declarative_poc/llmdbench/llmdbench/plan/harness.py new file mode 100644 index 00000000..274f352c --- /dev/null +++ b/declarative_poc/llmdbench/llmdbench/plan/harness.py @@ -0,0 +1,4 @@ +class Harness: + def __init__(self, runner_name: str, runner_content: dict): + self.name = runner_name + self.runner = runner_content diff --git a/declarative_poc/llmdbench/llmdbench/plan/plan.py b/declarative_poc/llmdbench/llmdbench/plan/plan.py new file mode 100644 index 00000000..1767f422 --- /dev/null +++ b/declarative_poc/llmdbench/llmdbench/plan/plan.py @@ -0,0 +1,3 @@ +class Plan: + def __init__(self): + pass diff --git a/declarative_poc/llmdbench/llmdbench/plan/system.py b/declarative_poc/llmdbench/llmdbench/plan/system.py new file mode 100644 index 00000000..0028d443 --- /dev/null +++ b/declarative_poc/llmdbench/llmdbench/plan/system.py @@ -0,0 +1,3 @@ +class System: + def __init__(self): + pass diff --git a/declarative_poc/llmdbench/pyproject.toml b/declarative_poc/llmdbench/pyproject.toml new file mode 100644 index 00000000..4f484cc4 --- /dev/null +++ b/declarative_poc/llmdbench/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "llmdbench" +version = "0.0.0" +description = "A library for configuration discovery and benchmarking for llm-d." +dependencies = [ + "PyYAML", + "Jinja2", + "requests", + "packaging" +] + +[project.scripts] +llmdbench = "llmdbench.cli:cli" diff --git a/declarative_poc/scenarios/inference-scheduling.yaml b/declarative_poc/scenarios/inference-scheduling.yaml new file mode 100644 index 00000000..a1ed88bf --- /dev/null +++ b/declarative_poc/scenarios/inference-scheduling.yaml @@ -0,0 +1,40 @@ +scenario: + - name: "sut-1" + prepare: + user-overrides: + secrets: + - name: llm-d-hf-token + secret: HF_TOKEN + contents: REPLACE_HF_TOKEN + files: + - name: llm-d-benchmark-preprocesses + path: REPLACE_DIR_PATH + system: + user-overrides: + - name: "default" + inference-engine: + model: + - name: meta-llama/Llama-3.1-8B-Instruct + label: .auto + maxlen: 16384 + blocksize: 64 + replicas: + decode: 2 + prefill: 0 + volumes: + - name: model-storage + size: 1Ti + command: + decode: + type: vllmServe + args: + - "--enforce-eager" + - "--block-size" + - "64" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16374" + \ No newline at end of file diff --git a/declarative_poc/scenarios/pd-disaggregation.yaml b/declarative_poc/scenarios/pd-disaggregation.yaml new file mode 100644 index 00000000..77bba40b --- /dev/null +++ b/declarative_poc/scenarios/pd-disaggregation.yaml @@ -0,0 +1,65 @@ +scenario: + - name: "sut-1" + prepare: + user-overrides: + secrets: + - name: llm-d-hf-token + secret: HF_TOKEN + contents: TYLERS_TOKEN + system: + user-overrides: + - name: "default" + inference-engine: + model: + - name: meta-llama/Llama-3.1-8B-Instruct + label: .auto + maxlen: 16000 + blocksize: 128 + replicas: + decode: 2 + prefill: 2 + parallelism: + decode: + tensor: 1 + prefill: + tensor: 1 + resources: + decode: + memory: 128Gi + cpu: 32 + prefill: + memory: 128Gi + cpu: 32 + volumes: + - name: model-storage + size: 1Ti + command: + decode: + type: vllmServe + args: + - 
"--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16000" + prefill: + type: vllmServe + args: + - "--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16000" + harness: + user-overrides: + - name: default + namespace: harnessns + harness: + name: vllm-benchmark + profile: random_concurrent.yaml \ No newline at end of file diff --git a/declarative_poc/templates/default_system.yaml b/declarative_poc/templates/default_system.yaml new file mode 100644 index 00000000..1803c588 --- /dev/null +++ b/declarative_poc/templates/default_system.yaml @@ -0,0 +1,357 @@ +images: + user-overrides: + - name: llm-d-benchmark + registry: ghcr.io + repo: llm-d + image: llm-d-benchmark + tag: .auto # list all tags, pick the latest + - name: llm-d + registry: ghcr.io + repo: llm-d + image: llm-d-cuda + tag: .auto # list all tags, pick the latest + - name: llm-d-model-service + registry: ghcr.io + repo: llm-d + image: llm-d-model-service + tag: .auto # list all tags, pick the latest + - name: llm-d-inference-scheduler + registry: ghcr.io + repo: llm-d + image: llm-d-inference-scheduler + tag: .auto # list all tags, pick the latest + - name: llm-d-routing-sidecar + registry: ghcr.io + repo: llm-d + image: llm-d-routing-sidecar + tag: .auto # list all tags, pick the latest + - name: llm-d-inference-sim + registry: ghcr.io + repo: llm-d + image: llm-d-inference-sim + tag: .auto # list all tags, pick the latest + - name: vllm + registry: docker.io + repo: vllm + image: vllm-openai + tag: latest + +charts: + user-overrides: + - name: kgateway-crds + url: oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds + version: 2.0.3 + - name: kgateway + url: oci://cr.kgateway.dev/kgateway-dev/charts/kgateway + version: 2.0.3 + - name: istio + url: oci://gcr.io/istio-testing/charts + version: 1.28-alpha.89f30b26ba71bf5e538083a4720d0bc2d8c06401 + - name: llm-d-infra + url: https://llm-d-incubation.github.io/llm-d-infra + version: 1.3.0 + - name: gateway-api-inference-extension + url: oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + version: .auto + - name: llm-d-modelservice + url: https://llm-d-incubation.github.io/llm-d-modelservice/ + version: .auto + +prepare: + user-overrides: + gateway : + provider : + - name: kgateway + charts: + - name: kgateway-crds + - name: kgateway + deploy: true + check: true + - name: istio + charts: + - name: istio + deploy: false + check: false + api: + url: https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd + version: 1.3.0 + deploy: true + check: true + inference_extension: + url: https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd + version: 1.0.1 + deploy: true + check: true + wva: + namespace: workload-variant-autoscaler-system + charts: + - name: workload-variant-autoscaler + images: + - name: workload-variant-autoscaler + replicas: 1 + version: 0.1.0 + autoscaling: + enabled: true + slo: + tpot: 30 + ttft: 1000 + hpa: + enabled: true + max_replicas: 10 + target_avg_value: 1 + vllm: + enabled: true + node_port_min: 30000 + node_port_max: 32767 + interval: 15 + workload_monitoring: + namespace: openshift-user-workload-monitoring + url: https://thanos-querier.openshift-monitoring.svc.cluster.local + port: 9091 + 
deploy: true + check: true + storage: + - name: model-storage + namespace: stackbenchns + class: default + size: 300Gi + download: + url: + enabled: true + timeout: 3600 + deploy: true + check: true + - name: replay-pvc + namespace: harnessns + class: default + size: 300Gi + download: + url: https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json + enabled: false + timeout: 3600 + deploy: true + check: true + - name: workload-pvc + namespace: harnessns + class: default + size: 300Gi + deploy: true + check: true + secrets: + - name: llm-d-hf-token + secret: HF_TOKEN + contents: REPLACE_TOKEN_BASE64_CONTENTS + files: + - name: llm-d-benchmark-preprocesses + path: REPLACE_DIR_PATH + +system: + user-overrides: + - name: default + namespace: stackbenchns + release: stackbenchr + gateway: + type: kgateway + router: + plugins: default-plugins.yaml + volumes: + - name: model-storage + type: pvc + mount: model-storage + size: 300Gi + - name: dshm + type: Memory + mount: /dev/shm + size: 16Gi + components: + - name: infra + charts: + - name: llm-d-infra + contents: + gateway: + gatewayClassName: modelservice + service: + type: NodePort + gatewayParameters: + enabled: true + - name: router + charts: + - name: gateway-api-inference-extension + contents: + loadfrom: + - BASE_DIR/_templates/gateway-api-inference-extension.yaml + - name: inference-engine + modelservice: + charts: + - name: llm-d-modelservice + contents: + loadfrom: + - BASE_DIR/_templates/modelservice.yaml + standalone: + contents: + loadfrom: + - BASE_DIR/_templates/standalone.yaml + inference-engine: + type: modelservice + model: [] + accelerators: + standalone: + key: nvidia.com/gpu.product + value: NVIDIA-H100-80GB-HBM3 + decode: + key: nvidia.com/gpu.product + value: NVIDIA-H100-80GB-HBM3 + prefill: + key: nvidia.com/gpu.product + value: NVIDIA-H100-80GB-HBM3 + replicas: + standalone: 1 + decode: 1 + prefill: 1 + parallelism: + standalone: + data: 1 + tensor: 1 + decode: + data: 1 + tensor: 1 + prefill: + data: 1 + tensor: 1 + resources: + standalone: + memory: 40Gi + cpu: '4' + nvidia.com/gpu: '1' + ephemeral-storage: 20Gi + decode: + memory: 40Gi + cpu: '4' + nvidia.com/gpu: '1' + ephemeral-storage: 20Gi + prefill: + memory: 40Gi + cpu: '4' + nvidia.com/gpu: '1' + ephemeral-storage: 20Gi + annotations: + deployed-by: .auto + modelservice: llm-d-benchmark + ports: + service: 8000 + extra: 9002 + readiness: 8200 + zmq: 5557 + nixl: 5557 + labels: + app: .auto + stood-up-by: .auto + stood-up-from: llm-d-benchmark + stood-up-via: .auto + env: + standalone: + - name: LLMDBENCH_VLLM_STANDALONE_MODEL + value: TEMPLATE_MODEL_NAME + - name: LLMDBENCH_VLLM_STANDALONE_VLLM_LOAD_FORMAT + value: auto + - name: LLMDBENCH_VLLM_STANDALONE_MODEL_LOADER_EXTRA_CONFIG + value: '{}' + - name: VLLM_LOGGING_LEVEL + value: INFO + - name: HF_HOME + value: /TEMPLATE_MODEL_STORAGE + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: TEMPLATE_REPLACE_SECRET_NAME + key: TEMPLATE_REPLACE_SECRET + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: rc,sm,cuda_ipc,cuda_copy,tcp + - name: UCX_SOCKADDR_TLS_PRIORITY + value: tcp + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: TEMPLATE_REPLACE_NXL_PORT + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: '1' + - name: VLLM_SERVER_DEV_MODE + value: '1' + decode: + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: 
rc,sm,cuda_ipc,cuda_copy,tcp + - name: UCX_SOCKADDR_TLS_PRIORITY + value: tcp + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: TEMPLATE_REPLACE_NXL_PORT + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: '1' + - name: VLLM_SERVER_DEV_MODE + value: '1' + prefill: + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: rc,sm,cuda_ipc,cuda_copy,tcp + - name: UCX_SOCKADDR_TLS_PRIORITY + value: tcp + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: TEMPLATE_REPLACE_NXL_PORT + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: '1' + - name: VLLM_SERVER_DEV_MODE + value: '1' + +harness: + user-overrides: + - name: default + namespace: harnessns + harness: + name: inference-perf + profile: sanity_random.yaml + executable: llm-d-benchmark.sh + timeout: 3600 + resources: + memory: 32Gi + cpu: "16" + volumes: + - name: workload-pvc + type: pvc + mount: /requests + - name: replay-pvc + type: pvc + mount: /data + env: + - name: LLMDBENCH_RUN_EXPERIMENT_LAUNCHER + value: "1" + - name: LLMDBENCH_RUN_DATASET_URL + value: ".prepare.dependencies.user-overrides.storage[]|select(name=replay-pvc).download.url" + - name: LLMDBENCH_RUN_WORKSPACE_DIR + value: "/workspace" + - name: LLMDBENCH_HARNESS_NAME + value: ".experiments[0].user-overrides.runners[0].harness.name" + - name: LLMDBENCH_HARNESS_NAMESPACE + value: ".experiments[0].user-overrides.runners[0].namespace" + - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL + value: "${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL}" + - name: LLMDBENCH_HARNESS_STACK_NAME + value: "${LLMDBENCH_HARNESS_SANITIZED_STACK_NAME}" + - name: LLMDBENCH_DEPLOY_METHODS + value: "${LLMDBENCH_DEPLOY_METHODS}" + - name: LLMDBENCH_MAGIC_ENVAR + value: "harness_pod" + - name: HF_TOKEN_SECRET + value: "${LLMDBENCH_VLLM_COMMON_HF_TOKEN_NAME}" + - name: .prepare.dependencies.user-overrides.secrets[0].secret + valueFrom: + secretKeyRef: + name: .prepare.dependencies.user-overrides.secrets[0].name + key: .prepare.dependencies.user-overrides.secrets[0].secret diff --git a/declarative_poc/templates/experiments/inference-scheduling.yaml b/declarative_poc/templates/experiments/inference-scheduling.yaml new file mode 100644 index 00000000..8fb80a4d --- /dev/null +++ b/declarative_poc/templates/experiments/inference-scheduling.yaml @@ -0,0 +1,4 @@ +template: + path: /Users/vezio/IBM/llmd/haul/templates/default_system.yaml +scenario: + path: /Users/vezio/IBM/llmd/haul/scenarios/inference-scheduling.yaml diff --git a/declarative_poc/templates/experiments/pd-disaggregation.yaml b/declarative_poc/templates/experiments/pd-disaggregation.yaml new file mode 100644 index 00000000..e2a105f3 --- /dev/null +++ b/declarative_poc/templates/experiments/pd-disaggregation.yaml @@ -0,0 +1,4 @@ +template: + path: /Users/vezio/IBM/llmd/haul/templates/default_system.yaml +scenario: + path: /Users/vezio/IBM/llmd/haul/scenarios/pd-disaggregation.yaml \ No newline at end of file diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/__init__.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/cli.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/cli.py new file mode 100644 index 00000000..cd49c405 --- /dev/null +++ b/declarative_poc/templates/llmdbench/build/lib/llmdbench/cli.py @@ -0,0 +1,128 @@ +from llmdbench.parser.systemparser import SystemParser +from llmdbench.logging.logger import get_logger, set_stage + +import json 
+import argparse +import yaml + + +def cli(): + """ + Command-line interface for llmdbench. + + Subcommands: + - plan: Merge and render YAMLs (previously 'configure') + - prepare: Prepare environment or data before execution + - execute: Run workloads or apply configurations + - destroy: Clean up or rollback resources + - report: Generate summary or benchmark reports + """ + logger = get_logger("llmdbench.cli") + + parser = argparse.ArgumentParser( + prog="llmdbench", + description="Manage and benchmark llmd configurations.", + ) + + subparsers = parser.add_subparsers(dest="command", required=True) + + # -------------------------- + # plan + # -------------------------- + plan_parser = subparsers.add_parser( + "plan", + help="Merge charts/images and render templates into a versioned YAML plan.", + ) + plan_parser.add_argument( + "--experiment", + required=True, + help="Path to the experiment file to plan.", + ) + plan_parser.add_argument( + "--output", + default="system_plan.yaml", + help="Path to save the output experiment as a YAML file.", + ) + + # -------------------------- + # prepare + # -------------------------- + prepare_parser = subparsers.add_parser( + "prepare", help="Prepare the environment or dependencies for execution." + ) + prepare_parser.add_argument( + "--config", required=False, help="Optional path to configuration YAML." + ) + + # -------------------------- + # execute + # -------------------------- + execute_parser = subparsers.add_parser( + "execute", help="Execute the benchmark or deployment defined in the plan." + ) + execute_parser.add_argument( + "--plan", required=True, help="Path to the planned YAML configuration." + ) + + # -------------------------- + # destroy + # -------------------------- + destroy_parser = subparsers.add_parser( + "destroy", help="Tear down or rollback any created resources." + ) + destroy_parser.add_argument( + "--plan", required=False, help="Path to the plan used for deployment." + ) + + # -------------------------- + # report + # -------------------------- + report_parser = subparsers.add_parser( + "report", help="Generate a report or analysis from execution results." + ) + report_parser.add_argument( + "--input", required=False, help="Path to execution results or metrics." + ) + report_parser.add_argument( + "--output", default="report.yaml", help="Path to save the report output." + ) + + # -------------------------- + # Parse and dispatch + # -------------------------- + args = parser.parse_args() + + with open(args.experiment, "r") as f: + data = yaml.safe_load(f) + template_path = data["template"]["path"] + scenario_path = data["scenario"]["path"] + + # Regardless - we need create a plan - otherwise we won't have context of + # what to todo - in the future we can "import" a context to "rerun" a plan. 
+ system = SystemParser(template_path, args.output, scenario_path) + system.parse() + + if args.command == "plan": + set_stage(logger, "๐Ÿ”ง PLAN") + logger.info("Creating execution and deployment plan...") + logger.info(f"Plan saved to {args.output}") + # print(json.dumps(system.plan_to_dict(), indent=2)) + system.plan_to_yaml() + elif args.command == "prepare": + set_stage(logger, "๐Ÿ”ง PREPARE") + logger.info("Preparing environment...") + elif args.command == "execute": + set_stage(logger, "๐Ÿš€ EXECUTE") + logger.info(f"Executing plan: {args.plan}") + elif args.command == "destroy": + set_stage(logger, "๐Ÿงน DESTROY") + logger.info("Cleaning up resources...") + elif args.command == "report": + set_stage(logger, "๐Ÿ“Š REPORT") + logger.info("Generating report...") + else: + parser.print_help() + + +if __name__ == "__main__": + cli() diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/logging/logger.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/logging/logger.py new file mode 100644 index 00000000..c37115ce --- /dev/null +++ b/declarative_poc/templates/llmdbench/build/lib/llmdbench/logging/logger.py @@ -0,0 +1,40 @@ +import logging +import sys + + +class StageFormatter(logging.Formatter): + def __init__(self, stage="RUN", fmt=None, datefmt=None): + self.stage = stage + super().__init__(fmt=fmt, datefmt=datefmt) + + def format(self, record): + record.stage = getattr(record, "stage", self.stage) + return super().format(record) + + +def get_logger(name="llmdbench", stage="RUN", level=logging.INFO): + logger = logging.getLogger(name) + + if not logger.handlers: + handler = logging.StreamHandler(sys.stdout) + formatter = StageFormatter( + stage=stage, + fmt="%(asctime)s - %(levelname)-8s - %(name)s - %(stage)s - %(message)s", + datefmt="%H:%M:%S", + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(level) + logger.propagate = False + else: + for handler in logger.handlers: + if isinstance(handler.formatter, StageFormatter): + handler.formatter.stage = stage + + return logger + + +def set_stage(logger, stage): + for handler in logger.handlers: + if isinstance(handler.formatter, StageFormatter): + handler.formatter.stage = stage diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/__init__.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/parse.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/parse.py new file mode 100644 index 00000000..db817524 --- /dev/null +++ b/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/parse.py @@ -0,0 +1,39 @@ +import yaml +import os +import sys +from jinja2 import ( + Environment, + FileSystemLoader, + StrictUndefined, + Template, + ChainableUndefined, +) + + +# Load values.yaml if present +def load_values(path="values.yaml"): + try: + with open(path, "r") as f: + return yaml.safe_load(f) or {} + except FileNotFoundError: + return {} + + +def parse( + template_dir=None, values_file=None, output_dir="/Users/vezio/IBM/llmd/haul/stack" +): + + # Jinja environment + env = Environment( + loader=FileSystemLoader(template_dir), undefined=ChainableUndefined + ) + + # Iterate over all template files + for filename in os.listdir(template_dir): + if not filename.endswith((".j2", ".jinja", ".tmpl", ".template")): + continue + + template = env.get_template(filename) + print(template) + rendered = 
template.render(charts={}) + print(rendered) diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/systemparser.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/systemparser.py new file mode 100644 index 00000000..baa6814a --- /dev/null +++ b/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/systemparser.py @@ -0,0 +1,258 @@ +import yaml +import copy +import json +import subprocess +import requests +import re + + +class LiteralStr(str): + pass + + +class SystemParser: + def __init__(self, defaults_file, output_file, scenario_file=None): + self.defaults = self._load_yaml(defaults_file) + self.output_file = output_file + self.scenario = self._load_yaml(scenario_file) if scenario_file else {} + + self._charts_key = "charts" + self.charts = {} + + self._images_key = "images" + self.images = {} + + self._system_stack_key = "system" + self.system_stack = {} + + self._system_prepare_key = "prepare" + self._system_prepare = {} + + self._system_harness_key = "harness" + self._system_harness = {} + + self.system_experiments_key = "experiments" + self.system_experiments = {} + + def _load_yaml(self, file_path): + """Load YAML file""" + with open(file_path, "r") as f: + return yaml.safe_load(f) or {} + + def _merge_lists(self, base_list, override_list): + """ + Merge lists of dictionaries by 'name' field. + If items have 'name' field, merge by matching names. + Otherwise, replace the entire list. + """ + if base_list and isinstance(base_list[0], dict) and "name" in base_list[0]: + result = copy.deepcopy(base_list) + base_map = {item["name"]: idx for idx, item in enumerate(result)} + for override_item in override_list: + if "name" in override_item: + name = override_item["name"] + if name in base_map: + idx = base_map[name] + result[idx] = self._deep_merge(result[idx], override_item) + else: + result.append(copy.deepcopy(override_item)) + + return result + else: + return copy.deepcopy(override_list) + + def _deep_merge(self, base, overrides): + """ + Recursively merge overrides into base dictionary. + For lists of dicts with 'name' field, merge by matching names. + Overrides take precedence over base values. + """ + result = copy.deepcopy(base) + for key, value in overrides.items(): + if key in result: + if isinstance(result[key], dict) and isinstance(value, dict): + result[key] = self._deep_merge(result[key], value) + elif isinstance(result[key], list) and isinstance(value, list): + result[key] = self._merge_lists(result[key], value) + else: + result[key] = value + else: + result[key] = copy.deepcopy(value) + return result + + def _get_nested(self, data, path: str): + """ + Retrieves a nested structure using dotted paths. + Supports list indexes like key.0.name or key[0].name. + """ + # Normalize bracket indices: "scenario[0]" -> "scenario.0" + path = re.sub(r"\[(\d+)\]", r".\1", path) + parts = path.split(".") + + current = data + + for part in parts: + if isinstance(current, dict): + # dict lookup + current = current.get(part, {}) + elif isinstance(current, list): + # list index lookup + if not part.isdigit(): + return {} # trying to index a list with non-int + idx = int(part) + if idx < 0 or idx >= len(current): + return {} + current = current[idx] + else: + # neither list nor dict - cannot go deeper + return {} + + return current + + def _render_template_attribute(self, key): + # Start with defaults + render = self._get_nested(self.defaults, key) + + # Merge all scenarios (scenario[0], scenario[1], ...) 
+ scenarios = self.scenario.get("scenario", []) + for i, _ in enumerate(scenarios): + path = f"scenario.{i}.{key}" + scenario_value = self._get_nested(self.scenario, path) + render = self._deep_merge(render, scenario_value) + + return render + + def _build_indexes(self): + """Build lookup dictionaries for all categories""" + self._indexes = {} + for category in [self._charts_key, self._images_key]: + data = getattr(self, category, {}) + self._indexes[category] = { + item["name"]: item for item in data.get("user-overrides", []) + } + + def _skopeo_list_tags(self, ref): + """ + Call: skopeo list-tags docker://ghcr.io/org/image + Return: list of tags (strings) + """ + try: + cmd = ["skopeo", "list-tags", f"docker://{ref}"] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + data = json.loads(result.stdout) + return data.get("Tags", []) + except Exception as e: + raise RuntimeError(f"Skopeo failed for {ref}: {e}") + + def _is_oci_repo(self, url: str) -> bool: + return url.startswith("oci://") + + def _helm_http_list_versions(self, url, chart_name): + """ + Given a Helm HTTP repo URL and chart name, return list of versions. + Uses index.yaml which lives at: /index.yaml + """ + index_url = url.rstrip("/") + "/index.yaml" + response = requests.get(index_url, timeout=10) + if response.status_code != 200: + raise RuntimeError(f"Failed to fetch {index_url}: {response.status_code}") + index = yaml.safe_load(response.text) + entries = index.get("entries", {}) + if chart_name not in entries: + raise RuntimeError(f"Chart '{chart_name}' not found at {index_url}") + versions = [entry["version"] for entry in entries[chart_name]] + return versions + + def _resolve_chart_auto_versions(self): + items = self.charts.get("user-overrides", []) + for item in items: + if str(item.get("version", "")) != ".auto": + continue + url = item["url"] + name = item["name"] + if self._is_oci_repo(url): + ref = url.replace("oci://", "") + tags = self._skopeo_list_tags(ref) + else: + tags = self._helm_http_list_versions(url, name) + if not tags: + raise RuntimeError(f"No chart versions found for {name}") + tags.sort() + latest = tags[-1] + item["version"] = latest + + def _resolve_image_auto_tags(self): + items = self.images.get("user-overrides", []) + for item in items: + if str(item.get("tag", "")) == ".auto": + registry = item["registry"] + repo = item["repo"] + image = item["image"] + ref = f"{registry}/{repo}/{image}" + tags = self._skopeo_list_tags(ref) + if not tags: + raise RuntimeError(f"No tags found for image {item['name']}") + tags.sort() + latest = tags[-1] + item["tag"] = latest + + def _literal_str_representer(self, dumper, data): + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + + def _convert_multiline_strings(self, obj): + if isinstance(obj, dict): + return {k: self._convert_multiline_strings(v) for k, v in obj.items()} + if isinstance(obj, list): + return [self._convert_multiline_strings(v) for v in obj] + if isinstance(obj, str) and "\n" in obj: + return LiteralStr(obj) + return obj + + def get_item_by_name(self, category, name): + """Generic method to get an item by name from any category""" + if category not in self._indexes: + raise ValueError(f"Unknown category: {category}") + return self._indexes[category].get(name) + + def get_chart_by_name(self, name): + return self.get_item_by_name(self._charts_key, name) + + def get_image_by_name(self, name): + return self.get_item_by_name(self._images_key, name) + + def plan_to_dict(self): + return { + 
self._charts_key: self.charts, + self._images_key: self.images, + self._system_stack_key: self.system_stack, + self._system_prepare_key: self.system_prepare, + self._system_harness_key: self.system_harness, + } + + def plan_to_yaml(self): + plan = self.plan_to_dict() + plan = self._convert_multiline_strings(plan) + with open(self.output_file, "w") as f: + yaml.dump( + plan, + f, + default_flow_style=False, + sort_keys=False, + allow_unicode=True, + ) + + def parse(self): + """Load defaults and apply overrides""" + yaml.add_representer(LiteralStr, self._literal_str_representer) + + self.charts = self._render_template_attribute(self._charts_key) + self.images = self._render_template_attribute(self._images_key) + self.system_stack = self._render_template_attribute(self._system_stack_key) + self.system_prepare = self._render_template_attribute(self._system_prepare_key) + self.system_harness = self._render_template_attribute(self._system_harness_key) + + self._resolve_chart_auto_versions() + self._resolve_image_auto_tags() + self._build_indexes() + + return self.plan_to_dict() diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/experiment.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/experiment.py new file mode 100644 index 00000000..3e818027 --- /dev/null +++ b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/experiment.py @@ -0,0 +1,3 @@ +class Experiment: + def __init__(self): + pass diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/harness.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/harness.py new file mode 100644 index 00000000..274f352c --- /dev/null +++ b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/harness.py @@ -0,0 +1,4 @@ +class Harness: + def __init__(self, runner_name: str, runner_content: dict): + self.name = runner_name + self.runner = runner_content diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/plan.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/plan.py new file mode 100644 index 00000000..1767f422 --- /dev/null +++ b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/plan.py @@ -0,0 +1,3 @@ +class Plan: + def __init__(self): + pass diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/system.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/system.py new file mode 100644 index 00000000..0028d443 --- /dev/null +++ b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/system.py @@ -0,0 +1,3 @@ +class System: + def __init__(self): + pass diff --git a/declarative_poc/templates/llmdbench/llmdbench/__init__.py b/declarative_poc/templates/llmdbench/llmdbench/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/declarative_poc/templates/llmdbench/llmdbench/cli.py b/declarative_poc/templates/llmdbench/llmdbench/cli.py new file mode 100644 index 00000000..cd49c405 --- /dev/null +++ b/declarative_poc/templates/llmdbench/llmdbench/cli.py @@ -0,0 +1,128 @@ +from llmdbench.parser.systemparser import SystemParser +from llmdbench.logging.logger import get_logger, set_stage + +import json +import argparse +import yaml + + +def cli(): + """ + Command-line interface for llmdbench. 
+ + Subcommands: + - plan: Merge and render YAMLs (previously 'configure') + - prepare: Prepare environment or data before execution + - execute: Run workloads or apply configurations + - destroy: Clean up or rollback resources + - report: Generate summary or benchmark reports + """ + logger = get_logger("llmdbench.cli") + + parser = argparse.ArgumentParser( + prog="llmdbench", + description="Manage and benchmark llmd configurations.", + ) + + subparsers = parser.add_subparsers(dest="command", required=True) + + # -------------------------- + # plan + # -------------------------- + plan_parser = subparsers.add_parser( + "plan", + help="Merge charts/images and render templates into a versioned YAML plan.", + ) + plan_parser.add_argument( + "--experiment", + required=True, + help="Path to the experiment file to plan.", + ) + plan_parser.add_argument( + "--output", + default="system_plan.yaml", + help="Path to save the output experiment as a YAML file.", + ) + + # -------------------------- + # prepare + # -------------------------- + prepare_parser = subparsers.add_parser( + "prepare", help="Prepare the environment or dependencies for execution." + ) + prepare_parser.add_argument( + "--config", required=False, help="Optional path to configuration YAML." + ) + + # -------------------------- + # execute + # -------------------------- + execute_parser = subparsers.add_parser( + "execute", help="Execute the benchmark or deployment defined in the plan." + ) + execute_parser.add_argument( + "--plan", required=True, help="Path to the planned YAML configuration." + ) + + # -------------------------- + # destroy + # -------------------------- + destroy_parser = subparsers.add_parser( + "destroy", help="Tear down or rollback any created resources." + ) + destroy_parser.add_argument( + "--plan", required=False, help="Path to the plan used for deployment." + ) + + # -------------------------- + # report + # -------------------------- + report_parser = subparsers.add_parser( + "report", help="Generate a report or analysis from execution results." + ) + report_parser.add_argument( + "--input", required=False, help="Path to execution results or metrics." + ) + report_parser.add_argument( + "--output", default="report.yaml", help="Path to save the report output." + ) + + # -------------------------- + # Parse and dispatch + # -------------------------- + args = parser.parse_args() + + with open(args.experiment, "r") as f: + data = yaml.safe_load(f) + template_path = data["template"]["path"] + scenario_path = data["scenario"]["path"] + + # Regardless - we need create a plan - otherwise we won't have context of + # what to todo - in the future we can "import" a context to "rerun" a plan. 
+ system = SystemParser(template_path, args.output, scenario_path) + system.parse() + + if args.command == "plan": + set_stage(logger, "๐Ÿ”ง PLAN") + logger.info("Creating execution and deployment plan...") + logger.info(f"Plan saved to {args.output}") + # print(json.dumps(system.plan_to_dict(), indent=2)) + system.plan_to_yaml() + elif args.command == "prepare": + set_stage(logger, "๐Ÿ”ง PREPARE") + logger.info("Preparing environment...") + elif args.command == "execute": + set_stage(logger, "๐Ÿš€ EXECUTE") + logger.info(f"Executing plan: {args.plan}") + elif args.command == "destroy": + set_stage(logger, "๐Ÿงน DESTROY") + logger.info("Cleaning up resources...") + elif args.command == "report": + set_stage(logger, "๐Ÿ“Š REPORT") + logger.info("Generating report...") + else: + parser.print_help() + + +if __name__ == "__main__": + cli() diff --git a/declarative_poc/templates/llmdbench/llmdbench/logging/logger.py b/declarative_poc/templates/llmdbench/llmdbench/logging/logger.py new file mode 100644 index 00000000..c37115ce --- /dev/null +++ b/declarative_poc/templates/llmdbench/llmdbench/logging/logger.py @@ -0,0 +1,40 @@ +import logging +import sys + + +class StageFormatter(logging.Formatter): + def __init__(self, stage="RUN", fmt=None, datefmt=None): + self.stage = stage + super().__init__(fmt=fmt, datefmt=datefmt) + + def format(self, record): + record.stage = getattr(record, "stage", self.stage) + return super().format(record) + + +def get_logger(name="llmdbench", stage="RUN", level=logging.INFO): + logger = logging.getLogger(name) + + if not logger.handlers: + handler = logging.StreamHandler(sys.stdout) + formatter = StageFormatter( + stage=stage, + fmt="%(asctime)s - %(levelname)-8s - %(name)s - %(stage)s - %(message)s", + datefmt="%H:%M:%S", + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(level) + logger.propagate = False + else: + for handler in logger.handlers: + if isinstance(handler.formatter, StageFormatter): + handler.formatter.stage = stage + + return logger + + +def set_stage(logger, stage): + for handler in logger.handlers: + if isinstance(handler.formatter, StageFormatter): + handler.formatter.stage = stage diff --git a/declarative_poc/templates/llmdbench/llmdbench/parser/__init__.py b/declarative_poc/templates/llmdbench/llmdbench/parser/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/declarative_poc/templates/llmdbench/llmdbench/parser/systemparser.py b/declarative_poc/templates/llmdbench/llmdbench/parser/systemparser.py new file mode 100644 index 00000000..15d32147 --- /dev/null +++ b/declarative_poc/templates/llmdbench/llmdbench/parser/systemparser.py @@ -0,0 +1,251 @@ +import yaml +import copy +import json +import subprocess +import requests +import re + + +class LiteralStr(str): + pass + + +class SystemParser: + def __init__(self, defaults_file, output_file, scenario_file=None): + self.defaults = self._load_yaml(defaults_file) + self.output_file = output_file + self.scenario = self._load_yaml(scenario_file) if scenario_file else {} + + self._charts_key = "charts" + self.charts = {} + + self._images_key = "images" + self.images = {} + + self._system_stack_key = "system" + self.system_stack = {} + + self._system_prepare_key = "prepare" + self._system_prepare = {} + + self._system_harness_key = "harness" + self._system_harness = {} + + self.system_experiments_key = "experiments" + self.system_experiments = {} + + def _load_yaml(self, file_path): + """Load YAML file""" + with open(file_path, "r") as f: + return 
yaml.safe_load(f) or {} + + def _merge_lists(self, base_list, override_list): + """ + Merge lists of dictionaries by 'name' field. + If items have 'name' field, merge by matching names. + Otherwise, replace the entire list. + """ + if base_list and isinstance(base_list[0], dict) and "name" in base_list[0]: + result = copy.deepcopy(base_list) + base_map = {item["name"]: idx for idx, item in enumerate(result)} + for override_item in override_list: + if "name" in override_item: + name = override_item["name"] + if name in base_map: + idx = base_map[name] + result[idx] = self._deep_merge(result[idx], override_item) + else: + result.append(copy.deepcopy(override_item)) + + return result + else: + return copy.deepcopy(override_list) + + def _deep_merge(self, base, overrides): + """ + Recursively merge overrides into base dictionary. + For lists of dicts with 'name' field, merge by matching names. + Overrides take precedence over base values. + """ + result = copy.deepcopy(base) + for key, value in overrides.items(): + if key in result: + if isinstance(result[key], dict) and isinstance(value, dict): + result[key] = self._deep_merge(result[key], value) + elif isinstance(result[key], list) and isinstance(value, list): + result[key] = self._merge_lists(result[key], value) + else: + result[key] = value + else: + result[key] = copy.deepcopy(value) + return result + + def _get_nested(self, data, path: str): + """ + Retrieves a nested structure using dotted paths. + Supports list indexes like key.0.name or key[0].name. + """ + path = re.sub(r"\[(\d+)\]", r".\1", path) + parts = path.split(".") + + current = data + + for part in parts: + if isinstance(current, dict): + current = current.get(part, {}) + elif isinstance(current, list): + if not part.isdigit(): + return {} + idx = int(part) + if idx < 0 or idx >= len(current): + return {} + current = current[idx] + else: + return {} + + return current + + def _render_template_attribute(self, key): + render = self._get_nested(self.defaults, key) + scenarios = self.scenario.get("scenario", []) + for i, _ in enumerate(scenarios): + path = f"scenario.{i}.{key}" + scenario_value = self._get_nested(self.scenario, path) + render = self._deep_merge(render, scenario_value) + + return render + + def _build_indexes(self): + """Build lookup dictionaries for all categories""" + self._indexes = {} + for category in [self._charts_key, self._images_key]: + data = getattr(self, category, {}) + self._indexes[category] = { + item["name"]: item for item in data.get("user-overrides", []) + } + + def _skopeo_list_tags(self, ref): + """ + Call: skopeo list-tags docker://ghcr.io/org/image + Return: list of tags (strings) + """ + try: + cmd = ["skopeo", "list-tags", f"docker://{ref}"] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + data = json.loads(result.stdout) + return data.get("Tags", []) + except Exception as e: + raise RuntimeError(f"Skopeo failed for {ref}: {e}") + + def _is_oci_repo(self, url: str) -> bool: + return url.startswith("oci://") + + def _helm_http_list_versions(self, url, chart_name): + """ + Given a Helm HTTP repo URL and chart name, return list of versions. 
+ Uses index.yaml which lives at: /index.yaml + """ + index_url = url.rstrip("/") + "/index.yaml" + response = requests.get(index_url, timeout=10) + if response.status_code != 200: + raise RuntimeError(f"Failed to fetch {index_url}: {response.status_code}") + index = yaml.safe_load(response.text) + entries = index.get("entries", {}) + if chart_name not in entries: + raise RuntimeError(f"Chart '{chart_name}' not found at {index_url}") + versions = [entry["version"] for entry in entries[chart_name]] + return versions + + def _resolve_chart_auto_versions(self): + items = self.charts.get("user-overrides", []) + for item in items: + if str(item.get("version", "")) != ".auto": + continue + url = item["url"] + name = item["name"] + if self._is_oci_repo(url): + ref = url.replace("oci://", "") + tags = self._skopeo_list_tags(ref) + else: + tags = self._helm_http_list_versions(url, name) + if not tags: + raise RuntimeError(f"No chart versions found for {name}") + tags.sort() + latest = tags[-1] + item["version"] = latest + + def _resolve_image_auto_tags(self): + items = self.images.get("user-overrides", []) + for item in items: + if str(item.get("tag", "")) == ".auto": + registry = item["registry"] + repo = item["repo"] + image = item["image"] + ref = f"{registry}/{repo}/{image}" + tags = self._skopeo_list_tags(ref) + if not tags: + raise RuntimeError(f"No tags found for image {item['name']}") + tags.sort() + latest = tags[-1] + item["tag"] = latest + + def _literal_str_representer(self, dumper, data): + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + + def _convert_multiline_strings(self, obj): + if isinstance(obj, dict): + return {k: self._convert_multiline_strings(v) for k, v in obj.items()} + if isinstance(obj, list): + return [self._convert_multiline_strings(v) for v in obj] + if isinstance(obj, str) and "\n" in obj: + return LiteralStr(obj) + return obj + + def get_item_by_name(self, category, name): + """Generic method to get an item by name from any category""" + if category not in self._indexes: + raise ValueError(f"Unknown category: {category}") + return self._indexes[category].get(name) + + def get_chart_by_name(self, name): + return self.get_item_by_name(self._charts_key, name) + + def get_image_by_name(self, name): + return self.get_item_by_name(self._images_key, name) + + def plan_to_dict(self): + return { + self._charts_key: self.charts, + self._images_key: self.images, + self._system_stack_key: self.system_stack, + self._system_prepare_key: self.system_prepare, + self._system_harness_key: self.system_harness, + } + + def plan_to_yaml(self): + plan = self.plan_to_dict() + plan = self._convert_multiline_strings(plan) + with open(self.output_file, "w") as f: + yaml.dump( + plan, + f, + default_flow_style=False, + sort_keys=False, + allow_unicode=True, + ) + + def parse(self): + """Load defaults and apply overrides""" + yaml.add_representer(LiteralStr, self._literal_str_representer) + + self.charts = self._render_template_attribute(self._charts_key) + self.images = self._render_template_attribute(self._images_key) + self.system_stack = self._render_template_attribute(self._system_stack_key) + self.system_prepare = self._render_template_attribute(self._system_prepare_key) + self.system_harness = self._render_template_attribute(self._system_harness_key) + + self._resolve_chart_auto_versions() + self._resolve_image_auto_tags() + self._build_indexes() + + return self.plan_to_dict() diff --git a/declarative_poc/templates/llmdbench/llmdbench/plan/experiment.py 
b/declarative_poc/templates/llmdbench/llmdbench/plan/experiment.py new file mode 100644 index 00000000..3e818027 --- /dev/null +++ b/declarative_poc/templates/llmdbench/llmdbench/plan/experiment.py @@ -0,0 +1,3 @@ +class Experiment: + def __init__(self): + pass diff --git a/declarative_poc/templates/llmdbench/llmdbench/plan/harness.py b/declarative_poc/templates/llmdbench/llmdbench/plan/harness.py new file mode 100644 index 00000000..274f352c --- /dev/null +++ b/declarative_poc/templates/llmdbench/llmdbench/plan/harness.py @@ -0,0 +1,4 @@ +class Harness: + def __init__(self, runner_name: str, runner_content: dict): + self.name = runner_name + self.runner = runner_content diff --git a/declarative_poc/templates/llmdbench/llmdbench/plan/plan.py b/declarative_poc/templates/llmdbench/llmdbench/plan/plan.py new file mode 100644 index 00000000..1767f422 --- /dev/null +++ b/declarative_poc/templates/llmdbench/llmdbench/plan/plan.py @@ -0,0 +1,3 @@ +class Plan: + def __init__(self): + pass diff --git a/declarative_poc/templates/llmdbench/llmdbench/plan/system.py b/declarative_poc/templates/llmdbench/llmdbench/plan/system.py new file mode 100644 index 00000000..0028d443 --- /dev/null +++ b/declarative_poc/templates/llmdbench/llmdbench/plan/system.py @@ -0,0 +1,3 @@ +class System: + def __init__(self): + pass diff --git a/declarative_poc/templates/llmdbench/pyproject.toml b/declarative_poc/templates/llmdbench/pyproject.toml new file mode 100644 index 00000000..d8e1509b --- /dev/null +++ b/declarative_poc/templates/llmdbench/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "llmdbench" +version = "0.1.0" +description = "A library for configuration discovery and benchmarking for llm-d." +dependencies = [ + "PyYAML", + "Jinja2", + "requests", + "packaging" +] + +[project.scripts] +llmdbench = "llmdbench.cli:cli" diff --git a/declarative_poc/templates/scenarios/inference-scheduling.yaml b/declarative_poc/templates/scenarios/inference-scheduling.yaml new file mode 100644 index 00000000..a1ed88bf --- /dev/null +++ b/declarative_poc/templates/scenarios/inference-scheduling.yaml @@ -0,0 +1,40 @@ +scenario: + - name: "sut-1" + prepare: + user-overrides: + secrets: + - name: llm-d-hf-token + secret: HF_TOKEN + contents: REPLACE_HF_TOKEN + files: + - name: llm-d-benchmark-preprocesses + path: REPLACE_DIR_PATH + system: + user-overrides: + - name: "default" + inference-engine: + model: + - name: meta-llama/Llama-3.1-8B-Instruct + label: .auto + maxlen: 16384 + blocksize: 64 + replicas: + decode: 2 + prefill: 0 + volumes: + - name: model-storage + size: 1Ti + command: + decode: + type: vllmServe + args: + - "--enforce-eager" + - "--block-size" + - "64" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16374" + \ No newline at end of file diff --git a/declarative_poc/templates/scenarios/pd-disaggregation.yaml b/declarative_poc/templates/scenarios/pd-disaggregation.yaml new file mode 100644 index 00000000..77bba40b --- /dev/null +++ b/declarative_poc/templates/scenarios/pd-disaggregation.yaml @@ -0,0 +1,65 @@ +scenario: + - name: "sut-1" + prepare: + user-overrides: + secrets: + - name: llm-d-hf-token + secret: HF_TOKEN + contents: TYLERS_TOKEN + system: + user-overrides: + - name: "default" + inference-engine: + model: + - name: meta-llama/Llama-3.1-8B-Instruct + label: .auto + maxlen: 16000 + blocksize: 128 + replicas: + decode: 2 + prefill: 2 + parallelism: + decode: + tensor: 1 + 
prefill: + tensor: 1 + resources: + decode: + memory: 128Gi + cpu: 32 + prefill: + memory: 128Gi + cpu: 32 + volumes: + - name: model-storage + size: 1Ti + command: + decode: + type: vllmServe + args: + - "--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16000" + prefill: + type: vllmServe + args: + - "--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16000" + harness: + user-overrides: + - name: default + namespace: harnessns + harness: + name: vllm-benchmark + profile: random_concurrent.yaml \ No newline at end of file diff --git a/declarative_poc/templates/system_plan.yaml b/declarative_poc/templates/system_plan.yaml new file mode 100644 index 00000000..13746bc9 --- /dev/null +++ b/declarative_poc/templates/system_plan.yaml @@ -0,0 +1,370 @@ +charts: + user-overrides: + - name: kgateway-crds + url: oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds + version: 2.0.3 + - name: kgateway + url: oci://cr.kgateway.dev/kgateway-dev/charts/kgateway + version: 2.0.3 + - name: istio + url: oci://gcr.io/istio-testing/charts + version: 1.28-alpha.89f30b26ba71bf5e538083a4720d0bc2d8c06401 + - name: llm-d-infra + url: https://llm-d-incubation.github.io/llm-d-infra + version: 1.3.0 + - name: gateway-api-inference-extension + url: oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + version: v1.2.0-rc.1 + - name: llm-d-modelservice + url: https://llm-d-incubation.github.io/llm-d-modelservice/ + version: v0.3.9 +images: + user-overrides: + - name: llm-d-benchmark + registry: ghcr.io + repo: llm-d + image: llm-d-benchmark + tag: v0.3.7 + - name: llm-d + registry: ghcr.io + repo: llm-d + image: llm-d-cuda + tag: v0.4.0 + - name: llm-d-model-service + registry: ghcr.io + repo: llm-d + image: llm-d-model-service + tag: v0.0.15 + - name: llm-d-inference-scheduler + registry: ghcr.io + repo: llm-d + image: llm-d-inference-scheduler + tag: v0.4.0-rc.1 + - name: llm-d-routing-sidecar + registry: ghcr.io + repo: llm-d + image: llm-d-routing-sidecar + tag: v0.4.0-rc.1 + - name: llm-d-inference-sim + registry: ghcr.io + repo: llm-d + image: llm-d-inference-sim + tag: v0.6.1 + - name: vllm + registry: docker.io + repo: vllm + image: vllm-openai + tag: latest +system: + user-overrides: + - name: default + namespace: stackbenchns + release: stackbenchr + gateway: + type: kgateway + router: + plugins: default-plugins.yaml + volumes: + - name: model-storage + type: pvc + mount: model-storage + size: 1Ti + - name: dshm + type: Memory + mount: /dev/shm + size: 16Gi + components: + - name: infra + charts: + - name: llm-d-infra + contents: + gateway: + gatewayClassName: modelservice + service: + type: NodePort + gatewayParameters: + enabled: true + - name: router + charts: + - name: gateway-api-inference-extension + contents: + loadfrom: + - BASE_DIR/_templates/gateway-api-inference-extension.yaml + - name: inference-engine + modelservice: + charts: + - name: llm-d-modelservice + contents: + loadfrom: + - BASE_DIR/_templates/modelservice.yaml + standalone: + contents: + loadfrom: + - BASE_DIR/_templates/standalone.yaml + inference-engine: + type: modelservice + model: + - name: meta-llama/Llama-3.1-8B-Instruct + label: .auto + maxlen: 16384 + blocksize: 64 + accelerators: + standalone: + key: 
nvidia.com/gpu.product + value: NVIDIA-H100-80GB-HBM3 + decode: + key: nvidia.com/gpu.product + value: NVIDIA-H100-80GB-HBM3 + prefill: + key: nvidia.com/gpu.product + value: NVIDIA-H100-80GB-HBM3 + replicas: + standalone: 1 + decode: 2 + prefill: 0 + parallelism: + standalone: + data: 1 + tensor: 1 + decode: + data: 1 + tensor: 1 + prefill: + data: 1 + tensor: 1 + resources: + standalone: + memory: 40Gi + cpu: '4' + nvidia.com/gpu: '1' + ephemeral-storage: 20Gi + decode: + memory: 40Gi + cpu: '4' + nvidia.com/gpu: '1' + ephemeral-storage: 20Gi + prefill: + memory: 40Gi + cpu: '4' + nvidia.com/gpu: '1' + ephemeral-storage: 20Gi + annotations: + deployed-by: .auto + modelservice: llm-d-benchmark + ports: + service: 8000 + extra: 9002 + readiness: 8200 + zmq: 5557 + nixl: 5557 + labels: + app: .auto + stood-up-by: .auto + stood-up-from: llm-d-benchmark + stood-up-via: .auto + env: + standalone: + - name: LLMDBENCH_VLLM_STANDALONE_MODEL + value: TEMPLATE_MODEL_NAME + - name: LLMDBENCH_VLLM_STANDALONE_VLLM_LOAD_FORMAT + value: auto + - name: LLMDBENCH_VLLM_STANDALONE_MODEL_LOADER_EXTRA_CONFIG + value: '{}' + - name: VLLM_LOGGING_LEVEL + value: INFO + - name: HF_HOME + value: /TEMPLATE_MODEL_STORAGE + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: TEMPLATE_REPLACE_SECRET_NAME + key: TEMPLATE_REPLACE_SECRET + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: rc,sm,cuda_ipc,cuda_copy,tcp + - name: UCX_SOCKADDR_TLS_PRIORITY + value: tcp + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: TEMPLATE_REPLACE_NXL_PORT + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: '1' + - name: VLLM_SERVER_DEV_MODE + value: '1' + decode: + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: rc,sm,cuda_ipc,cuda_copy,tcp + - name: UCX_SOCKADDR_TLS_PRIORITY + value: tcp + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: TEMPLATE_REPLACE_NXL_PORT + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: '1' + - name: VLLM_SERVER_DEV_MODE + value: '1' + prefill: + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: rc,sm,cuda_ipc,cuda_copy,tcp + - name: UCX_SOCKADDR_TLS_PRIORITY + value: tcp + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: TEMPLATE_REPLACE_NXL_PORT + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: '1' + - name: VLLM_SERVER_DEV_MODE + value: '1' + command: + decode: + type: vllmServe + args: + - --enforce-eager + - --block-size + - '64' + - --kv-transfer-config + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - --disable-log-requests + - --disable-uvicorn-access-log + - --max-model-len + - '16374' +prepare: + user-overrides: + gateway: + provider: + - name: kgateway + charts: + - name: kgateway-crds + - name: kgateway + deploy: true + check: true + - name: istio + charts: + - name: istio + deploy: false + check: false + api: + url: https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd + version: 1.3.0 + deploy: true + check: true + inference_extension: + url: https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd + version: 1.0.1 + deploy: true + check: true + wva: + namespace: workload-variant-autoscaler-system + charts: + - name: workload-variant-autoscaler + images: + - name: workload-variant-autoscaler + replicas: 1 + version: 0.1.0 + autoscaling: + enabled: true + slo: + tpot: 30 + ttft: 1000 + hpa: + enabled: true + max_replicas: 10 + target_avg_value: 1 + 
vllm: + enabled: true + node_port_min: 30000 + node_port_max: 32767 + interval: 15 + workload_monitoring: + namespace: openshift-user-workload-monitoring + url: https://thanos-querier.openshift-monitoring.svc.cluster.local + port: 9091 + deploy: true + check: true + storage: + - name: model-storage + namespace: stackbenchns + class: default + size: 300Gi + download: + url: + enabled: true + timeout: 3600 + deploy: true + check: true + - name: replay-pvc + namespace: harnessns + class: default + size: 300Gi + download: + url: https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json + enabled: false + timeout: 3600 + deploy: true + check: true + - name: workload-pvc + namespace: harnessns + class: default + size: 300Gi + deploy: true + check: true + secrets: + - name: llm-d-hf-token + secret: HF_TOKEN + contents: TYLERS_TOKEN + files: + - name: llm-d-benchmark-preprocesses + path: REPLACE_BASE_DIR/setup/preprocess +harness: + user-overrides: + - name: default + namespace: harnessns + harness: + name: inference-perf + profile: sanity_random.yaml + executable: llm-d-benchmark.sh + timeout: 3600 + resources: + memory: 32Gi + cpu: '16' + volumes: + - name: workload-pvc + type: pvc + mount: /requests + - name: replay-pvc + type: pvc + mount: /data + env: + - name: LLMDBENCH_RUN_EXPERIMENT_LAUNCHER + value: '1' + - name: LLMDBENCH_RUN_DATASET_URL + value: .prepare.dependencies.user-overrides.storage[]|select(name=replay-pvc).download.url + - name: LLMDBENCH_RUN_WORKSPACE_DIR + value: /workspace + - name: LLMDBENCH_HARNESS_NAME + value: .experiments[0].user-overrides.runners[0].harness.name + - name: LLMDBENCH_HARNESS_NAMESPACE + value: .experiments[0].user-overrides.runners[0].namespace + - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL + value: ${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL} + - name: LLMDBENCH_HARNESS_STACK_NAME + value: ${LLMDBENCH_HARNESS_SANITIZED_STACK_NAME} + - name: LLMDBENCH_DEPLOY_METHODS + value: ${LLMDBENCH_DEPLOY_METHODS} + - name: LLMDBENCH_MAGIC_ENVAR + value: harness_pod + - name: HF_TOKEN_SECRET + value: ${LLMDBENCH_VLLM_COMMON_HF_TOKEN_NAME} + - name: .prepare.dependencies.user-overrides.secrets[0].secret + valueFrom: + secretKeyRef: + name: .prepare.dependencies.user-overrides.secrets[0].name + key: .prepare.dependencies.user-overrides.secrets[0].secret From 79b662994b5543a480e1a746ca5d5479eff64942 Mon Sep 17 00:00:00 2001 From: vezio Date: Tue, 2 Dec 2025 15:52:09 -0500 Subject: [PATCH 2/5] unadd Signed-off-by: vezio --- .../experiments/inference-scheduling.yaml | 4 - .../experiments/pd-disaggregation.yaml | 4 - .../llmdbench/build/lib/llmdbench/__init__.py | 0 .../llmdbench/build/lib/llmdbench/cli.py | 128 ------ .../build/lib/llmdbench/logging/logger.py | 40 -- .../build/lib/llmdbench/parser/__init__.py | 0 .../build/lib/llmdbench/parser/parse.py | 39 -- .../lib/llmdbench/parser/systemparser.py | 258 ------------ .../build/lib/llmdbench/plan/experiment.py | 3 - .../build/lib/llmdbench/plan/harness.py | 4 - .../build/lib/llmdbench/plan/plan.py | 3 - .../build/lib/llmdbench/plan/system.py | 3 - .../templates/llmdbench/llmdbench/__init__.py | 0 .../templates/llmdbench/llmdbench/cli.py | 128 ------ .../llmdbench/llmdbench/logging/logger.py | 40 -- .../llmdbench/llmdbench/parser/__init__.py | 0 .../llmdbench/parser/systemparser.py | 251 ------------ .../llmdbench/llmdbench/plan/experiment.py | 3 - .../llmdbench/llmdbench/plan/harness.py | 4 - .../llmdbench/llmdbench/plan/plan.py | 3 - 
.../llmdbench/llmdbench/plan/system.py | 3 - .../templates/llmdbench/pyproject.toml | 13 - .../scenarios/inference-scheduling.yaml | 40 -- .../scenarios/pd-disaggregation.yaml | 65 --- declarative_poc/templates/system_plan.yaml | 370 ------------------ 25 files changed, 1406 deletions(-) delete mode 100644 declarative_poc/templates/experiments/inference-scheduling.yaml delete mode 100644 declarative_poc/templates/experiments/pd-disaggregation.yaml delete mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/__init__.py delete mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/cli.py delete mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/logging/logger.py delete mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/__init__.py delete mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/parse.py delete mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/systemparser.py delete mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/experiment.py delete mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/harness.py delete mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/plan.py delete mode 100644 declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/system.py delete mode 100644 declarative_poc/templates/llmdbench/llmdbench/__init__.py delete mode 100644 declarative_poc/templates/llmdbench/llmdbench/cli.py delete mode 100644 declarative_poc/templates/llmdbench/llmdbench/logging/logger.py delete mode 100644 declarative_poc/templates/llmdbench/llmdbench/parser/__init__.py delete mode 100644 declarative_poc/templates/llmdbench/llmdbench/parser/systemparser.py delete mode 100644 declarative_poc/templates/llmdbench/llmdbench/plan/experiment.py delete mode 100644 declarative_poc/templates/llmdbench/llmdbench/plan/harness.py delete mode 100644 declarative_poc/templates/llmdbench/llmdbench/plan/plan.py delete mode 100644 declarative_poc/templates/llmdbench/llmdbench/plan/system.py delete mode 100644 declarative_poc/templates/llmdbench/pyproject.toml delete mode 100644 declarative_poc/templates/scenarios/inference-scheduling.yaml delete mode 100644 declarative_poc/templates/scenarios/pd-disaggregation.yaml delete mode 100644 declarative_poc/templates/system_plan.yaml diff --git a/declarative_poc/templates/experiments/inference-scheduling.yaml b/declarative_poc/templates/experiments/inference-scheduling.yaml deleted file mode 100644 index 8fb80a4d..00000000 --- a/declarative_poc/templates/experiments/inference-scheduling.yaml +++ /dev/null @@ -1,4 +0,0 @@ -template: - path: /Users/vezio/IBM/llmd/haul/templates/default_system.yaml -scenario: - path: /Users/vezio/IBM/llmd/haul/scenarios/inference-scheduling.yaml diff --git a/declarative_poc/templates/experiments/pd-disaggregation.yaml b/declarative_poc/templates/experiments/pd-disaggregation.yaml deleted file mode 100644 index e2a105f3..00000000 --- a/declarative_poc/templates/experiments/pd-disaggregation.yaml +++ /dev/null @@ -1,4 +0,0 @@ -template: - path: /Users/vezio/IBM/llmd/haul/templates/default_system.yaml -scenario: - path: /Users/vezio/IBM/llmd/haul/scenarios/pd-disaggregation.yaml \ No newline at end of file diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/__init__.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git 
a/declarative_poc/templates/llmdbench/build/lib/llmdbench/cli.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/cli.py deleted file mode 100644 index cd49c405..00000000 --- a/declarative_poc/templates/llmdbench/build/lib/llmdbench/cli.py +++ /dev/null @@ -1,128 +0,0 @@ -from llmdbench.parser.systemparser import SystemParser -from llmdbench.logging.logger import get_logger, set_stage - -import json -import argparse -import yaml - - -def cli(): - """ - Command-line interface for llmdbench. - - Subcommands: - - plan: Merge and render YAMLs (previously 'configure') - - prepare: Prepare environment or data before execution - - execute: Run workloads or apply configurations - - destroy: Clean up or rollback resources - - report: Generate summary or benchmark reports - """ - logger = get_logger("llmdbench.cli") - - parser = argparse.ArgumentParser( - prog="llmdbench", - description="Manage and benchmark llmd configurations.", - ) - - subparsers = parser.add_subparsers(dest="command", required=True) - - # -------------------------- - # plan - # -------------------------- - plan_parser = subparsers.add_parser( - "plan", - help="Merge charts/images and render templates into a versioned YAML plan.", - ) - plan_parser.add_argument( - "--experiment", - required=True, - help="Path to the experiment file to plan.", - ) - plan_parser.add_argument( - "--output", - default="system_plan.yaml", - help="Path to save the output experiment as a YAML file.", - ) - - # -------------------------- - # prepare - # -------------------------- - prepare_parser = subparsers.add_parser( - "prepare", help="Prepare the environment or dependencies for execution." - ) - prepare_parser.add_argument( - "--config", required=False, help="Optional path to configuration YAML." - ) - - # -------------------------- - # execute - # -------------------------- - execute_parser = subparsers.add_parser( - "execute", help="Execute the benchmark or deployment defined in the plan." - ) - execute_parser.add_argument( - "--plan", required=True, help="Path to the planned YAML configuration." - ) - - # -------------------------- - # destroy - # -------------------------- - destroy_parser = subparsers.add_parser( - "destroy", help="Tear down or rollback any created resources." - ) - destroy_parser.add_argument( - "--plan", required=False, help="Path to the plan used for deployment." - ) - - # -------------------------- - # report - # -------------------------- - report_parser = subparsers.add_parser( - "report", help="Generate a report or analysis from execution results." - ) - report_parser.add_argument( - "--input", required=False, help="Path to execution results or metrics." - ) - report_parser.add_argument( - "--output", default="report.yaml", help="Path to save the report output." - ) - - # -------------------------- - # Parse and dispatch - # -------------------------- - args = parser.parse_args() - - with open(args.experiment, "r") as f: - data = yaml.safe_load(f) - template_path = data["template"]["path"] - scenario_path = data["scenario"]["path"] - - # Regardless - we need create a plan - otherwise we won't have context of - # what to todo - in the future we can "import" a context to "rerun" a plan. 
- system = SystemParser(template_path, args.output, scenario_path) - system.parse() - - if args.command == "plan": - set_stage(logger, "๐Ÿ”ง PLAN") - logger.info("Creating execution and deployment plan...") - logger.info(f"Plan saved to {args.output}") - # print(json.dumps(system.plan_to_dict(), indent=2)) - system.plan_to_yaml() - elif args.command == "prepare": - set_stage(logger, "๐Ÿ”ง PREPARE") - logger.info("Preparing environment...") - elif args.command == "execute": - set_stage(logger, "๐Ÿš€ EXECUTE") - logger.info(f"Executing plan: {args.plan}") - elif args.command == "destroy": - set_stage(logger, "๐Ÿงน DESTROY") - logger.info("Cleaning up resources...") - elif args.command == "report": - set_stage(logger, "๐Ÿ“Š REPORT") - logger.info("Generating report...") - else: - parser.print_help() - - -if __name__ == "__main__": - cli() diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/logging/logger.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/logging/logger.py deleted file mode 100644 index c37115ce..00000000 --- a/declarative_poc/templates/llmdbench/build/lib/llmdbench/logging/logger.py +++ /dev/null @@ -1,40 +0,0 @@ -import logging -import sys - - -class StageFormatter(logging.Formatter): - def __init__(self, stage="RUN", fmt=None, datefmt=None): - self.stage = stage - super().__init__(fmt=fmt, datefmt=datefmt) - - def format(self, record): - record.stage = getattr(record, "stage", self.stage) - return super().format(record) - - -def get_logger(name="llmdbench", stage="RUN", level=logging.INFO): - logger = logging.getLogger(name) - - if not logger.handlers: - handler = logging.StreamHandler(sys.stdout) - formatter = StageFormatter( - stage=stage, - fmt="%(asctime)s - %(levelname)-8s - %(name)s - %(stage)s - %(message)s", - datefmt="%H:%M:%S", - ) - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(level) - logger.propagate = False - else: - for handler in logger.handlers: - if isinstance(handler.formatter, StageFormatter): - handler.formatter.stage = stage - - return logger - - -def set_stage(logger, stage): - for handler in logger.handlers: - if isinstance(handler.formatter, StageFormatter): - handler.formatter.stage = stage diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/__init__.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/parse.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/parse.py deleted file mode 100644 index db817524..00000000 --- a/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/parse.py +++ /dev/null @@ -1,39 +0,0 @@ -import yaml -import os -import sys -from jinja2 import ( - Environment, - FileSystemLoader, - StrictUndefined, - Template, - ChainableUndefined, -) - - -# Load values.yaml if present -def load_values(path="values.yaml"): - try: - with open(path, "r") as f: - return yaml.safe_load(f) or {} - except FileNotFoundError: - return {} - - -def parse( - template_dir=None, values_file=None, output_dir="/Users/vezio/IBM/llmd/haul/stack" -): - - # Jinja environment - env = Environment( - loader=FileSystemLoader(template_dir), undefined=ChainableUndefined - ) - - # Iterate over all template files - for filename in os.listdir(template_dir): - if not filename.endswith((".j2", ".jinja", ".tmpl", ".template")): - continue - - template = env.get_template(filename) - print(template) - rendered = 
template.render(charts={}) - print(rendered) diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/systemparser.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/systemparser.py deleted file mode 100644 index baa6814a..00000000 --- a/declarative_poc/templates/llmdbench/build/lib/llmdbench/parser/systemparser.py +++ /dev/null @@ -1,258 +0,0 @@ -import yaml -import copy -import json -import subprocess -import requests -import re - - -class LiteralStr(str): - pass - - -class SystemParser: - def __init__(self, defaults_file, output_file, scenario_file=None): - self.defaults = self._load_yaml(defaults_file) - self.output_file = output_file - self.scenario = self._load_yaml(scenario_file) if scenario_file else {} - - self._charts_key = "charts" - self.charts = {} - - self._images_key = "images" - self.images = {} - - self._system_stack_key = "system" - self.system_stack = {} - - self._system_prepare_key = "prepare" - self._system_prepare = {} - - self._system_harness_key = "harness" - self._system_harness = {} - - self.system_experiments_key = "experiments" - self.system_experiments = {} - - def _load_yaml(self, file_path): - """Load YAML file""" - with open(file_path, "r") as f: - return yaml.safe_load(f) or {} - - def _merge_lists(self, base_list, override_list): - """ - Merge lists of dictionaries by 'name' field. - If items have 'name' field, merge by matching names. - Otherwise, replace the entire list. - """ - if base_list and isinstance(base_list[0], dict) and "name" in base_list[0]: - result = copy.deepcopy(base_list) - base_map = {item["name"]: idx for idx, item in enumerate(result)} - for override_item in override_list: - if "name" in override_item: - name = override_item["name"] - if name in base_map: - idx = base_map[name] - result[idx] = self._deep_merge(result[idx], override_item) - else: - result.append(copy.deepcopy(override_item)) - - return result - else: - return copy.deepcopy(override_list) - - def _deep_merge(self, base, overrides): - """ - Recursively merge overrides into base dictionary. - For lists of dicts with 'name' field, merge by matching names. - Overrides take precedence over base values. - """ - result = copy.deepcopy(base) - for key, value in overrides.items(): - if key in result: - if isinstance(result[key], dict) and isinstance(value, dict): - result[key] = self._deep_merge(result[key], value) - elif isinstance(result[key], list) and isinstance(value, list): - result[key] = self._merge_lists(result[key], value) - else: - result[key] = value - else: - result[key] = copy.deepcopy(value) - return result - - def _get_nested(self, data, path: str): - """ - Retrieves a nested structure using dotted paths. - Supports list indexes like key.0.name or key[0].name. - """ - # Normalize bracket indices: "scenario[0]" -> "scenario.0" - path = re.sub(r"\[(\d+)\]", r".\1", path) - parts = path.split(".") - - current = data - - for part in parts: - if isinstance(current, dict): - # dict lookup - current = current.get(part, {}) - elif isinstance(current, list): - # list index lookup - if not part.isdigit(): - return {} # trying to index a list with non-int - idx = int(part) - if idx < 0 or idx >= len(current): - return {} - current = current[idx] - else: - # neither list nor dict - cannot go deeper - return {} - - return current - - def _render_template_attribute(self, key): - # Start with defaults - render = self._get_nested(self.defaults, key) - - # Merge all scenarios (scenario[0], scenario[1], ...) 
- scenarios = self.scenario.get("scenario", []) - for i, _ in enumerate(scenarios): - path = f"scenario.{i}.{key}" - scenario_value = self._get_nested(self.scenario, path) - render = self._deep_merge(render, scenario_value) - - return render - - def _build_indexes(self): - """Build lookup dictionaries for all categories""" - self._indexes = {} - for category in [self._charts_key, self._images_key]: - data = getattr(self, category, {}) - self._indexes[category] = { - item["name"]: item for item in data.get("user-overrides", []) - } - - def _skopeo_list_tags(self, ref): - """ - Call: skopeo list-tags docker://ghcr.io/org/image - Return: list of tags (strings) - """ - try: - cmd = ["skopeo", "list-tags", f"docker://{ref}"] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - data = json.loads(result.stdout) - return data.get("Tags", []) - except Exception as e: - raise RuntimeError(f"Skopeo failed for {ref}: {e}") - - def _is_oci_repo(self, url: str) -> bool: - return url.startswith("oci://") - - def _helm_http_list_versions(self, url, chart_name): - """ - Given a Helm HTTP repo URL and chart name, return list of versions. - Uses index.yaml which lives at: /index.yaml - """ - index_url = url.rstrip("/") + "/index.yaml" - response = requests.get(index_url, timeout=10) - if response.status_code != 200: - raise RuntimeError(f"Failed to fetch {index_url}: {response.status_code}") - index = yaml.safe_load(response.text) - entries = index.get("entries", {}) - if chart_name not in entries: - raise RuntimeError(f"Chart '{chart_name}' not found at {index_url}") - versions = [entry["version"] for entry in entries[chart_name]] - return versions - - def _resolve_chart_auto_versions(self): - items = self.charts.get("user-overrides", []) - for item in items: - if str(item.get("version", "")) != ".auto": - continue - url = item["url"] - name = item["name"] - if self._is_oci_repo(url): - ref = url.replace("oci://", "") - tags = self._skopeo_list_tags(ref) - else: - tags = self._helm_http_list_versions(url, name) - if not tags: - raise RuntimeError(f"No chart versions found for {name}") - tags.sort() - latest = tags[-1] - item["version"] = latest - - def _resolve_image_auto_tags(self): - items = self.images.get("user-overrides", []) - for item in items: - if str(item.get("tag", "")) == ".auto": - registry = item["registry"] - repo = item["repo"] - image = item["image"] - ref = f"{registry}/{repo}/{image}" - tags = self._skopeo_list_tags(ref) - if not tags: - raise RuntimeError(f"No tags found for image {item['name']}") - tags.sort() - latest = tags[-1] - item["tag"] = latest - - def _literal_str_representer(self, dumper, data): - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - - def _convert_multiline_strings(self, obj): - if isinstance(obj, dict): - return {k: self._convert_multiline_strings(v) for k, v in obj.items()} - if isinstance(obj, list): - return [self._convert_multiline_strings(v) for v in obj] - if isinstance(obj, str) and "\n" in obj: - return LiteralStr(obj) - return obj - - def get_item_by_name(self, category, name): - """Generic method to get an item by name from any category""" - if category not in self._indexes: - raise ValueError(f"Unknown category: {category}") - return self._indexes[category].get(name) - - def get_chart_by_name(self, name): - return self.get_item_by_name(self._charts_key, name) - - def get_image_by_name(self, name): - return self.get_item_by_name(self._images_key, name) - - def plan_to_dict(self): - return { - 
self._charts_key: self.charts, - self._images_key: self.images, - self._system_stack_key: self.system_stack, - self._system_prepare_key: self.system_prepare, - self._system_harness_key: self.system_harness, - } - - def plan_to_yaml(self): - plan = self.plan_to_dict() - plan = self._convert_multiline_strings(plan) - with open(self.output_file, "w") as f: - yaml.dump( - plan, - f, - default_flow_style=False, - sort_keys=False, - allow_unicode=True, - ) - - def parse(self): - """Load defaults and apply overrides""" - yaml.add_representer(LiteralStr, self._literal_str_representer) - - self.charts = self._render_template_attribute(self._charts_key) - self.images = self._render_template_attribute(self._images_key) - self.system_stack = self._render_template_attribute(self._system_stack_key) - self.system_prepare = self._render_template_attribute(self._system_prepare_key) - self.system_harness = self._render_template_attribute(self._system_harness_key) - - self._resolve_chart_auto_versions() - self._resolve_image_auto_tags() - self._build_indexes() - - return self.plan_to_dict() diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/experiment.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/experiment.py deleted file mode 100644 index 3e818027..00000000 --- a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/experiment.py +++ /dev/null @@ -1,3 +0,0 @@ -class Experiment: - def __init__(self): - pass diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/harness.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/harness.py deleted file mode 100644 index 274f352c..00000000 --- a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/harness.py +++ /dev/null @@ -1,4 +0,0 @@ -class Harness: - def __init__(self, runner_name: str, runner_content: dict): - self.name = runner_name - self.runner = runner_content diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/plan.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/plan.py deleted file mode 100644 index 1767f422..00000000 --- a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/plan.py +++ /dev/null @@ -1,3 +0,0 @@ -class Plan: - def __init__(self): - pass diff --git a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/system.py b/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/system.py deleted file mode 100644 index 0028d443..00000000 --- a/declarative_poc/templates/llmdbench/build/lib/llmdbench/plan/system.py +++ /dev/null @@ -1,3 +0,0 @@ -class System: - def __init__(self): - pass diff --git a/declarative_poc/templates/llmdbench/llmdbench/__init__.py b/declarative_poc/templates/llmdbench/llmdbench/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/declarative_poc/templates/llmdbench/llmdbench/cli.py b/declarative_poc/templates/llmdbench/llmdbench/cli.py deleted file mode 100644 index cd49c405..00000000 --- a/declarative_poc/templates/llmdbench/llmdbench/cli.py +++ /dev/null @@ -1,128 +0,0 @@ -from llmdbench.parser.systemparser import SystemParser -from llmdbench.logging.logger import get_logger, set_stage - -import json -import argparse -import yaml - - -def cli(): - """ - Command-line interface for llmdbench. 
- - Subcommands: - - plan: Merge and render YAMLs (previously 'configure') - - prepare: Prepare environment or data before execution - - execute: Run workloads or apply configurations - - destroy: Clean up or rollback resources - - report: Generate summary or benchmark reports - """ - logger = get_logger("llmdbench.cli") - - parser = argparse.ArgumentParser( - prog="llmdbench", - description="Manage and benchmark llmd configurations.", - ) - - subparsers = parser.add_subparsers(dest="command", required=True) - - # -------------------------- - # plan - # -------------------------- - plan_parser = subparsers.add_parser( - "plan", - help="Merge charts/images and render templates into a versioned YAML plan.", - ) - plan_parser.add_argument( - "--experiment", - required=True, - help="Path to the experiment file to plan.", - ) - plan_parser.add_argument( - "--output", - default="system_plan.yaml", - help="Path to save the output experiment as a YAML file.", - ) - - # -------------------------- - # prepare - # -------------------------- - prepare_parser = subparsers.add_parser( - "prepare", help="Prepare the environment or dependencies for execution." - ) - prepare_parser.add_argument( - "--config", required=False, help="Optional path to configuration YAML." - ) - - # -------------------------- - # execute - # -------------------------- - execute_parser = subparsers.add_parser( - "execute", help="Execute the benchmark or deployment defined in the plan." - ) - execute_parser.add_argument( - "--plan", required=True, help="Path to the planned YAML configuration." - ) - - # -------------------------- - # destroy - # -------------------------- - destroy_parser = subparsers.add_parser( - "destroy", help="Tear down or rollback any created resources." - ) - destroy_parser.add_argument( - "--plan", required=False, help="Path to the plan used for deployment." - ) - - # -------------------------- - # report - # -------------------------- - report_parser = subparsers.add_parser( - "report", help="Generate a report or analysis from execution results." - ) - report_parser.add_argument( - "--input", required=False, help="Path to execution results or metrics." - ) - report_parser.add_argument( - "--output", default="report.yaml", help="Path to save the report output." - ) - - # -------------------------- - # Parse and dispatch - # -------------------------- - args = parser.parse_args() - - with open(args.experiment, "r") as f: - data = yaml.safe_load(f) - template_path = data["template"]["path"] - scenario_path = data["scenario"]["path"] - - # Regardless - we need create a plan - otherwise we won't have context of - # what to todo - in the future we can "import" a context to "rerun" a plan. 
- system = SystemParser(template_path, args.output, scenario_path) - system.parse() - - if args.command == "plan": - set_stage(logger, "๐Ÿ”ง PLAN") - logger.info("Creating execution and deployment plan...") - logger.info(f"Plan saved to {args.output}") - # print(json.dumps(system.plan_to_dict(), indent=2)) - system.plan_to_yaml() - elif args.command == "prepare": - set_stage(logger, "๐Ÿ”ง PREPARE") - logger.info("Preparing environment...") - elif args.command == "execute": - set_stage(logger, "๐Ÿš€ EXECUTE") - logger.info(f"Executing plan: {args.plan}") - elif args.command == "destroy": - set_stage(logger, "๐Ÿงน DESTROY") - logger.info("Cleaning up resources...") - elif args.command == "report": - set_stage(logger, "๐Ÿ“Š REPORT") - logger.info("Generating report...") - else: - parser.print_help() - - -if __name__ == "__main__": - cli() diff --git a/declarative_poc/templates/llmdbench/llmdbench/logging/logger.py b/declarative_poc/templates/llmdbench/llmdbench/logging/logger.py deleted file mode 100644 index c37115ce..00000000 --- a/declarative_poc/templates/llmdbench/llmdbench/logging/logger.py +++ /dev/null @@ -1,40 +0,0 @@ -import logging -import sys - - -class StageFormatter(logging.Formatter): - def __init__(self, stage="RUN", fmt=None, datefmt=None): - self.stage = stage - super().__init__(fmt=fmt, datefmt=datefmt) - - def format(self, record): - record.stage = getattr(record, "stage", self.stage) - return super().format(record) - - -def get_logger(name="llmdbench", stage="RUN", level=logging.INFO): - logger = logging.getLogger(name) - - if not logger.handlers: - handler = logging.StreamHandler(sys.stdout) - formatter = StageFormatter( - stage=stage, - fmt="%(asctime)s - %(levelname)-8s - %(name)s - %(stage)s - %(message)s", - datefmt="%H:%M:%S", - ) - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(level) - logger.propagate = False - else: - for handler in logger.handlers: - if isinstance(handler.formatter, StageFormatter): - handler.formatter.stage = stage - - return logger - - -def set_stage(logger, stage): - for handler in logger.handlers: - if isinstance(handler.formatter, StageFormatter): - handler.formatter.stage = stage diff --git a/declarative_poc/templates/llmdbench/llmdbench/parser/__init__.py b/declarative_poc/templates/llmdbench/llmdbench/parser/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/declarative_poc/templates/llmdbench/llmdbench/parser/systemparser.py b/declarative_poc/templates/llmdbench/llmdbench/parser/systemparser.py deleted file mode 100644 index 15d32147..00000000 --- a/declarative_poc/templates/llmdbench/llmdbench/parser/systemparser.py +++ /dev/null @@ -1,251 +0,0 @@ -import yaml -import copy -import json -import subprocess -import requests -import re - - -class LiteralStr(str): - pass - - -class SystemParser: - def __init__(self, defaults_file, output_file, scenario_file=None): - self.defaults = self._load_yaml(defaults_file) - self.output_file = output_file - self.scenario = self._load_yaml(scenario_file) if scenario_file else {} - - self._charts_key = "charts" - self.charts = {} - - self._images_key = "images" - self.images = {} - - self._system_stack_key = "system" - self.system_stack = {} - - self._system_prepare_key = "prepare" - self._system_prepare = {} - - self._system_harness_key = "harness" - self._system_harness = {} - - self.system_experiments_key = "experiments" - self.system_experiments = {} - - def _load_yaml(self, file_path): - """Load YAML file""" - with open(file_path, "r") as f: 
- return yaml.safe_load(f) or {} - - def _merge_lists(self, base_list, override_list): - """ - Merge lists of dictionaries by 'name' field. - If items have 'name' field, merge by matching names. - Otherwise, replace the entire list. - """ - if base_list and isinstance(base_list[0], dict) and "name" in base_list[0]: - result = copy.deepcopy(base_list) - base_map = {item["name"]: idx for idx, item in enumerate(result)} - for override_item in override_list: - if "name" in override_item: - name = override_item["name"] - if name in base_map: - idx = base_map[name] - result[idx] = self._deep_merge(result[idx], override_item) - else: - result.append(copy.deepcopy(override_item)) - - return result - else: - return copy.deepcopy(override_list) - - def _deep_merge(self, base, overrides): - """ - Recursively merge overrides into base dictionary. - For lists of dicts with 'name' field, merge by matching names. - Overrides take precedence over base values. - """ - result = copy.deepcopy(base) - for key, value in overrides.items(): - if key in result: - if isinstance(result[key], dict) and isinstance(value, dict): - result[key] = self._deep_merge(result[key], value) - elif isinstance(result[key], list) and isinstance(value, list): - result[key] = self._merge_lists(result[key], value) - else: - result[key] = value - else: - result[key] = copy.deepcopy(value) - return result - - def _get_nested(self, data, path: str): - """ - Retrieves a nested structure using dotted paths. - Supports list indexes like key.0.name or key[0].name. - """ - path = re.sub(r"\[(\d+)\]", r".\1", path) - parts = path.split(".") - - current = data - - for part in parts: - if isinstance(current, dict): - current = current.get(part, {}) - elif isinstance(current, list): - if not part.isdigit(): - return {} - idx = int(part) - if idx < 0 or idx >= len(current): - return {} - current = current[idx] - else: - return {} - - return current - - def _render_template_attribute(self, key): - render = self._get_nested(self.defaults, key) - scenarios = self.scenario.get("scenario", []) - for i, _ in enumerate(scenarios): - path = f"scenario.{i}.{key}" - scenario_value = self._get_nested(self.scenario, path) - render = self._deep_merge(render, scenario_value) - - return render - - def _build_indexes(self): - """Build lookup dictionaries for all categories""" - self._indexes = {} - for category in [self._charts_key, self._images_key]: - data = getattr(self, category, {}) - self._indexes[category] = { - item["name"]: item for item in data.get("user-overrides", []) - } - - def _skopeo_list_tags(self, ref): - """ - Call: skopeo list-tags docker://ghcr.io/org/image - Return: list of tags (strings) - """ - try: - cmd = ["skopeo", "list-tags", f"docker://{ref}"] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - data = json.loads(result.stdout) - return data.get("Tags", []) - except Exception as e: - raise RuntimeError(f"Skopeo failed for {ref}: {e}") - - def _is_oci_repo(self, url: str) -> bool: - return url.startswith("oci://") - - def _helm_http_list_versions(self, url, chart_name): - """ - Given a Helm HTTP repo URL and chart name, return list of versions. 
- Uses index.yaml which lives at: /index.yaml - """ - index_url = url.rstrip("/") + "/index.yaml" - response = requests.get(index_url, timeout=10) - if response.status_code != 200: - raise RuntimeError(f"Failed to fetch {index_url}: {response.status_code}") - index = yaml.safe_load(response.text) - entries = index.get("entries", {}) - if chart_name not in entries: - raise RuntimeError(f"Chart '{chart_name}' not found at {index_url}") - versions = [entry["version"] for entry in entries[chart_name]] - return versions - - def _resolve_chart_auto_versions(self): - items = self.charts.get("user-overrides", []) - for item in items: - if str(item.get("version", "")) != ".auto": - continue - url = item["url"] - name = item["name"] - if self._is_oci_repo(url): - ref = url.replace("oci://", "") - tags = self._skopeo_list_tags(ref) - else: - tags = self._helm_http_list_versions(url, name) - if not tags: - raise RuntimeError(f"No chart versions found for {name}") - tags.sort() - latest = tags[-1] - item["version"] = latest - - def _resolve_image_auto_tags(self): - items = self.images.get("user-overrides", []) - for item in items: - if str(item.get("tag", "")) == ".auto": - registry = item["registry"] - repo = item["repo"] - image = item["image"] - ref = f"{registry}/{repo}/{image}" - tags = self._skopeo_list_tags(ref) - if not tags: - raise RuntimeError(f"No tags found for image {item['name']}") - tags.sort() - latest = tags[-1] - item["tag"] = latest - - def _literal_str_representer(self, dumper, data): - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - - def _convert_multiline_strings(self, obj): - if isinstance(obj, dict): - return {k: self._convert_multiline_strings(v) for k, v in obj.items()} - if isinstance(obj, list): - return [self._convert_multiline_strings(v) for v in obj] - if isinstance(obj, str) and "\n" in obj: - return LiteralStr(obj) - return obj - - def get_item_by_name(self, category, name): - """Generic method to get an item by name from any category""" - if category not in self._indexes: - raise ValueError(f"Unknown category: {category}") - return self._indexes[category].get(name) - - def get_chart_by_name(self, name): - return self.get_item_by_name(self._charts_key, name) - - def get_image_by_name(self, name): - return self.get_item_by_name(self._images_key, name) - - def plan_to_dict(self): - return { - self._charts_key: self.charts, - self._images_key: self.images, - self._system_stack_key: self.system_stack, - self._system_prepare_key: self.system_prepare, - self._system_harness_key: self.system_harness, - } - - def plan_to_yaml(self): - plan = self.plan_to_dict() - plan = self._convert_multiline_strings(plan) - with open(self.output_file, "w") as f: - yaml.dump( - plan, - f, - default_flow_style=False, - sort_keys=False, - allow_unicode=True, - ) - - def parse(self): - """Load defaults and apply overrides""" - yaml.add_representer(LiteralStr, self._literal_str_representer) - - self.charts = self._render_template_attribute(self._charts_key) - self.images = self._render_template_attribute(self._images_key) - self.system_stack = self._render_template_attribute(self._system_stack_key) - self.system_prepare = self._render_template_attribute(self._system_prepare_key) - self.system_harness = self._render_template_attribute(self._system_harness_key) - - self._resolve_chart_auto_versions() - self._resolve_image_auto_tags() - self._build_indexes() - - return self.plan_to_dict() diff --git a/declarative_poc/templates/llmdbench/llmdbench/plan/experiment.py 
b/declarative_poc/templates/llmdbench/llmdbench/plan/experiment.py deleted file mode 100644 index 3e818027..00000000 --- a/declarative_poc/templates/llmdbench/llmdbench/plan/experiment.py +++ /dev/null @@ -1,3 +0,0 @@ -class Experiment: - def __init__(self): - pass diff --git a/declarative_poc/templates/llmdbench/llmdbench/plan/harness.py b/declarative_poc/templates/llmdbench/llmdbench/plan/harness.py deleted file mode 100644 index 274f352c..00000000 --- a/declarative_poc/templates/llmdbench/llmdbench/plan/harness.py +++ /dev/null @@ -1,4 +0,0 @@ -class Harness: - def __init__(self, runner_name: str, runner_content: dict): - self.name = runner_name - self.runner = runner_content diff --git a/declarative_poc/templates/llmdbench/llmdbench/plan/plan.py b/declarative_poc/templates/llmdbench/llmdbench/plan/plan.py deleted file mode 100644 index 1767f422..00000000 --- a/declarative_poc/templates/llmdbench/llmdbench/plan/plan.py +++ /dev/null @@ -1,3 +0,0 @@ -class Plan: - def __init__(self): - pass diff --git a/declarative_poc/templates/llmdbench/llmdbench/plan/system.py b/declarative_poc/templates/llmdbench/llmdbench/plan/system.py deleted file mode 100644 index 0028d443..00000000 --- a/declarative_poc/templates/llmdbench/llmdbench/plan/system.py +++ /dev/null @@ -1,3 +0,0 @@ -class System: - def __init__(self): - pass diff --git a/declarative_poc/templates/llmdbench/pyproject.toml b/declarative_poc/templates/llmdbench/pyproject.toml deleted file mode 100644 index d8e1509b..00000000 --- a/declarative_poc/templates/llmdbench/pyproject.toml +++ /dev/null @@ -1,13 +0,0 @@ -[project] -name = "llmdbench" -version = "0.1.0" -description = "A library for configuration discovery and benchmarking for llm-d." -dependencies = [ - "PyYAML", - "Jinja2", - "requests", - "packaging" -] - -[project.scripts] -llmdbench = "llmdbench.cli:cli" diff --git a/declarative_poc/templates/scenarios/inference-scheduling.yaml b/declarative_poc/templates/scenarios/inference-scheduling.yaml deleted file mode 100644 index a1ed88bf..00000000 --- a/declarative_poc/templates/scenarios/inference-scheduling.yaml +++ /dev/null @@ -1,40 +0,0 @@ -scenario: - - name: "sut-1" - prepare: - user-overrides: - secrets: - - name: llm-d-hf-token - secret: HF_TOKEN - contents: REPLACE_HF_TOKEN - files: - - name: llm-d-benchmark-preprocesses - path: REPLACE_DIR_PATH - system: - user-overrides: - - name: "default" - inference-engine: - model: - - name: meta-llama/Llama-3.1-8B-Instruct - label: .auto - maxlen: 16384 - blocksize: 64 - replicas: - decode: 2 - prefill: 0 - volumes: - - name: model-storage - size: 1Ti - command: - decode: - type: vllmServe - args: - - "--enforce-eager" - - "--block-size" - - "64" - - "--kv-transfer-config" - - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' - - "--disable-log-requests" - - "--disable-uvicorn-access-log" - - "--max-model-len" - - "16374" - \ No newline at end of file diff --git a/declarative_poc/templates/scenarios/pd-disaggregation.yaml b/declarative_poc/templates/scenarios/pd-disaggregation.yaml deleted file mode 100644 index 77bba40b..00000000 --- a/declarative_poc/templates/scenarios/pd-disaggregation.yaml +++ /dev/null @@ -1,65 +0,0 @@ -scenario: - - name: "sut-1" - prepare: - user-overrides: - secrets: - - name: llm-d-hf-token - secret: HF_TOKEN - contents: TYLERS_TOKEN - system: - user-overrides: - - name: "default" - inference-engine: - model: - - name: meta-llama/Llama-3.1-8B-Instruct - label: .auto - maxlen: 16000 - blocksize: 128 - replicas: - decode: 2 - prefill: 2 - parallelism: - 
decode: - tensor: 1 - prefill: - tensor: 1 - resources: - decode: - memory: 128Gi - cpu: 32 - prefill: - memory: 128Gi - cpu: 32 - volumes: - - name: model-storage - size: 1Ti - command: - decode: - type: vllmServe - args: - - "--block-size" - - "128" - - "--kv-transfer-config" - - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' - - "--disable-log-requests" - - "--disable-uvicorn-access-log" - - "--max-model-len" - - "16000" - prefill: - type: vllmServe - args: - - "--block-size" - - "128" - - "--kv-transfer-config" - - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' - - "--disable-log-requests" - - "--disable-uvicorn-access-log" - - "--max-model-len" - - "16000" - harness: - user-overrides: - - name: default - namespace: harnessns - harness: - name: vllm-benchmark - profile: random_concurrent.yaml \ No newline at end of file diff --git a/declarative_poc/templates/system_plan.yaml b/declarative_poc/templates/system_plan.yaml deleted file mode 100644 index 13746bc9..00000000 --- a/declarative_poc/templates/system_plan.yaml +++ /dev/null @@ -1,370 +0,0 @@ -charts: - user-overrides: - - name: kgateway-crds - url: oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds - version: 2.0.3 - - name: kgateway - url: oci://cr.kgateway.dev/kgateway-dev/charts/kgateway - version: 2.0.3 - - name: istio - url: oci://gcr.io/istio-testing/charts - version: 1.28-alpha.89f30b26ba71bf5e538083a4720d0bc2d8c06401 - - name: llm-d-infra - url: https://llm-d-incubation.github.io/llm-d-infra - version: 1.3.0 - - name: gateway-api-inference-extension - url: oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool - version: v1.2.0-rc.1 - - name: llm-d-modelservice - url: https://llm-d-incubation.github.io/llm-d-modelservice/ - version: v0.3.9 -images: - user-overrides: - - name: llm-d-benchmark - registry: ghcr.io - repo: llm-d - image: llm-d-benchmark - tag: v0.3.7 - - name: llm-d - registry: ghcr.io - repo: llm-d - image: llm-d-cuda - tag: v0.4.0 - - name: llm-d-model-service - registry: ghcr.io - repo: llm-d - image: llm-d-model-service - tag: v0.0.15 - - name: llm-d-inference-scheduler - registry: ghcr.io - repo: llm-d - image: llm-d-inference-scheduler - tag: v0.4.0-rc.1 - - name: llm-d-routing-sidecar - registry: ghcr.io - repo: llm-d - image: llm-d-routing-sidecar - tag: v0.4.0-rc.1 - - name: llm-d-inference-sim - registry: ghcr.io - repo: llm-d - image: llm-d-inference-sim - tag: v0.6.1 - - name: vllm - registry: docker.io - repo: vllm - image: vllm-openai - tag: latest -system: - user-overrides: - - name: default - namespace: stackbenchns - release: stackbenchr - gateway: - type: kgateway - router: - plugins: default-plugins.yaml - volumes: - - name: model-storage - type: pvc - mount: model-storage - size: 1Ti - - name: dshm - type: Memory - mount: /dev/shm - size: 16Gi - components: - - name: infra - charts: - - name: llm-d-infra - contents: - gateway: - gatewayClassName: modelservice - service: - type: NodePort - gatewayParameters: - enabled: true - - name: router - charts: - - name: gateway-api-inference-extension - contents: - loadfrom: - - BASE_DIR/_templates/gateway-api-inference-extension.yaml - - name: inference-engine - modelservice: - charts: - - name: llm-d-modelservice - contents: - loadfrom: - - BASE_DIR/_templates/modelservice.yaml - standalone: - contents: - loadfrom: - - BASE_DIR/_templates/standalone.yaml - inference-engine: - type: modelservice - model: - - name: meta-llama/Llama-3.1-8B-Instruct - label: .auto - maxlen: 16384 - blocksize: 64 - accelerators: 
- standalone: - key: nvidia.com/gpu.product - value: NVIDIA-H100-80GB-HBM3 - decode: - key: nvidia.com/gpu.product - value: NVIDIA-H100-80GB-HBM3 - prefill: - key: nvidia.com/gpu.product - value: NVIDIA-H100-80GB-HBM3 - replicas: - standalone: 1 - decode: 2 - prefill: 0 - parallelism: - standalone: - data: 1 - tensor: 1 - decode: - data: 1 - tensor: 1 - prefill: - data: 1 - tensor: 1 - resources: - standalone: - memory: 40Gi - cpu: '4' - nvidia.com/gpu: '1' - ephemeral-storage: 20Gi - decode: - memory: 40Gi - cpu: '4' - nvidia.com/gpu: '1' - ephemeral-storage: 20Gi - prefill: - memory: 40Gi - cpu: '4' - nvidia.com/gpu: '1' - ephemeral-storage: 20Gi - annotations: - deployed-by: .auto - modelservice: llm-d-benchmark - ports: - service: 8000 - extra: 9002 - readiness: 8200 - zmq: 5557 - nixl: 5557 - labels: - app: .auto - stood-up-by: .auto - stood-up-from: llm-d-benchmark - stood-up-via: .auto - env: - standalone: - - name: LLMDBENCH_VLLM_STANDALONE_MODEL - value: TEMPLATE_MODEL_NAME - - name: LLMDBENCH_VLLM_STANDALONE_VLLM_LOAD_FORMAT - value: auto - - name: LLMDBENCH_VLLM_STANDALONE_MODEL_LOADER_EXTRA_CONFIG - value: '{}' - - name: VLLM_LOGGING_LEVEL - value: INFO - - name: HF_HOME - value: /TEMPLATE_MODEL_STORAGE - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: TEMPLATE_REPLACE_SECRET_NAME - key: TEMPLATE_REPLACE_SECRET - - name: VLLM_NIXL_SIDE_CHANNEL_HOST - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: UCX_TLS - value: rc,sm,cuda_ipc,cuda_copy,tcp - - name: UCX_SOCKADDR_TLS_PRIORITY - value: tcp - - name: VLLM_NIXL_SIDE_CHANNEL_PORT - value: TEMPLATE_REPLACE_NXL_PORT - - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN - value: '1' - - name: VLLM_SERVER_DEV_MODE - value: '1' - decode: - - name: VLLM_NIXL_SIDE_CHANNEL_HOST - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: UCX_TLS - value: rc,sm,cuda_ipc,cuda_copy,tcp - - name: UCX_SOCKADDR_TLS_PRIORITY - value: tcp - - name: VLLM_NIXL_SIDE_CHANNEL_PORT - value: TEMPLATE_REPLACE_NXL_PORT - - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN - value: '1' - - name: VLLM_SERVER_DEV_MODE - value: '1' - prefill: - - name: VLLM_NIXL_SIDE_CHANNEL_HOST - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: UCX_TLS - value: rc,sm,cuda_ipc,cuda_copy,tcp - - name: UCX_SOCKADDR_TLS_PRIORITY - value: tcp - - name: VLLM_NIXL_SIDE_CHANNEL_PORT - value: TEMPLATE_REPLACE_NXL_PORT - - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN - value: '1' - - name: VLLM_SERVER_DEV_MODE - value: '1' - command: - decode: - type: vllmServe - args: - - --enforce-eager - - --block-size - - '64' - - --kv-transfer-config - - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' - - --disable-log-requests - - --disable-uvicorn-access-log - - --max-model-len - - '16374' -prepare: - user-overrides: - gateway: - provider: - - name: kgateway - charts: - - name: kgateway-crds - - name: kgateway - deploy: true - check: true - - name: istio - charts: - - name: istio - deploy: false - check: false - api: - url: https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd - version: 1.3.0 - deploy: true - check: true - inference_extension: - url: https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd - version: 1.0.1 - deploy: true - check: true - wva: - namespace: workload-variant-autoscaler-system - charts: - - name: workload-variant-autoscaler - images: - - name: workload-variant-autoscaler - replicas: 1 - version: 0.1.0 - autoscaling: - enabled: true - slo: - tpot: 30 - ttft: 1000 - hpa: - enabled: true - max_replicas: 10 - 
target_avg_value: 1 - vllm: - enabled: true - node_port_min: 30000 - node_port_max: 32767 - interval: 15 - workload_monitoring: - namespace: openshift-user-workload-monitoring - url: https://thanos-querier.openshift-monitoring.svc.cluster.local - port: 9091 - deploy: true - check: true - storage: - - name: model-storage - namespace: stackbenchns - class: default - size: 300Gi - download: - url: - enabled: true - timeout: 3600 - deploy: true - check: true - - name: replay-pvc - namespace: harnessns - class: default - size: 300Gi - download: - url: https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json - enabled: false - timeout: 3600 - deploy: true - check: true - - name: workload-pvc - namespace: harnessns - class: default - size: 300Gi - deploy: true - check: true - secrets: - - name: llm-d-hf-token - secret: HF_TOKEN - contents: TYLERS_TOKEN - files: - - name: llm-d-benchmark-preprocesses - path: REPLACE_BASE_DIR/setup/preprocess -harness: - user-overrides: - - name: default - namespace: harnessns - harness: - name: inference-perf - profile: sanity_random.yaml - executable: llm-d-benchmark.sh - timeout: 3600 - resources: - memory: 32Gi - cpu: '16' - volumes: - - name: workload-pvc - type: pvc - mount: /requests - - name: replay-pvc - type: pvc - mount: /data - env: - - name: LLMDBENCH_RUN_EXPERIMENT_LAUNCHER - value: '1' - - name: LLMDBENCH_RUN_DATASET_URL - value: .prepare.dependencies.user-overrides.storage[]|select(name=replay-pvc).download.url - - name: LLMDBENCH_RUN_WORKSPACE_DIR - value: /workspace - - name: LLMDBENCH_HARNESS_NAME - value: .experiments[0].user-overrides.runners[0].harness.name - - name: LLMDBENCH_HARNESS_NAMESPACE - value: .experiments[0].user-overrides.runners[0].namespace - - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL - value: ${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL} - - name: LLMDBENCH_HARNESS_STACK_NAME - value: ${LLMDBENCH_HARNESS_SANITIZED_STACK_NAME} - - name: LLMDBENCH_DEPLOY_METHODS - value: ${LLMDBENCH_DEPLOY_METHODS} - - name: LLMDBENCH_MAGIC_ENVAR - value: harness_pod - - name: HF_TOKEN_SECRET - value: ${LLMDBENCH_VLLM_COMMON_HF_TOKEN_NAME} - - name: .prepare.dependencies.user-overrides.secrets[0].secret - valueFrom: - secretKeyRef: - name: .prepare.dependencies.user-overrides.secrets[0].name - key: .prepare.dependencies.user-overrides.secrets[0].secret From e8883e5025f552d6ce11e93995708649e92bad35 Mon Sep 17 00:00:00 2001 From: vezio Date: Wed, 3 Dec 2025 00:38:03 -0500 Subject: [PATCH 3/5] revise the templates to make them actual templates Signed-off-by: vezio --- .../gateway-api-inference-extension.yaml.j2 | 32 ++++ .../templates/modelservice.yaml.j2 | 173 ++++++++++++++++++ declarative_poc/templates/standalone.yaml.j2 | 95 ++++++++++ 3 files changed, 300 insertions(+) create mode 100644 declarative_poc/templates/gateway-api-inference-extension.yaml.j2 create mode 100644 declarative_poc/templates/modelservice.yaml.j2 create mode 100644 declarative_poc/templates/standalone.yaml.j2 diff --git a/declarative_poc/templates/gateway-api-inference-extension.yaml.j2 b/declarative_poc/templates/gateway-api-inference-extension.yaml.j2 new file mode 100644 index 00000000..9e34a262 --- /dev/null +++ b/declarative_poc/templates/gateway-api-inference-extension.yaml.j2 @@ -0,0 +1,32 @@ +inferenceExtension: + replicas: 1 + image: + name: llm-d-inference-scheduler + pullPolicy: Always + extProcPort: {{ system["user-overrides"][0]["inference-engine"].ports.extra }} + extraContainerPorts: + - name: 
zmq + containerPort: {{ system["user-overrides"][0]["inference-engine"].ports.zmq }} + protocol: TCP + extraServicePorts: + - name: zmq + port: {{ system["user-overrides"][0]["inference-engine"].ports.zmq }} + targetPort: {{ system["user-overrides"][0]["inference-engine"].ports.zmq }} + protocol: TCP + env: + - name: {{ prepare["user-overrides"].secrets[0].secret }} + valueFrom: + secretKeyRef: + name: {{ prepare["user-overrides"].secrets[0].name }} + key: {{ prepare["user-overrides"].secrets[0].secret }} + pluginsConfigFile: {{ system["user-overrides"][0]["router"].plugins }} +inferencePool: + targetPortNumber: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + modelServerType: vllm + apiVersion: "inference.networking.x-k8s.io/v1alpha2" + modelServers: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: {{ system["user-overrides"][0]["inference-engine"]["model"][0].label }} +provider: + name: {{ prepare["user-overrides"].gateway["provider"][0].name }} diff --git a/declarative_poc/templates/modelservice.yaml.j2 b/declarative_poc/templates/modelservice.yaml.j2 new file mode 100644 index 00000000..164acdfe --- /dev/null +++ b/declarative_poc/templates/modelservice.yaml.j2 @@ -0,0 +1,173 @@ +fullnameOverride: {{ system["user-overrides"][0]["inference-engine"].model[0].label }} +multinode: False + +modelArtifacts: + uri: {{ system["user-overrides"][0].volumes[0].type }}://{{ system["user-overrides"][0].volumes[0].mount }}/models/{{ system["user-overrides"][0]["inference-engine"].model[0].name }} + + size: {{ prepare["user-overrides"].storage + | selectattr("name","equalto", system["user-overrides"][0].volumes[0].name) + | map(attribute="size") + | first }} + + authSecretName: {{ prepare["user-overrides"].secrets[0].name }} + + name: {{ system["user-overrides"][0]["inference-engine"].model[0].name }} + +routing: + servicePort: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-{{ system["user-overrides"][0].release }}-inference-gateway + + proxy: + image: "{{ images['user-overrides'] | selectattr('name','equalto','llm-d-routing-sidecar') | map(attribute='registry') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d-routing-sidecar') | map(attribute='repo') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d-routing-sidecar') | map(attribute='image') | first }}:{{ images['user-overrides'] | selectattr('name','equalto','llm-d-routing-sidecar') | map(attribute='tag') | first }}" + secure: false + connector: nixlv2 + debugLevel: 3 + + inferencePool: + create: false + name: {{ system["user-overrides"][0]["inference-engine"].model[0].label }} + + httpRoute: + create: true + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: {{ system["user-overrides"][0]["inference-engine"].model[0].label }} + port: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + + matches: + - path: + type: PathPrefix + value: /{{ system["user-overrides"][0]["inference-engine"].model[0].name }}/ + + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + + modelCommand: {{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("type", 0) }} + args:{{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("args", 0) }} + env: {{ 
system["user-overrides"][0]["inference-engine"].env.prefill | default(0) }} + {{ system["user-overrides"][0]["inference-engine"].env.get("prefill", {}) }} + + +decode: + create: {{ system["user-overrides"][0]["inference-engine"].replicas.get("decode", {}) }} + replicas: {{ system["user-overrides"][0]["inference-engine"].replicas.get("decode", {}) }} + + acceleratorTypes: + labelKey: {{ system["user-overrides"][0]["inference-engine"].accelerators.decode.key }} + labelValues: + - {{ system["user-overrides"][0]["inference-engine"].accelerators.decode.value }} + + parallelism: + data: {{ system["user-overrides"][0]["inference-engine"].parallelism.decode.data }} + tensor: {{ system["user-overrides"][0]["inference-engine"].parallelism.decode.tensor }} + + annotations: {{ system["user-overrides"][0]["inference-engine"].annotations }} + podAnnotations: {{ system["user-overrides"][0]["inference-engine"].annotations }} + + containers: + - name: "vllm" + mountModelVolume: true + image: "{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='registry') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='repo') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='image') | first }}:{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='tag') | first }}" + modelCommand: {{ system["user-overrides"][0].get("command", {}).get("decode", {}).get("type", 0) }} + args:{{ system["user-overrides"][0].get("command", {}).get("decode", {}).get("args", 0) }} + {{ system["user-overrides"][0]["inference-engine"].env.get("decode", {}) }} + + resources: + limits: {{ system["user-overrides"][0]["inference-engine"].resources.decode }} + requests: {{ system["user-overrides"][0]["inference-engine"].resources.decode }} + + extraConfig: + startupProbe: + httpGet: + path: /health + port: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + failureThreshold: 60 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + failureThreshold: 3 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: {{ system["user-overrides"][0]["inference-engine"].ports.readiness }} + failureThreshold: 3 + periodSeconds: 5 + +prefill: + create: {{ system["user-overrides"][0]["inference-engine"].replicas.prefill | default(0) }} + replicas: {{ system["user-overrides"][0]["inference-engine"].replicas.prefill | default(0) }} + + acceleratorTypes: + labelKey: {{ system["user-overrides"][0]["inference-engine"].accelerators.prefill.key }} + labelValues: + - {{ system["user-overrides"][0]["inference-engine"].accelerators.prefill.value }} + + parallelism: + data: {{ system["user-overrides"][0]["inference-engine"].parallelism.prefill.data }} + tensor: {{ system["user-overrides"][0]["inference-engine"].parallelism.prefill.tensor }} + + annotations: {{ system["user-overrides"][0]["inference-engine"].annotations }} + podAnnotations: {{ system["user-overrides"][0]["inference-engine"].annotations }} + + containers: + - name: "vllm" + mountModelVolume: true + image: "{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='registry') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='repo') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='image') | first }}:{{ 
images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='tag') | first }}" + + modelCommand: {{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("type", 0) }} + args:{{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("args", 0) }} + env: {{ system["user-overrides"][0]["inference-engine"].env.prefill | default(0) }} + {{ system["user-overrides"][0]["inference-engine"].env.get("prefill", {}) }} + + resources: + limits: {{ system["user-overrides"][0]["inference-engine"].resources.prefill }} + requests: {{ system["user-overrides"][0]["inference-engine"].resources.prefill }} + + extraConfig: + startupProbe: + httpGet: + path: /health + port: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + failureThreshold: 60 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: {{ system["user-overrides"][0]["inference-engine"].ports.service }} + failureThreshold: 3 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: {{ system["user-overrides"][0]["inference-engine"].ports.readiness }} + failureThreshold: 3 + periodSeconds: 5 + + volumeMounts: + - name: {{ system["user-overrides"][0].volumes[1].name }} + mountPath: {{ system["user-overrides"][0].volumes[1].mount }} + +volumes: + - name: {{ system["user-overrides"][0].volumes[1].name }} + emptyDir: + medium: {{ system["user-overrides"][0].volumes[1].type }} + sizeLimit: {{ system["user-overrides"][0].volumes[1].size }} diff --git a/declarative_poc/templates/standalone.yaml.j2 b/declarative_poc/templates/standalone.yaml.j2 new file mode 100644 index 00000000..808e9640 --- /dev/null +++ b/declarative_poc/templates/standalone.yaml.j2 @@ -0,0 +1,95 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", "default-model") }} + labels: {{ system["user-overrides"][0].get("inference-engine", {}).get("labels", {}) }} + namespace: {{ system["user-overrides"][0].get("inference-engine", {}).get("namespace", "default") }} +spec: + replicas: {{ system["user-overrides"][0].get("inference-engine", {}).get("replicas", {}).get("standalone", 1) }} + selector: + matchLabels: + app: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", "default-model") }} + template: + metadata: + labels: {{ system["user-overrides"][0].get("inference-engine", {}).get("labels", {}) }} + annotations: {{ system["user-overrides"][0].get("inference-engine", {}).get("annotations", {}) }} + spec: + schedulerName: default-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ system["user-overrides"][0].get("inference-engine", {}).get("accelerators", {}).get("standalone", {}).get("key", "") }} + operator: In + values: + - {{ system["user-overrides"][0].get("inference-engine", {}).get("accelerators", {}).get("standalone", {}).get("value", "") }} + containers: + - name: vllm-standalone-{{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", "default-model") }} + image: {{ (images["user-overrides"] | selectattr("name", "equalto", "vllm") | list)[0].get("registry", "") }}/{{ (images["user-overrides"] | selectattr("name", "equalto", "vllm") | list)[0].get("repo", "") }}/{{ (images["user-overrides"] | selectattr("name", "equalto", "vllm") | list)[0].get("image", "") }}:{{ (images["user-overrides"] 
| selectattr("name", "equalto", "vllm") | list)[0].get("tag", "latest") }} + imagePullPolicy: Always + command: + - /bin/bash + - "-c" + args: {{ system["user-overrides"][0].get("inference-engine", {}).get("command", {}).get("standalone", []) }} + env: {{ system["user-overrides"][0].get("inference-engine", {}).get("env", {}).get("standalone", []) }} + ports: + - containerPort: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }} + startupProbe: + httpGet: + path: /health + port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }} + failureThreshold: 200 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }} + failureThreshold: 3 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }} + failureThreshold: 3 + periodSeconds: 5 + resources: + limits: {{ system["user-overrides"][0].get("inference-engine", {}).get("resources", {}).get("prefill", {}) }} + requests: {{ system["user-overrides"][0].get("inference-engine", {}).get("resources", {}).get("prefill", {}) }} + volumeMounts: + - name: preprocesses + mountPath: /setup/preprocess + - name: cache-volume + mountPath: {{ system["user-overrides"][0].get("inference-engine", {}).get("volumes", [{}])[0].get("mount", "/cache") }} + - name: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("name", "dshm") }} + mountPath: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("mount", "/dev/shm") }} + volumes: + - name: preprocesses + configMap: + name: {{ prepare["user-overrides"].get("files", [{}])[0].get("name", "preprocess-config") }} + defaultMode: 0500 + - name: cache-volume + persistentVolumeClaim: + claimName: {{ system["user-overrides"][0].get("inference-engine", {}).get("volumes", [{}])[0].get("name", "model-storage") }} + - name: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("name", "dshm") }} + emptyDir: + medium: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("type", "Memory") }} + sizeLimit: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("size", "16Gi") }} + +--- + +apiVersion: v1 +kind: Service +metadata: + name: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("name", "default-model") }} + namespace: {{ system["user-overrides"][0].get("inference-engine", {}).get("namespace", "default") }} + labels: {{ system["user-overrides"][0].get("inference-engine", {}).get("labels", {}) }} +spec: + ports: + - name: http + port: 80 + targetPort: 8000 + selector: + app: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("name", "default-model") }} + type: ClusterIP From 78c758a5456c2b188472986cabdb3bf4ecadc1b3 Mon Sep 17 00:00:00 2001 From: vezio Date: Thu, 4 Dec 2025 12:55:29 -0500 Subject: [PATCH 4/5] build sub templates Signed-off-by: vezio --- .../experiments/inference-scheduling.yaml | 4 - declarative_poc/llmdbench/llmdbench/cli.py | 23 +++--- .../llmdbench/parser/systemparser.py | 81 ++++++++++++++++--- declarative_poc/llmdbench/pyproject.toml | 13 --- .../experiments/inference-scheduling.yaml | 6 ++ .../experiments/pd-disaggregation.yaml | 0 .../scenarios/inference-scheduling.yaml | 0 .../scenarios/pd-disaggregation.yaml | 0 .../templates/default_system.yaml | 71 
++++++++-------- .../gateway-api-inference-extension.yaml.j2 | 0 .../templates/modelservice.yaml.j2 | 6 +- .../templates/standalone.yaml.j2 | 34 ++++---- 12 files changed, 141 insertions(+), 97 deletions(-) delete mode 100644 declarative_poc/experiments/inference-scheduling.yaml delete mode 100644 declarative_poc/llmdbench/pyproject.toml create mode 100644 declarative_poc/workspace/experiments/inference-scheduling.yaml rename declarative_poc/{ => workspace}/experiments/pd-disaggregation.yaml (100%) rename declarative_poc/{ => workspace}/scenarios/inference-scheduling.yaml (100%) rename declarative_poc/{ => workspace}/scenarios/pd-disaggregation.yaml (100%) rename declarative_poc/{ => workspace}/templates/default_system.yaml (89%) rename declarative_poc/{ => workspace}/templates/gateway-api-inference-extension.yaml.j2 (100%) rename declarative_poc/{ => workspace}/templates/modelservice.yaml.j2 (96%) rename declarative_poc/{ => workspace}/templates/standalone.yaml.j2 (83%) diff --git a/declarative_poc/experiments/inference-scheduling.yaml b/declarative_poc/experiments/inference-scheduling.yaml deleted file mode 100644 index 8fb80a4d..00000000 --- a/declarative_poc/experiments/inference-scheduling.yaml +++ /dev/null @@ -1,4 +0,0 @@ -template: - path: /Users/vezio/IBM/llmd/haul/templates/default_system.yaml -scenario: - path: /Users/vezio/IBM/llmd/haul/scenarios/inference-scheduling.yaml diff --git a/declarative_poc/llmdbench/llmdbench/cli.py b/declarative_poc/llmdbench/llmdbench/cli.py index cd49c405..112f65e9 100644 --- a/declarative_poc/llmdbench/llmdbench/cli.py +++ b/declarative_poc/llmdbench/llmdbench/cli.py @@ -2,6 +2,7 @@ from llmdbench.logging.logger import get_logger, set_stage import json +import os import argparse import yaml @@ -24,6 +25,13 @@ def cli(): description="Manage and benchmark llmd configurations.", ) + parser.add_argument( + "--workspace", + required=True, + default=".", + help="Workspace directory used as the root for configs, outputs, etc.", + ) + subparsers = parser.add_subparsers(dest="command", required=True) # -------------------------- @@ -38,11 +46,6 @@ def cli(): required=True, help="Path to the experiment file to plan.", ) - plan_parser.add_argument( - "--output", - default="system_plan.yaml", - help="Path to save the output experiment as a YAML file.", - ) # -------------------------- # prepare @@ -94,19 +97,19 @@ def cli(): with open(args.experiment, "r") as f: data = yaml.safe_load(f) - template_path = data["template"]["path"] - scenario_path = data["scenario"]["path"] + default_file = os.path.join(args.workspace, data["values"]["path"].lstrip("/")) + template_path = os.path.join(args.workspace, data["templates"]["path"].lstrip("/")) + scenario_path = os.path.join(args.workspace, data["scenario"]["path"].lstrip("/")) # Regardless - we need create a plan - otherwise we won't have context of # what to todo - in the future we can "import" a context to "rerun" a plan. 
- system = SystemParser(template_path, args.output, scenario_path) + system = SystemParser(default_file, template_path, args.workspace, scenario_path) system.parse() if args.command == "plan": set_stage(logger, "๐Ÿ”ง PLAN") logger.info("Creating execution and deployment plan...") - logger.info(f"Plan saved to {args.output}") - # print(json.dumps(system.plan_to_dict(), indent=2)) + logger.info(f"Plan saved to {args.workspace}") system.plan_to_yaml() elif args.command == "prepare": set_stage(logger, "๐Ÿ”ง PREPARE") diff --git a/declarative_poc/llmdbench/llmdbench/parser/systemparser.py b/declarative_poc/llmdbench/llmdbench/parser/systemparser.py index 15d32147..29defa4f 100644 --- a/declarative_poc/llmdbench/llmdbench/parser/systemparser.py +++ b/declarative_poc/llmdbench/llmdbench/parser/systemparser.py @@ -4,6 +4,9 @@ import subprocess import requests import re +import os +from os.path import abspath +from jinja2 import Environment, FileSystemLoader class LiteralStr(str): @@ -11,9 +14,16 @@ class LiteralStr(str): class SystemParser: - def __init__(self, defaults_file, output_file, scenario_file=None): + def __init__(self, defaults_file, templates_dir, workspace, scenario_file=None): self.defaults = self._load_yaml(defaults_file) - self.output_file = output_file + + # Directory containing `.j2` (jinja) templates + self.templates_dir = templates_dir + + # Output of rendered templates, etc. + self.workspace = workspace + + # Overrides the "self.defaults" values file self.scenario = self._load_yaml(scenario_file) if scenario_file else {} self._charts_key = "charts" @@ -31,8 +41,14 @@ def __init__(self, defaults_file, output_file, scenario_file=None): self._system_harness_key = "harness" self._system_harness = {} - self.system_experiments_key = "experiments" - self.system_experiments = {} + self._system_components_key = "components" + self._system_components = {} + + self.modelservice = {} + + self.gateway_api_inference_extension = {} + + self.standalone = {} def _load_yaml(self, file_path): """Load YAML file""" @@ -105,7 +121,7 @@ def _get_nested(self, data, path: str): return current - def _render_template_attribute(self, key): + def _render_values(self, key): render = self._get_nested(self.defaults, key) scenarios = self.scenario.get("scenario", []) for i, _ in enumerate(scenarios): @@ -220,12 +236,18 @@ def plan_to_dict(self): self._system_stack_key: self.system_stack, self._system_prepare_key: self.system_prepare, self._system_harness_key: self.system_harness, + self._system_components_key: self.system_components, } def plan_to_yaml(self): plan = self.plan_to_dict() plan = self._convert_multiline_strings(plan) - with open(self.output_file, "w") as f: + dir = abspath(self.workspace) + out_dir = os.path.join(self.workspace, "sut-plan-1") + output_file = os.path.join(out_dir, "sut-plan-1-defaults.yaml") + os.makedirs(dir, exist_ok=True) + print(output_file) + with open(output_file, "w") as f: yaml.dump( plan, f, @@ -234,18 +256,55 @@ def plan_to_yaml(self): allow_unicode=True, ) + def render_components(self): + # Setup Jinja env + env = Environment( + loader=FileSystemLoader(self.templates_dir), + trim_blocks=True, + lstrip_blocks=True, + ) + + plan = self.plan_to_dict() + + # Make sure output directory exists + out_dir = os.path.join(self.workspace, "sut-plan-1") + os.makedirs(out_dir, exist_ok=True) + + # Walk through template directory + for root, _, files in os.walk(self.templates_dir): + for filename in files: + if filename.endswith(".j2"): + template_path = os.path.relpath( + 
os.path.join(root, filename), self.templates_dir + ) + + template = env.get_template(template_path) + + # Render with dict expansion so Jinja sees top-level keys + rendered = template.render(**plan) + + # Create output filename + output_filename = filename.replace(".yaml.j2", ".yaml") + output_path = os.path.join(out_dir, output_filename) + + with open(output_path, "w") as outfile: + outfile.write(rendered) + def parse(self): """Load defaults and apply overrides""" yaml.add_representer(LiteralStr, self._literal_str_representer) - self.charts = self._render_template_attribute(self._charts_key) - self.images = self._render_template_attribute(self._images_key) - self.system_stack = self._render_template_attribute(self._system_stack_key) - self.system_prepare = self._render_template_attribute(self._system_prepare_key) - self.system_harness = self._render_template_attribute(self._system_harness_key) + self.charts = self._render_values(self._charts_key) + self.images = self._render_values(self._images_key) + self.system_stack = self._render_values(self._system_stack_key) + self.system_prepare = self._render_values(self._system_prepare_key) + self.system_harness = self._render_values(self._system_harness_key) + self.system_components = self._render_values(self._system_components_key) self._resolve_chart_auto_versions() self._resolve_image_auto_tags() self._build_indexes() + self.render_components() + return self.plan_to_dict() diff --git a/declarative_poc/llmdbench/pyproject.toml b/declarative_poc/llmdbench/pyproject.toml deleted file mode 100644 index 4f484cc4..00000000 --- a/declarative_poc/llmdbench/pyproject.toml +++ /dev/null @@ -1,13 +0,0 @@ -[project] -name = "llmdbench" -version = "0.0.0" -description = "A library for configuration discovery and benchmarking for llm-d." 
-dependencies = [ - "PyYAML", - "Jinja2", - "requests", - "packaging" -] - -[project.scripts] -llmdbench = "llmdbench.cli:cli" diff --git a/declarative_poc/workspace/experiments/inference-scheduling.yaml b/declarative_poc/workspace/experiments/inference-scheduling.yaml new file mode 100644 index 00000000..89a4e198 --- /dev/null +++ b/declarative_poc/workspace/experiments/inference-scheduling.yaml @@ -0,0 +1,6 @@ +values: + path: templates/default_system.yaml +templates: + path: templates/ +scenario: + path: scenarios/inference-scheduling.yaml diff --git a/declarative_poc/experiments/pd-disaggregation.yaml b/declarative_poc/workspace/experiments/pd-disaggregation.yaml similarity index 100% rename from declarative_poc/experiments/pd-disaggregation.yaml rename to declarative_poc/workspace/experiments/pd-disaggregation.yaml diff --git a/declarative_poc/scenarios/inference-scheduling.yaml b/declarative_poc/workspace/scenarios/inference-scheduling.yaml similarity index 100% rename from declarative_poc/scenarios/inference-scheduling.yaml rename to declarative_poc/workspace/scenarios/inference-scheduling.yaml diff --git a/declarative_poc/scenarios/pd-disaggregation.yaml b/declarative_poc/workspace/scenarios/pd-disaggregation.yaml similarity index 100% rename from declarative_poc/scenarios/pd-disaggregation.yaml rename to declarative_poc/workspace/scenarios/pd-disaggregation.yaml diff --git a/declarative_poc/templates/default_system.yaml b/declarative_poc/workspace/templates/default_system.yaml similarity index 89% rename from declarative_poc/templates/default_system.yaml rename to declarative_poc/workspace/templates/default_system.yaml index 1803c588..42106dd3 100644 --- a/declarative_poc/templates/default_system.yaml +++ b/declarative_poc/workspace/templates/default_system.yaml @@ -61,17 +61,12 @@ prepare: user-overrides: gateway : provider : - - name: kgateway - charts: - - name: kgateway-crds - - name: kgateway - deploy: true - check: true - - name: istio - charts: - - name: istio - deploy: false - check: false + - name: kgateway + charts: + - name: kgateway-crds + - name: kgateway + deploy: true + check: true api: url: https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd version: 1.3.0 @@ -163,34 +158,6 @@ system: type: Memory mount: /dev/shm size: 16Gi - components: - - name: infra - charts: - - name: llm-d-infra - contents: - gateway: - gatewayClassName: modelservice - service: - type: NodePort - gatewayParameters: - enabled: true - - name: router - charts: - - name: gateway-api-inference-extension - contents: - loadfrom: - - BASE_DIR/_templates/gateway-api-inference-extension.yaml - - name: inference-engine - modelservice: - charts: - - name: llm-d-modelservice - contents: - loadfrom: - - BASE_DIR/_templates/modelservice.yaml - standalone: - contents: - loadfrom: - - BASE_DIR/_templates/standalone.yaml inference-engine: type: modelservice model: [] @@ -355,3 +322,29 @@ harness: secretKeyRef: name: .prepare.dependencies.user-overrides.secrets[0].name key: .prepare.dependencies.user-overrides.secrets[0].secret + +components: + user-overrides: + - name: infra + charts: + - name: llm-d-infra + contents: + gateway: + gatewayClassName: modelservice + service: + type: NodePort + gatewayParameters: + enabled: true + - name: router + contents: + loadfrom: + - gateway-api-inference-extension.yaml + - name: inference-engine + modelservice: + contents: + loadfrom: + - modelservice.yaml + standalone: + contents: + loadfrom: + - standalone.yaml \ No newline at end of file diff 
--git a/declarative_poc/templates/gateway-api-inference-extension.yaml.j2 b/declarative_poc/workspace/templates/gateway-api-inference-extension.yaml.j2 similarity index 100% rename from declarative_poc/templates/gateway-api-inference-extension.yaml.j2 rename to declarative_poc/workspace/templates/gateway-api-inference-extension.yaml.j2 diff --git a/declarative_poc/templates/modelservice.yaml.j2 b/declarative_poc/workspace/templates/modelservice.yaml.j2 similarity index 96% rename from declarative_poc/templates/modelservice.yaml.j2 rename to declarative_poc/workspace/templates/modelservice.yaml.j2 index 164acdfe..aecdc96d 100644 --- a/declarative_poc/templates/modelservice.yaml.j2 +++ b/declarative_poc/workspace/templates/modelservice.yaml.j2 @@ -57,7 +57,7 @@ routing: replacePrefixMatch: / modelCommand: {{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("type", 0) }} - args:{{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("args", 0) }} + args: {{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("args", 0) }} env: {{ system["user-overrides"][0]["inference-engine"].env.prefill | default(0) }} {{ system["user-overrides"][0]["inference-engine"].env.get("prefill", {}) }} @@ -83,7 +83,7 @@ decode: mountModelVolume: true image: "{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='registry') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='repo') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='image') | first }}:{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='tag') | first }}" modelCommand: {{ system["user-overrides"][0].get("command", {}).get("decode", {}).get("type", 0) }} - args:{{ system["user-overrides"][0].get("command", {}).get("decode", {}).get("args", 0) }} + args: {{ system["user-overrides"][0].get("command", {}).get("decode", {}).get("args", 0) }} {{ system["user-overrides"][0]["inference-engine"].env.get("decode", {}) }} resources: @@ -133,7 +133,7 @@ prefill: image: "{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='registry') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='repo') | first }}/{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='image') | first }}:{{ images['user-overrides'] | selectattr('name','equalto','llm-d') | map(attribute='tag') | first }}" modelCommand: {{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("type", 0) }} - args:{{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("args", 0) }} + args: {{ system["user-overrides"][0].get("command", {}).get("prefill", {}).get("args", 0) }} env: {{ system["user-overrides"][0]["inference-engine"].env.prefill | default(0) }} {{ system["user-overrides"][0]["inference-engine"].env.get("prefill", {}) }} diff --git a/declarative_poc/templates/standalone.yaml.j2 b/declarative_poc/workspace/templates/standalone.yaml.j2 similarity index 83% rename from declarative_poc/templates/standalone.yaml.j2 rename to declarative_poc/workspace/templates/standalone.yaml.j2 index 808e9640..5060bf55 100644 --- a/declarative_poc/templates/standalone.yaml.j2 +++ b/declarative_poc/workspace/templates/standalone.yaml.j2 @@ -1,14 +1,14 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", 
[{}])[0].get("label", "default-model") }} + name: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", {}) }} labels: {{ system["user-overrides"][0].get("inference-engine", {}).get("labels", {}) }} namespace: {{ system["user-overrides"][0].get("inference-engine", {}).get("namespace", "default") }} spec: replicas: {{ system["user-overrides"][0].get("inference-engine", {}).get("replicas", {}).get("standalone", 1) }} selector: matchLabels: - app: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", "default-model") }} + app: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", {}) }} template: metadata: labels: {{ system["user-overrides"][0].get("inference-engine", {}).get("labels", {}) }} @@ -25,7 +25,7 @@ spec: values: - {{ system["user-overrides"][0].get("inference-engine", {}).get("accelerators", {}).get("standalone", {}).get("value", "") }} containers: - - name: vllm-standalone-{{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", "default-model") }} + - name: vllm-standalone-{{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("label", {}) }} image: {{ (images["user-overrides"] | selectattr("name", "equalto", "vllm") | list)[0].get("registry", "") }}/{{ (images["user-overrides"] | selectattr("name", "equalto", "vllm") | list)[0].get("repo", "") }}/{{ (images["user-overrides"] | selectattr("name", "equalto", "vllm") | list)[0].get("image", "") }}:{{ (images["user-overrides"] | selectattr("name", "equalto", "vllm") | list)[0].get("tag", "latest") }} imagePullPolicy: Always command: @@ -34,24 +34,24 @@ spec: args: {{ system["user-overrides"][0].get("inference-engine", {}).get("command", {}).get("standalone", []) }} env: {{ system["user-overrides"][0].get("inference-engine", {}).get("env", {}).get("standalone", []) }} ports: - - containerPort: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }} + - containerPort: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", {}) }} startupProbe: httpGet: path: /health - port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }} + port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", {}) }} failureThreshold: 200 initialDelaySeconds: 30 periodSeconds: 30 timeoutSeconds: 5 livenessProbe: tcpSocket: - port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }} + port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", {}) }} failureThreshold: 3 periodSeconds: 10 readinessProbe: httpGet: path: /health - port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", 8000) }} + port: {{ system["user-overrides"][0].get("inference-engine", {}).get("ports", {}).get("service", {}) }} failureThreshold: 3 periodSeconds: 5 resources: @@ -61,9 +61,9 @@ spec: - name: preprocesses mountPath: /setup/preprocess - name: cache-volume - mountPath: {{ system["user-overrides"][0].get("inference-engine", {}).get("volumes", [{}])[0].get("mount", "/cache") }} - - name: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("name", "dshm") }} - mountPath: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("mount", "/dev/shm") }} + mountPath: {{ 
system["user-overrides"][0].get("inference-engine", {}).get("volumes", [{}])[0].get("mount", {}) }} + - name: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("name", {}) }} + mountPath: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("mount", {}) }} volumes: - name: preprocesses configMap: @@ -71,25 +71,25 @@ spec: defaultMode: 0500 - name: cache-volume persistentVolumeClaim: - claimName: {{ system["user-overrides"][0].get("inference-engine", {}).get("volumes", [{}])[0].get("name", "model-storage") }} - - name: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("name", "dshm") }} + claimName: {{ system["user-overrides"][0].get("inference-engine", {}).get("volumes", [{}])[0].get("name", {}) }} + - name: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("name", {}) }} emptyDir: - medium: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("type", "Memory") }} - sizeLimit: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("size", "16Gi") }} + medium: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("type", {}) }} + sizeLimit: {{ system["user-overrides"][0].get("volumes", [{}])[1].get("size", {}) }} --- apiVersion: v1 kind: Service metadata: - name: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("name", "default-model") }} + name: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("name", {}) }} namespace: {{ system["user-overrides"][0].get("inference-engine", {}).get("namespace", "default") }} labels: {{ system["user-overrides"][0].get("inference-engine", {}).get("labels", {}) }} spec: ports: - name: http port: 80 - targetPort: 8000 + targetPort: {} selector: - app: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("name", "default-model") }} + app: {{ system["user-overrides"][0].get("inference-engine", {}).get("model", [{}])[0].get("name", {}) }} type: ClusterIP From 43b4f29d4cd792327c75f8084dc1212d36fc7054 Mon Sep 17 00:00:00 2001 From: vezio Date: Thu, 4 Dec 2025 13:49:22 -0500 Subject: [PATCH 5/5] fix Signed-off-by: vezio --- declarative_poc/llmdbench/__init__.py | 0 declarative_poc/llmdbench/pyproject.toml | 13 +++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 declarative_poc/llmdbench/__init__.py create mode 100644 declarative_poc/llmdbench/pyproject.toml diff --git a/declarative_poc/llmdbench/__init__.py b/declarative_poc/llmdbench/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/declarative_poc/llmdbench/pyproject.toml b/declarative_poc/llmdbench/pyproject.toml new file mode 100644 index 00000000..d8e1509b --- /dev/null +++ b/declarative_poc/llmdbench/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "llmdbench" +version = "0.1.0" +description = "A library for configuration discovery and benchmarking for llm-d." +dependencies = [ + "PyYAML", + "Jinja2", + "requests", + "packaging" +] + +[project.scripts] +llmdbench = "llmdbench.cli:cli"