diff --git a/README.md b/README.md index 5a0576355c..9b0cc366be 100644 --- a/README.md +++ b/README.md @@ -101,9 +101,8 @@ To coordinate across a data center, Dynamo relies on etcd and NATS. To run Dynam To quickly setup etcd & NATS, you can also run: -``` +```bash # At the root of the repository: -# Edit deploy/docker-compose.yml to comment out "runtime: nvidia" of the dcgm-exporter service if the nvidia container runtime isn't deployed or to be used. docker compose -f deploy/docker-compose.yml up -d ``` diff --git a/components/src/dynamo/sglang/publisher.py b/components/src/dynamo/sglang/publisher.py index 5f24ba26a9..986ffc10e0 100644 --- a/components/src/dynamo/sglang/publisher.py +++ b/components/src/dynamo/sglang/publisher.py @@ -204,7 +204,7 @@ def setup_prometheus_registry( SGLang uses multiprocess architecture where metrics are stored in shared memory. MultiProcessCollector aggregates metrics from all worker processes. The Prometheus registry collects sglang:* metrics which are exposed via the metrics server endpoint - (typically port 8081) when DYN_SYSTEM_ENABLED=true. + (set DYN_SYSTEM_PORT to a positive value to enable, e.g., DYN_SYSTEM_PORT=8081). Args: engine: The SGLang engine instance. diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 2b19741f7b..31ded423ae 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -1,26 +1,13 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# IMPORT NOTE: Make sure this is in sync with lib/runtime/docker-compose.yml +# Bare minimum infrastructure services for Dynamo. +# For observability (metrics, tracing, dashboards), use docker-observability.yml + networks: server: driver: bridge - monitoring: - driver: bridge -# Note that the images are pinned to specific versions to avoid breaking changes. services: nats-server: image: nats:2.11.4 @@ -31,7 +18,6 @@ services: - 8222:8222 # the endpoints include /varz, /healthz, ... networks: - server - - monitoring etcd-server: image: bitnamilegacy/etcd:3.6.1 @@ -42,108 +28,3 @@ services: - 2380:2380 networks: - server - - monitoring - - # All the services below are part of the metrics profile and monitoring network. - - # The exporter translates from /varz and other stats to Prometheus metrics - nats-prometheus-exporter: - image: natsio/prometheus-nats-exporter:0.17.3 - command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"] - ports: - - 7777:7777 - networks: - - monitoring - profiles: [metrics] - depends_on: - - nats-server - - # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm - # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format. 
- dcgm-exporter: - image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9 - ports: - # Expose dcgm-exporter on port 9401 both inside and outside the container - # to avoid conflicts with other dcgm-exporter instances in distributed environments. - # To access DCGM metrics: - # Outside the container: curl http://localhost:9401/metrics (or the host IP) - # Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics - - 9401:9401 - cap_add: - - SYS_ADMIN - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - environment: - # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES - - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all} - - DCGM_EXPORTER_LISTEN=:9401 - runtime: nvidia # Specify the NVIDIA runtime - networks: - - monitoring - - # To access Prometheus from another machine, you may need to disable te firewall on your host. On Ubuntu: - # sudo ufw allow 9090/tcp - prometheus: - image: prom/prometheus:v3.4.1 - container_name: prometheus - volumes: - - ./metrics/prometheus.yml:/etc/prometheus/prometheus.yml - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - # These provide the web console functionality - - '--web.console.libraries=/etc/prometheus/console_libraries' - - '--web.console.templates=/etc/prometheus/consoles' - - '--web.enable-lifecycle' - restart: unless-stopped - # Example to pull from the /query endpoint: - # {__name__=~"DCGM.*", job="dcgm-exporter"} - networks: - - monitoring - ports: - - "9090:9090" - profiles: [metrics] - extra_hosts: - - "host.docker.internal:host-gateway" - depends_on: - - dcgm-exporter - - nats-prometheus-exporter - - etcd-server - - # grafana connects to prometheus via the /query endpoint. - # Default credentials are dynamo/dynamo. - # To access Grafana from another machine, you may need to disable te firewall on your host. On Ubuntu: - # sudo ufw allow 3001/tcp - grafana: - image: grafana/grafana-enterprise:12.0.1 - container_name: grafana - volumes: - - ./metrics/grafana_dashboards:/etc/grafana/provisioning/dashboards - - ./metrics/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml - environment: - - GF_SERVER_HTTP_PORT=3001 - # do not make it admin/admin, because you will be prompted to change the password every time - - GF_SECURITY_ADMIN_USER=dynamo - - GF_SECURITY_ADMIN_PASSWORD=dynamo - - GF_USERS_ALLOW_SIGN_UP=false - - GF_INSTALL_PLUGINS=grafana-piechart-panel - # Default min interval is 5s, but can be configured lower - - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s - # Disable password change requirement - - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false - - GF_SECURITY_ADMIN_PASSWORD_POLICY=false - - GF_AUTH_DISABLE_LOGIN_FORM=false - - GF_AUTH_DISABLE_SIGNOUT_MENU=false - restart: unless-stopped - ports: - - "3001:3001" - networks: - - monitoring - profiles: [metrics] - depends_on: - - prometheus diff --git a/deploy/docker-observability.yml b/deploy/docker-observability.yml new file mode 100644 index 0000000000..b8e57aa6c3 --- /dev/null +++ b/deploy/docker-observability.yml @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Observability stack for Dynamo: metrics, tracing, and visualization. +# Requires deploy/docker-compose.yml to be running for NATS and etcd connectivity. 
+# +# Usage: +# docker compose -f deploy/docker-observability.yml up -d + +version: '3.8' + +networks: + server: + external: true + name: deploy_server + +volumes: + grafana-data: + tempo-data: + +services: + # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm + # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format. + dcgm-exporter: + image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9 + ports: + # Expose dcgm-exporter on port 9401 both inside and outside the container + # to avoid conflicts with other dcgm-exporter instances in distributed environments. + # To access DCGM metrics: + # Outside the container: curl http://localhost:9401/metrics (or the host IP) + # Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics + - 9401:9401 + cap_add: + - SYS_ADMIN + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + environment: + # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES + - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all} + - DCGM_EXPORTER_LISTEN=:9401 + runtime: nvidia # Specify the NVIDIA runtime + networks: + - server + + # The exporter translates from /varz and other stats to Prometheus metrics + nats-prometheus-exporter: + image: natsio/prometheus-nats-exporter:0.17.3 + command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"] + ports: + - 7777:7777 + networks: + - server + + # To access Prometheus from another machine, you may need to disable te firewall on your host. On Ubuntu: + # sudo ufw allow 9090/tcp + prometheus: + image: prom/prometheus:v3.4.1 + container_name: prometheus + volumes: + - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + # These provide the web console functionality + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + restart: unless-stopped + # Example to pull from the /query endpoint: + # {__name__=~"DCGM.*", job="dcgm-exporter"} + ports: + - "9090:9090" + networks: + - server + extra_hosts: + - "host.docker.internal:host-gateway" + depends_on: + - dcgm-exporter + - nats-prometheus-exporter + + # Tempo - Distributed tracing backend + tempo: + image: grafana/tempo:2.8.2 + command: [ "-config.file=/etc/tempo.yaml" ] + user: root + volumes: + - ./observability/tempo.yaml:/etc/tempo.yaml + - tempo-data:/tmp/tempo + ports: + - "3200:3200" # Tempo HTTP + - "4317:4317" # OTLP gRPC receiver (accessible from host) + - "4318:4318" # OTLP HTTP receiver (accessible from host) + networks: + - server + + # Grafana - Visualization and dashboards + # Supports both Prometheus (metrics) and Tempo (tracing) datasources + # Default credentials: dynamo/dynamo + # To access Grafana from another machine, you may need to disable te firewall on your host. 
On Ubuntu: + # sudo ufw allow 3000/tcp + grafana: + image: grafana/grafana:12.2.0 + container_name: grafana + volumes: + - grafana-data:/var/lib/grafana + - ./observability/grafana_dashboards:/etc/grafana/provisioning/dashboards + - ./observability/grafana-datasources.yml:/etc/grafana/provisioning/datasources/prometheus.yml + - ./observability/tempo-datasource.yml:/etc/grafana/provisioning/datasources/tempo.yml + environment: + - GF_SERVER_HTTP_PORT=3000 + # do not make it admin/admin, because you will be prompted to change the password every time + - GF_SECURITY_ADMIN_USER=dynamo + - GF_SECURITY_ADMIN_PASSWORD=dynamo + - GF_USERS_ALLOW_SIGN_UP=false + - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor + - GF_INSTALL_PLUGINS=grafana-piechart-panel + # Default min interval is 5s, but can be configured lower + - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s + # Disable password change requirement + - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false + - GF_SECURITY_ADMIN_PASSWORD_POLICY=false + - GF_AUTH_DISABLE_LOGIN_FORM=false + - GF_AUTH_DISABLE_SIGNOUT_MENU=false + restart: unless-stopped + ports: + - "3000:3000" + networks: + - server + depends_on: + - prometheus + - tempo + diff --git a/deploy/metrics/k8s/frontend-podmonitor.yaml b/deploy/metrics/k8s/frontend-podmonitor.yaml deleted file mode 100644 index c7560797dc..0000000000 --- a/deploy/metrics/k8s/frontend-podmonitor.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: dynamo-frontend-metrics - namespace: ${NAMESPACE} -spec: - selector: - matchLabels: - nvidia.com/metrics-enabled: "true" - nvidia.com/dynamo-component-type: "frontend" - podMetricsEndpoints: - - port: http - path: /metrics - interval: 5s - relabelings: - - action: replace - sourceLabels: - - __meta_kubernetes_pod_label_nvidia_com_dynamo_namespace - targetLabel: dynamo_namespace - namespaceSelector: - matchNames: - - ${NAMESPACE} diff --git a/deploy/metrics/k8s/planner-podmonitor.yaml b/deploy/metrics/k8s/planner-podmonitor.yaml deleted file mode 100644 index 15f2e90a96..0000000000 --- a/deploy/metrics/k8s/planner-podmonitor.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: dynamo-planner-metrics - namespace: $NAMESPACE -spec: - selector: - matchLabels: - nvidia.com/metrics-enabled: "true" - nvidia.com/dynamo-component-type: "planner" - podMetricsEndpoints: - - port: metrics - path: /metrics - interval: 5s - namespaceSelector: - matchNames: - - $NAMESPACE \ No newline at end of file diff --git a/deploy/metrics/k8s/worker-podmonitor.yaml b/deploy/metrics/k8s/worker-podmonitor.yaml deleted file mode 100644 index 1fb44cbbc5..0000000000 --- a/deploy/metrics/k8s/worker-podmonitor.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: dynamo-worker-metrics - namespace: ${NAMESPACE} -spec: - selector: - matchLabels: - nvidia.com/metrics-enabled: "true" - nvidia.com/dynamo-component-type: "worker" - podMetricsEndpoints: - - port: system - path: /metrics - interval: 5s - namespaceSelector: - matchNames: - - ${NAMESPACE} diff --git a/deploy/observability/README.md b/deploy/observability/README.md new file mode 100644 index 0000000000..0fb3e7723c --- /dev/null +++ b/deploy/observability/README.md @@ -0,0 +1,3 @@ +# Dynamo Observability + +For detailed documentation on Observability (Prometheus metrics, tracing, and logging), please refer to [docs/observability/](../../docs/observability/). diff --git a/deploy/metrics/grafana-datasources.yml b/deploy/observability/grafana-datasources.yml similarity index 100% rename from deploy/metrics/grafana-datasources.yml rename to deploy/observability/grafana-datasources.yml diff --git a/deploy/observability/grafana_dashboards/README.md b/deploy/observability/grafana_dashboards/README.md new file mode 100644 index 0000000000..7eaeb16808 --- /dev/null +++ b/deploy/observability/grafana_dashboards/README.md @@ -0,0 +1,11 @@ +# Example Grafana Dashboards + +This directory contains example Grafana dashboards for Dynamo observability. These are starter files that you can use as references for building your own custom dashboards. + +- `dynamo.json` - General Dynamo dashboard showing software and hardware metrics +- `dcgm-metrics.json` - GPU metrics dashboard using DCGM exporter data +- `kvbm.json` - KV Block Manager metrics dashboard +- `temp-loki.json` - Logging dashboard for Loki integration +- `dashboard-providers.yml` - Configuration file for dashboard provisioning + +For setup instructions and usage, see [Observability Documentation](../../../docs/observability/). diff --git a/deploy/metrics/grafana_dashboards/grafana-dashboard-providers.yml b/deploy/observability/grafana_dashboards/dashboard-providers.yml similarity index 100% rename from deploy/metrics/grafana_dashboards/grafana-dashboard-providers.yml rename to deploy/observability/grafana_dashboards/dashboard-providers.yml diff --git a/deploy/metrics/grafana_dashboards/grafana-dcgm-metrics.json b/deploy/observability/grafana_dashboards/dcgm-metrics.json similarity index 96% rename from deploy/metrics/grafana_dashboards/grafana-dcgm-metrics.json rename to deploy/observability/grafana_dashboards/dcgm-metrics.json index b662e497bc..e82c827e1b 100644 --- a/deploy/metrics/grafana_dashboards/grafana-dcgm-metrics.json +++ b/deploy/observability/grafana_dashboards/dcgm-metrics.json @@ -15,19 +15,7 @@ } ] }, - "copyright": [ - "SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.", - "SPDX-License-Identifier: Apache-2.0", - "Licensed under the Apache License, Version 2.0 (the \"License\");", - "you may not use this file except in compliance with the License.", - "You may obtain a copy of the License at", - "http://www.apache.org/licenses/LICENSE-2.0", - "Unless required by applicable law or agreed to in writing, software", - "distributed under the License is distributed on an \"AS IS\" BASIS,", - "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.", - "See the License for the specific language governing permissions and", - "limitations under the License." - ], + "_copyright": "SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. SPDX-License-Identifier: Apache-2.0", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, diff --git a/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json b/deploy/observability/grafana_dashboards/dynamo.json similarity index 99% rename from deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json rename to deploy/observability/grafana_dashboards/dynamo.json index 76b822c6f9..1ef1abc7c1 100644 --- a/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json +++ b/deploy/observability/grafana_dashboards/dynamo.json @@ -1020,7 +1020,7 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Dynamo Dashboard", + "title": "Dynamo Dashboard (generic)", "uid": "97ae8df9-138a-4f7a-9b0f-635b77d818fe", "version": 1 } \ No newline at end of file diff --git a/deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json b/deploy/observability/grafana_dashboards/kvbm.json similarity index 100% rename from deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json rename to deploy/observability/grafana_dashboards/kvbm.json diff --git a/deploy/logging/grafana/dashboard.json b/deploy/observability/grafana_dashboards/temp-loki.json similarity index 100% rename from deploy/logging/grafana/dashboard.json rename to deploy/observability/grafana_dashboards/temp-loki.json diff --git a/deploy/metrics/k8s/README.md b/deploy/observability/k8s/README.md similarity index 100% rename from deploy/metrics/k8s/README.md rename to deploy/observability/k8s/README.md diff --git a/deploy/metrics/k8s/grafana-dynamo-dashboard-configmap.yaml b/deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml similarity index 99% rename from deploy/metrics/k8s/grafana-dynamo-dashboard-configmap.yaml rename to deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml index 0c4ed0c011..ee1088556b 100644 --- a/deploy/metrics/k8s/grafana-dynamo-dashboard-configmap.yaml +++ b/deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml @@ -1002,7 +1002,7 @@ data: }, "timepicker": {}, "timezone": "browser", - "title": "Dynamo Dashboard", + "title": "Dynamo Dashboard (generic)", "uid": "dynamo-dashboard", "version": 1 } diff --git a/deploy/logging/README.md b/deploy/observability/k8s/logging/README.md similarity index 75% rename from deploy/logging/README.md rename to deploy/observability/k8s/logging/README.md index 2423989d99..85634e5273 100644 --- a/deploy/logging/README.md +++ b/deploy/observability/k8s/logging/README.md @@ -1,3 +1,3 @@ # Dynamo Logging on Kubernetes -For detailed documentation on collecting and visualizing logs on Kubernetes, see [docs/kubernetes/observability/logging.md](../../docs/kubernetes/observability/logging.md). +For detailed documentation on collecting and visualizing logs on Kubernetes, see [docs/kubernetes/observability/logging.md](../../../../docs/kubernetes/observability/logging.md). 
diff --git a/deploy/observability/k8s/logging/grafana/dashboard.json b/deploy/observability/k8s/logging/grafana/dashboard.json new file mode 100644 index 0000000000..04f27cd250 --- /dev/null +++ b/deploy/observability/k8s/logging/grafana/dashboard.json @@ -0,0 +1,214 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 21, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "$datasource" + }, + "direction": "backward", + "editorMode": "builder", + "expr": "{namespace=~\"$namespace\", nvidia_com_dynamo_graph_deployment_name=~\"$dynamographdeployment\", nvidia_com_dynamo_component_type=~\"$component\"} |= \"$search\" |= \"$trace_id\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "DynamoGraph Logs", + "type": "logs" + } + ], + "preload": false, + "schemaVersion": 41, + "tags": ["dynamograph", "logs"], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "Loki", + "value": "Loki" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "loki", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "loki", + "uid": "$datasource" + }, + "definition": "label_values(namespace)", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": "label_values(namespace)", + "refresh": 1, + "regex": ".+", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "loki", + "uid": "$datasource" + }, + "definition": "label_values(nvidia_com_dynamo_graph_deployment_name)", + "hide": 0, + "includeAll": true, + "label": "DynamoGraph Deployment", + "multi": true, + "name": "dynamographdeployment", + "options": [], + "query": "label_values(nvidia_com_dynamo_graph_deployment_name)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "loki", + "uid": "$datasource" + }, + "definition": "label_values(nvidia_com_dynamo_component_type)", + "hide": 0, + "includeAll": true, + "label": "Component", + "multi": true, + "name": "component", + "options": [], + "query": "label_values(nvidia_com_dynamo_component_type)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "", 
+ "value": "" + }, + "label": "Trace ID", + "name": "trace_id", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "type": "textbox" + }, + { + "current": { + "selected": true, + "text": "", + "value": "" + }, + "label": "Search", + "name": "search", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "type": "textbox" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "DynamoGraph Logs", + "description": "Dashboard for viewing DynamoGraph deployment logs across components and namespaces", + "version": 1 + } diff --git a/deploy/logging/grafana/logging-dashboard.yaml b/deploy/observability/k8s/logging/grafana/logging-dashboard.yaml similarity index 100% rename from deploy/logging/grafana/logging-dashboard.yaml rename to deploy/observability/k8s/logging/grafana/logging-dashboard.yaml diff --git a/deploy/logging/grafana/loki-datasource.yaml b/deploy/observability/k8s/logging/grafana/loki-datasource.yaml similarity index 100% rename from deploy/logging/grafana/loki-datasource.yaml rename to deploy/observability/k8s/logging/grafana/loki-datasource.yaml diff --git a/deploy/logging/values/alloy-values.yaml b/deploy/observability/k8s/logging/values/alloy-values.yaml similarity index 100% rename from deploy/logging/values/alloy-values.yaml rename to deploy/observability/k8s/logging/values/alloy-values.yaml diff --git a/deploy/logging/values/loki-values.yaml b/deploy/observability/k8s/logging/values/loki-values.yaml similarity index 100% rename from deploy/logging/values/loki-values.yaml rename to deploy/observability/k8s/logging/values/loki-values.yaml diff --git a/deploy/metrics/prometheus.yml b/deploy/observability/prometheus.yml similarity index 100% rename from deploy/metrics/prometheus.yml rename to deploy/observability/prometheus.yml diff --git a/deploy/tracing/grafana/provisioning/datasources/tempo.yaml b/deploy/observability/tempo-datasource.yml similarity index 96% rename from deploy/tracing/grafana/provisioning/datasources/tempo.yaml rename to deploy/observability/tempo-datasource.yml index 388c461371..14efa7c770 100644 --- a/deploy/tracing/grafana/provisioning/datasources/tempo.yaml +++ b/deploy/observability/tempo-datasource.yml @@ -9,7 +9,7 @@ datasources: access: proxy url: http://tempo:3200 uid: tempo - isDefault: true + isDefault: false editable: true jsonData: httpMethod: GET diff --git a/deploy/tracing/tempo.yaml b/deploy/observability/tempo.yaml similarity index 100% rename from deploy/tracing/tempo.yaml rename to deploy/observability/tempo.yaml diff --git a/deploy/tracing/docker-compose.yml b/deploy/tracing/docker-compose.yml deleted file mode 100644 index 16a5f0657d..0000000000 --- a/deploy/tracing/docker-compose.yml +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -version: '3.8' - -services: - # Tempo - Distributed tracing backend - tempo: - image: grafana/tempo:2.8.2 - command: [ "-config.file=/etc/tempo.yaml" ] - volumes: - - ./tempo.yaml:/etc/tempo.yaml - - tempo-data:/tmp/tempo - ports: - - "3200:3200" # Tempo HTTP - - "4317:4317" # OTLP gRPC receiver (accessible from host) - - "4318:4318" # OTLP HTTP receiver (accessible from host) - - # Grafana - Visualization and dashboards - grafana: - image: grafana/grafana:12.2.0 - ports: - - "3000:3000" - environment: - - GF_SECURITY_ADMIN_PASSWORD=admin - - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor - volumes: - - grafana-data:/var/lib/grafana - - ./grafana/provisioning:/etc/grafana/provisioning - depends_on: - - tempo - -volumes: - tempo-data: - grafana-data: diff --git a/docs/_sections/observability.rst b/docs/_sections/observability.rst index f91973e7d1..c1b108c975 100644 --- a/docs/_sections/observability.rst +++ b/docs/_sections/observability.rst @@ -4,6 +4,10 @@ Observability .. toctree:: :hidden: + Overview <../observability/README> + Prometheus + Grafana Setup <../observability/prometheus-grafana> Metrics <../observability/metrics> + Metrics Developer Guide <../observability/metrics-developer-guide> + Health Checks <../observability/health-checks> + Tracing <../observability/tracing> Logging <../observability/logging> - Health Checks <../observability/health-checks> \ No newline at end of file diff --git a/docs/hidden_toctree.rst b/docs/hidden_toctree.rst index e547694d3d..fdc52598e2 100644 --- a/docs/hidden_toctree.rst +++ b/docs/hidden_toctree.rst @@ -26,6 +26,7 @@ kubernetes/api_reference.md kubernetes/deployment/create_deployment.md + kubernetes/deployment/dynamomodel-guide.md kubernetes/fluxcd.md kubernetes/grove.md diff --git a/docs/kubernetes/observability/logging.md b/docs/kubernetes/observability/logging.md index 0784cf05c7..66b29dd28a 100644 --- a/docs/kubernetes/observability/logging.md +++ b/docs/kubernetes/observability/logging.md @@ -25,6 +25,8 @@ While this guide does not use Prometheus, it assumes Grafana is pre-installed wi ### 3. Environment Variables +#### Kubernetes Setup Variables + The following env variables are set: - `MONITORING_NAMESPACE`: The namespace where Loki is installed - `DYN_NAMESPACE`: The namespace where Dynamo Cloud Operator is installed @@ -34,6 +36,14 @@ export MONITORING_NAMESPACE=monitoring export DYN_NAMESPACE=dynamo-system ``` +#### Dynamo Logging Variables + +| Variable | Description | Example | +|----------|-------------|---------| +| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for Loki) | `true` | +| `DYN_LOG` | Log levels per target `,=,=` | `DYN_LOG=info,dynamo_runtime::system_status_server:trace` | +| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for timestamps | `true` | + ## Installation Steps ### 1. Install Loki @@ -46,7 +56,7 @@ helm repo add grafana https://grafana.github.io/helm-charts helm repo update # Install Loki -helm install --values deploy/logging/values/loki-values.yaml loki grafana/loki -n $MONITORING_NAMESPACE +helm install --values deploy/observability/k8s/logging/values/loki-values.yaml loki grafana/loki -n $MONITORING_NAMESPACE ``` Our configuration (`loki-values.yaml`) sets up Loki in a simple configuration that is suitable for testing and development. It uses a local MinIO for storage. 
The installation pods can be viewed with: @@ -60,7 +70,7 @@ Next, install the Grafana Alloy collector to gather logs from your Kubernetes cl ```bash # Generate a custom values file with the namespace information -envsubst < deploy/logging/values/alloy-values.yaml > alloy-custom-values.yaml +envsubst < deploy/observability/k8s/logging/values/alloy-values.yaml > alloy-custom-values.yaml # Install the collector helm install --values alloy-custom-values.yaml alloy grafana/k8s-monitoring -n $MONITORING_NAMESPACE @@ -110,10 +120,10 @@ Since we are using Grafana with the Prometheus Operator, we can simply apply the ```bash # Configure Grafana with the Loki datasource -envsubst < deploy/logging/grafana/loki-datasource.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - +envsubst < deploy/observability/k8s/logging/grafana/loki-datasource.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - # Configure Grafana with the Dynamo Logs dashboard -envsubst < deploy/logging/grafana/logging-dashboard.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - +envsubst < deploy/observability/k8s/logging/grafana/logging-dashboard.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - ``` > [!Note] @@ -141,4 +151,4 @@ kubectl port-forward svc/prometheus-grafana 3000:80 -n $MONITORING_NAMESPACE If everything is working, under Home > Dashboards > Dynamo Logs, you should see a dashboard that can be used to view the logs associated with our DynamoGraphDeployments -The dashboard enables filtering by DynamoGraphDeployment, namespace, and component type (e.g frontend, worker, etc). \ No newline at end of file +The dashboard enables filtering by DynamoGraphDeployment, namespace, and component type (e.g., frontend, worker, etc.). diff --git a/docs/kubernetes/observability/metrics.md b/docs/kubernetes/observability/metrics.md index e03ec3efeb..f8d6f8696b 100644 --- a/docs/kubernetes/observability/metrics.md +++ b/docs/kubernetes/observability/metrics.md @@ -128,9 +128,7 @@ spec: Apply the Dynamo dashboard configuration to populate Grafana with the Dynamo dashboard: ```bash -pushd deploy/metrics/k8s -kubectl apply -n monitoring -f grafana-dynamo-dashboard-configmap.yaml -popd +kubectl apply -n monitoring -f deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml ``` The dashboard is embedded in the ConfigMap. Since it is labeled with `grafana_dashboard: "1"`, the Grafana will discover and populate it to its list of available dashboards. The dashboard includes panels for: diff --git a/docs/observability/README.md b/docs/observability/README.md new file mode 100644 index 0000000000..e802f9b10c --- /dev/null +++ b/docs/observability/README.md @@ -0,0 +1,96 @@ + + +# Dynamo Observability + +## Getting Started Quickly + +This is an example to get started quickly on a single machine. + +### Prerequisites + +Install these on your machine: + +- [Docker](https://docs.docker.com/get-docker/) +- [Docker Compose](https://docs.docker.com/compose/install/) + +### Starting the Observability Stack + +Dynamo provides a Docker Compose-based observability stack that includes Prometheus, Grafana, Tempo, and various exporters for metrics, tracing, and visualization. 
+ +From the Dynamo root directory: + +```bash +# Start infrastructure (NATS, etcd) +docker compose -f deploy/docker-compose.yml up -d + +# Start observability stack (Prometheus, Grafana, Tempo, DCGM GPU exporter, NATS exporter) +docker compose -f deploy/docker-observability.yml up -d +``` + +For detailed setup instructions and configuration, see [Prometheus + Grafana Setup](prometheus-grafana.md). + +## Observability Documentations + +| Guide | Description | Environment Variables to Control | +|-------|-------------|----------------------------------| +| [Metrics](metrics.md) | Available metrics reference | `DYN_SYSTEM_PORT`† | +| [Health Checks](health-checks.md) | Component health monitoring and readiness probes | `DYN_SYSTEM_PORT`†, `DYN_SYSTEM_STARTING_HEALTH_STATUS`, `DYN_SYSTEM_HEALTH_PATH`, `DYN_SYSTEM_LIVE_PATH`, `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | +| [Tracing](tracing.md) | Distributed tracing with OpenTelemetry and Tempo | `DYN_LOGGING_JSONL`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`†, `OTEL_SERVICE_NAME`† | +| [Logging](logging.md) | Structured logging configuration | `DYN_LOGGING_JSONL`†, `DYN_LOG`, `DYN_LOG_USE_LOCAL_TZ`, `DYN_LOGGING_CONFIG_PATH`, `OTEL_SERVICE_NAME`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`† | + +**Variables marked with † are shared across multiple observability systems.** + +## Developer Guides + +| Guide | Description | Environment Variables to Control | +|-------|-------------|----------------------------------| +| [Metrics Developer Guide](metrics-developer-guide.md) | Creating custom metrics in Rust and Python | `DYN_SYSTEM_PORT`† | + +## Kubernetes + +For Kubernetes-specific setup and configuration, see [docs/kubernetes/observability/](../kubernetes/observability/). + +--- + +## Topology + +This provides: +- **Prometheus** on `http://localhost:9090` - metrics collection and querying +- **Grafana** on `http://localhost:3000` - visualization dashboards (username: `dynamo`, password: `dynamo`) +- **Tempo** on `http://localhost:3200` - distributed tracing backend +- **DCGM Exporter** on `http://localhost:9401/metrics` - GPU metrics +- **NATS Exporter** on `http://localhost:7777/metrics` - NATS messaging metrics + +### Service Relationship Diagram +```mermaid +graph TD + BROWSER[Browser] -->|:3000| GRAFANA[Grafana :3000] + subgraph DockerComposeNetwork [Network inside Docker Compose] + NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] + PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] + PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] + PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP + PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] + PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] + DYNAMOFE --> DYNAMOBACKEND + GRAFANA -->|:9090/query API| PROMETHEUS + end +``` + +The dcgm-exporter service in the Docker Compose network is configured to use port 9401 instead of the default port 9400. This adjustment is made to avoid port conflicts with other dcgm-exporter instances that may be running simultaneously. Such a configuration is typical in distributed systems like SLURM. 
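Before moving on to the configuration files, it can help to confirm that each service in the diagram is actually reachable. The checks below are a minimal sketch that assumes the stack is running locally on the default ports listed above; `/-/healthy` and `/ready` are the standard Prometheus and Tempo health endpoints.

```bash
# Sanity-check the observability services from the host
curl -sf localhost:9090/-/healthy            # Prometheus
curl -sf localhost:3200/ready                # Tempo
curl -s  localhost:9401/metrics | head -n 3  # DCGM GPU exporter
curl -s  localhost:7777/metrics | head -n 3  # NATS exporter
```

If any of these fail, `docker compose -f deploy/docker-observability.yml ps` shows the status of the corresponding container.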
+ +### Configuration Files + +The following configuration files are located in the `deploy/observability/` directory: +- [docker-compose.yml](../../deploy/docker-compose.yml): Defines NATS and etcd services +- [docker-observability.yml](../../deploy/docker-observability.yml): Defines Prometheus, Grafana, Tempo, and exporters +- [prometheus.yml](../../deploy/observability/prometheus.yml): Contains Prometheus scraping configuration +- [grafana-datasources.yml](../../deploy/observability/grafana-datasources.yml): Contains Grafana datasource configuration +- [grafana_dashboards/dashboard-providers.yml](../../deploy/observability/grafana_dashboards/dashboard-providers.yml): Contains Grafana dashboard provider configuration +- [grafana_dashboards/dynamo.json](../../deploy/observability/grafana_dashboards/dynamo.json): A general Dynamo Dashboard for both SW and HW metrics +- [grafana_dashboards/dcgm-metrics.json](../../deploy/observability/grafana_dashboards/dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics +- [grafana_dashboards/kvbm.json](../../deploy/observability/grafana_dashboards/kvbm.json): Contains Grafana dashboard configuration for KVBM metrics + diff --git a/docs/observability/health-checks.md b/docs/observability/health-checks.md index a35582554e..895b67a474 100644 --- a/docs/observability/health-checks.md +++ b/docs/observability/health-checks.md @@ -11,6 +11,38 @@ Dynamo provides health check and liveness HTTP endpoints for each component whic can be used to configure startup, liveness and readiness probes in orchestration frameworks such as Kubernetes. +## Environment Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_PORT` | System status server port | `8081` | `9090` | +| `DYN_SYSTEM_STARTING_HEALTH_STATUS` | Initial health status | `notready` | `ready`, `notready` | +| `DYN_SYSTEM_HEALTH_PATH` | Custom health endpoint path | `/health` | `/custom/health` | +| `DYN_SYSTEM_LIVE_PATH` | Custom liveness endpoint path | `/live` | `/custom/live` | +| `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Endpoints required for ready state | none | `["generate"]` | + +## Getting Started Quickly + +Enable health checks and query endpoints: + +```bash +# Start your Dynamo components +python -m dynamo.frontend --http-port 8000 & + +# Enable system status server on port 8081 +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & +``` + +Check health status: + +```bash +# Frontend health (port 8000) +curl -s localhost:8000/health | jq + +# Worker health (port 8081) +curl -s localhost:8081/health | jq +``` + ## Frontend Liveness Check The frontend liveness endpoint reports a status of `live` as long as @@ -124,16 +156,6 @@ when initializing and HTTP status code `HTTP/1.1 200 OK` once ready. > **Note**: Both /live and /ready return the same information -### Environment Variables for Enabling Health Checks - -| **Environment Variable** | **Description** | **Example Settings** | -| -------------------------| ------------------- | ------------------------------------------------ | -| `DYN_SYSTEM_PORT` | Specifies the port for the system status server (automatically enables it when set to a positive value). | `9090`, `8081` | -| `DYN_SYSTEM_STARTING_HEALTH_STATUS` | Sets the initial health status of the system (ready/not ready). | `ready`, `notready` | -| `DYN_SYSTEM_HEALTH_PATH` | Custom path for the health endpoint. 
| `/custom/health` | -| `DYN_SYSTEM_LIVE_PATH` | Custom path for the liveness endpoint. | `/custom/live` | -| `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Specifies endpoints to check for determining overall system health status. | `["generate"]` | - ### Example Environment Setting ``` diff --git a/docs/observability/logging.md b/docs/observability/logging.md index 161e6aafa3..4364bbe7e2 100644 --- a/docs/observability/logging.md +++ b/docs/observability/logging.md @@ -1,18 +1,6 @@ # Dynamo Logging @@ -24,18 +12,38 @@ JSONL is enabled logs additionally contain `span` creation and exit events as well as support for `trace_id` and `span_id` fields for distributed tracing. -## Environment Variables for configuring Logging +## Environment Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_LOGGING_JSONL` | Enable JSONL logging format | `false` | `true` | +| `DYN_LOG` | Log levels per target `,=,=` | `info` | `DYN_LOG=info,dynamo_runtime::system_status_server:trace` | +| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for timestamps (default is UTC) | `false` | `true` | +| `DYN_LOGGING_CONFIG_PATH` | Path to custom TOML logging configuration | none | `/path/to/config.toml` | +| `OTEL_SERVICE_NAME` | Service name for trace and span information | `dynamo` | `dynamo-frontend` | +| `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting | `false` | `true` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP exporter endpoint | `http://localhost:4317` | `http://tempo:4317` | + +## Getting Started Quickly + +### Start Observability Stack + +For collecting and visualizing logs with Grafana Loki (Kubernetes), or viewing trace context in logs alongside Grafana Tempo, start the observability stack. See [Observability Getting Started](README.md#getting-started-quickly) for instructions. + +### Enable Structured Logging + +Enable structured JSONL logging: -| Environment Variable | Description | Example Settings | -| ----------------------------------- | --------------------------------------------| ---------------------------------------------------- | -| `DYN_LOGGING_JSONL` | Enable JSONL logging format (default: READABLE) | `DYN_LOGGING_JSONL=true` | -| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for logging timestamps (default: UTC) | `DYN_LOG_USE_LOCAL_TZ=1` | -| `DYN_LOG` | Log levels per target `,=,=` | `DYN_LOG=info,dynamo_runtime::system_status_server:trace` | -| `DYN_LOGGING_CONFIG_PATH` | Path to custom TOML logging configuration file | `DYN_LOGGING_CONFIG_PATH=/path/to/config.toml`| -| `OTEL_SERVICE_NAME` | Service name for OpenTelemetry traces (default: `dynamo`) | `OTEL_SERVICE_NAME=dynamo-frontend` | -| `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting (set to `1` to enable) | `OTEL_EXPORT_ENABLED=1` | -| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP exporter endpoint (default: http://localhost:4317) | `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://tempo:4317` | +```bash +export DYN_LOGGING_JSONL=true +export DYN_LOG=debug +# Start your Dynamo components +python -m dynamo.frontend --http-port 8000 & +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & +``` + +Logs will be written to stderr in JSONL format with trace context. 
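If you redirect a component's stderr to a file (for example with `2> worker.log` when launching it), you can filter the JSONL output with `jq`. This is only a sketch: `jq` and the `worker.log` path are assumptions, and it relies on the `trace_id`, `span_id`, and `message` fields shown in the log examples later in this document.

```bash
# List trace ID, span ID, and message for every log line that carries trace context
jq -r 'select(.trace_id != null) | [.trace_id, .span_id, .message] | @tsv' worker.log
```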
## Available Logging Levels @@ -85,68 +93,57 @@ Resulting Log format: {"time":"2025-09-02T15:53:31.943747Z","level":"INFO","target":"log","message":"Scheduler config values: {'max_num_seqs': 256, 'max_num_batched_tokens': 2048}","log.file":"/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/vllm/main.py","log.line":268,"log.target":"main.get_engine_cache_info"} ``` -## OpenTelemetry Distributed Tracing - -When `DYN_LOGGING_JSONL` is enabled, Dynamo uses OpenTelemetry for distributed tracing. All logs include `trace_id` and `span_id` fields, and spans are automatically created for requests. By default, traces are **not exported**. To export traces to an observability backend (like Tempo, Jaeger, or Zipkin), set `OTEL_EXPORT_ENABLED=1`. +## Logging of Trace and Span IDs -### Behavior +When `DYN_LOGGING_JSONL` is enabled, all logs include `trace_id` and `span_id` fields, and spans are automatically created for requests. This is useful for short debugging sessions where you want to examine trace context in logs without setting up a full tracing backend and for correlating log messages with traces. -- **With `DYN_LOGGING_JSONL=true` only**: OpenTelemetry layer is active, generating trace context and span IDs for all requests. Traces appear in logs but are not exported anywhere. -- **With `OTEL_EXPORT_ENABLED=1` and `DYN_LOGGING_JSONL=true`**: Same as above, plus traces are exported to an OTLP collector for visualization. +The trace and span information uses the OpenTelemetry format and libraries, which means the IDs are compatible with OpenTelemetry-based tracing backends like Tempo or Jaeger if you later choose to enable trace export. -### Configuration +**Note:** This section has overlap with [Distributed Tracing with Tempo](tracing.md). For trace visualization in Grafana Tempo and persistent trace analysis, see [Distributed Tracing with Tempo](tracing.md). -To enable OTLP trace exporting: +### Configuration for Logging -1. Set `OTEL_EXPORT_ENABLED=1` to enable trace export -2. Optionally configure the endpoint using `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` (default: `http://localhost:4317`) -3. Optionally set `OTEL_SERVICE_NAME` to identify the service (useful in Kubernetes, default: `dynamo`) - -**Export Settings:** -- **Protocol**: gRPC (Tonic) -- **Service Name**: Value of `OTEL_SERVICE_NAME` env var, or `dynamo` if not set -- **Endpoint**: Value of `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` env var, or `http://localhost:4317` if not set - -### Example: JSONL Logging Only (No Export) +To see trace information in logs: ```bash export DYN_LOGGING_JSONL=true -# OpenTelemetry is active, traces appear in logs, but nothing is exported +export DYN_LOG=debug # Set to debug to see detailed trace logs + +# Start your Dynamo components (e.g., frontend and worker) +python -m dynamo.frontend --http-port 8000 & +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & ``` -### Example: JSONL Logging + Trace Export to Tempo +This enables JSONL logging with `trace_id` and `span_id` fields. Traces appear in logs but are not exported to any backend. 
+ +### Example Request + +Send a request to generate logs with trace context: ```bash -export DYN_LOGGING_JSONL=true -export OTEL_EXPORT_ENABLED=1 -export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://tempo:4317 -export OTEL_SERVICE_NAME=dynamo-frontend -# OpenTelemetry is active, traces appear in logs AND are exported to Tempo +curl -H 'Content-Type: application/json' \ +-H 'x-request-id: test-trace-001' \ +-d '{ + "model": "Qwen/Qwen3-0.6B", + "max_completion_tokens": 100, + "messages": [ + {"role": "user", "content": "What is the capital of France?"} + ] +}' \ +http://localhost:8000/v1/chat/completions ``` -## Trace and Span Information +Check the logs (stderr) for JSONL output containing `trace_id`, `span_id`, and `x_request_id` fields. -### Example Request +## Trace and Span Information in Logs -```sh -curl -X POST http://localhost:8000/v1/chat/completions \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "Qwen/Qwen3-0.6B", - "messages": [ - { - "role": "user", - "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time" - } - ], - "stream": true, - "max_tokens": 1000, - }' -``` +This section shows how trace and span information appears in JSONL logs. These logs can be used to understand request flows even without a trace visualization backend. + +### Example Disaggregated Trace in Grafana When viewing the corresponding trace in Grafana, you should be able to see something like the following: -![Trace Example](./grafana-disagg-trace.png) +![Disaggregated Trace Example](grafana-disagg-trace.png) ### Trace Overview @@ -208,18 +205,18 @@ When viewing the corresponding trace in Grafana, you should be able to see somet | **Busy Time** | 3,795,258 ns (3.79ms) | | **Idle Time** | 3,996,532,471 ns (3.99s) | -### Frontend Logs +### Frontend Logs with Trace Context The following shows the JSONL logs from the frontend service for the same request. 
Note the `trace_id` field (`b672ccf48683b392891c5cb4163d4b51`) that correlates all logs for this request, and the `span_id` field that identifies individual operations: ``` -{"time":"2025-10-31T20:52:07.707164Z","level":"INFO","file":"/opt/dynamo/lib/runtime/src/logging.rs","line":806,"target":"dynamo_runtime::logging","message":"OpenTelemetry OTLP export enabled","endpoint":"http://tempo.tm.svc.cluster.local:4317","service":"frontend"} +{"time":"2025-10-31T20:52:07.707164Z","level":"INFO","file":"/opt/dynamo/lib/runtime/src/logging.rs","line":806,"target":"dynamo_runtime::logging","message":"OTLP export enabled","endpoint":"http://tempo.tm.svc.cluster.local:4317","service":"frontend"} {"time":"2025-10-31T20:52:10.707164Z","level":"DEBUG","file":"/opt/dynamo/lib/runtime/src/pipeline/network/tcp/server.rs","line":230,"target":"dynamo_runtime::pipeline::network::tcp::server","message":"Registering new TcpStream on 10.0.4.65:41959","method":"POST","span_id":"5c20cc08e6afb2b7","span_name":"http-request","trace_id":"b672ccf48683b392891c5cb4163d4b51","uri":"/v1/chat/completions","version":"HTTP/1.1"} {"time":"2025-10-31T20:52:10.745264Z","level":"DEBUG","file":"/opt/dynamo/lib/llm/src/kv_router/prefill_router.rs","line":232,"target":"dynamo_llm::kv_router::prefill_router","message":"Prefill succeeded, using disaggregated params for decode","method":"POST","span_id":"5c20cc08e6afb2b7","span_name":"http-request","trace_id":"b672ccf48683b392891c5cb4163d4b51","uri":"/v1/chat/completions","version":"HTTP/1.1"} {"time":"2025-10-31T20:52:10.745545Z","level":"DEBUG","file":"/opt/dynamo/lib/runtime/src/pipeline/network/tcp/server.rs","line":230,"target":"dynamo_runtime::pipeline::network::tcp::server","message":"Registering new TcpStream on 10.0.4.65:41959","method":"POST","span_id":"5c20cc08e6afb2b7","span_name":"http-request","trace_id":"b672ccf48683b392891c5cb4163d4b51","uri":"/v1/chat/completions","version":"HTTP/1.1"} ``` -## Custom Request IDs +## Custom Request IDs in Logs You can provide a custom request ID using the `x-request-id` header. This ID will be attached to all spans and logs for that request, making it easier to correlate traces with application-level request tracking. @@ -237,7 +234,7 @@ curl -X POST http://localhost:8000/v1/chat/completions \ "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time" } ], - "stream": true, + "stream": false, "max_tokens": 1000 }' ``` diff --git a/docs/observability/metrics-developer-guide.md b/docs/observability/metrics-developer-guide.md new file mode 100644 index 0000000000..036f4bd401 --- /dev/null +++ b/docs/observability/metrics-developer-guide.md @@ -0,0 +1,269 @@ + + +# Metrics Developer Guide + +This guide explains how to create and use custom metrics in Dynamo components using the Dynamo metrics API. 
+ +## Metrics Exposure + +All metrics created via the Dynamo metrics API are automatically exposed on the `/metrics` HTTP endpoint in Prometheus Exposition Format text when the following environment variable is set: + +- `DYN_SYSTEM_PORT=` - Port for the metrics endpoint (set to positive value to enable, default: `-1` disabled) + +Example: +```bash +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model +``` + +Prometheus Exposition Format text metrics will be available at: `http://localhost:8081/metrics` + +## Metric Name Constants + +The [prometheus_names.rs](../../lib/runtime/src/metrics/prometheus_names.rs) module provides centralized metric name constants and sanitization functions to ensure consistency across all Dynamo components. + +--- + +## Metrics API in Rust + +The metrics API is accessible through the `.metrics()` method on runtime, namespace, component, and endpoint objects. See [Runtime Hierarchy](metrics.md#runtime-hierarchy) for details on the hierarchical structure. + +### Available Methods + +- `.metrics().create_counter()`: Create a counter metric +- `.metrics().create_gauge()`: Create a gauge metric +- `.metrics().create_histogram()`: Create a histogram metric +- `.metrics().create_countervec()`: Create a counter with labels +- `.metrics().create_gaugevec()`: Create a gauge with labels +- `.metrics().create_histogramvec()`: Create a histogram with labels + +### Creating Metrics + +```rust +use dynamo_runtime::DistributedRuntime; + +let runtime = DistributedRuntime::new()?; +let endpoint = runtime.namespace("my_namespace").component("my_component").endpoint("my_endpoint"); + +// Simple metrics +let requests_total = endpoint.metrics().create_counter( + "requests_total", + "Total requests", + &[] +)?; + +let active_connections = endpoint.metrics().create_gauge( + "active_connections", + "Active connections", + &[] +)?; + +let latency = endpoint.metrics().create_histogram( + "latency_seconds", + "Request latency", + &[], + Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) +)?; +``` + +### Using Metrics + +```rust +// Counters +requests_total.inc(); + +// Gauges +active_connections.set(42.0); +active_connections.inc(); +active_connections.dec(); + +// Histograms +latency.observe(0.023); // 23ms +``` + +### Vector Metrics with Labels + +```rust +// Create vector metrics with label names +let requests_by_model = endpoint.metrics().create_countervec( + "requests_by_model", + "Requests by model", + &["model_type", "model_size"], + &[] +)?; + +let memory_by_gpu = endpoint.metrics().create_gaugevec( + "gpu_memory_bytes", + "GPU memory by device", + &["gpu_id", "memory_type"], + &[] +)?; + +// Use with specific label values +requests_by_model.with_label_values(&["llama", "7b"]).inc(); +memory_by_gpu.with_label_values(&["0", "allocated"]).set(8192.0); +``` + +### Advanced Features + +**Custom histogram buckets:** +```rust +let latency = endpoint.metrics().create_histogram( + "latency_seconds", + "Request latency", + &[], + Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) +)?; +``` + +**Constant labels:** +```rust +let counter = endpoint.metrics().create_counter( + "requests_total", + "Total requests", + &[("region", "us-west"), ("env", "prod")] +)?; +``` + +--- + +## Metrics API in Python + +Python components can create and manage Prometheus metrics using the same metrics API through Python bindings. 
+ +### Available Methods + +- `endpoint.metrics.create_counter()` / `create_intcounter()`: Create a counter metric +- `endpoint.metrics.create_gauge()` / `create_intgauge()`: Create a gauge metric +- `endpoint.metrics.create_histogram()`: Create a histogram metric +- `endpoint.metrics.create_countervec()` / `create_intcountervec()`: Create a counter with labels +- `endpoint.metrics.create_gaugevec()` / `create_intgaugevec()`: Create a gauge with labels +- `endpoint.metrics.create_histogramvec()`: Create a histogram with labels + +All metrics are imported from `dynamo.prometheus_metrics`. + +### Creating Metrics + +```python +from dynamo.runtime import DistributedRuntime + +drt = DistributedRuntime() +endpoint = drt.namespace("my_namespace").component("my_component").endpoint("my_endpoint") + +# Simple metrics +requests_total = endpoint.metrics.create_intcounter( + "requests_total", + "Total requests" +) + +active_connections = endpoint.metrics.create_intgauge( + "active_connections", + "Active connections" +) + +latency = endpoint.metrics.create_histogram( + "latency_seconds", + "Request latency", + buckets=[0.001, 0.01, 0.1, 1.0, 10.0] +) +``` + +### Using Metrics + +```python +# Counters +requests_total.inc() +requests_total.inc_by(5) + +# Gauges +active_connections.set(42) +active_connections.inc() +active_connections.dec() + +# Histograms +latency.observe(0.023) # 23ms +``` + +### Vector Metrics with Labels + +```python +# Create vector metrics with label names +requests_by_model = endpoint.metrics.create_intcountervec( + "requests_by_model", + "Requests by model", + ["model_type", "model_size"] +) + +memory_by_gpu = endpoint.metrics.create_intgaugevec( + "gpu_memory_bytes", + "GPU memory by device", + ["gpu_id", "memory_type"] +) + +# Use with specific label values +requests_by_model.inc({"model_type": "llama", "model_size": "7b"}) +memory_by_gpu.set(8192, {"gpu_id": "0", "memory_type": "allocated"}) +``` + +### Advanced Features + +**Constant labels:** +```python +counter = endpoint.metrics.create_intcounter( + "requests_total", + "Total requests", + [("region", "us-west"), ("env", "prod")] +) +``` + +**Metric introspection:** +```python +print(counter.name()) # "my_namespace_my_component_my_endpoint_requests_total" +print(counter.const_labels()) # {"dynamo_namespace": "my_namespace", ...} +print(gauge_vec.variable_labels()) # ["model_type", "model_size"] +``` + +**Update patterns:** + +Background thread updates: +```python +import threading +import time + +def update_loop(): + while True: + active_connections.set(compute_current_connections()) + time.sleep(2) + +threading.Thread(target=update_loop, daemon=True).start() +``` + +Callback-based updates (called before each `/metrics` scrape): +```python +def update_metrics(): + active_connections.set(compute_current_connections()) + +endpoint.metrics.register_callback(update_metrics) +``` + +### Examples + +Example scripts: [lib/bindings/python/examples/metrics/](../../lib/bindings/python/examples/metrics/) + +```bash +cd ~/dynamo/lib/bindings/python/examples/metrics +DYN_SYSTEM_PORT=8081 ./server_with_loop.py +DYN_SYSTEM_PORT=8081 ./server_with_callback.py +``` + +--- + +## Related Documentation + +- [Metrics Overview](metrics.md) +- [Prometheus and Grafana Setup](prometheus-grafana.md) +- [Distributed Runtime Architecture](../design_docs/distributed_runtime.md) +- [Python Metrics Examples](../../lib/bindings/python/examples/metrics/) + diff --git a/docs/observability/metrics.md b/docs/observability/metrics.md index 
7e2beb34c5..325457fbc6 100644 --- a/docs/observability/metrics.md +++ b/docs/observability/metrics.md @@ -3,27 +3,91 @@ SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All SPDX-License-Identifier: Apache-2.0 --> -# Dynamo MetricsRegistry +# Dynamo Metrics ## Overview -Dynamo provides built-in metrics capabilities through the `MetricsRegistry` trait, which is automatically available whenever you use the `DistributedRuntime` framework. This guide explains how to use metrics for observability and monitoring across all Dynamo components. +Dynamo provides built-in metrics capabilities through the Dynamo metrics API, which is automatically available whenever you use the `DistributedRuntime` framework. This document serves as a reference for all available metrics in Dynamo. -## Automatic Metrics +**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](prometheus-grafana.md). -Dynamo automatically exposes metrics with the `dynamo_` name prefixes. It also adds the following labels `dynamo_namespace`, `dynamo_component`, and `dynamo_endpoint` to indicate which component is providing the metric. +**For creating custom metrics**, see the [Metrics Developer Guide](metrics-developer-guide.md). -**Frontend Metrics**: When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), these metrics are automatically exposed with the `dynamo_frontend_*` prefix and include `model` labels containing the model name. These cover request handling, token processing, and latency measurements. See [prometheus-grafana.md](prometheus-grafana.md#available-metrics) for the complete list of frontend metrics. +## Environment Variables -**Component Metrics**: The core Dynamo backend system automatically exposes metrics with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework. These include request counts, processing times, byte transfers, and system uptime metrics. See [prometheus-grafana.md](prometheus-grafana.md#available-metrics) for the complete list of component metrics. +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_PORT` | System metrics/health port | `-1` (disabled) | `8081` | -**Specialized Component Metrics**: Components can also expose additional metrics specific to their functionality. For example, a `preprocessor` component exposes metrics with the `dynamo_preprocessor_*` prefix. See [prometheus-grafana.md](prometheus-grafana.md#available-metrics) for details on specialized component metrics. +## Getting Started Quickly -**Kubernetes Integration**: For comprehensive Kubernetes deployment and monitoring setup, see the [Kubernetes Metrics Guide](../kubernetes/observability/metrics.md). This includes Prometheus Operator setup, metrics collection configuration, and visualization in Grafana. +This is a single machine example. -## Metrics Hierarchy +### Start Observability Stack -The `MetricsRegistry` trait is implemented by `DistributedRuntime`, `Namespace`, `Component`, and `Endpoint`, providing a hierarchical approach to metric collection that matches Dynamo's distributed architecture: +For visualizing metrics with Prometheus and Grafana, start the observability stack. See [Observability Getting Started](README.md#getting-started-quickly) for instructions. 
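If you just want the commands at a glance, a minimal sketch is shown below; it assumes you are at the repository root and that the stack uses the default compose files shipped with the repo (`deploy/docker-compose.yml` for etcd/NATS, `deploy/docker-observability.yml` for Prometheus, Grafana, and Tempo):

```bash
# Core infrastructure (etcd + NATS) that Dynamo components depend on
docker compose -f deploy/docker-compose.yml up -d

# Observability stack (Prometheus, Grafana, Tempo, exporters)
docker compose -f deploy/docker-observability.yml up -d
```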
+ + +### Launch Dynamo Components + +Launch a frontend and vLLM backend to test metrics: + +```bash +$ python -m dynamo.frontend --http-port 8000 + +# Enable system metrics server on port 8081 +$ DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B \ + --enforce-eager --no-enable-prefix-caching --max-num-seqs 3 +``` + +Wait for the vLLM worker to start, then send requests and check metrics: + +```bash +# Send a request +curl -H 'Content-Type: application/json' \ +-d '{ + "model": "Qwen/Qwen3-0.6B", + "max_completion_tokens": 100, + "messages": [{"role": "user", "content": "Hello"}] +}' \ +http://localhost:8000/v1/chat/completions + +# Check metrics from the worker +curl -s localhost:8081/metrics | grep dynamo_component +``` + +## Exposed Metrics + +Dynamo exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All Dynamo-generated metrics use the `dynamo_*` prefix and include labels (`dynamo_namespace`, `dynamo_component`, `dynamo_endpoint`) to identify the source component. + +**Example Prometheus Exposition Format text:** + +``` +# HELP dynamo_component_requests_total Total requests processed +# TYPE dynamo_component_requests_total counter +dynamo_component_requests_total{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 42 + +# HELP dynamo_component_request_duration_seconds Request processing time +# TYPE dynamo_component_request_duration_seconds histogram +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="0.005"} 10 +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="0.01"} 15 +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="+Inf"} 42 +dynamo_component_request_duration_seconds_sum{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 2.5 +dynamo_component_request_duration_seconds_count{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 42 +``` + +### Metric Categories + +Dynamo exposes several categories of metrics: + +- **Frontend Metrics** (`dynamo_frontend_*`) - Request handling, token processing, and latency measurements +- **Component Metrics** (`dynamo_component_*`) - Request counts, processing times, byte transfers, and system uptime +- **Specialized Component Metrics** (e.g., `dynamo_preprocessor_*`) - Component-specific metrics +- **Engine Metrics** (Pass-through) - Backend engines expose their own metrics: [vLLM](../backends/vllm/prometheus.md) (`vllm:*`), [SGLang](../backends/sglang/prometheus.md) (`sglang:*`), [TensorRT-LLM](../backends/trtllm/prometheus.md) (`trtllm:*`) + +## Runtime Hierarchy + +The Dynamo metrics API is available on `DistributedRuntime`, `Namespace`, `Component`, and `Endpoint`, providing a hierarchical approach to metric collection that matches Dynamo's distributed architecture: - `DistributedRuntime`: Global metrics across the entire runtime - `Namespace`: Metrics scoped to a specific dynamo_namespace @@ -32,65 +96,116 @@ The `MetricsRegistry` trait is implemented by `DistributedRuntime`, `Namespace`, This hierarchical structure allows you to create metrics at the appropriate level of granularity for your monitoring needs. 
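Because each level stamps its identity onto the exported series as `dynamo_namespace`, `dynamo_component`, and `dynamo_endpoint` labels, you can filter the `/metrics` output by hierarchy level. A quick sketch, assuming a worker exposing metrics on port 8081 and the label values from the exposition example above:

```bash
# Everything emitted under one endpoint (namespace=default, component=worker, endpoint=generate)
curl -s localhost:8081/metrics | grep 'dynamo_endpoint="generate"'

# Everything emitted by the component, across all of its endpoints
curl -s localhost:8081/metrics | grep 'dynamo_component="worker"'
```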
+## Available Metrics -## Getting Started +### Backend Component Metrics -For a complete setup guide including Docker Compose configuration, Prometheus setup, and Grafana dashboards, see the [Getting Started section](prometheus-grafana.md#getting-started) in the Prometheus and Grafana guide. +The core Dynamo backend system automatically exposes metrics with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework: -The quick start includes: -- Docker Compose setup for Prometheus and Grafana -- Pre-configured dashboards and datasources -- Access URLs for all monitoring endpoints -- GPU targeting configuration +- `dynamo_component_inflight_requests`: Requests currently being processed (gauge) +- `dynamo_component_request_bytes_total`: Total bytes received in requests (counter) +- `dynamo_component_request_duration_seconds`: Request processing time (histogram) +- `dynamo_component_requests_total`: Total requests processed (counter) +- `dynamo_component_response_bytes_total`: Total bytes sent in responses (counter) +- `dynamo_component_system_uptime_seconds`: DistributedRuntime uptime (gauge) -## Implementation Examples +### KV Router Statistics (kvstats) -Examples of creating metrics at different hierarchy levels and using dynamic labels are included in this document below. +KV router statistics are automatically exposed by LLM workers and KV router components with the `dynamo_component_kvstats_*` prefix. These metrics provide insights into GPU memory usage and cache efficiency: -### Grafana Dashboards +- `dynamo_component_kvstats_active_blocks`: Number of active KV cache blocks currently in use (gauge) +- `dynamo_component_kvstats_total_blocks`: Total number of KV cache blocks available (gauge) +- `dynamo_component_kvstats_gpu_cache_usage_percent`: GPU cache usage as a percentage (0.0-1.0) (gauge) +- `dynamo_component_kvstats_gpu_prefix_cache_hit_rate`: GPU prefix cache hit rate as a percentage (0.0-1.0) (gauge) -Use dashboards in `deploy/metrics/grafana_dashboards/`: -- `grafana-dynamo-dashboard.json`: General Dynamo dashboard -- `grafana-dcgm-metrics.json`: DCGM GPU metrics dashboard +These metrics are published by: +- **LLM Workers**: vLLM and TRT-LLM backends publish these metrics through their respective publishers +- **KV Router**: The KV router component aggregates and exposes these metrics for load balancing decisions -## Metrics Visualization Architecture +### Specialized Component Metrics -### Service Topology +Some components expose additional metrics specific to their functionality: -The metrics system follows this architecture for collecting and visualizing metrics: +- `dynamo_preprocessor_*`: Metrics specific to preprocessor components -```mermaid -graph TD - BROWSER[Browser] -->|:3001| GRAFANA[Grafana :3001] - subgraph DockerComposeNetwork [Network inside Docker Compose] - NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] - PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] - PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] - PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP - PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] - PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] - DYNAMOFE --> DYNAMOBACKEND - GRAFANA -->|:9090/query API| PROMETHEUS - end -``` +### Frontend Metrics + +When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), these metrics are automatically exposed with the 
`dynamo_frontend_*` prefix and include `model` labels containing the model name: + +- `dynamo_frontend_inflight_requests`: Inflight requests (gauge) +- `dynamo_frontend_queued_requests`: Number of requests in HTTP processing queue (gauge) +- `dynamo_frontend_input_sequence_tokens`: Input sequence length (histogram) +- `dynamo_frontend_inter_token_latency_seconds`: Inter-token latency (histogram) +- `dynamo_frontend_output_sequence_tokens`: Output sequence length (histogram) +- `dynamo_frontend_request_duration_seconds`: LLM request duration (histogram) +- `dynamo_frontend_requests_total`: Total LLM requests (counter) +- `dynamo_frontend_time_to_first_token_seconds`: Time to first token (histogram) + +**Note**: The `dynamo_frontend_inflight_requests` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time. + +#### Model Configuration Metrics -### Grafana Dashboard +The frontend also exposes model configuration metrics with the `dynamo_frontend_model_*` prefix. These metrics are populated from the worker backend registration service when workers register with the system: -The metrics system includes a pre-configured Grafana dashboard for visualizing service metrics: +**Runtime Config Metrics (from ModelRuntimeConfig):** +These metrics come from the runtime configuration provided by worker backends during registration. -![Grafana Dynamo Dashboard](./grafana-dynamo-composite.png) +- `dynamo_frontend_model_total_kv_blocks`: Total KV blocks available for a worker serving the model (gauge) +- `dynamo_frontend_model_max_num_seqs`: Maximum number of sequences for a worker serving the model (gauge) +- `dynamo_frontend_model_max_num_batched_tokens`: Maximum number of batched tokens for a worker serving the model (gauge) -## Detailed Setup Guide +**MDC Metrics (from ModelDeploymentCard):** +These metrics come from the Model Deployment Card information provided by worker backends during registration. Note that when multiple worker instances register with the same model name, only the first instance's configuration metrics (runtime config and MDC metrics) will be populated. Subsequent instances with duplicate model names will be skipped for configuration metric updates, though the worker count metric will reflect all instances. -For complete setup instructions including Docker Compose, Prometheus configuration, and Grafana dashboards, see: +- `dynamo_frontend_model_context_length`: Maximum context length for a worker serving the model (gauge) +- `dynamo_frontend_model_kv_cache_block_size`: KV cache block size for a worker serving the model (gauge) +- `dynamo_frontend_model_migration_limit`: Request migration limit for a worker serving the model (gauge) -```{toctree} -:hidden: +**Worker Management Metrics:** +- `dynamo_frontend_model_workers`: Number of worker instances currently serving the model (gauge) -prometheus-grafana +### Request Processing Flow + +This section explains the distinction between two key metrics used to track request processing: + +1. **Inflight**: Tracks requests from HTTP handler start until the complete response is finished +2. 
**HTTP Queue**: Tracks requests from HTTP handler start until first token generation begins (including prefill time) + +**Example Request Flow:** +``` +curl -s localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ + "model": "Qwen/Qwen3-0.6B", + "prompt": "Hello let's talk about LLMs", + "stream": false, + "max_tokens": 1000 +}' ``` -- [Prometheus and Grafana Setup Guide](prometheus-grafana.md) +**Timeline:** +``` +Timeline: 0, 1, ... +Client ────> Frontend:8000 ────────────────────> Dynamo component/backend (vLLM, SGLang, TRT) + │request start │received │ + | | | + │ ├──> start prefill ──> first token ──> |last token + │ │ (not impl) | | + ├─────actual HTTP queue¹ ──────────┘ │ | + │ │ │ + ├─────implemented HTTP queue ─────────────────────────────┘ | + │ │ + └─────────────────────────────────── Inflight ────────────────────────────┘ +``` + +**Concurrency Example:** +Suppose the backend allows 3 concurrent requests and there are 10 clients continuously hitting the frontend: +- All 10 requests will be counted as inflight (from start until complete response) +- 7 requests will be in HTTP queue most of the time +- 3 requests will be actively processed (between first token and last token) + +**Key Differences:** +- **Inflight**: Measures total request lifetime including processing time +- **HTTP Queue**: Measures queuing time before processing begins (including prefill time) +- **HTTP Queue ≤ Inflight** (HTTP queue is a subset of inflight time) ## Related Documentation diff --git a/docs/observability/prometheus-grafana.md b/docs/observability/prometheus-grafana.md index dbec30a970..949273e537 100644 --- a/docs/observability/prometheus-grafana.md +++ b/docs/observability/prometheus-grafana.md @@ -1,492 +1,112 @@ -# Metrics Visualization with Prometheus and Grafana - -This directory contains configuration for visualizing metrics from the metrics aggregation service using Prometheus and Grafana. + -> [!NOTE] -> For detailed information about Dynamo's metrics system, including hierarchical metrics, automatic labeling, and usage examples, see the [Metrics Guide](./metrics.md). +# Metrics Visualization with Prometheus and Grafana ## Overview -### Components - -- **Prometheus Server**: Collects and stores metrics from Dynamo services and other components. -- **Grafana**: Provides dashboards by querying the Prometheus Server. - -### Topology - -Default Service Relationship Diagram: -```mermaid -graph TD - BROWSER[Browser] -->|:3001| GRAFANA[Grafana :3001] - subgraph DockerComposeNetwork [Network inside Docker Compose] - NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] - PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] - PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] - PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP - PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] - PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] - DYNAMOFE --> DYNAMOBACKEND - GRAFANA -->|:9090/query API| PROMETHEUS - end -``` - -The dcgm-exporter service in the Docker Compose network is configured to use port 9401 instead of the default port 9400. This adjustment is made to avoid port conflicts with other dcgm-exporter instances that may be running simultaneously. Such a configuration is typical in distributed systems like SLURM. - -As of Q2 2025, Dynamo HTTP Frontend metrics are exposed when you build containers with `--framework VLLM` or `--framework TRTLLM`. 
- -### Available Metrics - -#### Backend Component Metrics - -The core Dynamo backend system automatically exposes metrics with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework: - -- `dynamo_component_inflight_requests`: Requests currently being processed (gauge) -- `dynamo_component_request_bytes_total`: Total bytes received in requests (counter) -- `dynamo_component_request_duration_seconds`: Request processing time (histogram) -- `dynamo_component_requests_total`: Total requests processed (counter) -- `dynamo_component_response_bytes_total`: Total bytes sent in responses (counter) -- `dynamo_component_system_uptime_seconds`: DistributedRuntime uptime (gauge) - -#### KV Router Statistics (kvstats) - -KV router statistics are automatically exposed by LLM workers and KV router components with the `dynamo_component_kvstats_*` prefix. These metrics provide insights into GPU memory usage and cache efficiency: - -- `dynamo_component_kvstats_active_blocks`: Number of active KV cache blocks currently in use (gauge) -- `dynamo_component_kvstats_total_blocks`: Total number of KV cache blocks available (gauge) -- `dynamo_component_kvstats_gpu_cache_usage_percent`: GPU cache usage as a percentage (0.0-1.0) (gauge) -- `dynamo_component_kvstats_gpu_prefix_cache_hit_rate`: GPU prefix cache hit rate as a percentage (0.0-1.0) (gauge) +This guide shows how to set up Prometheus and Grafana for visualizing Dynamo metrics on a single machine for demo purposes. -These metrics are published by: -- **LLM Workers**: vLLM and TRT-LLM backends publish these metrics through their respective publishers -- **KV Router**: The KV router component aggregates and exposes these metrics for load balancing decisions +![Grafana Dynamo Dashboard](./grafana-dynamo-composite.png) -#### Specialized Component Metrics +**Components:** +- **Prometheus Server** - Collects and stores metrics from Dynamo services +- **Grafana** - Provides dashboards by querying the Prometheus Server -Some components expose additional metrics specific to their functionality: +**For metrics reference**, see [Metrics Documentation](metrics.md). -- `dynamo_preprocessor_*`: Metrics specific to preprocessor components +## Environment Variables -#### Frontend Metrics +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_PORT` | System metrics/health port | `-1` (disabled) | `8081` | -When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), these metrics are automatically exposed with the `dynamo_frontend_*` prefix and include `model` labels containing the model name: +## Getting Started Quickly -- `dynamo_frontend_inflight_requests`: Inflight requests (gauge) -- `dynamo_frontend_queued_requests`: Number of requests in HTTP processing queue (gauge) -- `dynamo_frontend_input_sequence_tokens`: Input sequence length (histogram) -- `dynamo_frontend_inter_token_latency_seconds`: Inter-token latency (histogram) -- `dynamo_frontend_output_sequence_tokens`: Output sequence length (histogram) -- `dynamo_frontend_request_duration_seconds`: LLM request duration (histogram) -- `dynamo_frontend_requests_total`: Total LLM requests (counter) -- `dynamo_frontend_time_to_first_token_seconds`: Time to first token (histogram) +This is a single machine example. 
-**Note**: The `dynamo_frontend_inflight_requests` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time. +### Start the Observability Stack -##### Model Configuration Metrics +Start the observability stack (Prometheus, Grafana, Tempo, exporters). See [Observability Getting Started](README.md#getting-started-quickly) for instructions and prerequisites. -The frontend also exposes model configuration metrics with the `dynamo_frontend_model_*` prefix. These metrics are populated from the worker backend registration service when workers register with the system: +### Start Dynamo Components -**Runtime Config Metrics (from ModelRuntimeConfig):** -These metrics come from the runtime configuration provided by worker backends during registration. +Start frontend and worker (a simple single GPU example): -- `dynamo_frontend_model_total_kv_blocks`: Total KV blocks available for a worker serving the model (gauge) -- `dynamo_frontend_model_max_num_seqs`: Maximum number of sequences for a worker serving the model (gauge) -- `dynamo_frontend_model_max_num_batched_tokens`: Maximum number of batched tokens for a worker serving the model (gauge) - -**MDC Metrics (from ModelDeploymentCard):** -These metrics come from the Model Deployment Card information provided by worker backends during registration. Note that when multiple worker instances register with the same model name, only the first instance's configuration metrics (runtime config and MDC metrics) will be populated. Subsequent instances with duplicate model names will be skipped for configuration metric updates, though the worker count metric will reflect all instances. - -- `dynamo_frontend_model_context_length`: Maximum context length for a worker serving the model (gauge) -- `dynamo_frontend_model_kv_cache_block_size`: KV cache block size for a worker serving the model (gauge) -- `dynamo_frontend_model_migration_limit`: Request migration limit for a worker serving the model (gauge) - -**Worker Management Metrics:** -- `dynamo_frontend_model_workers`: Number of worker instances currently serving the model (gauge) - -#### Request Processing Flow - -This section explains the distinction between two key metrics used to track request processing: - -1. **Inflight**: Tracks requests from HTTP handler start until the complete response is finished -2. **HTTP Queue**: Tracks requests from HTTP handler start until first token generation begins (including prefill time) - -**Example Request Flow:** -``` -curl -s localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ - "model": "Qwen/Qwen3-0.6B", - "prompt": "Hello let's talk about LLMs", - "stream": false, - "max_tokens": 1000 -}' -``` +```bash +# Start frontend in one process +python -m dynamo.frontend --http-port 8000 & -**Timeline:** -``` -Timeline: 0, 1, ... 
-Client ────> Frontend:8000 ────────────────────> Dynamo component/backend (vLLM, SGLang, TRT) - │request start │received │ - | | | - │ ├──> start prefill ──> first token ──> |last token - │ │ (not impl) | | - ├─────actual HTTP queue¹ ──────────┘ │ | - │ │ │ - ├─────implemented HTTP queue ─────────────────────────────┘ | - │ │ - └─────────────────────────────────── Inflight ────────────────────────────┘ +# Start vLLM worker with metrics enabled on port 8081 +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager ``` -**Concurrency Example:** -Suppose the backend allows 3 concurrent requests and there are 10 clients continuously hitting the frontend: -- All 10 requests will be counted as inflight (from start until complete response) -- 7 requests will be in HTTP queue most of the time -- 3 requests will be actively processed (between first token and last token) - -**Testing Setup:** -Try launching a frontend and a Mocker backend that allows 3 concurrent requests: -```bash -$ python -m dynamo.frontend --http-port 8000 -$ python -m dynamo.mocker --model-path Qwen/Qwen3-0.6B --max-num-seqs 3 -# Launch your 10 concurrent clients here -# Then check the queued_requests and inflight_requests metrics from the frontend: -$ curl -s localhost:8000/metrics|grep -v '^#'|grep -E 'queue|inflight' -dynamo_frontend_queued_requests{model="qwen/qwen3-0.6b"} 7 -dynamo_frontend_inflight_requests{model="qwen/qwen3-0.6b"} 10 -``` +After the workers are running, send a few test requests to populate metrics in the system: -**Real setup using vLLM (instead of Mocker):** ```bash -$ python -m dynamo.vllm --model Qwen/Qwen3-0.6B \ - --enforce-eager --no-enable-prefix-caching --max-num-seqs 3 +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [{"role": "user", "content": "Hello"}], + "max_completion_tokens": 100 + }' ``` -**Key Differences:** -- **Inflight**: Measures total request lifetime including processing time -- **HTTP Queue**: Measures queuing time before processing begins (including prefill time) -- **HTTP Queue ≤ Inflight** (HTTP queue is a subset of inflight time) - -### Required Files - -The following configuration files are located in the `deploy/metrics/` directory: -- [docker-compose.yml](../../deploy/docker-compose.yml): Defines the Prometheus and Grafana services -- [prometheus.yml](../../deploy/metrics/prometheus.yml): Contains Prometheus scraping configuration -- [grafana-datasources.yml](../../deploy/metrics/grafana-datasources.yml): Contains Grafana datasource configuration -- [grafana_dashboards/grafana-dashboard-providers.yml](../../deploy/metrics/grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration -- [grafana_dashboards/grafana-dynamo-dashboard.json](../../deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics. 
-- [grafana_dashboards/grafana-dcgm-metrics.json](../../deploy/metrics/grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics -- [grafana_dashboards/grafana-kvbm-dashboard.json](../../deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json): Contains Grafana dashboard configuration for KVBM metrics - -### Metric Name Constants - -The [prometheus_names.rs](../../lib/runtime/src/metrics/prometheus_names.rs) module provides centralized Prometheus metric name constants and sanitization utilities for the Dynamo metrics system. This module ensures consistency across all components and prevents metric name duplication. - -#### Key Features - -- **Centralized Constants**: All Prometheus metric names are defined as constants to avoid duplication and typos -- **Automatic Sanitization**: Functions to sanitize metric and label names according to Prometheus naming rules -- **Component Organization**: Metric names are organized by component (frontend, work_handler, nats_client, etc.) -- **Validation Arrays**: Arrays of metric names for iteration and validation purposes - -#### Metric Name Prefixes - -- `dynamo_component_*`: Core component metrics (requests, latency, bytes, etc.) -- `dynamo_frontend_*`: Frontend service metrics (LLM HTTP service) -- `nats_client_*`: NATS client connection and message metrics -- `nats_service_*`: NATS service statistics metrics -- `kvstats_*`: KV cache statistics from LLM workers - -#### Sanitization Functions - -The module provides functions to ensure metric and label names comply with Prometheus naming conventions: - -- `sanitize_prometheus_name()`: Sanitizes metric names (allows colons and `__`) -- `sanitize_prometheus_label()`: Sanitizes label names (no colons, no `__` prefix) -- `build_component_metric_name()`: Builds full component metric names with proper prefixing +After sending a few requests, the Prometheus Exposition Format text metrics are available at: +- Frontend: `http://localhost:8000/metrics` +- Backend worker: `http://localhost:8081/metrics` -This centralized approach ensures all Dynamo components use consistent, valid Prometheus metric names without manual coordination. +### Access Web Interfaces -## Getting Started +Once Dynamo components are running: -### Prerequisites +1. Open **Grafana** at `http://localhost:3000` (username: `dynamo`, password: `dynamo`) +2. Click on **Dashboards** in the left sidebar +3. Select **Dynamo Dashboard** to view metrics and traces -1. Make sure Docker and Docker Compose are installed on your system +Other interfaces: +- **Prometheus**: `http://localhost:9090` +- **Tempo** (tracing): Accessible through Grafana's Explore view. See [Tracing Guide](tracing.md) for details. -### Quick Start +**Note:** If accessing from another machine, replace `localhost` with the machine's hostname or IP address, and ensure firewall rules allow access to these ports (3000, 9090). -1. Start Dynamo dependencies. Assume you're at the root dynamo path: +--- - ```bash - # Start the basic services (etcd & natsd), along with Prometheus and Grafana - docker compose -f deploy/docker-compose.yml --profile metrics up -d +## Configuration - # Minimum components for Dynamo (will not have Prometheus and Grafana): etcd/nats/dcgm-exporter - docker compose -f deploy/docker-compose.yml up -d - ``` +### Prometheus - Optional: To target specific GPU(s), export the variable below before running Docker Compose - ```bash - export CUDA_VISIBLE_DEVICES=0,2 - ``` - -2. Web servers started. 
The ones that end in /metrics are in Prometheus format: - - Grafana: `http://localhost:3001` (default login: dynamo/dynamo) - - Prometheus Server: `http://localhost:9090` - - NATS Server: `http://localhost:8222` (monitoring endpoints: /varz, /healthz, etc.) - - NATS Prometheus Exporter: `http://localhost:7777/metrics` - - etcd Server: `http://localhost:2379/metrics` - - DCGM Exporter: `http://localhost:9401/metrics` - - - - Start worker(s) that publishes KV Cache metrics: [lib/runtime/examples/service_metrics/README.md](../../lib/runtime/examples/service_metrics/README.md) can populate dummy KV Cache metrics. - -### Configuration - -#### Prometheus - -The Prometheus configuration is specified in [prometheus.yml](../../deploy/metrics/prometheus.yml). This file is set up to collect metrics from the metrics aggregation service endpoint. +The Prometheus configuration is specified in [prometheus.yml](../../deploy/observability/prometheus.yml). This file is set up to collect metrics from the metrics aggregation service endpoint. Please be aware that you might need to modify the target settings to align with your specific host configuration and network environment. -After making changes to prometheus.yml, it is necessary to reload the configuration using the command below. Simply sending a kill -HUP signal will not suffice due to the caching of the volume that contains the prometheus.yml file. +After making changes to prometheus.yml, restart the Prometheus service. See [Observability Getting Started](README.md#getting-started-quickly) for Docker Compose commands. -``` -docker compose -f deploy/docker-compose.yml up prometheus -d --force-recreate -``` - -#### Grafana +### Grafana Grafana is pre-configured with: - Prometheus datasource - Sample dashboard for visualizing service metrics -![grafana image](./grafana-dynamo-composite.png) ### Troubleshooting -1. Verify services are running: - ```bash - docker compose ps - ``` +1. Verify services are running using `docker compose ps` -2. Check logs: - ```bash - docker compose logs prometheus - docker compose logs grafana - ``` +2. Check logs using `docker compose logs` 3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection. -## Developer Guide - -### Creating Metrics at Different Hierarchy Levels - -#### Runtime-Level Metrics +4. If you encounter issues with stale data or configuration, stop services and wipe volumes using `docker compose down -v` then restart. -```rust -use dynamo_runtime::DistributedRuntime; + **Note:** The `-v` flag removes named volumes (grafana-data, tempo-data), which will reset dashboards and stored metrics. -let runtime = DistributedRuntime::new()?; -let namespace = runtime.namespace("my_namespace")?; -let component = namespace.component("my_component")?; -let endpoint = component.endpoint("my_endpoint")?; +For specific Docker Compose commands, see [Observability Getting Started](README.md#getting-started-quickly). 
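Beyond browsing the targets page, you can also hit the Prometheus HTTP API directly for an end-to-end check. This is a sketch that assumes Prometheus is reachable on the default port 9090 and that the frontend has already served some traffic:

```bash
# Scrape health of every configured target ("up" means the last scrape succeeded)
curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"[^"]*"'

# Confirm Dynamo series have been ingested; an empty result means nothing has been scraped yet
curl -s 'http://localhost:9090/api/v1/query?query=dynamo_frontend_requests_total'
```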
-// Create endpoint-level counters (this is a Prometheus Counter type) -let requests_total = endpoint.metrics().create_counter( - "requests_total", - "Total requests across all namespaces", - &[] -)?; - -let active_connections = endpoint.metrics().create_gauge( - "active_connections", - "Number of active client connections", - &[] -)?; -``` - -#### Namespace-Level Metrics - -```rust -let namespace = runtime.namespace("my_model")?; - -// Namespace-scoped metrics -let model_requests = namespace.metrics().create_counter( - "model_requests", - "Requests for this specific model", - &[] -)?; - -let model_latency = namespace.metrics().create_histogram( - "model_latency_seconds", - "Model inference latency", - &[], - Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) -)?; -``` - -#### Component-Level Metrics - -```rust -let component = namespace.component("backend")?; - -// Component-specific metrics -let backend_requests = component.metrics().create_counter( - "backend_requests", - "Requests handled by this backend component", - &[] -)?; - -let gpu_memory_usage = component.metrics().create_gauge( - "gpu_memory_bytes", - "GPU memory usage in bytes", - &[] -)?; -``` - -#### Endpoint-Level Metrics - -```rust -let endpoint = component.endpoint("generate")?; - -// Endpoint-specific metrics -let generate_requests = endpoint.metrics().create_counter( - "generate_requests", - "Generate endpoint requests", - &[] -)?; - -let generate_latency = endpoint.metrics().create_histogram( - "generate_latency_seconds", - "Generate endpoint latency", - &[], - Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) -)?; -``` - -### Creating Vector Metrics with Dynamic Labels - -Use vector metrics when you need to track metrics with different label values: - -```rust -// Counter with labels -let requests_by_model = endpoint.metrics().create_countervec( - "requests_by_model", - "Requests by model type", - &["model_type", "model_size"], - &[] // no constant labels -)?; - -// Increment with specific labels -requests_by_model.with_label_values(&["llama", "7b"]).inc(); -requests_by_model.with_label_values(&["gpt", "13b"]).inc(); - -// Gauge with labels -let memory_by_gpu = component.metrics().create_gaugevec( - "gpu_memory_bytes", - "GPU memory usage by device", - &["gpu_id", "memory_type"], - &[] // no constant labels -)?; - -memory_by_gpu.with_label_values(&["0", "allocated"]).set(8192.0); -memory_by_gpu.with_label_values(&["0", "cached"]).set(4096.0); -``` - -### Creating Histograms - -Histograms are useful for measuring distributions of values like latency: - -```rust -let latency_histogram = endpoint.metrics().create_histogram( - "request_latency_seconds", - "Request latency distribution", - &[], - Some(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]) -)?; - -// Record latency values -latency_histogram.observe(0.023); // 23ms -latency_histogram.observe(0.156); // 156ms -``` - -### Transitioning from Plain Prometheus - -If you're currently using plain Prometheus metrics, transitioning to Dynamo's `MetricsRegistry` is straightforward: - -#### Before (Plain Prometheus) - -```rust -use prometheus::{Counter, Opts, Registry}; - -// Create a registry to hold metrics -let registry = Registry::new(); -let counter_opts = Opts::new("my_counter", "My custom counter"); -let counter = Counter::with_opts(counter_opts).unwrap(); -registry.register(Box::new(counter.clone())).unwrap(); - -// Use the counter -counter.inc(); - -// To expose metrics, you'd need to set up an HTTP server manually -// and implement the /metrics endpoint yourself -``` - -#### After 
(Dynamo MetricsRegistry) - -```rust -let counter = endpoint.metrics().create_counter( - "my_counter", - "My custom counter", - &[] -)?; - -counter.inc(); -``` - -**Note:** The metric is automatically registered when created via the endpoint's `metrics().create_counter()` factory method. - -**Benefits of Dynamo's approach:** -- **Automatic registration**: Metrics created via endpoint's `metrics().create_*()` factory methods are automatically registered with the system -- Automatic labeling with namespace, component, and endpoint information -- Consistent metric naming with `dynamo_` prefix -- Built-in HTTP metrics endpoint when `DYN_SYSTEM_PORT` is set to a positive value -- Hierarchical metric organization - -### Advanced Features - -#### Custom Buckets for Histograms - -```rust -// Define custom buckets for your use case -let custom_buckets = vec![0.001, 0.01, 0.1, 1.0, 10.0]; -let latency = endpoint.metrics().create_histogram( - "api_latency_seconds", - "API latency in seconds", - &[], - Some(custom_buckets) -)?; -``` - -#### Metric Aggregation - -```rust -// Aggregate metrics across multiple endpoints -let requests_total = namespace.metrics().create_counter( - "requests_total", - "Total requests across all endpoints", - &[] -)?; -``` - - -## Troubleshooting - -1. Verify services are running: - ```bash - docker compose ps - ``` +## Developer Guide -2. Check logs: - ```bash - docker compose logs prometheus - docker compose logs grafana - ``` +For detailed information on creating custom metrics in Dynamo components, see: -3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection. +- [Metrics Developer Guide](metrics-developer-guide.md) diff --git a/deploy/tracing/trace.png b/docs/observability/trace.png similarity index 100% rename from deploy/tracing/trace.png rename to docs/observability/trace.png diff --git a/deploy/tracing/README.md b/docs/observability/tracing.md similarity index 59% rename from deploy/tracing/README.md rename to docs/observability/tracing.md index a2efa75bd5..a87c2a46c5 100644 --- a/deploy/tracing/README.md +++ b/docs/observability/tracing.md @@ -5,87 +5,61 @@ SPDX-License-Identifier: Apache-2.0 # Distributed Tracing with Tempo -This guide explains how to set up and view distributed traces in Grafana Tempo for Dynamo workloads. - ## Overview -Dynamo supports OpenTelemetry-based distributed tracing, allowing you to visualize request flows across Frontend and Worker components. Traces are exported to Tempo via OTLP (OpenTelemetry Protocol) and visualized in Grafana. +Dynamo supports OpenTelemetry-based distributed tracing for visualizing request flows across Frontend and Worker components. Traces are exported to Tempo via OTLP (OpenTelemetry Protocol) and visualized in Grafana. + +**Requirements:** Set `DYN_LOGGING_JSONL=true` and `OTEL_EXPORT_ENABLED=true` to export traces to Tempo. -## Prerequisites +This guide covers single GPU demo setup using Docker Compose. For Kubernetes deployments, see [Kubernetes Deployment](#kubernetes-deployment). -- Docker and Docker Compose (for local deployment) -- Kubernetes cluster with kubectl access (for Kubernetes deployment) -- Dynamo runtime with tracing support +**Note:** This section has overlap with [Logging of OpenTelemetry Tracing](logging.md) since OpenTelemetry has aspects of both logging and tracing. The tracing approach documented here is for persistent trace visualization and analysis. 
For short debugging sessions examining trace context directly in logs, see the [Logging](logging.md) guide. ## Environment Variables -Dynamo's tracing is configured via environment variables. For complete logging documentation, see [docs/observability/logging.md](../../docs/observability/logging.md). +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for tracing) | `false` | `true` | +| `OTEL_EXPORT_ENABLED` | Enable OTLP trace export | `false` | `true` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` | `http://tempo:4317` | +| `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo` | `dynamo-frontend` | -### Required Environment Variables +## Getting Started Quickly -| Variable | Description | Example Value | -|----------|-------------|---------------| -| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for tracing) | `true` | -| `OTEL_EXPORT_ENABLED` | Enable OTLP trace export | `1` | -| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` (local) or `http://tempo:4317` (docker) | -| `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo-frontend`, `dynamo-worker-prefill`, `dynamo-worker-decode` | +### 1. Start Observability Stack -**Note:** When `OTEL_EXPORT_ENABLED=1`, logging initialization is deferred until the runtime is available (required by the OTEL exporter). This means some early logs will be dropped. This will be fixed in a future release. +Start the observability stack (Prometheus, Grafana, Tempo, exporters). See [Observability Getting Started](README.md#getting-started-quickly) for instructions. -### Example Configuration +### 2. Set Environment Variables + +Configure Dynamo components to export traces: ```bash # Enable JSONL logging and tracing export DYN_LOGGING_JSONL=true - -# Enable trace export to Tempo -export OTEL_EXPORT_ENABLED=1 - -# Set the Tempo endpoint (docker-compose network) -export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://tempo:4317 - -# Set service name to identify this component -export OTEL_SERVICE_NAME=dynamo-frontend +export OTEL_EXPORT_ENABLED=true +export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4317 ``` ---- - -## Local Deployment with Docker Compose - -### 1. Start Tempo and Grafana +### 3. Start Dynamo Components (Single GPU) -From the `deploy/tracing` directory, start the observability stack: +For a simple single-GPU deployment, start the frontend and a single vLLM worker: ```bash -cd deploy/tracing -docker-compose up -d -``` - -This will start: -- **Tempo** on `http://localhost:3200` (HTTP API) and `localhost:4317` (OTLP gRPC) -- **Grafana** on `http://localhost:3000` (username: `admin`, password: `admin`) +# Start the frontend with tracing enabled +export OTEL_SERVICE_NAME=dynamo-frontend +python -m dynamo.frontend --router-mode kv --http-port=8000 & -Verify services are running: +# Start a single vLLM worker (aggregated prefill and decode) +export OTEL_SERVICE_NAME=dynamo-worker-vllm +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & -```bash -docker-compose ps +wait ``` -### 2. Set Environment Variables +This runs both prefill and decode on the same GPU, providing a simpler setup for testing tracing. 
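Before sending traffic, it can help to confirm that the OTLP endpoint the workers export to is actually reachable; otherwise traces never appear in Grafana. A small sanity check, assuming the compose defaults of Tempo's HTTP API on port 3200 and OTLP gRPC on port 4317:

```bash
# Tempo readiness endpoint
curl -sf http://localhost:3200/ready && echo "tempo ready"

# Port referenced by OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
nc -z localhost 4317 && echo "OTLP gRPC endpoint reachable"
```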
-Configure Dynamo components to export traces: - -```bash -# Enable JSONL logging and tracing -export DYN_LOGGING_JSONL=true -export OTEL_EXPORT_ENABLED=1 -export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4317 - -# Set service names for each component -export OTEL_SERVICE_NAME=dynamo-frontend -``` - -### 3. Run vLLM Disaggregated Deployment +### Alternative: Disaggregated Deployment (2 GPUs) Run the vLLM disaggregated script with tracing enabled: @@ -106,70 +80,66 @@ trap 'echo Cleaning up...; kill 0' EXIT # Enable tracing export DYN_LOGGING_JSONL=true -export OTEL_EXPORT_ENABLED=1 +export OTEL_EXPORT_ENABLED=true export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4317 # Run frontend export OTEL_SERVICE_NAME=dynamo-frontend python -m dynamo.frontend --router-mode kv --http-port=8000 & -# Run decode worker +# Run decode worker, make sure to wait for start up export OTEL_SERVICE_NAME=dynamo-worker-decode CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & -# Run prefill worker +# Run prefill worker, make sure to wait for start up export OTEL_SERVICE_NAME=dynamo-worker-prefill CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ --model Qwen/Qwen3-0.6B \ --enforce-eager \ --is-prefill-worker & - -wait ``` +For disaggregated deployments, this separates prefill and decode onto different GPUs for better resource utilization. + ### 4. Generate Traces -Send requests to the frontend to generate traces: +Send requests to the frontend to generate traces (works for both aggregated and disaggregated deployments). **Note the `x-request-id` header**, which allows you to easily search for and correlate this specific trace in Grafana: ```bash -curl -d '{ +curl -H 'Content-Type: application/json' \ +-H 'x-request-id: test-trace-001' \ +-d '{ "model": "Qwen/Qwen3-0.6B", "max_completion_tokens": 100, "messages": [ {"role": "user", "content": "What is the capital of France?"} ] }' \ --H 'Content-Type: application/json' \ --H 'x-request-id: test-trace-001' \ http://localhost:8000/v1/chat/completions ``` ### 5. View Traces in Grafana Tempo 1. Open Grafana at `http://localhost:3000` -2. Login with username `admin` and password `admin` +2. Login with username `dynamo` and password `dynamo` 3. Navigate to **Explore** (compass icon in the left sidebar) 4. Select **Tempo** as the data source (should be selected by default) -5. Use the **Search** tab to find traces: +5. In the query type, select **"Search"** (not TraceQL, not Service Graph) +6. Use the **Search** tab to find traces: - Search by **Service Name** (e.g., `dynamo-frontend`) - Search by **Span Name** (e.g., `http-request`, `handle_payload`) - Search by **Tags** (e.g., `x_request_id=test-trace-001`) -6. Click on a trace to view the detailed flame graph +7. Click on a trace to view the detailed flame graph #### Example Trace View Below is an example of what a trace looks like in Grafana Tempo: -![Trace Example](./trace.png) +![Trace Example](trace.png) ### 6. Stop Services -When done, stop the Tempo and Grafana stack: - -```bash -cd deploy/tracing -docker-compose down -``` +When done, stop the observability stack. See [Observability Getting Started](README.md#getting-started-quickly) for Docker Compose commands. 
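One possible shutdown sequence, assuming the stack was started from the compose files under `deploy/` as described earlier; without `-v`, the named volumes (and therefore dashboards and stored traces) are preserved:

```bash
# Stop the observability stack (Prometheus, Grafana, Tempo)
docker compose -f deploy/docker-observability.yml down

# Optionally stop the core etcd/NATS services as well
docker compose -f deploy/docker-compose.yml down
```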
--- @@ -192,7 +162,7 @@ spec: - name: DYN_LOGGING_JSONL value: "true" - name: OTEL_EXPORT_ENABLED - value: "1" + value: "true" - name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT value: "http://tempo.observability.svc.cluster.local:4317" diff --git a/examples/backends/sglang/launch/agg.sh b/examples/backends/sglang/launch/agg.sh index ca51036081..feb63d0362 100755 --- a/examples/backends/sglang/launch/agg.sh +++ b/examples/backends/sglang/launch/agg.sh @@ -17,7 +17,7 @@ python3 -m dynamo.frontend --http-port=8000 & DYNAMO_PID=$! # run worker with metrics enabled -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=8081 \ python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ diff --git a/examples/backends/sglang/launch/disagg_same_gpu.sh b/examples/backends/sglang/launch/disagg_same_gpu.sh index 555970bd9a..ba309e56a7 100755 --- a/examples/backends/sglang/launch/disagg_same_gpu.sh +++ b/examples/backends/sglang/launch/disagg_same_gpu.sh @@ -41,7 +41,7 @@ python3 -m dynamo.frontend --router-mode kv --http-port=8000 & DYNAMO_PID=$! # run prefill worker with metrics on port 8081 -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=8081 \ python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ @@ -71,7 +71,7 @@ echo "Waiting for prefill worker to initialize..." sleep 5 # run decode worker with metrics on port 8082 (foreground) -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8082 \ +DYN_SYSTEM_PORT=8082 \ python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ diff --git a/examples/backends/trtllm/launch/disagg_same_gpu.sh b/examples/backends/trtllm/launch/disagg_same_gpu.sh index 695b32b637..1036329e8d 100755 --- a/examples/backends/trtllm/launch/disagg_same_gpu.sh +++ b/examples/backends/trtllm/launch/disagg_same_gpu.sh @@ -53,7 +53,7 @@ DYNAMO_PID=$! # run prefill worker (shares GPU with decode) CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=8081 \ python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ @@ -65,7 +65,7 @@ PREFILL_PID=$! 
# run decode worker (shares GPU with prefill) CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8082 \ +DYN_SYSTEM_PORT=8082 \ python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ diff --git a/examples/backends/vllm/launch/agg_multimodal.sh b/examples/backends/vllm/launch/agg_multimodal.sh index c1a667b686..51ada957f6 100755 --- a/examples/backends/vllm/launch/agg_multimodal.sh +++ b/examples/backends/vllm/launch/agg_multimodal.sh @@ -52,7 +52,7 @@ fi # Multimodal data (images) are decoded in the backend worker using ImageLoader # --enforce-eager: Quick deployment (remove for production) # --connector none: No KV transfer needed for aggregated serving -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=8081 \ python -m dynamo.vllm --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS # Wait for all background processes to complete diff --git a/lib/bindings/python/examples/metrics/README.md b/lib/bindings/python/examples/metrics/README.md deleted file mode 100644 index cdcb486ca0..0000000000 --- a/lib/bindings/python/examples/metrics/README.md +++ /dev/null @@ -1,424 +0,0 @@ - - - -# Dynamo MetricsRegistry for Python - -Python MetricsRegistry allows you to create and manage Prometheus metrics from Python: - -- **Metric Types**: Counter, IntCounter, Gauge, IntGauge, Histogram, and their Vec variants (CounterVec, IntCounterVec, GaugeVec, IntGaugeVec) -- **Metric Introspection**: Access metric names, constant labels, and variable label names -- **Automatic Registration**: Metrics are automatically registered with the component hierarchy (namespace/component/endpoint) and available on the HTTP system status server -- **Optional Callback Support**: Register Python callbacks to update metrics before scraping - -Example: -```python -from dynamo.runtime import DistributedRuntime - -async def main(): - drt = DistributedRuntime() - endpoint = drt.namespace("ns").component("comp").endpoint("ep") - - # Create metrics - counter = endpoint.metrics.create_intcounter("requests_total", "Total requests") - gauge_vec = endpoint.metrics.create_intgaugevec( - "active_connections", - "Active connections by status", - ["status"], # variable labels - [("region", "us-west")] # constant labels - ) - - # Introspect metrics - print(counter.name()) # "ns_comp_ep_requests_total" - print(counter.const_labels()) # {"dynamo_namespace": "ns", ...} - print(gauge_vec.variable_labels()) # ["status"] - - # Use metrics - counter.inc() - gauge_vec.set(5, {"status": "active"}) -``` - -## Python-Rust Metrics Integration - -This directory demonstrates two methods for passing metrics between Python and Rust in the Dynamo runtime. - -### Method 1: ForwardPassMetrics Pub/Sub via NATS (Legacy method for passing metrics) - -Python maintains its own metrics dictionary, serializes it, and publishes to NATS. Rust subscribes to NATS, deserializes the metrics, and updates Prometheus gauges. - -**Communication pattern**: Unidirectional (Python → NATS → Rust). Python publishes metrics; no feedback from Rust to Python. 
- -**Example**: Used by `WorkerMetricsPublisher` in production code - -```python -from dynamo.llm import WorkerMetricsPublisher, ForwardPassMetrics - -# Create publisher -publisher = WorkerMetricsPublisher() -await publisher.create_endpoint(component, metrics_labels) - -# Python maintains its own metrics dict -metrics_dict = { - "num_running_reqs": 5, - "num_waiting_reqs": 10, - "gpu_cache_usage": 0.75, -} - -# Serialize and publish to NATS -metrics = ForwardPassMetrics(metrics_dict) -publisher.publish(metrics) - -# Rust subscribes to NATS, deserializes, and updates Prometheus -``` - -### Adding/Changing Metrics in Method 1 - -When you need to add or modify metrics in Method 1 (ForwardPassMetrics Pub/Sub via NATS), you must update **multiple files**: - -1. **`lib/llm/src/kv_router/protocols.rs`** - Add field to struct (WorkerStats is part of ForwardPassMetrics): - ```rust - pub struct WorkerStats { - pub request_active_slots: u64, - pub request_total_slots: u64, - pub num_requests_waiting: u64, - pub new_metric_field: u64, // ADD THIS - } - ``` - -2. **`lib/llm/src/kv_router/publisher.rs`** - Manually create Prometheus gauge using DRT: - ```rust - fn new(component: &Component) -> Result { - use dynamo_runtime::metrics::MetricsRegistry; - - // ... existing gauges ... - - // Manually create and register new Prometheus gauge - let new_metric_gauge = component.metrics().create_gauge( - "new_metric_name", - "Description of new metric", - &[], // labels - )?; - - // Store in struct - Ok(KvStatsPrometheusGauges { - kv_active_blocks_gauge, - kv_total_blocks_gauge, - gpu_cache_usage_gauge, - gpu_prefix_cache_hit_rate_gauge, - new_metric_gauge, // ADD THIS - }) - } - ``` - -3. **`lib/llm/src/kv_router/publisher.rs`** - Update gauge in `update_from_kvstats()`: - ```rust - fn update_from_kvstats(&self, kv_stats: &KvStats) { - // ... existing updates ... - self.new_metric_gauge.set(worker_stats.new_metric_field as f64); - } - ``` - -4. **`components/src/dynamo/sglang/publisher.py`** - Update Python code to compute new metric: - ```python - def collect_metrics(): - worker_stats = WorkerStats( - request_active_slots=..., - new_metric_field=compute_new_metric(), # ADD THIS - ) - ``` - -**Result**: Changes require touching 3-4 files across Rust and Python codebases. - -### Method 2: Dynamo MetricsRegistry in Python - -Python creates typed metric objects using `endpoint.metrics.create_*()` methods, which automatically register with the endpoint. Python updates values through these objects with methods that have type hints (via `.pyi` files). Rust creates the underlying Prometheus metrics and calls Python callbacks before scraping. - -**Communication pattern**: Currently unidirectional (Python → Rust for updates, Rust → Python for callback invocation). Could be extended to bidirectional communication in the future (e.g., Rust notifying Python of scrape events, configuration changes) without major architectural changes. - -**Key advantage:** No Rust code modifications needed - metrics are defined and updated entirely in Python. 
- -This method supports two update patterns: - -#### Example A: Background Thread Updates (server_with_loop.py) - -Update metrics continuously from a background thread, independent of scraping: - -```python -# Create metric objects (automatically registered) -# Note: Prometheus prefixes these with "dynamo_component_", so they appear as: -# - dynamo_component_request_total_slots -# - dynamo_component_gpu_cache_usage_percent -request_slots: IntGauge = endpoint.metrics.create_intgauge( - "request_total_slots", "Total request slots available" -) -gpu_usage: Gauge = endpoint.metrics.create_gauge( - "gpu_cache_usage_percent", "GPU cache usage percentage" -) - -# Background thread continuously updates metrics -def update_metrics_in_loop(): - count = 0 - while True: - count += 1 - request_slots.set(1024 + count) - gpu_usage.set(0.01 + (count * 0.01)) - time.sleep(2) - -updater = threading.Thread(target=update_metrics_in_loop, daemon=True) -updater.start() -``` - -#### Example B: Callback-based Updates (server_with_callback.py) - -Register a callback that updates metrics on-demand when Prometheus scrapes the `/metrics` endpoint: - -```python -# Create metric objects (automatically registered) -# Note: Prometheus prefixes these with "dynamo_component_", so they appear as: -# - dynamo_component_request_total_slots -# - dynamo_component_gpu_cache_usage_percent -request_slots: IntGauge = endpoint.metrics.create_intgauge( - "request_total_slots", "Total request slots available" -) -gpu_usage: Gauge = endpoint.metrics.create_gauge( - "gpu_cache_usage_percent", "GPU cache usage percentage" -) - -# Register callback for dynamic updates before scraping -def update_metrics(): - request_slots.set(compute_current_slots()) - gpu_usage.set(get_gpu_usage()) - -endpoint.metrics.register_callback(update_metrics) -``` - -Both examples support vector metrics with labels: - -```python -# Create vector metrics with labels -worker_requests: IntGaugeVec = endpoint.metrics.create_intgaugevec( - "worker_active_requests", - "Active requests per worker", - ["worker_id", "model"] -) - -# Update vector metrics with specific label values -worker_requests.set(5, {"worker_id": "worker_1", "model": "llama-3"}) -worker_requests.set(3, {"worker_id": "worker_2", "model": "llama-3"}) -``` - -#### Available Metric Types - -Method 2 supports all standard Prometheus metric types: - -- **Gauges**: `Gauge` (float), `IntGauge` (integer) -- **GaugeVec**: `GaugeVec` (float with labels), `IntGaugeVec` (integer with labels) -- **Counters**: `Counter` (float), `IntCounter` (integer) -- **CounterVec**: `CounterVec` (float with labels), `IntCounterVec` (integer with labels) -- **Histograms**: `Histogram` - -All metrics are imported from `dynamo.prometheus_metrics`. - -#### Adding/Changing Metrics in Method 2 - -When you need to add or modify metrics in Method 2 (Dynamic Registration), you only update **Python code**: - -1. **Create new metric** - Just add one line in Python (automatically registered): - ```python - new_metric: IntGauge = endpoint.metrics.create_intgauge( - "new_metric_name", "Description of the metric" - ) - ``` - -2. **Update in callback** - Add update logic: - ```python - def update_metrics(): - request_slots.set(compute_slots()) - gpu_usage.set(compute_gpu_usage()) - new_metric.set(compute_new_metric()) # ADD THIS - ``` - -3. 
**For vector metrics with labels** - Create with label names, update with label values: - ```python - # Create vector metric - new_vec: IntGaugeVec = endpoint.metrics.create_intgaugevec( - "new_metric_vec", "Description", ["label1", "label2"] - ) - - # Update with specific label values - new_vec.set(100, {"label1": "value1", "label2": "value2"}) - ``` - -**Result**: Changes only require modifying Python code. No Rust changes needed. Metrics are automatically created and registered with Prometheus by the Rust runtime when you call `create_*()`. - -#### Type-Hinted Methods - -Dynamic Registration provides type hints (via `.pyi` stub files) for typed metric classes: - -- **Gauges** use `.set()`, `.get()`, `.inc()`, `.dec()`, `.add()`, `.sub()` -- **Counters** use `.inc()`, `.inc_by()`, `.get()` (counters only increase) -- **Histograms** use `.observe()` -- **Vec metrics** take a `labels: Dict[str, str]` parameter for operations - -### Architecture Diagrams - -#### Component Architecture - -##### Method 1: ForwardPassMetrics Pub/Sub via NATS - Component View - -```mermaid -graph TB - subgraph "Python Layer" - PY[Python Application
components/src/dynamo/sglang/main.py] - style PY fill:#3776ab,color:#fff - end - - subgraph "Python/Rust Interface (PyO3)" - WMPB[WorkerMetricsPublisher Bindings
bindings/python/rust/llm/kv.rs] - FPM[ForwardPassMetrics Struct
bindings/python/rust/llm/kv.rs] - style WMPB fill:#f4a261,color:#000 - style FPM fill:#f4a261,color:#000 - end - - subgraph "Rust Core" - subgraph "Worker Process Components" - WMP[WorkerMetricsPublisher
llm/src/kv_router/publisher.rs] - WATCH[Watch Channel
tokio::sync::watch] - PROM1[Local Prometheus Gauges
prometheus::Gauge] - end - - subgraph "NATS Infrastructure" - NATS[NATS Server
KV_METRICS_SUBJECT] - end - - subgraph "Other Consumers (e.g., KvWorkerMonitor)" - SUB[NATS Subscriber
component/namespace.rs] - end - - subgraph "System Status Servers" - SS[System Status Server
runtime/src/system_status_server.rs
Started by DistributedRuntime] - end - - style WMP fill:#ce422b,color:#fff - style WATCH fill:#ce422b,color:#fff - style PROM1 fill:#ce422b,color:#fff - style NATS fill:#27aae1,color:#fff - style SUB fill:#ce422b,color:#fff - style SS fill:#6c757d,color:#fff - end - - PY -->|"WorkerMetricsPublisher()"| WMPB - PY -->|"ForwardPassMetrics(worker_stats, kv_stats, spec_decode_stats)"| FPM - PY -->|"publish(metrics)"| WMPB - WMPB -->|"FFI: publish(Arc ForwardPassMetrics)"| WMP - WMP -->|"update_from_kvstats(kv_stats)"| PROM1 - WMP -->|"tx.send(metrics)"| WATCH - WATCH -->|"publish(KV_METRICS_SUBJECT, LoadEvent)"| NATS - NATS -->|"subscribe_with_type LoadEvent"| SUB - SS -->|"Worker: gather() from PROM1"| PROM1 -``` - -##### Method 2: Dynamic Registration - Component View - -```mermaid -graph TD - subgraph Python["Python Layer"] - PY[Python Application
main.py] - style PY fill:#3776ab,color:#fff - end - - subgraph PyO3["Python/Rust Interface - PyO3"] - PM[PrometheusMetricsUtils
endpoint.metrics
prometheus_metrics.rs] - MT[Metric Type Objects
IntGauge/Gauge/Counter/etc.
prometheus_metrics.rs] - style PM fill:#f4a261,color:#000 - style MT fill:#f4a261,color:#000 - end - - subgraph Rust["Rust Core"] - EP[Endpoint
component/endpoint.rs] - DRT[DistributedRuntime
distributed.rs] - PROM["Prometheus Registry
prometheus::IntGauge/Gauge/etc."] - SS[System Status Server
system_status_server.rs] - style EP fill:#ce422b,color:#fff - style DRT fill:#ce422b,color:#fff - style PROM fill:#ce422b,color:#fff - style SS fill:#6c757d,color:#fff - end - - PY -->|endpoint.metrics.create_intgauge| PM - PM -->|endpoint.metrics.create_intgauge| EP - EP -->|create & register| PROM - PM -->|wrap & return| MT - MT -->|return to Python| PY - PY -->|metric.set/get| MT - MT -->|direct FFI call| PROM - PY -.->|endpoint.metrics.register_callback| PM - PM -.->|drt.register_metrics_callback| DRT - SS ==>|execute_metrics_callbacks| DRT - DRT -.->|invoke Python callback| PY - SS -->|gather| PROM - - linkStyle 7 stroke:#ff6b6b,stroke-width:2px - linkStyle 8 stroke:#ff6b6b,stroke-width:2px - linkStyle 9 stroke:#ff6b6b,stroke-width:2px - linkStyle 10 stroke:#ff6b6b,stroke-width:2px -``` - -### Running the Examples - -The examples demonstrate Method 2 (Dynamo MetricsRegistry in Python) with two different update patterns. - -#### Prerequisites - -Update Python bindings if needed: -```bash -cd ~/dynamo/lib/bindings/python -maturin develop -``` - -#### Run Example A: Background Thread Updates - -```bash -cd ~/dynamo/lib/bindings/python/examples/metrics -DYN_SYSTEM_PORT=8081 ./server_with_loop.py -``` - -#### Run Example B: Callback-based Updates - -```bash -cd ~/dynamo/lib/bindings/python/examples/metrics -DYN_SYSTEM_PORT=8081 ./server_with_callback.py -``` - -**Note:** The environment variable is required: -- `DYN_SYSTEM_PORT=8081` - Sets the port for the metrics endpoint (automatically enables the system status server) - -#### Check the Metrics - -The metrics are served via the system status server at: - -```bash -curl http://localhost:8081/metrics -``` - -Expected output includes: - -``` -# HELP request_total_slots Total request slots available -# TYPE request_total_slots gauge -request_total_slots{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556"} 1024 - -# HELP gpu_cache_usage_percent GPU cache usage percentage -# TYPE gpu_cache_usage_percent gauge -gpu_cache_usage_percent{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556"} 0.00 - -# HELP worker_active_requests Active requests per worker -# TYPE worker_active_requests gauge -worker_active_requests{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556",worker_id="worker_1",model="llama-3"} 5 -worker_active_requests{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556",worker_id="worker_2",model="llama-3"} 3 - -# HELP internal_update_count Number of times metrics callback was invoked -# TYPE internal_update_count counter -internal_update_count{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556",type="internal"} 1 -``` - -Each time you query the `/metrics` endpoint, the `update_metrics()` callback is invoked, updating the metric values with fresh data. diff --git a/lib/bindings/python/rust/lib.rs b/lib/bindings/python/rust/lib.rs index bd148def04..c5bfe01aad 100644 --- a/lib/bindings/python/rust/lib.rs +++ b/lib/bindings/python/rust/lib.rs @@ -128,12 +128,9 @@ fn create_request_context( #[pymodule] fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { // Initialize logging early unless OTEL export is enabled (which requires tokio runtime) - if std::env::var("OTEL_EXPORT_ENABLED") - .map(|v| v == "1") - .unwrap_or(false) - { + if rs::config::env_is_truthy("OTEL_EXPORT_ENABLED") { eprintln!( - "Warning: OTEL_EXPORT_ENABLED=1 detected. Logging initialization deferred until runtime is available. Early logs may be dropped." 
+ "Warning: OTEL_EXPORT_ENABLED detected. Logging initialization deferred until runtime is available. Early logs may be dropped." ); } else { rs::logging::init(); @@ -460,10 +457,7 @@ impl DistributedRuntime { // Initialize logging in context where tokio runtime is available // otel exporter requires it - if std::env::var("OTEL_EXPORT_ENABLED") - .map(|v| v == "1") - .unwrap_or(false) - { + if rs::config::env_is_truthy("OTEL_EXPORT_ENABLED") { runtime.secondary().block_on(async { rs::logging::init(); }); diff --git a/lib/llm/tests/http-service.rs b/lib/llm/tests/http-service.rs index 04c8589a56..71ed9abc5e 100644 --- a/lib/llm/tests/http-service.rs +++ b/lib/llm/tests/http-service.rs @@ -277,6 +277,9 @@ async fn test_http_service() { let cancel_token = token.clone(); let task = tokio::spawn(async move { service.run(token.clone()).await }); + // Wait for the service to be ready before proceeding + wait_for_service_ready(port).await; + let registry = Registry::new(); // TODO: Shouldn't this test know the card before it registers a model? diff --git a/lib/runtime/examples/system_metrics/README.md b/lib/runtime/examples/system_metrics/README.md index dfbd4291d0..bc47cce0cd 100644 --- a/lib/runtime/examples/system_metrics/README.md +++ b/lib/runtime/examples/system_metrics/README.md @@ -180,7 +180,7 @@ if enable_custom_metrics { ```bash # Run the system metrics example -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 cargo run --bin system_server +DYN_SYSTEM_PORT=8081 cargo run --bin system_server ``` The server will start an system status server on the specified port (8081 in this example) that exposes the Prometheus metrics endpoint at `/metrics`. @@ -189,7 +189,7 @@ To Run an actual LLM frontend + server (aggregated example), launch both of them ``` python -m dynamo.frontend & -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --no-enable-prefix-caching & +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --no-enable-prefix-caching & ``` Then make curl requests to the frontend (see the [main README](../../../../README.md)) diff --git a/lib/runtime/examples/system_metrics/tests/integration_test.rs b/lib/runtime/examples/system_metrics/tests/integration_test.rs index 3568b37abb..8f78178265 100644 --- a/lib/runtime/examples/system_metrics/tests/integration_test.rs +++ b/lib/runtime/examples/system_metrics/tests/integration_test.rs @@ -15,8 +15,7 @@ use tokio::time::{Duration, sleep}; #[tokio::test] async fn test_backend_with_metrics() -> Result<()> { - // Set environment variables for dynamic port allocation - env::set_var("DYN_SYSTEM_ENABLED", "true"); + // Set environment variable for dynamic port allocation (0 = auto-assign) env::set_var("DYN_SYSTEM_PORT", "0"); // Generate a random endpoint name to avoid collisions @@ -38,9 +37,7 @@ async fn test_backend_with_metrics() -> Result<()> { info.port() } None => { - panic!( - "System status server not started - check DYN_SYSTEM_ENABLED environment variable" - ); + panic!("System status server not started - check DYN_SYSTEM_PORT environment variable"); } }; diff --git a/lib/runtime/src/logging.rs b/lib/runtime/src/logging.rs index 4250c82f17..a46b043eb8 100644 --- a/lib/runtime/src/logging.rs +++ b/lib/runtime/src/logging.rs @@ -144,11 +144,9 @@ impl Default for LoggingConfig { } } -/// Check if OTLP trace exporting is enabled (set OTEL_EXPORT_ENABLED=1 to enable) +/// Check if OTLP trace exporting is enabled (set OTEL_EXPORT_ENABLED to a truthy value: 1, true, 
on, yes) fn otlp_exporter_enabled() -> bool { - std::env::var(OTEL_EXPORT_ENABLED_ENV) - .map(|v| v == "1") - .unwrap_or(false) + crate::config::env_is_truthy(OTEL_EXPORT_ENABLED_ENV) } /// Get the service name from environment or use default diff --git a/tests/fault_tolerance/etcd_ha/test_sglang.py b/tests/fault_tolerance/etcd_ha/test_sglang.py index 8783a0fb6d..f4c099bcf3 100644 --- a/tests/fault_tolerance/etcd_ha/test_sglang.py +++ b/tests/fault_tolerance/etcd_ha/test_sglang.py @@ -88,7 +88,6 @@ def __init__(self, request, etcd_endpoints: list, mode: str = "agg"): env = os.environ.copy() env["DYN_LOG"] = "debug" env["ETCD_ENDPOINTS"] = ",".join(etcd_endpoints) - env["DYN_SYSTEM_ENABLED"] = "true" env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' env["DYN_SYSTEM_PORT"] = port diff --git a/tests/fault_tolerance/etcd_ha/test_trtllm.py b/tests/fault_tolerance/etcd_ha/test_trtllm.py index 67e839aeb5..330c4071df 100644 --- a/tests/fault_tolerance/etcd_ha/test_trtllm.py +++ b/tests/fault_tolerance/etcd_ha/test_trtllm.py @@ -88,7 +88,6 @@ def __init__( env = os.environ.copy() env["DYN_LOG"] = "debug" env["ETCD_ENDPOINTS"] = ",".join(etcd_endpoints) - env["DYN_SYSTEM_ENABLED"] = "true" env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' env["DYN_SYSTEM_PORT"] = port diff --git a/tests/fault_tolerance/etcd_ha/test_vllm.py b/tests/fault_tolerance/etcd_ha/test_vllm.py index 2e58fdfe4b..d286444221 100644 --- a/tests/fault_tolerance/etcd_ha/test_vllm.py +++ b/tests/fault_tolerance/etcd_ha/test_vllm.py @@ -60,7 +60,6 @@ def __init__(self, request, etcd_endpoints: list, is_prefill: bool = False): env = os.environ.copy() env["DYN_LOG"] = "debug" env["ETCD_ENDPOINTS"] = ",".join(etcd_endpoints) - env["DYN_SYSTEM_ENABLED"] = "true" env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' env["DYN_SYSTEM_PORT"] = port
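Taken together, these test changes illustrate the simplified contract after removing `DYN_SYSTEM_ENABLED`: setting `DYN_SYSTEM_PORT` by itself enables the system status server. A minimal sketch of how a launcher or test might assemble a worker environment under that contract (the helper function and the example etcd endpoint are hypothetical; the variable names and values are the ones used in the diffs above):

```python
import os


def build_worker_env(etcd_endpoints, system_port="8081"):
    """Hypothetical helper mirroring the environment setup in the tests above."""
    env = os.environ.copy()
    env["DYN_LOG"] = "debug"
    env["ETCD_ENDPOINTS"] = ",".join(etcd_endpoints)
    # A positive DYN_SYSTEM_PORT both enables the system status server and
    # selects its port; "0" asks for an auto-assigned port, as in the Rust
    # integration test above. No separate DYN_SYSTEM_ENABLED flag is needed.
    env["DYN_SYSTEM_PORT"] = system_port
    env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
    return env


# Example: expose the worker's /metrics and health endpoints on port 8081.
worker_env = build_worker_env(["http://localhost:2379"])
```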